diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 414aa8a9289681e9a3c768ec8684434df1ab8eec..17db34582afd3ee8e78f4f01b0815ad3698c7f0c 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -61,6 +61,7 @@
 #define ARM_CPU_IMP_FUJITSU		0x46
 #define ARM_CPU_IMP_HISI		0x48
 #define ARM_CPU_IMP_AMPERE		0xC0
+#define ARM_CPU_IMP_PHYTIUM		0x70
 
 #define ARM_CPU_PART_AEM_V8		0xD0F
 #define ARM_CPU_PART_FOUNDATION		0xD00
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 6e20da9bf4f5924c5fd631d331a03827b6ce8ce1..abb5f76454b5baeb1529748a7b31c7602c0d7511 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -141,6 +141,28 @@ static bool kvm_is_device_pfn(unsigned long pfn)
 	return !pfn_valid(pfn);
 }
 
+#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR	GENMASK(5, 2)
+static bool stage2_pte_cacheable(u64 pte)
+{
+	u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
+	return memattr == PAGE_S2_MEMATTR(NORMAL);
+}
+#define KVM_PTE_LEAF_ATTR_HI_S2_XN	BIT(54)
+static bool stage2_pte_executable(u64 pte)
+{
+	return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
+}
+
+static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
+{
+	__clean_dcache_guest_page(pfn, size);
+}
+
+static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
+{
+	__invalidate_icache_guest_page(pfn, size);
+}
+
 /**
  * stage2_dissolve_pmd() - clear and flush huge PMD entry
  * @kvm:	pointer to kvm structure.
@@ -1173,6 +1195,13 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 		pmd_clear(pmd);
 		kvm_tlb_flush_vmid_ipa(kvm, addr);
 	} else {
+		/* Flush data cache before installation of the new PMD */
+		if (stage2_pte_cacheable(pmd_val(*new_pmd)))
+			kvm_flush_dcache_pmd(*new_pmd);
+
+		if (stage2_pte_executable(pmd_val(*new_pmd)))
+			invalidate_icache_guest_page(pmd_pfn(*new_pmd), S2_PMD_SIZE);
+
 		get_page(virt_to_page(pmd));
 	}
 
@@ -1359,6 +1388,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 		kvm_set_pte(pte, __pte(0));
 		kvm_tlb_flush_vmid_ipa(kvm, addr);
 	} else {
+		/* Flush data cache before installation of the new PTE */
+		if (stage2_pte_cacheable(pte_val(*new_pte)))
+			kvm_flush_dcache_pte(*new_pte);
+
+		if (stage2_pte_executable(pte_val(*new_pte)))
+			invalidate_icache_guest_page(pte_pfn(*new_pte), PAGE_SIZE);
+
 		get_page(virt_to_page(pte));
 	}
 
@@ -1650,16 +1686,6 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
 
-static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-	__clean_dcache_guest_page(pfn, size);
-}
-
-static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-	__invalidate_icache_guest_page(pfn, size);
-}
-
 static void kvm_send_hwpoison_signal(unsigned long address,
 				     struct vm_area_struct *vma)
 {
@@ -1855,12 +1881,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (writable)
 		kvm_set_pfn_dirty(pfn);
 
-	if (fault_status != FSC_PERM && !is_iomap(flags))
-		clean_dcache_guest_page(pfn, vma_pagesize);
-
-	if (exec_fault)
-		invalidate_icache_guest_page(pfn, vma_pagesize);
-
 	/*
 	 * If we took an execution fault we have made the
 	 * icache/dcache coherent above and should now let the s2
@@ -2165,7 +2185,6 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 	 * We've moved a page around, probably through CoW, so let's treat it
 	 * just like a translation fault and clean the cache to the PoC.
 	 */
-	clean_dcache_guest_page(pfn, PAGE_SIZE);
 	stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
 	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
 
@@ -2529,9 +2548,22 @@ void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
 	 * If switching it off, need to clean the caches.
 	 * Clean + invalidate does the trick always.
 	 */
-	if (now_enabled != was_enabled)
-		stage2_flush_vm(vcpu->kvm);
+	if (now_enabled != was_enabled) {
+		/*
+		 * Phytium CPUs keep the caches coherent for this case, so
+		 * flushing the dcache from a single vcpu instead of every
+		 * vcpu in the VM is sufficient. This cuts down the number
+		 * of dcache flushes and speeds up SMP multi-core startup,
+		 * especially for large VMs backed by hugepages.
+		 */
+		if (read_cpuid_implementor() == ARM_CPU_IMP_PHYTIUM) {
+			if (vcpu->vcpu_id == 0)
+				stage2_flush_vm(vcpu->kvm);
+		} else {
+			stage2_flush_vm(vcpu->kvm);
+		}
+	}
 
 	/* Caches are now on, stop trapping VM ops (until a S/W op) */
 	if (now_enabled)
 		*vcpu_hcr(vcpu) &= ~HCR_TVM;
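
Note for reviewers: a standalone sketch (not part of the patch) of what the two new predicates test. In a stage-2 leaf descriptor, bits [5:2] hold MemAttr[3:0] and bit [54] is the execute-never (XN) bit. The kernel's GENMASK()/BIT() macros are expanded by hand below, and NORMAL_MEMATTR stands in for PAGE_S2_MEMATTR(NORMAL); the value 0xF << 2 assumes MT_S2_NORMAL == 0xf, as in arm64 kernels of this vintage.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define S2_MEMATTR_MASK	(((UINT64_C(1) << 4) - 1) << 2)	/* GENMASK(5, 2) */
#define S2_XN		(UINT64_C(1) << 54)		/* BIT(54) */
#define NORMAL_MEMATTR	(UINT64_C(0xF) << 2)		/* assumed PAGE_S2_MEMATTR(NORMAL) */

/* Mirrors stage2_pte_cacheable(): Normal memory needs a dcache clean. */
static bool pte_cacheable(uint64_t pte)
{
	return (pte & S2_MEMATTR_MASK) == NORMAL_MEMATTR;
}

/* Mirrors stage2_pte_executable(): XN clear means an icache invalidate. */
static bool pte_executable(uint64_t pte)
{
	return !(pte & S2_XN);
}

int main(void)
{
	/* Fabricated Normal, executable, valid leaf descriptor. */
	uint64_t pte = (UINT64_C(0x40000) << 12) | NORMAL_MEMATTR | 0x3;

	printf("cacheable=%d executable=%d\n",
	       pte_cacheable(pte), pte_executable(pte));

	pte |= S2_XN;	/* execute-never: no icache invalidate needed */
	printf("cacheable=%d executable=%d\n",
	       pte_cacheable(pte), pte_executable(pte));
	return 0;
}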
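Similarly, a hedged sketch of the kvm_toggle_cache() gating: read_cpuid_implementor() returns MIDR_EL1[31:24], and the patch skips the stage-2 flush only when that byte matches the new ARM_CPU_IMP_PHYTIUM (0x70) and the trapping vcpu is not vcpu 0. should_flush_stage2() and the MIDR values here are illustrative, not kernel API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ARM_CPU_IMP_PHYTIUM	0x70
#define ARM_CPU_IMP_ARM		0x41

/* Mirrors read_cpuid_implementor(): MIDR_EL1.Implementer is bits [31:24]. */
static unsigned int midr_implementor(uint32_t midr)
{
	return (midr >> 24) & 0xff;
}

/*
 * Distils the gating added to kvm_toggle_cache(): on Phytium parts only
 * the first vcpu performs the stage-2 dcache flush; on every other
 * implementer each vcpu still flushes, as before the patch.
 */
static bool should_flush_stage2(uint32_t midr, int vcpu_id)
{
	if (midr_implementor(midr) == ARM_CPU_IMP_PHYTIUM)
		return vcpu_id == 0;
	return true;
}

int main(void)
{
	/* Fabricated MIDR values: only the implementer byte matters here. */
	uint32_t phytium = (uint32_t)ARM_CPU_IMP_PHYTIUM << 24;
	uint32_t arm = (uint32_t)ARM_CPU_IMP_ARM << 24;

	printf("phytium: vcpu0=%d vcpu1=%d\n",
	       should_flush_stage2(phytium, 0), should_flush_stage2(phytium, 1));
	printf("arm:     vcpu0=%d vcpu1=%d\n",
	       should_flush_stage2(arm, 0), should_flush_stage2(arm, 1));
	return 0;
}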