From 0e383cf8dfb1b591febdd558b03f137a0e983518 Mon Sep 17 00:00:00 2001 From: lishuai183 Date: Wed, 20 Dec 2023 17:03:50 +0800 Subject: [PATCH] moe tutel ut. --- tests/test_npu_moe_tutel.py | 24 ++++++++++++---- tests/test_npu_moe_tutel_backward.py | 42 ++++++++++++++++++---------- 2 files changed, 46 insertions(+), 20 deletions(-) diff --git a/tests/test_npu_moe_tutel.py b/tests/test_npu_moe_tutel.py index 8723ced2..4e7be4b8 100644 --- a/tests/test_npu_moe_tutel.py +++ b/tests/test_npu_moe_tutel.py @@ -11,12 +11,18 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] class TestMoeTutel(TestCase): # 'pylint: disable=too-many-arguments,huawei-too-many-arguments def cpu_to_exec(self, x, gates, indices, locations, capacity, batch_size, sample_size, hidden, dtype): - result = torch.zeros([batch_size, capacity, hidden]).to(dtype) + if dtype != torch.float32: + x = x.to(torch.float32) + gates = gates.to(torch.float32) + + result = torch.zeros([batch_size, capacity, hidden]).to(torch.float32) for tensor_idx in range(batch_size): for i in range(sample_size): if locations[tensor_idx, i] < capacity and indices[tensor_idx, i] >= 0: result[int(indices[tensor_idx, i]), int(locations[tensor_idx, i]), :] = gates[tensor_idx, i] * x[i, :] + if dtype != torch.float32: + result = result.to(dtype) return result def npu_to_exec(self, x, gates, indices, locations, capacity): @@ -24,12 +30,12 @@ class TestMoeTutel(TestCase): return out.cpu() def gen_data(self, shape, dtype): - cpu_input = torch.rand(shape, dtype=dtype) + cpu_input = torch.randn(shape, dtype=dtype) npu_input = cpu_input.npu() return cpu_input, npu_input def gen_data_gates(self, shape, dtype): - cpu_input = torch.rand(shape).bool().to(dtype) + cpu_input = torch.randn(shape).bool().to(dtype) npu_input = cpu_input.npu() return cpu_input, npu_input @@ -55,10 +61,16 @@ class TestMoeTutel(TestCase): def test_moe_tutel(self): dtype_list = [torch.float16, torch.float32, torch.bfloat16] shape_list = [ + # small shape [[2, 5], [5, 16], 6], [[3, 6], [6, 16], 6], [[4, 7], [7, 32], 12], [[5, 8], [8, 32], 12], + # big shape + [[2, 16384], [16384, 64], 16384], + [[8, 256], [256, 128], 256], + [[2, 16384], [16384, 128], 16384], + [[8, 256], [256, 256], 256], [[2, 16384], [16384, 32], 16384], ] items = [ @@ -78,9 +90,9 @@ class TestMoeTutel(TestCase): cpu_out = self.cpu_to_exec(cpu_x, cpu_gates, cpu_indices, cpu_locations, capacity, batch_size, sample_size, hidden, dtype) npu_out = self.npu_to_exec(npu_x, npu_gates, npu_indices, npu_locations, capacity) - if dtype == torch.bfloat16 or dtype == torch.float16: - npu_out = npu_out.to(torch.float32) - cpu_out = cpu_out.to(torch.float32) + if dtype == torch.bfloat16: + npu_out = npu_out.to(torch.float16) + cpu_out = cpu_out.to(torch.float16) self.assertRtolEqual(npu_out.numpy(), cpu_out.numpy()) diff --git a/tests/test_npu_moe_tutel_backward.py b/tests/test_npu_moe_tutel_backward.py index ee401777..41cc3fc9 100644 --- a/tests/test_npu_moe_tutel_backward.py +++ b/tests/test_npu_moe_tutel_backward.py @@ -11,16 +11,16 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] class TestMoeTutel(TestCase): # 'pylint: disable=too-many-arguments,huawei-too-many-arguments def cpu_to_exec(self, x, gates, indices, locations, capacity, batch_size, sample_size, hidden, dtype): - x.requires_grad = True - gates.requires_grad = True - out = torch.zeros([batch_size, capacity, hidden]).to(dtype) - for tensor_idx in range(batch_size): - for i in range(sample_size): + y_grad = torch.ones([batch_size, capacity, hidden]).to(torch.float32) + x_grad = torch.zeros([sample_size, hidden]).to(torch.float32) + gates_grad = torch.zeros([batch_size, sample_size]).to(torch.float32) + for i in range(sample_size): + for tensor_idx in range(batch_size): if locations[tensor_idx, i] < capacity and indices[tensor_idx, i] >= 0: - out[indices[tensor_idx, i], locations[tensor_idx, i], :] = gates[tensor_idx, i] * x[i, :] - out.backward(torch.ones_like(out)) - x_grad = x.grad - gates_grad = gates.grad + x_grad[i, :] += gates[tensor_idx, i] * y_grad[indices[tensor_idx, i], locations[tensor_idx, i], :] + gates_grad[tensor_idx, i] = sum( + y_grad[indices[tensor_idx, i], locations[tensor_idx, i], :] * x[i, :]) + return x_grad, gates_grad def npu_to_exec(self, x, gates, indices, locations, capacity): @@ -33,12 +33,12 @@ class TestMoeTutel(TestCase): return x_grad, gates_grad def gen_data(self, shape, dtype): - cpu_input = torch.rand(shape, dtype=dtype) + cpu_input = torch.randn(shape, dtype=dtype) npu_input = cpu_input.npu() return cpu_input, npu_input def gen_data_gates(self, shape, dtype): - cpu_input = torch.rand(shape).bool().to(dtype) + cpu_input = torch.randn(shape).bool().to(dtype) npu_input = cpu_input.npu() return cpu_input, npu_input @@ -64,10 +64,16 @@ class TestMoeTutel(TestCase): def test_moe_tutel(self): dtype_list = [torch.float16, torch.float32, torch.bfloat16] shape_list = [ + # small shape [[2, 5], [5, 16], 6], [[3, 6], [6, 16], 6], [[4, 7], [7, 32], 12], [[5, 8], [8, 32], 12], + # big shape + [[2, 16384], [16384, 64], 16384], + [[8, 256], [256, 128], 256], + [[2, 16384], [16384, 128], 16384], + [[8, 256], [256, 256], 256], [[2, 16384], [16384, 32], 16384], ] items = [ @@ -87,13 +93,21 @@ class TestMoeTutel(TestCase): cpu_grad1, cpu_grad2 = self.cpu_to_exec(cpu_x, cpu_gates, cpu_indices, cpu_locations, capacity, batch_size, sample_size, hidden, dtype) npu_grad1, npu_grad2 = self.npu_to_exec(npu_x, npu_gates, npu_indices, npu_locations, capacity) - if dtype == torch.bfloat16 or dtype == torch.float16: + if dtype != torch.float32: cpu_grad1 = cpu_grad1.to(torch.float32) cpu_grad2 = cpu_grad2.to(torch.float32) npu_grad1 = npu_grad1.to(torch.float32) npu_grad2 = npu_grad2.to(torch.float32) - self.assertRtolEqual(npu_grad1.detach().cpu().numpy(), cpu_grad1.numpy()) - self.assertRtolEqual(npu_grad2.detach().cpu().numpy(), cpu_grad2.numpy()) + + if dtype == torch.bfloat16: + self.assertRtolEqual(npu_grad1.detach().cpu().numpy(), cpu_grad1.detach().numpy(), 4.e-3) + self.assertRtolEqual(npu_grad2.detach().cpu().numpy(), cpu_grad2.detach().numpy(), 4.e-3) + elif dtype == torch.float16: + self.assertRtolEqual(npu_grad1.detach().cpu().numpy(), cpu_grad1.detach().numpy(), 1.e-3) + self.assertRtolEqual(npu_grad2.detach().cpu().numpy(), cpu_grad2.detach().numpy(), 1.e-3) + else: + self.assertRtolEqual(npu_grad1.detach().cpu().numpy(), cpu_grad1.detach().numpy()) + self.assertRtolEqual(npu_grad2.detach().cpu().numpy(), cpu_grad2.detach().numpy()) if __name__ == '__main__': -- Gitee