#### Why is a 4x4 float32 matrix multiply slower with ARMv8 NEON instructions?

The code below uses NEON instructions (taken from UE4):

``````void matrixMultiplyNeon(float* ret, float32x4_t* A, float32x4_t* B) {

float32x4_t * R = (float32x4_t*)ret;
float32x4_t temp, r0, r1, r2, r3;

auto low  = vget_low_f32(A[0]);
auto high = vget_high_f32(A[0]);
temp = vmulq_lane_f32(      B[0], low, 0);
temp = vmlaq_lane_f32(temp, B[1], low, 1);
temp = vmlaq_lane_f32(temp, B[2], high, 0);
r0   = vmlaq_lane_f32(temp, B[3], high, 1);

low  = vget_low_f32(A[1]);
high = vget_high_f32(A[1]);
temp = vmulq_lane_f32(      B[0], low, 0);
temp = vmlaq_lane_f32(temp, B[1], low, 1);
temp = vmlaq_lane_f32(temp, B[2], high, 0);
r1   = vmlaq_lane_f32(temp, B[3], high, 1);

low  = vget_low_f32(A[2]);
high = vget_high_f32(A[2]);
temp = vmulq_lane_f32(      B[0], low, 0);
temp = vmlaq_lane_f32(temp, B[1], low, 1);
temp = vmlaq_lane_f32(temp, B[2], high, 0);
r2   = vmlaq_lane_f32(temp, B[3], high, 1);

low  = vget_low_f32(A[3]);
high = vget_high_f32(A[3]);
temp = vmulq_lane_f32(      B[0], low, 0);
temp = vmlaq_lane_f32(temp, B[1], low, 1);
temp = vmlaq_lane_f32(temp, B[2], high, 0);
r3   = vmlaq_lane_f32(temp, B[3], high, 1);

R[0] = r0;
R[1] = r1;
R[2] = r2;
R[3] = r3;
}
``````

The code below is my plain matrix multiply, which uses a `float[16]` array:

``````
/// Multiplies two 4x4 float matrices stored as flat arrays of 16 floats.
///
/// Output element [4 * c + r] is the sum over k of m1[4 * k + r] * m2[4 * c + k].
/// The result is assembled in a local scratch array and copied out at the
/// end, so ret is not written until every input element has been read.
///
/// @param ret  Destination, 16 floats.
/// @param m1   Left operand, 16 floats.
/// @param m2   Right operand, 16 floats.
void matrixMultiply(float* ret, float* m1, float* m2) {
    float product[16];

    for (int c = 0; c < 4; ++c) {
        // The four m2 entries that weight this group of four outputs.
        const float* rhs = m2 + 4 * c;
        for (int r = 0; r < 4; ++r) {
            // Terms are added left to right to keep the rounding order fixed.
            product[4 * c + r] = m1[r] * rhs[0] + m1[r + 4] * rhs[1]
                               + m1[r + 8] * rhs[2] + m1[r + 12] * rhs[3];
        }
    }

    memcpy(ret, product, sizeof(float) * 16);
}
``````

The test is a for loop of 1024*1024 iterations; the results are:
NO NEON 366ms
NEON 428ms

Why is the NEON code slower, and how can it be optimized?

Source: Windows Questions C++