Why is a 4x4 float32 matrix multiply using ARMv8 NEON instructions slower than the scalar version?

  android, arm, c++, neon, simd

The code below uses NEON instructions (taken from UE4):

void matrixMultiplyNeon(float* ret, float32x4_t* A, float32x4_t* B) {

    float32x4_t * R = (float32x4_t*)ret;
    float32x4_t temp, r0, r1, r2, r3;

    auto low  = vget_low_f32(A[0]);
    auto high = vget_high_f32(A[0]);
    temp = vmulq_lane_f32(      B[0], low, 0);
    temp = vmlaq_lane_f32(temp, B[1], low, 1);
    temp = vmlaq_lane_f32(temp, B[2], high, 0);
    r0   = vmlaq_lane_f32(temp, B[3], high, 1);

    low  = vget_low_f32(A[1]);
    high = vget_high_f32(A[1]);
    temp = vmulq_lane_f32(      B[0], low, 0);
    temp = vmlaq_lane_f32(temp, B[1], low, 1);
    temp = vmlaq_lane_f32(temp, B[2], high, 0);
    r1   = vmlaq_lane_f32(temp, B[3], high, 1);

    low  = vget_low_f32(A[2]);
    high = vget_high_f32(A[2]);
    temp = vmulq_lane_f32(      B[0], low, 0);
    temp = vmlaq_lane_f32(temp, B[1], low, 1);
    temp = vmlaq_lane_f32(temp, B[2], high, 0);
    r2   = vmlaq_lane_f32(temp, B[3], high, 1);

    low  = vget_low_f32(A[3]);
    high = vget_high_f32(A[3]);
    temp = vmulq_lane_f32(      B[0], low, 0);
    temp = vmlaq_lane_f32(temp, B[1], low, 1);
    temp = vmlaq_lane_f32(temp, B[2], high, 0);
    r3   = vmlaq_lane_f32(temp, B[3], high, 1);

    R[0] = r0;
    R[1] = r1;
    R[2] = r2;
    R[3] = r3;
}

The code below is my plain (non-NEON) matrix multiply, which operates on float[16] arrays:

// Scalar 4x4 float matrix product on flat 16-element arrays.
//
// For every c, r in 0..3:
//     ret[4*c + r] = m1[r]*m2[4*c] + m1[r+4]*m2[4*c+1]
//                  + m1[r+8]*m2[4*c+2] + m1[r+12]*m2[4*c+3]
// i.e. m1 * m2 when the arrays are read as column-major 4x4 matrices.
//
// ret : receives the 16 result floats.
// m1  : left operand, 16 floats.
// m2  : right operand, 16 floats.
void matrixMultiply(float* ret, float* m1, float* m2) {
    // Accumulate into a scratch buffer so that ret may be the same array as
    // m1 or m2; the inputs are fully read before ret is written.
    float product[16];

    for (int group = 0; group < 4; ++group) {
        // The four m2 entries that weight this group of outputs.
        const float* weights = m2 + 4 * group;
        float* out = product + 4 * group;

        for (int r = 0; r < 4; ++r) {
            out[r] = m1[r] * weights[0] + m1[r + 4] * weights[1] + m1[r + 8] * weights[2] + m1[r + 12] * weights[3];
        }
    }

    memcpy(ret, product, sizeof(float) * 16);
}

The test is a for loop that calls each function 1024*1024 times; the results are:
NO NEON 366ms
NEON 428ms

Why is the NEON code slower, and how can it be optimized?

Source: Windows Questions C++

LEAVE A COMMENT