Thiết kế website giá rẻ

Question

I’m having some issues using Neon operations to multiply two 4×4 int32_t matrices. I tried using the following code:

#include <arm_neon.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>

void multiply_matrix_neon(int32_t *A, int32_t *B, int32_t *C, int n) {
    // Load the rows of matrix A
    int32x4_t A0 = vld1q_s32(A);
    int32x4_t A1 = vld1q_s32(A + 4);
    int32x4_t A2 = vld1q_s32(A + 8);
    int32x4_t A3 = vld1q_s32(A + 12);

    // Load the columns of matrix B
    int32x4_t B0 = vld1q_s32(B);
    int32x4_t B1 = vld1q_s32(B + 4);
    int32x4_t B2 = vld1q_s32(B + 8);
    int32x4_t B3 = vld1q_s32(B + 12);

    // Initialize accumulators for matrix C
    int32x4_t C0 = vmovq_n_s32(0);
    int32x4_t C1 = vmovq_n_s32(0);
    int32x4_t C2 = vmovq_n_s32(0);
    int32x4_t C3 = vmovq_n_s32(0);

    // Compute C0 = A * B0
    C0 = vmlaq_n_s32(C0, A0, vgetq_lane_s32(B0, 0));
    C0 = vmlaq_n_s32(C0, A1, vgetq_lane_s32(B0, 1));
    C0 = vmlaq_n_s32(C0, A2, vgetq_lane_s32(B0, 2));
    C0 = vmlaq_n_s32(C0, A3, vgetq_lane_s32(B0, 3));

    // Compute C1 = A * B1
    C1 = vmlaq_n_s32(C1, A0, vgetq_lane_s32(B1, 0));
    C1 = vmlaq_n_s32(C1, A1, vgetq_lane_s32(B1, 1));
    C1 = vmlaq_n_s32(C1, A2, vgetq_lane_s32(B1, 2));
    C1 = vmlaq_n_s32(C1, A3, vgetq_lane_s32(B1, 3));

    // Compute C2 = A * B2
    C2 = vmlaq_n_s32(C2, A0, vgetq_lane_s32(B2, 0));
    C2 = vmlaq_n_s32(C2, A1, vgetq_lane_s32(B2, 1));
    C2 = vmlaq_n_s32(C2, A2, vgetq_lane_s32(B2, 2));
    C2 = vmlaq_n_s32(C2, A3, vgetq_lane_s32(B2, 3));

    // Compute C3 = A * B3
    C3 = vmlaq_n_s32(C3, A0, vgetq_lane_s32(B3, 0));
    C3 = vmlaq_n_s32(C3, A1, vgetq_lane_s32(B3, 1));
    C3 = vmlaq_n_s32(C3, A2, vgetq_lane_s32(B3, 2));
    C3 = vmlaq_n_s32(C3, A3, vgetq_lane_s32(B3, 3));

    // Store the results into C
    vst1q_s32(C, C0);
    vst1q_s32(C + 4, C1);
    vst1q_s32(C + 8, C2);
    vst1q_s32(C + 12, C3);
}

void multiply_matrix_normal(int32_t *A, int32_t *B, int32_t *result, int n) {
    int32_t temp[16] = {0}; // Using int32_t to prevent overflow during multiplication

    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            for (int k = 0; k < n; ++k) {
                temp[i * n + j] += A[i * n + k] * B[k * n + j];
            }
        }
    }

    memcpy(result, temp, n * n * sizeof(int32_t));
}

// Function to print a matrix
void print_matrix(int32_t* matrix, int n) {
    for (size_t i = 0; i < n; i++) {
        for (size_t j = 0; j < n; j++) {
            printf("%d ", matrix[i * n + j]);
        }
        printf("n");
    }
    printf("n");
}

int main() {
    // Define two 4x4 matrices
    int32_t A[16] = {
        1, 2, 3, 4,
        5, 6, 7, 8,
        9, 10, 11, 12,
        13, 14, 15, 16
    };

    int32_t B[16] = {
        17, 18, 19, 20,
        21, 22, 23, 24,
        25, 26, 27, 28,
        29, 30, 31, 32
    };

    int32_t neon[16] = {0};
    multiply_matrix_neon(A, B, neon, 4);
    printf("Neon matrix:n");
    print_matrix(neon, 4);

    int32_t normal[16] = {0};
    multiply_matrix_normal(A, B, normal, 4);
    printf("Normal matrix:n");
    print_matrix(normal, 4);

    return 0;
}

Which I adapted from following the ARM developer website (which uses floats): https://developer.arm.com/documentation/102467/latest/Example—matrix-multiplication
(Note here that I cannot simply do C0 = vfmaq_laneq_s32(C0, A0, B0, 0); as they do because that instruction is not compatible with ARMv7)

And this GitHub that I found, which uses int32s: https://github.com/ruthreshx/Matrix_Multiply_using_Arm_Neon_and_Avx/blob/main/int32mul.c

However, the functions give different results, with the normal one being correct.

Neon matrix:
538 612 686 760
650 740 830 920
762 868 974 1080
874 996 1118 1240

Normal matrix:
250 260 270 280
618 644 670 696
986 1028 1070 1112
1354 1412 1470 1528

Does anyone have a working solution for multiplying 2 4×4 int32_t matrices using Neon?

Thiết kế website giá rẻ

Danh mục

Matrix Multiplication with Neon on ARMv7-a (4×4 int32_t)