I’m having some issues using Neon operations to multiply two 4×4 int32_t matrices. I tried using the following code:
#include <arm_neon.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
void multiply_matrix_neon(int32_t *A, int32_t *B, int32_t *C, int n) {
// Load the rows of matrix A
int32x4_t A0 = vld1q_s32(A);
int32x4_t A1 = vld1q_s32(A + 4);
int32x4_t A2 = vld1q_s32(A + 8);
int32x4_t A3 = vld1q_s32(A + 12);
// Load the columns of matrix B
int32x4_t B0 = vld1q_s32(B);
int32x4_t B1 = vld1q_s32(B + 4);
int32x4_t B2 = vld1q_s32(B + 8);
int32x4_t B3 = vld1q_s32(B + 12);
// Initialize accumulators for matrix C
int32x4_t C0 = vmovq_n_s32(0);
int32x4_t C1 = vmovq_n_s32(0);
int32x4_t C2 = vmovq_n_s32(0);
int32x4_t C3 = vmovq_n_s32(0);
// Compute C0 = A * B0
C0 = vmlaq_n_s32(C0, A0, vgetq_lane_s32(B0, 0));
C0 = vmlaq_n_s32(C0, A1, vgetq_lane_s32(B0, 1));
C0 = vmlaq_n_s32(C0, A2, vgetq_lane_s32(B0, 2));
C0 = vmlaq_n_s32(C0, A3, vgetq_lane_s32(B0, 3));
// Compute C1 = A * B1
C1 = vmlaq_n_s32(C1, A0, vgetq_lane_s32(B1, 0));
C1 = vmlaq_n_s32(C1, A1, vgetq_lane_s32(B1, 1));
C1 = vmlaq_n_s32(C1, A2, vgetq_lane_s32(B1, 2));
C1 = vmlaq_n_s32(C1, A3, vgetq_lane_s32(B1, 3));
// Compute C2 = A * B2
C2 = vmlaq_n_s32(C2, A0, vgetq_lane_s32(B2, 0));
C2 = vmlaq_n_s32(C2, A1, vgetq_lane_s32(B2, 1));
C2 = vmlaq_n_s32(C2, A2, vgetq_lane_s32(B2, 2));
C2 = vmlaq_n_s32(C2, A3, vgetq_lane_s32(B2, 3));
// Compute C3 = A * B3
C3 = vmlaq_n_s32(C3, A0, vgetq_lane_s32(B3, 0));
C3 = vmlaq_n_s32(C3, A1, vgetq_lane_s32(B3, 1));
C3 = vmlaq_n_s32(C3, A2, vgetq_lane_s32(B3, 2));
C3 = vmlaq_n_s32(C3, A3, vgetq_lane_s32(B3, 3));
// Store the results into C
vst1q_s32(C, C0);
vst1q_s32(C + 4, C1);
vst1q_s32(C + 8, C2);
vst1q_s32(C + 12, C3);
}
void multiply_matrix_normal(int32_t *A, int32_t *B, int32_t *result, int n) {
int32_t temp[16] = {0}; // Using int32_t to prevent overflow during multiplication
for (int i = 0; i < n; ++i) {
for (int j = 0; j < n; ++j) {
for (int k = 0; k < n; ++k) {
temp[i * n + j] += A[i * n + k] * B[k * n + j];
}
}
}
memcpy(result, temp, n * n * sizeof(int32_t));
}
// Function to print a matrix
void print_matrix(int32_t* matrix, int n) {
for (size_t i = 0; i < n; i++) {
for (size_t j = 0; j < n; j++) {
printf("%d ", matrix[i * n + j]);
}
printf("n");
}
printf("n");
}
int main() {
// Define two 4x4 matrices
int32_t A[16] = {
1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 14, 15, 16
};
int32_t B[16] = {
17, 18, 19, 20,
21, 22, 23, 24,
25, 26, 27, 28,
29, 30, 31, 32
};
int32_t neon[16] = {0};
multiply_matrix_neon(A, B, neon, 4);
printf("Neon matrix:n");
print_matrix(neon, 4);
int32_t normal[16] = {0};
multiply_matrix_normal(A, B, normal, 4);
printf("Normal matrix:n");
print_matrix(normal, 4);
return 0;
}
Which I adapted from following the ARM developer website (which uses floats): https://developer.arm.com/documentation/102467/latest/Example—matrix-multiplication
(Note here that I cannot simply do C0 = vfmaq_laneq_s32(C0, A0, B0, 0);
as they do because that instruction is not compatible with ARMv7)
And this GitHub that I found, which uses int32s: https://github.com/ruthreshx/Matrix_Multiply_using_Arm_Neon_and_Avx/blob/main/int32mul.c
However, the functions give different results, with the normal
one being correct.
Neon matrix:
538 612 686 760
650 740 830 920
762 868 974 1080
874 996 1118 1240
Normal matrix:
250 260 270 280
618 644 670 696
986 1028 1070 1112
1354 1412 1470 1528
Does anyone have a working solution for multiplying 2 4×4 int32_t matrices using Neon?