This is one of my first pieces of NEON code, and I would like to know if you have any comments or suggestions for making it run faster; that is why I rewrote the code at such a low level.
I posted the code on the Code Review site, but I did not get any replies, and mine was the only post there about NEON, so I decided to post it here.
Here is the working code. It applies a lookup table to an image. My goal is to make the code run faster on an ARM Cortex-A53 CPU.
#include <opencv2/opencv.hpp>
#include <iostream>
#include <vector>
#include <numeric> // For std::iota
#include <array> // Structure to hold cached parameters
#ifdef __ARM_NEON
#include <arm_neon.h>
#else
#error "ARM Compiler required."
#endif
struct Cache {
std::array<uchar, 256> lut_b;
std::array<uchar, 256> lut_g;
std::array<uchar, 256> lut_r;
};
// Builds the example lookup tables: every channel table in `cache` becomes the
// identity mapping (entry i holds the value i).
//
// image - accepted for interface purposes; this example does not read it.
// cache - receives the three freshly filled tables.
void compute_data(const cv::Mat& image, Cache& cache) {
    for (auto* table : {&cache.lut_b, &cache.lut_g, &cache.lut_r}) {
        // 256 entries starting at 0 -> 0, 1, ..., 255.
        std::iota(table->begin(), table->end(), static_cast<uchar>(0));
    }
}
// Vector table lookup over a full 256-entry table: result[i] = table[vx[i]].
//
// vtable - the 256-byte table split into four consecutive 64-byte banks.
// vx     - sixteen 8-bit indices.
//
// A single TBL/TBX instruction can only address 64 bytes, so the lookup is
// chained across the four banks. For each following bank the indices are
// shifted down by 64 (with 8-bit wrap-around); TBX only overwrites lanes whose
// shifted index falls inside 0..63 and leaves every other lane as it was.
static inline uint8x16_t lookup_neon(const uint8x16x4_t vtable[4], uint8x16_t vx) {
    const uint8x16_t bank_size = vdupq_n_u8(64);

    // Indices rebased to the start of banks 1, 2 and 3.
    const uint8x16_t idx_bank1 = vsubq_u8(vx, bank_size);
    const uint8x16_t idx_bank2 = vsubq_u8(idx_bank1, bank_size);
    const uint8x16_t idx_bank3 = vsubq_u8(idx_bank2, bank_size);

    // Bank 0 uses TBL: lanes whose index is outside 0..63 become 0 for now.
    uint8x16_t looked_up = vqtbl4q_u8(vtable[0], vx);
    looked_up = vqtbx4q_u8(looked_up, vtable[1], idx_bank1);
    looked_up = vqtbx4q_u8(looked_up, vtable[2], idx_bank2);
    looked_up = vqtbx4q_u8(looked_up, vtable[3], idx_bank3);
    return looked_up;
}
// Applies the per-channel lookup tables in `cache` to `image`, in place.
//
// image     - interleaved 8-bit, 3-channel frame (CV_8UC3). An image with no
//             rows or no columns is left untouched.
// cache     - lookup tables; rebuilt through compute_data() first when
//             `use_cache` is false, otherwise used as they are.
// use_cache - true to reuse the tables already stored in `cache`.
//
// Raises cv::Exception (via CV_Assert) when a non-empty image is not CV_8UC3:
// the loops below step through three bytes per pixel and would otherwise run
// past the end of each row.
void hist(cv::Mat& image, Cache& cache, bool use_cache) {
    if (!use_cache) {
        compute_data(image, cache);
    }
    if (image.rows <= 0 || image.cols <= 0) {
        return;  // nothing to transform
    }
    CV_Assert(image.type() == CV_8UC3);

    // Load each 256-byte table as four 64-byte banks, the layout lookup_neon() expects.
    const uint8x16x4_t vtable_b[4] = {
        vld1q_u8_x4(cache.lut_b.data() + 16 * 4 * 0),
        vld1q_u8_x4(cache.lut_b.data() + 16 * 4 * 1),
        vld1q_u8_x4(cache.lut_b.data() + 16 * 4 * 2),
        vld1q_u8_x4(cache.lut_b.data() + 16 * 4 * 3)
    };
    const uint8x16x4_t vtable_g[4] = {
        vld1q_u8_x4(cache.lut_g.data() + 16 * 4 * 0),
        vld1q_u8_x4(cache.lut_g.data() + 16 * 4 * 1),
        vld1q_u8_x4(cache.lut_g.data() + 16 * 4 * 2),
        vld1q_u8_x4(cache.lut_g.data() + 16 * 4 * 3)
    };
    const uint8x16x4_t vtable_r[4] = {
        vld1q_u8_x4(cache.lut_r.data() + 16 * 4 * 0),
        vld1q_u8_x4(cache.lut_r.data() + 16 * 4 * 1),
        vld1q_u8_x4(cache.lut_r.data() + 16 * 4 * 2),
        vld1q_u8_x4(cache.lut_r.data() + 16 * 4 * 3)
    };

    constexpr std::size_t kPixelsPerVector = 16;
    constexpr std::size_t kChannels = 3;

    // A continuous Mat has no padding between rows, so it can be walked as one
    // long row. The scalar tail then runs at most once per frame instead of
    // once per row.
    std::size_t row_count = static_cast<std::size_t>(image.rows);
    std::size_t pixels_per_row = static_cast<std::size_t>(image.cols);
    if (image.isContinuous()) {
        pixels_per_row *= row_count;
        row_count = 1;
    }

    for (std::size_t row = 0; row < row_count; ++row) {
        uint8_t* px = image.ptr(static_cast<int>(row));
        std::size_t remaining = pixels_per_row;

        // Vector path: 16 pixels (48 bytes) per iteration.
        for (; remaining >= kPixelsPerVector; remaining -= kPixelsPerVector) {
            // Load and de-interleave into one vector per channel.
            uint8x16x3_t vec = vld3q_u8(px);
            vec.val[0] = lookup_neon(vtable_b, vec.val[0]);
            vec.val[1] = lookup_neon(vtable_g, vec.val[1]);
            vec.val[2] = lookup_neon(vtable_r, vec.val[2]);
            // Re-interleave and store back over the same pixels.
            vst3q_u8(px, vec);
            px += kChannels * kPixelsPerVector;
        }

        // Scalar path for the fewer-than-16 leftover pixels.
        for (; remaining > 0; --remaining) {
            px[0] = cache.lut_b[px[0]];
            px[1] = cache.lut_g[px[1]];
            px[2] = cache.lut_r[px[2]];
            px += kChannels;
        }
    }
}
// Plays "video.mp4": every frame is passed through hist() and shown in a
// window until the video ends or 'q' is pressed. The lookup tables are
// recomputed on every 5th frame and reused for the frames in between.
//
// Returns -1 if the video cannot be opened, 0 otherwise.
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    // Open the video file
    cv::VideoCapture cap("video.mp4");
    if (!cap.isOpened()) {
        std::cerr << "Error opening video file" << '\n';
        return -1;
    }

    // Derive the per-frame wait from the reported frame rate. The reported
    // rate may be zero, negative or NaN, in which case a fixed fallback is
    // used. The wait is also kept at 1 ms or more, because cv::waitKey(0)
    // would block until a key is pressed.
    constexpr int kFallbackDelayMs = 33;
    constexpr double kMaxDelayMs = 60000.0;
    const double fps = cap.get(cv::CAP_PROP_FPS);
    int delay = kFallbackDelayMs;
    if (fps > 0.0) {  // false for NaN as well
        const double frame_period_ms = 1000.0 / fps;
        if (frame_period_ms < 1.0) {
            delay = 1;
        } else if (frame_period_ms <= kMaxDelayMs) {
            delay = static_cast<int>(frame_period_ms);
        }
        // An implausibly long period keeps the fallback rather than being
        // converted to int out of range.
    }

    // Create a window to display the video
    cv::namedWindow("Processed Video", cv::WINDOW_NORMAL);

    cv::Mat frame;
    Cache cache;
    int frame_count = 0;
    const int recompute_interval = 5;  // Recompute every 5 frames

    while (true) {
        cap >> frame;
        if (frame.empty()) {
            break;  // end of stream
        }

        // Determine whether to use the cache or recompute the data
        const bool use_cache = (frame_count % recompute_interval != 0);

        // Process the frame using cached or recomputed parameters
        hist(frame, cache, use_cache);

        // Display the processed frame
        cv::imshow("Processed Video", frame);

        // Break the loop if 'q' is pressed
        if (cv::waitKey(delay) == 'q') {
            break;
        }
        frame_count++;
    }

    cap.release();
    cv::destroyAllWindows();
    return 0;
}