Thiết kế website giá rẻ

Question

This is one of my first pieces of NEON code. I would like to know if you have any comments or suggestions for making it run faster; speed is the reason I translated the code to such a low level.

I posted the code on the Code Review site, but I didn’t get any reply, and mine was the only post there about NEON, so I decided to post it here.

Here is the working code. It applies a lookup table to an image. My goal is to make it run faster on an ARM Cortex-A53 CPU.

#include <opencv2/opencv.hpp>
#include <iostream>
#include <vector>
#include <numeric> // For std::iota
#include <array> // For std::array

#ifdef __ARM_NEON
#include <arm_neon.h>
#else
#error "ARM Compiler required."
#endif

// Cached per-channel lookup tables, one 256-entry table per colour channel.
// Every table is value-initialized (all zeros) so that a Cache that is read
// before compute_data() has filled it never exposes indeterminate bytes.
struct Cache {
    std::array<uchar, 256> lut_b{};  // table applied to channel 0
    std::array<uchar, 256> lut_g{};  // table applied to channel 1
    std::array<uchar, 256> lut_r{};  // table applied to channel 2
};

// Builds the example lookup tables: all three channel tables receive the
// identity mapping (entry i holds the value i). The image argument is
// accepted but not read by this example implementation.
void compute_data(const cv::Mat& image, Cache& cache) {
    // Write 0, 1, ..., 255 into the first table, then replicate it.
    std::iota(cache.lut_b.begin(), cache.lut_b.end(), uchar{0});
    cache.lut_g = cache.lut_b;
    cache.lut_r = cache.lut_b;
}

// Looks up 16 byte indices at once in a 256-entry table: result[i] = table[vx[i]].
// The table is passed as four 64-byte quarters, one uint8x16x4_t per quarter.
static inline uint8x16_t lookup_neon(const uint8x16x4_t vtable[4], uint8x16_t vx) {
    const uint8x16_t quarter_span = vmovq_n_u8(64);

    // Quarter 0: the plain table lookup produces 0 in every lane whose index
    // lies outside 0..63.
    uint8x16_t result = vqtbl4q_u8(vtable[0], vx);

    // Quarters 1..3: shift the indices down by 64 each time (wrapping modulo
    // 256) and use the extension lookup, which only overwrites lanes whose
    // shifted index falls inside 0..63 and keeps the other lanes as they are.
    for (int quarter = 1; quarter < 4; ++quarter) {
        vx = vsubq_u8(vx, quarter_span);
        result = vqtbx4q_u8(result, vtable[quarter], vx);
    }
    return result;
}

// Applies the per-channel lookup tables stored in `cache` to `image`, in place.
//
// image     - 8-bit, 3-channel interleaved image (CV_8UC3). Channel 0 is mapped
//             through lut_b, channel 1 through lut_g and channel 2 through
//             lut_r. An empty image is left untouched.
// cache     - the lookup tables; rebuilt with compute_data() when use_cache is
//             false, otherwise used as they are.
// use_cache - true to reuse the tables already stored in `cache`.
//
// Raises an OpenCV assertion failure (CV_Assert) when a non-empty image is not
// CV_8UC3: the loops below hard-code three interleaved 8-bit channels, so any
// other format would read and write outside the row.
void hist(cv::Mat& image, Cache& cache, bool use_cache) {
    CV_Assert(image.empty() || image.type() == CV_8UC3);

    if (!use_cache) {
        compute_data(image, cache);
    }

    // Load every table into vectors: 4 groups of 4 x 128-bit per table.
    // NOTE(review): three tables add up to 48 vectors, more than the 32 SIMD
    // registers AArch64 provides, so the compiler presumably spills some of
    // them -- worth confirming in the generated assembly.
    const uint8x16x4_t vtable_b[4] = {
        vld1q_u8_x4(cache.lut_b.data() + 16 * 4 * 0),
        vld1q_u8_x4(cache.lut_b.data() + 16 * 4 * 1),
        vld1q_u8_x4(cache.lut_b.data() + 16 * 4 * 2),
        vld1q_u8_x4(cache.lut_b.data() + 16 * 4 * 3)
    };
    const uint8x16x4_t vtable_g[4] = {
        vld1q_u8_x4(cache.lut_g.data() + 16 * 4 * 0),
        vld1q_u8_x4(cache.lut_g.data() + 16 * 4 * 1),
        vld1q_u8_x4(cache.lut_g.data() + 16 * 4 * 2),
        vld1q_u8_x4(cache.lut_g.data() + 16 * 4 * 3)
    };
    const uint8x16x4_t vtable_r[4] = {
        vld1q_u8_x4(cache.lut_r.data() + 16 * 4 * 0),
        vld1q_u8_x4(cache.lut_r.data() + 16 * 4 * 1),
        vld1q_u8_x4(cache.lut_r.data() + 16 * 4 * 2),
        vld1q_u8_x4(cache.lut_r.data() + 16 * 4 * 3)
    };

    constexpr int kPixelsPerStep = 16;  // pixels handled per vector iteration
    constexpr int kChannels = 3;        // interleaved bytes per pixel

    for (int i = 0; i < image.rows; ++i) {
        uint8_t* row_ptr = image.ptr(i);
        int j = 0;
        // Vector part: whole groups of 16 pixels.
        for (; (j + kPixelsPerStep) <= image.cols; j += kPixelsPerStep) {
            // Load 48 bytes and de-interleave them into one vector per channel.
            uint8x16x3_t vec = vld3q_u8(row_ptr);
            vec.val[0] = lookup_neon(vtable_b, vec.val[0]);
            vec.val[1] = lookup_neon(vtable_g, vec.val[1]);
            vec.val[2] = lookup_neon(vtable_r, vec.val[2]);
            // Re-interleave the three channels and store them back in place.
            vst3q_u8(row_ptr, vec);
            row_ptr += kChannels * kPixelsPerStep;
        }
        // Scalar part: the remaining (cols % 16) pixels of the row.
        for (; j < image.cols; ++j) {
            row_ptr[0] = cache.lut_b[row_ptr[0]];
            row_ptr[1] = cache.lut_g[row_ptr[1]];
            row_ptr[2] = cache.lut_r[row_ptr[2]];
            row_ptr += kChannels;
        }
    }
}

// Plays "video.mp4" in a window, running every frame through hist(). The
// lookup tables are recomputed on every 5th frame and reused from the cache
// on the frames in between. Pressing 'q' stops playback.
// Returns 0 on normal termination and -1 if the video cannot be opened.
int main(int argc, char** argv) {
    // Open the video file
    cv::VideoCapture cap("video.mp4");
    if (!cap.isOpened()) {
        std::cerr << "Error opening video file" << std::endl;
        return -1;
    }

    // Derive the per-frame display delay from the frame rate of the video.
    // The reported rate may be 0 when the backend cannot provide it; dividing
    // by it would yield an infinite value whose conversion to int is
    // undefined, so an unusable rate falls back to a fixed delay instead.
    constexpr int kFallbackDelayMs = 33;    // roughly 30 frames per second
    constexpr int kMaxDelayMs = 60 * 1000;  // keeps the conversion to int in range
    const double fps = cap.get(cv::CAP_PROP_FPS);
    int delay = kFallbackDelayMs;
    if (fps > 0.0) {
        const double frame_ms = 1000.0 / fps;
        if (frame_ms < 1.0) {
            // A delay of 0 would make waitKey() block until a key is pressed.
            delay = 1;
        } else if (frame_ms < kMaxDelayMs) {
            delay = static_cast<int>(frame_ms);
        } else {
            delay = kMaxDelayMs;
        }
    }

    // Create a window to display the video
    cv::namedWindow("Processed Video", cv::WINDOW_NORMAL);

    cv::Mat frame;
    Cache cache;
    int frame_count = 0;
    const int recompute_interval = 5; // Recompute every 5 frames

    while (true) {
        cap >> frame;
        if (frame.empty()) {
            break;
        }

        // Frame 0 and every recompute_interval-th frame rebuild the tables;
        // all other frames reuse the cached ones.
        const bool use_cache = (frame_count % recompute_interval != 0);

        // Process the frame using cached or recomputed parameters
        hist(frame, cache, use_cache);

        // Display the processed frame
        cv::imshow("Processed Video", frame);

        // Break the loop if 'q' is pressed
        if (cv::waitKey(delay) == 'q') {
            break;
        }

        frame_count++;
    }

    cap.release();
    cv::destroyAllWindows();

    return 0;
}

Thiết kế website giá rẻ

Danh mục

Optimizing a for loop with lookup-table using ARM Neon instructions