Hanjie's Blog

macOS M1平台下编译使用ONNX Runtime

Posted on 2024-06-25 In Tech , Mac Disqus:

# Shallow-clone the v1.18.0 tag together with its submodules.
git clone --recursive --depth=1 --branch v1.18.0 https://github.com/microsoft/onnxruntime.git   
cd onnxruntime  

# Create and activate a dedicated conda environment (Python 3.10) for the build.
conda create --name onnx python=3.10 
conda activate onnx

# If protobuf or nlohmann_json is already installed on the system, it conflicts at build time
# with the versions onnxruntime uses internally; to avoid this, remove them temporarily before building.
# brew remove protobuf nlohmann-json

# Release build of the shared library with --use_coreml enabled, targeting the arm64 architecture.
./build.sh --config Release --use_coreml --build_shared_lib --parallel --compile_no_warning_as_error --skip_submodule_sync --cmake_extra_defines CMAKE_OSX_ARCHITECTURES=arm64

Note that when `--config` is omitted, build.sh produces a Debug build, which may have performance tradeoffs; the command above therefore passes `--config Release` explicitly. The `--config` parameter has four valid values: Debug, Release, RelWithDebInfo and MinSizeRel. Compared to "Release", "RelWithDebInfo" not only has debug info, it also disables some inlining to make the binary easier to debug. Thus RelWithDebInfo is slower than Release.

Cross Android Compiling

# Cross-compile a Release shared library for Android (ABI arm64-v8a, API level 29) using the given SDK and NDK paths.
./build.sh --parallel --config Release --android --build_shared_lib --android_sdk_path /Users/luohanjie/Library/Android/sdk --android_ndk_path /Users/luohanjie/Library/Android/sdk/ndk/27.0.12077973 --android_abi arm64-v8a --android_api 29

Please use NDK 26 or newer (the command above uses NDK 27.0.12077973), as it ships a version of clang that handles the bfloat16 type.

使用NEON指令集优化矩阵运算代码

Posted on 2024-05-29 In Tech , Code Disqus:

有一个矩阵运算\(Y=J^{T}MJ\)，其中\(J\)为[30576, 8]大小的矩阵，\(M\)是[30576, 30576]的对角矩阵，最终输出的\(Y\)为[8, 8]大小的对称矩阵。使用Eigen库，如下实现矩阵运算：

Y.noalias() = J.transpose() * M.asDiagonal() * J; // 这里M为一个30576长的vector

我们希望能够通过NEON指令集实现该矩阵运算，获得更短的运算耗时。

NEON优化版

/**
 * @brief Computes At_M_A = A^T * diag(M) * A using NEON intrinsics.
 *
 * The result is accumulated in 4x4 tiles. Only tiles on or above the block
 * diagonal are computed; the strictly lower triangle is filled in afterwards
 * by mirroring, relying on the symmetry of the product.
 *
 * @param A       Column-major [k x n] matrix. The output is indexed with a
 *                column stride of n, so n must equal the 8 columns of At_M_A.
 * @param M       Diagonal weights; one entry per row of A.
 * @param At_M_A  Output 8x8 matrix, fully overwritten.
 *
 * Rows of A are consumed four at a time by the vector loop; any remaining
 * k % 4 rows are handled by a scalar tail loop.
 */
void CalcAtMA(const Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor> &A,
              const Eigen::VectorXf &M,
              Eigen::Matrix<float, 8, 8> &At_M_A) {
    // Reference: https://developer.arm.com/documentation/102107a/0100/Single-precision-4x4-matrix-multiplication

    // [8, 8] = [8, 30576] * [30576, 30576] * [30576, 8]
    //           n    k       k        k        k     m
    const int n = static_cast<int>(A.cols());  // = m
    const int k = static_cast<int>(A.rows());

    // The tile stores below assume the output has exactly n columns and that
    // every row of A has a matching weight.
    eigen_assert(A.cols() == At_M_A.cols());
    eigen_assert(M.size() == A.rows());

    const float *A_ptr = A.data();
    const float *M_ptr = M.data();
    float *At_M_A_ptr = At_M_A.data();

    for (int i_idx = 0; i_idx < n; i_idx += 4) {
        for (int j_idx = i_idx; j_idx < n; j_idx += 4) {
            // Accumulators: the four columns of the current 4x4 output tile.
            float32x4_t At_M_A0 = vmovq_n_f32(0);
            float32x4_t At_M_A1 = vmovq_n_f32(0);
            float32x4_t At_M_A2 = vmovq_n_f32(0);
            float32x4_t At_M_A3 = vmovq_n_f32(0);

            // Vector loop over full groups of four rows. Written as
            // `k_idx + 4 <= k` so that k < 4 simply skips the loop instead of
            // underflowing an unsigned bound.
            int k_idx = 0;
            for (; k_idx + 4 <= k; k_idx += 4) {
                // Base offsets of the 4x4 sub-blocks inside column-major A.
                const int At_idx = k * i_idx + k_idx;
                const int A_idx = k * j_idx + k_idx;

                // Load rows k_idx..k_idx+3 of columns i_idx..i_idx+3 of A.
                float32x4_t At0 = vld1q_f32(A_ptr + At_idx);
                float32x4_t At1 = vld1q_f32(A_ptr + At_idx + k);
                float32x4_t At2 = vld1q_f32(A_ptr + At_idx + 2 * k);
                float32x4_t At3 = vld1q_f32(A_ptr + At_idx + 3 * k);

                // After the in-place transpose, AtX holds row (k_idx + X) of A
                // across columns i_idx..i_idx+3, i.e. a column of A^T.
                MatTransposeInp4x4NeonF32(At0, At1, At2, At3, At0, At1, At2, At3);

                // Apply the diagonal weights: column (k_idx + X) of A^T is scaled by M[k_idx + X].
                At0 = vmulq_n_f32(At0, M_ptr[k_idx]);
                At1 = vmulq_n_f32(At1, M_ptr[k_idx + 1]);
                At2 = vmulq_n_f32(At2, M_ptr[k_idx + 2]);
                At3 = vmulq_n_f32(At3, M_ptr[k_idx + 3]);

                // Multiply-accumulate one output column at a time.
                const float32x4_t A0 = vld1q_f32(A_ptr + A_idx);
                At_M_A0 = vfmaq_laneq_f32(At_M_A0, At0, A0, 0);
                At_M_A0 = vfmaq_laneq_f32(At_M_A0, At1, A0, 1);
                At_M_A0 = vfmaq_laneq_f32(At_M_A0, At2, A0, 2);
                At_M_A0 = vfmaq_laneq_f32(At_M_A0, At3, A0, 3);

                const float32x4_t A1 = vld1q_f32(A_ptr + A_idx + k);
                At_M_A1 = vfmaq_laneq_f32(At_M_A1, At0, A1, 0);
                At_M_A1 = vfmaq_laneq_f32(At_M_A1, At1, A1, 1);
                At_M_A1 = vfmaq_laneq_f32(At_M_A1, At2, A1, 2);
                At_M_A1 = vfmaq_laneq_f32(At_M_A1, At3, A1, 3);

                const float32x4_t A2 = vld1q_f32(A_ptr + A_idx + 2 * k);
                At_M_A2 = vfmaq_laneq_f32(At_M_A2, At0, A2, 0);
                At_M_A2 = vfmaq_laneq_f32(At_M_A2, At1, A2, 1);
                At_M_A2 = vfmaq_laneq_f32(At_M_A2, At2, A2, 2);
                At_M_A2 = vfmaq_laneq_f32(At_M_A2, At3, A2, 3);

                const float32x4_t A3 = vld1q_f32(A_ptr + A_idx + 3 * k);
                At_M_A3 = vfmaq_laneq_f32(At_M_A3, At0, A3, 0);
                At_M_A3 = vfmaq_laneq_f32(At_M_A3, At1, A3, 1);
                At_M_A3 = vfmaq_laneq_f32(At_M_A3, At2, A3, 2);
                At_M_A3 = vfmaq_laneq_f32(At_M_A3, At3, A3, 3);
            }

            // Store the tile at rows i_idx.., columns j_idx.. of the output.
            const int At_M_A_idx = n * j_idx + i_idx;
            vst1q_f32(At_M_A_ptr + At_M_A_idx, At_M_A0);
            vst1q_f32(At_M_A_ptr + At_M_A_idx + n, At_M_A1);
            vst1q_f32(At_M_A_ptr + At_M_A_idx + 2 * n, At_M_A2);
            vst1q_f32(At_M_A_ptr + At_M_A_idx + 3 * n, At_M_A3);

            // Scalar tail for the remaining k % 4 rows. Each term must carry
            // the diagonal weight M[k_idx], exactly like the vector path.
            for (; k_idx < k; k_idx++) {
                const float weight = M_ptr[k_idx];
                for (int jp_idx = 0; jp_idx < 4; jp_idx++) {
                    for (int ip_idx = 0; ip_idx < 4; ip_idx++) {
                        At_M_A_ptr[At_M_A_idx + jp_idx * n + ip_idx] +=
                            A_ptr[(i_idx + ip_idx) * k + k_idx] * weight * A_ptr[(j_idx + jp_idx) * k + k_idx];
                    }
                }
            }
        }
    }

    // Fill in the lower triangle by mirroring the upper triangle.
    for (int i_idx = 0; i_idx < n; i_idx++) {
        const int column_base = i_idx * n;
        for (int j_idx = i_idx + 1; j_idx < n; j_idx++) {
            At_M_A_ptr[column_base + j_idx] = At_M_A_ptr[j_idx * n + i_idx];
        }
    }
}

耗时测试

方法	M1 Max, 时间(ms)	RK3588, 时间(ms)
Eigen	0.15472	0.604365
NEON	0.114444	0.19703

在M1 Max平台上，耗时减少比例为(0.15472 - 0.114444) / 0.15472 = 26%（约1.35倍加速）；RK3588平台上，耗时减少比例为(0.604365 - 0.19703) / 0.604365 = 67.4%（约3.07倍加速）。可以看到RK3588平台上的加速效果更加明显。

Unity中通过shader实现OpenCV的Remap功能

Posted on 2023-10-20 In Tech , Vision Disqus:

我们希望对输入图片根据Distort程序进行畸变处理：

// Maps a source point (x_src, y_src) to its distorted position (x_dst, y_dst).
//
// The output is a weighted sum of radial kernels 1 / sqrt(d^2 + r2), one per
// row of control_points (read as cv::Vec2f), with the weights taken from the
// matching row of W (column 0 for x, column 1 for y). An affine term built
// from the three rows of W that follow the per-point weights is then added:
// constant, x_src coefficient, y_src coefficient.
void Distort(const float &x_src,
             const float &y_src,
             const cv::Mat &control_points,
             const float &r2,
             const cv::Mat &W,
             float &x_dst,
             float &y_dst) {
    const int num_points = control_points.rows;

    // Radial part: accumulate directly into the outputs.
    x_dst = 0;
    y_dst = 0;
    for (int idx = 0; idx < num_points; ++idx) {
        const cv::Vec2f &centre = control_points.at<cv::Vec2f>(idx);
        float dx = x_src - centre[0];
        float dy = y_src - centre[1];

        float weight = 1.f / sqrt(dx * dx + dy * dy + r2);

        x_dst += weight * W.at<float>(idx, 0);
        y_dst += weight * W.at<float>(idx, 1);
    }

    // Affine part: rows num_points, num_points + 1 and num_points + 2 of W.
    const int bias_row = num_points;
    const int x_coef_row = num_points + 1;
    const int y_coef_row = num_points + 2;
    x_dst += (W.at<float>(bias_row, 0) + W.at<float>(x_coef_row, 0) * x_src + W.at<float>(y_coef_row, 0) * y_src);
    y_dst += (W.at<float>(bias_row, 1) + W.at<float>(x_coef_row, 1) * x_src + W.at<float>(y_coef_row, 1) * y_src);
}

输入图像	处理后图像

可以根据Distort程序，生成OpenCV中cv::remap函数所需的map1，map2映射矩阵，然后再使用cv::remap对输入图片进行处理。我们希望在Unity中，实现类似于cv::remap函数的功能。

首先我们根据Distort()生成像素点的LUT映射矩阵，并且保存到screen_calibration_lut.bin文件中：

// Builds a per-pixel texture-coordinate lookup table for a width x height screen.
//
// lut is (re)allocated as a height x width CV_32FC2 matrix. Entry (y, x)
// holds the normalised texture coordinate obtained by running Distort() on
// the corresponding image-space pixel:
//   - LUT row y maps to image row (height - 1 - y), i.e. LUT rows are counted
//     from the bottom of the image;
//   - the distorted position is shifted by half a pixel and divided by the
//     image size, and the vertical coordinate is flipped (1 - v).
void GenerateLut(const cv::Mat &control_points,
                 const float &r2,
                 const cv::Mat &W,
                 const int width,
                 const int height,
                 cv::Mat &lut) {
    lut = cv::Mat::zeros(height, width, CV_32FC2);

    const float width_f = float(width);
    const float height_f = float(height);

    for (int row = 0; row < height; ++row) {
        // Image-space row for this LUT row (vertical flip).
        const float y_cv = height_f - row - 1;

        for (int col = 0; col < width; ++col) {
            const float x_cv = static_cast<float>(col);

            float x_distort, y_distort;
            Distort(x_cv, y_cv, control_points, r2, W, x_distort, y_distort);

            // Pixel centre -> normalised texture coordinate, with v flipped back.
            const float x_tex = (x_distort + 0.5f) / width_f;
            const float y_tex = 1.f - (y_distort + 0.5f) / height_f;

            lut.at<cv::Vec2f>(row, col) = cv::Vec2f(x_tex, y_tex);
        }
    }
}

// Excerpt of the calibration tool's entry point: builds the texture-coordinate
// LUT and dumps it as raw floats to "screen_calibration_lut.bin".
// control_points, r2, weights, screen_size and calibration_result_save_path
// come from the elided code above.
int main(int argc, char *argv[]) {
    // ...
    cv::Mat lut;
    GenerateLut(control_points, r2, weights, screen_size.width, screen_size.height, lut); 

    std::string screen_calibration_lut_file = calibration_result_save_path + "/screen_calibration_lut.bin";

    // Binary output, truncating any existing file.
    // NOTE(review): the stream state is never checked, so a failed open or write goes unnoticed.
    std::ofstream out(screen_calibration_lut_file, std::ios::out | std::ios::binary | std::ios::trunc);
    // The raw write below needs the pixel data in one contiguous buffer.
    if (!lut.isContinuous()) {lut = lut.clone();}
    // Two floats per pixel, rows * cols pixels, written without any header.
    out.write((char *)lut.data, 2 * lut.cols * lut.rows * sizeof(float));
    out.close();
    // ...
}

在Unity端中，读取screen_calibration_lut.bin文件，并且通过Unity Shader屏幕特效函数OnRenderImage()¹，在图像渲染完成后对图像进行Remap处理：

using System.Collections;
using System.Collections.Generic;
using System.IO;
using UnityEngine;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;

/// <summary>
/// Post-processing component that remaps the rendered image through a
/// per-pixel lookup table read from a binary file of raw floats
/// (two floats per pixel, screen_width * screen_height pixels).
/// The remap is applied in OnRenderImage via the "Hidden/ScreenUndistortion" shader.
/// </summary>
public class ScreenUndistortion : MonoBehaviour
{
    public bool enable_screen_undistortion = true;
    public string screen_calibration_file = "screen_calibration_lut.bin";
    public int screen_width = 1920;
    public int screen_height = 1080;

    private Material material;
    // Kept so the texture can be released together with the material.
    private Texture2D lutTexture;

    /// <summary>
    /// Loads the LUT, uploads it as an RGFloat texture and creates the remap material.
    /// On any failure the material stays null and OnRenderImage falls back to a plain blit.
    /// </summary>
    void Start()
    {
        if (!enable_screen_undistortion) return;

        if (screen_width <= 0 || screen_height <= 0)
        {
            Debug.Log("Invalid screen size: " + screen_width + "x" + screen_height);
            return;
        }

        // Read calibration data: two floats (u, v) per pixel.
        float[] lut = new float[screen_width * screen_height * 2];
        if (!ReadCalibrationData(screen_calibration_file, lut))
        {
            Debug.Log("Can not open file: " + screen_calibration_file);
            return;
        }

        Shader shader = Shader.Find("Hidden/ScreenUndistortion");
        if (shader == null)
        {
            Debug.Log("Can not found shader!");
            return;
        }

        material = new Material(shader);

        // Only mip level 0 is ever filled, so the texture is created without a
        // mip chain; otherwise the remaining levels would hold undefined data.
        lutTexture = new Texture2D(screen_width, screen_height, TextureFormat.RGFloat, false);
        // Clamp so that filtering at the borders never blends with the opposite edge.
        lutTexture.wrapMode = TextureWrapMode.Clamp;
        // SetPixelData(data, mipLevel, sourceDataStartIndex): fill mip 0 from the start of the array.
        lutTexture.SetPixelData(lut, 0, 0);
        // Apply(updateMipmaps, makeNoLongerReadable): upload without touching mipmaps, keep the CPU copy.
        lutTexture.Apply(false, false);
        material.SetTexture("_LutTex", lutTexture);

        // NOTE(review): OnRenderImage already applies the material; confirm whether
        // this MeshRenderer is actually needed.
        MeshRenderer meshRenderer = gameObject.AddComponent<MeshRenderer>();
        meshRenderer.material = material;
    }

    /// <summary>
    /// Fills <paramref name="lut"/> with consecutive 32-bit floats read from <paramref name="file"/>.
    /// </summary>
    /// <returns>
    /// false if lut is null, the file does not exist, is shorter than lut requires,
    /// or cannot be read; true once every element has been filled.
    /// </returns>
    bool ReadCalibrationData(string file,
                             float[] lut)
    {
        if (lut == null || !File.Exists(file))
        {
            return false;
        }

        long expectedBytes = (long)lut.Length * sizeof(float);

        try
        {
            // OpenRead requests read-only access, so read-only files can be loaded too.
            using (FileStream stream = File.OpenRead(file))
            {
                // Reject truncated files up front instead of failing halfway through.
                if (stream.Length < expectedBytes)
                {
                    return false;
                }

                using (BinaryReader reader = new BinaryReader(stream))
                {
                    for (int i = 0; i < lut.Length; i++)
                    {
                        lut[i] = reader.ReadSingle();
                    }
                }
            }
        }
        catch (IOException)
        {
            return false;
        }

        return true;
    }

    /// <summary>
    /// Blits the rendered image through the remap material, or unchanged when
    /// the effect is disabled or was not initialised.
    /// </summary>
    void OnRenderImage(RenderTexture src, RenderTexture dest)
    {
        if (enable_screen_undistortion && material != null)
        {
            Graphics.Blit(src, dest, material);
        }
        else
        {
            Graphics.Blit(src, dest);
        }
    }

    /// <summary>
    /// Releases the material and LUT texture created in Start.
    /// </summary>
    void OnDestroy()
    {
        if (material != null) Destroy(material);
        if (lutTexture != null) Destroy(lutTexture);
    }

}

其中，所使用的shader文件ScreenUndistortion.shader为：

// Full-screen remap: for every output pixel, _LutTex supplies the texture
// coordinate at which _MainTex is sampled. Coordinates on or outside the
// [0, 1] border produce opaque black.
Shader "Hidden/ScreenUndistortion"
{
    Properties
    {
        _MainTex ("Texture", 2D) = "white" {}
        _LutTex("Texture", 2D) = "white" {}
    }
    SubShader
    {
        // Screen-space pass: no culling, no depth write, always passes the depth test.
        Cull Off ZWrite Off ZTest Always

        Pass
        {
            CGPROGRAM
            #pragma vertex vert
            #pragma fragment frag

            #include "UnityCG.cginc"

            struct appdata
            {
                float4 vertex : POSITION;
                float2 uv : TEXCOORD0;
            };

            struct v2f
            {
                // UVs are passed through unchanged from the vertex input.
                float2 uv : TEXCOORD0;
                float4 vertex : SV_POSITION;
            };

            sampler2D _MainTex;
            sampler2D _LutTex;

            v2f vert (appdata v)
            {
                v2f output;
                output.uv = v.uv;
                output.vertex = UnityObjectToClipPos(v.vertex);
                return output;
            }

            fixed4 frag (v2f i) : SV_Target
            {
                // Remapped sampling position for this pixel.
                float2 remapped = tex2D(_LutTex, i.uv).xy;

                // True when the remapped position is not strictly inside (0, 1) x (0, 1).
                bool outside = remapped.x <= 0.0f || remapped.y <= 0.0f
                            || remapped.x >= 1.0f || remapped.y >= 1.0f;

                float4 black = float4(0, 0, 0, 1);
                float4 sampled = tex2D(_MainTex, remapped);

                float4 col = outside ? black : sampled;
                return col;
            }
            ENDCG
        }
    }
}

https://docs.unity3d.com/ScriptReference/MonoBehaviour.OnRenderImage.html↩︎