Hanjie's Blog

一只有理想的羊驼

增强现实(Augmented Reality,简称 AR),是一种实时定位并加上相应 图像的技术,是一种将真实世界信息和虚拟世界信息集成的新技术,这种技术的目标是在屏幕上把虚拟世界叠加到现实世界中并进行互动。

AR 眼镜是以眼镜的形态作为一种个人移动计算平台。它拥有一个半透明的光学屏幕,它一方面像普通眼镜一样可以透过外部的环境光,使用户可以看到 眼前的真实世界,同时可以显示计算机生成的虚拟图像。人眼透过光学屏幕,能够看到虚拟影像和现实世界两者重叠。AR 眼镜还拥有一些传感器,如双目摄像头,TOF摄像头,激光雷达等用来实现眼镜的姿态估计,环境感知和物体跟踪等功能。

AR 系统中存在虚拟影像空间和现实世界空间两个空间。在实际使用中,我们希望虚拟物体与实际物体能够正确地叠加在一起。这需要我们对 AR 眼镜进行 虚实结合标定,获得传感器与光学屏幕,人眼之间的几何关系。

1
2
3
4
5
6
7
8
9
10
git clone --recursive --depth=1 --branch v1.18.0 https://github.com/microsoft/onnxruntime.git   
cd onnxruntime

conda create --name onnx python=3.10
conda activate onnx

# If protobuf / nlohmann_json are already installed system-wide, they can conflict at
# build time with the versions vendored inside onnxruntime. To avoid this, temporarily
# remove them before building:
# brew remove protobuf nlohmann-json

./build.sh --config Release --use_coreml --build_shared_lib --parallel --compile_no_warning_as_error --skip_submodule_sync --cmake_extra_defines CMAKE_OSX_ARCHITECTURES=arm64

Please note that these instructions build the debug build, which may have performance tradeoffs. The "--config" parameter has four valid values: Debug, Release, RelWithDebInfo and MinSizeRel. Compared to "Release", "RelWithDebInfo" not only has debug info, it also disables some inlines to make the binary easier to debug. Thus RelWithDebInfo is slower than Release.

Cross Android Compiling

1
# Cross-compile onnxruntime for Android (arm64-v8a, API level 29) using the local SDK/NDK paths.
./build.sh --parallel --config Release --android --build_shared_lib --android_sdk_path /Users/luohanjie/Library/Android/sdk --android_ndk_path /Users/luohanjie/Library/Android/sdk/ndk/27.0.12077973 --android_abi arm64-v8a --android_api 29 

Please use NDK 26 as it has a version of clang that handles the bfloat16 type.

有一个矩阵运算\(Y=J^{T} M J\),其中\(J\)为[30576, 8]大的矩阵,\(M\)是[30576, 30576]的对角矩阵,最终输出的\(Y\)为[8, 8]大的对称矩阵。使用Eigen库,如下实现矩阵运算:

1
Y.noalias() = J.transpose() * M.asDiagonal() * J; // here M is a vector of length 30576 holding the diagonal

我们希望能够通过NEON指令集实现该矩阵运算,获得更短的运算耗时。

NEON优化版

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
// Computes At_M_A = A^T * diag(M) * A with NEON intrinsics.
//
// @param A       Column-major [k, n] matrix (here k = 30576, n = 8). n must be a
//                multiple of 4 so the 4x4 blocking covers the output exactly.
// @param M       Length-k vector holding the diagonal of the [k, k] diagonal matrix.
// @param At_M_A  Output: the symmetric [8, 8] result (column-major).
//
// Only the upper-triangular 4x4 blocks are computed with SIMD; since the result is
// symmetric, the lower triangle is mirrored at the end.
// Reference: https://developer.arm.com/documentation/102107a/0100/Single-precision-4x4-matrix-multiplication
void CalcAtMA(const Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor> &A,
              const Eigen::VectorXf &M,
              Eigen::Matrix<float, 8, 8> &At_M_A) {
    // [8, 8] = [8, 30576] * [30576, 30576] * [30576, 8]
    // Signed ints avoid the unsigned underflow of (k - 4) when k < 4 and the
    // signed/unsigned comparisons the original loop conditions had.
    const int n = static_cast<int>(A.cols());  // output dimension (= 8 here)
    const int k = static_cast<int>(A.rows());  // reduction dimension
    const int k_step = k - 4;                  // last k index where a full 4-wide load fits

    const float *A_ptr = A.data();
    const float *M_ptr = M.data();
    float *At_M_A_ptr = At_M_A.data();

    // Columns of the current 4x4 sub-blocks of At, A and the accumulator At_M_A.
    float32x4_t At0, At1, At2, At3;
    float32x4_t A0, A1, A2, A3;
    float32x4_t At_M_A0, At_M_A1, At_M_A2, At_M_A3;

    for (int i_idx = 0; i_idx < n; i_idx += 4) {
        // j_idx starts at i_idx: only upper-triangular blocks are computed.
        for (int j_idx = i_idx; j_idx < n; j_idx += 4) {
            // Zero accumulators before the matrix op.
            At_M_A0 = vmovq_n_f32(0);
            At_M_A1 = vmovq_n_f32(0);
            At_M_A2 = vmovq_n_f32(0);
            At_M_A3 = vmovq_n_f32(0);

            int k_idx = 0;
            for (; k_idx <= k_step; k_idx += 4) {
                // Base indices of the current 4x4 blocks (A is column-major, so
                // column c starts at c * k).
                const int At_idx = k * i_idx + k_idx;
                const int A_idx = k * j_idx + k_idx;

                // Load 4 partial columns of A (rows k_idx..k_idx+3 of columns
                // i_idx..i_idx+3) ...
                At0 = vld1q_f32(A_ptr + At_idx);
                At1 = vld1q_f32(A_ptr + At_idx + k);
                At2 = vld1q_f32(A_ptr + At_idx + 2 * k);
                At3 = vld1q_f32(A_ptr + At_idx + 3 * k);

                // ... and transpose them in registers to obtain the At block.
                MatTransposeInp4x4NeonF32(At0, At1, At2, At3, At0, At1, At2, At3);

                // Fold the diagonal into the At columns: (At * diag(M)).
                At0 = vmulq_n_f32(At0, M_ptr[k_idx]);
                At1 = vmulq_n_f32(At1, M_ptr[k_idx + 1]);
                At2 = vmulq_n_f32(At2, M_ptr[k_idx + 2]);
                At3 = vmulq_n_f32(At3, M_ptr[k_idx + 3]);

                // Multiply-accumulate in 4x1 blocks, i.e. one output column at a time.
                A0 = vld1q_f32(A_ptr + A_idx);
                At_M_A0 = vfmaq_laneq_f32(At_M_A0, At0, A0, 0);
                At_M_A0 = vfmaq_laneq_f32(At_M_A0, At1, A0, 1);
                At_M_A0 = vfmaq_laneq_f32(At_M_A0, At2, A0, 2);
                At_M_A0 = vfmaq_laneq_f32(At_M_A0, At3, A0, 3);

                A1 = vld1q_f32(A_ptr + A_idx + k);
                At_M_A1 = vfmaq_laneq_f32(At_M_A1, At0, A1, 0);
                At_M_A1 = vfmaq_laneq_f32(At_M_A1, At1, A1, 1);
                At_M_A1 = vfmaq_laneq_f32(At_M_A1, At2, A1, 2);
                At_M_A1 = vfmaq_laneq_f32(At_M_A1, At3, A1, 3);

                A2 = vld1q_f32(A_ptr + A_idx + 2 * k);
                At_M_A2 = vfmaq_laneq_f32(At_M_A2, At0, A2, 0);
                At_M_A2 = vfmaq_laneq_f32(At_M_A2, At1, A2, 1);
                At_M_A2 = vfmaq_laneq_f32(At_M_A2, At2, A2, 2);
                At_M_A2 = vfmaq_laneq_f32(At_M_A2, At3, A2, 3);

                A3 = vld1q_f32(A_ptr + A_idx + 3 * k);
                At_M_A3 = vfmaq_laneq_f32(At_M_A3, At0, A3, 0);
                At_M_A3 = vfmaq_laneq_f32(At_M_A3, At1, A3, 1);
                At_M_A3 = vfmaq_laneq_f32(At_M_A3, At2, A3, 2);
                At_M_A3 = vfmaq_laneq_f32(At_M_A3, At3, A3, 3);
            }

            // Store the block (column-major: column j_idx, row i_idx).
            const int At_M_A_idx = n * j_idx + i_idx;
            vst1q_f32(At_M_A_ptr + At_M_A_idx, At_M_A0);
            vst1q_f32(At_M_A_ptr + At_M_A_idx + n, At_M_A1);
            vst1q_f32(At_M_A_ptr + At_M_A_idx + 2 * n, At_M_A2);
            vst1q_f32(At_M_A_ptr + At_M_A_idx + 3 * n, At_M_A3);

            // Scalar tail for k not divisible by 4.
            // BUG FIX: the original tail omitted the diagonal weight M_ptr[k_idx],
            // accumulating A^T * A contributions instead of A^T * diag(M) * A.
            // (Latent for k = 30576, which is divisible by 4.)
            for (; k_idx < k; k_idx++) {
                for (int jp_idx = 0; jp_idx < 4; jp_idx++) {
                    for (int ip_idx = 0; ip_idx < 4; ip_idx++) {
                        At_M_A_ptr[At_M_A_idx + jp_idx * n + ip_idx] +=
                            A_ptr[(i_idx + ip_idx) * k + k_idx] * M_ptr[k_idx] * A_ptr[(j_idx + jp_idx) * k + k_idx];
                    }
                }
            }
        }
    }

    // Mirror the upper triangle into the lower triangle (result is symmetric).
    for (int i_idx = 0; i_idx < n; i_idx++) {
        const int row_base = i_idx * n;
        for (int j_idx = i_idx + 1; j_idx < n; j_idx++) {
            At_M_A_ptr[row_base + j_idx] = At_M_A_ptr[j_idx * n + i_idx];
        }
    }
}

耗时测试

方法 M1 Max, 时间(ms) RK3588, 时间(ms)
Eigen 0.15472 0.604365
NEON 0.114444 0.19703

在M1 Max平台上,加速比(0.15472 - 0.114444) / 0.15472 = 26%;RK3588平台上,加速比(0.604365 - 0.19703) / 0.604365 = 67.4%。可以看到RK3588平台上的加速效果更加明显。

0%