
Building libTorch using CMake[1]

git clone -b master --recurse-submodules https://github.com/pytorch/pytorch.git
cd pytorch
git checkout nightly
git submodule update --init --recursive # sync submodules after switching branches
mkdir libtorch_build
cd libtorch_build
cmake -D BUILD_SHARED_LIBS:BOOL=ON \
-D CMAKE_BUILD_TYPE:STRING=Release \
-D PYTHON_EXECUTABLE:PATH=`which python3` \
-D BUILD_PYTHON=OFF \
-D USE_CUDA=OFF \
-D USE_PYTORCH_METAL_EXPORT=ON \
-D USE_OPENCV=ON \
-D BUILD_CUSTOM_PROTOBUF=OFF \
-D CMAKE_INSTALL_PREFIX:PATH=../pytorch-install \
..
-- 
-- ******** Summary ********
-- General:
-- CMake version : 3.24.2
-- CMake command : /opt/homebrew/Cellar/cmake/3.24.2/bin/cmake
-- System : Darwin
-- C++ compiler : /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/c++
-- C++ compiler id : AppleClang
-- C++ compiler version : 14.0.0.14000029
-- Using ccache if found : ON
-- Found ccache : CCACHE_PROGRAM-NOTFOUND
-- CXX flags : -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOCUPTI -DLIBKINETO_NOROCTRACER -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DUSE_PYTORCH_METAL_EXPORT -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=braced-scalar-init -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wvla-extension -Wno-range-loop-analysis -Wno-pass-failed -Wsuggest-override -Wno-error=pedantic -Wno-error=old-style-cast -Wconstant-conversion -Wno-invalid-partial-specialization -Wno-unused-private-field -Wno-missing-braces -Wunused-lambda-capture -Qunused-arguments -fcolor-diagnostics -fdiagnostics-color=always -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -DUSE_MPS -fno-objc-arc -Wno-unguarded-availability-new -Wno-unused-private-field -Wno-missing-braces
-- Build type : Release
-- Compile definitions : ONNX_ML=1;ONNXIFI_ENABLE_EXT=1;ONNX_NAMESPACE=onnx_torch;HAVE_MMAP=1;_FILE_OFFSET_BITS=64;HAVE_SHM_OPEN=1;HAVE_SHM_UNLINK=1;USE_EXTERNAL_MZCRC;MINIZ_DISABLE_ZIP_READER_CRC32_CHECKS
-- CMAKE_PREFIX_PATH :
-- CMAKE_INSTALL_PREFIX : ../pytorch-install
-- USE_GOLD_LINKER : OFF
--
-- TORCH_VERSION : 2.1.0
-- CAFFE2_VERSION : 2.1.0
-- BUILD_CAFFE2 : OFF
-- BUILD_CAFFE2_OPS : OFF
-- BUILD_STATIC_RUNTIME_BENCHMARK: OFF
-- BUILD_TENSOREXPR_BENCHMARK: OFF
-- BUILD_NVFUSER_BENCHMARK: OFF
-- BUILD_BINARY : OFF
-- BUILD_CUSTOM_PROTOBUF : OFF
-- Protobuf compiler :
-- Protobuf includes :
-- Protobuf libraries :
-- BUILD_DOCS : OFF
-- BUILD_PYTHON : OFF
-- BUILD_SHARED_LIBS : ON
-- CAFFE2_USE_MSVC_STATIC_RUNTIME : OFF
-- BUILD_TEST : OFF
-- BUILD_JNI : OFF
-- BUILD_MOBILE_AUTOGRAD : OFF
-- BUILD_LITE_INTERPRETER: OFF
-- CROSS_COMPILING_MACOSX :
-- INTERN_BUILD_MOBILE :
-- TRACING_BASED : OFF
-- USE_BLAS : 1
-- BLAS : accelerate
-- BLAS_HAS_SBGEMM :
-- USE_LAPACK : 1
-- LAPACK : accelerate
-- USE_ASAN : OFF
-- USE_TSAN : OFF
-- USE_CPP_CODE_COVERAGE : OFF
-- USE_CUDA : OFF
-- USE_ROCM : OFF
-- BUILD_NVFUSER : OFF
-- USE_EIGEN_FOR_BLAS : ON
-- USE_FBGEMM : OFF
-- USE_FAKELOWP : OFF
-- USE_KINETO : ON
-- USE_FFMPEG : OFF
-- USE_GFLAGS : OFF
-- USE_GLOG : OFF
-- USE_LEVELDB : OFF
-- USE_LITE_PROTO : OFF
-- USE_LMDB : OFF
-- USE_METAL : OFF
-- USE_PYTORCH_METAL : OFF
-- USE_PYTORCH_METAL_EXPORT : ON
-- USE_MPS : ON
-- USE_FFTW : ON
-- USE_MKL : OFF
-- USE_MKLDNN : OFF
-- USE_UCC : OFF
-- USE_ITT : OFF
-- USE_NCCL : OFF
-- USE_NNPACK : ON
-- USE_NUMPY : ON
-- USE_OBSERVERS : OFF
-- USE_OPENCL : OFF
-- USE_OPENCV : ON
-- OpenCV version : 4.7.0
-- USE_OPENMP : OFF
-- USE_TBB : OFF
-- USE_VULKAN : OFF
-- USE_PROF : OFF
-- USE_QNNPACK : OFF
-- USE_PYTORCH_QNNPACK : ON
-- USE_XNNPACK : ON
-- USE_REDIS : OFF
-- USE_ROCKSDB : OFF
-- USE_ZMQ : OFF
-- USE_DISTRIBUTED : OFF
-- Public Dependencies :
-- Private Dependencies : Threads::Threads;pthreadpool;cpuinfo;pytorch_qnnpack;nnpack;XNNPACK;opencv_core;opencv_highgui;opencv_imgproc;opencv_imgcodecs;opencv_optflow;opencv_videoio;opencv_video;fp16;foxi_loader;fmt::fmt-header-only;kineto
-- Public CUDA Deps. :
-- Private CUDA Deps. :
-- USE_COREML_DELEGATE : OFF
-- BUILD_LAZY_TS_BACKEND : ON
-- TORCH_DISABLE_GPU_ASSERTS : OFF
-- Configuring done
-- Generating done
-- Build files have been written to: /Users/luohanjie/Softwares/pytorch/libtorch_build

Note that a protobuf already installed on the system may cause build errors[2].

cmake --build . --target install --parallel 20

CMake test program[3]

# Point TORCH_SRC at the pytorch-install directory produced by the build above
set(TORCH_SRC /Users/luohanjie/Softwares/pytorch/pytorch-install)

set(TORCH_INCLUDE_DIRS ${TORCH_SRC}/include/torch/csrc/api/include ${TORCH_SRC}/include)
file(GLOB TORCH_LIBS ${TORCH_SRC}/lib/*.dylib ${TORCH_SRC}/lib/*.a)

message(${TORCH_INCLUDE_DIRS})

include_directories(${TORCH_INCLUDE_DIRS})

add_executable(test_libtorch test_libtorch.cpp)
target_link_libraries(test_libtorch ${TORCH_LIBS})

Program 1:

#include <torch/torch.h>
#include <iostream>

int main(int argc, char* argv[]) {
    std::cout << "MPS? " << torch::mps::is_available() << std::endl;

    torch::Tensor tensor = torch::rand({2, 3}).to("mps");
    std::cout << tensor << std::endl;
}

Output:

MPS? 1
0.1982 0.2995 0.5541
0.4153 0.2684 0.4655
[ MPSFloatType{2,3} ]
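
On a machine where MPS is unavailable, the .to("mps") call above throws a c10::Error; a minimal sketch of a runtime fallback to the CPU:

#include <torch/torch.h>
#include <iostream>

int main() {
    // Fall back to the CPU when the MPS backend is missing.
    torch::Device device = torch::mps::is_available() ? torch::Device(torch::kMPS)
                                                      : torch::Device(torch::kCPU);

    torch::Tensor tensor = torch::rand({2, 3}).to(device);
    std::cout << tensor << std::endl;
}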

Program 2:

#include <time.h>
#include <torch/torch.h>
#include <iostream>

#define USE_MPS 1

using namespace std;

struct Net : torch::nn::Module {
    Net() {
        conv1 = register_module("conv1", torch::nn::Conv2d(3, 64, 3));
        conv2 = register_module("conv2", torch::nn::Conv2d(64, 128, 3));
        conv3 = register_module("conv3", torch::nn::Conv2d(128, 256, 3));
        fc1 = register_module("fc1", torch::nn::Linear(256, 128));
        fc2 = register_module("fc2", torch::nn::Linear(128, 56));
        fc3 = register_module("fc3", torch::nn::Linear(56, 10));
        global_pool = register_module("global_pool", torch::nn::AdaptiveAvgPool2d(1));
    }

    torch::Tensor forward(torch::Tensor x) {
        x = torch::relu(conv1->forward(x));
        x = torch::max_pool2d(x, {2, 2});
        x = torch::relu(conv2->forward(x));
        x = torch::max_pool2d(x, {2, 2});
        x = torch::relu(conv3->forward(x));
        x = torch::max_pool2d(x, {2, 2});
        x = global_pool->forward(x);
        x = torch::relu(fc1->forward(x.reshape({x.size(0), -1})));
        x = torch::relu(fc2->forward(x));
        x = torch::log_softmax(fc3->forward(x), 1);

        return x;
    }

    torch::nn::Linear fc1{nullptr}, fc2{nullptr}, fc3{nullptr};
    torch::nn::Conv2d conv1{nullptr}, conv2{nullptr}, conv3{nullptr};
    torch::nn::AdaptiveAvgPool2d global_pool{nullptr};
};

int main(int argc, char* argv[]) {
    auto net = std::make_shared<Net>();
    torch::Tensor data = torch::ones({8, 3, 128, 128});

#ifdef USE_MPS
    net->to(torch::Device(torch::kMPS));
    data = data.to("mps");
    // torch::Tensor data = torch::ones({8, 3, 128, 128}).to("mps");
#endif

    torch::Tensor y;
    clock_t start, end;
    start = clock();
    for (int i = 0; i < 100; ++i) {
        y = net->forward(data);
    }
    end = clock();
    cout << "Time: " << double(end - start) / CLOCKS_PER_SEC << endl;

    return 0;
}
Device | Time (s)
------ | --------
CPU    | 15.36
MPS    | 0.2671
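
Note that clock() measures CPU time, and MPS kernels are dispatched asynchronously, so the loop above can misstate the true GPU cost. Below is a sketch of a wall-clock variant of Program 2's timing block, assuming torch::mps::synchronize() is exposed by this libtorch build (it is declared in torch/mps.h in recent releases):

#include <chrono>
#include <memory>
#include <torch/torch.h>

// Drop-in replacement for the timing block in Program 2's main(); reuses Net.
// Synchronizing ensures all queued MPS work has finished before the clock stops.
double benchmark_seconds(const std::shared_ptr<Net>& net, const torch::Tensor& data) {
    auto start = std::chrono::steady_clock::now();
    torch::Tensor y;
    for (int i = 0; i < 100; ++i) {
        y = net->forward(data);
    }
    torch::mps::synchronize();  // assumption: available in this build
    auto end = std::chrono::steady_clock::now();
    return std::chrono::duration<double>(end - start).count();
}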

Loading a model[4]

Generating a TorchScript module

A PyTorch model’s journey from Python to C++ is enabled by Torch Script, a representation of a PyTorch model that can be understood, compiled and serialized by the Torch Script compiler.

import torch
import torchvision

# An instance of your model.
model = torchvision.models.resnet18()

# An example input you would normally provide to your model's forward() method.
example = torch.rand(1, 3, 224, 224)

# Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing.
traced_script_module = torch.jit.trace(model, example)

traced_script_module.save("traced_resnet_model.pt")

Loading the model in C++

#include <torch/script.h> // One-stop header.

#include <iostream>
#include <memory>

int main(int argc, const char* argv[]) {
    if (argc != 2) {
        std::cerr << "usage: example-app <path-to-exported-script-module>\n";
        return -1;
    }

    torch::jit::script::Module module;
    try {
        // Deserialize the ScriptModule from a file using torch::jit::load().
        module = torch::jit::load(argv[1]);
    }
    catch (const c10::Error& e) {
        std::cerr << "error loading the model\n";
        return -1;
    }

    std::cout << "ok\n";
}
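
Continuing the same tutorial[4], the deserialized module can then be executed like the traced Python model; the snippet below (appended to main() above) feeds it an input matching the tracing example:

// Run the traced ResNet18 on a dummy batch and print the first five logits.
std::vector<torch::jit::IValue> inputs;
inputs.push_back(torch::ones({1, 3, 224, 224}));

at::Tensor output = module.forward(inputs).toTensor();
std::cout << output.slice(/*dim=*/1, /*start=*/0, /*end=*/5) << '\n';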

  1. https://github.com/pytorch/pytorch/blob/master/docs/libtorch.rst ↩︎

  2. https://github.com/pytorch/pytorch/issues/64645 ↩︎

  3. https://pytorch.org/cppdocs/installing.html ↩︎

  4. https://pytorch.org/tutorials/advanced/cpp_export.html ↩︎

Building TensorFlow Lite using CMake

Install the dependencies first:

brew install abseil google-benchmark
git clone https://github.com/tensorflow/tensorflow.git tensorflow_src
cd tensorflow_src
git checkout v2.9.3

Newer versions may fail to compile, or run into problems when using the GPU.

In tensorflow/lite/c/CMakeLists.txt, change common.c to common.cc[1].

mkdir build_mac
cd build_mac
cmake ../tensorflow/lite/c -D TFLITE_KERNEL_TEST=ON -D TFLITE_ENABLE_GPU=ON -D ABSL_PROPAGATE_CXX_STD=ON -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -D LIBRARY_OUTPUT_PATH=/Users/luohanjie/Softwares/tensorflow_src/build_mac/lib
cmake --build . -j

Build the benchmark_model test program and benchmark the model model_opt.tflite on the CPU:

cmake --build . -j -t benchmark_model
./tensorflow-lite/tools/benchmark/benchmark_model --graph=/Users/luohanjie/Workspace/Vision/my_slam/data/models/model_opt.tflite --verbose=true --num_threads=4 --use_gpu=false

STARTING!
Log parameter values verbosely: [1]
Min num runs: [50]
Min runs duration (seconds): [1]
Max runs duration (seconds): [150]
Inter-run delay (seconds): [-1]
Number of prorated runs per second: [-1]
Num threads: [4]
Use caching: [0]
Benchmark name: []
Output prefix: []
Min warmup runs: [1]
Min warmup runs duration (seconds): [0.5]
Run w/o invoking kernels: [0]
Report the peak memory footprint: [0]
Memory footprint check interval (ms): [50]
Graph: [/Users/luohanjie/Workspace/Vision/my_slam/data/models/model_opt.tflite]
Input layers: []
Input shapes: []
Input value ranges: []
Input value files: []
Allow fp16: [0]
Require full delegation: [0]
Enable op profiling: [0]
Max initial profiling buffer entries: [1024]
Allow dynamic increase on profiling buffer entries: [0]
CSV File to export profiling data to: []
Print pre-invoke interpreter state: [0]
Print post-invoke interpreter state: [0]
Release dynamic tensor memory: [0]
Use dynamic tensor for large tensors: [0]
print out all supported flags: [0]
#threads used for CPU inference: [4]
Max number of delegated partitions: [0]
Min nodes per partition: [0]
Directory for delegate serialization: []
Model-specific token/key for delegate serialization.: []
Use xnnpack: [0]
External delegate path: []
External delegate options: []
Use gpu: [0]
Allow lower precision in gpu: [1]
Enable running quant models in gpu: [1]
Prefer maximizing the throughput in gpu: [0]
GPU backend: []
Loaded model /Users/luohanjie/Workspace/Vision/my_slam/data/models/model_opt.tflite
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
The input model file size (MB): 66.3383
Initialized session in 41.498ms.
Running benchmark for at least 1 iterations and at least 0.5 seconds but terminate if exceeding 150 seconds.
count=13 first=43827 curr=38759 min=38662 max=45293 avg=39973.3 std=1998

Running benchmark for at least 50 iterations and at least 1 seconds but terminate if exceeding 150 seconds.
count=50 first=39240 curr=38747 min=38470 max=40766 avg=39654.3 std=635

Inference timings in us: Init: 41498, First inference: 43827, Warmup (avg): 39973.3, Inference (avg): 39654.3

Using the GPU:

./tensorflow-lite/tools/benchmark/benchmark_model --graph=/Users/luohanjie/Workspace/Vision/my_slam/data/models/model_opt.tflite --verbose=true --num_threads=4 --use_gpu=true

STARTING!
Log parameter values verbosely: [1]
Min num runs: [50]
Min runs duration (seconds): [1]
Max runs duration (seconds): [150]
Inter-run delay (seconds): [-1]
Number of prorated runs per second: [-1]
Num threads: [4]
Use caching: [0]
Benchmark name: []
Output prefix: []
Min warmup runs: [1]
Min warmup runs duration (seconds): [0.5]
Run w/o invoking kernels: [0]
Report the peak memory footprint: [0]
Memory footprint check interval (ms): [50]
Graph: [/Users/luohanjie/Workspace/Vision/my_slam/data/models/model_opt.tflite]
Input layers: []
Input shapes: []
Input value ranges: []
Input value files: []
Allow fp16: [0]
Require full delegation: [0]
Enable op profiling: [0]
Max initial profiling buffer entries: [1024]
Allow dynamic increase on profiling buffer entries: [0]
CSV File to export profiling data to: []
Print pre-invoke interpreter state: [0]
Print post-invoke interpreter state: [0]
Release dynamic tensor memory: [0]
Use dynamic tensor for large tensors: [0]
print out all supported flags: [0]
#threads used for CPU inference: [4]
Max number of delegated partitions: [0]
Min nodes per partition: [0]
Directory for delegate serialization: []
Model-specific token/key for delegate serialization.: []
Use xnnpack: [0]
External delegate path: []
External delegate options: []
Use gpu: [1]
Allow lower precision in gpu: [1]
Enable running quant models in gpu: [1]
Prefer maximizing the throughput in gpu: [0]
GPU backend: []
Loaded model /Users/luohanjie/Workspace/Vision/my_slam/data/models/model_opt.tflite
INFO: Created TensorFlow Lite delegate for GPU.
GPU delegate created.
INFO: Initialized OpenCL-based API.
INFO: Created 1 GPU delegate kernels.
Explicitly applied GPU delegate, and the model graph will be completely executed by the delegate.
The input model file size (MB): 66.3383
Initialized session in 129.521ms.
Running benchmark for at least 1 iterations and at least 0.5 seconds but terminate if exceeding 150 seconds.
count=40 first=40053 curr=11752 min=11744 max=40053 avg=12579.9 std=4400

Running benchmark for at least 50 iterations and at least 1 seconds but terminate if exceeding 150 seconds.
count=85 first=11880 curr=11836 min=11567 max=12276 avg=11839.5 std=93

Inference timings in us: Init: 129521, First inference: 40053, Warmup (avg): 12579.9, Inference (avg): 11839.5
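
Once the build finishes, the resulting library can be driven from C++ through the C API in tensorflow/lite/c/c_api.h. A minimal sketch follows; the model path and the zero-filled input are placeholders, not the real preprocessing for model_opt.tflite:

#include <cstdio>
#include <vector>

#include "tensorflow/lite/c/c_api.h"

int main() {
    TfLiteModel* model = TfLiteModelCreateFromFile("model_opt.tflite");  // placeholder path
    TfLiteInterpreterOptions* options = TfLiteInterpreterOptionsCreate();
    TfLiteInterpreterOptionsSetNumThreads(options, 4);

    TfLiteInterpreter* interpreter = TfLiteInterpreterCreate(model, options);
    TfLiteInterpreterAllocateTensors(interpreter);

    // Fill the first input with zeros; query the real shape via TfLiteTensorDim().
    TfLiteTensor* input = TfLiteInterpreterGetInputTensor(interpreter, 0);
    std::vector<float> in(TfLiteTensorByteSize(input) / sizeof(float), 0.0f);
    TfLiteTensorCopyFromBuffer(input, in.data(), TfLiteTensorByteSize(input));

    TfLiteInterpreterInvoke(interpreter);

    const TfLiteTensor* output = TfLiteInterpreterGetOutputTensor(interpreter, 0);
    std::vector<float> out(TfLiteTensorByteSize(output) / sizeof(float));
    TfLiteTensorCopyToBuffer(output, out.data(), TfLiteTensorByteSize(output));
    std::printf("first output value: %f\n", out[0]);

    TfLiteInterpreterDelete(interpreter);
    TfLiteInterpreterOptionsDelete(options);
    TfLiteModelDelete(model);
    return 0;
}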

  1. https://github.com/tensorflow/tensorflow/pull/54566 ↩︎

Installing ncnn

Download the Vulkan SDK, then double-click and install it.

To uninstall: sudo path_to_vulkan_sdk/uninstall.sh

brew install ncnn

CMake test program

# OpenMP flags for macOS
if (APPLE)
    if (CMAKE_C_COMPILER_ID MATCHES "Clang")
        set(OpenMP_C "${CMAKE_C_COMPILER}")
        set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -Wno-unused-command-line-argument -I/opt/homebrew/opt/libomp/include")
        set(OpenMP_C_LIB_NAMES "libomp")
        set(OpenMP_libomp_LIBRARY "/opt/homebrew/opt/libomp/lib/libomp.dylib")
    endif ()
    if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
        set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
        set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -Wno-unused-command-line-argument -I/opt/homebrew/opt/libomp/include")
        set(OpenMP_CXX_LIB_NAMES "libomp")
        set(OpenMP_libomp_LIBRARY "/opt/homebrew/opt/libomp/lib/libomp.dylib")
    endif ()
endif ()

set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")

find_package(ncnn REQUIRED)
find_package(OpenCV REQUIRED)

# use the include dirs (not OpenCV_LIBRARY_DIRS) for header search paths
include_directories(${ncnn_INCLUDE} ${OpenCV_INCLUDE_DIRS})

add_executable(test_ncnn test_ncnn.cpp)
target_link_libraries(test_ncnn ncnn ${OpenCV_LIBS})
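
To verify that the libomp wiring above is actually picked up, a tiny check can be built in the same project (test_omp.cpp here is a hypothetical file, not part of the original setup):

#include <omp.h>

#include <cstdio>

int main() {
    // Should report more than one thread if -fopenmp and libomp were found.
    #pragma omp parallel
    {
        #pragma omp single
        std::printf("OpenMP is active with %d threads\n", omp_get_num_threads());
    }
    return 0;
}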

The test network is midas_v21_small-int8:

#include "net.h"
#include "mat.h"
#include "cpu.h"
#include <opencv2/opencv.hpp>
#include <sys/time.h>

int main(int argc, char* argv[]) {
std::string img_file = "/Users/luohanjie/Workspace/Vision/depth_estimation/MiDaS/input/squirrel_iphone_sample3.png";
std::string param_file = "/Users/luohanjie/Workspace/Vision/my_slam/data/models/midas_v21_small-int8.param";
std::string model_file = "/Users/luohanjie/Workspace/Vision/my_slam/data/models/midas_v21_small-int8.bin";
int target_size = 256;
float scale = 0.33333f;

cv::Mat img = cv::imread(img_file);
cv::resize(img, img, cv::Size(), scale, scale);

int img_width = img.cols;
int img_height = img.rows;

ncnn::Net net;
ncnn::set_cpu_powersave(0); // 0 = all cores enabled(default)
ncnn::set_omp_num_threads(ncnn::get_cpu_count());
net.opt = ncnn::Option();
net.opt.use_vulkan_compute = false;
net.opt.num_threads = ncnn::get_cpu_count();

net.load_param(param_file.c_str());
net.load_model(model_file.c_str());

// https://github.com/Tencent/ncnn/blob/master/docs/how-to-use-and-FAQ/use-ncnn-with-opencv.md
// cv::Mat CV_8UC3 -> ncnn::Mat 3 channel + swap RGB/BGR
ncnn::Mat img_in = ncnn::Mat::from_pixels_resize(img.data, ncnn::Mat::PIXEL_BGR2RGB, img_width, img_height, target_size, target_size);

// substract_mean_normalize(const float* mean_vals, const float* norm_vals): substract channel-wise mean values, then multiply by normalize values, pass 0 to skip in ncnn.
const float mean_vals[3] = {123.675f, 116.28f, 103.53f};
const float norm_vals[3] = {0.01712475383f, 0.0175070028f, 0.01742919389f};
img_in.substract_mean_normalize(mean_vals, norm_vals);

ncnn::Extractor ex = net.create_extractor();
ex.set_light_mode(true);

ncnn::Mat img_out;

ex.input("input.1", img_in);
ex.extract("649", img_out);
ncnn::resize_bilinear(img_out, img_out, img_width, img_height);

cv::Mat cv_out(img_out.h, img_out.w, CV_8UC1);
img_out.to_pixels(cv_out.data, ncnn::Mat::PIXEL_GRAY);

cv::imshow("cv_out", cv_out);
cv::waitKey(0);
return 0;
}
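
The program above stays on the CPU (use_vulkan_compute is false). With the Vulkan SDK installed as described earlier, ncnn's GPU backend can be enabled by flipping that option before the model is loaded; a minimal sketch:

// Vulkan backend variant: the option must be set before load_param/load_model.
ncnn::Net net;
net.opt.use_vulkan_compute = true;
net.load_param(param_file.c_str());
net.load_model(model_file.c_str());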