Hanjie's Blog

An alpaca with ideals

For macOS

Building libTorch using CMake [1]

git clone -b master --recurse-submodules https://github.com/pytorch/pytorch.git
cd pytorch
git checkout nightly
mkdir libtorch_build
cd libtorch_build
cmake -D BUILD_SHARED_LIBS:BOOL=ON \
-D CMAKE_BUILD_TYPE:STRING=Release \
-D PYTHON_EXECUTABLE:PATH=`which python3` \
-D BUILD_PYTHON=OFF \
-D USE_CUDA=OFF \
-D USE_PYTORCH_METAL_EXPORT=ON \
-D USE_OPENCV=ON \
-D BUILD_CUSTOM_PROTOBUF=OFF \
-D CMAKE_INSTALL_PREFIX:PATH=../pytorch-install \
..
-- 
-- ******** Summary ********
-- General:
-- CMake version : 3.24.2
-- CMake command : /opt/homebrew/Cellar/cmake/3.24.2/bin/cmake
-- System : Darwin
-- C++ compiler : /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/c++
-- C++ compiler id : AppleClang
-- C++ compiler version : 14.0.0.14000029
-- Using ccache if found : ON
-- Found ccache : CCACHE_PROGRAM-NOTFOUND
-- CXX flags : -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOCUPTI -DLIBKINETO_NOROCTRACER -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DUSE_PYTORCH_METAL_EXPORT -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=braced-scalar-init -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wvla-extension -Wno-range-loop-analysis -Wno-pass-failed -Wsuggest-override -Wno-error=pedantic -Wno-error=old-style-cast -Wconstant-conversion -Wno-invalid-partial-specialization -Wno-unused-private-field -Wno-missing-braces -Wunused-lambda-capture -Qunused-arguments -fcolor-diagnostics -fdiagnostics-color=always -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -DUSE_MPS -fno-objc-arc -Wno-unguarded-availability-new -Wno-unused-private-field -Wno-missing-braces
-- Build type : Release
-- Compile definitions : ONNX_ML=1;ONNXIFI_ENABLE_EXT=1;ONNX_NAMESPACE=onnx_torch;HAVE_MMAP=1;_FILE_OFFSET_BITS=64;HAVE_SHM_OPEN=1;HAVE_SHM_UNLINK=1;USE_EXTERNAL_MZCRC;MINIZ_DISABLE_ZIP_READER_CRC32_CHECKS
-- CMAKE_PREFIX_PATH :
-- CMAKE_INSTALL_PREFIX : ../pytorch-install
-- USE_GOLD_LINKER : OFF
--
-- TORCH_VERSION : 2.1.0
-- CAFFE2_VERSION : 2.1.0
-- BUILD_CAFFE2 : OFF
-- BUILD_CAFFE2_OPS : OFF
-- BUILD_STATIC_RUNTIME_BENCHMARK: OFF
-- BUILD_TENSOREXPR_BENCHMARK: OFF
-- BUILD_NVFUSER_BENCHMARK: OFF
-- BUILD_BINARY : OFF
-- BUILD_CUSTOM_PROTOBUF : OFF
-- Protobuf compiler :
-- Protobuf includes :
-- Protobuf libraries :
-- BUILD_DOCS : OFF
-- BUILD_PYTHON : OFF
-- BUILD_SHARED_LIBS : ON
-- CAFFE2_USE_MSVC_STATIC_RUNTIME : OFF
-- BUILD_TEST : OFF
-- BUILD_JNI : OFF
-- BUILD_MOBILE_AUTOGRAD : OFF
-- BUILD_LITE_INTERPRETER: OFF
-- CROSS_COMPILING_MACOSX :
-- INTERN_BUILD_MOBILE :
-- TRACING_BASED : OFF
-- USE_BLAS : 1
-- BLAS : accelerate
-- BLAS_HAS_SBGEMM :
-- USE_LAPACK : 1
-- LAPACK : accelerate
-- USE_ASAN : OFF
-- USE_TSAN : OFF
-- USE_CPP_CODE_COVERAGE : OFF
-- USE_CUDA : OFF
-- USE_ROCM : OFF
-- BUILD_NVFUSER : OFF
-- USE_EIGEN_FOR_BLAS : ON
-- USE_FBGEMM : OFF
-- USE_FAKELOWP : OFF
-- USE_KINETO : ON
-- USE_FFMPEG : OFF
-- USE_GFLAGS : OFF
-- USE_GLOG : OFF
-- USE_LEVELDB : OFF
-- USE_LITE_PROTO : OFF
-- USE_LMDB : OFF
-- USE_METAL : OFF
-- USE_PYTORCH_METAL : OFF
-- USE_PYTORCH_METAL_EXPORT : ON
-- USE_MPS : ON
-- USE_FFTW : ON
-- USE_MKL : OFF
-- USE_MKLDNN : OFF
-- USE_UCC : OFF
-- USE_ITT : OFF
-- USE_NCCL : OFF
-- USE_NNPACK : ON
-- USE_NUMPY : ON
-- USE_OBSERVERS : OFF
-- USE_OPENCL : OFF
-- USE_OPENCV : ON
-- OpenCV version : 4.7.0
-- USE_OPENMP : OFF
-- USE_TBB : OFF
-- USE_VULKAN : OFF
-- USE_PROF : OFF
-- USE_QNNPACK : OFF
-- USE_PYTORCH_QNNPACK : ON
-- USE_XNNPACK : ON
-- USE_REDIS : OFF
-- USE_ROCKSDB : OFF
-- USE_ZMQ : OFF
-- USE_DISTRIBUTED : OFF
-- Public Dependencies :
-- Private Dependencies : Threads::Threads;pthreadpool;cpuinfo;pytorch_qnnpack;nnpack;XNNPACK;opencv_core;opencv_highgui;opencv_imgproc;opencv_imgcodecs;opencv_optflow;opencv_videoio;opencv_video;fp16;foxi_loader;fmt::fmt-header-only;kineto
-- Public CUDA Deps. :
-- Private CUDA Deps. :
-- USE_COREML_DELEGATE : OFF
-- BUILD_LAZY_TS_BACKEND : ON
-- TORCH_DISABLE_GPU_ASSERTS : OFF
-- Configuring done
-- Generating done
-- Build files have been written to: /Users/luohanjie/Softwares/pytorch/libtorch_build

Note: an existing system protobuf may cause compile errors. [2]
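
If the configure step picks up Homebrew's protobuf and fails, one possible workaround (my own suggestion, not from the linked issue) is to temporarily unlink it while configuring and building:

brew unlink protobuf
# ... run cmake and the build ...
brew link protobuf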

cmake --build . --target install --parallel 20

CMake test program [3]

set(TORCH_SRC /Users/luohanjie/Softwares/pytorch/pytorch-install)

set(TORCH_INCLUDE_DIRS ${TORCH_SRC}/include/torch/csrc/api/include ${TORCH_SRC}/include)
file(GLOB TORCH_LIBS ${TORCH_SRC}/lib/*.dylib ${TORCH_SRC}/lib/*.a)

message(${TORCH_INCLUDE_DIRS})

include_directories(${TORCH_INCLUDE_DIRS} )

add_executable(test_libtorch test_libtorch.cpp)
target_link_libraries(test_libtorch ${TORCH_LIBS})
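
Alternatively, the install tree ships TorchConfig.cmake, so the find_package route from the libtorch docs [3] should also work; a minimal sketch, assuming the same install prefix as above:

list(APPEND CMAKE_PREFIX_PATH ${TORCH_SRC})
find_package(Torch REQUIRED)

add_executable(test_libtorch test_libtorch.cpp)
target_link_libraries(test_libtorch ${TORCH_LIBRARIES})
set_property(TARGET test_libtorch PROPERTY CXX_STANDARD 17)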

Program 1:

#include <torch/torch.h>
#include <iostream>

int main(int argc, char* argv[]) {
    std::cout << "MPS? " << torch::mps::is_available() << std::endl;

    torch::Tensor tensor = torch::rand({2, 3}).to("mps");
    std::cout << tensor << std::endl;
}

Output:

MPS? 1
0.1982 0.2995 0.5541
0.4153 0.2684 0.4655
[ MPSFloatType{2,3} ]

Program 2:

#include <time.h>
#include <torch/torch.h>
#include <iostream>

#define USE_MPS 1

using namespace std;

struct Net : torch::nn::Module {
    Net() {
        conv1 = register_module("conv1", torch::nn::Conv2d(3, 64, 3));
        conv2 = register_module("conv2", torch::nn::Conv2d(64, 128, 3));
        conv3 = register_module("conv3", torch::nn::Conv2d(128, 256, 3));
        fc1 = register_module("fc1", torch::nn::Linear(256, 128));
        fc2 = register_module("fc2", torch::nn::Linear(128, 56));
        fc3 = register_module("fc3", torch::nn::Linear(56, 10));
        global_pool = register_module("global_pool", torch::nn::AdaptiveAvgPool2d(1));
    }

    torch::Tensor forward(torch::Tensor x) {
        x = torch::relu(conv1->forward(x));
        x = torch::max_pool2d(x, {2, 2});
        x = torch::relu(conv2->forward(x));
        x = torch::max_pool2d(x, {2, 2});
        x = torch::relu(conv3->forward(x));
        x = torch::max_pool2d(x, {2, 2});
        x = global_pool->forward(x);
        x = torch::relu(fc1->forward(x.reshape({x.size(0), -1})));
        x = torch::relu(fc2->forward(x));
        x = torch::log_softmax(fc3->forward(x), 1);

        return x;
    }

    torch::nn::Linear fc1{nullptr}, fc2{nullptr}, fc3{nullptr};
    torch::nn::Conv2d conv1{nullptr}, conv2{nullptr}, conv3{nullptr};
    torch::nn::AdaptiveAvgPool2d global_pool{nullptr};
};

int main(int argc, char* argv[]) {
    auto net = std::make_shared<Net>();
    torch::Tensor data = torch::ones({8, 3, 128, 128});

#ifdef USE_MPS
    net->to(torch::Device(torch::kMPS));
    data = data.to("mps");
    // torch::Tensor data = torch::ones({8, 3, 128, 128}).to("mps");
#endif

    torch::Tensor y;
    clock_t start, end;
    start = clock();
    for (int i = 0; i < 100; ++i) {
        y = net->forward(data);
    }
    end = clock();
    cout << "Time: " << double(end - start) / CLOCKS_PER_SEC << endl;

    return 0;
}
Device    Time (s)
CPU       15.36
MPS       0.2671
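
One caveat about the numbers above: clock() measures CPU time, and MPS kernels are dispatched asynchronously, so wall-clock timing with an explicit sync is more trustworthy. A sketch, assuming this build exposes torch::mps::synchronize():

#include <chrono>

// Replace the clock() section of main() with wall-clock timing:
auto t0 = std::chrono::steady_clock::now();
for (int i = 0; i < 100; ++i) {
    y = net->forward(data);
}
torch::mps::synchronize();  // assumption: wait for queued MPS work to finish
auto t1 = std::chrono::steady_clock::now();
cout << "Time: " << std::chrono::duration<double>(t1 - t0).count() << endl;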

Generating TorchScript [4] [5]

A PyTorch model’s journey from Python to C++ is enabled by Torch Script, a representation of a PyTorch model that can be understood, compiled and serialized by the Torch Script compiler.

import torch
import torchvision

# An instance of your model.
model = torchvision.models.resnet18()

# An example input you would normally provide to your model's forward() method.
example = torch.rand(1, 3, 224, 224)

# Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing.
traced_script_module = torch.jit.trace(model, example)

# full JIT version model (not compatible with the mobile interpreter)
traced_script_module.save("traced_resnet_model.pt")

# Export mobile interpreter version model (compatible with mobile interpreter)
from torch.utils.mobile_optimizer import optimize_for_mobile
traced_script_module_lite = optimize_for_mobile(traced_script_module)
traced_script_module_lite._save_for_lite_interpreter("traced_resnet_model_lite.pt")

By default, for the CPU backend, optimize_for_mobile performs the following types of optimizations:

- Conv2D and BatchNorm fusion, which folds Conv2d-BatchNorm2d into Conv2d;
- Insert and fold prepacked ops, which rewrites the model graph to replace 2D convolutions and linear ops with their prepacked counterparts;
- ReLU and hardtanh fusion, which rewrites the graph by finding ReLU/hardtanh ops and fusing them together;
- Dropout removal, which removes dropout nodes from the module when training is false;
- Conv packed params hoisting, which moves convolution packed params to the root module, so that the convolution structs can be deleted. This decreases model size without impacting numerics.

For the Vulkan backend, optimize_for_mobile performs the following type of optimization:

- Automatic GPU transfer, which rewrites the graph so that moving input and output data to and from the GPU becomes part of the model.

Optimization types can be disabled by passing an optimization blocklist as an argument to optimize_for_mobile.
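
For example, a small sketch (assuming the MobileOptimizerType enum re-exported by torch.utils.mobile_optimizer) that keeps the Conv2d-BatchNorm2d folding pass from running:

from torch.utils.mobile_optimizer import optimize_for_mobile, MobileOptimizerType

# Hypothetical example: block one optimization pass, leave the rest enabled
blocklist = {MobileOptimizerType.CONV_BN_FUSION}
traced_script_module_lite = optimize_for_mobile(traced_script_module,
                                                optimization_blocklist=blocklist)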

Loading TorchScript and running inference in C++

#include <torch/script.h> // One-stop header.

#include <iostream>
#include <memory>

int main(int argc, const char* argv[]) {
    if (argc != 2) {
        std::cerr << "usage: example-app <path-to-exported-script-module>\n";
        return -1;
    }

    torch::jit::script::Module module;
    try {
        // Deserialize the ScriptModule from a file using torch::jit::load().
        module = torch::jit::load(argv[1]);
    }
    catch (const c10::Error& e) {
        std::cerr << "error loading the model\n";
        return -1;
    }

    std::cout << "ok\n";
}
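
To actually run inference with the loaded module, the pattern from the same tutorial [4] applies; a sketch, assuming the ResNet18 traced above with a 1x3x224x224 input:

// Build the input batch and execute the ScriptModule.
std::vector<torch::jit::IValue> inputs;
inputs.push_back(torch::ones({1, 3, 224, 224}));

at::Tensor output = module.forward(inputs).toTensor();
std::cout << output.slice(/*dim=*/1, /*start=*/0, /*end=*/5) << '\n';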

PyTorch to ONNX

conda install onnx
import torch
import torchvision

dummy_input = torch.randn(10, 3, 224, 224, device="cpu")
model = torchvision.models.alexnet(pretrained=True).cpu()

# Providing input and output names sets the display names for values
# within the model's graph. Setting these does not change the semantics
# of the graph; it is only for readability.
#
# The inputs to the network consist of the flat list of inputs (i.e.
# the values you would pass to the forward() method) followed by the
# flat list of parameters. You can partially specify names, i.e. provide
# a list here shorter than the number of inputs to the model, and we will
# only set that subset of names, starting from the beginning.
input_names = [ "actual_input_1" ] + [ "learned_%d" % i for i in range(16) ]
output_names = [ "output1" ]

torch.onnx.export(model, dummy_input, "alexnet.onnx", verbose=True, input_names=input_names, output_names=output_names)
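
The export above fixes the batch size at 10; if a variable batch is needed, torch.onnx.export accepts a dynamic_axes mapping (a sketch reusing the names defined above):

# Mark dim 0 of the input and output as dynamic ("batch_size" is just a label)
torch.onnx.export(model, dummy_input, "alexnet_dynamic.onnx",
                  input_names=input_names, output_names=output_names,
                  dynamic_axes={"actual_input_1": {0: "batch_size"},
                                "output1": {0: "batch_size"}})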

Test:

import onnx
import onnxruntime as ort
import numpy as np

# Load the ONNX model
model = onnx.load("alexnet.onnx")

# Check that the model is well formed
onnx.checker.check_model(model)

# Print a human readable representation of the graph
print(onnx.helper.printable_graph(model.graph))


ort_session = ort.InferenceSession("alexnet.onnx")

outputs = ort_session.run(
    None,
    {"actual_input_1": np.random.randn(10, 3, 224, 224).astype(np.float32)},
)
print(outputs[0])

For Android

Cross Compiling for Android NDK

Modify /Users/luohanjie/Softwares/pytorch/scripts/build_android.sh:

CMAKE_ARGS+=("-DBUILD_SHARED_LIBS=ON")
brew install automake libtool

cd pytorch
git checkout nightly
export ANDROID_NDK=/Users/luohanjie/Library/Android/sdk/ndk/21.4.7075529
export ANDROID_TOOLCHAIN=clang
export ANDROID_ABI=arm64-v8a
export BUILD_SHARED_LIBS=ON
export PYTHON_EXECUTABLE=`which python3`
sh scripts/build_android.sh
-- ******** Summary ********
-- General:
-- CMake version : 3.24.2
-- CMake command : /opt/homebrew/Cellar/cmake/3.24.2/bin/cmake
-- System : Android
-- C++ compiler : /Users/luohanjie/Library/Android/sdk/ndk/21.4.7075529/toolchains/llvm/prebuilt/darwin-x86_64/bin/clang++
-- C++ compiler id : Clang
-- C++ compiler version : 9.0
-- Using ccache if found : ON
-- Found ccache : CCACHE_PROGRAM-NOTFOUND
-- CXX flags : -g -DANDROID -fdata-sections -ffunction-sections -funwind-tables -fstack-protector-strong -no-canonical-prefixes -D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -frtti -fexceptions -ffunction-sections -fdata-sections -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DUSE_VULKAN_WRAPPER -DUSE_KINETO -DLIBKINETO_NOCUPTI -DLIBKINETO_NOROCTRACER -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DUSE_VULKAN -DUSE_VULKAN_API -DBUILD_LITE_INTERPRETER -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DEDGE_PROFILER_USE_KINETO -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=braced-scalar-init -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wvla-extension -Wno-range-loop-analysis -Wno-pass-failed -Wno-error=pedantic -Wno-error=old-style-cast -Wconstant-conversion -Wno-invalid-partial-specialization -Wno-unused-private-field -Wno-missing-braces -Wunused-lambda-capture -Qunused-arguments -fcolor-diagnostics -fdiagnostics-color=always -fno-math-errno -fno-trapping-math -Werror=format -g0
-- Build type : Release
-- Compile definitions :
-- CMAKE_PREFIX_PATH : /opt/homebrew/Caskroom/miniforge/base/envs/tf/lib/python3.10/site-packages;/Users/luohanjie/Library/Android/sdk/ndk/21.4.7075529/toolchains/llvm/prebuilt/darwin-x86_64
-- CMAKE_INSTALL_PREFIX : /Users/luohanjie/Softwares/pytorch/build_android/install
-- USE_GOLD_LINKER : OFF
--
-- TORCH_VERSION : 2.1.0
-- CAFFE2_VERSION : 2.1.0
-- BUILD_CAFFE2 : OFF
-- BUILD_CAFFE2_OPS : OFF
-- BUILD_STATIC_RUNTIME_BENCHMARK: OFF
-- BUILD_TENSOREXPR_BENCHMARK: OFF
-- BUILD_NVFUSER_BENCHMARK: OFF
-- BUILD_BINARY : OFF
-- BUILD_CUSTOM_PROTOBUF : OFF
-- Protobuf compiler :
-- Protobuf includes :
-- Protobuf libraries :
-- BUILD_DOCS : OFF
-- BUILD_PYTHON : OFF
-- BUILD_SHARED_LIBS : ON
-- CAFFE2_USE_MSVC_STATIC_RUNTIME : OFF
-- BUILD_TEST : OFF
-- BUILD_JNI : OFF
-- BUILD_MOBILE_AUTOGRAD : OFF
-- BUILD_LITE_INTERPRETER: ON
-- INTERN_BUILD_MOBILE : ON
-- TRACING_BASED : OFF
-- USE_BLAS : 1
-- BLAS :
-- BLAS_HAS_SBGEMM :
-- USE_LAPACK : 0
-- USE_ASAN : OFF
-- USE_TSAN : OFF
-- USE_CPP_CODE_COVERAGE : OFF
-- USE_CUDA : OFF
-- USE_ROCM : OFF
-- BUILD_NVFUSER : OFF
-- USE_EIGEN_FOR_BLAS : ON
-- USE_FBGEMM : OFF
-- USE_FAKELOWP : OFF
-- USE_KINETO : ON
-- USE_FFMPEG : OFF
-- USE_GFLAGS : OFF
-- USE_GLOG : OFF
-- USE_LEVELDB : OFF
-- USE_LITE_PROTO : OFF
-- USE_LMDB : OFF
-- USE_METAL : OFF
-- USE_PYTORCH_METAL : OFF
-- USE_PYTORCH_METAL_EXPORT : OFF
-- USE_MPS : OFF
-- USE_FFTW : OFF
-- USE_MKL :
-- USE_MKLDNN : OFF
-- USE_UCC : OFF
-- USE_ITT : OFF
-- USE_NCCL : OFF
-- USE_NNPACK : ON
-- USE_NUMPY : ON
-- USE_OBSERVERS : OFF
-- USE_OPENCL : OFF
-- USE_OPENCV : OFF
-- USE_OPENMP : OFF
-- USE_TBB : OFF
-- USE_VULKAN : ON
-- USE_VULKAN_FP16_INFERENCE : OFF
-- USE_VULKAN_RELAXED_PRECISION : OFF
-- USE_PROF : OFF
-- USE_QNNPACK : OFF
-- USE_PYTORCH_QNNPACK : ON
-- USE_XNNPACK : ON
-- USE_REDIS : OFF
-- USE_ROCKSDB : OFF
-- USE_ZMQ : OFF
-- USE_DISTRIBUTED : OFF
-- Public Dependencies :
-- Private Dependencies : Threads::Threads;eigen_blas;pthreadpool;cpuinfo;pytorch_qnnpack;nnpack;XNNPACK;VulkanWrapper;fp16;log;fmt::fmt-header-only;kineto;dl
-- Public CUDA Deps. :
-- Private CUDA Deps. :
-- USE_COREML_DELEGATE : OFF
-- BUILD_LAZY_TS_BACKEND : OFF
-- TORCH_DISABLE_GPU_ASSERTS : OFF
-- Configuring done
-- Generating done
-- Build files have been written to: /Users/luohanjie/Softwares/pytorch/build_android

Building PyTorch with Vulkan [6]

Build PyTorch

PyTorch supports running model inference on GPUs that support the Vulkan graphics and compute API. The primary target devices are mobile GPUs on Android devices. The Vulkan backend is not included by default; the main switch to include it is the CMake option USE_VULKAN, which can be set via the environment variable USE_VULKAN. To use PyTorch with the Vulkan backend, we need to build it from source with additional settings.

Download the Vulkan SDK, then double-click and install it.

To uninstall: sudo path_to_vulkan_sdk/uninstall.sh

vulkaninfo

==========
VULKANINFO
==========

Vulkan Instance Version: 1.3.239
...

Build PyTorch with Vulkan:

conda create --name pytorch_vulkan python=3.10
conda activate pytorch_vulkan

conda install pkg-config libuv pyyaml typing-extensions sympy
brew install google-benchmark

cd PYTORCH_ROOT
git checkout nightly
git submodule update --recursive --remote

Modify CMakeLists.txt:

option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" OFF)

option(USE_SYSTEM_BENCHMARK "Use system-provided google benchmark." ON)

# Add:
string(APPEND CMAKE_CXX_FLAGS " -D_LIBCPP_DISABLE_AVAILABILITY")

Modify c10/CMakeLists.txt:

#add_subdirectory(benchmark)
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
USE_VULKAN=1 USE_VULKAN_SHADERC_RUNTIME=1 USE_VULKAN_WRAPPER=0 MACOSX_DEPLOYMENT_TARGET=10.11 CC=clang CXX=clang++ python setup.py install


-- ******** Summary ********
-- General:
-- CMake version : 3.24.2
-- CMake command : /opt/homebrew/Cellar/cmake/3.24.2/bin/cmake
-- System : Darwin
-- C++ compiler : /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang++
-- C++ compiler id : AppleClang
-- C++ compiler version : 14.0.0.14000029
-- Using ccache if found : ON
-- Found ccache : CCACHE_PROGRAM-NOTFOUND
-- CXX flags : -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOCUPTI -DLIBKINETO_NOROCTRACER -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DUSE_VULKAN -DUSE_VULKAN_API -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -D_LIBCPP_DISABLE_AVAILABILITY -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=braced-scalar-init -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wvla-extension -Wno-range-loop-analysis -Wno-pass-failed -Wsuggest-override -Wno-error=pedantic -Wno-error=old-style-cast -Wconstant-conversion -Wno-invalid-partial-specialization -Wno-unused-private-field -Wno-missing-braces -Wunused-lambda-capture -Qunused-arguments -fcolor-diagnostics -fdiagnostics-color=always -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -DUSE_MPS -fno-objc-arc -Wno-unguarded-availability-new -Wno-unused-private-field -Wno-missing-braces
-- Build type : Release
-- Compile definitions : ONNX_ML=1;ONNXIFI_ENABLE_EXT=1;ONNX_NAMESPACE=onnx_torch;HAVE_MMAP=1;_FILE_OFFSET_BITS=64;HAVE_SHM_OPEN=1;HAVE_SHM_UNLINK=1;USE_EXTERNAL_MZCRC;MINIZ_DISABLE_ZIP_READER_CRC32_CHECKS
-- CMAKE_PREFIX_PATH : /opt/homebrew/Caskroom/miniforge/base/envs/pytorch_vulkan/lib/python3.10/site-packages;/opt/homebrew/Caskroom/miniforge/base/envs/pytorch_vulkan
-- CMAKE_INSTALL_PREFIX : /Users/luohanjie/Softwares/pytorch/torch
-- USE_GOLD_LINKER : OFF
--
-- TORCH_VERSION : 2.1.0
-- CAFFE2_VERSION : 2.1.0
-- BUILD_CAFFE2 : OFF
-- BUILD_CAFFE2_OPS : OFF
-- BUILD_STATIC_RUNTIME_BENCHMARK: OFF
-- BUILD_TENSOREXPR_BENCHMARK: OFF
-- BUILD_NVFUSER_BENCHMARK: OFF
-- BUILD_BINARY : OFF
-- BUILD_CUSTOM_PROTOBUF : OFF
-- Protobuf compiler :
-- Protobuf includes :
-- Protobuf libraries :
-- BUILD_DOCS : OFF
-- BUILD_PYTHON : True
-- Python version : 3.10.9
-- Python executable : /opt/homebrew/Caskroom/miniforge/base/envs/pytorch_vulkan/bin/python3
-- Pythonlibs version : 3.10.9
-- Python library : /opt/homebrew/Caskroom/miniforge/base/envs/pytorch_vulkan/lib/libpython3.10.a
-- Python includes : /opt/homebrew/Caskroom/miniforge/base/envs/pytorch_vulkan/include/python3.10
-- Python site-packages: lib/python3.10/site-packages
-- BUILD_SHARED_LIBS : ON
-- CAFFE2_USE_MSVC_STATIC_RUNTIME : OFF
-- BUILD_TEST : True
-- BUILD_JNI : OFF
-- BUILD_MOBILE_AUTOGRAD : OFF
-- BUILD_LITE_INTERPRETER: OFF
-- CROSS_COMPILING_MACOSX :
-- INTERN_BUILD_MOBILE :
-- TRACING_BASED : OFF
-- USE_BLAS : 1
-- BLAS : accelerate
-- BLAS_HAS_SBGEMM :
-- USE_LAPACK : 1
-- LAPACK : accelerate
-- USE_ASAN : OFF
-- USE_TSAN : OFF
-- USE_CPP_CODE_COVERAGE : OFF
-- USE_CUDA : OFF
-- USE_ROCM : OFF
-- BUILD_NVFUSER : OFF
-- USE_EIGEN_FOR_BLAS : ON
-- USE_FBGEMM : OFF
-- USE_FAKELOWP : OFF
-- USE_KINETO : ON
-- USE_FFMPEG : OFF
-- USE_GFLAGS : OFF
-- USE_GLOG : OFF
-- USE_LEVELDB : OFF
-- USE_LITE_PROTO : OFF
-- USE_LMDB : OFF
-- USE_METAL : OFF
-- USE_PYTORCH_METAL : OFF
-- USE_PYTORCH_METAL_EXPORT : OFF
-- USE_MPS : ON
-- USE_FFTW : ON
-- USE_MKL : OFF
-- USE_MKLDNN : OFF
-- USE_UCC : OFF
-- USE_ITT : OFF
-- USE_NCCL : OFF
-- USE_NNPACK : ON
-- USE_NUMPY : OFF
-- USE_OBSERVERS : ON
-- USE_OPENCL : OFF
-- USE_OPENCV : OFF
-- USE_OPENMP : OFF
-- USE_TBB : OFF
-- USE_VULKAN : 1
-- USE_VULKAN_FP16_INFERENCE : OFF
-- USE_VULKAN_RELAXED_PRECISION : OFF
-- USE_PROF : OFF
-- USE_QNNPACK : OFF
-- USE_PYTORCH_QNNPACK : ON
-- USE_XNNPACK : ON
-- USE_REDIS : OFF
-- USE_ROCKSDB : OFF
-- USE_ZMQ : OFF
-- USE_DISTRIBUTED : OFF
-- Public Dependencies :
-- Private Dependencies : Threads::Threads;pthreadpool;cpuinfo;pytorch_qnnpack;nnpack;XNNPACK;/usr/local/lib/libvulkan.dylib;fp16;foxi_loader;fmt::fmt-header-only;kineto
-- Public CUDA Deps. :
-- Private CUDA Deps. :
-- USE_COREML_DELEGATE : OFF
-- BUILD_LAZY_TS_BACKEND : ON
-- TORCH_DISABLE_GPU_ASSERTS : OFF
-- Configuring done
-- Generating done
import torch
print(torch.__version__)

2.1.0a0+git517a432
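
To confirm the Vulkan backend actually made it into this build, torch also exposes an availability check (an extra verification step, not in the original log):

import torch
print(torch.is_vulkan_available())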

Generating TorchScript

import torch
import torchvision

# An instance of your model.
model = torchvision.models.resnet18()

# An example input you would normally provide to your model's forward() method.
example = torch.rand(1, 3, 224, 224)

# Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing.
traced_script_module = torch.jit.trace(model, example)

# Export mobile interpreter version model (compatible with mobile interpreter)
from torch.utils.mobile_optimizer import optimize_for_mobile
traced_script_module_vulkan = optimize_for_mobile(traced_script_module, backend='vulkan')
traced_script_module_vulkan._save_for_lite_interpreter("traced_script_module_vulkan.pt")

If you see the error message: PytorchStreamReader failed locating file bytecode.pkl: file not found (), likely you are using a torch script model that requires the use of the PyTorch JIT interpreter (a version of our PyTorch interpreter that is not as size-efficient). In order to leverage our efficient interpreter, please regenerate the model by running: module._save_for_lite_interpreter(${model_path}).

If bytecode.pkl is missing, the model was likely generated with the api: module.save(${model_path}).

The api _load_for_lite_interpreter(${model_path}) can be helpful to validate the model with the efficient mobile interpreter.
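
A quick way to run that validation from Python (a sketch using the torch.jit.mobile helper):

from torch.jit.mobile import _load_for_lite_interpreter

# Raises if the file lacks bytecode.pkl (i.e. it was saved with module.save())
m = _load_for_lite_interpreter("traced_script_module_vulkan.pt")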


1. https://github.com/pytorch/pytorch/blob/master/docs/libtorch.rst
2. https://github.com/pytorch/pytorch/issues/64645
3. https://pytorch.org/cppdocs/installing.html
4. https://pytorch.org/tutorials/advanced/cpp_export.html
5. https://pytorch.org/tutorials/recipes/script_optimized.html
6. https://pytorch.org/tutorials/prototype/vulkan_workflow.html

Building TensorFlow Lite
brew install abseil google-benchmark
git clone https://github.com/tensorflow/tensorflow.git tensorflow_src
cd tensorflow_src
git checkout v2.9.3

Newer versions may fail to compile, or have problems when invoking the GPU.

In tensorflow/lite/c/CMakeLists.txt, change common.c to common.cc. [1]
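
A one-line way to apply that rename (a convenience sketch, not from the original post; double-check the result):

# Hypothetical one-liner; [[:>:]] is BSD sed's end-of-word marker
sed -i '' 's/common\.c[[:>:]]/common.cc/' tensorflow/lite/c/CMakeLists.txt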

mkdir build_mac
cd build_mac
cmake ../tensorflow/lite/c -D TFLITE_KERNEL_TEST=ON -D TFLITE_ENABLE_GPU=ON -D ABSL_PROPAGATE_CXX_STD=ON -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -D LIBRARY_OUTPUT_PATH=/Users/luohanjie/Softwares/tensorflow_src/build_mac/lib
cmake --build . -j
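
The build above produces the TensorFlow Lite C library; a minimal inference sketch against its C API (the model path and tensor sizes are placeholders, and error checking is omitted):

#include "tensorflow/lite/c/c_api.h"
#include <stdio.h>

int main(void) {
    // Placeholder model path; adjust to your own .tflite file
    TfLiteModel* model = TfLiteModelCreateFromFile("model_opt.tflite");
    TfLiteInterpreterOptions* options = TfLiteInterpreterOptionsCreate();
    TfLiteInterpreterOptionsSetNumThreads(options, 4);

    TfLiteInterpreter* interpreter = TfLiteInterpreterCreate(model, options);
    TfLiteInterpreterAllocateTensors(interpreter);

    // Fill the first input tensor (assumes a float32 model; sizes are illustrative)
    static float input_data[256 * 256 * 3];
    TfLiteTensor* input = TfLiteInterpreterGetInputTensor(interpreter, 0);
    TfLiteTensorCopyFromBuffer(input, input_data, sizeof(input_data));

    TfLiteInterpreterInvoke(interpreter);

    static float output_data[256 * 256];
    const TfLiteTensor* output = TfLiteInterpreterGetOutputTensor(interpreter, 0);
    TfLiteTensorCopyToBuffer(output, output_data, sizeof(output_data));
    printf("first output value: %f\n", output_data[0]);

    TfLiteInterpreterDelete(interpreter);
    TfLiteInterpreterOptionsDelete(options);
    TfLiteModelDelete(model);
    return 0;
}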

Build the benchmark_model test program and benchmark the model model_opt.tflite on the CPU:

cmake --build . -j -t benchmark_model
./tensorflow-lite/tools/benchmark/benchmark_model --graph=/Users/luohanjie/Workspace/Vision/my_slam/data/models/model_opt.tflite --verbose=true --num_threads=4 --use_gpu=false

STARTING!
Log parameter values verbosely: [1]
Min num runs: [50]
Min runs duration (seconds): [1]
Max runs duration (seconds): [150]
Inter-run delay (seconds): [-1]
Number of prorated runs per second: [-1]
Num threads: [4]
Use caching: [0]
Benchmark name: []
Output prefix: []
Min warmup runs: [1]
Min warmup runs duration (seconds): [0.5]
Run w/o invoking kernels: [0]
Report the peak memory footprint: [0]
Memory footprint check interval (ms): [50]
Graph: [/Users/luohanjie/Workspace/Vision/my_slam/data/models/model_opt.tflite]
Input layers: []
Input shapes: []
Input value ranges: []
Input value files: []
Allow fp16: [0]
Require full delegation: [0]
Enable op profiling: [0]
Max initial profiling buffer entries: [1024]
Allow dynamic increase on profiling buffer entries: [0]
CSV File to export profiling data to: []
Print pre-invoke interpreter state: [0]
Print post-invoke interpreter state: [0]
Release dynamic tensor memory: [0]
Use dynamic tensor for large tensors: [0]
print out all supported flags: [0]
#threads used for CPU inference: [4]
Max number of delegated partitions: [0]
Min nodes per partition: [0]
Directory for delegate serialization: []
Model-specific token/key for delegate serialization.: []
Use xnnpack: [0]
External delegate path: []
External delegate options: []
Use gpu: [0]
Allow lower precision in gpu: [1]
Enable running quant models in gpu: [1]
Prefer maximizing the throughput in gpu: [0]
GPU backend: []
Loaded model /Users/luohanjie/Workspace/Vision/my_slam/data/models/model_opt.tflite
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
The input model file size (MB): 66.3383
Initialized session in 41.498ms.
Running benchmark for at least 1 iterations and at least 0.5 seconds but terminate if exceeding 150 seconds.
count=13 first=43827 curr=38759 min=38662 max=45293 avg=39973.3 std=1998

Running benchmark for at least 50 iterations and at least 1 seconds but terminate if exceeding 150 seconds.
count=50 first=39240 curr=38747 min=38470 max=40766 avg=39654.3 std=635

Inference timings in us: Init: 41498, First inference: 43827, Warmup (avg): 39973.3, Inference (avg): 39654.3

Using the GPU:

./tensorflow-lite/tools/benchmark/benchmark_model --graph=/Users/luohanjie/Workspace/Vision/my_slam/data/models/model_opt.tflite --verbose=true --num_threads=4 --use_gpu=true

STARTING!
Log parameter values verbosely: [1]
Min num runs: [50]
Min runs duration (seconds): [1]
Max runs duration (seconds): [150]
Inter-run delay (seconds): [-1]
Number of prorated runs per second: [-1]
Num threads: [4]
Use caching: [0]
Benchmark name: []
Output prefix: []
Min warmup runs: [1]
Min warmup runs duration (seconds): [0.5]
Run w/o invoking kernels: [0]
Report the peak memory footprint: [0]
Memory footprint check interval (ms): [50]
Graph: [/Users/luohanjie/Workspace/Vision/my_slam/data/models/model_opt.tflite]
Input layers: []
Input shapes: []
Input value ranges: []
Input value files: []
Allow fp16: [0]
Require full delegation: [0]
Enable op profiling: [0]
Max initial profiling buffer entries: [1024]
Allow dynamic increase on profiling buffer entries: [0]
CSV File to export profiling data to: []
Print pre-invoke interpreter state: [0]
Print post-invoke interpreter state: [0]
Release dynamic tensor memory: [0]
Use dynamic tensor for large tensors: [0]
print out all supported flags: [0]
#threads used for CPU inference: [4]
Max number of delegated partitions: [0]
Min nodes per partition: [0]
Directory for delegate serialization: []
Model-specific token/key for delegate serialization.: []
Use xnnpack: [0]
External delegate path: []
External delegate options: []
Use gpu: [1]
Allow lower precision in gpu: [1]
Enable running quant models in gpu: [1]
Prefer maximizing the throughput in gpu: [0]
GPU backend: []
Loaded model /Users/luohanjie/Workspace/Vision/my_slam/data/models/model_opt.tflite
INFO: Created TensorFlow Lite delegate for GPU.
GPU delegate created.
INFO: Initialized OpenCL-based API.
INFO: Created 1 GPU delegate kernels.
Explicitly applied GPU delegate, and the model graph will be completely executed by the delegate.
The input model file size (MB): 66.3383
Initialized session in 129.521ms.
Running benchmark for at least 1 iterations and at least 0.5 seconds but terminate if exceeding 150 seconds.
count=40 first=40053 curr=11752 min=11744 max=40053 avg=12579.9 std=4400

Running benchmark for at least 50 iterations and at least 1 seconds but terminate if exceeding 150 seconds.
count=85 first=11880 curr=11836 min=11567 max=12276 avg=11839.5 std=93

Inference timings in us: Init: 129521, First inference: 40053, Warmup (avg): 12579.9, Inference (avg): 11839.5

1. https://github.com/tensorflow/tensorflow/pull/54566

Installing ncnn

Download the Vulkan SDK, then double-click and install it.

To uninstall: sudo path_to_vulkan_sdk/uninstall.sh

brew install ncnn

CMake test program

# OpenMP flags for macOS
if (APPLE)
    if (CMAKE_C_COMPILER_ID MATCHES "Clang")
        set(OpenMP_C "${CMAKE_C_COMPILER}")
        set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -Wno-unused-command-line-argument -I/opt/homebrew/opt/libomp/include")
        set(OpenMP_C_LIB_NAMES "libomp")
        set(OpenMP_libomp_LIBRARY "/opt/homebrew/opt/libomp/lib/libomp.dylib")
    endif ()
    if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
        set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
        set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -Wno-unused-command-line-argument -I/opt/homebrew/opt/libomp/include")
        set(OpenMP_CXX_LIB_NAMES "libomp")
        set(OpenMP_libomp_LIBRARY "/opt/homebrew/opt/libomp/lib/libomp.dylib")
    endif ()
endif ()

set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")

find_package(ncnn REQUIRED)
find_package(OpenCV REQUIRED)

include_directories(${ncnn_INCLUDE} ${OpenCV_INCLUDE_DIRS})

add_executable(test_ncnn test_ncnn.cpp)
target_link_libraries(test_ncnn ncnn ${OpenCV_LIBS})

The test network is midas_v21_small-int8:

#include "net.h"
#include "mat.h"
#include "cpu.h"
#include <opencv2/opencv.hpp>
#include <sys/time.h>

int main(int argc, char* argv[]) {
std::string img_file = "/Users/luohanjie/Workspace/Vision/depth_estimation/MiDaS/input/squirrel_iphone_sample3.png";
std::string param_file = "/Users/luohanjie/Workspace/Vision/my_slam/data/models/midas_v21_small-int8.param";
std::string model_file = "/Users/luohanjie/Workspace/Vision/my_slam/data/models/midas_v21_small-int8.bin";
int target_size = 256;
float scale = 0.33333f;

cv::Mat img = cv::imread(img_file);
cv::resize(img, img, cv::Size(), scale, scale);

int img_width = img.cols;
int img_height = img.rows;

ncnn::Net net;
ncnn::set_cpu_powersave(0); // 0 = all cores enabled(default)
ncnn::set_omp_num_threads(ncnn::get_cpu_count());
net.opt = ncnn::Option();
net.opt.use_vulkan_compute = false;
net.opt.num_threads = ncnn::get_cpu_count();

net.load_param(param_file.c_str());
net.load_model(model_file.c_str());

// https://github.com/Tencent/ncnn/blob/master/docs/how-to-use-and-FAQ/use-ncnn-with-opencv.md
// cv::Mat CV_8UC3 -> ncnn::Mat 3 channel + swap RGB/BGR
ncnn::Mat img_in = ncnn::Mat::from_pixels_resize(img.data, ncnn::Mat::PIXEL_BGR2RGB, img_width, img_height, target_size, target_size);

// substract_mean_normalize(const float* mean_vals, const float* norm_vals): substract channel-wise mean values, then multiply by normalize values, pass 0 to skip in ncnn.
const float mean_vals[3] = {123.675f, 116.28f, 103.53f};
const float norm_vals[3] = {0.01712475383f, 0.0175070028f, 0.01742919389f};
img_in.substract_mean_normalize(mean_vals, norm_vals);

ncnn::Extractor ex = net.create_extractor();
ex.set_light_mode(true);

ncnn::Mat img_out;

ex.input("input.1", img_in);
ex.extract("649", img_out);
ncnn::resize_bilinear(img_out, img_out, img_width, img_height);

cv::Mat cv_out(img_out.h, img_out.w, CV_8UC1);
img_out.to_pixels(cv_out.data, ncnn::Mat::PIXEL_GRAY);

cv::imshow("cv_out", cv_out);
cv::waitKey(0);
return 0;
}
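
Since the Vulkan SDK is installed above, the same program can target the GPU by flipping one option before load_param/load_model (assuming the Homebrew ncnn package was built with Vulkan enabled):

// Run inference through ncnn's Vulkan compute path instead of the CPU
net.opt.use_vulkan_compute = true;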