Skip to content

编译 NCNN

1. 编译 CMake

截至 2023-02-02,NCNN 编译要求 CMake 版本大于 3.13,而 Ubuntu 18.04 的软件源即将停止维护,其中的 CMake 将停留在 3.10 版本,所以我们需要从源码编译安装 CMake。

卸载 CMake:

bash
apt-get remove cmake

安装 OpenSSL 库:

bash
sudo apt-get install openssl libssl-dev

在 CMake 官方下载页面(https://cmake.org/download/)找到最新版本,下载源代码:

bash
wget https://github.com/Kitware/CMake/releases/download/v3.25.2/cmake-3.25.2.tar.gz
tar xzvf cmake-3.25.2.tar.gz
cd cmake-3.25.2

编译安装:

bash
./bootstrap
make -j $(nproc)
sudo make install

下面测试 CMake 版本:

bash
cmake --version

2. 编译 NCNN

2.1 正常编译

克隆仓库,如果克隆失败建议使用代理:

bash
git clone https://github.com/Tencent/ncnn.git
cd ncnn
git submodule update --init

现在准备编译:

bash
mkdir -p build
cd build
cmake \
    -DCMAKE_BUILD_TYPE=Release \
    -DCMAKE_TOOLCHAIN_FILE=../toolchains/jetson.toolchain.cmake \
    -DNCNN_VULKAN=ON \
    -DNCNN_SIMPLEOCV=ON \
    -DNCNN_BUILD_EXAMPLES=ON \
    ..
make -j $(nproc)

CUDA 错误

TODO:未指定 -DNCNN_SIMPLEOCV=ON 时将使用系统的 OpenCV,如果系统的 OpenCV 包含 CUDA 支持,则还需要指定 CUDA 的位置信息。但尝试了各种方法后仍然无法编译通过,始终报错找不到 CUDA 库信息。当前使用的是 CUDA 10.2。

2.2 使用 Clang 套件

需要编辑 toolchains/jetson.toolchain.cmake,将下列几行注释掉,以禁用默认的 GCC 套件:

cmake
# set(CMAKE_C_COMPILER "aarch64-linux-gnu-gcc")
# set(CMAKE_CXX_COMPILER "aarch64-linux-gnu-g++")

# set ${CMAKE_C_FLAGS} and ${CMAKE_CXX_FLAGS}flag for cross-compiled process
# -march=armv8-a could work on Jetson, but will compile without some extra cpu features
# set(CMAKE_C_FLAGS "-march=native ${CMAKE_C_FLAGS}")
# set(CMAKE_CXX_FLAGS "-march=native ${CMAKE_CXX_FLAGS}")

下面使用 Clang 套件编译:

bash
cmake \
    -DCMAKE_CXX_COMPILER=$(which clang++) \
    -DCMAKE_C_COMPILER=$(which clang) \
    -DCMAKE_LINKER=$(which lld) \
    -DCMAKE_BUILD_TYPE=Release \
    -DCMAKE_TOOLCHAIN_FILE=../toolchains/jetson.toolchain.cmake \
    -DNCNN_VULKAN=ON \
    -DNCNN_SIMPLEOCV=ON \
    -DNCNN_BUILD_EXAMPLES=ON \
    ..
make -j $(nproc)

段错误

TODO:该测试使用 Clang 套件编译,并启用了 Vulkan 支持。运行任何示例程序时,程序在输出结果后都无法正常退出,而是触发段错误:

bash
cd ../examples
../build/examples/squeezenet ../images/128-ncnn.png

输出:

log
[0 NVIDIA Tegra X1 (nvgpu)]  queueC=0[16]  queueG=0[16]  queueT=0[16]
[0 NVIDIA Tegra X1 (nvgpu)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 NVIDIA Tegra X1 (nvgpu)]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 NVIDIA Tegra X1 (nvgpu)]  subgroup=32  basic=1  vote=1  ballot=1  shuffle=1
532 = 0.138184
920 = 0.061310
598 = 0.049622
Segmentation fault (core dumped)

该问题暂时未解决。猜测与 Vulkan 有关,因为未启用 Vulkan 的 CPU 版本没有出现此错误。

3. 基准测试

bash
cd ../benchmark
../build/benchmark/benchncnn 10 $(nproc) 0 0

含 Vulkan 的版本在 Jetson Nano(B01)上的测试结果如下(2023-02-02):

log
[0 NVIDIA Tegra X1 (nvgpu)]  queueC=0[16]  queueG=0[16]  queueT=0[16]
[0 NVIDIA Tegra X1 (nvgpu)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 NVIDIA Tegra X1 (nvgpu)]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 NVIDIA Tegra X1 (nvgpu)]  subgroup=32  basic=1  vote=1  ballot=1  shuffle=1
loop_count = 10
num_threads = 4
powersave = 0
gpu_device = 0
cooling_down = 1
          squeezenet  min =    9.32  max =    9.46  avg =    9.37
     squeezenet_int8  min =   25.15  max =   25.30  avg =   25.23
           mobilenet  min =   15.72  max =   15.86  avg =   15.79
      mobilenet_int8  min =   27.42  max =   27.72  avg =   27.57
        mobilenet_v2  min =   12.43  max =   12.53  avg =   12.48
        mobilenet_v3  min =   13.13  max =   13.26  avg =   13.19
          shufflenet  min =   10.93  max =   11.01  avg =   10.96
       shufflenet_v2  min =    9.82  max =   10.01  avg =    9.86
             mnasnet  min =   13.01  max =   13.25  avg =   13.07
     proxylessnasnet  min =   14.27  max =   14.41  avg =   14.32
     efficientnet_b0  min =   20.98  max =   21.22  avg =   21.09
   efficientnetv2_b0  min =   38.90  max =   39.22  avg =   39.02
        regnety_400m  min =   17.12  max =   17.30  avg =   17.19
           blazeface  min =    3.24  max =    3.29  avg =    3.26
           googlenet  min =   30.27  max =   30.49  avg =   30.34
      googlenet_int8  min =   79.61  max =   80.18  avg =   79.85
            resnet18  min =   27.47  max =   27.57  avg =   27.52
       resnet18_int8  min =   59.07  max =   59.74  avg =   59.40
             alexnet  min =   48.73  max =   49.26  avg =   48.96
               vgg16  min =  142.72  max =  144.31  avg =  143.05
          vgg16_int8  min =  286.85  max =  288.51  avg =  287.74
            resnet50  min =   71.55  max =   71.96  avg =   71.71
       resnet50_int8  min =  148.30  max =  149.12  avg =  148.60
      squeezenet_ssd  min =   32.92  max =   33.23  avg =   33.05
 squeezenet_ssd_int8  min =   65.86  max =   66.57  avg =   66.14
       mobilenet_ssd  min =   34.70  max =   35.01  avg =   34.86
  mobilenet_ssd_int8  min =   56.45  max =   56.75  avg =   56.63
      mobilenet_yolo  min =   70.04  max =   70.67  avg =   70.39
  mobilenetv2_yolov3  min =   45.47  max =   45.76  avg =   45.61
         yolov4-tiny  min =   66.89  max =   67.40  avg =   67.12
           nanodet_m  min =   18.73  max =   18.88  avg =   18.81
    yolo-fastest-1.1  min =   11.22  max =   11.39  avg =   11.30
      yolo-fastestv2  min =    9.76  max =    9.89  avg =    9.81
  vision_transformer  min = 2682.28  max = 2697.72  avg = 2689.92
          FastestDet  min =    9.94  max =   10.10  avg =   10.03

不含 Vulkan 的 CPU 版本在 Jetson Nano 上的测试结果如下(2023-02-02):

log
loop_count = 10
num_threads = 4
powersave = 0
gpu_device = 0
cooling_down = 1
          squeezenet  min =   28.98  max =   29.28  avg =   29.11
     squeezenet_int8  min =   25.10  max =   25.53  avg =   25.23
           mobilenet  min =   39.80  max =   40.09  avg =   39.96
      mobilenet_int8  min =   27.29  max =   27.63  avg =   27.49
        mobilenet_v2  min =   31.51  max =   31.98  avg =   31.65
        mobilenet_v3  min =   26.49  max =   26.86  avg =   26.69
          shufflenet  min =   20.04  max =   20.58  avg =   20.26
       shufflenet_v2  min =   17.02  max =   17.60  avg =   17.19
             mnasnet  min =   29.49  max =   30.02  avg =   29.62
     proxylessnasnet  min =   34.17  max =   34.86  avg =   34.42
     efficientnet_b0  min =   59.06  max =   59.87  avg =   59.28
   efficientnetv2_b0  min =   65.72  max =   91.38  avg =   68.73
        regnety_400m  min =   56.01  max =   56.48  avg =   56.21
           blazeface  min =    5.67  max =    5.84  avg =    5.74
           googlenet  min =   90.85  max =   91.61  avg =   91.17
      googlenet_int8  min =   78.91  max =   79.58  avg =   79.19
            resnet18  min =   80.18  max =   82.15  avg =   80.64
       resnet18_int8  min =   58.98  max =   77.46  avg =   61.45
             alexnet  min =   60.00  max =   61.35  avg =   60.46
               vgg16  min =  377.00  max =  382.35  avg =  379.10
          vgg16_int8  min =  285.44  max =  288.74  avg =  286.98
            resnet50  min =  196.79  max =  197.87  avg =  197.23
       resnet50_int8  min =  148.09  max =  149.84  avg =  148.88
      squeezenet_ssd  min =   82.21  max =   82.94  avg =   82.54
 squeezenet_ssd_int8  min =   65.03  max =   65.51  avg =   65.25
       mobilenet_ssd  min =   83.00  max =   83.69  avg =   83.28
  mobilenet_ssd_int8  min =   56.43  max =   56.95  avg =   56.65
      mobilenet_yolo  min =  185.10  max =  186.02  avg =  185.55
  mobilenetv2_yolov3  min =  111.98  max =  112.51  avg =  112.23
         yolov4-tiny  min =  156.89  max =  158.15  avg =  157.47
           nanodet_m  min =   40.70  max =   41.68  avg =   41.11
    yolo-fastest-1.1  min =   22.65  max =   22.91  avg =   22.79
      yolo-fastestv2  min =   18.55  max =   18.95  avg =   18.65
  vision_transformer  min = 3163.91  max = 3259.01  avg = 3206.24
          FastestDet  min =   19.80  max =   20.10  avg =   19.90

下面是两个版本平均耗时的详细比较(单位:毫秒,数值越小越好):

| 网络名称 | GPU(Vulkan) 平均 | CPU 平均 |
| --- | ---: | ---: |
| squeezenet | 9.37 | 29.11 |
| squeezenet_int8 | 25.23 | 25.23 |
| mobilenet | 15.79 | 39.96 |
| mobilenet_int8 | 27.57 | 27.49 |
| mobilenet_v2 | 12.48 | 31.65 |
| mobilenet_v3 | 13.19 | 26.69 |
| shufflenet | 10.96 | 20.26 |
| shufflenet_v2 | 9.86 | 17.19 |
| mnasnet | 13.07 | 29.62 |
| proxylessnasnet | 14.32 | 34.42 |
| efficientnet_b0 | 21.09 | 59.28 |
| efficientnetv2_b0 | 39.02 | 68.73 |
| regnety_400m | 17.19 | 56.21 |
| blazeface | 3.26 | 5.74 |
| googlenet | 30.34 | 91.17 |
| googlenet_int8 | 79.85 | 79.19 |
| resnet18 | 27.52 | 80.64 |
| resnet18_int8 | 59.40 | 61.45 |
| alexnet | 48.96 | 60.46 |
| vgg16 | 143.05 | 379.10 |
| vgg16_int8 | 287.74 | 286.98 |
| resnet50 | 71.71 | 197.23 |
| resnet50_int8 | 148.60 | 148.88 |
| squeezenet_ssd | 33.05 | 82.54 |
| squeezenet_ssd_int8 | 66.14 | 65.25 |
| mobilenet_ssd | 34.86 | 83.28 |
| mobilenet_ssd_int8 | 56.63 | 56.65 |
| mobilenet_yolo | 70.39 | 185.55 |
| mobilenetv2_yolov3 | 45.61 | 112.23 |
| yolov4-tiny | 67.12 | 157.47 |
| nanodet_m | 18.81 | 41.11 |
| yolo-fastest-1.1 | 11.30 | 22.79 |
| yolo-fastestv2 | 9.81 | 18.65 |
| vision_transformer | 2689.92 | 3206.24 |
| FastestDet | 10.03 | 19.90 |