feat: switch the backend to PaddleOCR-NCNN, switch the project to CMake

1. The project backend has been fully migrated to the PaddleOCR-NCNN algorithm and has passed basic compatibility tests.
2. The project is now organized with CMake; to better accommodate third-party libraries, a QMake project will no longer be provided.
3. Reorganized the rights/license declaration files and the code tree to minimize infringement risk.

Log: switch the backend to PaddleOCR-NCNN, switch the project to CMake
Change-Id: I4d5d2c5d37505a4a24b389b1a4c5d12f17bfa38c
57
3rdparty/ncnn/docs/developer-guide/aarch64-mix-assembly-and-intrinsic.md
vendored
Normal file
@@ -0,0 +1,57 @@

```c
// use the full v register: %.4s
// 128-bit vreg matches %.4s
// a += b * c
float32x4_t _a = vld1q_f32(a);
float32x4_t _b = vld1q_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
    "fmla %0.4s, %2.4s, %3.4s"
    : "=w"(_a) // %0
    : "0"(_a),
      "w"(_b), // %2
      "w"(_c)  // %3
    :
);
```

```c
// use the low 64 bits of the v register: %.2s
// low 64-bit vreg matches %.2s
// a += b * c
float32x2_t _a = vld1_f32(a);
float32x2_t _b = vld1_f32(b);
float32x2_t _c = vld1_f32(c);
asm volatile(
    "fmla %0.2s, %2.2s, %3.2s"
    : "=w"(_a) // %0
    : "0"(_a),
      "w"(_b), // %2
      "w"(_c)  // %3
    :
);
```

```c
// use a single lane of the v register: %.s[0] %.s[1] %.s[2] %.s[3]
// 32-bit register matches %.s[0]
// a += b * c[0]
// a += b * c[1]
// a += b * c[2]
// a += b * c[3]
float32x4_t _a = vld1q_f32(a);
float32x4_t _b = vld1q_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
    "fmla %0.4s, %2.4s, %3.s[0] \n\t"
    "fmla %0.4s, %2.4s, %3.s[1] \n\t"
    "fmla %0.4s, %2.4s, %3.s[2] \n\t"
    "fmla %0.4s, %2.4s, %3.s[3]"
    : "=w"(_a) // %0
    : "0"(_a),
      "w"(_b), // %2
      "w"(_c)  // %3
    :
);
```

qwq
175
3rdparty/ncnn/docs/developer-guide/add-custom-layer.zh.md
vendored
Normal file
@ -0,0 +1,175 @@
|
||||
# ncnn: adding a custom layer
|
||||
|
||||
## Example
|
||||
|
||||
Here we add a custom layer as an example, such as Relu6, i.e. std::min(6, std::max(0, val))
|
||||
|
||||
```
|
||||
Input input 0 1 input
|
||||
Convolution conv2d 1 1 input conv2d 0=32 1=1 2=1 3=1 4=0 5=0 6=768
|
||||
Relu6 relu6 1 1 conv2d relu6
|
||||
Pooling maxpool 1 1 relu6 maxpool 0=0 1=3 2=2 3=-233 4=0
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Define the header file: src/layer/relu6.h
|
||||
|
||||
```CPP
|
||||
#ifndef LAYER_RELU6_H
|
||||
#define LAYER_RELU6_H
|
||||
|
||||
#include "layer.h"
|
||||
|
||||
namespace ncnn {
|
||||
|
||||
class Relu6 : public Layer
|
||||
{
|
||||
public:
|
||||
Relu6();
|
||||
|
||||
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
|
||||
};
|
||||
|
||||
} // namespace ncnn
|
||||
|
||||
#endif // LAYER_RELU6_H
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Define the source file: src/layer/relu6.cpp
|
||||
|
||||
```CPP
|
||||
#include "relu6.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace ncnn {
|
||||
|
||||
Relu6::Relu6()
|
||||
{
|
||||
one_blob_only = true;
|
||||
support_inplace = true;
|
||||
}
|
||||
|
||||
int Relu6::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
|
||||
{
|
||||
int w = bottom_top_blob.w;
|
||||
int h = bottom_top_blob.h;
|
||||
int channels = bottom_top_blob.c;
|
||||
int size = w * h;
|
||||
|
||||
#pragma omp parallel for num_threads(opt.num_threads)
|
||||
for (int q=0; q < channels; q++)
|
||||
{
|
||||
float* ptr = bottom_top_blob.channel(q);
|
||||
|
||||
for (int i=0; i<size; i++)
|
||||
{
|
||||
ptr[i] = std::min(6.f, std::max(0.f, ptr[i]));
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // namespace ncnn
|
||||
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Modify src/CMakeLists.txt to register Relu6
|
||||
|
||||
```CPP
|
||||
ncnn_add_layer(GroupNorm)
|
||||
ncnn_add_layer(LayerNorm)
|
||||
ncnn_add_layer(Relu6)
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Define the test case file: tests/test_relu6.cpp
|
||||
|
||||
```CPP
|
||||
#include "layer/relu6.h"
|
||||
#include "testutil.h"
|
||||
|
||||
static int test_relu6(const ncnn::Mat& a)
|
||||
{
|
||||
ncnn::ParamDict pd;
|
||||
|
||||
std::vector<ncnn::Mat> weights(0);
|
||||
|
||||
int ret = test_layer<ncnn::Relu6>("Relu6", pd, weights, a);
|
||||
if (ret != 0)
|
||||
{
|
||||
fprintf(stderr, "test_relu6 failed a.dims=%d a=(%d %d %d)\n", a.dims, a.w, a.h, a.c);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int test_relu6_0()
|
||||
{
|
||||
return 0
|
||||
|| test_relu6(RandomMat(5, 7, 24))
|
||||
|| test_relu6(RandomMat(7, 9, 12))
|
||||
|| test_relu6(RandomMat(3, 5, 13));
|
||||
}
|
||||
|
||||
static int test_relu6_1()
|
||||
{
|
||||
return 0
|
||||
|| test_relu6(RandomMat(15, 24))
|
||||
|| test_relu6(RandomMat(17, 12))
|
||||
|| test_relu6(RandomMat(19, 15));
|
||||
}
|
||||
|
||||
static int test_relu6_2()
|
||||
{
|
||||
return 0
|
||||
|| test_relu6(RandomMat(128))
|
||||
|| test_relu6(RandomMat(124))
|
||||
|| test_relu6(RandomMat(127));
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
SRAND(7767517);
|
||||
|
||||
return 0
|
||||
|| test_relu6_0()
|
||||
|| test_relu6_1()
|
||||
|| test_relu6_2();
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Modify tests/CMakeLists.txt to register the Relu6 test case
|
||||
|
||||
```CPP
|
||||
ncnn_add_layer_test(LSTM)
|
||||
ncnn_add_layer_test(Yolov3DetectionOutput)
|
||||
ncnn_add_layer_test(Relu6)
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Build
|
||||
|
||||
```
|
||||
build following the original ncnn steps
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Unit test
|
||||
|
||||
```
|
||||
./test_relu6
|
||||
```
|
||||
|
85
3rdparty/ncnn/docs/developer-guide/arm-a53-a55-dual-issue.md
vendored
Normal file
@ -0,0 +1,85 @@
|
||||
## natural assembly
|
||||
* no register dependency, no penalty
|
||||
```
|
||||
ld1 {v0.4s}, [r0], #16
|
||||
fmla v10.4s, v16.4s, v24.s[0]
|
||||
fmla v11.4s, v16.4s, v24.s[1]
|
||||
fmla v12.4s, v16.4s, v24.s[2]
|
||||
fmla v13.4s, v16.4s, v24.s[3]
|
||||
```
|
||||
|
||||
## A53
|
||||
* 128bit vector load cannot be dual issued with fmla, wait 2 cycles
|
||||
* 64bit vector load cannot be dual issued with fmla, wait 1 cycle
|
||||
* 64bit integer load can be dual issued with fmla, no penalty
|
||||
* pointer update can be dual issued with fmla, no penalty
|
||||
* 64bit vector load and 64bit vector insert can be dual issued, no penalty
|
||||
* any vector load cannot be issued on the 4th cycle of each fmla (enters the accumulator pipeline)
|
||||
|
||||
### practical guide
|
||||
* use 64bit vector load only
|
||||
* issue vector load every three fmla
|
||||
* 1 cycle to load 64bit, dual issue with the previous interleaved 64bit insert
|
||||
* load the remaining 64bit into integer register, dual issue with fmla
|
||||
* update pointer, dual issue with fmla
|
||||
* insert 64bit into vector from integer register, dual issue with the next interleaved 64bit load
|
||||
* add nop every three fmla if no load, seems to be faster
|
||||
```
|
||||
ldr d0, [r0] // 1 cycle, v0 first 64bit
|
||||
fmla
|
||||
ldr x23, [r0, #8] // 0 cycle, v0 second 64bit to temp register
|
||||
fmla
|
||||
add r0, r0, #16 // 0 cycle, update pointer
|
||||
fmla
|
||||
ldr d1, [r0] // 1 cycle, v1 first 64bit
|
||||
ins v0.d[1], x23 // 0 cycle, v0 second 64bit complete
|
||||
fmla
|
||||
ldr x23, [r0, #8] // 0 cycle, v1 second 64bit to temp register
|
||||
fmla
|
||||
add r0, r0, #16 // 0 cycle, update pointer
|
||||
fmla
|
||||
ins v1.d[1], x23 // 1 cycle, v1 second 64bit complete
|
||||
nop
|
||||
fmla
|
||||
fmla
|
||||
fmla
|
||||
nop
|
||||
nop
|
||||
fmla
|
||||
fmla
|
||||
fmla
|
||||
```
|
||||
|
||||
## A55
|
||||
* 128bit vector load cannot be dual issued with fmla, wait 2 cycles
|
||||
* 64bit vector load can be dual issued with fmla, no penalty
|
||||
* 64bit integer load can be dual issued with fmla, no penalty
|
||||
* pointer update can be dual issued with fmla, no penalty
|
||||
* 64bit vector insert can be dual issued with fmla, no penalty
|
||||
|
||||
### practical guide
|
||||
* use 64bit vector load only
|
||||
* load 64bit, dual issue with fmla
|
||||
* load the remaining 64bit into integer register, dual issue with fmla
|
||||
* update pointer, dual issue with fmla
|
||||
* insert 64bit into vector from integer register, dual issue with fmla
|
||||
* interleaved loads loosen register dependencies
|
||||
* nop trick is not needed
|
||||
```
|
||||
ldr d0, [r0] // 0 cycle, v0 first 64bit
|
||||
fmla
|
||||
ldr x23, [r0, #8] // 0 cycle, v0 second 64bit to temp register
|
||||
fmla
|
||||
add r0, r0, #16 // 0 cycle, update pointer
|
||||
fmla
|
||||
ldr d1, [r0] // 0 cycle, v1 first 64bit
|
||||
fmla
|
||||
ins v0.d[1], x23 // 0 cycle, v0 second 64bit complete
|
||||
fmla
|
||||
ldr x23, [r0, #8] // 0 cycle, v1 second 64bit to temp register
|
||||
fmla
|
||||
add r0, r0, #16 // 0 cycle, update pointer
|
||||
fmla
|
||||
ins v1.d[1], x23 // 0 cycle, v1 second 64bit complete
|
||||
fmla
|
||||
```
|
130
3rdparty/ncnn/docs/developer-guide/armv7-mix-assembly-and-intrinsic.md
vendored
Normal file
@ -0,0 +1,130 @@
|
||||
```c
|
||||
// use the full d register: %P
|
||||
// d reg matches %P
|
||||
// a += b * c
|
||||
float32x2_t _a = vld1_f32(a);
|
||||
float32x2_t _b = vld1_f32(b);
|
||||
float32x2_t _c = vld1_f32(c);
|
||||
asm volatile(
|
||||
"vmla.f32 %P0, %P2, %P3"
|
||||
: "=w"(_a) // %0
|
||||
: "0"(_a),
|
||||
"w"(_b), // %2
|
||||
"w"(_c) // %3
|
||||
:
|
||||
);
|
||||
```
|
||||
```c
|
||||
// use the full q register: %q
|
||||
// q reg matches %q
|
||||
// a += b * c
|
||||
float32x4_t _a = vld1q_f32(a);
|
||||
float32x4_t _b = vld1q_f32(b);
|
||||
float32x4_t _c = vld1q_f32(c);
|
||||
asm volatile(
|
||||
"vmla.f32 %q0, %q2, %q3"
|
||||
: "=w"(_a) // %0
|
||||
: "0"(_a),
|
||||
"w"(_b), // %2
|
||||
"w"(_c) // %3
|
||||
:
|
||||
);
|
||||
```
|
||||
```c
|
||||
// use a single lane of the d register: %P[0] %P[1]
|
||||
// 32bit d reg matches %P[0]
|
||||
// a += b * c[0]
|
||||
// a += b * c[1]
|
||||
float32x2_t _a = vld1_f32(a);
|
||||
float32x2_t _b = vld1_f32(b);
|
||||
float32x2_t _c = vld1_f32(c);
|
||||
asm volatile(
|
||||
"vmla.f32 %P0, %P2, %P3[0]"
|
||||
"vmla.f32 %P0, %P2, %P3[1]"
|
||||
: "=w"(_a) // %0
|
||||
: "0"(_a),
|
||||
"w"(_b), // %2
|
||||
"w"(_c) // %3
|
||||
:
|
||||
);
|
||||
```
|
||||
```c
|
||||
// use a single lane of the q register: %e[0] %e[1] %f[0] %f[1]
|
||||
// 32-bit q reg matches %e[0]
|
||||
// a += b * c[0]
|
||||
// a += b * c[1]
|
||||
// a += b * c[2]
|
||||
// a += b * c[3]
|
||||
float32x4_t _a = vld1q_f32(a);
|
||||
float32x4_t _b = vld1q_f32(b);
|
||||
float32x4_t _c = vld1q_f32(c);
|
||||
asm volatile(
|
||||
"vmla.f32 %q0, %q2, %e3[0]"
|
||||
"vmla.f32 %q0, %q2, %e3[1]"
|
||||
"vmla.f32 %q0, %q2, %f3[0]"
|
||||
"vmla.f32 %q0, %q2, %f3[1]"
|
||||
: "=w"(_a) // %0
|
||||
: "0"(_a),
|
||||
"w"(_b), // %2
|
||||
"w"(_c) // %3
|
||||
:
|
||||
);
|
||||
```
|
||||
```c
|
||||
// split the q register into two d registers: %e %f
|
||||
// use %e %f to split q reg into two d regs
|
||||
// a += b * c[0]c[1]
|
||||
// a += b * c[2]c[3]
|
||||
float32x2_t _a = vld1_f32(a);
|
||||
float32x2_t _b = vld1_f32(b);
|
||||
float32x4_t _c = vld1q_f32(c);
|
||||
asm volatile(
|
||||
"vmla.f32 %P0, %P2, %e3"
|
||||
"vmla.f32 %P0, %P2, %f3"
|
||||
: "=w"(_a) // %0
|
||||
: "0"(_a),
|
||||
"w"(_b), // %2
|
||||
"w"(_c) // %3
|
||||
:
|
||||
);
|
||||
```
|
||||
```c
|
||||
// bind a declaration to a specific d register
|
||||
// specify concrete d reg which want to save
|
||||
// vmla.f32 d0, d2, d4
|
||||
register float32x2_t _a asm("d0") = vld1_f32(a);
|
||||
register float32x2_t _b asm("d2") = vld1_f32(b);
|
||||
register float32x2_t _c asm("d4") = vld1_f32(c);
|
||||
|
||||
asm volatile(
|
||||
"vmla.f32 %P0, %P2, %P3"
|
||||
: "=w"(_a) // %0
|
||||
: "0"(_a),
|
||||
"w"(_b), // %2
|
||||
"w"(_c) // %3
|
||||
:
|
||||
);
|
||||
```
|
||||
```c
|
||||
// bind a declaration to a specific q register
|
||||
// bind q reg with data
|
||||
// vmla.f32 q0, q1, q2
|
||||
register float32x4_t _a asm("q0") = vld1q_f32(a);
|
||||
register float32x4_t _b asm("q1") = vld1q_f32(b);
|
||||
register float32x4_t _c asm("q2") = vld1q_f32(c);
|
||||
|
||||
asm volatile(
|
||||
"vmla.f32 %q0, %q2, %q3"
|
||||
: "=w"(_a) // %0
|
||||
: "0"(_a),
|
||||
"w"(_b), // %2
|
||||
"w"(_c) // %3
|
||||
:
|
||||
);
|
||||
```
|
||||
|
||||
If it were not for a compiler bug, register binding would not be needed, however...
|
||||
|
||||
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=41538
|
||||
|
||||
qwq
|
52
3rdparty/ncnn/docs/developer-guide/binaryop-broadcasting.md
vendored
Normal file
@@ -0,0 +1,52 @@

### broadcasting rule

ncnn BinaryOp accepts blobs with different shapes

C = BinaryOp(A, B)

shape notation convention is [w], [w,h], [w,h,c], [w,h,d,c]

|type|A|B|C|
|---|---|---|---|
|1|[1]|scalar|[1]|
|2|[1]|[2]|[2]|
|3|[1]|[2,3]|[2,3]|
|4|[1]|[2,3,4]|[2,3,4]|
|5|[2]|scalar|[2]|
|6|[2]|[1]|[2]|
|7|[2]|[2]|[2]|
|8|[3]|[2,3]|[2,3]|
|9|[4]|[2,3,4]|[2,3,4]|
|10|[2,3]|scalar|[2,3]|
|11|[2,3]|[1]|[2,3]|
|12|[2,3]|[3]|[2,3]|
|13|[2,3]|[2,3]|[2,3]|
|14|[3,4]|[2,3,4]|[2,3,4]|
|15|[2,3,4]|scalar|[2,3,4]|
|16|[2,3,4]|[1]|[2,3,4]|
|17|[2,3,4]|[4]|[2,3,4]|
|18|[2,3,4]|[3,4]|[2,3,4]|
|19|[2,3,4]|[2,3,4]|[2,3,4]|
|20|[1]|[2,3,4,5]|[2,3,4,5]|
|21|[5]|[2,3,4,5]|[2,3,4,5]|
|22|[4,5]|[2,3,4,5]|[2,3,4,5]|
|23|[3,4,5]|[2,3,4,5]|[2,3,4,5]|
|24|[2,3,4,5]|scalar|[2,3,4,5]|
|25|[2,3,4,5]|[1]|[2,3,4,5]|
|26|[2,3,4,5]|[5]|[2,3,4,5]|
|27|[2,3,4,5]|[4,5]|[2,3,4,5]|
|28|[2,3,4,5]|[3,4,5]|[2,3,4,5]|
|29|[2,3,4,5]|[2,3,4,5]|[2,3,4,5]|

some special broadcasting rules exist for model compatibility

|special type|A|B|C|
|---|---|---|---|
|1|[2,3,4]|[1,1,4]|[2,3,4]|
|2|[2,3,4]|[2,3,1]|[2,3,4]|
|3|[1,1,4]|[2,3,4]|[2,3,4]|
|4|[2,3,1]|[2,3,4]|[2,3,4]|
|5|[2,3,4]|[1,3,4]|[2,3,4]|
|6|[2,3,4]|[2,1,4]|[2,3,4]|
|7|[1,3,4]|[2,3,4]|[2,3,4]|
|8|[2,1,4]|[2,3,4]|[2,3,4]|
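As a concrete illustration of rule 17 above ([2,3,4] with [4]), here is a small sketch that drives the BinaryOp layer through the low-level layer API, in the same style as the binary_add() example in low-level-operation-api.md; the shapes and fill values are made up for the example.

```cpp
#include "layer.h"

#include <vector>

// broadcast add: a is [w=2, h=3, c=4], b is [4]; the result keeps a's shape
static void broadcast_add_demo()
{
    ncnn::Mat a(2, 3, 4);
    ncnn::Mat b(4);
    a.fill(1.f);
    b.fill(0.5f);

    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_packing_layout = false;

    ncnn::Layer* op = ncnn::create_layer("BinaryOp");

    ncnn::ParamDict pd;
    pd.set(0, 0); // op_type 0 = Add

    op->load_param(pd);
    op->create_pipeline(opt);

    std::vector<ncnn::Mat> bottoms(2);
    bottoms[0] = a;
    bottoms[1] = b;

    std::vector<ncnn::Mat> tops(1);
    op->forward(bottoms, tops, opt);

    // tops[0] is [2,3,4] and every element is 1.5f
    op->destroy_pipeline(opt);
    delete op;
}
```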
63
3rdparty/ncnn/docs/developer-guide/custom-allocator.md
vendored
Normal file
@@ -0,0 +1,63 @@

The Mat structure is now allocator-aware via an extra allocator parameter with a default zero value.

The good-old ncnn::fastMalloc()/ncnn::fastFree() will be used for a null allocator.

You could pass a custom allocator to delegate all memory allocation and deallocation.

```cpp
class Allocator
{
public:
    virtual void* fastMalloc(size_t size) = 0;
    virtual void fastFree(void* ptr) = 0;
};
```

ncnn has already implemented two simple pooled Allocator classes, one with a mutex lock and one without.

```cpp
ncnn::PoolAllocator locked_mempool;
ncnn::UnlockedPoolAllocator unlocked_mempool;
```

the two allocator types in ncnn

* blob allocator

  used to allocate memory for all named blobs, which you could retrieve by Extractor::extract()

* workspace allocator

  used to allocate memory for internal temporary use in layer implementations, such as the temp blob after padding in convolution

By default, all Extractor instances use the two allocators in the default option.
You can alter them by ncnn::set_default_option()
or you can set them per Extractor by Extractor::set_blob_allocator()/Extractor::set_workspace_allocator()

The blob allocator is guaranteed to be called in-order in layer implementations during each Extractor lifecycle,
while the workspace allocator may be called synchronously.

the practical usage

* one network, one-by-one inference

  shared unlocked blob allocator for all Extractor

  shared locked workspace allocator for all Extractor

* one network, concurrent inference

  shared unlocked blob allocator for all Extractor in each thread

  shared locked workspace allocator for all Extractor among all threads

* concurrent multiple networks, one-by-one inference for each network

  shared unlocked blob allocator for all Extractor of each network

  shared locked workspace allocator for all Extractor among all networks (for saving memory)

* concurrent multiple networks, concurrent inference for each network

  shared unlocked blob allocator for all Extractor of each network in each thread

  shared locked workspace allocator for all Extractor among all networks (for saving memory)
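A minimal sketch tying the pieces above together. MyAllocator is a made-up allocator that simply forwards to malloc/free (a real one would pool or track memory), and the blob names "data"/"output" are placeholders for whatever the loaded model actually uses.

```cpp
#include "allocator.h"
#include "net.h"

#include <stdlib.h>

// hypothetical custom allocator: delegate to plain malloc/free for illustration
class MyAllocator : public ncnn::Allocator
{
public:
    virtual void* fastMalloc(size_t size) { return malloc(size); }
    virtual void fastFree(void* ptr) { free(ptr); }
};

// shared pooled allocators, as in the "one network, one-by-one inference" case
static ncnn::UnlockedPoolAllocator g_blob_allocator;
static ncnn::PoolAllocator g_workspace_allocator;

void run_once(ncnn::Net& net, const ncnn::Mat& in, ncnn::Mat& out)
{
    ncnn::Extractor ex = net.create_extractor();

    // per-Extractor allocators; a MyAllocator instance could be passed the same way
    ex.set_blob_allocator(&g_blob_allocator);
    ex.set_workspace_allocator(&g_workspace_allocator);

    ex.input("data", in);      // placeholder blob name
    ex.extract("output", out); // placeholder blob name
}
```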
119
3rdparty/ncnn/docs/developer-guide/element-packing.md
vendored
Normal file
@ -0,0 +1,119 @@
|
||||
### what is packing and why
|
||||
|
||||
packing is the form of storing multiple short-sized values as one long-sized value.
|
||||
|
||||
element packing is well mapped with the underlying simd register, which usually use one very wide register to store different types of values.
|
||||
|
||||
|C|elemsize|elempack|
|
||||
|---|---|---|
|
||||
|double|8|1|
|
||||
|float|4|1|
|
||||
|int|4|1|
|
||||
|short|2|1|
|
||||
|signed char|1|1|
|
||||
|
||||
|arm neon|elemsize|elempack|
|
||||
|---|---|---|
|
||||
|float64x2_t|16|2|
|
||||
|float32x4_t|16|4|
|
||||
|int32x4_t|16|4|
|
||||
|float16x4_t|8|4|
|
||||
|int8x8_t|8|8|
|
||||
|
||||
Though the real count of values doubles when elempack is two, the wide-sized value is still treated as one value in the view of Mat structure. For example, we want to store 40 float values in Mat object, if elempack 1 is used, Mat width is then 40, while 10 if elempack 4 is used.
|
||||
|
||||
|dims|w|h|c|cstep|elemsize|elempack|
|
||||
|---|---|---|---|---|---|---|
|
||||
|1|40|1|1|40|4|1|
|
||||
|1|10|1|1|10|16|4|
|
||||
|
||||
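For instance, the 40-float example from the table above can be written with the Mat constructor overload that takes an explicit elemsize/elempack pair (a sketch; that overload is assumed to be available in this ncnn version):

```cpp
#include "mat.h"

void packing_demo()
{
    // 40 float values stored as elempack=1: w=40, elemsize=4
    ncnn::Mat m1(40);

    // the same 40 values stored as elempack=4: w=10, elemsize=16
    ncnn::Mat m4(10, (size_t)16u, 4);
}
```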
### packing style convention
|
||||
|
||||
In practice, elempack 1, 4, 8 are the most common cases. It is possible to use any other packing style in theory.
|
||||
|
||||
The following table shows the packing axis used in ncnn for each dimension.
|
||||
|
||||
|dims|packing axis|shape before packing|shape after packing|
|
||||
|---|---|---|---|
|
||||
|1|w|w|w/elempack|
|
||||
|2|h|w, h|w, h/elempack|
|
||||
|3|c|w, h, c|w, h, c/elempack|
|
||||
|
||||
If the packing axis dim is not evenly divisible by elempack, zero padding may be used.
|
||||
|
||||
```
|
||||
outw = (w + elempack - 1) / elempack;
|
||||
```
|
||||
|
||||
The following snippet shows the memory layout after elempack=4 on 3-dim Mat
|
||||
|
||||
```
|
||||
// w=2 h=3 c=4 elempack=1
|
||||
0 1
|
||||
2 3
|
||||
4 5
|
||||
|
||||
6 7
|
||||
8 9
|
||||
10 11
|
||||
|
||||
12 13
|
||||
14 15
|
||||
16 17
|
||||
|
||||
18 19
|
||||
20 21
|
||||
22 23
|
||||
|
||||
// w=2 h=3 c=1 elempack=4
|
||||
(0,6,12,18) (1,7,13,19)
|
||||
(2,8,14,20) (3,9,15,21)
|
||||
(4,10,16,22) (5,11,17,23)
|
||||
```
|
||||
|
||||
### how to convert elempack
|
||||
|
||||
There is a convenient wrapper function provided
|
||||
```
|
||||
// convert to elempack 4 if packing axis dim is evenly divisible by elempack
|
||||
// return the identity Mat otherwise
|
||||
ncnn::Mat a;
|
||||
ncnn::Mat a_packed;
|
||||
ncnn::convert_packing(a, a_packed, 4);
|
||||
if (a_packed.elempack == 4)
|
||||
{
|
||||
// check if packing is successful
|
||||
}
|
||||
|
||||
// convert to packing 1, aka unpacking, shall be always successful
|
||||
ncnn::Mat b;
|
||||
ncnn::Mat b_unpacked;
|
||||
ncnn::convert_packing(b, b_unpacked, 1);
|
||||
```
|
||||
|
||||
### handle general interleaved data
|
||||
|
||||
Here is an example of using convert packing to convert RGB interleaved data to planar
|
||||
|
||||
**NOTE:** The following code is just presented to explain what packing is and the conversion process. Do not use it in production due to its poor performance. Do use ncnn::Mat::from_pixels()
|
||||
|
||||
```cpp
|
||||
// rgb_interleaved_u8 is RGB RGB RGB ...
|
||||
// rgb_interleaved_u8.w = w;
|
||||
// rgb_interleaved_u8.h = h;
|
||||
// rgb_interleaved_u8.c = 1;
|
||||
// rgb_interleaved_u8.elemsize = 3;
|
||||
// rgb_interleaved_u8.elempack = 3;
|
||||
|
||||
ncnn::Mat rgb_interleaved_u8(w, h, 1, 3, 3);
|
||||
ncnn::Mat rgb_planar_u8;
|
||||
|
||||
ncnn::convert_packing(rgb_interleaved_u8, rgb_planar_u8, 1);
|
||||
|
||||
// rgb_planar_u8 is now RRR ... GGG ... BBB ...
|
||||
// rgb_planar_u8.w = w;
|
||||
// rgb_planar_u8.h = h;
|
||||
// rgb_planar_u8.c = 3;
|
||||
// rgb_planar_u8.elemsize = 1;
|
||||
// rgb_planar_u8.elempack = 1;
|
||||
```
|
75
3rdparty/ncnn/docs/developer-guide/how-to-be-a-contributor.zh.md
vendored
Normal file
@ -0,0 +1,75 @@
|
||||
### 如何提交代码
|
||||
|
||||
#### 一、fork 分支
|
||||
在浏览器中打开 [ncnn](https://github.com/tencent/ncnn), `fork` 到自己的 repositories,例如
|
||||
```
|
||||
https://github.com/user/ncnn
|
||||
```
|
||||
|
||||
clone 项目到本地,添加官方 remote 并 fetch:
|
||||
```
|
||||
$ git clone https://github.com/user/ncnn && cd ncnn
|
||||
$ git remote add tencent https://github.com/tencent/ncnn
|
||||
$ git fetch tencent
|
||||
```
|
||||
对于 `git clone` 下来的项目,它现在有两个 remote,分别是 origin 和 tencent:
|
||||
|
||||
```
|
||||
$ git remote -v
|
||||
origin https://github.com/user/ncnn (fetch)
|
||||
origin https://github.com/user/ncnn (push)
|
||||
tencent https://github.com/Tencent/ncnn (fetch)
|
||||
tencent https://github.com/Tencent/ncnn (push)
|
||||
```
|
||||
origin 指向你 fork 的仓库地址;remote 即官方 repo。可以基于不同的 remote 创建和提交分支。
|
||||
|
||||
例如切换到官方 master 分支,并基于此创建自己的分支(命名尽量言简意赅。一个分支只做一件事,方便 review 和 revert)
|
||||
```
|
||||
$ git checkout tencent/master
|
||||
$ git checkout -b add-conv-int8
|
||||
```
|
||||
|
||||
或创建分支时指定基于官方 master 分支:
|
||||
```
|
||||
$ git checkout -b fix-typo-in-document tencent/master
|
||||
```
|
||||
|
||||
> `git fetch` 是从远程获取最新代码到本地。如果是第二次 pr ncnn,直接从 `git fetch tencent` 开始即可,不需要 `git remote add tencent`,也不需要修改 `github.com/user/ncnn`。
|
||||
|
||||
#### 二、代码习惯
|
||||
为了增加沟通效率,reviewer 一般要求 contributor 遵从以下规则
|
||||
|
||||
* `if-else`和花括号`{`中间需要换行
|
||||
* 不能随意增删空行
|
||||
* tab 替换为 4 个空格
|
||||
* 为了保证平台兼容性,目前不使用`c++11`,`src`目录下尽量避免使用`template`
|
||||
* 若是新增功能或平台,`test`目录需有对应测试用例
|
||||
* 文档放到`doc`对应目录下,中文用`.zh.md`做后缀;英文直接用`.md`后缀
|
||||
|
||||
开发完成后提交到自己的 repository
|
||||
```
|
||||
$ git commit -a
|
||||
$ git push origin add-conv-int8
|
||||
```
|
||||
推荐使用 [`commitizen`](https://pypi.org/project/commitizen/) 或 [`gitlint`](https://jorisroovers.com/gitlint/) 等工具格式化 commit message,方便事后检索海量提交记录
|
||||
|
||||
#### 三、代码提交
|
||||
浏览器中打开 [ncnn pulls](https://github.com/Tencent/ncnn/pulls) ,此时应有此分支 pr 提示,点击 `Compare & pull request`
|
||||
|
||||
* 标题**必须**是英文。未完成的分支应以 `WIP:` 开头,例如 `WIP: add conv int8`
|
||||
* 正文宜包含以下内容,中英不限
|
||||
* 内容概述和实现方式
|
||||
* 功能或性能测试
|
||||
* 测试结果
|
||||
|
||||
CI 已集成了自动格式化,restyled-io 会在 pr 的同时生成 `Restyled add conv int8`,需要 merge 自动 restyled 的分支,例如
|
||||
```
|
||||
$ git fetch tencent
|
||||
$ git checkout add-conv-int8
|
||||
$ git merge tencent/restyled/pull-2078
|
||||
$ git push origin add-conv-int8
|
||||
```
|
||||
回到浏览器签署 CLA,所有 CI 测试通过后通知 reviewer merge 此分支。
|
||||
|
||||
#### 四、彩蛋
|
||||
留下个人 qq 号会触发隐藏事件。
|
323
3rdparty/ncnn/docs/developer-guide/how-to-implement-custom-layer-step-by-step.md
vendored
Normal file
@ -0,0 +1,323 @@
|
||||
# step1 create a new empty class
|
||||
```cpp
|
||||
// mylayer.h
|
||||
#include "layer.h"
|
||||
using namespace ncnn;
|
||||
|
||||
// a new layer type called MyLayer
|
||||
class MyLayer : public Layer
|
||||
{
|
||||
};
|
||||
|
||||
// mylayer.cpp
|
||||
#include "mylayer.h"
|
||||
DEFINE_LAYER_CREATOR(MyLayer)
|
||||
```
|
||||
|
||||
# step2 declare layer parameters and weights
|
||||
```cpp
|
||||
// mylayer.h
|
||||
#include "layer.h"
|
||||
using namespace ncnn;
|
||||
|
||||
class MyLayer : public Layer
|
||||
{
|
||||
private:
|
||||
int channels;// new code
|
||||
float eps;// new code
|
||||
Mat gamma_data;// new code
|
||||
};
|
||||
|
||||
// mylayer.cpp
|
||||
#include "mylayer.h"
|
||||
DEFINE_LAYER_CREATOR(MyLayer)
|
||||
```
|
||||
|
||||
# step3 implement load functions for parameters and weights
|
||||
```cpp
|
||||
// mylayer.h
|
||||
#include "layer.h"
|
||||
using namespace ncnn;
|
||||
|
||||
class MyLayer : public Layer
|
||||
{
|
||||
public:
|
||||
virtual int load_param(const ParamDict& pd);// new code
|
||||
virtual int load_model(const ModelBin& mb);// new code
|
||||
|
||||
private:
|
||||
int channels;
|
||||
float eps;
|
||||
Mat gamma_data;
|
||||
};
|
||||
|
||||
// mylayer.cpp
|
||||
#include "mylayer.h"
|
||||
DEFINE_LAYER_CREATOR(MyLayer)
|
||||
|
||||
// new routine for loading parameters
|
||||
int MyLayer::load_param(const ParamDict& pd)
|
||||
{
|
||||
// details about the relations with param file
|
||||
// https://github.com/Tencent/ncnn/wiki/param-and-model-file-structure
|
||||
//
|
||||
channels = pd.get(0, 0);// parse 0=<int value> entry, default value 0
|
||||
eps = pd.get(1, 0.001f);// parse 1=<float value> entry, default value 0.001f
|
||||
|
||||
return 0;// return zero if success
|
||||
}
|
||||
|
||||
// new routine for loading weights
|
||||
int MyLayer::load_model(const ModelBin& mb)
|
||||
{
|
||||
// details about the relations with model file
|
||||
// https://github.com/Tencent/ncnn/wiki/param-and-model-file-structure
|
||||
//
|
||||
// read weights with length of channels * sizeof(float)
|
||||
// the second argument explains as follows
|
||||
// 0 judge the value type automatically, you may get float or float16 or uint8 etc
|
||||
// depends on the model storage and the supporting target hardware
|
||||
// 1 read float values anyway
|
||||
// 2 read float16 values anyway
|
||||
// 3 read uint8 values anyway
|
||||
gamma_data = mb.load(channels, 1);
|
||||
if (gamma_data.empty())
|
||||
return -100;// return non-zero on error, -100 indicates out-of-memory
|
||||
|
||||
return 0;// return zero if success
|
||||
}
|
||||
```
|
||||
|
||||
# step4 determine forward behavior
|
||||
```cpp
|
||||
// mylayer.h
|
||||
#include "layer.h"
|
||||
using namespace ncnn;
|
||||
|
||||
class MyLayer : public Layer
|
||||
{
|
||||
public:
|
||||
MyLayer();// new code
|
||||
virtual int load_param(const ParamDict& pd);
|
||||
virtual int load_model(const ModelBin& mb);
|
||||
|
||||
private:
|
||||
int channels;
|
||||
float eps;
|
||||
Mat gamma_data;
|
||||
};
|
||||
|
||||
// mylayer.cpp
|
||||
#include "mylayer.h"
|
||||
DEFINE_LAYER_CREATOR(MyLayer)
|
||||
|
||||
// new routine for setting forward behavior
|
||||
MyLayer::MyLayer()
|
||||
{
|
||||
// one input and one output
|
||||
// typical one_blob_only type: Convolution, Pooling, ReLU, Softmax ...
|
||||
// typical non-one_blob_only type: Eltwise, Split, Concat, Slice ...
|
||||
one_blob_only = true;
|
||||
|
||||
// do not change the blob size, modify data in-place
|
||||
// typical support_inplace type: ReLU, Sigmoid ...
|
||||
// typical non-support_inplace type: Convolution, Pooling ...
|
||||
support_inplace = true;
|
||||
}
|
||||
|
||||
int MyLayer::load_param(const ParamDict& pd)
|
||||
{
|
||||
channels = pd.get(0, 0);
|
||||
eps = pd.get(1, 0.001f);
|
||||
|
||||
// you could alter the behavior based on loaded parameter
|
||||
// if (eps == 0.001f)
|
||||
// {
|
||||
// one_blob_only = false;
|
||||
// support_inplace = false;
|
||||
// }
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int MyLayer::load_model(const ModelBin& mb)
|
||||
{
|
||||
gamma_data = mb.load(channels, 1);
|
||||
if (gamma_data.empty())
|
||||
return -100;
|
||||
|
||||
// you could alter the behavior based on loaded weight
|
||||
// if (gamma_data[0] == 0.f)
|
||||
// {
|
||||
// one_blob_only = false;
|
||||
// support_inplace = false;
|
||||
// }
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
# step5 choose proper interface based on forward behavior
|
||||
```cpp
|
||||
// The base class Layer defines four interfaces for each forward behavior combination
|
||||
|
||||
// 1
|
||||
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
|
||||
|
||||
// 2
|
||||
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
|
||||
|
||||
// 3
|
||||
virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
|
||||
|
||||
// 4
|
||||
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
|
||||
```
|
||||
**must** = layer must implement this function
|
||||
|
||||
**optional** = layer may implement this function for optimal performance
|
||||
|
||||
Sometimes the graph inference path cannot call forward_inplace directly because of data sharing. In that situation the non-inplace forward routine is used instead; if the optional routine is not implemented, it deep-copies the input blob and calls the inplace forward on the copy. Thus you can avoid this deep copy by implementing the optional routine and processing input to output on the fly.
|
||||
|
||||
|one_blob_only|support_inplace|1|2|3|4|
|
||||
|---|---|---|---|---|---|
|
||||
|false|false|must| | | |
|
||||
|false|true|optional| |must| |
|
||||
|true|false| |must| | |
|
||||
|true|true| |optional| |must|
|
||||
|
||||
# step6 implement forward function
|
||||
```cpp
|
||||
// mylayer.h
|
||||
#include "layer.h"
|
||||
using namespace ncnn;
|
||||
|
||||
class MyLayer : public Layer
|
||||
{
|
||||
public:
|
||||
MyLayer();
|
||||
virtual int load_param(const ParamDict& pd);
|
||||
virtual int load_model(const ModelBin& mb);
|
||||
|
||||
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;// new code, optional
|
||||
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;// new code
|
||||
|
||||
private:
|
||||
int channels;
|
||||
float eps;
|
||||
Mat gamma_data;
|
||||
};
|
||||
|
||||
// mylayer.cpp
|
||||
#include "mylayer.h"
|
||||
DEFINE_LAYER_CREATOR(MyLayer)
|
||||
|
||||
MyLayer::MyLayer()
|
||||
{
|
||||
one_blob_only = true;
|
||||
support_inplace = true;
|
||||
}
|
||||
|
||||
int MyLayer::load_param(const ParamDict& pd)
|
||||
{
|
||||
channels = pd.get(0, 0);
|
||||
eps = pd.get(1, 0.001f);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int MyLayer::load_model(const ModelBin& mb)
|
||||
{
|
||||
gamma_data = mb.load(channels, 1);
|
||||
if (gamma_data.empty())
|
||||
return -100;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// optional new routine for layer forward function, non-inplace version
|
||||
int MyLayer::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
|
||||
{
|
||||
// check input dims, return non-zero on error
|
||||
if (bottom_blob.c != channels)
|
||||
return -1;
|
||||
|
||||
// x = (x + eps) * gamma_per_channel
|
||||
|
||||
int w = bottom_blob.w;
|
||||
int h = bottom_blob.h;
|
||||
size_t elemsize = bottom_blob.elemsize;
|
||||
int size = w * h;
|
||||
|
||||
top_blob.create(w, h, channels, elemsize, opt.blob_allocator);
|
||||
if (top_blob.empty())
|
||||
return -100;// return non-zero on error, -100 indicates out-of-memory
|
||||
|
||||
#pragma omp parallel for num_threads(opt.num_threads)
|
||||
for (int q=0; q<channels; q++)
|
||||
{
|
||||
const float* ptr = bottom_blob.channel(q);
|
||||
float* outptr = top_blob.channel(q);
|
||||
const float gamma = gamma_data[q];
|
||||
|
||||
for (int i=0; i<size; i++)
|
||||
{
|
||||
outptr[i] = (ptr[i] + eps) * gamma ;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// new routine for layer forward function
|
||||
int MyLayer::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
|
||||
{
|
||||
// check input dims, return non-zero on error
|
||||
if (bottom_top_blob.c != channels)
|
||||
return -1;
|
||||
|
||||
// x = (x + eps) * gamma_per_channel
|
||||
|
||||
int w = bottom_top_blob.w;
|
||||
int h = bottom_top_blob.h;
|
||||
int size = w * h;
|
||||
|
||||
#pragma omp parallel for num_threads(opt.num_threads)
|
||||
for (int q=0; q<channels; q++)
|
||||
{
|
||||
float* ptr = bottom_top_blob.channel(q);
|
||||
const float gamma = gamma_data[q];
|
||||
|
||||
for (int i=0; i<size; i++)
|
||||
{
|
||||
ptr[i] = (ptr[i] + eps) * gamma ;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
# step7 integrate with ncnn library
|
||||
you will probably need to modify caffe2ncnn or mxnet2ncnn etc. to write your layer-specific parameters and weights into the ncnn param and model files
|
||||
|
||||
the param and model file structure [param-and-model-file-structure](param-and-model-file-structure)
|
||||
|
||||
```
|
||||
// example param file content
|
||||
Input input 0 1 input
|
||||
Convolution conv2d 1 1 input conv2d 0=32 1=1 2=1 3=1 4=0 5=0 6=768
|
||||
MyLayer mylayer 1 1 conv2d mylayer0
|
||||
Pooling maxpool 1 1 mylayer0 maxpool 0=0 1=3 2=2 3=-233 4=0
|
||||
```
|
||||
|
||||
```cpp
|
||||
ncnn::Net net;
|
||||
|
||||
// register custom layer before load param and model
|
||||
// the layer creator function signature is always XYZ_layer_creator, which defined in DEFINE_LAYER_CREATOR macro
|
||||
net.register_custom_layer("MyLayer", MyLayer_layer_creator);
|
||||
|
||||
net.load_param("model.param");
|
||||
net.load_model("model.bin");
|
||||
```
|
38
3rdparty/ncnn/docs/developer-guide/how-to-write-a-neon-optimized-op-kernel.md
vendored
Normal file
@@ -0,0 +1,38 @@

# benchmark
op

# naive C with openmp
for for for

# unroll, first try
h

# register allocation
kernels

# unroll, second try
simd

# neon intrinsics
optional

# naive neon assembly with pld
asm

# pipeline optimize, first try
more register load mla

# pipeline optimize, second try
interleave load mla

# pipeline optimize, third try
loop tail

# usual practice, load/save
233

# usual practice, unroll
233

# usual practice, save register
233
311
3rdparty/ncnn/docs/developer-guide/low-level-operation-api.md
vendored
Normal file
@ -0,0 +1,311 @@
|
||||
# implement elementwise addition with/without broadcast using BinaryOp operation
|
||||
|
||||
* input must be fp32 storage without packing
|
||||
* output is expected to be fp32 storage without packing
|
||||
|
||||
```cpp
|
||||
void binary_add(const ncnn::Mat& a, const ncnn::Mat& b, ncnn::Mat& c)
|
||||
{
|
||||
ncnn::Option opt;
|
||||
opt.num_threads = 2;
|
||||
opt.use_fp16_storage = false;
|
||||
opt.use_packing_layout = false;
|
||||
|
||||
ncnn::Layer* op = ncnn::create_layer("BinaryOp");
|
||||
|
||||
// set param
|
||||
ncnn::ParamDict pd;
|
||||
pd.set(0, 0);// op_type
|
||||
|
||||
op->load_param(pd);
|
||||
|
||||
op->create_pipeline(opt);
|
||||
|
||||
// forward
|
||||
std::vector<ncnn::Mat> bottoms(2);
|
||||
bottoms[0] = a;
|
||||
bottoms[1] = b;
|
||||
|
||||
std::vector<ncnn::Mat> tops(1);
|
||||
op->forward(bottoms, tops, opt);
|
||||
|
||||
c = tops[0];
|
||||
|
||||
op->destroy_pipeline(opt);
|
||||
|
||||
delete op;
|
||||
}
|
||||
```
|
||||
|
||||
# implement 3x3 box blur on three channel image using ConvolutionDepthWise operation
|
||||
|
||||
* input must be fp32 storage without packing
|
||||
* output is expected to be fp32 storage without packing
|
||||
|
||||
```cpp
|
||||
void convolution_3x3_boxblur_RGB(const ncnn::Mat& rgb, ncnn::Mat& out)
|
||||
{
|
||||
ncnn::Option opt;
|
||||
opt.num_threads = 2;
|
||||
opt.use_fp16_storage = false;
|
||||
opt.use_packing_layout = false;
|
||||
|
||||
ncnn::Layer* op = ncnn::create_layer("ConvolutionDepthWise");
|
||||
|
||||
// set param
|
||||
ncnn::ParamDict pd;
|
||||
pd.set(0, 3);// num_output
|
||||
pd.set(1, 3);// kernel_w
|
||||
pd.set(5, 0);// bias_term
|
||||
pd.set(6, 3*3*3);// weight_data_size
|
||||
pd.set(7, 3);// group
|
||||
|
||||
op->load_param(pd);
|
||||
|
||||
// set weights
|
||||
ncnn::Mat weights[1];
|
||||
weights[0].create(3*3*3);// weight_data
|
||||
|
||||
for (int i=0; i<3*3*3; i++)
|
||||
{
|
||||
weights[0][i] = 1.f / 9;
|
||||
}
|
||||
|
||||
op->load_model(ncnn::ModelBinFromMatArray(weights));
|
||||
|
||||
op->create_pipeline(opt);
|
||||
|
||||
// forward
|
||||
op->forward(rgb, out, opt);
|
||||
|
||||
op->destroy_pipeline(opt);
|
||||
|
||||
delete op;
|
||||
}
|
||||
```
|
||||
# transpose Mat, chw to cwh
|
||||
|
||||
* input must be fp32 storage with/without packing
|
||||
* output is expected to be fp32 storage packed
|
||||
|
||||
```cpp
|
||||
void transpose(const ncnn::Mat& in, ncnn::Mat& out)
|
||||
{
|
||||
ncnn::Option opt;
|
||||
opt.num_threads = 2;
|
||||
opt.use_fp16_storage = false;
|
||||
opt.use_packing_layout = true;
|
||||
|
||||
ncnn::Layer* op = ncnn::create_layer("Permute");
|
||||
|
||||
// set param
|
||||
ncnn::ParamDict pd;
|
||||
pd.set(0, 1);// order_type
|
||||
|
||||
op->load_param(pd);
|
||||
|
||||
op->create_pipeline(opt);
|
||||
|
||||
ncnn::Mat in_packed = in;
|
||||
{
|
||||
// resolve dst_elempack
|
||||
int dims = in.dims;
|
||||
int elemcount = 0;
|
||||
if (dims == 1) elemcount = in.elempack * in.w;
|
||||
if (dims == 2) elemcount = in.elempack * in.h;
|
||||
if (dims == 3) elemcount = in.elempack * in.c;
|
||||
|
||||
int dst_elempack = 1;
|
||||
if (op->support_packing)
|
||||
{
|
||||
if (elemcount % 8 == 0 && (ncnn::cpu_support_x86_avx2() || ncnn::cpu_support_x86_avx()))
|
||||
dst_elempack = 8;
|
||||
else if (elemcount % 4 == 0)
|
||||
dst_elempack = 4;
|
||||
}
|
||||
|
||||
if (in.elempack != dst_elempack)
|
||||
{
|
||||
convert_packing(in, in_packed, dst_elempack, opt);
|
||||
}
|
||||
}
|
||||
|
||||
// forward
|
||||
op->forward(in_packed, out, opt);
|
||||
|
||||
op->destroy_pipeline(opt);
|
||||
|
||||
delete op;
|
||||
}
|
||||
```
|
||||
# apply instance normalization
|
||||
// x = (x - mean) / sqrt(var)
|
||||
|
||||
* input can be fp32/fp16 storage with/without packing
|
||||
* output is expected to be fp16 storage packed when supported, or fp32 storage packed otherwise
|
||||
|
||||
```cpp
|
||||
void normalize(const ncnn::Mat& in, ncnn::Mat& out)
|
||||
{
|
||||
ncnn::Option opt;
|
||||
opt.num_threads = 2;
|
||||
opt.use_fp16_storage = true;
|
||||
opt.use_packing_layout = true;
|
||||
|
||||
ncnn::Layer* op = ncnn::create_layer("InstanceNorm");
|
||||
|
||||
// set param
|
||||
ncnn::ParamDict pd;
|
||||
pd.set(0, in.c);// channels
|
||||
pd.set(1, 0.f);// eps
|
||||
|
||||
op->load_param(pd);
|
||||
|
||||
// set weights
|
||||
ncnn::Mat weights[2];
|
||||
weights[0].create(in.c);// gamma_data
|
||||
weights[1].create(in.c);// beta_data
|
||||
|
||||
weights[0].fill(1.f);
|
||||
weights[1].fill(0.f);
|
||||
|
||||
op->load_model(ncnn::ModelBinFromMatArray(weights));
|
||||
|
||||
op->create_pipeline(opt);
|
||||
|
||||
ncnn::Mat in_fp16 = in;
|
||||
if (in.elembits() == 32 && op->support_fp16_storage)
|
||||
{
|
||||
cast_float32_to_float16(in, in_fp16, opt);
|
||||
}
|
||||
if (in.elembits() == 16 && !op->support_fp16_storage)
|
||||
{
|
||||
cast_float16_to_float32(in, in_fp16, opt);
|
||||
}
|
||||
|
||||
ncnn::Mat in_fp16_packed = in_fp16;
|
||||
{
|
||||
// resolve dst_elempack
|
||||
int dims = in_fp16.dims;
|
||||
int elemcount = 0;
|
||||
if (dims == 1) elemcount = in_fp16.elempack * in_fp16.w;
|
||||
if (dims == 2) elemcount = in_fp16.elempack * in_fp16.h;
|
||||
if (dims == 3) elemcount = in_fp16.elempack * in_fp16.c;
|
||||
|
||||
int dst_elempack = 1;
|
||||
if (op->support_packing)
|
||||
{
|
||||
if (elemcount % 8 == 0 && (ncnn::cpu_support_x86_avx2() || ncnn::cpu_support_x86_avx()))
|
||||
dst_elempack = 8;
|
||||
else if (elemcount % 4 == 0)
|
||||
dst_elempack = 4;
|
||||
}
|
||||
|
||||
if (in_fp16.elempack != dst_elempack)
|
||||
{
|
||||
convert_packing(in_fp16, in_fp16_packed, dst_elempack, opt);
|
||||
}
|
||||
}
|
||||
|
||||
// forward
|
||||
op->forward(in_fp16_packed, out, opt);
|
||||
|
||||
op->destroy_pipeline(opt);
|
||||
|
||||
delete op;
|
||||
}
|
||||
```
|
||||
|
||||
# cpu -> gpu -> forward -> gpu -> cpu
|
||||
|
||||
```cpp
|
||||
ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();
|
||||
|
||||
ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
|
||||
ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();
|
||||
|
||||
ncnn::VkWeightAllocator* weight_vkallocator = new ncnn::VkWeightAllocator(vkdev);
|
||||
ncnn::VkWeightStagingAllocator* weight_staging_vkallocator = new ncnn::VkWeightStagingAllocator(vkdev);
|
||||
|
||||
// create layer
|
||||
ncnn::Layer* convolution = ncnn::create_layer("Convolution");
|
||||
convolution->vkdev = vkdev;
|
||||
|
||||
// set option
|
||||
ncnn::Option opt;
|
||||
opt.num_threads = 4;
|
||||
opt.use_vulkan_compute = true;
|
||||
opt.blob_vkallocator = blob_vkallocator;
|
||||
opt.workspace_vkallocator = blob_vkallocator;
|
||||
opt.staging_vkallocator = staging_vkallocator;
|
||||
|
||||
// load param
|
||||
{
|
||||
ncnn::ParamDict pd;
|
||||
pd.set(0, outch);
|
||||
pd.set(1, ksize);
|
||||
pd.set(6, outch*inch*ksize*ksize);
|
||||
pd.use_vulkan_compute = 1;
|
||||
|
||||
convolution->load_param(pd);
|
||||
}
|
||||
|
||||
// load model
|
||||
{
|
||||
ncnn::Mat weights[2];
|
||||
weights[0] = random_mat(outch*inch*ksize*ksize);
|
||||
weights[1] = random_mat(outch);
|
||||
|
||||
ncnn::ModelBinFromMatArray mb(weights);
|
||||
convolution->load_model(mb);
|
||||
}
|
||||
|
||||
// create pipeline
|
||||
convolution->create_pipeline(opt);
|
||||
|
||||
// upload model
|
||||
{
|
||||
ncnn::VkTransfer cmd(vkdev);
|
||||
|
||||
ncnn::Option opt_upload = opt;
|
||||
opt_upload.blob_vkallocator = weight_vkallocator;
|
||||
opt_upload.workspace_vkallocator = weight_vkallocator;
|
||||
opt_upload.staging_vkallocator = weight_staging_vkallocator;
|
||||
|
||||
convolution->upload_model(cmd, opt_upload);
|
||||
|
||||
cmd.submit_and_wait();
|
||||
}
|
||||
|
||||
ncnn::Mat bottom = random_mat(w, h, inch);
|
||||
|
||||
ncnn::Mat top;
|
||||
|
||||
// forward
|
||||
{
|
||||
ncnn::VkCompute cmd(vkdev);
|
||||
|
||||
ncnn::VkMat bottom_gpu;
|
||||
cmd.record_upload(bottom, bottom_gpu, opt);
|
||||
|
||||
ncnn::VkMat top_gpu;
|
||||
convolution->forward(bottom_gpu, top_gpu, cmd, opt);
|
||||
|
||||
cmd.record_download(top_gpu, top, opt);
|
||||
|
||||
cmd.submit_and_wait();
|
||||
}
|
||||
|
||||
convolution->destroy_pipeline(opt);
|
||||
|
||||
delete convolution;
|
||||
|
||||
vkdev->reclaim_blob_allocator(blob_vkallocator);
|
||||
vkdev->reclaim_staging_allocator(staging_vkallocator);
|
||||
|
||||
weight_vkallocator->clear();
|
||||
weight_staging_vkallocator->clear();
|
||||
delete weight_vkallocator;
|
||||
delete weight_staging_vkallocator;
|
||||
```
|
||||
|
46
3rdparty/ncnn/docs/developer-guide/ncnn-tips-and-tricks.zh.md
vendored
Normal file
@@ -0,0 +1,46 @@

### blob memory is implicitly shared

ncnn blobs originally used opencv's cv::Mat directly; since a blob needs at most three dimensions, a similar Mat class was implemented instead.
Each channel of a Mat's data is 16-byte aligned, and the data carries an atomic reference count, so a = b copies no data and is super fast.
Mat can also reference an external memory block directly without copying, which speeds up model loading and input/output.

For example: the split layer copies one blob into n blobs; in ncnn this is implemented as a plain reference count increment, with no data copy at all.
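A minimal sketch of the reference-counted sharing described above (the shape is arbitrary; clone() is the explicit deep copy ncnn provides when separate storage is wanted):

```cpp
#include "mat.h"

void share_demo()
{
    ncnn::Mat a(224, 224, 3); // w=224 h=224 c=3, one allocation
    a.fill(1.f);

    ncnn::Mat b = a;          // no data copy, just an atomic refcount increment
    // a.data == b.data here; writes through b are visible through a

    ncnn::Mat c = a.clone();  // explicit deep copy when independent storage is needed
    // the shared allocation is released when the last Mat referencing it goes away
}
```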
### compute only part of the graph and keep intermediate results

When resolving branch dependencies, ncnn's net works top-down and depth-first, so when a network has multiple branches, computation only runs along the branch whose result is requested, saving time.
When branches overlap, computing one branch automatically keeps the intermediate results the remaining branches need (implicitly shared), so they can be reused when those branches are computed.

For example: for a network A -> B -> C1 + C2, asking ncnn for the C1 result runs A -> B -> C1, and the B result is kept automatically by bumping its reference count; when the C2 result is needed later, computing only C2 is enough.

### enable light mode to save memory

Every layer produces blobs; apart from the final results and the intermediates shared by multiple branches, most blobs are not worth keeping. Enabling light mode reclaims them automatically right after they are consumed, saving memory.

For example: for a network A -> B -> C, in light mode, asking ncnn for the C result reclaims the A result while computing B and the B result while computing C, leaving only the C result; asking for C again later returns it directly. This matches how the vast majority of deep networks are used.

### the network and the computation are separate

ncnn's net is the network model; what you actually run is an extractor. The same net can serve many computation instances which do not affect each other, with intermediate results kept inside each extractor. In multi-threaded use they share the network structure and parameter data, so the model and parameters only need to be initialized once.

For example: a global static net instance, initialized once, can keep producing extractors for use.
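A minimal sketch of that pattern; the model file names and the blob names "data"/"prob" are placeholders for illustration:

```cpp
#include "net.h"

// initialized once, shared by every inference call
static ncnn::Net g_net;

void init_once()
{
    // placeholder file names
    g_net.load_param("model.param");
    g_net.load_model("model.bin");
}

void infer(const ncnn::Mat& in, ncnn::Mat& out)
{
    // each call gets its own extractor; extractors are independent,
    // and intermediate results live inside the extractor
    ncnn::Extractor ex = g_net.create_extractor();
    ex.input("data", in);
    ex.extract("prob", out);
}
```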
### openmp is fast, but not always the right choice

Almost every computation in ncnn can be accelerated with openmp multi-threading, and the performance is great.
However, the system sometimes slows down briefly, for example when the phone throttles from heat or the UI is being used, and ncnn latency will occasionally jitter upward. When stable latency matters, consider disabling openmp or setting the extractor thread count.

For example: when taking a selfie with real-time face localization done by ncnn, a sudden latency spike is felt as a dropped frame, while a stable frame rate feels better.

### NCNN_STDIO/NCNN_STRING to disable model file support

ncnn can load models from its own files or from memory. NCNN_STDIO controls whether loading model files is supported; setting it to 0 disables that code and shrinks the library. Setting NCNN_STRING to 0 removes most of the visible strings and the text parsing code.
When a model is loaded from memory, the parameter data is referenced directly, which is faster; this is the usual approach on phones.

### trim ncnn's built-in layer implementations

When running cmake, add -DWITH_LAYER_xxx=OFF to skip compiling the corresponding built-in layer entirely, which further shrinks the library.

### about ARM big.LITTLE scheduling

Calling set_cpu_powersave pins ncnn's compute threads to specific cpu cores: the big cores are fast but power-hungry, the little cores are slower but power-efficient, and using both heats the phone up quickly.
194
3rdparty/ncnn/docs/developer-guide/new-model-load-api.md
vendored
Normal file
@ -0,0 +1,194 @@
|
||||
## current model load api
|
||||
### Cons
|
||||
#### long and awful code
|
||||
#### two functions
|
||||
#### deal float32 float16 quantized-u8
|
||||
#### deal alignment size
|
||||
```cpp
|
||||
#if NCNN_STDIO
|
||||
int Convolution::load_model(FILE* binfp)
|
||||
{
|
||||
int nread;
|
||||
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
unsigned char f0;
|
||||
unsigned char f1;
|
||||
unsigned char f2;
|
||||
unsigned char f3;
|
||||
};
|
||||
unsigned int tag;
|
||||
} flag_struct;
|
||||
|
||||
nread = fread(&flag_struct, sizeof(flag_struct), 1, binfp);
|
||||
if (nread != 1)
|
||||
{
|
||||
fprintf(stderr, "Convolution read flag_struct failed %d\n", nread);
|
||||
return -1;
|
||||
}
|
||||
|
||||
unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
|
||||
|
||||
weight_data.create(weight_data_size);
|
||||
if (weight_data.empty())
|
||||
return -100;
|
||||
|
||||
if (flag_struct.tag == 0x01306B47)
|
||||
{
|
||||
// half-precision weight data
|
||||
int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned short), 4);
|
||||
std::vector<unsigned short> float16_weights;
|
||||
float16_weights.resize(align_weight_data_size);
|
||||
nread = fread(float16_weights.data(), align_weight_data_size, 1, binfp);
|
||||
if (nread != 1)
|
||||
{
|
||||
fprintf(stderr, "Convolution read float16_weights failed %d\n", nread);
|
||||
return -1;
|
||||
}
|
||||
|
||||
weight_data = Mat::from_float16(float16_weights.data(), weight_data_size);
|
||||
if (weight_data.empty())
|
||||
return -100;
|
||||
}
|
||||
else if (flag != 0)
|
||||
{
|
||||
// quantized weight data
|
||||
float quantization_value[256];
|
||||
nread = fread(quantization_value, 256 * sizeof(float), 1, binfp);
|
||||
if (nread != 1)
|
||||
{
|
||||
fprintf(stderr, "Convolution read quantization_value failed %d\n", nread);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned char), 4);
|
||||
std::vector<unsigned char> index_array;
|
||||
index_array.resize(align_weight_data_size);
|
||||
nread = fread(index_array.data(), align_weight_data_size, 1, binfp);
|
||||
if (nread != 1)
|
||||
{
|
||||
fprintf(stderr, "Convolution read index_array failed %d\n", nread);
|
||||
return -1;
|
||||
}
|
||||
|
||||
float* weight_data_ptr = weight_data;
|
||||
for (int i = 0; i < weight_data_size; i++)
|
||||
{
|
||||
weight_data_ptr[i] = quantization_value[ index_array[i] ];
|
||||
}
|
||||
}
|
||||
else if (flag_struct.f0 == 0)
|
||||
{
|
||||
// raw weight data
|
||||
nread = fread(weight_data, weight_data_size * sizeof(float), 1, binfp);
|
||||
if (nread != 1)
|
||||
{
|
||||
fprintf(stderr, "Convolution read weight_data failed %d\n", nread);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (bias_term)
|
||||
{
|
||||
bias_data.create(num_output);
|
||||
if (bias_data.empty())
|
||||
return -100;
|
||||
nread = fread(bias_data, num_output * sizeof(float), 1, binfp);
|
||||
if (nread != 1)
|
||||
{
|
||||
fprintf(stderr, "Convolution read bias_data failed %d\n", nread);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif // NCNN_STDIO
|
||||
|
||||
int Convolution::load_model(const unsigned char*& mem)
|
||||
{
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
unsigned char f0;
|
||||
unsigned char f1;
|
||||
unsigned char f2;
|
||||
unsigned char f3;
|
||||
};
|
||||
unsigned int tag;
|
||||
} flag_struct;
|
||||
|
||||
memcpy(&flag_struct, mem, sizeof(flag_struct));
|
||||
mem += sizeof(flag_struct);
|
||||
|
||||
unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
|
||||
|
||||
if (flag_struct.tag == 0x01306B47)
|
||||
{
|
||||
// half-precision weight data
|
||||
weight_data = Mat::from_float16((unsigned short*)mem, weight_data_size);
|
||||
mem += alignSize(weight_data_size * sizeof(unsigned short), 4);
|
||||
if (weight_data.empty())
|
||||
return -100;
|
||||
}
|
||||
else if (flag != 0)
|
||||
{
|
||||
// quantized weight data
|
||||
const float* quantization_value = (const float*)mem;
|
||||
mem += 256 * sizeof(float);
|
||||
|
||||
const unsigned char* index_array = (const unsigned char*)mem;
|
||||
mem += alignSize(weight_data_size * sizeof(unsigned char), 4);
|
||||
|
||||
weight_data.create(weight_data_size);
|
||||
if (weight_data.empty())
|
||||
return -100;
|
||||
float* weight_data_ptr = weight_data;
|
||||
for (int i = 0; i < weight_data_size; i++)
|
||||
{
|
||||
weight_data_ptr[i] = quantization_value[ index_array[i] ];
|
||||
}
|
||||
}
|
||||
else if (flag_struct.f0 == 0)
|
||||
{
|
||||
// raw weight data
|
||||
weight_data = Mat(weight_data_size, (float*)mem);
|
||||
mem += weight_data_size * sizeof(float);
|
||||
}
|
||||
|
||||
if (bias_term)
|
||||
{
|
||||
bias_data = Mat(num_output, (float*)mem);
|
||||
mem += num_output * sizeof(float);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
## new model load api proposed
|
||||
### Pros
|
||||
#### clean and simple api
|
||||
#### element type detection
|
||||
```cpp
|
||||
int Convolution::load_model(const ModelBin& mb)
|
||||
{
|
||||
// auto detect element type
|
||||
weight_data = mb.load(weight_data_size, 0);
|
||||
if (weight_data.empty())
|
||||
return -100;
|
||||
|
||||
if (bias_term)
|
||||
{
|
||||
// certain type specified
|
||||
bias_data = mb.load(num_output, 1);
|
||||
if (bias_data.empty())
|
||||
return -100;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
92
3rdparty/ncnn/docs/developer-guide/new-param-load-api.md
vendored
Normal file
@ -0,0 +1,92 @@
|
||||
## current param load api
|
||||
### Cons
|
||||
#### long and awful code
|
||||
#### three functions
|
||||
#### not extensible
|
||||
#### no default value
|
||||
#### no variable length array
|
||||
```
|
||||
MyLayer mylayer 1 1 in out 100 1.250000
|
||||
```
|
||||
```
|
||||
binary 100
|
||||
binary 1.250000
|
||||
```
|
||||
```cpp
|
||||
#if NCNN_STDIO
|
||||
#if NCNN_STRING
|
||||
int MyLayer::load_param(FILE* paramfp)
|
||||
{
|
||||
int nscan = fscanf(paramfp, "%d %f", &a, &b);
|
||||
if (nscan != 2)
|
||||
{
|
||||
fprintf(stderr, "MyLayer load_param failed %d\n", nscan);
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif // NCNN_STRING
|
||||
int MyLayer::load_param_bin(FILE* paramfp)
|
||||
{
|
||||
fread(&a, sizeof(int), 1, paramfp);
|
||||
|
||||
fread(&b, sizeof(float), 1, paramfp);
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif // NCNN_STDIO
|
||||
|
||||
int MyLayer::load_param(const unsigned char*& mem)
|
||||
{
|
||||
a = *(int*)(mem);
|
||||
mem += 4;
|
||||
|
||||
b = *(float*)(mem);
|
||||
mem += 4;
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
## new param load api proposed
|
||||
### Pros
|
||||
#### clean and simple api
|
||||
#### default value
|
||||
#### extensible
|
||||
#### variable length array
|
||||
```
|
||||
7767517
|
||||
MyLayer mylayer 1 1 in out 0=100 1=1.250000 -23303=5,0.1,0.2,0.4,0.8,1.0
|
||||
```
|
||||
```
|
||||
binary 0xDD857600(magic)
|
||||
|
||||
binary 0
|
||||
binary 100
|
||||
binary 1
|
||||
binary 1.250000
|
||||
binary -23303
|
||||
binary 5
|
||||
binary 0.1
|
||||
binary 0.2
|
||||
binary 0.4
|
||||
binary 0.8
|
||||
binary 1.0
|
||||
binary -233(EOP)
|
||||
```
|
||||
```cpp
|
||||
int MyLayer::load_param(const ParamDict& pd)
|
||||
{
|
||||
// pd.get( param id (seq), default value );
|
||||
a = pd.get(0, 100);
|
||||
b = pd.get(1, 1.25f);
|
||||
|
||||
// get default value for c if not specified in param file
|
||||
c = pd.get(2, 0.001);
|
||||
|
||||
// get array
|
||||
d = pd.get(3, Mat(len, array));
|
||||
return 0;
|
||||
}
|
||||
```
|
303
3rdparty/ncnn/docs/developer-guide/operation-param-weight-table.md
vendored
Normal file
@ -0,0 +1,303 @@
|operation|param id|param phase|default value|weight order|
|:---:|:---:|:---:|:---:|:---:|
|AbsVal|||
|ArgMax|0|out_max_val|0|
||1|topk|1|
|BatchNorm|0|channels|0|slope mean variance bias|
||1|eps|0.f|
|Bias|0|bias_data_size|0|
|BinaryOp|0|op_type|0|
||1|with_scalar|0|
||2|b|0.f|
|BNLL|||
|Cast|0|type_from|0|
||1|type_to|0|
|Clip|0|min|-FLT_MAX|
||1|max|FLT_MAX|
|Concat|0|axis|0|
|Convolution|0|num_output|0|weight bias|
||1|kernel_w|0|
||2|dilation_w|1|
||3|stride_w|1|
||4|pad_left|0|
||5|bias_term|0|
||6|weight_data_size|0|
||8|int8_scale_term|0|
||9|activation_type|0|
||10|activation_params|[ ]|
||11|kernel_h|kernel_w|
||12|dilation_h|dilation_w|
||13|stride_h|stride_w|
||15|pad_right|pad_left|
||14|pad_top|pad_left|
||16|pad_bottom|pad_top|
||17|impl_type|0|
||18|pad_value|0.f|
|ConvolutionDepthWise|0|num_output|0|weight bias|
||1|kernel_w|0|
||2|dilation_w|1|
||3|stride_w|1|
||4|pad_left|0|
||5|bias_term|0|
||6|weight_data_size|0|
||7|group|1|
||8|int8_scale_term|0|
||9|activation_type|0|
||10|activation_params|[ ]|
||11|kernel_h|kernel_w|
||12|dilation_h|dilation_w|
||13|stride_h|stride_w|
||15|pad_right|pad_left|
||14|pad_top|pad_left|
||16|pad_bottom|pad_top|
||18|pad_value|0.f|
|Crop|0|woffset|0|
||1|hoffset|0|
||2|coffset|0|
||3|outw|0|
||4|outh|0|
||5|outc|0|
||6|woffset2|0|
||7|hoffset2|0|
||8|coffset2|0|
||9|starts|[ ]|
||10|ends|[ ]|
||11|axes|[ ]|
|Deconvolution|0|num_output|0|weight bias|
||1|kernel_w|0|
||2|dilation_w|1|
||3|stride_w|1|
||4|pad_left|0|
||5|bias_term|0|
||6|weight_data_size|0|
||9|activation_type|0|
||10|activation_params|[ ]|
||11|kernel_h|kernel_w|
||12|dilation_h|dilation_w|
||13|stride_h|stride_w|
||15|pad_right|pad_left|
||14|pad_top|pad_left|
||16|pad_bottom|pad_top|
||18|output_pad_right|0|
||19|output_pad_bottom|output_pad_right|
||20|output_w|0|
||21|output_h|output_w|
|DeconvolutionDepthWise|0|num_output|0|weight bias|
||1|kernel_w|0|
||2|dilation_w|1|
||3|stride_w|1|
||4|pad_left|0|
||5|bias_term|0|
||6|weight_data_size|0|
||7|group|1|
||9|activation_type|0|
||10|activation_params|[ ]|
||11|kernel_h|kernel_w|
||12|dilation_h|dilation_w|
||13|stride_h|stride_w|
||15|pad_right|pad_left|
||14|pad_top|pad_left|
||16|pad_bottom|pad_top|
||18|output_pad_right|0|
||19|output_pad_bottom|output_pad_right|
||20|output_w|0|
||21|output_h|output_w|
|Dequantize|0|scale|1.f|bias|
||1|bias_term|0|
||2|bias_data_size|0|
|DetectionOutput|0|num_class|0|
||1|nms_threshold|0.05f|
||2|nms_top_k|300|
||3|keep_top_k|100|
||4|confidence_threshold|0.5f|
||5|variances[0]|0.1f|
||6|variances[1]|0.1f|
||7|variances[2]|0.2f|
||8|variances[3]|0.2f|
|Dropout|0|scale|1.f|
|Eltwise|0|op_type|0|
||1|coeffs|[ ]|
|ELU|0|alpha|0.1f|
|Embed|0|num_output|0|weight bias|
||1|input_dim|0|
||2|bias_term|0|
||3|weight_data_size|0|
|Exp|0|base|-1.f|
||1|scale|1.f|
||2|shift|0.f|
|ExpandDims|0|expand_w|0|
||1|expand_h|0|
||2|expand_c|0|
||3|axes|[ ]|
|Flatten|||
|HardSigmoid|0|alpha|0.2f||
||1|beta|0.5f|
|HardSwish|0|alpha|0.2f||
||1|beta|0.5f|
|InnerProduct|0|num_output|0|weight bias|
||1|bias_term|0|
||2|weight_data_size|0|
||8|int8_scale_term|0|
||9|activation_type|0|
||10|activation_params|[ ]|
|Input|0|w|0|
||1|h|0|
||2|c|0|
|InstanceNorm|0|channels|0|gamma bias|
||1|eps|0.001f|
|Interp|0|resize_type|0|
||1|height_scale|1.f|
||2|width_scale|1.f|
||3|output_height|0|
||4|output_width|0|
|Log|0|base|-1.f|
||1|scale|1.f|
||2|shift|0.f|
|LRN|0|region_type|0|
||1|local_size|5|
||2|alpha|1.f|
||3|beta|0.75f|
||4|bias|1.f|
|LSTM|0|num_output|0|
||1|weight_data_size|1|
||2|direction|0|
|MemoryData|0|w|0|
||1|h|0|
||2|c|0|
|Mish|||
|MVN|0|normalize_variance|0|
||1|across_channels|0|
||2|eps|0.0001f|
|Noop|||
|Normalize|0|across_spatial|0|scale|
||4|across_channel|0|
||1|channel_shared|0|
||2|eps|0.0001f|
||9|eps_mode|0|
||3|scale_data_size|0|
|Packing|0|out_packing|1|
||1|use_padding|0|
||2|cast_type_from|0|
||3|cast_type_to|0|
||4|storage_type_from|0|
||5|storage_type_to|0|
|Padding|0|top|0|per_channel_pad_data|
||1|bottom|0|
||2|left|0|
||3|right|0|
||4|type|0|
||5|value|0.f|
||6|per_channel_pad_data_size|0|
||7|front|0|
||8|behind|0|
|Permute|0|order_type|0|
|PixelShuffle|0|upscale_factor|1|
|Pooling|0|pooling_type(0: max 1: avg)|0|
||1|kernel_w|0|
||11|kernel_h|kernel_w|
||2|stride_w|1|
||12|stride_h|stride_w|
||3|pad_left|0|
||14|pad_right|pad_left|
||13|pad_top|pad_left|
||15|pad_bottom|pad_top|
||4|global_pooling|0|
||5|pad_mode|0|
|Power|0|power|1.f|
||1|scale|1.f|
||2|shift|0.f|
|PReLU|0|num_slope|0|slope|
|PriorBox|0|min_sizes|[ ]|
||1|max_sizes|[ ]|
||2|aspect_ratios|[ ]|
||3|variances[0]|0.f|
||4|variances[1]|0.f|
||5|variances[2]|0.f|
||6|variances[3]|0.f|
||7|flip|1|
||8|clip|0|
||9|image_width|0|
||10|image_height|0|
||11|step_width|-233.f|
||12|step_height|-233.f|
||13|offset|0.f|
||14|step_mmdetection|0|
||15|center_mmdetection|0|
|Proposal|0|feat_stride|16|
||1|base_size|16|
||2|pre_nms_topN|6000|
||3|after_nms_topN|300|
||4|num_thresh|0.7f|
||5|min_size|16|
|PSROIPooling|0|pooled_width|7|
||1|pooled_height|7|
||2|spatial_scale|0.0625f|
||3|output_dim|0|
|Quantize|0|scale|1.f|
|Reduction|0|operation|0|
||1|dim|0|
||2|coeff|1.f|
||3|axes|[ ]|
||4|keepdims|0|
|ReLU|0|slope|0.f|
|Reorg|0|stride|0|
|Requantize|0|scale_in|1.f|bias|
||1|scale_out|1.f|
||2|bias_term|0|
||3|bias_data_size|0|
||4|fusion_relu|0|
|Reshape|0|w|-233|
||1|h|-233|
||2|c|-233|
||3|permute|0|
|ROIAlign|0|pooled_width|0|
||1|pooled_height|0|
||2|spatial_scale|1.f|
||3|sampling_ratio|0|
||4|aligned|0|
||5|version|0|
|ROIPooling|0|pooled_width|0|
||1|pooled_height|0|
||2|spatial_scale|1.f|
|Scale|0|scale_data_size|0|scale bias|
||1|bias_term|0|
|SELU|0|alpha|1.67326324f||
||1|lambda|1.050700987f|
|ShuffleChannel|0|group|1|
|Sigmoid|||
|Slice|0|slices|[ ]|
||1|axis|0|
|Softmax|0|axis|0|
|Split|||
|SPP|0|pooling_type|0|
||1|pyramid_height|1|
|Squeeze|0|squeeze_w|0|
||1|squeeze_h|0|
||2|squeeze_c|0|
||3|axes|[ ]|
|StatisticsPooling|0|include_stddev|0|
|Swish|||
|TanH|||
|Threshold|0|threshold|0.f|
|Tile|0|dim|0|
||1|tiles|1|
|UnaryOp|0|op_type|0|
|YoloDetectionOutput|0|num_class|20|
||1|num_box|5|
||2|confidence_threshold|0.01f|
||3|num_threshold|0.45f|
||4|biases|[]|
|Yolov3DetectionOutput|0|num_class|20|
||1|num_box|5|
||2|confidence_threshold|0.01f|
||3|num_threshold|0.45f|
||4|biases|[]|
||5|mask|[]|
||6|anchors_scale|[]|
|RNN|0|num_output|0|
||1|weight_data_size|0|
||2|direction|0|
|MultiHeadAttention|0|embed_dim|0|
||1|num_head|1|
||2|weight_data_size|0|
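
As a concrete illustration of how these ids show up in a `.param` file (hypothetical layer and blob names, arbitrary values), a Convolution layer with 64 outputs, a 3x3 kernel, stride 2, a bias term, and 3 input channels might be written as:

```
Convolution conv1 1 1 data conv1 0=64 1=3 3=2 5=1 6=1728 9=1
```

Here 0=64 sets num_output, 1=3 sets kernel_w (kernel_h falls back to kernel_w), 3=2 sets stride_w, 5=1 enables bias_term, 6=1728 is weight_data_size (64\*3\*3\*3 for the assumed 3 input channels), and 9=1 selects a fused activation. Per the weight order column, the weight blob is stored in net.bin first, followed by the bias blob.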
1643
3rdparty/ncnn/docs/developer-guide/operators.md
vendored
Normal file
1643
3rdparty/ncnn/docs/developer-guide/operators.md
vendored
Normal file
File diff suppressed because it is too large
64
3rdparty/ncnn/docs/developer-guide/param-and-model-file-structure.md
vendored
Normal file
64
3rdparty/ncnn/docs/developer-guide/param-and-model-file-structure.md
vendored
Normal file
@ -0,0 +1,64 @@
## net.param

### example
```
7767517
3 3
Input input 0 1 data 0=4 1=4 2=1
InnerProduct ip 1 1 data fc 0=10 1=1 2=80
Softmax softmax 1 1 fc prob 0=0
```

### overview
```
[magic]
```
* magic number : 7767517
```
[layer count] [blob count]
```
* layer count : count of the layer lines that follow; it should be exactly the number of layers
* blob count : count of all blobs; usually greater than or equal to the layer count

### layer line
```
[layer type] [layer name] [input count] [output count] [input blobs] [output blobs] [layer specific params]
```
* layer type : type name, such as Convolution, Softmax, etc.
* layer name : name of this layer, must be unique among all layer names
* input count : count of the blobs this layer takes as input
* output count : count of the blobs this layer produces as output
* input blobs : name list of the input blobs, separated by spaces, must be unique among the input blob names of all layers
* output blobs : name list of the output blobs, separated by spaces, must be unique among the output blob names of all layers
* layer specific params : key=value pair list, separated by spaces

### layer param
```
0=1 1=2.5 -23303=2,2.0,3.0
```
The key index must be unique within each layer line; a pair can be omitted if the default value is used.

The meaning of each existing param key index can be looked up in [operation-param-weight-table](operation-param-weight-table).

* integer or float key : index 0 ~ 19
  * integer value : int
  * float value : float
* integer array or float array key : -23300 minus index 0 ~ 19
  * integer array value : [array size],int,int,...,int
  * float array value : [array size],float,float,...,float
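
A minimal parsing sketch of this encoding (illustration only, not the actual ncnn loader) that tells scalar keys apart from array keys:

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* decode one "key=value" token of a layer line */
static void decode_pair(const char* token)
{
    int key = atoi(token);
    const char* value = strchr(token, '=') + 1;

    if (key <= -23300)
    {
        /* array key: the real index is -23300 - key, the value is "size,v0,v1,..." */
        int index = -23300 - key;
        int size = atoi(value);
        printf("array param %d with %d elements: %s\n", index, size, strchr(value, ',') + 1);
    }
    else
    {
        /* scalar key: int or float depending on the layer's definition */
        printf("scalar param %d = %s\n", key, value);
    }
}

int main(void)
{
    decode_pair("0=1");
    decode_pair("1=2.5");
    decode_pair("-23303=2,2.0,3.0");
    return 0;
}
```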

## net.bin

```
  +---------+---------+---------+---------+---------+---------+
  | weight1 | weight2 | weight3 | weight4 | ....... | weightN |
  +---------+---------+---------+---------+---------+---------+
  ^         ^         ^         ^
  0x0       0x80      0x140     0x1C0
```
The model binary is the concatenation of all weight data; each weight buffer is aligned to 32 bits.

### weight buffer

```
[flag] (optional)
[raw data]
[padding] (optional)
```
* flag : unsigned int, little-endian, indicating the weight storage type: 0 => float32, 0x01306B47 => float16, otherwise => quantized int8; it may be omitted if the layer implementation forces the storage type explicitly
* raw data : raw weight data, little-endian; float32 data, float16 data, or quantization table and indexes depending on the storage type flag
* padding : padding space for 32-bit alignment; may be omitted if already aligned
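
A rough reader sketch for a single weight buffer, following the layout above (assumes the flag word is present and ignores the quantization table; this is not the ncnn ModelBin API):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* 'p' points at one weight buffer inside net.bin,
   'count' is the number of weight elements the layer expects */
static size_t weight_buffer_size(const unsigned char* p, size_t count)
{
    uint32_t flag;
    memcpy(&flag, p, 4); /* little-endian flag word */

    size_t elem_size;
    if (flag == 0)
        elem_size = 4;               /* float32 */
    else if (flag == 0x01306B47)
        elem_size = 2;               /* float16 */
    else
        elem_size = 1;               /* quantized int8 (the quantization table is ignored here) */

    size_t raw = 4 + count * elem_size; /* flag + raw data */
    return (raw + 3) & ~(size_t)3;      /* pad up to 32-bit alignment */
}

int main(void)
{
    unsigned char buf[4 + 10 * 4] = {0}; /* fake buffer: flag 0 => float32, 10 weights */
    printf("%zu bytes\n", weight_buffer_size(buf, 10));
    return 0;
}
```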
29
3rdparty/ncnn/docs/developer-guide/preload-practice.zh.md
vendored
Normal file
29
3rdparty/ncnn/docs/developer-guide/preload-practice.zh.md
vendored
Normal file
@ -0,0 +1,29 @@
## Practical experience only, no theory behind it, not necessarily correct

```
prfm pldl1keep, [x0, #256]
```
* place it 0~8 instructions before the ld1 [x0] it serves (see the sketch after this list)
* #256 means the contents at x0+256 are pulled into the L1 cache
* also applies to ldp
* (experience) omitting the offset is worse than writing at least #128
* (experience) pldl1strm does not seem worth much, and it is not as fast as pldl1keep
* (experience) the contents of x0 ~ x0+256 are pulled in as well
* (experience) use #128 when loading 128 bits, #256 for 256 bits or more
* (experience) avoid the order pld a, pld b, load a, load b; the prefetches may interfere with each other
* (experience) prefetching too far ahead stops helping
* (experience) best suited to sequential reads
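
A minimal C sketch of this placement (assumed: `p` points into a long, contiguous float buffer that is read sequentially; illustrative, not taken from the ncnn source):

```c
#include <arm_neon.h>

// prefetch the block 256 bytes ahead into L1, then do the actual 128-bit load
static inline float32x4_t load_with_prefetch(const float* p)
{
    float32x4_t _v;
    asm volatile(
        "prfm   pldl1keep, [%1, #256]   \n" // pull p+256 into L1 a few instructions early
        "ld1    {%0.4s}, [%1]           \n" // the load this prefetch serves
        : "=w"(_v)
        : "r"(p)
        : "memory");
    return _v;
}
```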

```
prfm pldl2strm, [x0, #256]
```
* place it N instructions before the ld1 [x0] it serves, with N as large as possible
* #256 means the contents at x0+256 are pulled into the L2 cache
* also applies to ldp
* (experience) omitting the offset is worse than writing at least #128
* (experience) pldl2strm works slightly better than pldl2keep
* (experience) the contents of x0 ~ x0+256 are pulled in as well
* (experience) use #128 when loading 128 bits, #256 for 256 bits
* (experience) when reading a lot of data, issue pldl2strm twice in a row with different offsets
* (experience) do not pldl1keep the same location afterwards; that makes it slower
* (experience) suited to preparing, well in advance, a read that jumps far away, such as switching to another channel
57
3rdparty/ncnn/docs/developer-guide/tensorflow-op-combination.md
vendored
Normal file
57
3rdparty/ncnn/docs/developer-guide/tensorflow-op-combination.md
vendored
Normal file
@ -0,0 +1,57 @@
## batchnorm
```
Input A 0 1 A 0 0 0
MemoryData sub/y 0 1 sub/y 16 0 0
BinaryOp sub 2 1 A sub/y sub 1
MemoryData div/y 0 1 div/y 16 0 0
BinaryOp div 2 1 sub div/y div 3
MemoryData mul/y 0 1 mul/y 16 0 0
BinaryOp mul 2 1 div mul/y mul 2
MemoryData BiasAdd/bias 0 1 BiasAdd/bias 16 0 0
BinaryOp BiasAdd 2 1 mul BiasAdd/bias BiasAdd 0
```
## convolution
```
Input A 0 1 A 0 0 0
Convolution Conv2D 1 1 A Conv2D 10 3 1 1 0 0 270
MemoryData biases/read 0 1 biases/read 10 0 0
BinaryOp BiasAdd 2 1 Conv2D biases/read BiasAdd 0
```
## innerproduct
```
Input A 0 1 A 0 0 0
MemoryData biases/read 0 1 biases/read 10 0 0
InnerProduct MatMul 1 1 A MatMul 10 0 2560
BinaryOp conv6 2 1 MatMul biases/read conv6 0
```
## leakyrelu
```
Input A 0 1 A 0 0 0
Split splitncnn_0 1 2 A A_splitncnn_0 A_splitncnn_1
MemoryData mul_1/x 0 1 mul_1/x 0 0 0
BinaryOp mul_1 2 1 mul_1/x A_splitncnn_1 mul_1 2
BinaryOp leaky 2 1 mul_1 A_splitncnn_0 leaky 4
```
## prelu
```
Input A 0 1 A 0 0 0
Split splitncnn_0 1 2 A A_splitncnn_0 A_splitncnn_1
MemoryData prelu/alpha 0 1 prelu/alpha 10 0 0
ReLU prelu/Relu 1 1 A_splitncnn_1 prelu/Relu 0.000000
UnaryOp prelu/Neg 1 1 A_splitncnn_0 prelu/Neg 1
ReLU prelu/Relu_1 1 1 prelu/Neg prelu/Relu_1 0.000000
UnaryOp prelu/Neg_1 1 1 prelu/Relu_1 prelu/Neg_1 1
BinaryOp prelu/Mul 2 1 prelu/alpha prelu/Neg_1 prelu/Mul 2
BinaryOp prelu/add 2 1 prelu/Relu prelu/Mul prelu/add 0
```
## softmax
```
Input A 0 1 A 0 0 0
Split splitncnn_4 1 2 A A_splitncnn_0 A_splitncnn_1
Reduction Max 1 1 A_splitncnn_1 Max 4 -2 1.000000
BinaryOp sub 2 1 A_splitncnn_0 Max sub 1
UnaryOp Exp 1 1 sub Exp 7
Split splitncnn_5 1 2 Exp Exp_splitncnn_0 Exp_splitncnn_1
Reduction Sum 1 1 Exp_splitncnn_1 Sum 0 -2 1.000000
BinaryOp prob 2 1 Exp_splitncnn_0 Sum prob 3
```
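
The softmax graph above is the usual max-subtracted softmax expressed with Reduction, BinaryOp and UnaryOp layers. A scalar C sketch of what those ops compute (illustrative only, not ncnn code):

```c
#include <math.h>
#include <stdio.h>

// scalar equivalent of the Reduction(Max) / sub / Exp / Reduction(Sum) / div graph above
static void softmax(const float* x, float* y, int n)
{
    float max = x[0];
    for (int i = 1; i < n; i++)
        if (x[i] > max) max = x[i];   // Reduction Max

    float sum = 0.f;
    for (int i = 0; i < n; i++)
    {
        y[i] = expf(x[i] - max);      // BinaryOp sub + UnaryOp Exp
        sum += y[i];                  // Reduction Sum
    }
    for (int i = 0; i < n; i++)
        y[i] /= sum;                  // BinaryOp div
}

int main(void)
{
    const float x[4] = {1.f, 2.f, 3.f, 4.f};
    float y[4];
    softmax(x, y, 4);
    printf("%f %f %f %f\n", y[0], y[1], y[2], y[3]);
    return 0;
}
```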