feat: switch backend to PaddleOCR-NCNN, switch project to CMake

1. The project backend has been fully migrated to the PaddleOCR-NCNN algorithm and has passed basic compatibility tests
2. The project is now organized with CMake; to better accommodate third-party libraries, a QMake project will no longer be provided
3. Reorganized the copyright/notice files and the code tree to minimize the risk of infringement

Log: switch backend to PaddleOCR-NCNN, switch project to CMake
Change-Id: I4d5d2c5d37505a4a24b389b1a4c5d12f17bfa38c
wangzhengyang
2022-05-10 09:54:44 +08:00
parent ecdd171c6f
commit 718c41634f
10018 changed files with 3593797 additions and 186748 deletions


@ -0,0 +1,57 @@
```c
// use the whole 128-bit v register: %.4s
// a += b * c
float32x4_t _a = vld1q_f32(a);
float32x4_t _b = vld1q_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
"fmla %0.4s, %2.4s, %3.4s"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
```c
// use the low 64 bits of the v register: %.2s
// a += b * c
float32x2_t _a = vld1_f32(a);
float32x2_t _b = vld1_f32(b);
float32x2_t _c = vld1_f32(c);
asm volatile(
"fmla %0.2s, %2.2s, %3.2s"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
```c
// use a single 32-bit lane of the v register: %.s[0] %.s[1] %.s[2] %.s[3]
// a += b * c[0]
// a += b * c[1]
// a += b * c[2]
// a += b * c[3]
float32x4_t _a = vld1q_f32(a);
float32x4_t _b = vld1q_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
"fmla %0.4s, %2.4s, %3.s[0]"
"fmla %0.4s, %2.4s, %3.s[1]"
"fmla %0.4s, %2.4s, %3.s[2]"
"fmla %0.4s, %2.4s, %3.s[3]"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
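For reference, here is a minimal compilable wrapper around the first snippet above; a sketch only, where the function name and the assumption that a, b and c each point to 4 floats are mine:
```c
#include <arm_neon.h>

// a[0..3] += b[0..3] * c[0..3], using the inline assembly form shown above
static void fmla_f32x4(float* a, const float* b, const float* c)
{
    float32x4_t _a = vld1q_f32(a);
    float32x4_t _b = vld1q_f32(b);
    float32x4_t _c = vld1q_f32(c);
    asm volatile(
        "fmla %0.4s, %2.4s, %3.4s"
        : "=w"(_a) // %0
        : "0"(_a),
          "w"(_b), // %2
          "w"(_c)  // %3
        :
    );
    vst1q_f32(a, _a); // store the accumulated result back
}
```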
qwq


@ -0,0 +1,175 @@
# Adding a custom layer to NCNN
## Example
This example adds a custom layer Relu6, i.e. std::min(6.f, std::max(0.f, val))
```
Input input 0 1 input
Convolution conv2d 1 1 input conv2d 0=32 1=1 2=1 3=1 4=0 5=0 6=768
Relu6 relu6 1 1 conv2d relu6
Pooling maxpool 1 1 relu6 maxpool 0=0 1=3 2=2 3=-233 4=0
```
## Define the header file src/layer/relu6.h
```CPP
#ifndef LAYER_RELU6_H
#define LAYER_RELU6_H
#include "layer.h"
namespace ncnn {
class Relu6 : public Layer
{
public:
Relu6();
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};
} // namespace ncnn
#endif // LAYER_RELU6_H
```
## Define the source file src/layer/relu6.cpp
```CPP
#include "relu6.h"
#include <math.h>
namespace ncnn {
Relu6::Relu6()
{
one_blob_only = true;
support_inplace = true;
}
int Relu6::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q < channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
for (int i=0; i<size; i++)
{
ptr[i] = std::min(6.f, std::max(0.f, ptr[i]));
}
}
return 0;
}
} // namespace ncnn
```
## Modify src/CMakeLists.txt to register Relu6
```cmake
ncnn_add_layer(GroupNorm)
ncnn_add_layer(LayerNorm)
ncnn_add_layer(Relu6)
```
## Define the test case file tests/test_relu6.cpp
```CPP
#include "layer/relu6.h"
#include "testutil.h"
static int test_relu6(const ncnn::Mat& a)
{
ncnn::ParamDict pd;
std::vector<ncnn::Mat> weights(0);
int ret = test_layer<ncnn::Relu6>("Relu6", pd, weights, a);
if (ret != 0)
{
fprintf(stderr, "test_relu6 failed a.dims=%d a=(%d %d %d)\n", a.dims, a.w, a.h, a.c);
}
return ret;
}
static int test_relu6_0()
{
return 0
|| test_relu6(RandomMat(5, 7, 24))
|| test_relu6(RandomMat(7, 9, 12))
|| test_relu6(RandomMat(3, 5, 13));
}
static int test_relu6_1()
{
return 0
|| test_relu6(RandomMat(15, 24))
|| test_relu6(RandomMat(17, 12))
|| test_relu6(RandomMat(19, 15));
}
static int test_relu6_2()
{
return 0
|| test_relu6(RandomMat(128))
|| test_relu6(RandomMat(124))
|| test_relu6(RandomMat(127));
}
int main()
{
SRAND(7767517);
return 0
|| test_relu6_0()
|| test_relu6_1()
|| test_relu6_2();
}
```
## Modify tests/CMakeLists.txt to register the Relu6 test case
```cmake
ncnn_add_layer_test(LSTM)
ncnn_add_layer_test(Yolov3DetectionOutput)
ncnn_add_layer_test(Relu6)
```
## Build
Build following the original NCNN build steps.
## Unit test
```
./test_relu6
```
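Once registered through ncnn_add_layer, the layer can also be exercised directly without building a full network; a minimal smoke-test sketch (the function name and input values are illustrative):
```CPP
#include "layer.h"
#include "mat.h"

// create the registered Relu6 layer by name and run it in-place on a Mat
void relu6_smoke_test()
{
    ncnn::Layer* relu6 = ncnn::create_layer("Relu6");

    ncnn::Option opt;
    relu6->create_pipeline(opt);

    ncnn::Mat m(8);
    m.fill(9.f); // values above 6 should be clamped down to 6

    relu6->forward_inplace(m, opt);
    // every element of m is now 6.f

    relu6->destroy_pipeline(opt);
    delete relu6;
}
```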


@ -0,0 +1,85 @@
## natural assembly
* no register dependency, no penalty
```
ld1 {v0.4s}, [x0], #16
fmla v10.4s, v16.4s, v24.s[0]
fmla v11.4s, v16.4s, v24.s[1]
fmla v12.4s, v16.4s, v24.s[2]
fmla v13.4s, v16.4s, v24.s[3]
```
## A53
* 128bit vector load cannot be dual issued with fmla, wait 2 cycles
* 64bit vector load cannot be dual issued with fmla, wait 1 cycle
* 64bit integer load can be dual issued with fmla, no penalty
* pointer update can be dual issued with fmla, no penalty
* 64bit vector load and 64bit vector insert can be dual issued, no penalty
* any vector load cannot be issued on the 4th cycle of each fmla (enters the accumulator pipeline)
### practical guide
* use 64bit vector load only
* issue vector load every three fmla
* 1 cycle to load 64bit, dual issue with the previous interleaved 64bit insert
* load the remaining 64bit into integer register, dual issue with fmla
* update pointer, dual issue with fmla
* insert 64bit into vector from integer register, dual issue with the next interleaved 64bit load
* add nop every three fmla if no load, seems to be faster
```
ldr d0, [x0] // 1 cycle, v0 first 64bit
fmla
ldr x23, [x0, #8] // 0 cycle, v0 second 64bit to temp register
fmla
add x0, x0, #16 // 0 cycle, update pointer
fmla
ldr d1, [x0] // 1 cycle, v1 first 64bit
ins v0.d[1], x23 // 0 cycle, v0 second 64bit complete
fmla
ldr x23, [x0, #8] // 0 cycle, v1 second 64bit to temp register
fmla
add x0, x0, #16 // 0 cycle, update pointer
fmla
ins v1.d[1], x23 // 1 cycle, v1 second 64bit complete
nop
fmla
fmla
fmla
nop
nop
fmla
fmla
fmla
```
## A55
* 128bit vector load cannot be dual issued with fmla, wait 2 cycles
* 64bit vector load can be dual issued with fmla, no penalty
* 64bit integer load can be dual issued with fmla, no penalty
* pointer update can be dual issued with fmla, no penalty
* 64bit vector insert can be dual issued with fmla, no penalty
### practical guide
* use 64bit vector load only
* load 64bit, dual issue with fmla
* load the remaining 64bit into integer register, dual issue with fmla
* update pointer, dual issue with fmla
* insert 64bit into vector from integer register, dual issue with fmla
* interleaved loads loosen register dependency
* nop trick is not needed
```
ldr d0, [x0] // 0 cycle, v0 first 64bit
fmla
ldr x23, [x0, #8] // 0 cycle, v0 second 64bit to temp register
fmla
add x0, x0, #16 // 0 cycle, update pointer
fmla
ldr d1, [x0] // 0 cycle, v1 first 64bit
fmla
ins v0.d[1], x23 // 0 cycle, v0 second 64bit complete
fmla
ldr x23, [x0, #8] // 0 cycle, v1 second 64bit to temp register
fmla
add x0, x0, #16 // 0 cycle, update pointer
fmla
ins v1.d[1], x23 // 0 cycle, v1 second 64bit complete
fmla
```


@ -0,0 +1,130 @@
```c
// use the whole d register: %P
// a += b * c
float32x2_t _a = vld1_f32(a);
float32x2_t _b = vld1_f32(b);
float32x2_t _c = vld1_f32(c);
asm volatile(
"vmla.f32 %P0, %P2, %P3"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
```c
// use the whole q register: %q
// a += b * c
float32x4_t _a = vld1q_f32(a);
float32x4_t _b = vld1q_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
"vmla.f32 %q0, %q2, %q3"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
```c
// use a single 32-bit lane of the d register: %P[0] %P[1]
// a += b * c[0]
// a += b * c[1]
float32x2_t _a = vld1_f32(a);
float32x2_t _b = vld1_f32(b);
float32x2_t _c = vld1_f32(c);
asm volatile(
"vmla.f32 %P0, %P2, %P3[0]"
"vmla.f32 %P0, %P2, %P3[1]"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
```c
// use a single 32-bit lane of the q register: %e[0] %e[1] %f[0] %f[1]
// a += b * c[0]
// a += b * c[1]
// a += b * c[2]
// a += b * c[3]
float32x4_t _a = vld1q_f32(a);
float32x4_t _b = vld1q_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
"vmla.f32 %q0, %q2, %e3[0]"
"vmla.f32 %q0, %q2, %e3[1]"
"vmla.f32 %q0, %q2, %f3[0]"
"vmla.f32 %q0, %q2, %f3[1]"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
```c
// use %e %f to split the q register into two d registers
// a += b * c[0]c[1]
// a += b * c[2]c[3]
float32x2_t _a = vld1_f32(a);
float32x2_t _b = vld1_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
"vmla.f32 %P0, %P2, %e3"
"vmla.f32 %P0, %P2, %f3"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
```c
// bind variables to specific d registers
// vmla.f32 d0, d2, d4
register float32x2_t _a asm("d0") = vld1_f32(a);
register float32x2_t _b asm("d2") = vld1_f32(b);
register float32x2_t _c asm("d4") = vld1_f32(c);
asm volatile(
"vmla.f32 %P0, %P2, %P3"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
```c
// bind variables to specific q registers
// vmla.f32 q0, q1, q2
register float32x4_t _a asm("q0") = vld1q_f32(a);
register float32x4_t _b asm("q1") = vld1q_f32(b);
register float32x4_t _c asm("q2") = vld1q_f32(c);
asm volatile(
"vmla.f32 %q0, %q2, %q3"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
If it were not for this compiler bug, register binding would not be needed at all. However...
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=41538
qwq


@ -0,0 +1,52 @@
### broadcasting rule
ncnn BinaryOp accepts blobs with different shapes
C = BinaryOp(A, B)
shape notation convention is [w], [w,h], [w,h,c], [w,h,d,c]
|type|A|B|C|
|---|---|---|---|
|1|[1]|scalar|[1]|
|2|[1]|[2]|[2]|
|3|[1]|[2,3]|[2,3]|
|4|[1]|[2,3,4]|[2,3,4]|
|5|[2]|scalar|[2]|
|6|[2]|[1]|[2]|
|7|[2]|[2]|[2]|
|8|[3]|[2,3]|[2,3]|
|9|[4]|[2,3,4]|[2,3,4]|
|10|[2,3]|scalar|[2,3]|
|11|[2,3]|[1]|[2,3]|
|12|[2,3]|[3]|[2,3]|
|13|[2,3]|[2,3]|[2,3]|
|14|[3,4]|[2,3,4]|[2,3,4]|
|15|[2,3,4]|scalar|[2,3,4]|
|16|[2,3,4]|[1]|[2,3,4]|
|17|[2,3,4]|[4]|[2,3,4]|
|18|[2,3,4]|[3,4]|[2,3,4]|
|19|[2,3,4]|[2,3,4]|[2,3,4]|
|20|[1]|[2,3,4,5]|[2,3,4,5]|
|21|[5]|[2,3,4,5]|[2,3,4,5]|
|22|[4,5]|[2,3,4,5]|[2,3,4,5]|
|23|[3,4,5]|[2,3,4,5]|[2,3,4,5]|
|24|[2,3,4,5]|scalar|[2,3,4,5]|
|25|[2,3,4,5]|[1]|[2,3,4,5]|
|26|[2,3,4,5]|[5]|[2,3,4,5]|
|27|[2,3,4,5]|[4,5]|[2,3,4,5]|
|28|[2,3,4,5]|[3,4,5]|[2,3,4,5]|
|29|[2,3,4,5]|[2,3,4,5]|[2,3,4,5]|
some special broadcasting rules exist for model compatibility
|special type|A|B|C|
|---|---|---|---|
|1|[2,3,4]|[1,1,4]|[2,3,4]|
|2|[2,3,4]|[2,3,1]|[2,3,4]|
|3|[1,1,4]|[2,3,4]|[2,3,4]|
|4|[2,3,1]|[2,3,4]|[2,3,4]|
|5|[2,3,4]|[1,3,4]|[2,3,4]|
|6|[2,3,4]|[2,1,4]|[2,3,4]|
|7|[1,3,4]|[2,3,4]|[2,3,4]|
|8|[2,1,4]|[2,3,4]|[2,3,4]|
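As a concrete sketch of rule 17 above ([2,3,4] + [4]), broadcasting can be exercised through the BinaryOp layer directly; the function name and fill values are illustrative, and the option flags mirror the BinaryOp usage shown elsewhere in these docs:
```cpp
#include "layer.h"
#include "mat.h"
#include <vector>

// C = A + B with A of shape [2,3,4] ([w,h,c]) and B of shape [4]
void broadcast_add_example()
{
    ncnn::Mat A(2, 3, 4);
    ncnn::Mat B(4);
    A.fill(1.f);
    B.fill(2.f);

    ncnn::Option opt;
    opt.use_packing_layout = false; // plain fp32, no packing

    ncnn::Layer* op = ncnn::create_layer("BinaryOp");
    ncnn::ParamDict pd;
    pd.set(0, 0); // op_type 0 = ADD
    op->load_param(pd);
    op->create_pipeline(opt);

    std::vector<ncnn::Mat> bottoms(2);
    bottoms[0] = A;
    bottoms[1] = B;
    std::vector<ncnn::Mat> tops(1);
    op->forward(bottoms, tops, opt); // tops[0] has shape [2,3,4], every value is 3.f

    op->destroy_pipeline(opt);
    delete op;
}
```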


@ -0,0 +1,63 @@
Mat structure is now allocator-aware via an extra allocator parameter with default zero value.
The good-old ncnn::fastMalloc()/ncnn::fastFree() will be used for a null allocator.
You could pass a custom allocator to delegate all memory allocation and deallocation.
```cpp
class Allocator
{
public:
virtual void* fastMalloc(size_t size) = 0;
virtual void fastFree(void* ptr) = 0;
};
```
ncnn already provides two simple pooled Allocator classes, one with a mutex lock and one without.
```cpp
ncnn::PoolAllocator locked_mempool;
ncnn::UnlockedPoolAllocator unlocked_mempool;
```
the two allocator types in ncnn
* blob allocator
  * used to allocate memory for all named blobs, which you could retrieve by Extractor::extract()
* workspace allocator
  * used to allocate memory for internal temporary use in layer implementation, such as the temp blob after padding in convolution
by default, all Extractor instances use the two allocators from the default option
You can alter them by ncnn::set_default_option()
or you can set them per Extractor by Extractor::set_blob_allocator()/Extractor::set_workspace_allocator()
blob allocator is guaranteed to be called in-order in layer implementation during each Extractor lifecycle, while workspace allocator may be called synchronously
the practical usage
* one network, one-by-one inference
  * shared unlocked blob allocator for all Extractor
  * shared locked workspace allocator for all Extractor
* one network, concurrent inference
  * shared unlocked blob allocator for all Extractor in each thread
  * shared locked workspace allocator for all Extractor among all threads
* concurrent multiple networks, one-by-one inference for each network
  * shared unlocked blob allocator for all Extractor of each network
  * shared locked workspace allocator for all Extractor among all networks (for saving memory)
* concurrent multiple networks, concurrent inference for each network
  * shared unlocked blob allocator for all Extractor of each network in each thread
  * shared locked workspace allocator for all Extractor among all networks (for saving memory)
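A minimal wiring sketch for the per-Extractor case; `net` is assumed to be already loaded, `in` to be a prepared input, and "data"/"prob" are placeholder blob names:
```cpp
#include "net.h"

// one unlocked blob allocator + one locked workspace allocator, set per Extractor
void run_with_pooled_allocators(ncnn::Net& net, const ncnn::Mat& in)
{
    ncnn::UnlockedPoolAllocator blob_pool; // blob allocations are in-order, no lock needed
    ncnn::PoolAllocator workspace_pool;    // workspace allocator keeps its mutex

    ncnn::Extractor ex = net.create_extractor();
    ex.set_blob_allocator(&blob_pool);
    ex.set_workspace_allocator(&workspace_pool);

    ex.input("data", in);
    ncnn::Mat out;
    ex.extract("prob", out);
}
```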


@ -0,0 +1,119 @@
### what is packing and why
packing is the form of storing multiple short-sized values as one long-sized value.
element packing maps well onto the underlying simd register, which usually uses one very wide register to store several values of one element type.
|C|elemsize|elempack|
|---|---|---|
|double|8|1|
|float|4|1|
|int|4|1|
|short|2|1|
|signed char|1|1|
|arm neon|elemsize|elempack|
|---|---|---|
|float64x2_t|16|2|
|float32x4_t|16|4|
|int32x4_t|16|4|
|float16x4_t|8|4|
|int8x8_t|8|8|
Though the real count of values doubles when elempack is two, the wide value is still treated as a single value from the Mat structure's point of view. For example, to store 40 float values in a Mat object, the Mat width is 40 with elempack 1, but only 10 with elempack 4.
|dims|w|h|c|cstep|elemsize|elempack|
|---|---|---|---|---|---|---|
|1|40|1|1|40|4|1|
|1|10|1|1|10|16|4|
### packing style convention
In practice, elempack 1, 4, 8 are the most common cases. It is possible to use any other packing style in theory.
The following table shows the packing axis used in ncnn for each dimension.
|dims|packing axis|shape before packing|shape after packing|
|---|---|---|---|
|1|w|w|w/elempack|
|2|h|w, h|w, h/elempack|
|3|c|w, h, c|w, h, c/elempack|
If the packing axis dim is not evenly divisible by elempack, zero padding may be used.
```
outw = (w + elempack - 1) / elempack;
```
The following snippet shows the memory layout after elempack=4 packing of a 3-dim Mat
```
// w=2 h=3 c=4 elempack=1
0 1
2 3
4 5
6 7
8 9
10 11
12 13
14 15
16 17
18 19
20 21
22 23
// w=2 h=3 c=1 elempack=4
(0,6,12,18) (1,7,13,19)
(2,8,14,20) (3,9,15,21)
(4,10,16,22) (5,11,17,23)
```
### how to convert elempack
There is a convenient wrapper function provided
```
// convert to elempack 4 if packing axis dim is evenly divisible by elempack
// return the identity Mat otherwise
ncnn::Mat a;
ncnn::Mat a_packed;
ncnn::convert_packing(a, a_packed, 4);
if (a_packed.elempack == 4)
{
// check if packing is successful
}
// convert to packing 1, aka unpacking, shall be always successful
ncnn::Mat b;
ncnn::Mat b_unpacked;
ncnn::convert_packing(b, b_unpacked, 1);
```
### handle general interleaved data
Here is an example of using convert packing to convert RGB interleaved data to planar
**NOTE:** The following code is presented only to explain what packing is and how the conversion works. Do not use it in production due to its poor performance; use ncnn::Mat::from_pixels() instead, as sketched after this example.
```cpp
// rgb_interleaved_u8 is RGB RGB RGB ...
// rgb_interleaved_u8.w = w;
// rgb_interleaved_u8.h = h;
// rgb_interleaved_u8.c = 1;
// rgb_interleaved_u8.elemsize = 3;
// rgb_interleaved_u8.elempack = 3;
ncnn::Mat rgb_interleaved_u8(w, h, 1, 3, 3);
ncnn::Mat rgb_planar_u8;
ncnn::convert_packing(rgb_interleaved_u8, rgb_planar_u8, 1);
// rgb_planar_u8 is now RRR ... GGG ... BBB ...
// rgb_planar_u8.w = w;
// rgb_planar_u8.h = h;
// rgb_planar_u8.c = 3;
// rgb_planar_u8.elemsize = 1;
// rgb_planar_u8.elempack = 1;
```
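The production path is just one call; a minimal sketch, assuming `pixels` points to w*h*3 packed RGB bytes:
```cpp
// Mat::from_pixels does the interleaved-to-planar conversion internally
// and converts the values to float
ncnn::Mat rgb_to_planar(const unsigned char* pixels, int w, int h)
{
    // result: c == 3, each channel holds one color plane
    return ncnn::Mat::from_pixels(pixels, ncnn::Mat::PIXEL_RGB, w, h);
}
```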


@ -0,0 +1,75 @@
### How to submit code
#### 1. Fork a branch
Open [ncnn](https://github.com/tencent/ncnn) in a browser and `fork` it into your own repositories, for example
```
https://github.com/user/ncnn
```
Clone the project locally, add the official remote and fetch:
```
$ git clone https://github.com/user/ncnn && cd ncnn
$ git remote add tencent https://github.com/tencent/ncnn
$ git fetch tencent
```
The cloned project now has two remotes, origin and tencent:
```
$ git remote -v
origin https://github.com/user/ncnn (fetch)
origin https://github.com/user/ncnn (push)
tencent https://github.com/Tencent/ncnn (fetch)
tencent https://github.com/Tencent/ncnn (push)
```
origin points to your forked repository, while the tencent remote is the official repo. You can create and push branches based on either remote.
For example, check out the official master branch and create your own branch from it (keep the name short and clear; one branch should do only one thing, which makes review and revert easier):
```
$ git checkout tencent/master
$ git checkout -b add-conv-int8
```
Or specify the official master branch as the base when creating the branch:
```
$ git checkout -b fix-typo-in-document tencent/master
```
> `git fetch` pulls the latest code from the remote to your local repo. For your second PR to ncnn, simply start from `git fetch tencent`; there is no need to run `git remote add tencent` again, nor to touch `github.com/user/ncnn`.
#### 2. Coding conventions
To make communication more efficient, reviewers usually ask contributors to follow these rules:
* put a line break between `if-else` and the opening brace `{`
* do not add or remove blank lines arbitrarily
* replace tabs with 4 spaces
* for platform compatibility, `c++11` is not used for now, and `template` should be avoided as much as possible under `src`
* when adding a new feature or platform, add corresponding test cases under the `test` directory
* put documentation into the matching `doc` directory, with the `.zh.md` suffix for Chinese and the plain `.md` suffix for English
After development, commit and push to your own repository:
```
$ git commit -a
$ git push origin add-conv-int8
```
Tools such as [`commitizen`](https://pypi.org/project/commitizen/) or [`gitlint`](https://jorisroovers.com/gitlint/) are recommended for formatting commit messages, which makes it easier to search a large commit history later
#### 3. Submitting the pull request
Open [ncnn pulls](https://github.com/Tencent/ncnn/pulls) in a browser; a PR hint for your branch should appear, then click `Compare & pull request`
* the title **must** be in English; an unfinished branch should start with `WIP:`, e.g. `WIP: add conv int8`
* the description should contain the following, in Chinese or English:
  * an overview of the change and how it is implemented
  * functional or performance tests
  * test results
CI has automatic formatting integrated: restyled-io generates a `Restyled add conv int8` branch alongside the PR, and you need to merge that automatically restyled branch, for example:
```
$ git fetch tencent
$ git checkout add-conv-int8
$ git merge tencent/restyled/pull-2078
$ git push origin add-conv-int8
```
Go back to the browser and sign the CLA. After all CI tests pass, ask a reviewer to merge the branch.
#### 4. Easter egg
Leaving your personal QQ number may trigger a hidden event.


@ -0,0 +1,323 @@
# step1 create a new empty class
```cpp
// mylayer.h
#include "layer.h"
using namespace ncnn;
// a new layer type called MyLayer
class MyLayer : public Layer
{
};
// mylayer.cpp
#include "mylayer.h"
DEFINE_LAYER_CREATOR(MyLayer)
```
# step2 declare layer parameters and weights
```cpp
// mylayer.h
#include "layer.h"
using namespace ncnn;
class MyLayer : public Layer
{
private:
int channels;// new code
float gamma;// new code
Mat weight;// new code
};
// mylayer.cpp
#include "mylayer.h"
DEFINE_LAYER_CREATOR(MyLayer)
```
# step3 implement load functions for parameters and weights
```cpp
// mylayer.h
#include "layer.h"
using namespace ncnn;
class MyLayer : public Layer
{
public:
virtual int load_param(const ParamDict& pd);// new code
virtual int load_model(const ModelBin& mb);// new code
private:
int channels;
float eps;
Mat gamma_data;
};
// mylayer.cpp
#include "mylayer.h"
DEFINE_LAYER_CREATOR(MyLayer)
// new routine for loading parameters
int MyLayer::load_param(const ParamDict& pd)
{
// details about the relations with param file
// https://github.com/Tencent/ncnn/wiki/param-and-model-file-structure
//
channels = pd.get(0, 0);// parse 0=<int value> entry, default value 0
eps = pd.get(1, 0.001f);// parse 1=<float value> entry, default value 0.001f
return 0;// return zero if success
}
// new routine for loading weights
int MyLayer::load_model(const ModelBin& mb)
{
// details about the relations with model file
// https://github.com/Tencent/ncnn/wiki/param-and-model-file-structure
//
// read weights with length of channels * sizeof(float)
// the second argument explains as follows
// 0 judge the value type automatically, you may get float or float16 or uint8 etc
// depends on the model storage and the supporting target hardware
// 1 read float values anyway
// 2 read float16 values anyway
// 3 read uint8 values anyway
gamma_data = mb.load(channels, 1);
if (gamma_data.empty())
return -100;// return non-zero on error, -100 indicates out-of-memory
return 0;// return zero if success
}
```
# step4 determine forward behavior
```cpp
// mylayer.h
#include "layer.h"
using namespace ncnn;
class MyLayer : public Layer
{
public:
MyLayer();// new code
virtual int load_param(const ParamDict& pd);
virtual int load_model(const ModelBin& mb);
private:
int channels;
float eps;
Mat gamma_data;
};
// mylayer.cpp
#include "mylayer.h"
DEFINE_LAYER_CREATOR(MyLayer)
// new routine for setting forward behavior
MyLayer::MyLayer()
{
// one input and one output
// typical one_blob_only type: Convolution, Pooling, ReLU, Softmax ...
// typical non-one_blob_only type: Eltwise, Split, Concat, Slice ...
one_blob_only = true;
// do not change the blob size, modify data in-place
// typical support_inplace type: ReLU, Sigmoid ...
// typical non-support_inplace type: Convolution, Pooling ...
support_inplace = true;
}
int MyLayer::load_param(const ParamDict& pd)
{
channels = pd.get(0, 0);
eps = pd.get(1, 0.001f);
// you could alter the behavior based on loaded parameter
// if (eps == 0.001f)
// {
// one_blob_only = false;
// support_inplace = false;
// }
return 0;
}
int MyLayer::load_model(const ModelBin& mb)
{
gamma_data = mb.load(channels, 1);
if (gamma_data.empty())
return -100;
// you could alter the behavior based on loaded weight
// if (gamma_data[0] == 0.f)
// {
// one_blob_only = false;
// support_inplace = false;
// }
return 0;
}
```
# step5 choose proper interface based on forward behavior
```cpp
// The base class Layer defines four interfaces for each forward behavior combination
// 1
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
// 2
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
// 3
virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
// 4
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
```
**must** = layer must implement this function
**optional** = layer may implement this function for optimal performance
sometimes the graph inference path cannot call forward_inplace directly because of data sharing; in this situation the non-inplace forward routine is used instead, and if the optional routine is not implemented, it deep-copies the input blob and calls the in-place forward on the copy. Thus, you can avoid this deep copy by implementing the optional routine and processing input to output on-the-fly.
|one_blob_only|support_inplace|1|2|3|4|
|---|---|---|---|---|---|
|false|false|must| | | |
|false|true|optional| |must| |
|true|false| |must| | |
|true|true| |optional| |must|
# step6 implement forward function
```cpp
// mylayer.h
#include "layer.h"
using namespace ncnn;
class MyLayer : public Layer
{
public:
MyLayer();
virtual int load_param(const ParamDict& pd);
virtual int load_model(const ModelBin& mb);
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;// new code, optional
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;// new code
private:
int channels;
float eps;
Mat gamma_data;
};
// mylayer.cpp
#include "mylayer.h"
DEFINE_LAYER_CREATOR(MyLayer)
MyLayer::MyLayer()
{
one_blob_only = true;
support_inplace = true;
}
int MyLayer::load_param(const ParamDict& pd)
{
channels = pd.get(0, 0);
eps = pd.get(1, 0.001f);
return 0;
}
int MyLayer::load_model(const ModelBin& mb)
{
gamma_data = mb.load(channels, 1);
if (gamma_data.empty())
return -100;
return 0;
}
// optional new routine for layer forward function, non-inplace version
int MyLayer::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
// check input dims, return non-zero on error
if (bottom_blob.c != channels)
return -1;
// x = (x + eps) * gamma_per_channel
int w = bottom_blob.w;
int h = bottom_blob.h;
size_t elemsize = bottom_blob.elemsize;
int size = w * h;
top_blob.create(w, h, channels, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;// return non-zero on error, -100 indicates out-of-memory
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);
const float gamma = gamma_data[q];
for (int i=0; i<size; i++)
{
outptr[i] = (ptr[i] + eps) * gamma ;
}
}
return 0;
}
// new routine for layer forward function
int MyLayer::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
// check input dims, return non-zero on error
if (bottom_top_blob.c != channels)
return -1;
// x = (x + eps) * gamma_per_channel
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int size = w * h;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
const float gamma = gamma_data[q];
for (int i=0; i<size; i++)
{
ptr[i] = (ptr[i] + eps) * gamma ;
}
}
return 0;
}
```
# step7 integrate with ncnn library
you will probably need to modify caffe2ncnn or mxnet2ncnn etc. to write your layer-specific parameters and weights into the ncnn param and model files
the param and model file structure is documented in [param-and-model-file-structure](param-and-model-file-structure)
```
// example param file content
Input input 0 1 input
Convolution conv2d 1 1 input conv2d 0=32 1=1 2=1 3=1 4=0 5=0 6=768
MyLayer mylayer 1 1 conv2d mylayer0
Pooling maxpool 1 1 mylayer0 maxpool 0=0 1=3 2=2 3=-233 4=0
```
```cpp
ncnn::Net net;
// register custom layer before load param and model
// the layer creator function signature is always XYZ_layer_creator, which is defined by the DEFINE_LAYER_CREATOR macro
net.register_custom_layer("MyLayer", MyLayer_layer_creator);
net.load_param("model.param");
net.load_model("model.bin");
```
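Running the network that contains the custom layer then looks like any other inference; a minimal sketch continuing the snippet above, where `in` is an assumed input Mat and the blob names follow the example param file:
```cpp
ncnn::Extractor ex = net.create_extractor();
ex.input("input", in);

ncnn::Mat out;
ex.extract("mylayer0", out); // the output blob produced by MyLayer
```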


@ -0,0 +1,38 @@
# benchmark
op
# naive C with openmp
for for for
# unroll, first try
h
# register allocation
kernels
# unroll, second try
simd
# neon intrinsics
optional
# naive neon assembly with pld
asm
# pipeline optimize, first try
more register load mla
# pipeline optimize, second try
interleave load mla
# pipeline optimize, third try
loop tail
# usual practice, load/save
233
# usual practice, unroll
233
# usual practice, save register
233


@ -0,0 +1,311 @@
# implement elementwise addition with/without broadcast using BinaryOp operation
* input must be fp32 storage without packing
* output is expected to be fp32 storage without packing
```cpp
void binary_add(const ncnn::Mat& a, const ncnn::Mat& b, ncnn::Mat& c)
{
ncnn::Option opt;
opt.num_threads = 2;
opt.use_fp16_storage = false;
opt.use_packing_layout = false;
ncnn::Layer* op = ncnn::create_layer("BinaryOp");
// set param
ncnn::ParamDict pd;
pd.set(0, 0);// op_type
op->load_param(pd);
op->create_pipeline(opt);
// forward
std::vector<ncnn::Mat> bottoms(2);
bottoms[0] = a;
bottoms[1] = b;
std::vector<ncnn::Mat> tops(1);
op->forward(bottoms, tops, opt);
c = tops[0];
op->destroy_pipeline(opt);
delete op;
}
```
# implement 3x3 box blur on three channel image using ConvolutionDepthWise operation
* input must be fp32 storage without packing
* output is expected to be fp32 storage without packing
```cpp
void convolution_3x3_boxblur_RGB(const ncnn::Mat& rgb, ncnn::Mat& out)
{
ncnn::Option opt;
opt.num_threads = 2;
opt.use_fp16_storage = false;
opt.use_packing_layout = false;
ncnn::Layer* op = ncnn::create_layer("ConvolutionDepthWise");
// set param
ncnn::ParamDict pd;
pd.set(0, 3);// num_output
pd.set(1, 3);// kernel_w
pd.set(5, 0);// bias_term
pd.set(6, 3*3*3);// weight_data_size
pd.set(7, 3);// group
op->load_param(pd);
// set weights
ncnn::Mat weights[1];
weights[0].create(3*3*3);// weight_data
for (int i=0; i<3*3*3; i++)
{
weights[0][i] = 1.f / 9;
}
op->load_model(ncnn::ModelBinFromMatArray(weights));
op->create_pipeline(opt);
// forward
op->forward(rgb, out, opt);
op->destroy_pipeline(opt);
delete op;
}
```
# transpose Mat, chw to cwh
* input must be fp32 storage with/without packing
* output is expected to be fp32 storage packed
```cpp
void transpose(const ncnn::Mat& in, ncnn::Mat& out)
{
ncnn::Option opt;
opt.num_threads = 2;
opt.use_fp16_storage = false;
opt.use_packing_layout = true;
ncnn::Layer* op = ncnn::create_layer("Permute");
// set param
ncnn::ParamDict pd;
pd.set(0, 1);// order_type
op->load_param(pd);
op->create_pipeline(opt);
ncnn::Mat in_packed = in;
{
// resolve dst_elempack
int dims = in.dims;
int elemcount = 0;
if (dims == 1) elemcount = in.elempack * in.w;
if (dims == 2) elemcount = in.elempack * in.h;
if (dims == 3) elemcount = in.elempack * in.c;
int dst_elempack = 1;
if (op->support_packing)
{
if (elemcount % 8 == 0 && (ncnn::cpu_support_x86_avx2() || ncnn::cpu_support_x86_avx()))
dst_elempack = 8;
else if (elemcount % 4 == 0)
dst_elempack = 4;
}
if (in.elempack != dst_elempack)
{
convert_packing(in, in_packed, dst_elempack, opt);
}
}
// forward
op->forward(in_packed, out, opt);
op->destroy_pipeline(opt);
delete op;
}
```
# apply instance normalization
`x = (x - mean) / sqrt(var)`
* input can be fp32/fp16 storage with/without packing
* output is expected to be fp16 storage packed when supported, or fp32 storage packed otherwise
```cpp
void normalize(const ncnn::Mat& in, ncnn::Mat& out)
{
ncnn::Option opt;
opt.num_threads = 2;
opt.use_fp16_storage = true;
opt.use_packing_layout = true;
ncnn::Layer* op = ncnn::create_layer("InstanceNorm");
// set param
ncnn::ParamDict pd;
pd.set(0, in.c);// channels
pd.set(1, 0.f);// eps
op->load_param(pd);
// set weights
ncnn::Mat weights[2];
weights[0].create(in.c);// gamma_data
weights[1].create(in.c);// beta_data
weights[0].fill(1.f);
weights[1].fill(0.f);
op->load_model(ncnn::ModelBinFromMatArray(weights));
op->create_pipeline(opt);
ncnn::Mat in_fp16 = in;
if (in.elembits() == 32 && op->support_fp16_storage)
{
cast_float32_to_float16(in, in_fp16, opt);
}
if (in.elembits() == 16 && !op->support_fp16_storage)
{
cast_float16_to_float32(in, in_fp16, opt);
}
ncnn::Mat in_fp16_packed = in_fp16;
{
// resolve dst_elempack
int dims = in_fp16.dims;
int elemcount = 0;
if (dims == 1) elemcount = in_fp16.elempack * in_fp16.w;
if (dims == 2) elemcount = in_fp16.elempack * in_fp16.h;
if (dims == 3) elemcount = in_fp16.elempack * in_fp16.c;
int dst_elempack = 1;
if (op->support_packing)
{
if (elemcount % 8 == 0 && (ncnn::cpu_support_x86_avx2() || ncnn::cpu_support_x86_avx()))
dst_elempack = 8;
else if (elemcount % 4 == 0)
dst_elempack = 4;
}
if (in_fp16.elempack != dst_elempack)
{
convert_packing(in_fp16, in_fp16_packed, dst_elempack, opt);
}
}
// forward
op->forward(in_fp16_packed, out, opt);
op->destroy_pipeline(opt);
delete op;
}
```
# cpu -> gpu -> forward -> gpu -> cpu
```cpp
ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();
ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();
ncnn::VkWeightAllocator* weight_vkallocator = new ncnn::VkWeightAllocator(vkdev);
ncnn::VkWeightStagingAllocator* weight_staging_vkallocator = new ncnn::VkWeightStagingAllocator(vkdev);
// create layer
ncnn::Layer* convolution = ncnn::create_layer("Convolution");
convolution->vkdev = vkdev;
// set option
ncnn::Option opt;
opt.num_threads = 4;
opt.use_vulkan_compute = true;
opt.blob_vkallocator = blob_vkallocator;
opt.workspace_vkallocator = blob_vkallocator;
opt.staging_vkallocator = staging_vkallocator;
// load param
{
ncnn::ParamDict pd;
pd.set(0, outch);
pd.set(1, ksize);
pd.set(6, outch*inch*ksize*ksize);
pd.use_vulkan_compute = 1;
convolution->load_param(pd);
}
// load model
{
ncnn::Mat weights[2];
weights[0] = random_mat(outch*inch*ksize*ksize);
weights[1] = random_mat(outch);
ncnn::ModelBinFromMatArray mb(weights);
convolution->load_model(mb);
}
// create pipeline
convolution->create_pipeline(opt);
// upload model
{
ncnn::VkTransfer cmd(vkdev);
ncnn::Option opt_upload = opt;
opt_upload.blob_vkallocator = weight_vkallocator;
opt_upload.workspace_vkallocator = weight_vkallocator;
opt_upload.staging_vkallocator = weight_staging_vkallocator;
convolution->upload_model(cmd, opt_upload);
cmd.submit_and_wait();
}
ncnn::Mat bottom = random_mat(w, h, inch);
ncnn::Mat top;
// forward
{
ncnn::VkCompute cmd(vkdev);
ncnn::VkMat bottom_gpu;
cmd.record_upload(bottom, bottom_gpu, opt);
ncnn::VkMat top_gpu;
convolution->forward(bottom_gpu, top_gpu, cmd, opt);
cmd.record_download(top_gpu, top, opt);
cmd.submit_and_wait();
}
convolution->destroy_pipeline(opt);
delete convolution;
vkdev->reclaim_blob_allocator(blob_vkallocator);
vkdev->reclaim_staging_allocator(staging_vkallocator);
weight_vkallocator->clear();
weight_staging_vkallocator->clear();
delete weight_vkallocator;
delete weight_staging_vkallocator;
```


@ -0,0 +1,46 @@
### blob memory is implicitly shared
ncnn blobs initially used opencv cv::Mat directly; since a blob needs at most three dimensions, a similar Mat class was implemented instead
The data of a Mat is 16-byte aligned per channel and has an atomic reference count; a=b does not copy data and is extremely fast
A Mat can directly reference an external memory block without copying, which speeds up model loading and input/output
For example, the split layer duplicates one blob into n blobs; in ncnn this is implemented as a simple reference count increase, with no data copy at all
### compute only part of the graph and keep intermediate results
When resolving branch dependencies, ncnn Net works top-down and depth-first, so when the network has multiple branches, computation only happens in the branch whose result is requested, saving time
When several branches overlap, computing one branch automatically keeps the intermediate results needed by the other branches (implicitly shared), so they can be reused when the remaining branches are computed
For example, for a network A -> B -> C1 + C2, when C1 is requested from ncnn the computation is A -> B -> C1, and the reference count of B's result is increased by 1 so it is kept automatically; if the C2 result is needed later, computing only C2 is enough
### enable light mode to save memory
Every layer produces blobs; apart from the final result and the intermediate results of multiple branches, most blobs are not worth keeping. Light mode reclaims them automatically after computation, saving memory
For example, for a network A -> B -> C, when C is requested in light mode, A's result is reclaimed automatically while B is computed, and B's result is reclaimed while C is computed; only the C result is kept in the end, and asking for C again returns it directly. This satisfies the usage pattern of the vast majority of deep networks
### the network and the computation are separated
The ncnn Net is the network model, while the actual computation uses an Extractor; the same Net can have many computation instances that do not affect each other, with intermediate results kept inside each Extractor. In multi-threaded use, the network structure and parameter data are shared, so the model and parameters only need to be initialized once
For example, a global static Net instance, initialized once, can keep producing Extractors for use
### openmp is fast but not always appropriate
Almost all computation in ncnn can be accelerated with openmp multi-threading, and the performance is great
However, the system sometimes slows down suddenly, e.g. the phone overheats and throttles or the UI is busy, so ncnn latency occasionally jitters upward. When latency stability matters, consider disabling openmp or setting the Extractor thread count
For example, when ncnn is used for real-time face localization while taking a selfie, a sudden latency spike feels like dropped frames, whereas a stable frame rate gives a better experience
### NCNN_STDIO/NCNN_STRING: disabling model file loading
ncnn supports loading models both from files and from memory. NCNN_STDIO controls whether loading model files is supported; setting it to 0 disables that code and shrinks the library. Setting NCNN_STRING to 0 removes most visible strings and the text parsing code
When loading a model from memory, the parameter data is referenced directly, which is faster; this is the usual approach on mobile phones
### trimming the built-in layer implementations of ncnn
Passing -DWITH_LAYER_xxx=OFF to cmake skips compiling the corresponding built-in layer entirely, which further reduces the library size
### about ARM big.LITTLE scheduling
Calling set_cpu_powersave can pin the ncnn computation threads to specific cpu cores: big cores are fast but draw more power, little cores are slower but power-efficient, and using both together heats the phone up quickly
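A minimal sketch of the knobs mentioned above, assuming a recent ncnn where Net exposes the opt member; the model file names are placeholders:
```cpp
#include "net.h"
#include "cpu.h"

ncnn::Net net;
net.opt.lightmode = true;      // reclaim intermediate blobs automatically
net.opt.num_threads = 2;       // cap openmp threads for more stable latency
net.load_param("model.param"); // placeholder paths
net.load_model("model.bin");

ncnn::set_cpu_powersave(2);    // 0 = all cores, 1 = little cores only, 2 = big cores only

ncnn::Extractor ex = net.create_extractor();
ex.set_num_threads(2);         // per-extractor thread count
```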


@ -0,0 +1,194 @@
## current model load api
### Cons
#### long and awful code
#### two functions
#### must deal with float32 float16 quantized-u8
#### must deal with alignment size
```cpp
#if NCNN_STDIO
int Convolution::load_model(FILE* binfp)
{
int nread;
union
{
struct
{
unsigned char f0;
unsigned char f1;
unsigned char f2;
unsigned char f3;
};
unsigned int tag;
} flag_struct;
nread = fread(&flag_struct, sizeof(flag_struct), 1, binfp);
if (nread != 1)
{
fprintf(stderr, "Convolution read flag_struct failed %d\n", nread);
return -1;
}
unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
weight_data.create(weight_data_size);
if (weight_data.empty())
return -100;
if (flag_struct.tag == 0x01306B47)
{
// half-precision weight data
int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned short), 4);
std::vector<unsigned short> float16_weights;
float16_weights.resize(align_weight_data_size);
nread = fread(float16_weights.data(), align_weight_data_size, 1, binfp);
if (nread != 1)
{
fprintf(stderr, "Convolution read float16_weights failed %d\n", nread);
return -1;
}
weight_data = Mat::from_float16(float16_weights.data(), weight_data_size);
if (weight_data.empty())
return -100;
}
else if (flag != 0)
{
// quantized weight data
float quantization_value[256];
nread = fread(quantization_value, 256 * sizeof(float), 1, binfp);
if (nread != 1)
{
fprintf(stderr, "Convolution read quantization_value failed %d\n", nread);
return -1;
}
int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned char), 4);
std::vector<unsigned char> index_array;
index_array.resize(align_weight_data_size);
nread = fread(index_array.data(), align_weight_data_size, 1, binfp);
if (nread != 1)
{
fprintf(stderr, "Convolution read index_array failed %d\n", nread);
return -1;
}
float* weight_data_ptr = weight_data;
for (int i = 0; i < weight_data_size; i++)
{
weight_data_ptr[i] = quantization_value[ index_array[i] ];
}
}
else if (flag_struct.f0 == 0)
{
// raw weight data
nread = fread(weight_data, weight_data_size * sizeof(float), 1, binfp);
if (nread != 1)
{
fprintf(stderr, "Convolution read weight_data failed %d\n", nread);
return -1;
}
}
if (bias_term)
{
bias_data.create(num_output);
if (bias_data.empty())
return -100;
nread = fread(bias_data, num_output * sizeof(float), 1, binfp);
if (nread != 1)
{
fprintf(stderr, "Convolution read bias_data failed %d\n", nread);
return -1;
}
}
return 0;
}
#endif // NCNN_STDIO
int Convolution::load_model(const unsigned char*& mem)
{
union
{
struct
{
unsigned char f0;
unsigned char f1;
unsigned char f2;
unsigned char f3;
};
unsigned int tag;
} flag_struct;
memcpy(&flag_struct, mem, sizeof(flag_struct));
mem += sizeof(flag_struct);
unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
if (flag_struct.tag == 0x01306B47)
{
// half-precision weight data
weight_data = Mat::from_float16((unsigned short*)mem, weight_data_size);
mem += alignSize(weight_data_size * sizeof(unsigned short), 4);
if (weight_data.empty())
return -100;
}
else if (flag != 0)
{
// quantized weight data
const float* quantization_value = (const float*)mem;
mem += 256 * sizeof(float);
const unsigned char* index_array = (const unsigned char*)mem;
mem += alignSize(weight_data_size * sizeof(unsigned char), 4);
weight_data.create(weight_data_size);
if (weight_data.empty())
return -100;
float* weight_data_ptr = weight_data;
for (int i = 0; i < weight_data_size; i++)
{
weight_data_ptr[i] = quantization_value[ index_array[i] ];
}
}
else if (flag_struct.f0 == 0)
{
// raw weight data
weight_data = Mat(weight_data_size, (float*)mem);
mem += weight_data_size * sizeof(float);
}
if (bias_term)
{
bias_data = Mat(num_output, (float*)mem);
mem += num_output * sizeof(float);
}
return 0;
}
```
## new model load api proposed
### Pros
#### clean and simple api
#### element type detection
```cpp
int Convolution::load_model(const ModelBin& mb)
{
// auto detect element type
weight_data = mb.load(weight_data_size, 0);
if (weight_data.empty())
return -100;
if (bias_term)
{
// certain type specified
bias_data = mb.load(num_output, 1);
if (bias_data.empty())
return -100;
}
return 0;
}
```


@ -0,0 +1,92 @@
## current param load api
### Cons
#### long and awful code
#### three functions
#### not extensible
#### no default value
#### no variable length array
```
MyLayer mylayer 1 1 in out 100 1.250000
```
```
binary 100
binary 1.250000
```
```cpp
#if NCNN_STDIO
#if NCNN_STRING
int MyLayer::load_param(FILE* paramfp)
{
int nscan = fscanf(paramfp, "%d %f", &a, &b);
if (nscan != 2)
{
fprintf(stderr, "MyLayer load_param failed %d\n", nscan);
return -1;
}
return 0;
}
#endif // NCNN_STRING
int MyLayer::load_param_bin(FILE* paramfp)
{
fread(&a, sizeof(int), 1, paramfp);
fread(&b, sizeof(float), 1, paramfp);
return 0;
}
#endif // NCNN_STDIO
int MyLayer::load_param(const unsigned char*& mem)
{
a = *(int*)(mem);
mem += 4;
b = *(float*)(mem);
mem += 4;
return 0;
}
```
## new param load api proposed
### Pros
#### clean and simple api
#### default value
#### extensible
#### variable length array
```
7767517
MyLayer mylayer 1 1 in out 0=100 1=1.250000 -23303=5,0.1,0.2,0.4,0.8,1.0
```
```
binary 0xDD857600(magic)
binary 0
binary 100
binary 1
binary 1.250000
binary -23303
binary 5
binary 0.1
binary 0.2
binary 0.4
binary 0.8
binary 1.0
binary -233(EOP)
```
```cpp
int MyLayer::load_param(const ParamDict& pd)
{
// pd.get( param id (seq), default value );
a = pd.get(0, 100);
b = pd.get(1, 1.25f);
// get default value for c if not specified in param file
c = pd.get(2, 0.001f);
// get array
d = pd.get(3, Mat(len, array));
return 0;
}
```


@ -0,0 +1,303 @@
|operation|param id|param phase|default value|weight order|
|:---:|:---:|:---:|:---:|:---:|
|AbsVal|||
|ArgMax|0|out_max_val|0|
||1|topk|1|
|BatchNorm|0|channels|0|slope mean variance bias|
||1|eps|0.f|
|Bias|0|bias_data_size|0|
|BinaryOp|0|op_type|0|
||1|with_scalar|0|
||2|b|0.f|
|BNLL|||
|Cast|0|type_from|0|
||1|type_to|0|
|Clip|0|min|-FLT_MAX|
||1|max|FLT_MAX|
|Concat|0|axis|0|
|Convolution|0|num_output|0|weight bias|
||1|kernel_w|0|
||2|dilation_w|1|
||3|stride_w|1|
||4|pad_left|0|
||5|bias_term|0|
||6|weight_data_size|0|
||8|int8_scale_term|0|
||9|activation_type|0|
||10|activation_params|[ ]|
||11|kernel_h|kernel_w|
||12|dilation_h|dilation_w|
||13|stride_h|stride_w|
||15|pad_right|pad_left|
||14|pad_top|pad_left|
||16|pad_bottom|pad_top|
||17|impl_type|0|
||18|pad_value|0.f|
|ConvolutionDepthWise|0|num_output|0|weight bias|
||1|kernel_w|0|
||2|dilation_w|1|
||3|stride_w|1|
||4|pad_left|0|
||5|bias_term|0|
||6|weight_data_size|0|
||7|group|1|
||8|int8_scale_term|0|
||9|activation_type|0|
||10|activation_params|[ ]|
||11|kernel_h|kernel_w|
||12|dilation_h|dilation_w|
||13|stride_h|stride_w|
||15|pad_right|pad_left|
||14|pad_top|pad_left|
||16|pad_bottom|pad_top|
||18|pad_value|0.f|
|Crop|0|woffset|0|
||1|hoffset|0|
||2|coffset|0|
||3|outw|0|
||4|outh|0|
||5|outc|0|
||6|woffset2|0|
||7|hoffset2|0|
||8|coffset2|0|
||9|starts|[ ]|
||10|ends|[ ]|
||11|axes|[ ]|
|Deconvolution|0|num_output|0|weight bias|
||1|kernel_w|0|
||2|dilation_w|1|
||3|stride_w|1|
||4|pad_left|0|
||5|bias_term|0|
||6|weight_data_size|0|
||9|activation_type|0|
||10|activation_params|[ ]|
||11|kernel_h|kernel_w|
||12|dilation_h|dilation_w|
||13|stride_h|stride_w|
||15|pad_right|pad_left|
||14|pad_top|pad_left|
||16|pad_bottom|pad_top|
||18|output_pad_right|0|
||19|output_pad_bottom|output_pad_right|
||20|output_w|0|
||21|output_h|output_w|
|DeconvolutionDepthWise|0|num_output|0|weight bias|
||1|kernel_w|0|
||2|dilation_w|1|
||3|stride_w|1|
||4|pad_left|0|
||5|bias_term|0|
||6|weight_data_size|0|
||7|group|1|
||9|activation_type|0|
||10|activation_params|[ ]|
||11|kernel_h|kernel_w|
||12|dilation_h|dilation_w|
||13|stride_h|stride_w|
||15|pad_right|pad_left|
||14|pad_top|pad_left|
||16|pad_bottom|pad_top|
||18|output_pad_right|0|
||19|output_pad_bottom|output_pad_right|
||20|output_w|0|
||21|output_h|output_w|
|Dequantize|0|scale|1.f|bias|
||1|bias_term|0|
||2|bias_data_size|0|
|DetectionOutput|0|num_class|0|
||1|nms_threshold|0.05f|
||2|nms_top_k|300|
||3|keep_top_k|100|
||4|confidence_threshold|0.5f|
||5|variances[0]|0.1f|
||6|variances[1]|0.1f|
||7|variances[2]|0.2f|
||8|variances[3]|0.2f|
|Dropout|0|scale|1.f|
|Eltwise|0|op_type|0|
||1|coeffs|[ ]|
|ELU|0|alpha|0.1f|
|Embed|0|num_output|0|weight bias|
||1|input_dim|0|
||2|bias_term|0|
||3|weight_data_size|0|
|Exp|0|base|-1.f|
||1|scale|1.f|
||2|shift|0.f|
|ExpandDims|0|expand_w|0|
||1|expand_h|0|
||2|expand_c|0|
||3|axes|[ ]|
|Flatten|||
|HardSigmoid|0|alpha|0.2f||
||1|beta|0.5f|
|HardSwish|0|alpha|0.2f||
||1|beta|0.5f|
|InnerProduct|0|num_output|0|weight bias|
||1|bias_term|0|
||2|weight_data_size|0|
||8|int8_scale_term|0|
||9|activation_type|0|
||10|activation_params|[ ]|
|Input|0|w|0|
||1|h|0|
||2|c|0|
|InstanceNorm|0|channels|0|gamma bias|
||1|eps|0.001f|
|Interp|0|resize_type|0|
||1|height_scale|1.f|
||2|width_scale|1.f|
||3|output_height|0|
||4|output_width|0|
|Log|0|base|-1.f|
||1|scale|1.f|
||2|shift|0.f|
|LRN|0|region_type|0|
||1|local_size|5|
||2|alpha|1.f|
||3|beta|0.75f|
||4|bias|1.f|
|LSTM|0|num_output|0|
||1|weight_data_size|1|
||2|direction|0|
|MemoryData|0|w|0|
||1|h|0|
||2|c|0|
|Mish|||
|MVN|0|normalize_variance|0|
||1|across_channels|0|
||2|eps|0.0001f|
|Noop|||
|Normalize|0|across_spatial|0|scale|
||4|across_channel|0|
||1|channel_shared|0|
||2|eps|0.0001f|
||9|eps_mode|0|
||3|scale_data_size|0|
|Packing|0|out_packing|1|
||1|use_padding|0|
||2|cast_type_from|0|
||3|cast_type_to|0|
||4|storage_type_from|0|
||5|storage_type_to|0|
|Padding|0|top|0|per_channel_pad_data|
||1|bottom|0|
||2|left|0|
||3|right|0|
||4|type|0|
||5|value|0.f|
||6|per_channel_pad_data_size|0|
||7|front|0|
||8|behind|0|
|Permute|0|order_type|0|
|PixelShuffle|0|upscale_factor|1|
|Pooling|0|pooling_type(0: max 1: avg)|0|
||1|kernel_w|0|
||11|kernel_h|kernel_w|
||2|stride_w|1|
||12|stride_h|stride_w|
||3|pad_left|0|
||14|pad_right|pad_left|
||13|pad_top|pad_left|
||15|pad_bottom|pad_top|
||4|global_pooling|0|
||5|pad_mode|0|
|Power|0|power|1.f|
||1|scale|1.f|
||2|shift|0.f|
|PReLU|0|num_slope|0|slope|
|PriorBox|0|min_sizes|[ ]|
||1|max_sizes|[ ]|
||2|aspect_ratios|[ ]|
||3|variances[0]|0.f|
||4|variances[1]|0.f|
||5|variances[2]|0.f|
||6|variances[3]|0.f|
||7|flip|1|
||8|clip|0|
||9|image_width|0|
||10|image_height|0|
||11|step_width|-233.f|
||12|step_height|-233.f|
||13|offset|0.f|
||14|step_mmdetection|0|
||15|center_mmdetection|0|
|Proposal|0|feat_stride|16|
||1|base_size|16|
||2|pre_nms_topN|6000|
||3|after_nms_topN|300|
||4|nms_thresh|0.7f|
||5|min_size|16|
|PSROIPooling|0|pooled_width|7|
||1|pooled_height|7|
||2|spatial_scale|0.0625f|
||3|output_dim|0|
|Quantize|0|scale|1.f|
|Reduction|0|operation|0|
||1|dim|0|
||2|coeff|1.f|
||3|axes|[ ]|
||4|keepdims|0|
|ReLU|0|slope|0.f|
|Reorg|0|stride|0|
|Requantize|0|scale_in|1.f|bias|
||1|scale_out|1.f|
||2|bias_term|0|
||3|bias_data_size|0|
||4|fusion_relu|0|
|Reshape|0|w|-233|
||1|h|-233|
||2|c|-233|
||3|permute|0|
|ROIAlign|0|pooled_width|0|
||1|pooled_height|0|
||2|spatial_scale|1.f|
||3|sampling_ratio|0|
||4|aligned|0|
||5|version|0|
|ROIPooling|0|pooled_width|0|
||1|pooled_height|0|
||2|spatial_scale|1.f|
|Scale|0|scale_data_size|0|scale bias|
||1|bias_term|0|
|SELU|0|alpha|1.67326324f||
||1|lambda|1.050700987f|
|ShuffleChannel|0|group|1|
|Sigmoid|||
|Slice|0|slices|[ ]|
||1|axis|0|
|Softmax|0|axis|0|
|Split|||
|SPP|0|pooling_type|0|
||1|pyramid_height|1|
|Squeeze|0|squeeze_w|0|
||1|squeeze_h|0|
||2|squeeze_c|0|
||3|axes|[ ]|
|StatisticsPooling|0|include_stddev|0|
|Swish|||
|TanH|||
|Threshold|0|threshold|0.f|
|Tile|0|dim|0|
||1|tiles|1|
|UnaryOp|0|op_type|0|
|YoloDetectionOutput|0|num_class|20|
||1|num_box|5|
||2|confidence_threshold|0.01f|
||3|nms_threshold|0.45f|
||4|biases|[]|
|Yolov3DetectionOutput|0|num_class|20|
||1|num_box|5|
||2|confidence_threshold|0.01f|
||3|nms_threshold|0.45f|
||4|biases|[]|
||5|mask|[]|
||6|anchors_scale|[]|
|RNN|0|num_output|0|
||1|weight_data_size|0|
||2|direction|0|
|MultiHeadAttention|0|embed_dim|0|
||1|num_head|1|
||2|weight_data_size|0|

File diff suppressed because it is too large


@ -0,0 +1,64 @@
## net.param
### example
```
7767517
3 3
Input input 0 1 data 0=4 1=4 2=1
InnerProduct ip 1 1 data fc 0=10 1=1 2=80
Softmax softmax 1 1 fc prob 0=0
```
### overview
```
[magic]
```
* magic number : 7767517
```
[layer count] [blob count]
```
* layer count : count of the layer lines that follow; should be exactly the count of all layer names
* blob count : count of all blobs, usually greater than or equal to the layer count
### layer line
```
[layer type] [layer name] [input count] [output count] [input blobs] [output blobs] [layer specific params]
```
* layer type : type name, such as Convolution Softmax etc
* layer name : name of this layer, must be unique among all layer names
* input count : count of the blobs this layer needs as input
* output count : count of the blobs this layer produces as output
* input blobs : name list of all the input blob names, separated by space, must be unique among input blob names of all layers
* output blobs : name list of all the output blob names, separated by space, must be unique among output blob names of all layers
* layer specific params : key=value pair list, separated by space
### layer param
```
0=1 1=2.5 -23303=2,2.0,3.0
```
key index should be unique within each layer line; a pair can be omitted if the default value is used (for example, `-23303=2,2.0,3.0` sets array key index 3, i.e. -23300 - 3, to a 2-element float array {2.0, 3.0})
the meaning of each existing param key index can be looked up at [operation-param-weight-table](operation-param-weight-table)
* integer or float key : index 0 ~ 19
* integer value : int
* float value : float
* integer array or float array key : -23300 minus index 0 ~ 19
* integer array value : [array size],int,int,...,int
* float array value : [array size],float,float,...,float
## net.bin
```
+---------+---------+---------+---------+---------+---------+
| weight1 | weight2 | weight3 | weight4 | ....... | weightN |
+---------+---------+---------+---------+---------+---------+
^ ^ ^ ^
0x0 0x80 0x140 0x1C0
```
the model binary is the concatenation of all weight data, each weight buffer is aligned by 32bit
### weight buffer
```
[flag] (optional)
[raw data]
[padding] (optional)
```
* flag : unsigned int, little-endian, indicating the weight storage type, 0 => float32, 0x01306B47 => float16, otherwise => quantized int8, may be omitted if the layer implementation forced the storage type explicitly
* raw data : raw weight data, little-endian, float32 data or float16 data or quantized table and indexes depending on the storage type flag
* padding : padding space for 32bit alignment, may be omitted if already aligned
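A tiny sketch of the 32-bit alignment rule for one weight buffer, mirroring the alignSize(x, 4) calls in the loader code elsewhere in these docs; the helper name is mine:
```cpp
// round a raw byte count up to the next multiple of 4 bytes (32-bit alignment)
static size_t align_to_32bit(size_t size)
{
    return (size + 3) & ~(size_t)3;
}
// e.g. 57 float16 weights occupy 114 bytes and are padded to 116 bytes in net.bin
```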


@ -0,0 +1,29 @@
## Practical experience only, no theory behind it, not necessarily correct
```
prfm pldl1keep, [x0, #256]
```
* place it 0~8 instructions before the ld1 [x0]
* #256 means bring the contents at x0+256 into the L1 cache
* also applies to ldp
* (experience) writing no offset is worse than writing #128
* (experience) pldl1strm does not seem useful and is not as fast as pldl1keep
* (experience) the contents from x0 to x0+256 are brought in as well
* (experience) use #128 for 128-bit loads, #256 for 256-bit or larger loads
* (experience) avoid the order pld a, pld b, load a, load b; the prefetches may interfere with each other
* (experience) prefetching too far ahead stops helping
* (experience) best suited for sequential reads
```
prfm pldl2strm, [x0, #256]
```
* place it N instructions before the ld1 [x0], with N as large as reasonable
* #256 means bring the contents at x0+256 into the L2 cache
* also applies to ldp
* (experience) writing no offset is worse than writing #128
* (experience) pldl2strm works slightly better than pldl2keep
* (experience) the contents from x0 to x0+256 are brought in as well
* (experience) use #128 for 128-bit loads, #256 for 256-bit loads
* (experience) when reading a lot of data, issue pldl2strm twice in a row with different offsets
* (experience) do not pldl1keep the same location afterwards, it becomes slower
* (experience) best suited for preparing far-away reads ahead of time, e.g. when switching channels


@ -0,0 +1,57 @@
## batchnorm
```
Input A 0 1 A 0 0 0
MemoryData sub/y 0 1 sub/y 16 0 0
BinaryOp sub 2 1 A sub/y sub 1
MemoryData div/y 0 1 div/y 16 0 0
BinaryOp div 2 1 sub div/y div 3
MemoryData mul/y 0 1 mul/y 16 0 0
BinaryOp mul 2 1 div mul/y mul 2
MemoryData BiasAdd/bias 0 1 BiasAdd/bias 16 0 0
BinaryOp BiasAdd 2 1 mul BiasAdd/bias BiasAdd 0
```
## convolution
```
Input A 0 1 A 0 0 0
Convolution Conv2D 1 1 A Conv2D 10 3 1 1 0 0 270
MemoryData biases/read 0 1 biases/read 10 0 0
BinaryOp BiasAdd 2 1 Conv2D biases/read BiasAdd 0
```
## innerproduct
```
Input A 0 1 A 0 0 0
MemoryData biases/read 0 1 biases/read 10 0 0
InnerProduct MatMul 1 1 A MatMul 10 0 2560
BinaryOp conv6 2 1 MatMul biases/read conv6 0
```
## leakyrelu
```
Input A 0 1 A 0 0 0
Split splitncnn_0 1 2 A A_splitncnn_0 A_splitncnn_1
MemoryData mul_1/x 0 1 mul_1/x 0 0 0
BinaryOp mul_1 2 1 mul_1/x A_splitncnn_1 mul_1 2
BinaryOp leaky 2 1 mul_1 A_splitncnn_0 leaky 4
```
## prelu
```
Input A 0 1 A 0 0 0
Split splitncnn_0 1 2 A A_splitncnn_0 A_splitncnn_1
MemoryData prelu/alpha 0 1 prelu/alpha 10 0 0
ReLU prelu/Relu 1 1 A_splitncnn_1 prelu/Relu 0.000000
UnaryOp prelu/Neg 1 1 A_splitncnn_0 prelu/Neg 1
ReLU prelu/Relu_1 1 1 prelu/Neg prelu/Relu_1 0.000000
UnaryOp prelu/Neg_1 1 1 prelu/Relu_1 prelu/Neg_1 1
BinaryOp prelu/Mul 2 1 prelu/alpha prelu/Neg_1 prelu/Mul 2
BinaryOp prelu/add 2 1 prelu/Relu prelu/Mul prelu/add 0
```
## softmax
```
Input A 0 1 A 0 0 0
Split splitncnn_4 1 2 A A_splitncnn_0 A_splitncnn_1
Reduction Max 1 1 A_splitncnn_1 Max 4 -2 1.000000
BinaryOp sub 2 1 A_splitncnn_0 Max sub 1
UnaryOp Exp 1 1 sub Exp 7
Split splitncnn_5 1 2 Exp Exp_splitncnn_0 Exp_splitncnn_1
Reduction Sum 1 1 Exp_splitncnn_1 Sum 0 -2 1.000000
BinaryOp prob 2 1 Exp_splitncnn_0 Sum prob 3
```