feat: switch backend to PaddleOCR-NCNN, switch project to CMake

1. The project backend has been fully migrated to the PaddleOCR-NCNN algorithm and has passed basic compatibility tests
2. The project is now organized with CMake; to better accommodate third-party libraries, a QMake project will no longer be provided
3. Reorganized the copyright/notice files and the code tree to minimize the risk of infringement

Log: switch backend to PaddleOCR-NCNN, switch project to CMake
Change-Id: I4d5d2c5d37505a4a24b389b1a4c5d12f17bfa38c
wangzhengyang
2022-05-10 09:54:44 +08:00
parent ecdd171c6f
commit 718c41634f
10018 changed files with 3593797 additions and 186748 deletions


@ -0,0 +1,57 @@
```c
// use the whole 128-bit v register: %.4s
// a += b * c
float32x4_t _a = vld1q_f32(a);
float32x4_t _b = vld1q_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
"fmla %0.4s, %2.4s, %3.4s"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
```c
// use the low 64 bits of the v register: %.2s
// a += b * c
float32x2_t _a = vld1_f32(a);
float32x2_t _b = vld1_f32(b);
float32x2_t _c = vld1_f32(c);
asm volatile(
"fmla %0.2s, %2.2s, %3.2s"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
```c
// use a single 32-bit lane of the v register: %.s[0] %.s[1] %.s[2] %.s[3]
// a += b * c[0]
// a += b * c[1]
// a += b * c[2]
// a += b * c[3]
float32x4_t _a = vld1q_f32(a);
float32x4_t _b = vld1q_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
"fmla %0.4s, %2.4s, %3.s[0]"
"fmla %0.4s, %2.4s, %3.s[1]"
"fmla %0.4s, %2.4s, %3.s[2]"
"fmla %0.4s, %2.4s, %3.s[3]"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
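For reference, here is a minimal compilable wrapper around the first snippet above; a sketch only, where the function name and the assumption that a, b and c each point to 4 floats are mine:
```c
#include <arm_neon.h>

// a[0..3] += b[0..3] * c[0..3], using the inline assembly form shown above
static void fmla_f32x4(float* a, const float* b, const float* c)
{
    float32x4_t _a = vld1q_f32(a);
    float32x4_t _b = vld1q_f32(b);
    float32x4_t _c = vld1q_f32(c);
    asm volatile(
        "fmla %0.4s, %2.4s, %3.4s"
        : "=w"(_a) // %0
        : "0"(_a),
          "w"(_b), // %2
          "w"(_c)  // %3
        :
    );
    vst1q_f32(a, _a); // store the accumulated result back
}
```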
qwq


@ -0,0 +1,175 @@
# Adding a custom layer to NCNN
## Example
This example adds a custom layer Relu6, i.e. std::min(6.f, std::max(0.f, val))
```
Input input 0 1 input
Convolution conv2d 1 1 input conv2d 0=32 1=1 2=1 3=1 4=0 5=0 6=768
Relu6 relu6 1 1 conv2d relu6
Pooling maxpool 1 1 relu6 maxpool 0=0 1=3 2=2 3=-233 4=0
```
## Define the header file src/layer/relu6.h
```CPP
#ifndef LAYER_RELU6_H
#define LAYER_RELU6_H
#include "layer.h"
namespace ncnn {
class Relu6 : public Layer
{
public:
Relu6();
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};
} // namespace ncnn
#endif // LAYER_RELU6_H
```
## Define the source file src/layer/relu6.cpp
```CPP
#include "relu6.h"
#include <math.h>
namespace ncnn {
Relu6::Relu6()
{
one_blob_only = true;
support_inplace = true;
}
int Relu6::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q < channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
for (int i=0; i<size; i++)
{
ptr[i] = std::min(6.f, std::max(0.f, ptr[i]));
}
}
return 0;
}
} // namespace ncnn
```
## Modify src/CMakeLists.txt to register Relu6
```cmake
ncnn_add_layer(GroupNorm)
ncnn_add_layer(LayerNorm)
ncnn_add_layer(Relu6)
```
## Define the test case file tests/test_relu6.cpp
```CPP
#include "layer/relu6.h"
#include "testutil.h"
static int test_relu6(const ncnn::Mat& a)
{
ncnn::ParamDict pd;
std::vector<ncnn::Mat> weights(0);
int ret = test_layer<ncnn::Relu6>("Relu6", pd, weights, a);
if (ret != 0)
{
fprintf(stderr, "test_relu6 failed a.dims=%d a=(%d %d %d)\n", a.dims, a.w, a.h, a.c);
}
return ret;
}
static int test_relu6_0()
{
return 0
|| test_relu6(RandomMat(5, 7, 24))
|| test_relu6(RandomMat(7, 9, 12))
|| test_relu6(RandomMat(3, 5, 13));
}
static int test_relu6_1()
{
return 0
|| test_relu6(RandomMat(15, 24))
|| test_relu6(RandomMat(17, 12))
|| test_relu6(RandomMat(19, 15));
}
static int test_relu6_2()
{
return 0
|| test_relu6(RandomMat(128))
|| test_relu6(RandomMat(124))
|| test_relu6(RandomMat(127));
}
int main()
{
SRAND(7767517);
return 0
|| test_relu6_0()
|| test_relu6_1()
|| test_relu6_2();
}
```
## Modify tests/CMakeLists.txt to register the Relu6 test case
```cmake
ncnn_add_layer_test(LSTM)
ncnn_add_layer_test(Yolov3DetectionOutput)
ncnn_add_layer_test(Relu6)
```
## Build
Build following the original NCNN build steps.
## Unit test
```
./test_relu6
```
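Once registered through ncnn_add_layer, the layer can also be exercised directly without building a full network; a minimal smoke-test sketch (the function name and input values are illustrative):
```CPP
#include "layer.h"
#include "mat.h"

// create the registered Relu6 layer by name and run it in-place on a Mat
void relu6_smoke_test()
{
    ncnn::Layer* relu6 = ncnn::create_layer("Relu6");

    ncnn::Option opt;
    relu6->create_pipeline(opt);

    ncnn::Mat m(8);
    m.fill(9.f); // values above 6 should be clamped down to 6

    relu6->forward_inplace(m, opt);
    // every element of m is now 6.f

    relu6->destroy_pipeline(opt);
    delete relu6;
}
```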


@ -0,0 +1,85 @@
## natural assembly
* no register dependency, no penalty
```
ld1 {v0.4s}, [x0], #16
fmla v10.4s, v16.4s, v24.s[0]
fmla v11.4s, v16.4s, v24.s[1]
fmla v12.4s, v16.4s, v24.s[2]
fmla v13.4s, v16.4s, v24.s[3]
```
## A53
* 128bit vector load cannot be dual issued with fmla, wait 2 cycles
* 64bit vector load cannot be dual issued with fmla, wait 1 cycle
* 64bit integer load can be dual issued with fmla, no penalty
* pointer update can be dual issued with fmla, no penalty
* 64bit vector load and 64bit vector insert can be dual issued, no penalty
* any vector load cannot be issued on the 4th cycle of each fmla (enters the accumulator pipeline)
### practical guide
* use 64bit vector load only
* issue vector load every three fmla
* 1 cycle to load 64bit, dual issue with the previous interleaved 64bit insert
* load the remaining 64bit into integer register, dual issue with fmla
* update pointer, dual issue with fmla
* insert 64bit into vector from integer register, dual issue with the next interleaved 64bit load
* add nop every three fmla if no load, seems to be faster
```
ldr d0, [x0] // 1 cycle, v0 first 64bit
fmla
ldr x23, [x0, #8] // 0 cycle, v0 second 64bit to temp register
fmla
add x0, x0, #16 // 0 cycle, update pointer
fmla
ldr d1, [x0] // 1 cycle, v1 first 64bit
ins v0.d[1], x23 // 0 cycle, v0 second 64bit complete
fmla
ldr x23, [x0, #8] // 0 cycle, v1 second 64bit to temp register
fmla
add x0, x0, #16 // 0 cycle, update pointer
fmla
ins v1.d[1], x23 // 1 cycle, v1 second 64bit complete
nop
fmla
fmla
fmla
nop
nop
fmla
fmla
fmla
```
## A55
* 128bit vector load cannot be dual issued with fmla, wait 2 cycles
* 64bit vector load can be dual issued with fmla, no penalty
* 64bit integer load can be dual issued with fmla, no penalty
* pointer update can be dual issued with fmla, no penalty
* 64bit vector insert can be dual issued with fmla, no penalty
### practical guide
* use 64bit vector load only
* load 64bit, dual issue with fmla
* load the remaining 64bit into integer register, dual issue with fmla
* update pointer, dual issue with fmla
* insert 64bit into vector from integer register, dual issue with fmla
* interleaved loads loosen register dependency
* nop trick is not needed
```
ldr d0, [x0] // 0 cycle, v0 first 64bit
fmla
ldr x23, [x0, #8] // 0 cycle, v0 second 64bit to temp register
fmla
add x0, x0, #16 // 0 cycle, update pointer
fmla
ldr d1, [x0] // 0 cycle, v1 first 64bit
fmla
ins v0.d[1], x23 // 0 cycle, v0 second 64bit complete
fmla
ldr x23, [x0, #8] // 0 cycle, v1 second 64bit to temp register
fmla
add x0, x0, #16 // 0 cycle, update pointer
fmla
ins v1.d[1], x23 // 0 cycle, v1 second 64bit complete
fmla
```


@ -0,0 +1,130 @@
```c
// use the whole d register: %P
// a += b * c
float32x2_t _a = vld1_f32(a);
float32x2_t _b = vld1_f32(b);
float32x2_t _c = vld1_f32(c);
asm volatile(
"vmla.f32 %P0, %P2, %P3"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
```c
// use the whole q register: %q
// a += b * c
float32x4_t _a = vld1q_f32(a);
float32x4_t _b = vld1q_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
"vmla.f32 %q0, %q2, %q3"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
```c
// use a single 32-bit lane of the d register: %P[0] %P[1]
// a += b * c[0]
// a += b * c[1]
float32x2_t _a = vld1_f32(a);
float32x2_t _b = vld1_f32(b);
float32x2_t _c = vld1_f32(c);
asm volatile(
"vmla.f32 %P0, %P2, %P3[0]"
"vmla.f32 %P0, %P2, %P3[1]"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
```c
// use a single 32-bit lane of the q register: %e[0] %e[1] %f[0] %f[1]
// a += b * c[0]
// a += b * c[1]
// a += b * c[2]
// a += b * c[3]
float32x4_t _a = vld1q_f32(a);
float32x4_t _b = vld1q_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
"vmla.f32 %q0, %q2, %e3[0]"
"vmla.f32 %q0, %q2, %e3[1]"
"vmla.f32 %q0, %q2, %f3[0]"
"vmla.f32 %q0, %q2, %f3[1]"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
```c
// use %e %f to split the q register into two d registers
// a += b * c[0]c[1]
// a += b * c[2]c[3]
float32x2_t _a = vld1_f32(a);
float32x2_t _b = vld1_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
"vmla.f32 %P0, %P2, %e3"
"vmla.f32 %P0, %P2, %f3"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
```c
// bind variables to specific d registers
// vmla.f32 d0, d2, d4
register float32x2_t _a asm("d0") = vld1_f32(a);
register float32x2_t _b asm("d2") = vld1_f32(b);
register float32x2_t _c asm("d4") = vld1_f32(c);
asm volatile(
"vmla.f32 %P0, %P2, %P3"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
```c
// bind variables to specific q registers
// vmla.f32 q0, q1, q2
register float32x4_t _a asm("q0") = vld1q_f32(a);
register float32x4_t _b asm("q1") = vld1q_f32(b);
register float32x4_t _c asm("q2") = vld1q_f32(c);
asm volatile(
"vmla.f32 %q0, %q2, %q3"
: "=w"(_a) // %0
: "0"(_a),
"w"(_b), // %2
"w"(_c) // %3
:
);
```
If it were not for this compiler bug, register binding would not be needed at all. However...
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=41538
qwq


@ -0,0 +1,52 @@
### broadcasting rule
ncnn BinaryOp accepts blobs with different shapes
C = BinaryOp(A, B)
shape notation convention is [w], [w,h], [w,h,c], [w,h,d,c]
|type|A|B|C|
|---|---|---|---|
|1|[1]|scalar|[1]|
|2|[1]|[2]|[2]|
|3|[1]|[2,3]|[2,3]|
|4|[1]|[2,3,4]|[2,3,4]|
|5|[2]|scalar|[2]|
|6|[2]|[1]|[2]|
|7|[2]|[2]|[2]|
|8|[3]|[2,3]|[2,3]|
|9|[4]|[2,3,4]|[2,3,4]|
|10|[2,3]|scalar|[2,3]|
|11|[2,3]|[1]|[2,3]|
|12|[2,3]|[3]|[2,3]|
|13|[2,3]|[2,3]|[2,3]|
|14|[3,4]|[2,3,4]|[2,3,4]|
|15|[2,3,4]|scalar|[2,3,4]|
|16|[2,3,4]|[1]|[2,3,4]|
|17|[2,3,4]|[4]|[2,3,4]|
|18|[2,3,4]|[3,4]|[2,3,4]|
|19|[2,3,4]|[2,3,4]|[2,3,4]|
|20|[1]|[2,3,4,5]|[2,3,4,5]|
|21|[5]|[2,3,4,5]|[2,3,4,5]|
|22|[4,5]|[2,3,4,5]|[2,3,4,5]|
|23|[3,4,5]|[2,3,4,5]|[2,3,4,5]|
|24|[2,3,4,5]|scalar|[2,3,4,5]|
|25|[2,3,4,5]|[1]|[2,3,4,5]|
|26|[2,3,4,5]|[5]|[2,3,4,5]|
|27|[2,3,4,5]|[4,5]|[2,3,4,5]|
|28|[2,3,4,5]|[3,4,5]|[2,3,4,5]|
|29|[2,3,4,5]|[2,3,4,5]|[2,3,4,5]|
some special broadcasting rules exist for model compatibility
|special type|A|B|C|
|---|---|---|---|
|1|[2,3,4]|[1,1,4]|[2,3,4]|
|2|[2,3,4]|[2,3,1]|[2,3,4]|
|3|[1,1,4]|[2,3,4]|[2,3,4]|
|4|[2,3,1]|[2,3,4]|[2,3,4]|
|5|[2,3,4]|[1,3,4]|[2,3,4]|
|6|[2,3,4]|[2,1,4]|[2,3,4]|
|7|[1,3,4]|[2,3,4]|[2,3,4]|
|8|[2,1,4]|[2,3,4]|[2,3,4]|
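As a concrete sketch of rule 17 above ([2,3,4] + [4]), broadcasting can be exercised through the BinaryOp layer directly; the function name and fill values are illustrative, and the option flags mirror the BinaryOp usage shown elsewhere in these docs:
```cpp
#include "layer.h"
#include "mat.h"
#include <vector>

// C = A + B with A of shape [2,3,4] ([w,h,c]) and B of shape [4]
void broadcast_add_example()
{
    ncnn::Mat A(2, 3, 4);
    ncnn::Mat B(4);
    A.fill(1.f);
    B.fill(2.f);

    ncnn::Option opt;
    opt.use_packing_layout = false; // plain fp32, no packing

    ncnn::Layer* op = ncnn::create_layer("BinaryOp");
    ncnn::ParamDict pd;
    pd.set(0, 0); // op_type 0 = ADD
    op->load_param(pd);
    op->create_pipeline(opt);

    std::vector<ncnn::Mat> bottoms(2);
    bottoms[0] = A;
    bottoms[1] = B;
    std::vector<ncnn::Mat> tops(1);
    op->forward(bottoms, tops, opt); // tops[0] has shape [2,3,4], every value is 3.f

    op->destroy_pipeline(opt);
    delete op;
}
```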


@ -0,0 +1,63 @@
Mat structure is now allocator-aware via an extra allocator parameter with default zero value.
The good-old ncnn::fastMalloc()/ncnn::fastFree() will be used for a null allocator.
You could pass a custom allocator to delegate all memory allocation and deallocation.
```cpp
class Allocator
{
public:
virtual void* fastMalloc(size_t size) = 0;
virtual void fastFree(void* ptr) = 0;
};
```
ncnn already provides two simple pooled Allocator classes, one with a mutex lock and one without.
```cpp
ncnn::PoolAllocator locked_mempool;
ncnn::UnlockedPoolAllocator unlocked_mempool;
```
the two allocator types in ncnn
* blob allocator
  * used to allocate memory for all named blobs, which you could retrieve by Extractor::extract()
* workspace allocator
  * used to allocate memory for internal temporary use in layer implementation, such as the temp blob after padding in convolution
by default, all Extractor instances use the two allocators from the default option
You can alter them by ncnn::set_default_option()
or you can set them per Extractor by Extractor::set_blob_allocator()/Extractor::set_workspace_allocator()
blob allocator is guaranteed to be called in-order in layer implementation during each Extractor lifecycle, while workspace allocator may be called synchronously
the practical usage
* one network, one-by-one inference
  * shared unlocked blob allocator for all Extractor
  * shared locked workspace allocator for all Extractor
* one network, concurrent inference
  * shared unlocked blob allocator for all Extractor in each thread
  * shared locked workspace allocator for all Extractor among all threads
* concurrent multiple networks, one-by-one inference for each network
  * shared unlocked blob allocator for all Extractor of each network
  * shared locked workspace allocator for all Extractor among all networks (for saving memory)
* concurrent multiple networks, concurrent inference for each network
  * shared unlocked blob allocator for all Extractor of each network in each thread
  * shared locked workspace allocator for all Extractor among all networks (for saving memory)
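A minimal wiring sketch for the per-Extractor case; `net` is assumed to be already loaded, `in` to be a prepared input, and "data"/"prob" are placeholder blob names:
```cpp
#include "net.h"

// one unlocked blob allocator + one locked workspace allocator, set per Extractor
void run_with_pooled_allocators(ncnn::Net& net, const ncnn::Mat& in)
{
    ncnn::UnlockedPoolAllocator blob_pool; // blob allocations are in-order, no lock needed
    ncnn::PoolAllocator workspace_pool;    // workspace allocator keeps its mutex

    ncnn::Extractor ex = net.create_extractor();
    ex.set_blob_allocator(&blob_pool);
    ex.set_workspace_allocator(&workspace_pool);

    ex.input("data", in);
    ncnn::Mat out;
    ex.extract("prob", out);
}
```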


@ -0,0 +1,119 @@
### what is packing and why
packing is the form of storing multiple short-sized values as one long-sized value.
element packing maps well onto the underlying simd register, which usually uses one very wide register to store several values of one element type.
|C|elemsize|elempack|
|---|---|---|
|double|8|1|
|float|4|1|
|int|4|1|
|short|2|1|
|signed char|1|1|
|arm neon|elemsize|elempack|
|---|---|---|
|float64x2_t|16|2|
|float32x4_t|16|4|
|int32x4_t|16|4|
|float16x4_t|8|4|
|int8x8_t|8|8|
Though the real count of values doubles when elempack is two, the wide value is still treated as a single value from the Mat structure's point of view. For example, to store 40 float values in a Mat object, the Mat width is 40 with elempack 1, but only 10 with elempack 4.
|dims|w|h|c|cstep|elemsize|elempack|
|---|---|---|---|---|---|---|
|1|40|1|1|40|4|1|
|1|10|1|1|10|16|4|
### packing style convention
In practice, elempack 1, 4, 8 are the most common cases. It is possible to use any other packing style in theory.
The following table shows the packing axis used in ncnn for each dimension.
|dims|packing axis|shape before packing|shape after packing|
|---|---|---|---|
|1|w|w|w/elempack|
|2|h|w, h|w, h/elempack|
|3|c|w, h, c|w, h, c/elempack|
If the packing axis dim is not evenly divisible by elempack, zero padding may be used.
```
outw = (w + elempack - 1) / elempack;
```
The following snippet shows the memory layout after elempack=4 packing of a 3-dim Mat
```
// w=2 h=3 c=4 elempack=1
0 1
2 3
4 5
6 7
8 9
10 11
12 13
14 15
16 17
18 19
20 21
22 23
// w=2 h=3 c=1 elempack=4
(0,6,12,18) (1,7,13,19)
(2,8,14,20) (3,9,15,21)
(4,10,16,22) (5,11,17,23)
```
### how to convert elempack
There is a convenient wrapper function provided
```
// convert to elempack 4 if packing axis dim is evenly divisible by elempack
// return the identity Mat otherwise
ncnn::Mat a;
ncnn::Mat a_packed;
ncnn::convert_packing(a, a_packed, 4);
if (a_packed.elempack == 4)
{
// check if packing is successful
}
// convert to packing 1, aka unpacking, shall be always successful
ncnn::Mat b;
ncnn::Mat b_unpacked;
ncnn::convert_packing(b, b_unpacked, 1);
```
### handle general interleaved data
Here is an example of using convert packing to convert RGB interleaved data to planar
**NOTE:** The following code is presented only to explain what packing is and how the conversion works. Do not use it in production due to its poor performance; use ncnn::Mat::from_pixels() instead, as sketched after this example.
```cpp
// rgb_interleaved_u8 is RGB RGB RGB ...
// rgb_interleaved_u8.w = w;
// rgb_interleaved_u8.h = h;
// rgb_interleaved_u8.c = 1;
// rgb_interleaved_u8.elemsize = 3;
// rgb_interleaved_u8.elempack = 3;
ncnn::Mat rgb_interleaved_u8(w, h, 1, 3, 3);
ncnn::Mat rgb_planar_u8;
ncnn::convert_packing(rgb_interleaved_u8, rgb_planar_u8, 1);
// rgb_planar_u8 is now RRR ... GGG ... BBB ...
// rgb_planar_u8.w = w;
// rgb_planar_u8.h = h;
// rgb_planar_u8.c = 3;
// rgb_planar_u8.elemsize = 1;
// rgb_planar_u8.elempack = 1;
```
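The production path is just one call; a minimal sketch, assuming `pixels` points to w*h*3 packed RGB bytes:
```cpp
// Mat::from_pixels does the interleaved-to-planar conversion internally
// and converts the values to float
ncnn::Mat rgb_to_planar(const unsigned char* pixels, int w, int h)
{
    // result: c == 3, each channel holds one color plane
    return ncnn::Mat::from_pixels(pixels, ncnn::Mat::PIXEL_RGB, w, h);
}
```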


@ -0,0 +1,75 @@
### How to submit code
#### 1. Fork a branch
Open [ncnn](https://github.com/tencent/ncnn) in a browser and `fork` it into your own repositories, for example
```
https://github.com/user/ncnn
```
Clone the project locally, add the official remote and fetch:
```
$ git clone https://github.com/user/ncnn && cd ncnn
$ git remote add tencent https://github.com/tencent/ncnn
$ git fetch tencent
```
The cloned project now has two remotes, origin and tencent:
```
$ git remote -v
origin https://github.com/user/ncnn (fetch)
origin https://github.com/user/ncnn (push)
tencent https://github.com/Tencent/ncnn (fetch)
tencent https://github.com/Tencent/ncnn (push)
```
origin points to your forked repository, while the tencent remote is the official repo. You can create and push branches based on either remote.
For example, check out the official master branch and create your own branch from it (keep the name short and clear; one branch should do only one thing, which makes review and revert easier):
```
$ git checkout tencent/master
$ git checkout -b add-conv-int8
```
Or specify the official master branch as the base when creating the branch:
```
$ git checkout -b fix-typo-in-document tencent/master
```
> `git fetch` pulls the latest code from the remote to your local repo. For your second PR to ncnn, simply start from `git fetch tencent`; there is no need to run `git remote add tencent` again, nor to touch `github.com/user/ncnn`.
#### 2. Coding conventions
To make communication more efficient, reviewers usually ask contributors to follow these rules:
* put a line break between `if-else` and the opening brace `{`
* do not add or remove blank lines arbitrarily
* replace tabs with 4 spaces
* for platform compatibility, `c++11` is not used for now, and `template` should be avoided as much as possible under `src`
* when adding a new feature or platform, add corresponding test cases under the `test` directory
* put documentation into the matching `doc` directory, with the `.zh.md` suffix for Chinese and the plain `.md` suffix for English
After development, commit and push to your own repository:
```
$ git commit -a
$ git push origin add-conv-int8
```
Tools such as [`commitizen`](https://pypi.org/project/commitizen/) or [`gitlint`](https://jorisroovers.com/gitlint/) are recommended for formatting commit messages, which makes it easier to search a large commit history later
#### 3. Submitting the pull request
Open [ncnn pulls](https://github.com/Tencent/ncnn/pulls) in a browser; a PR hint for your branch should appear, then click `Compare & pull request`
* the title **must** be in English; an unfinished branch should start with `WIP:`, e.g. `WIP: add conv int8`
* the description should contain the following, in Chinese or English:
  * an overview of the change and how it is implemented
  * functional or performance tests
  * test results
CI has automatic formatting integrated: restyled-io generates a `Restyled add conv int8` branch alongside the PR, and you need to merge that automatically restyled branch, for example:
```
$ git fetch tencent
$ git checkout add-conv-int8
$ git merge tencent/restyled/pull-2078
$ git push origin add-conv-int8
```
Go back to the browser and sign the CLA. After all CI tests pass, ask a reviewer to merge the branch.
#### 4. Easter egg
Leaving your personal QQ number may trigger a hidden event.


@ -0,0 +1,323 @@
# step1 create a new empty class
```cpp
// mylayer.h
#include "layer.h"
using namespace ncnn;
// a new layer type called MyLayer
class MyLayer : public Layer
{
};
// mylayer.cpp
#include "mylayer.h"
DEFINE_LAYER_CREATOR(MyLayer)
```
# step2 declare layer parameters and weights
```cpp
// mylayer.h
#include "layer.h"
using namespace ncnn;
class MyLayer : public Layer
{
private:
int channels;// new code
float gamma;// new code
Mat weight;// new code
};
// mylayer.cpp
#include "mylayer.h"
DEFINE_LAYER_CREATOR(MyLayer)
```
# step3 implement load functions for parameters and weights
```cpp
// mylayer.h
#include "layer.h"
using namespace ncnn;
class MyLayer : public Layer
{
public:
virtual int load_param(const ParamDict& pd);// new code
virtual int load_model(const ModelBin& mb);// new code
private:
int channels;
float eps;
Mat gamma_data;
};
// mylayer.cpp
#include "mylayer.h"
DEFINE_LAYER_CREATOR(MyLayer)
// new routine for loading parameters
int MyLayer::load_param(const ParamDict& pd)
{
// details about the relations with param file
// https://github.com/Tencent/ncnn/wiki/param-and-model-file-structure
//
channels = pd.get(0, 0);// parse 0=<int value> entry, default value 0
eps = pd.get(1, 0.001f);// parse 1=<float value> entry, default value 0.001f
return 0;// return zero if success
}
// new routine for loading weights
int MyLayer::load_model(const ModelBin& mb)
{
// details about the relations with model file
// https://github.com/Tencent/ncnn/wiki/param-and-model-file-structure
//
// read weights with length of channels * sizeof(float)
// the second argument explains as follows
// 0 judge the value type automatically, you may get float or float16 or uint8 etc
// depends on the model storage and the supporting target hardware
// 1 read float values anyway
// 2 read float16 values anyway
// 3 read uint8 values anyway
gamma_data = mb.load(channels, 1);
if (gamma_data.empty())
return -100;// return non-zero on error, -100 indicates out-of-memory
return 0;// return zero if success
}
```
# step4 determine forward behavior
```cpp
// mylayer.h
#include "layer.h"
using namespace ncnn;
class MyLayer : public Layer
{
public:
MyLayer();// new code
virtual int load_param(const ParamDict& pd);
virtual int load_model(const ModelBin& mb);
private:
int channels;
float eps;
Mat gamma_data;
};
// mylayer.cpp
#include "mylayer.h"
DEFINE_LAYER_CREATOR(MyLayer)
// new routine for setting forward behavior
MyLayer::MyLayer()
{
// one input and one output
// typical one_blob_only type: Convolution, Pooling, ReLU, Softmax ...
// typical non-one_blob_only type: Eltwise, Split, Concat, Slice ...
one_blob_only = true;
// do not change the blob size, modify data in-place
// typical support_inplace type: ReLU, Sigmoid ...
// typical non-support_inplace type: Convolution, Pooling ...
support_inplace = true;
}
int MyLayer::load_param(const ParamDict& pd)
{
channels = pd.get(0, 0);
eps = pd.get(1, 0.001f);
// you could alter the behavior based on loaded parameter
// if (eps == 0.001f)
// {
// one_blob_only = false;
// support_inplace = false;
// }
return 0;
}
int MyLayer::load_model(const ModelBin& mb)
{
gamma_data = mb.load(channels, 1);
if (gamma_data.empty())
return -100;
// you could alter the behavior based on loaded weight
// if (gamma_data[0] == 0.f)
// {
// one_blob_only = false;
// support_inplace = false;
// }
return 0;
}
```
# step5 choose proper interface based on forward behavior
```cpp
// The base class Layer defines four interfaces for each forward behavior combination
// 1
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
// 2
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
// 3
virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
// 4
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
```
**must** = layer must implement this function
**optional** = layer may implement this function for optimal performance
sometimes the graph inference path cannot call forward_inplace directly because of data sharing; in this situation the non-inplace forward routine is used instead, and if the optional routine is not implemented, it deep-copies the input blob and calls the in-place forward on the copy. Thus, you can avoid this deep copy by implementing the optional routine and processing input to output on-the-fly.
|one_blob_only|support_inplace|1|2|3|4|
|---|---|---|---|---|---|
|false|false|must| | | |
|false|true|optional| |must| |
|true|false| |must| | |
|true|true| |optional| |must|
# step6 implement forward function
```cpp
// mylayer.h
#include "layer.h"
using namespace ncnn;
class MyLayer : public Layer
{
public:
MyLayer();
virtual int load_param(const ParamDict& pd);
virtual int load_model(const ModelBin& mb);
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;// new code, optional
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;// new code
private:
int channels;
float eps;
Mat gamma_data;
};
// mylayer.cpp
#include "mylayer.h"
DEFINE_LAYER_CREATOR(MyLayer)
MyLayer::MyLayer()
{
one_blob_only = true;
support_inplace = true;
}
int MyLayer::load_param(const ParamDict& pd)
{
channels = pd.get(0, 0);
eps = pd.get(1, 0.001f);
return 0;
}
int MyLayer::load_model(const ModelBin& mb)
{
gamma_data = mb.load(channels, 1);
if (gamma_data.empty())
return -100;
return 0;
}
// optional new routine for layer forward function, non-inplace version
int MyLayer::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
// check input dims, return non-zero on error
if (bottom_blob.c != channels)
return -1;
// x = (x + eps) * gamma_per_channel
int w = bottom_blob.w;
int h = bottom_blob.h;
size_t elemsize = bottom_blob.elemsize;
int size = w * h;
top_blob.create(w, h, channels, elemsize, opt.blob_allocator);
if (top_blob.empty())
return -100;// return non-zero on error, -100 indicates out-of-memory
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
const float* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);
const float gamma = gamma_data[q];
for (int i=0; i<size; i++)
{
outptr[i] = (ptr[i] + eps) * gamma ;
}
}
return 0;
}
// new routine for layer forward function
int MyLayer::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
// check input dims, return non-zero on error
if (bottom_top_blob.c != channels)
return -1;
// x = (x + eps) * gamma_per_channel
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int size = w * h;
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);
const float gamma = gamma_data[q];
for (int i=0; i<size; i++)
{
ptr[i] = (ptr[i] + eps) * gamma ;
}
}
return 0;
}
```
# step7 integrate with ncnn library
you will probably need to modify caffe2ncnn or mxnet2ncnn etc. to write your layer-specific parameters and weights into the ncnn param and model files
the param and model file structure is documented in [param-and-model-file-structure](param-and-model-file-structure)
```
// example param file content
Input input 0 1 input
Convolution conv2d 1 1 input conv2d 0=32 1=1 2=1 3=1 4=0 5=0 6=768
MyLayer mylayer 1 1 conv2d mylayer0
Pooling maxpool 1 1 mylayer0 maxpool 0=0 1=3 2=2 3=-233 4=0
```
```cpp
ncnn::Net net;
// register custom layer before load param and model
// the layer creator function signature is always XYZ_layer_creator, which is defined by the DEFINE_LAYER_CREATOR macro
net.register_custom_layer("MyLayer", MyLayer_layer_creator);
net.load_param("model.param");
net.load_model("model.bin");
```
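Running the network that contains the custom layer then looks like any other inference; a minimal sketch continuing the snippet above, where `in` is an assumed input Mat and the blob names follow the example param file:
```cpp
ncnn::Extractor ex = net.create_extractor();
ex.input("input", in);

ncnn::Mat out;
ex.extract("mylayer0", out); // the output blob produced by MyLayer
```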


@ -0,0 +1,38 @@
# benchmark
op
# naive C with openmp
for for for
# unroll, first try
h
# register allocation
kernels
# unroll, second try
simd
# neon intrinsics
optional
# naive neon assembly with pld
asm
# pipeline optimize, first try
more register load mla
# pipeline optimize, second try
interleave load mla
# pipeline optimize, third try
loop tail
# usual practice, load/save
233
# usual practice, unroll
233
# usual practice, save register
233


@ -0,0 +1,311 @@
# implement elementwise addition with/without broadcast using BinaryOp operation
* input must be fp32 storage without packing
* output is expected to be fp32 storage without packing
```cpp
void binary_add(const ncnn::Mat& a, const ncnn::Mat& b, ncnn::Mat& c)
{
ncnn::Option opt;
opt.num_threads = 2;
opt.use_fp16_storage = false;
opt.use_packing_layout = false;
ncnn::Layer* op = ncnn::create_layer("BinaryOp");
// set param
ncnn::ParamDict pd;
pd.set(0, 0);// op_type
op->load_param(pd);
op->create_pipeline(opt);
// forward
std::vector<ncnn::Mat> bottoms(2);
bottoms[0] = a;
bottoms[1] = b;
std::vector<ncnn::Mat> tops(1);
op->forward(bottoms, tops, opt);
c = tops[0];
op->destroy_pipeline(opt);
delete op;
}
```
# implement 3x3 box blur on three channel image using ConvolutionDepthWise operation
* input must be fp32 storage without packing
* output is expected to be fp32 storage without packing
```cpp
void convolution_3x3_boxblur_RGB(const ncnn::Mat& rgb, ncnn::Mat& out)
{
ncnn::Option opt;
opt.num_threads = 2;
opt.use_fp16_storage = false;
opt.use_packing_layout = false;
ncnn::Layer* op = ncnn::create_layer("ConvolutionDepthWise");
// set param
ncnn::ParamDict pd;
pd.set(0, 3);// num_output
pd.set(1, 3);// kernel_w
pd.set(5, 0);// bias_term
pd.set(6, 3*3*3);// weight_data_size
pd.set(7, 3);// group
op->load_param(pd);
// set weights
ncnn::Mat weights[1];
weights[0].create(3*3*3);// weight_data
for (int i=0; i<3*3*3; i++)
{
weights[0][i] = 1.f / 9;
}
op->load_model(ncnn::ModelBinFromMatArray(weights));
op->create_pipeline(opt);
// forward
op->forward(rgb, out, opt);
op->destroy_pipeline(opt);
delete op;
}
```
# transpose Mat, chw to cwh
* input must be fp32 storage with/without packing
* output is expected to be fp32 storage packed
```cpp
void transpose(const ncnn::Mat& in, ncnn::Mat& out)
{
ncnn::Option opt;
opt.num_threads = 2;
opt.use_fp16_storage = false;
opt.use_packing_layout = true;
ncnn::Layer* op = ncnn::create_layer("Permute");
// set param
ncnn::ParamDict pd;
pd.set(0, 1);// order_type
op->load_param(pd);
op->create_pipeline(opt);
ncnn::Mat in_packed = in;
{
// resolve dst_elempack
int dims = in.dims;
int elemcount = 0;
if (dims == 1) elemcount = in.elempack * in.w;
if (dims == 2) elemcount = in.elempack * in.h;
if (dims == 3) elemcount = in.elempack * in.c;
int dst_elempack = 1;
if (op->support_packing)
{
if (elemcount % 8 == 0 && (ncnn::cpu_support_x86_avx2() || ncnn::cpu_support_x86_avx()))
dst_elempack = 8;
else if (elemcount % 4 == 0)
dst_elempack = 4;
}
if (in.elempack != dst_elempack)
{
convert_packing(in, in_packed, dst_elempack, opt);
}
}
// forward
op->forward(in_packed, out, opt);
op->destroy_pipeline(opt);
delete op;
}
```
# apply instance normalization
`x = (x - mean) / sqrt(var)`
* input can be fp32/fp16 storage with/without packing
* output is expected to be fp16 storage packed when supported, or fp32 storage packed otherwise
```cpp
void normalize(const ncnn::Mat& in, ncnn::Mat& out)
{
ncnn::Option opt;
opt.num_threads = 2;
opt.use_fp16_storage = true;
opt.use_packing_layout = true;
ncnn::Layer* op = ncnn::create_layer("InstanceNorm");
// set param
ncnn::ParamDict pd;
pd.set(0, in.c);// channels
pd.set(1, 0.f);// eps
op->load_param(pd);
// set weights
ncnn::Mat weights[2];
weights[0].create(in.c);// gamma_data
weights[1].create(in.c);// beta_data
weights[0].fill(1.f);
weights[1].fill(0.f);
op->load_model(ncnn::ModelBinFromMatArray(weights));
op->create_pipeline(opt);
ncnn::Mat in_fp16 = in;
if (in.elembits() == 32 && op->support_fp16_storage)
{
cast_float32_to_float16(in, in_fp16, opt);
}
if (in.elembits() == 16 && !op->support_fp16_storage)
{
cast_float16_to_float32(in, in_fp16, opt);
}
ncnn::Mat in_fp16_packed = in_fp16;
{
// resolve dst_elempack
int dims = in_fp16.dims;
int elemcount = 0;
if (dims == 1) elemcount = in_fp16.elempack * in_fp16.w;
if (dims == 2) elemcount = in_fp16.elempack * in_fp16.h;
if (dims == 3) elemcount = in_fp16.elempack * in_fp16.c;
int dst_elempack = 1;
if (op->support_packing)
{
if (elemcount % 8 == 0 && (ncnn::cpu_support_x86_avx2() || ncnn::cpu_support_x86_avx()))
dst_elempack = 8;
else if (elemcount % 4 == 0)
dst_elempack = 4;
}
if (in_fp16.elempack != dst_elempack)
{
convert_packing(in_fp16, in_fp16_packed, dst_elempack, opt);
}
}
// forward
op->forward(in_fp16_packed, out, opt);
op->destroy_pipeline(opt);
delete op;
}
```
# cpu -> gpu -> forward -> gpu -> cpu
```cpp
ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();
ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();
ncnn::VkWeightAllocator* weight_vkallocator = new ncnn::VkWeightAllocator(vkdev);
ncnn::VkWeightStagingAllocator* weight_staging_vkallocator = new ncnn::VkWeightStagingAllocator(vkdev);
// create layer
ncnn::Layer* convolution = ncnn::create_layer("Convolution");
convolution->vkdev = vkdev;
// set option
ncnn::Option opt;
opt.num_threads = 4;
opt.use_vulkan_compute = true;
opt.blob_vkallocator = blob_vkallocator;
opt.workspace_vkallocator = blob_vkallocator;
opt.staging_vkallocator = staging_vkallocator;
// load param
{
ncnn::ParamDict pd;
pd.set(0, outch);
pd.set(1, ksize);
pd.set(6, outch*inch*ksize*ksize);
pd.use_vulkan_compute = 1;
convolution->load_param(pd);
}
// load model
{
ncnn::Mat weights[2];
weights[0] = random_mat(outch*inch*ksize*ksize);
weights[1] = random_mat(outch);
ncnn::ModelBinFromMatArray mb(weights);
convolution->load_model(mb);
}
// create pipeline
convolution->create_pipeline(opt);
// upload model
{
ncnn::VkTransfer cmd(vkdev);
ncnn::Option opt_upload = opt;
opt_upload.blob_vkallocator = weight_vkallocator;
opt_upload.workspace_vkallocator = weight_vkallocator;
opt_upload.staging_vkallocator = weight_staging_vkallocator;
convolution->upload_model(cmd, opt_upload);
cmd.submit_and_wait();
}
ncnn::Mat bottom = random_mat(w, h, inch);
ncnn::Mat top;
// forward
{
ncnn::VkCompute cmd(vkdev);
ncnn::VkMat bottom_gpu;
cmd.record_upload(bottom, bottom_gpu, opt);
ncnn::VkMat top_gpu;
convolution->forward(bottom_gpu, top_gpu, cmd, opt);
cmd.record_download(top_gpu, top, opt);
cmd.submit_and_wait();
}
convolution->destroy_pipeline(opt);
delete convolution;
vkdev->reclaim_blob_allocator(blob_vkallocator);
vkdev->reclaim_staging_allocator(staging_vkallocator);
weight_vkallocator->clear();
weight_staging_vkallocator->clear();
delete weight_vkallocator;
delete weight_staging_vkallocator;
```


@ -0,0 +1,46 @@
### blob memory is implicitly shared
ncnn blobs initially used opencv cv::Mat directly; since a blob needs at most three dimensions, a similar Mat class was implemented instead
The data of a Mat is 16-byte aligned per channel and has an atomic reference count; a=b does not copy data and is extremely fast
A Mat can directly reference an external memory block without copying, which speeds up model loading and input/output
For example, the split layer duplicates one blob into n blobs; in ncnn this is implemented as a simple reference count increase, with no data copy at all
### compute only part of the graph and keep intermediate results
When resolving branch dependencies, ncnn Net works top-down and depth-first, so when the network has multiple branches, computation only happens in the branch whose result is requested, saving time
When several branches overlap, computing one branch automatically keeps the intermediate results needed by the other branches (implicitly shared), so they can be reused when the remaining branches are computed
For example, for a network A -> B -> C1 + C2, when C1 is requested from ncnn the computation is A -> B -> C1, and the reference count of B's result is increased by 1 so it is kept automatically; if the C2 result is needed later, computing only C2 is enough
### enable light mode to save memory
Every layer produces blobs; apart from the final result and the intermediate results of multiple branches, most blobs are not worth keeping. Light mode reclaims them automatically after computation, saving memory
For example, for a network A -> B -> C, when C is requested in light mode, A's result is reclaimed automatically while B is computed, and B's result is reclaimed while C is computed; only the C result is kept in the end, and asking for C again returns it directly. This satisfies the usage pattern of the vast majority of deep networks
### the network and the computation are separated
The ncnn Net is the network model, while the actual computation uses an Extractor; the same Net can have many computation instances that do not affect each other, with intermediate results kept inside each Extractor. In multi-threaded use, the network structure and parameter data are shared, so the model and parameters only need to be initialized once
For example, a global static Net instance, initialized once, can keep producing Extractors for use
### openmp is fast but not always appropriate
Almost all computation in ncnn can be accelerated with openmp multi-threading, and the performance is great
However, the system sometimes slows down suddenly, e.g. the phone overheats and throttles or the UI is busy, so ncnn latency occasionally jitters upward. When latency stability matters, consider disabling openmp or setting the Extractor thread count
For example, when ncnn is used for real-time face localization while taking a selfie, a sudden latency spike feels like dropped frames, whereas a stable frame rate gives a better experience
### NCNN_STDIO/NCNN_STRING: disabling model file loading
ncnn supports loading models both from files and from memory. NCNN_STDIO controls whether loading model files is supported; setting it to 0 disables that code and shrinks the library. Setting NCNN_STRING to 0 removes most visible strings and the text parsing code
When loading a model from memory, the parameter data is referenced directly, which is faster; this is the usual approach on mobile phones
### trimming the built-in layer implementations of ncnn
Passing -DWITH_LAYER_xxx=OFF to cmake skips compiling the corresponding built-in layer entirely, which further reduces the library size
### about ARM big.LITTLE scheduling
Calling set_cpu_powersave can pin the ncnn computation threads to specific cpu cores: big cores are fast but draw more power, little cores are slower but power-efficient, and using both together heats the phone up quickly
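A minimal sketch of the knobs mentioned above, assuming a recent ncnn where Net exposes the opt member; the model file names are placeholders:
```cpp
#include "net.h"
#include "cpu.h"

ncnn::Net net;
net.opt.lightmode = true;      // reclaim intermediate blobs automatically
net.opt.num_threads = 2;       // cap openmp threads for more stable latency
net.load_param("model.param"); // placeholder paths
net.load_model("model.bin");

ncnn::set_cpu_powersave(2);    // 0 = all cores, 1 = little cores only, 2 = big cores only

ncnn::Extractor ex = net.create_extractor();
ex.set_num_threads(2);         // per-extractor thread count
```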


@ -0,0 +1,194 @@
## current model load api
### Cons
#### long and awful code
#### two functions
#### must deal with float32 float16 quantized-u8
#### must deal with alignment size
```cpp
#if NCNN_STDIO
int Convolution::load_model(FILE* binfp)
{
int nread;
union
{
struct
{
unsigned char f0;
unsigned char f1;
unsigned char f2;
unsigned char f3;
};
unsigned int tag;
} flag_struct;
nread = fread(&flag_struct, sizeof(flag_struct), 1, binfp);
if (nread != 1)
{
fprintf(stderr, "Convolution read flag_struct failed %d\n", nread);
return -1;
}
unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
weight_data.create(weight_data_size);
if (weight_data.empty())
return -100;
if (flag_struct.tag == 0x01306B47)
{
// half-precision weight data
int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned short), 4);
std::vector<unsigned short> float16_weights;
float16_weights.resize(align_weight_data_size);
nread = fread(float16_weights.data(), align_weight_data_size, 1, binfp);
if (nread != 1)
{
fprintf(stderr, "Convolution read float16_weights failed %d\n", nread);
return -1;
}
weight_data = Mat::from_float16(float16_weights.data(), weight_data_size);
if (weight_data.empty())
return -100;
}
else if (flag != 0)
{
// quantized weight data
float quantization_value[256];
nread = fread(quantization_value, 256 * sizeof(float), 1, binfp);
if (nread != 1)
{
fprintf(stderr, "Convolution read quantization_value failed %d\n", nread);
return -1;
}
int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned char), 4);
std::vector<unsigned char> index_array;
index_array.resize(align_weight_data_size);
nread = fread(index_array.data(), align_weight_data_size, 1, binfp);
if (nread != 1)
{
fprintf(stderr, "Convolution read index_array failed %d\n", nread);
return -1;
}
float* weight_data_ptr = weight_data;
for (int i = 0; i < weight_data_size; i++)
{
weight_data_ptr[i] = quantization_value[ index_array[i] ];
}
}
else if (flag_struct.f0 == 0)
{
// raw weight data
nread = fread(weight_data, weight_data_size * sizeof(float), 1, binfp);
if (nread != 1)
{
fprintf(stderr, "Convolution read weight_data failed %d\n", nread);
return -1;
}
}
if (bias_term)
{
bias_data.create(num_output);
if (bias_data.empty())
return -100;
nread = fread(bias_data, num_output * sizeof(float), 1, binfp);
if (nread != 1)
{
fprintf(stderr, "Convolution read bias_data failed %d\n", nread);
return -1;
}
}
return 0;
}
#endif // NCNN_STDIO
int Convolution::load_model(const unsigned char*& mem)
{
union
{
struct
{
unsigned char f0;
unsigned char f1;
unsigned char f2;
unsigned char f3;
};
unsigned int tag;
} flag_struct;
memcpy(&flag_struct, mem, sizeof(flag_struct));
mem += sizeof(flag_struct);
unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
if (flag_struct.tag == 0x01306B47)
{
// half-precision weight data
weight_data = Mat::from_float16((unsigned short*)mem, weight_data_size);
mem += alignSize(weight_data_size * sizeof(unsigned short), 4);
if (weight_data.empty())
return -100;
}
else if (flag != 0)
{
// quantized weight data
const float* quantization_value = (const float*)mem;
mem += 256 * sizeof(float);
const unsigned char* index_array = (const unsigned char*)mem;
mem += alignSize(weight_data_size * sizeof(unsigned char), 4);
weight_data.create(weight_data_size);
if (weight_data.empty())
return -100;
float* weight_data_ptr = weight_data;
for (int i = 0; i < weight_data_size; i++)
{
weight_data_ptr[i] = quantization_value[ index_array[i] ];
}
}
else if (flag_struct.f0 == 0)
{
// raw weight data
weight_data = Mat(weight_data_size, (float*)mem);
mem += weight_data_size * sizeof(float);
}
if (bias_term)
{
bias_data = Mat(num_output, (float*)mem);
mem += num_output * sizeof(float);
}
return 0;
}
```
## new model load api proposed
### Pros
#### clean and simple api
#### element type detection
```cpp
int Convolution::load_model(const ModelBin& mb)
{
// auto detect element type
weight_data = mb.load(weight_data_size, 0);
if (weight_data.empty())
return -100;
if (bias_term)
{
// certain type specified
bias_data = mb.load(num_output, 1);
if (bias_data.empty())
return -100;
}
return 0;
}
```


@ -0,0 +1,92 @@
## current param load api
### Cons
#### long and awful code
#### three functions
#### not extensible
#### no default value
#### no variable length array
```
MyLayer mylayer 1 1 in out 100 1.250000
```
```
binary 100
binary 1.250000
```
```cpp
#if NCNN_STDIO
#if NCNN_STRING
int MyLayer::load_param(FILE* paramfp)
{
int nscan = fscanf(paramfp, "%d %f", &a, &b);
if (nscan != 2)
{
fprintf(stderr, "MyLayer load_param failed %d\n", nscan);
return -1;
}
return 0;
}
#endif // NCNN_STRING
int MyLayer::load_param_bin(FILE* paramfp)
{
fread(&a, sizeof(int), 1, paramfp);
fread(&b, sizeof(float), 1, paramfp);
return 0;
}
#endif // NCNN_STDIO
int MyLayer::load_param(const unsigned char*& mem)
{
a = *(int*)(mem);
mem += 4;
b = *(float*)(mem);
mem += 4;
return 0;
}
```
## new param load api proposed
### Pros
#### clean and simple api
#### default value
#### extensible
#### variable length array
```
7767517
MyLayer mylayer 1 1 in out 0=100 1=1.250000 -23303=5,0.1,0.2,0.4,0.8,1.0
```
```
binary 0xDD857600(magic)
binary 0
binary 100
binary 1
binary 1.250000
binary -23303
binary 5
binary 0.1
binary 0.2
binary 0.4
binary 0.8
binary 1.0
binary -233(EOP)
```
```cpp
int MyLayer::load_param(const ParamDict& pd)
{
// pd.get( param id (seq), default value );
a = pd.get(0, 100);
b = pd.get(1, 1.25f);
// get default value for c if not specified in param file
c = pd.get(2, 0.001f);
// get array
d = pd.get(3, Mat(len, array));
return 0;
}
```


@ -0,0 +1,303 @@
|operation|param id|param phase|default value|weight order|
|:---:|:---:|:---:|:---:|:---:|
|AbsVal|||
|ArgMax|0|out_max_val|0|
||1|topk|1|
|BatchNorm|0|channels|0|slope mean variance bias|
||1|eps|0.f|
|Bias|0|bias_data_size|0|
|BinaryOp|0|op_type|0|
||1|with_scalar|0|
||2|b|0.f|
|BNLL|||
|Cast|0|type_from|0|
||1|type_to|0|
|Clip|0|min|-FLT_MAX|
||1|max|FLT_MAX|
|Concat|0|axis|0|
|Convolution|0|num_output|0|weight bias|
||1|kernel_w|0|
||2|dilation_w|1|
||3|stride_w|1|
||4|pad_left|0|
||5|bias_term|0|
||6|weight_data_size|0|
||8|int8_scale_term|0|
||9|activation_type|0|
||10|activation_params|[ ]|
||11|kernel_h|kernel_w|
||12|dilation_h|dilation_w|
||13|stride_h|stride_w|
||15|pad_right|pad_left|
||14|pad_top|pad_left|
||16|pad_bottom|pad_top|
||17|impl_type|0|
||18|pad_value|0.f|
|ConvolutionDepthWise|0|num_output|0|weight bias|
||1|kernel_w|0|
||2|dilation_w|1|
||3|stride_w|1|
||4|pad_left|0|
||5|bias_term|0|
||6|weight_data_size|0|
||7|group|1|
||8|int8_scale_term|0|
||9|activation_type|0|
||10|activation_params|[ ]|
||11|kernel_h|kernel_w|
||12|dilation_h|dilation_w|
||13|stride_h|stride_w|
||15|pad_right|pad_left|
||14|pad_top|pad_left|
||16|pad_bottom|pad_top|
||18|pad_value|0.f|
|Crop|0|woffset|0|
||1|hoffset|0|
||2|coffset|0|
||3|outw|0|
||4|outh|0|
||5|outc|0|
||6|woffset2|0|
||7|hoffset2|0|
||8|coffset2|0|
||9|starts|[ ]|
||10|ends|[ ]|
||11|axes|[ ]|
|Deconvolution|0|num_output|0|weight bias|
||1|kernel_w|0|
||2|dilation_w|1|
||3|stride_w|1|
||4|pad_left|0|
||5|bias_term|0|
||6|weight_data_size|0|
||9|activation_type|0|
||10|activation_params|[ ]|
||11|kernel_h|kernel_w|
||12|dilation_h|dilation_w|
||13|stride_h|stride_w|
||15|pad_right|pad_left|
||14|pad_top|pad_left|
||16|pad_bottom|pad_top|
||18|output_pad_right|0|
||19|output_pad_bottom|output_pad_right|
||20|output_w|0|
||21|output_h|output_w|
|DeconvolutionDepthWise|0|num_output|0|weight bias|
||1|kernel_w|0|
||2|dilation_w|1|
||3|stride_w|1|
||4|pad_left|0|
||5|bias_term|0|
||6|weight_data_size|0|
||7|group|1|
||9|activation_type|0|
||10|activation_params|[ ]|
||11|kernel_h|kernel_w|
||12|dilation_h|dilation_w|
||13|stride_h|stride_w|
||15|pad_right|pad_left|
||14|pad_top|pad_left|
||16|pad_bottom|pad_top|
||18|output_pad_right|0|
||19|output_pad_bottom|output_pad_right|
||20|output_w|0|
||21|output_h|output_w|
|Dequantize|0|scale|1.f|bias|
||1|bias_term|0|
||2|bias_data_size|0|
|DetectionOutput|0|num_class|0|
||1|nms_threshold|0.05f|
||2|nms_top_k|300|
||3|keep_top_k|100|
||4|confidence_threshold|0.5f|
||5|variances[0]|0.1f|
||6|variances[1]|0.1f|
||7|variances[2]|0.2f|
||8|variances[3]|0.2f|
|Dropout|0|scale|1.f|
|Eltwise|0|op_type|0|
||1|coeffs|[ ]|
|ELU|0|alpha|0.1f|
|Embed|0|num_output|0|weight bias|
||1|input_dim|0|
||2|bias_term|0|
||3|weight_data_size|0|
|Exp|0|base|-1.f|
||1|scale|1.f|
||2|shift|0.f|
|ExpandDims|0|expand_w|0|
||1|expand_h|0|
||2|expand_c|0|
||3|axes|[ ]|
|Flatten|||
|HardSigmoid|0|alpha|0.2f||
||1|beta|0.5f|
|HardSwish|0|alpha|0.2f||
||1|beta|0.5f|
|InnerProduct|0|num_output|0|weight bias|
||1|bias_term|0|
||2|weight_data_size|0|
||8|int8_scale_term|0|
||9|activation_type|0|
||10|activation_params|[ ]|
|Input|0|w|0|
||1|h|0|
||2|c|0|
|InstanceNorm|0|channels|0|gamma bias|
||1|eps|0.001f|
|Interp|0|resize_type|0|
||1|height_scale|1.f|
||2|width_scale|1.f|
||3|output_height|0|
||4|output_width|0|
|Log|0|base|-1.f|
||1|scale|1.f|
||2|shift|0.f|
|LRN|0|region_type|0|
||1|local_size|5|
||2|alpha|1.f|
||3|beta|0.75f|
||4|bias|1.f|
|LSTM|0|num_output|0|
||1|weight_data_size|1|
||2|direction|0|
|MemoryData|0|w|0|
||1|h|0|
||2|c|0|
|Mish|||
|MVN|0|normalize_variance|0|
||1|across_channels|0|
||2|eps|0.0001f|
|Noop|||
|Normalize|0|across_spatial|0|scale|
||4|across_channel|0|
||1|channel_shared|0|
||2|eps|0.0001f|
||9|eps_mode|0|
||3|scale_data_size|0|
|Packing|0|out_packing|1|
||1|use_padding|0|
||2|cast_type_from|0|
||3|cast_type_to|0|
||4|storage_type_from|0|
||5|storage_type_to|0|
|Padding|0|top|0|per_channel_pad_data|
||1|bottom|0|
||2|left|0|
||3|right|0|
||4|type|0|
||5|value|0.f|
||6|per_channel_pad_data_size|0|
||7|front|0|
||8|behind|0|
|Permute|0|order_type|0|
|PixelShuffle|0|upscale_factor|1|
|Pooling|0|pooling_type(0: max 1: avg)|0|
||1|kernel_w|0|
||11|kernel_h|kernel_w|
||2|stride_w|1|
||12|stride_h|stride_w|
||3|pad_left|0|
||14|pad_right|pad_left|
||13|pad_top|pad_left|
||15|pad_bottom|pad_top|
||4|global_pooling|0|
||5|pad_mode|0|
|Power|0|power|1.f|
||1|scale|1.f|
||2|shift|0.f|
|PReLU|0|num_slope|0|slope|
|PriorBox|0|min_sizes|[ ]|
||1|max_sizes|[ ]|
||2|aspect_ratios|[ ]|
||3|variances[0]|0.f|
||4|variances[1]|0.f|
||5|variances[2]|0.f|
||6|variances[3]|0.f|
||7|flip|1|
||8|clip|0|
||9|image_width|0|
||10|image_height|0|
||11|step_width|-233.f|
||12|step_height|-233.f|
||13|offset|0.f|
||14|step_mmdetection|0|
||15|center_mmdetection|0|
|Proposal|0|feat_stride|16|
||1|base_size|16|
||2|pre_nms_topN|6000|
||3|after_nms_topN|300|
||4|nms_thresh|0.7f|
||5|min_size|16|
|PSROIPooling|0|pooled_width|7|
||1|pooled_height|7|
||2|spatial_scale|0.0625f|
||3|output_dim|0|
|Quantize|0|scale|1.f|
|Reduction|0|operation|0|
||1|dim|0|
||2|coeff|1.f|
||3|axes|[ ]|
||4|keepdims|0|
|ReLU|0|slope|0.f|
|Reorg|0|stride|0|
|Requantize|0|scale_in|1.f|bias|
||1|scale_out|1.f|
||2|bias_term|0|
||3|bias_data_size|0|
||4|fusion_relu|0|
|Reshape|0|w|-233|
||1|h|-233|
||2|c|-233|
||3|permute|0|
|ROIAlign|0|pooled_width|0|
||1|pooled_height|0|
||2|spatial_scale|1.f|
||3|sampling_ratio|0|
||4|aligned|0|
||5|version|0|
|ROIPooling|0|pooled_width|0|
||1|pooled_height|0|
||2|spatial_scale|1.f|
|Scale|0|scale_data_size|0|scale bias|
||1|bias_term|0|
|SELU|0|alpha|1.67326324f||
||1|lambda|1.050700987f|
|ShuffleChannel|0|group|1|
|Sigmoid|||
|Slice|0|slices|[ ]|
||1|axis|0|
|Softmax|0|axis|0|
|Split|||
|SPP|0|pooling_type|0|
||1|pyramid_height|1|
|Squeeze|0|squeeze_w|0|
||1|squeeze_h|0|
||2|squeeze_c|0|
||3|axes|[ ]|
|StatisticsPooling|0|include_stddev|0|
|Swish|||
|TanH|||
|Threshold|0|threshold|0.f|
|Tile|0|dim|0|
||1|tiles|1|
|UnaryOp|0|op_type|0|
|YoloDetectionOutput|0|num_class|20|
||1|num_box|5|
||2|confidence_threshold|0.01f|
||3|nms_threshold|0.45f|
||4|biases|[]|
|Yolov3DetectionOutput|0|num_class|20|
||1|num_box|5|
||2|confidence_threshold|0.01f|
||3|nms_threshold|0.45f|
||4|biases|[]|
||5|mask|[]|
||6|anchors_scale|[]|
|RNN|0|num_output|0|
||1|weight_data_size|0|
||2|direction|0|
|MultiHeadAttention|0|embed_dim|0|
||1|num_head|1|
||2|weight_data_size|0|

File diff suppressed because it is too large


@ -0,0 +1,64 @@
## net.param
### example
```
7767517
3 3
Input input 0 1 data 0=4 1=4 2=1
InnerProduct ip 1 1 data fc 0=10 1=1 2=80
Softmax softmax 1 1 fc prob 0=0
```
### overview
```
[magic]
```
* magic number : 7767517
```
[layer count] [blob count]
```
* layer count : count of the layer lines that follow; should be exactly the count of all layer names
* blob count : count of all blobs, usually greater than or equal to the layer count
### layer line
```
[layer type] [layer name] [input count] [output count] [input blobs] [output blobs] [layer specific params]
```
* layer type : type name, such as Convolution Softmax etc
* layer name : name of this layer, must be unique among all layer names
* input count : count of the blobs this layer needs as input
* output count : count of the blobs this layer produces as output
* input blobs : name list of all the input blob names, separated by space, must be unique among input blob names of all layers
* output blobs : name list of all the output blob names, separated by space, must be unique among output blob names of all layers
* layer specific params : key=value pair list, separated by space
### layer param
```
0=1 1=2.5 -23303=2,2.0,3.0
```
key index should be unique within each layer line; a pair can be omitted if the default value is used (for example, `-23303=2,2.0,3.0` sets array key index 3, i.e. -23300 - 3, to a 2-element float array {2.0, 3.0})
the meaning of each existing param key index can be looked up at [operation-param-weight-table](operation-param-weight-table)
* integer or float key : index 0 ~ 19
* integer value : int
* float value : float
* integer array or float array key : -23300 minus index 0 ~ 19
* integer array value : [array size],int,int,...,int
* float array value : [array size],float,float,...,float
## net.bin
```
+---------+---------+---------+---------+---------+---------+
| weight1 | weight2 | weight3 | weight4 | ....... | weightN |
+---------+---------+---------+---------+---------+---------+
^ ^ ^ ^
0x0 0x80 0x140 0x1C0
```
the model binary is the concatenation of all weight data, each weight buffer is aligned by 32bit
### weight buffer
```
[flag] (optional)
[raw data]
[padding] (optional)
```
* flag : unsigned int, little-endian, indicating the weight storage type, 0 => float32, 0x01306B47 => float16, otherwise => quantized int8, may be omitted if the layer implementation forced the storage type explicitly
* raw data : raw weight data, little-endian, float32 data or float16 data or quantized table and indexes depending on the storage type flag
* padding : padding space for 32bit alignment, may be omitted if already aligned
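A tiny sketch of the 32-bit alignment rule for one weight buffer, mirroring the alignSize(x, 4) calls in the loader code elsewhere in these docs; the helper name is mine:
```cpp
// round a raw byte count up to the next multiple of 4 bytes (32-bit alignment)
static size_t align_to_32bit(size_t size)
{
    return (size + 3) & ~(size_t)3;
}
// e.g. 57 float16 weights occupy 114 bytes and are padded to 116 bytes in net.bin
```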


@ -0,0 +1,29 @@
## Practical experience only, no theory behind it, not necessarily correct
```
prfm pldl1keep, [x0, #256]
```
* place it 0~8 instructions before the ld1 [x0]
* #256 means bring the contents at x0+256 into the L1 cache
* also applies to ldp
* (experience) writing no offset is worse than writing #128
* (experience) pldl1strm does not seem useful and is not as fast as pldl1keep
* (experience) the contents from x0 to x0+256 are brought in as well
* (experience) use #128 for 128-bit loads, #256 for 256-bit or larger loads
* (experience) avoid the order pld a, pld b, load a, load b; the prefetches may interfere with each other
* (experience) prefetching too far ahead stops helping
* (experience) best suited for sequential reads
```
prfm pldl2strm, [x0, #256]
```
* place it N instructions before the ld1 [x0], with N as large as reasonable
* #256 means bring the contents at x0+256 into the L2 cache
* also applies to ldp
* (experience) writing no offset is worse than writing #128
* (experience) pldl2strm works slightly better than pldl2keep
* (experience) the contents from x0 to x0+256 are brought in as well
* (experience) use #128 for 128-bit loads, #256 for 256-bit loads
* (experience) when reading a lot of data, issue pldl2strm twice in a row with different offsets
* (experience) do not pldl1keep the same location afterwards, it becomes slower
* (experience) best suited for preparing far-away reads ahead of time, e.g. when switching channels


@ -0,0 +1,57 @@
## batchnorm
```
Input A 0 1 A 0 0 0
MemoryData sub/y 0 1 sub/y 16 0 0
BinaryOp sub 2 1 A sub/y sub 1
MemoryData div/y 0 1 div/y 16 0 0
BinaryOp div 2 1 sub div/y div 3
MemoryData mul/y 0 1 mul/y 16 0 0
BinaryOp mul 2 1 div mul/y mul 2
MemoryData BiasAdd/bias 0 1 BiasAdd/bias 16 0 0
BinaryOp BiasAdd 2 1 mul BiasAdd/bias BiasAdd 0
```
## convolution
```
Input A 0 1 A 0 0 0
Convolution Conv2D 1 1 A Conv2D 10 3 1 1 0 0 270
MemoryData biases/read 0 1 biases/read 10 0 0
BinaryOp BiasAdd 2 1 Conv2D biases/read BiasAdd 0
```
## innerproduct
```
Input A 0 1 A 0 0 0
MemoryData biases/read 0 1 biases/read 10 0 0
InnerProduct MatMul 1 1 A MatMul 10 0 2560
BinaryOp conv6 2 1 MatMul biases/read conv6 0
```
## leakyrelu
```
Input A 0 1 A 0 0 0
Split splitncnn_0 1 2 A A_splitncnn_0 A_splitncnn_1
MemoryData mul_1/x 0 1 mul_1/x 0 0 0
BinaryOp mul_1 2 1 mul_1/x A_splitncnn_1 mul_1 2
BinaryOp leaky 2 1 mul_1 A_splitncnn_0 leaky 4
```
## prelu
```
Input A 0 1 A 0 0 0
Split splitncnn_0 1 2 A A_splitncnn_0 A_splitncnn_1
MemoryData prelu/alpha 0 1 prelu/alpha 10 0 0
ReLU prelu/Relu 1 1 A_splitncnn_1 prelu/Relu 0.000000
UnaryOp prelu/Neg 1 1 A_splitncnn_0 prelu/Neg 1
ReLU prelu/Relu_1 1 1 prelu/Neg prelu/Relu_1 0.000000
UnaryOp prelu/Neg_1 1 1 prelu/Relu_1 prelu/Neg_1 1
BinaryOp prelu/Mul 2 1 prelu/alpha prelu/Neg_1 prelu/Mul 2
BinaryOp prelu/add 2 1 prelu/Relu prelu/Mul prelu/add 0
```
## softmax
```
Input A 0 1 A 0 0 0
Split splitncnn_4 1 2 A A_splitncnn_0 A_splitncnn_1
Reduction Max 1 1 A_splitncnn_1 Max 4 -2 1.000000
BinaryOp sub 2 1 A_splitncnn_0 Max sub 1
UnaryOp Exp 1 1 sub Exp 7
Split splitncnn_5 1 2 Exp Exp_splitncnn_0 Exp_splitncnn_1
Reduction Sum 1 1 Exp_splitncnn_1 Sum 0 -2 1.000000
BinaryOp prob 2 1 Exp_splitncnn_0 Sum prob 3
```