feat: 切换后端至PaddleOCR-NCNN，切换工程为CMake

1.项目后端整体迁移至PaddleOCR-NCNN算法，已通过基本的兼容性测试 2.工程改为使用CMake组织，后续为了更好地兼容第三方库，不再提供QMake工程 3.重整权利声明文件，重整代码工程，确保最小化侵权风险 Log: 切换后端至PaddleOCR-NCNN，切换工程为CMake Change-Id: I4d5d2c5d37505a4a24b389b1a4c5d12f17bfa38c
2022-05-10 09:54:44 +08:00
parent ecdd171c6f
commit 718c41634f
10018 changed files with 3593797 additions and 186748 deletions
--- a/3rdparty/ncnn/examples/CMakeLists.txt
+++ b/3rdparty/ncnn/examples/CMakeLists.txt
@ -0,0 +1,76 @@
+# ncnn_add_example(<name>)
+# Builds the executable <name> from <name>.cpp and links it against ncnn, using
+# either the OpenCV that was found or the simpleocv fallback (USE_NCNN_SIMPLEOCV).
+macro(ncnn_add_example name)
+    add_executable(${name} ${name}.cpp)
+    if(OpenCV_FOUND)
+        target_include_directories(${name} PRIVATE ${OpenCV_INCLUDE_DIRS})
+        target_link_libraries(${name} PRIVATE ncnn ${OpenCV_LIBS})
+    elseif(NCNN_SIMPLEOCV)
+        # nothing links against an example executable, so PRIVATE matches the other target_* calls
+        target_compile_definitions(${name} PRIVATE USE_NCNN_SIMPLEOCV)
+        target_link_libraries(${name} PRIVATE ncnn)
+    endif()
+
+    # add example to a virtual project group
+    set_property(TARGET ${name} PROPERTY FOLDER "examples")
+endmacro()
+
+# The examples are only built when NCNN_PIXEL is enabled and an image backend is
+# available: either a real OpenCV or the simpleocv fallback (NCNN_SIMPLEOCV).
+if(NCNN_PIXEL)
+    if(NOT NCNN_SIMPLEOCV)
+        find_package(OpenCV QUIET COMPONENTS opencv_world)
+        # for opencv 2.4 on ubuntu 16.04, there is no opencv_world but OpenCV_FOUND will be TRUE
+        if("${OpenCV_LIBS}" STREQUAL "")
+            set(OpenCV_FOUND FALSE)
+        endif()
+        # fall back to per-module lookups, trying the larger component set first
+        if(NOT OpenCV_FOUND)
+            find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs videoio)
+        endif()
+        if(NOT OpenCV_FOUND)
+            find_package(OpenCV QUIET COMPONENTS core highgui imgproc)
+        endif()
+    endif()
+
+    if(OpenCV_FOUND OR NCNN_SIMPLEOCV)
+        if(OpenCV_FOUND)
+            message(STATUS "OpenCV library: ${OpenCV_INSTALL_PATH}")
+            message(STATUS "    version: ${OpenCV_VERSION}")
+            message(STATUS "    libraries: ${OpenCV_LIBS}")
+            message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")
+
+            # Use the variable name rather than ${...}: an empty expansion would
+            # leave "if( GREATER 3)", which is a configure-time error.
+            # Presumably OpenCV 4 and later need C++11 headers - confirm.
+            if(OpenCV_VERSION_MAJOR GREATER 3)
+                set(CMAKE_CXX_STANDARD 11)
+            endif()
+        endif()
+
+        include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src)
+        include_directories(${CMAKE_CURRENT_BINARY_DIR}/../src)
+
+        ncnn_add_example(squeezenet)
+        ncnn_add_example(squeezenet_c_api)
+        ncnn_add_example(fasterrcnn)
+        ncnn_add_example(rfcn)
+        ncnn_add_example(yolov2)
+        ncnn_add_example(yolov3)
+        ncnn_add_example(yolov5)
+        ncnn_add_example(yolov5_pnnx)
+        ncnn_add_example(yolox)
+        ncnn_add_example(mobilenetv2ssdlite)
+        ncnn_add_example(mobilenetssd)
+        ncnn_add_example(squeezenetssd)
+        ncnn_add_example(shufflenetv2)
+        ncnn_add_example(peleenetssd_seg)
+        ncnn_add_example(simplepose)
+        ncnn_add_example(retinaface)
+        ncnn_add_example(yolact)
+        ncnn_add_example(nanodet)
+        ncnn_add_example(nanodetplus_pnnx)
+        ncnn_add_example(scrfd)
+        ncnn_add_example(scrfd_crowdhuman)
+        # these examples are only built against a real OpenCV, not simpleocv
+        if(OpenCV_FOUND)
+            ncnn_add_example(yolov4)
+            ncnn_add_example(rvm)
+            ncnn_add_example(p2pnet)
+        endif()
+    else()
+        message(WARNING "OpenCV not found and NCNN_SIMPLEOCV disabled, examples won't be built")
+    endif()
+else()
+    message(WARNING "NCNN_PIXEL not enabled, examples won't be built")
+endif()
--- a/3rdparty/ncnn/examples/fasterrcnn.cpp
+++ b/3rdparty/ncnn/examples/fasterrcnn.cpp
@ -0,0 +1,358 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#include <math.h>
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+
+struct Object // one detection produced by detect_fasterrcnn
+{
+    cv::Rect_<float> rect; // bounding box (x, y, width, height), clipped to the input image
+    int label;             // class index; also used to index class_names when drawing
+    float prob;            // score of the winning class for this box
+};
+
+// Area of the overlap between the bounding boxes of a and b.
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    return (a.rect & b.rect).area();
+}
+
+// Sorts objects[left..right] (both ends inclusive) in place by descending prob.
+// Quicksort that pivots on the prob of the middle element; the two partitions
+// are placed in separate OpenMP sections.
+static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
+{
+    const float pivot = objects[(left + right) / 2].prob;
+    int lo = left;
+    int hi = right;
+
+    while (lo <= hi)
+    {
+        // skip elements that are already on the correct side of the pivot
+        for (; objects[lo].prob > pivot; lo++)
+        {
+        }
+        for (; objects[hi].prob < pivot; hi--)
+        {
+        }
+
+        if (lo > hi)
+            break;
+
+        std::swap(objects[lo], objects[hi]);
+        lo++;
+        hi--;
+    }
+
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < hi) qsort_descent_inplace(objects, left, hi);
+        }
+        #pragma omp section
+        {
+            if (lo < right) qsort_descent_inplace(objects, lo, right);
+        }
+    }
+}
+
+// Sorts the whole vector in place by descending prob; does nothing when it is empty.
+static void qsort_descent_inplace(std::vector<Object>& objects)
+{
+    if (!objects.empty())
+    {
+        qsort_descent_inplace(objects, 0, objects.size() - 1);
+    }
+}
+
+// Greedy non-maximum suppression.
+//
+// objects        candidate boxes, visited in index order; the caller in this file
+//                sorts them by descending prob first.
+// picked         output; cleared, then filled with the indices of the kept boxes.
+// nms_threshold  a box is dropped when its IoU with an already kept box exceeds this.
+static void nms_sorted_bboxes(const std::vector<Object>& objects, std::vector<int>& picked, float nms_threshold)
+{
+    picked.clear();
+
+    const int n = objects.size();
+
+    // box areas are reused for every pair, so compute them once
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+    {
+        areas[i] = objects[i].rect.area();
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = objects[i];
+
+        bool keep = true;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const int k = picked[j];
+
+            // intersection over union
+            float inter_area = intersection_area(a, objects[k]);
+            float union_area = areas[i] + areas[k] - inter_area;
+            if (inter_area / union_area > nms_threshold)
+            {
+                // one overlapping kept box is enough, no need to test the rest
+                keep = false;
+                break;
+            }
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+// Runs the ZF Faster R-CNN model on an image and collects the detections.
+//
+// bgr      input image; its pixel data is handed to ncnn as PIXEL_BGR.
+// objects  output; cleared, then filled with the per-class NMS survivors, sorted by
+//          descending prob and truncated to max_per_image entries.
+// returns  0 on success, -1 when the param or model file cannot be loaded.
+static int detect_fasterrcnn(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    objects.clear();
+
+    ncnn::Net fasterrcnn;
+
+    fasterrcnn.opt.use_vulkan_compute = true;
+
+    // original pretrained model from https://github.com/rbgirshick/py-faster-rcnn
+    // py-faster-rcnn/models/pascal_voc/ZF/faster_rcnn_alt_opt/faster_rcnn_test.pt
+    // https://dl.dropboxusercontent.com/s/o6ii098bu51d139/faster_rcnn_models.tgz?dl=0
+    // ZF_faster_rcnn_final.caffemodel
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (fasterrcnn.load_param("ZF_faster_rcnn_final.param") != 0)
+    {
+        fprintf(stderr, "load_param %s failed\n", "ZF_faster_rcnn_final.param");
+        return -1;
+    }
+    if (fasterrcnn.load_model("ZF_faster_rcnn_final.bin") != 0)
+    {
+        fprintf(stderr, "load_model %s failed\n", "ZF_faster_rcnn_final.bin");
+        return -1;
+    }
+
+    // hyper parameters taken from
+    // py-faster-rcnn/lib/fast_rcnn/config.py
+    // py-faster-rcnn/lib/fast_rcnn/test.py
+    const int target_size = 600; // __C.TEST.SCALES
+
+    const int max_per_image = 100;
+    const float confidence_thresh = 0.05f;
+
+    const float nms_threshold = 0.3f; // __C.TEST.NMS
+
+    // scale to target detect size: the shorter side becomes target_size
+    int w = bgr.cols;
+    int h = bgr.rows;
+    float scale = 1.f;
+    if (w < h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, w, h);
+
+    const float mean_vals[3] = {102.9801f, 115.9465f, 122.7717f};
+    in.substract_mean_normalize(mean_vals, 0);
+
+    // resized height, resized width and the resize factor
+    ncnn::Mat im_info(3);
+    im_info[0] = h;
+    im_info[1] = w;
+    im_info[2] = scale;
+
+    // step1, extract feature and all rois
+    ncnn::Extractor ex1 = fasterrcnn.create_extractor();
+
+    ex1.input("data", in);
+    ex1.input("im_info", im_info);
+
+    ncnn::Mat conv5_relu5; // feature
+    ncnn::Mat rois;        // all rois
+    ex1.extract("conv5_relu5", conv5_relu5);
+    ex1.extract("rois", rois);
+
+    // step2, extract bbox and score for each roi
+    std::vector<std::vector<Object> > class_candidates;
+    for (int i = 0; i < rois.c; i++)
+    {
+        ncnn::Extractor ex2 = fasterrcnn.create_extractor();
+
+        ncnn::Mat roi = rois.channel(i); // get single roi
+        ex2.input("conv5_relu5", conv5_relu5);
+        ex2.input("rois", roi);
+
+        ncnn::Mat bbox_pred;
+        ncnn::Mat cls_prob;
+        ex2.extract("bbox_pred", bbox_pred);
+        ex2.extract("cls_prob", cls_prob);
+
+        int num_class = cls_prob.w;
+        class_candidates.resize(num_class);
+
+        // find class id with highest score
+        int label = 0;
+        float score = 0.f;
+        for (int k = 0; k < num_class; k++)
+        {
+            float class_score = cls_prob[k];
+            if (class_score > score)
+            {
+                label = k;
+                score = class_score;
+            }
+        }
+
+        // ignore background or low score
+        if (label == 0 || score <= confidence_thresh)
+            continue;
+
+        //         fprintf(stderr, "%d = %f\n", label, score);
+
+        // unscale to image size
+        float x1 = roi[0] / scale;
+        float y1 = roi[1] / scale;
+        float x2 = roi[2] / scale;
+        float y2 = roi[3] / scale;
+
+        float pb_w = x2 - x1 + 1;
+        float pb_h = y2 - y1 + 1;
+
+        // apply bbox regression, using the four deltas that belong to the winning class
+        float dx = bbox_pred[label * 4];
+        float dy = bbox_pred[label * 4 + 1];
+        float dw = bbox_pred[label * 4 + 2];
+        float dh = bbox_pred[label * 4 + 3];
+
+        float cx = x1 + pb_w * 0.5f;
+        float cy = y1 + pb_h * 0.5f;
+
+        float obj_cx = cx + pb_w * dx;
+        float obj_cy = cy + pb_h * dy;
+
+        float obj_w = pb_w * exp(dw);
+        float obj_h = pb_h * exp(dh);
+
+        float obj_x1 = obj_cx - obj_w * 0.5f;
+        float obj_y1 = obj_cy - obj_h * 0.5f;
+        float obj_x2 = obj_cx + obj_w * 0.5f;
+        float obj_y2 = obj_cy + obj_h * 0.5f;
+
+        // clip
+        obj_x1 = std::max(std::min(obj_x1, (float)(bgr.cols - 1)), 0.f);
+        obj_y1 = std::max(std::min(obj_y1, (float)(bgr.rows - 1)), 0.f);
+        obj_x2 = std::max(std::min(obj_x2, (float)(bgr.cols - 1)), 0.f);
+        obj_y2 = std::max(std::min(obj_y2, (float)(bgr.rows - 1)), 0.f);
+
+        // append object
+        Object obj;
+        obj.rect = cv::Rect_<float>(obj_x1, obj_y1, obj_x2 - obj_x1 + 1, obj_y2 - obj_y1 + 1);
+        obj.label = label;
+        obj.prob = score;
+
+        class_candidates[label].push_back(obj);
+    }
+
+    // post process: NMS is applied per class, then the survivors are merged
+    for (int i = 0; i < (int)class_candidates.size(); i++)
+    {
+        std::vector<Object>& candidates = class_candidates[i];
+
+        qsort_descent_inplace(candidates);
+
+        std::vector<int> picked;
+        nms_sorted_bboxes(candidates, picked, nms_threshold);
+
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            int z = picked[j];
+            objects.push_back(candidates[z]);
+        }
+    }
+
+    qsort_descent_inplace(objects);
+
+    // keep only the best max_per_image detections
+    if (max_per_image > 0 && (int)objects.size() > max_per_image)
+    {
+        objects.resize(max_per_image);
+    }
+
+    return 0;
+}
+
+// Logs every detection to stderr, draws its box and a "<class> <percent>%" label on a
+// copy of bgr, then shows the copy in a window and waits for a key press.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {"background",
+                                        "aeroplane", "bicycle", "bird", "boat",
+                                        "bottle", "bus", "car", "cat", "chair",
+                                        "cow", "diningtable", "dog", "horse",
+                                        "motorbike", "person", "pottedplant",
+                                        "sheep", "sofa", "train", "tvmonitor"
+                                       };
+    const int num_class_names = (int)(sizeof(class_names) / sizeof(class_names[0]));
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        // the label originates from network output, so never index the table unchecked
+        const char* class_name = "unknown";
+        if (obj.label >= 0 && obj.label < num_class_names)
+            class_name = class_names[obj.label];
+
+        char text[256];
+        snprintf(text, sizeof(text), "%s %.1f%%", class_name, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // place the label above the box, pulled back inside the top and right image edges
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Usage: <program> imagepath
+// Reads the image, runs the Faster R-CNN detector on it and displays the result.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* image_path = argv[1];
+
+    // flag 1 is passed through to cv::imread unchanged
+    cv::Mat input = cv::imread(image_path, 1);
+    if (input.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_path);
+        return -1;
+    }
+
+    std::vector<Object> detections;
+    detect_fasterrcnn(input, detections);
+    draw_objects(input, detections);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/mobilenetssd.cpp
+++ b/3rdparty/ncnn/examples/mobilenetssd.cpp
@ -0,0 +1,152 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+struct Object // one row of the detection_out blob, converted by detect_mobilenet
+{
+    cv::Rect_<float> rect; // bounding box (x, y, width, height) scaled by the input image size
+    int label;             // class index; also used to index class_names when drawing
+    float prob;            // score reported by the network for this box
+};
+
+// Runs MobileNet-SSD on an image and converts every row of "detection_out" to an Object.
+//
+// bgr      input image; its pixel data is handed to ncnn as PIXEL_BGR.
+// objects  output; cleared, then filled with one entry per output row.
+// returns  0 on success, -1 when the param or model file cannot be loaded.
+static int detect_mobilenet(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    objects.clear();
+
+    ncnn::Net mobilenet;
+
+    mobilenet.opt.use_vulkan_compute = true;
+
+    // model is converted from https://github.com/chuanqi305/MobileNet-SSD
+    // and can be downloaded from https://drive.google.com/open?id=0ByaKLD9QaPtucWk0Y0dha1VVY0U
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (mobilenet.load_param("mobilenet_ssd_voc_ncnn.param") != 0)
+    {
+        fprintf(stderr, "load_param %s failed\n", "mobilenet_ssd_voc_ncnn.param");
+        return -1;
+    }
+    if (mobilenet.load_model("mobilenet_ssd_voc_ncnn.bin") != 0)
+    {
+        fprintf(stderr, "load_model %s failed\n", "mobilenet_ssd_voc_ncnn.bin");
+        return -1;
+    }
+
+    const int target_size = 300;
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, target_size, target_size);
+
+    const float mean_vals[3] = {127.5f, 127.5f, 127.5f};
+    const float norm_vals[3] = {1.0 / 127.5, 1.0 / 127.5, 1.0 / 127.5};
+    in.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = mobilenet.create_extractor();
+
+    ex.input("data", in);
+
+    ncnn::Mat out;
+    ex.extract("detection_out", out);
+
+    //     printf("%d %d %d\n", out.w, out.h, out.c);
+    for (int i = 0; i < out.h; i++)
+    {
+        // row layout as read below: label, prob, then two corner points that are
+        // multiplied by the image size (presumably normalized to [0, 1] - confirm)
+        const float* values = out.row(i);
+
+        Object object;
+        object.label = values[0];
+        object.prob = values[1];
+        object.rect.x = values[2] * img_w;
+        object.rect.y = values[3] * img_h;
+        object.rect.width = values[4] * img_w - object.rect.x;
+        object.rect.height = values[5] * img_h - object.rect.y;
+
+        objects.push_back(object);
+    }
+
+    return 0;
+}
+
+// Logs every detection to stderr, draws its box and a "<class> <percent>%" label on a
+// copy of bgr, then shows the copy in a window and waits for a key press.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {"background",
+                                        "aeroplane", "bicycle", "bird", "boat",
+                                        "bottle", "bus", "car", "cat", "chair",
+                                        "cow", "diningtable", "dog", "horse",
+                                        "motorbike", "person", "pottedplant",
+                                        "sheep", "sofa", "train", "tvmonitor"
+                                       };
+    const int num_class_names = (int)(sizeof(class_names) / sizeof(class_names[0]));
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        // the label is read straight from network output, so never index the table unchecked
+        const char* class_name = "unknown";
+        if (obj.label >= 0 && obj.label < num_class_names)
+            class_name = class_names[obj.label];
+
+        char text[256];
+        snprintf(text, sizeof(text), "%s %.1f%%", class_name, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // place the label above the box, pulled back inside the top and right image edges
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Usage: <program> imagepath
+// Reads the image, runs the MobileNet-SSD detector on it and displays the result.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* image_path = argv[1];
+
+    // flag 1 is passed through to cv::imread unchanged
+    cv::Mat input = cv::imread(image_path, 1);
+    if (input.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_path);
+        return -1;
+    }
+
+    std::vector<Object> detections;
+    detect_mobilenet(input, detections);
+    draw_objects(input, detections);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/mobilenetv2ssdlite.cpp
+++ b/3rdparty/ncnn/examples/mobilenetv2ssdlite.cpp
@ -0,0 +1,159 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+class Noop : public ncnn::Layer // layer subclass that overrides nothing; registered for "Silence" in detect_mobilenetv2
+{
+};
+DEFINE_LAYER_CREATOR(Noop) // presumably defines Noop_layer_creator, which detect_mobilenetv2 registers - confirm against ncnn's layer.h
+
+struct Object // one row of the detection_out blob, converted by detect_mobilenetv2
+{
+    cv::Rect_<float> rect; // bounding box (x, y, width, height) scaled by the input image size
+    int label;             // class index; also used to index class_names when drawing
+    float prob;            // score reported by the network for this box
+};
+
+// Runs MobileNetV2-SSDLite on an image and converts every row of "detection_out" to an Object.
+//
+// bgr      input image; its pixel data is handed to ncnn as PIXEL_BGR.
+// objects  output; cleared, then filled with one entry per output row.
+// returns  0 on success, -1 when the param or model file cannot be loaded.
+static int detect_mobilenetv2(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    objects.clear();
+
+    ncnn::Net mobilenetv2;
+
+    mobilenetv2.opt.use_vulkan_compute = true;
+
+    // "Silence" layers in the param file are handled by the empty Noop layer
+    mobilenetv2.register_custom_layer("Silence", Noop_layer_creator);
+
+    // original pretrained model from https://github.com/chuanqi305/MobileNetv2-SSDLite
+    // https://github.com/chuanqi305/MobileNetv2-SSDLite/blob/master/ssdlite/voc/deploy.prototxt
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (mobilenetv2.load_param("mobilenetv2_ssdlite_voc.param") != 0)
+    {
+        fprintf(stderr, "load_param %s failed\n", "mobilenetv2_ssdlite_voc.param");
+        return -1;
+    }
+    if (mobilenetv2.load_model("mobilenetv2_ssdlite_voc.bin") != 0)
+    {
+        fprintf(stderr, "load_model %s failed\n", "mobilenetv2_ssdlite_voc.bin");
+        return -1;
+    }
+
+    const int target_size = 300;
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, target_size, target_size);
+
+    const float mean_vals[3] = {127.5f, 127.5f, 127.5f};
+    const float norm_vals[3] = {1.0 / 127.5, 1.0 / 127.5, 1.0 / 127.5};
+    in.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = mobilenetv2.create_extractor();
+
+    ex.input("data", in);
+
+    ncnn::Mat out;
+    ex.extract("detection_out", out);
+
+    //     printf("%d %d %d\n", out.w, out.h, out.c);
+    for (int i = 0; i < out.h; i++)
+    {
+        // row layout as read below: label, prob, then two corner points that are
+        // multiplied by the image size (presumably normalized to [0, 1] - confirm)
+        const float* values = out.row(i);
+
+        Object object;
+        object.label = values[0];
+        object.prob = values[1];
+        object.rect.x = values[2] * img_w;
+        object.rect.y = values[3] * img_h;
+        object.rect.width = values[4] * img_w - object.rect.x;
+        object.rect.height = values[5] * img_h - object.rect.y;
+
+        objects.push_back(object);
+    }
+
+    return 0;
+}
+
+// Logs every detection to stderr, draws its box and a "<class> <percent>%" label on a
+// copy of bgr, then shows the copy in a window and waits for a key press.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {"background",
+                                        "aeroplane", "bicycle", "bird", "boat",
+                                        "bottle", "bus", "car", "cat", "chair",
+                                        "cow", "diningtable", "dog", "horse",
+                                        "motorbike", "person", "pottedplant",
+                                        "sheep", "sofa", "train", "tvmonitor"
+                                       };
+    const int num_class_names = (int)(sizeof(class_names) / sizeof(class_names[0]));
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        // the label is read straight from network output, so never index the table unchecked
+        const char* class_name = "unknown";
+        if (obj.label >= 0 && obj.label < num_class_names)
+            class_name = class_names[obj.label];
+
+        char text[256];
+        snprintf(text, sizeof(text), "%s %.1f%%", class_name, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // place the label above the box, pulled back inside the top and right image edges
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Usage: <program> imagepath
+// Reads the image, runs the MobileNetV2-SSDLite detector on it and displays the result.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* image_path = argv[1];
+
+    // flag 1 is passed through to cv::imread unchanged
+    cv::Mat input = cv::imread(image_path, 1);
+    if (input.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_path);
+        return -1;
+    }
+
+    std::vector<Object> detections;
+    detect_mobilenetv2(input, detections);
+    draw_objects(input, detections);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/mobilenetv3ssdlite.cpp
+++ b/3rdparty/ncnn/examples/mobilenetv3ssdlite.cpp
@ -0,0 +1,173 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+#include "platform.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <assert.h>
+#include <stdio.h>
+#include <vector>
+#if NCNN_VULKAN
+#include "gpu.h"
+#endif // NCNN_VULKAN
+
+// Returns v limited to the closed range [lo, hi].
+// Precondition (checked with assert): hi is not less than lo.
+// The result is a reference to one of the arguments, so it must be consumed
+// before any temporary passed in goes out of scope.
+template<class T>
+static inline const T& clamp(const T& v, const T& lo, const T& hi)
+{
+    assert(!(hi < lo));
+    return v < lo ? lo : (hi < v ? hi : v);
+}
+
+struct Object // one row of the detection_out blob, converted by detect_mobilenetv3
+{
+    cv::Rect_<float> rect; // bounding box (x, y, width, height) scaled by the input image size
+    int label;             // class index; also used to index class_names when drawing
+    float prob;            // score reported by the network; draw_objects only shows boxes above 0.6
+};
+
+// Runs MobileNetV3-SSDLite on an image and converts every row of "detection_out" to an Object.
+//
+// bgr      input image; its pixel data is converted with PIXEL_BGR2RGB.
+// objects  output; cleared, then filled with one entry per output row.
+// returns  0 on success, -1 when the param or model file cannot be loaded.
+static int detect_mobilenetv3(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    objects.clear();
+
+    ncnn::Net mobilenetv3;
+
+#if NCNN_VULKAN
+    mobilenetv3.opt.use_vulkan_compute = true;
+#endif // NCNN_VULKAN
+
+    // converted ncnn model from https://github.com/ujsyehao/mobilenetv3-ssd
+    if (mobilenetv3.load_param("./mobilenetv3_ssdlite_voc.param") != 0)
+    {
+        fprintf(stderr, "load_param %s failed\n", "./mobilenetv3_ssdlite_voc.param");
+        return -1;
+    }
+    if (mobilenetv3.load_model("./mobilenetv3_ssdlite_voc.bin") != 0)
+    {
+        fprintf(stderr, "load_model %s failed\n", "./mobilenetv3_ssdlite_voc.bin");
+        return -1;
+    }
+
+    const int target_size = 300;
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, bgr.cols, bgr.rows, target_size, target_size);
+
+    const float mean_vals[3] = {123.675f, 116.28f, 103.53f};
+    const float norm_vals[3] = {1.0f, 1.0f, 1.0f};
+    in.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = mobilenetv3.create_extractor();
+
+    ex.input("input", in);
+
+    ncnn::Mat out;
+    ex.extract("detection_out", out);
+
+    //     printf("%d %d %d\n", out.w, out.h, out.c);
+    for (int i = 0; i < out.h; i++)
+    {
+        // row layout as read below: label, prob, then two corner points
+        const float* values = out.row(i);
+
+        Object object;
+        object.label = values[0];
+        object.prob = values[1];
+
+        // filter out cross-boundary: clamp in network-input pixels, then rescale to the image
+        float x1 = clamp(values[2] * target_size, 0.f, float(target_size - 1)) / target_size * img_w;
+        float y1 = clamp(values[3] * target_size, 0.f, float(target_size - 1)) / target_size * img_h;
+        float x2 = clamp(values[4] * target_size, 0.f, float(target_size - 1)) / target_size * img_w;
+        float y2 = clamp(values[5] * target_size, 0.f, float(target_size - 1)) / target_size * img_h;
+
+        object.rect.x = x1;
+        object.rect.y = y1;
+        object.rect.width = x2 - x1;
+        object.rect.height = y2 - y1;
+
+        objects.push_back(object);
+    }
+
+    return 0;
+}
+
+// For every detection whose prob exceeds 0.6: logs it to stderr and draws its box and a
+// "<class> <percent>%" label on a copy of bgr. The copy is then shown in a window
+// until a key is pressed.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {"background",
+                                        "aeroplane", "bicycle", "bird", "boat",
+                                        "bottle", "bus", "car", "cat", "chair",
+                                        "cow", "diningtable", "dog", "horse",
+                                        "motorbike", "person", "pottedplant",
+                                        "sheep", "sofa", "train", "tvmonitor"
+                                       };
+    const int num_class_names = (int)(sizeof(class_names) / sizeof(class_names[0]));
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        if (objects[i].prob > 0.6)
+        {
+            const Object& obj = objects[i];
+
+            fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                    obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+            cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+            // the label is read straight from network output, so never index the table unchecked
+            const char* class_name = "unknown";
+            if (obj.label >= 0 && obj.label < num_class_names)
+                class_name = class_names[obj.label];
+
+            char text[256];
+            snprintf(text, sizeof(text), "%s %.1f%%", class_name, obj.prob * 100);
+
+            int baseLine = 0;
+            cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+            // place the label above the box, pulled back inside the top and right image edges
+            int x = obj.rect.x;
+            int y = obj.rect.y - label_size.height - baseLine;
+            if (y < 0)
+                y = 0;
+            if (x + label_size.width > image.cols)
+                x = image.cols - label_size.width;
+
+            cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                          cv::Scalar(255, 255, 255), -1);
+
+            cv::putText(image, text, cv::Point(x, y + label_size.height),
+                        cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+        }
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Usage: <program> imagepath
+// Reads the image, runs the MobileNetV3-SSDLite detector on it and displays the result.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* image_path = argv[1];
+
+    // flag 1 is passed through to cv::imread unchanged
+    cv::Mat input = cv::imread(image_path, 1);
+    if (input.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_path);
+        return -1;
+    }
+
+    std::vector<Object> detections;
+    detect_mobilenetv3(input, detections);
+    draw_objects(input, detections);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/nanodet.cpp
+++ b/3rdparty/ncnn/examples/nanodet.cpp
@ -0,0 +1,420 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdlib.h>
+#include <float.h>
+#include <stdio.h>
+#include <vector>
+
+// One detection produced by the nanodet post-processing in this file.
+struct Object
+{
+    cv::Rect_<float> rect; // bounding box as x, y, width, height
+    int label;             // index of the best-scoring class; used to index class_names when drawing
+    float prob;            // score of that class
+};
+
+// Returns the area of the overlap between the bounding boxes of a and b.
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    return (a.rect & b.rect).area();
+}
+
+// Quicksort of faceobjects[left..right] (both bounds inclusive) by descending prob.
+// The range is partitioned around the score of its middle element and the two
+// resulting sub-ranges are sorted recursively, each inside its own OpenMP section.
+static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
+{
+    int lo = left;
+    int hi = right;
+    const float pivot = faceobjects[(left + right) / 2].prob;
+
+    while (lo <= hi)
+    {
+        // advance past elements that already sit on the correct side of the pivot
+        for (; faceobjects[lo].prob > pivot; lo++)
+        {
+        }
+        for (; faceobjects[hi].prob < pivot; hi--)
+        {
+        }
+
+        if (lo > hi)
+            break;
+
+        std::swap(faceobjects[lo], faceobjects[hi]);
+        lo++;
+        hi--;
+    }
+
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < hi) qsort_descent_inplace(faceobjects, left, hi);
+        }
+        #pragma omp section
+        {
+            if (lo < right) qsort_descent_inplace(faceobjects, lo, right);
+        }
+    }
+}
+
+// Sorts the whole vector in place by descending prob; an empty vector is left alone.
+static void qsort_descent_inplace(std::vector<Object>& faceobjects)
+{
+    if (!faceobjects.empty())
+    {
+        const int last = faceobjects.size() - 1;
+        qsort_descent_inplace(faceobjects, 0, last);
+    }
+}
+
+// Greedy non-maximum suppression.
+//
+// faceobjects   candidates, expected to be sorted by descending score already
+//               (candidates are visited in index order and earlier ones win)
+// picked        output; cleared, then filled with the indices of the kept candidates
+// nms_threshold a candidate is dropped when its intersection-over-union with any
+//               already kept candidate is greater than this value
+static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+
+    // box areas are needed for every pair, so compute them once
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+    {
+        areas[i] = faceobjects[i].rect.width * faceobjects[i].rect.height;
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = faceobjects[i];
+
+        int keep = 1;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const Object& b = faceobjects[picked[j]];
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[picked[j]] - inter_area;
+            // float IoU = inter_area / union_area
+            if (inter_area / union_area > nms_threshold)
+            {
+                keep = 0;
+                // one overlapping kept box is enough, the remaining ones cannot change the verdict
+                break;
+            }
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+// Decodes one detection head into box proposals.
+//
+// cls_pred       class scores; row idx holds the scores of grid cell idx, one column per class
+// dis_pred       box distance distributions; row idx is viewed as 4 rows of dis_pred.w / 4
+//                bins, one row each for the left, top, right and bottom distance
+// stride         size of one grid cell in input pixels
+// in_pad         padded network input; only its w and h are read, to recover the grid shape
+// prob_threshold cells whose best class score is below this value are skipped
+// objects        output; accepted proposals are appended, existing content is kept
+//
+// NOTE: the softmax runs in place on the memory of the accepted rows of dis_pred
+// (bbox_pred is constructed on top of the row pointer), so those rows are overwritten.
+static void generate_proposals(const ncnn::Mat& cls_pred, const ncnn::Mat& dis_pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
+{
+    const int num_grid = cls_pred.h;
+
+    // The longer side of the grid follows from the padded input size, the other
+    // side from the total number of cells.
+    int num_grid_x;
+    int num_grid_y;
+    if (in_pad.w > in_pad.h)
+    {
+        num_grid_x = in_pad.w / stride;
+        if (num_grid_x <= 0)
+            return; // input narrower than one cell, nothing to decode
+        num_grid_y = num_grid / num_grid_x;
+    }
+    else
+    {
+        num_grid_y = in_pad.h / stride;
+        if (num_grid_y <= 0)
+            return; // input shorter than one cell, nothing to decode
+        num_grid_x = num_grid / num_grid_y;
+    }
+
+    const int num_class = cls_pred.w;
+    const int reg_max_1 = dis_pred.w / 4;
+
+    // The softmax layer and its options do not depend on the grid cell, so they are
+    // set up once here instead of once per accepted cell.
+    ncnn::Layer* softmax = ncnn::create_layer("Softmax");
+    if (!softmax)
+    {
+        fprintf(stderr, "create_layer Softmax failed\n");
+        return;
+    }
+
+    ncnn::ParamDict pd;
+    pd.set(0, 1); // axis
+    pd.set(1, 1);
+    softmax->load_param(pd);
+
+    ncnn::Option opt;
+    opt.num_threads = 1;
+    opt.use_packing_layout = false;
+
+    softmax->create_pipeline(opt);
+
+    for (int i = 0; i < num_grid_y; i++)
+    {
+        for (int j = 0; j < num_grid_x; j++)
+        {
+            const int idx = i * num_grid_x + j;
+
+            const float* scores = cls_pred.row(idx);
+
+            // find label with max score
+            int label = -1;
+            float score = -FLT_MAX;
+            for (int k = 0; k < num_class; k++)
+            {
+                if (scores[k] > score)
+                {
+                    label = k;
+                    score = scores[k];
+                }
+            }
+
+            if (score < prob_threshold)
+                continue;
+
+            // view the row as 4 distributions of reg_max_1 bins and normalize each of them
+            ncnn::Mat bbox_pred(reg_max_1, 4, (void*)dis_pred.row(idx));
+            softmax->forward_inplace(bbox_pred, opt);
+
+            // expected bin index of each distribution, scaled to input pixels
+            float pred_ltrb[4];
+            for (int k = 0; k < 4; k++)
+            {
+                float dis = 0.f;
+                const float* dis_after_sm = bbox_pred.row(k);
+                for (int l = 0; l < reg_max_1; l++)
+                {
+                    dis += l * dis_after_sm[l];
+                }
+
+                pred_ltrb[k] = dis * stride;
+            }
+
+            // the distances are measured from the centre of the grid cell
+            float pb_cx = (j + 0.5f) * stride;
+            float pb_cy = (i + 0.5f) * stride;
+
+            float x0 = pb_cx - pred_ltrb[0];
+            float y0 = pb_cy - pred_ltrb[1];
+            float x1 = pb_cx + pred_ltrb[2];
+            float y1 = pb_cy + pred_ltrb[3];
+
+            Object obj;
+            obj.rect.x = x0;
+            obj.rect.y = y0;
+            obj.rect.width = x1 - x0;
+            obj.rect.height = y1 - y0;
+            obj.label = label;
+            obj.prob = score;
+
+            objects.push_back(obj);
+        }
+    }
+
+    softmax->destroy_pipeline(opt);
+
+    delete softmax;
+}
+
+// Runs the nanodet model on a BGR image and returns the detections.
+//
+// bgr     input image, 3-channel BGR
+// objects output; replaced by the detections that survive non-maximum suppression,
+//         with boxes expressed in pixels of the original image
+//
+// Returns 0 on success and -1 when the model files cannot be loaded (objects is
+// emptied in that case).
+static int detect_nanodet(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    ncnn::Net nanodet;
+
+    nanodet.opt.use_vulkan_compute = true;
+    // nanodet.opt.use_bf16_storage = true;
+
+    // original pretrained model from https://github.com/RangiLyu/nanodet
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (nanodet.load_param("nanodet_m.param") != 0 || nanodet.load_model("nanodet_m.bin") != 0)
+    {
+        fprintf(stderr, "failed to load nanodet_m.param / nanodet_m.bin\n");
+        objects.clear();
+        return -1;
+    }
+
+    int width = bgr.cols;
+    int height = bgr.rows;
+
+    const int target_size = 320;
+    const float prob_threshold = 0.4f;
+    const float nms_threshold = 0.5f;
+
+    // resize so that the longer side becomes target_size, keeping the aspect ratio
+    int w = width;
+    int h = height;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, width, height, w, h);
+
+    // pad both sides up to the next multiple of 32, splitting the padding between
+    // the two borders of each axis
+    int wpad = (w + 31) / 32 * 32 - w;
+    int hpad = (h + 31) / 32 * 32 - h;
+    ncnn::Mat in_pad;
+    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 0.f);
+
+    const float mean_vals[3] = {103.53f, 116.28f, 123.675f};
+    const float norm_vals[3] = {0.017429f, 0.017507f, 0.017125f};
+    in_pad.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = nanodet.create_extractor();
+
+    ex.input("input.1", in_pad);
+
+    // one entry per detection head: cell stride and the names of its two output blobs
+    static const struct
+    {
+        int stride;
+        const char* cls_blob;
+        const char* dis_blob;
+    } heads[] = {
+        {8, "792", "795"},
+        {16, "814", "817"},
+        {32, "836", "839"},
+    };
+
+    std::vector<Object> proposals;
+    for (size_t k = 0; k < sizeof(heads) / sizeof(heads[0]); k++)
+    {
+        ncnn::Mat cls_pred;
+        ncnn::Mat dis_pred;
+        ex.extract(heads[k].cls_blob, cls_pred);
+        ex.extract(heads[k].dis_blob, dis_pred);
+
+        // generate_proposals appends, so the heads accumulate in stride order
+        generate_proposals(cls_pred, dis_pred, heads[k].stride, in_pad, prob_threshold, proposals);
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(proposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(proposals, picked, nms_threshold);
+
+    int count = picked.size();
+
+    objects.resize(count);
+    for (int i = 0; i < count; i++)
+    {
+        objects[i] = proposals[picked[i]];
+
+        // undo the padding offset and the resize to get back to original image pixels
+        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
+        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
+        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
+        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;
+
+        // clip to the image
+        x0 = std::max(std::min(x0, (float)(width - 1)), 0.f);
+        y0 = std::max(std::min(y0, (float)(height - 1)), 0.f);
+        x1 = std::max(std::min(x1, (float)(width - 1)), 0.f);
+        y1 = std::max(std::min(y1, (float)(height - 1)), 0.f);
+
+        objects[i].rect.x = x0;
+        objects[i].rect.y = y0;
+        objects[i].rect.width = x1 - x0;
+        objects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+// Draws every detection on a copy of the image and shows it in a window.
+//
+// For each object the label, score and box are printed to stderr, the box is
+// outlined and a "<class name> <score>%" caption is drawn on a white background
+// above the box (moved inside the image when it would stick out at the top or on
+// the right). The call returns after cv::waitKey(0) returns.
+//
+// A label outside the class_names table is captioned "unknown" instead of
+// reading past the end of the table.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"
+    };
+    const int num_class_names = sizeof(class_names) / sizeof(class_names[0]);
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        // the number of classes comes from the model, so the label is checked
+        // against the table before it is used as an index
+        const char* class_name = "unknown";
+        if (obj.label >= 0 && obj.label < num_class_names)
+            class_name = class_names[obj.label];
+
+        char text[256];
+        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // caption goes above the box, clamped to the top and right image borders
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Program entry point.
+// Expects exactly one argument, the path of the image to process. The image is
+// loaded, run through detect_nanodet and the detections are shown by draw_objects.
+// Returns 0 on success and -1 on bad usage or when the image cannot be read.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    // flag 1: load as a 3-channel colour image
+    cv::Mat image = cv::imread(argv[1], 1);
+    if (image.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", argv[1]);
+        return -1;
+    }
+
+    std::vector<Object> detections;
+    detect_nanodet(image, detections);
+    draw_objects(image, detections);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/nanodetplus_pnnx.cpp
+++ b/3rdparty/ncnn/examples/nanodetplus_pnnx.cpp
@ -0,0 +1,426 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdlib.h>
+#include <float.h>
+#include <stdio.h>
+#include <vector>
+
+// One detection produced by the nanodet-plus post-processing in this file.
+struct Object
+{
+    cv::Rect_<float> rect; // bounding box as x, y, width, height
+    int label;             // index of the best-scoring class; used to index class_names when drawing
+    float prob;            // sigmoid of that class's raw score
+};
+
+// Returns the area of the overlap between the bounding boxes of a and b.
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    return (a.rect & b.rect).area();
+}
+
+// Quicksort of faceobjects[left..right] (both bounds inclusive) by descending prob.
+// The range is partitioned around the score of its middle element and the two
+// resulting sub-ranges are sorted recursively, each inside its own OpenMP section.
+static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
+{
+    int lo = left;
+    int hi = right;
+    const float pivot = faceobjects[(left + right) / 2].prob;
+
+    while (lo <= hi)
+    {
+        // advance past elements that already sit on the correct side of the pivot
+        for (; faceobjects[lo].prob > pivot; lo++)
+        {
+        }
+        for (; faceobjects[hi].prob < pivot; hi--)
+        {
+        }
+
+        if (lo > hi)
+            break;
+
+        std::swap(faceobjects[lo], faceobjects[hi]);
+        lo++;
+        hi--;
+    }
+
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < hi) qsort_descent_inplace(faceobjects, left, hi);
+        }
+        #pragma omp section
+        {
+            if (lo < right) qsort_descent_inplace(faceobjects, lo, right);
+        }
+    }
+}
+
+// Sorts the whole vector in place by descending prob; an empty vector is left alone.
+static void qsort_descent_inplace(std::vector<Object>& faceobjects)
+{
+    if (!faceobjects.empty())
+    {
+        const int last = faceobjects.size() - 1;
+        qsort_descent_inplace(faceobjects, 0, last);
+    }
+}
+
+// Greedy non-maximum suppression.
+//
+// faceobjects   candidates, expected to be sorted by descending score already
+//               (candidates are visited in index order and earlier ones win)
+// picked        output; cleared, then filled with the indices of the kept candidates
+// nms_threshold a candidate is dropped when its intersection-over-union with any
+//               already kept candidate is greater than this value
+static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+
+    // box areas are needed for every pair, so compute them once
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+    {
+        areas[i] = faceobjects[i].rect.width * faceobjects[i].rect.height;
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = faceobjects[i];
+
+        int keep = 1;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const Object& b = faceobjects[picked[j]];
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[picked[j]] - inter_area;
+            // float IoU = inter_area / union_area
+            if (inter_area / union_area > nms_threshold)
+            {
+                keep = 0;
+                // one overlapping kept box is enough, the remaining ones cannot change the verdict
+                break;
+            }
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+// Logistic function 1 / (1 + e^-x); maps a raw score to the range 0..1.
+static inline float sigmoid(float x)
+{
+    return 1.0f / (exp(-x) + 1.0f);
+}
+
+// Decodes one nanodet-plus detection head into box proposals.
+//
+// pred           head output, read as pred.channel(c).row(i)[j] for grid cell (row i,
+//                column j): the first 80 channels are raw class scores, the remaining
+//                channels are 4 distance distributions of (pred.c - 80) / 4 bins each,
+//                in the order left, top, right, bottom
+// stride         size of one grid cell in input pixels
+// in_pad         not used by this function
+// prob_threshold cells whose best class probability (after sigmoid) is below this
+//                value are skipped
+// objects        output; accepted proposals are appended, existing content is kept
+static void generate_proposals(const ncnn::Mat& pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
+{
+    int num_grid_x = pred.w;
+    int num_grid_y = pred.h;
+
+    const int num_class = 80; // number of classes. 80 for COCO
+    const int reg_max_1 = (pred.c - num_class) / 4;
+
+    // The softmax layer and its options do not depend on the grid cell, so they are
+    // set up once here instead of once per accepted cell.
+    ncnn::Layer* softmax = ncnn::create_layer("Softmax");
+    if (!softmax)
+    {
+        fprintf(stderr, "create_layer Softmax failed\n");
+        return;
+    }
+
+    ncnn::ParamDict pd;
+    pd.set(0, 1); // axis
+    pd.set(1, 1);
+    softmax->load_param(pd);
+
+    ncnn::Option opt;
+    opt.num_threads = 1;
+    opt.use_packing_layout = false;
+
+    softmax->create_pipeline(opt);
+
+    for (int i = 0; i < num_grid_y; i++)
+    {
+        for (int j = 0; j < num_grid_x; j++)
+        {
+            // find label with max score
+            int label = -1;
+            float score = -FLT_MAX;
+            for (int k = 0; k < num_class; k++)
+            {
+                float s = pred.channel(k).row(i)[j];
+                if (s > score)
+                {
+                    label = k;
+                    score = s;
+                }
+            }
+
+            // the head outputs raw scores, convert the winner to a probability
+            score = sigmoid(score);
+
+            if (score < prob_threshold)
+                continue;
+
+            // gather the distance bins of this cell from their channels into a
+            // 4 x reg_max_1 matrix and normalize each row
+            ncnn::Mat bbox_pred(reg_max_1, 4);
+            for (int k = 0; k < reg_max_1 * 4; k++)
+            {
+                bbox_pred[k] = pred.channel(num_class + k).row(i)[j];
+            }
+            softmax->forward_inplace(bbox_pred, opt);
+
+            // expected bin index of each distribution, scaled to input pixels
+            float pred_ltrb[4];
+            for (int k = 0; k < 4; k++)
+            {
+                float dis = 0.f;
+                const float* dis_after_sm = bbox_pred.row(k);
+                for (int l = 0; l < reg_max_1; l++)
+                {
+                    dis += l * dis_after_sm[l];
+                }
+
+                pred_ltrb[k] = dis * stride;
+            }
+
+            // the distances are measured from the top-left corner of the grid cell
+            float pb_cx = j * stride;
+            float pb_cy = i * stride;
+
+            float x0 = pb_cx - pred_ltrb[0];
+            float y0 = pb_cy - pred_ltrb[1];
+            float x1 = pb_cx + pred_ltrb[2];
+            float y1 = pb_cy + pred_ltrb[3];
+
+            Object obj;
+            obj.rect.x = x0;
+            obj.rect.y = y0;
+            obj.rect.width = x1 - x0;
+            obj.rect.height = y1 - y0;
+            obj.label = label;
+            obj.prob = score;
+
+            objects.push_back(obj);
+        }
+    }
+
+    softmax->destroy_pipeline(opt);
+
+    delete softmax;
+}
+
+// Runs the nanodet-plus model on a BGR image and returns the detections.
+//
+// bgr     input image, 3-channel BGR
+// objects output; replaced by the detections that survive non-maximum suppression,
+//         with boxes expressed in pixels of the original image
+//
+// Returns 0 on success and -1 when the model files cannot be loaded (objects is
+// emptied in that case).
+static int detect_nanodet(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    ncnn::Net nanodet;
+
+    nanodet.opt.use_vulkan_compute = true;
+    // nanodet.opt.use_bf16_storage = true;
+
+    // original pretrained model from https://github.com/RangiLyu/nanodet
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    //     nanodet.load_param("nanodet-plus-m_320.torchscript.ncnn.param");
+    //     nanodet.load_model("nanodet-plus-m_320.torchscript.ncnn.bin");
+    if (nanodet.load_param("nanodet-plus-m_416.torchscript.ncnn.param") != 0
+            || nanodet.load_model("nanodet-plus-m_416.torchscript.ncnn.bin") != 0)
+    {
+        fprintf(stderr, "failed to load nanodet-plus-m_416.torchscript.ncnn.param / .bin\n");
+        objects.clear();
+        return -1;
+    }
+
+    int width = bgr.cols;
+    int height = bgr.rows;
+
+    //     const int target_size = 320;
+    const int target_size = 416;
+    const float prob_threshold = 0.4f;
+    const float nms_threshold = 0.5f;
+
+    // resize so that the longer side becomes target_size, keeping the aspect ratio
+    int w = width;
+    int h = height;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, width, height, w, h);
+
+    // pad both sides up to the next multiple of 32, splitting the padding between
+    // the two borders of each axis
+    int wpad = (w + 31) / 32 * 32 - w;
+    int hpad = (h + 31) / 32 * 32 - h;
+    ncnn::Mat in_pad;
+    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 0.f);
+
+    const float mean_vals[3] = {103.53f, 116.28f, 123.675f};
+    const float norm_vals[3] = {0.017429f, 0.017507f, 0.017125f};
+    in_pad.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = nanodet.create_extractor();
+
+    ex.input("in0", in_pad);
+
+    // one entry per detection head: cell stride and the name of its output blob
+    static const struct
+    {
+        int stride;
+        const char* blob;
+    } heads[] = {
+        {8, "231"},
+        {16, "228"},
+        {32, "225"},
+        {64, "222"},
+    };
+
+    std::vector<Object> proposals;
+    for (size_t k = 0; k < sizeof(heads) / sizeof(heads[0]); k++)
+    {
+        ncnn::Mat pred;
+        ex.extract(heads[k].blob, pred);
+
+        // generate_proposals appends, so the heads accumulate in stride order
+        generate_proposals(pred, heads[k].stride, in_pad, prob_threshold, proposals);
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(proposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(proposals, picked, nms_threshold);
+
+    int count = picked.size();
+
+    objects.resize(count);
+    for (int i = 0; i < count; i++)
+    {
+        objects[i] = proposals[picked[i]];
+
+        // undo the padding offset and the resize to get back to original image pixels
+        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
+        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
+        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
+        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;
+
+        // clip to the image
+        x0 = std::max(std::min(x0, (float)(width - 1)), 0.f);
+        y0 = std::max(std::min(y0, (float)(height - 1)), 0.f);
+        x1 = std::max(std::min(x1, (float)(width - 1)), 0.f);
+        y1 = std::max(std::min(y1, (float)(height - 1)), 0.f);
+
+        objects[i].rect.x = x0;
+        objects[i].rect.y = y0;
+        objects[i].rect.width = x1 - x0;
+        objects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+// Draws every detection on a copy of the image and shows it in a window.
+//
+// For each object the label, score and box are printed to stderr, the box is
+// outlined and a "<class name> <score>%" caption is drawn on a white background
+// above the box (moved inside the image when it would stick out at the top or on
+// the right). The call returns after cv::waitKey(0) returns.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"
+    };
+
+    cv::Mat canvas = bgr.clone();
+
+    for (size_t n = 0; n < objects.size(); n++)
+    {
+        const Object& det = objects[n];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", det.label, det.prob,
+                det.rect.x, det.rect.y, det.rect.width, det.rect.height);
+
+        cv::rectangle(canvas, det.rect, cv::Scalar(255, 0, 0));
+
+        char caption[256];
+        sprintf(caption, "%s %.1f%%", class_names[det.label], det.prob * 100);
+
+        int baseLine = 0;
+        cv::Size caption_size = cv::getTextSize(caption, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // caption goes above the box, clamped to the top and right image borders
+        int left = det.rect.x;
+        int top = det.rect.y - caption_size.height - baseLine;
+        if (top < 0)
+        {
+            top = 0;
+        }
+        if (left + caption_size.width > canvas.cols)
+        {
+            left = canvas.cols - caption_size.width;
+        }
+
+        // white background behind the caption
+        cv::Rect background(cv::Point(left, top), cv::Size(caption_size.width, caption_size.height + baseLine));
+        cv::rectangle(canvas, background, cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(canvas, caption, cv::Point(left, top + caption_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", canvas);
+    cv::waitKey(0);
+}
+
+// Program entry point.
+// Expects exactly one argument, the path of the image to process. The image is
+// loaded, run through detect_nanodet and the detections are shown by draw_objects.
+// Returns 0 on success and -1 on bad usage or when the image cannot be read.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    // flag 1: load as a 3-channel colour image
+    cv::Mat image = cv::imread(argv[1], 1);
+    if (image.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", argv[1]);
+        return -1;
+    }
+
+    std::vector<Object> detections;
+    detect_nanodet(image, detections);
+    draw_objects(image, detections);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/p2pnet.cpp
+++ b/3rdparty/ncnn/examples/p2pnet.cpp
@ -0,0 +1,240 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdlib.h>
+#include <float.h>
+#include <stdio.h>
+#include <vector>
+
+// One predicted head location from the crowd counting network.
+struct CrowdPoint
+{
+    cv::Point pt; // position in pixels of the original image
+    float prob;   // score of the point; points above 0.5 are drawn
+};
+
+// Replicates a set of anchor offsets over every cell of a w x h grid.
+//
+// w, h                  grid size in cells; a non-positive size yields an empty result
+// stride                size of one cell in pixels; the centre of cell (row i, column j)
+//                       is ((j + 0.5) * stride, (i + 0.5) * stride)
+// anchor_points         interleaved x, y offsets relative to a cell centre; a trailing
+//                       unpaired value is ignored
+// shifted_anchor_points output; replaced by interleaved x, y positions, cells in
+//                       row-major order and, within a cell, anchors in input order
+static void shift(int w, int h, int stride, std::vector<float> anchor_points, std::vector<float>& shifted_anchor_points)
+{
+    const size_t num_anchors = anchor_points.size() / 2;
+
+    if (w <= 0 || h <= 0)
+    {
+        shifted_anchor_points.clear();
+        return;
+    }
+
+    shifted_anchor_points.resize((size_t)w * h * num_anchors * 2);
+
+    // Output is written strictly sequentially, so a running index replaces the
+    // per-element index arithmetic.
+    size_t out = 0;
+    for (int i = 0; i < h; i++)
+    {
+        const float cy = (i + 0.5) * stride;
+        for (int j = 0; j < w; j++)
+        {
+            const float cx = (j + 0.5) * stride;
+            for (size_t k = 0; k < num_anchors; k++)
+            {
+                shifted_anchor_points[out++] = anchor_points[k * 2] + cx;
+                shifted_anchor_points[out++] = anchor_points[k * 2 + 1] + cy;
+            }
+        }
+    }
+}
+// Generates a row x line grid of anchor offsets centred on a cell of size stride.
+//
+// stride        size of the cell in pixels
+// row, line     number of anchors along y and along x; a non-positive count yields
+//               an empty result
+// anchor_points output; replaced by interleaved x, y offsets relative to the cell
+//               centre, ordered row by row
+//
+// The offsets are (k + 0.5) * step - stride / 2 with step = stride / count. Half the
+// stride is computed in floating point so that the grid stays centred for odd
+// strides as well.
+static void generate_anchor_points(int stride, int row, int line, std::vector<float>& anchor_points)
+{
+    if (row <= 0 || line <= 0)
+    {
+        anchor_points.clear();
+        return;
+    }
+
+    const float row_step = (float)stride / row;
+    const float line_step = (float)stride / line;
+    const double half_stride = stride * 0.5;
+
+    anchor_points.resize((size_t)row * line * 2);
+
+    size_t out = 0;
+    for (int i = 0; i < row; i++)
+    {
+        const float y = (i + 0.5) * row_step - half_stride;
+        for (int j = 0; j < line; j++)
+        {
+            const float x = (j + 0.5) * line_step - half_stride;
+            anchor_points[out++] = x;
+            anchor_points[out++] = y;
+        }
+    }
+}
+// Generates the anchor positions of every requested pyramid level of an image.
+//
+// img_w, img_h      image size in pixels (expected to be non-negative)
+// pyramid_levels    levels to generate; level l uses a stride of 2^l pixels and a grid
+//                   of ceil(img_w / 2^l) x ceil(img_h / 2^l) cells (levels are expected
+//                   to be in the range 0..30)
+// row, line         number of anchors per cell along y and along x
+// all_anchor_points output; replaced by the interleaved x, y positions of all levels,
+//                   concatenated in the order of pyramid_levels
+static void generate_anchor_points(int img_w, int img_h, std::vector<int> pyramid_levels, int row, int line, std::vector<float>& all_anchor_points)
+{
+    all_anchor_points.clear();
+    for (size_t i = 0; i < pyramid_levels.size(); i++)
+    {
+        // powers of two and the rounded-up division are exact in integer arithmetic
+        const int stride = 1 << pyramid_levels[i];
+        const int grid_w = (img_w + stride - 1) / stride;
+        const int grid_h = (img_h + stride - 1) / stride;
+
+        std::vector<float> anchor_points;
+        generate_anchor_points(stride, row, line, anchor_points);
+
+        std::vector<float> shifted_anchor_points;
+        shift(grid_w, grid_h, stride, anchor_points, shifted_anchor_points);
+
+        all_anchor_points.insert(all_anchor_points.end(), shifted_anchor_points.begin(), shifted_anchor_points.end());
+    }
+}
+
+// Runs the P2PNet crowd counting model on a BGR image.
+//
+// bgr          input image, 3-channel BGR; both sides must be at least 128 pixels
+// crowd_points output; one entry per predicted point is appended, with the position
+//              mapped back to pixels of the original image and the score taken from
+//              the second column of the score output
+//
+// Returns 0 on success, -1 when the model files cannot be loaded or the image is
+// too small to be resized to a multiple of 128.
+static int detect_crowd(const cv::Mat& bgr, std::vector<CrowdPoint>& crowd_points)
+{
+    ncnn::Option opt;
+    opt.num_threads = 4;
+    opt.use_vulkan_compute = false;
+    opt.use_bf16_storage = false;
+
+    ncnn::Net net;
+    net.opt = opt;
+
+    // model is converted from
+    // https://github.com/TencentYoutuResearch/CrowdCounting-P2PNet
+    // the ncnn model  https://pan.baidu.com/s/1O1CBgvY6yJkrK8Npxx3VMg pwd: ezhx
+    if (net.load_param("p2pnet.param") != 0 || net.load_model("p2pnet.bin") != 0)
+    {
+        fprintf(stderr, "failed to load p2pnet.param / p2pnet.bin\n");
+        return -1;
+    }
+
+    int width = bgr.cols;
+    int height = bgr.rows;
+
+    // the network input is the image size rounded down to a multiple of 128
+    int new_width = width / 128 * 128;
+    int new_height = height / 128 * 128;
+    if (new_width == 0 || new_height == 0)
+    {
+        // a zero-sized input cannot be resized to, and would be divided by below
+        fprintf(stderr, "image %d x %d is too small, both sides must be at least 128 pixels\n", width, height);
+        return -1;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, width, height, new_width, new_height);
+
+    // anchors: pyramid level 3 only, 2 x 2 anchors per cell
+    std::vector<int> pyramid_levels(1, 3);
+    std::vector<float> all_anchor_points;
+    generate_anchor_points(in.w, in.h, pyramid_levels, 2, 2, all_anchor_points);
+
+    // one row per anchor holding its x and y; the Mat is built on the vector's
+    // storage, so all_anchor_points has to stay alive until extraction is done
+    ncnn::Mat anchor_points = ncnn::Mat(2, all_anchor_points.size() / 2, all_anchor_points.data());
+
+    ncnn::Extractor ex = net.create_extractor();
+    const float mean_vals1[3] = {123.675f, 116.28f, 103.53f};
+    const float norm_vals1[3] = {0.01712475f, 0.0175f, 0.01742919f};
+
+    in.substract_mean_normalize(mean_vals1, norm_vals1);
+
+    ex.input("input", in);
+    ex.input("anchor", anchor_points);
+
+    ncnn::Mat score, points;
+    ex.extract("pred_scores", score);
+    ex.extract("pred_points", points);
+
+    for (int i = 0; i < points.h; i++)
+    {
+        float* score_data = score.row(i);
+        float* points_data = points.row(i);
+        CrowdPoint cp;
+        // map from network input coordinates back to the original image
+        int x = points_data[0] / new_width * width;
+        int y = points_data[1] / new_height * height;
+        cp.pt = cv::Point(x, y);
+        cp.prob = score_data[1];
+        crowd_points.push_back(cp);
+    }
+
+    return 0;
+}
+
+// Marks every crowd point whose score is above 0.5 with a filled red dot on a copy
+// of the image, shows the result in a window and returns after cv::waitKey() returns.
+static void draw_result(const cv::Mat& bgr, const std::vector<CrowdPoint>& crowd_points)
+{
+    const float threshold = 0.5f;
+    cv::Mat canvas = bgr.clone();
+
+    for (size_t idx = 0; idx < crowd_points.size(); idx++)
+    {
+        const CrowdPoint& cp = crowd_points[idx];
+        if (!(cp.prob > threshold))
+            continue;
+
+        cv::circle(canvas, cp.pt, 4, cv::Scalar(0, 0, 255), -1, 8, 0);
+    }
+
+    cv::imshow("image", canvas);
+    cv::waitKey();
+}
+// Program entry point.
+// Expects exactly one argument, the path of the image to process. The image is
+// loaded, run through detect_crowd and the predicted points are shown by draw_result.
+// Returns 0 on success and -1 on bad usage or when the image cannot be read.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    // flag 1: load as a 3-channel colour image
+    cv::Mat image = cv::imread(argv[1], 1);
+    if (image.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", argv[1]);
+        return -1;
+    }
+
+    std::vector<CrowdPoint> points;
+    detect_crowd(image, points);
+    draw_result(image, points);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/peleenetssd_seg.cpp
+++ b/3rdparty/ncnn/examples/peleenetssd_seg.cpp
@ -0,0 +1,196 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+// One detection produced by detect_peleenet().
+struct Object
+{
+    cv::Rect_<float> rect; // bounding box, scaled to the size of the source image
+    int label;             // class id; used as an index into class_names when drawing
+    float prob;            // detection score reported by the network
+};
+
+// Runs the Pelee SSD + segmentation network on `bgr`.
+//
+// bgr     - input image in BGR byte order
+// objects - cleared, then filled with one entry per row of the
+//           "detection_out" blob; boxes are scaled to the size of `bgr`
+// resized - receives the "sigmoid" segmentation blob, bilinearly resized to
+//           the size of `bgr`; left untouched when loading fails
+//
+// Returns 0 on success, -1 when pelee.param / pelee.bin cannot be loaded.
+static int detect_peleenet(const cv::Mat& bgr, std::vector<Object>& objects, ncnn::Mat& resized)
+{
+    objects.clear();
+
+    ncnn::Net peleenet;
+
+    peleenet.opt.use_vulkan_compute = true;
+
+    // model is converted from https://github.com/eric612/MobileNet-YOLO
+    // and can be downloaded from https://drive.google.com/open?id=1Wt6jKv13sBRMHgrGAJYlOlRF-o80pC0g
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    //
+    // both loaders return non-zero on failure; stop here instead of running
+    // inference on an empty network
+    if (peleenet.load_param("pelee.param") != 0 || peleenet.load_model("pelee.bin") != 0)
+    {
+        fprintf(stderr, "failed to load pelee.param / pelee.bin\n");
+        return -1;
+    }
+
+    const int target_size = 304;
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, target_size, target_size);
+
+    const float mean_vals[3] = {103.9f, 116.7f, 123.6f};
+    const float norm_vals[3] = {0.017f, 0.017f, 0.017f};
+    in.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = peleenet.create_extractor();
+
+    ex.input("data", in);
+
+    ncnn::Mat out;
+    ex.extract("detection_out", out);
+
+    //     printf("%d %d %d\n", out.w, out.h, out.c);
+    for (int i = 0; i < out.h; i++)
+    {
+        // row layout: label, score, then x0, y0, x1, y1 as fractions of the image size
+        const float* values = out.row(i);
+
+        Object object;
+        object.label = static_cast<int>(values[0]);
+        object.prob = values[1];
+        object.rect.x = values[2] * img_w;
+        object.rect.y = values[3] * img_h;
+        object.rect.width = values[4] * img_w - object.rect.x;
+        object.rect.height = values[5] * img_h - object.rect.y;
+
+        objects.push_back(object);
+    }
+
+    // the segmentation head is extracted from the same extractor
+    ncnn::Mat seg_out;
+    ex.extract("sigmoid", seg_out);
+    resize_bilinear(seg_out, resized, img_w, img_h);
+    //resize_bicubic(seg_out,resized,img_w,img_h); // sharpness
+    return 0;
+}
+
+// Draws detection boxes with labels and blends the segmentation mask onto a
+// copy of `bgr`, then shows the result and waits for a key press.
+//
+// bgr     - source image; accessed as 3 bytes per pixel
+// objects - detections to draw; labels outside class_names are shown as
+//           "unknown" instead of indexing past the table
+// map     - per-class score planes; a pixel is tinted with the colour of the
+//           highest-scoring class whose score exceeds 0.45 (only classes that
+//           have an entry in the colour table are tinted)
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects, ncnn::Mat map)
+{
+    static const char* class_names[] = {"background",
+                                        "person", "rider", "car", "bus",
+                                        "truck", "bike", "motor",
+                                        "traffic light", "traffic sign", "train"
+                                       };
+    const int class_count = sizeof(class_names) / sizeof(class_names[0]);
+
+    cv::Mat image = bgr.clone();
+    const int color[] = {128, 255, 128, 244, 35, 232};
+    const int color_count = sizeof(color) / sizeof(int);
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        // the label comes straight from the network output, so validate it
+        // before using it as an array index
+        const char* class_name = "unknown";
+        if (obj.label >= 0 && obj.label < class_count)
+            class_name = class_names[obj.label];
+
+        char text[256];
+        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // keep the label inside the image
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    // never walk past the image even if the mask has a different size
+    const int width = map.w < image.cols ? map.w : image.cols;
+    const int height = map.h < image.rows ? map.h : image.rows;
+    const int size = map.c;
+    const float threshold = 0.45f;
+
+    // Channels of an ncnn::Mat are cstep elements apart, which can be larger
+    // than w * h because of alignment padding, so take each plane's start
+    // from channel() instead of computing c * w * h by hand.
+    std::vector<const float*> planes(size);
+    for (int c = 0; c < size; c++)
+    {
+        const float* plane = map.channel(c);
+        planes[c] = plane;
+    }
+
+    for (int i = 0; i < height; i++)
+    {
+        unsigned char* ptr1 = image.ptr<unsigned char>(i);
+        const int row_offset = i * map.w;
+        for (int j = 0; j < width; j++)
+        {
+            // pick the best class above the threshold for this pixel
+            float maxima = threshold;
+            int index = -1;
+            for (int c = 0; c < size; c++)
+            {
+                const float value = planes[c][row_offset + j];
+                if (value > maxima)
+                {
+                    maxima = value;
+                    index = c;
+                }
+            }
+            if (index > -1)
+            {
+                int color_index = (index)*3;
+                if (color_index < color_count)
+                {
+                    // 50/50 blend of the class colour and the original pixel
+                    unsigned char* px = ptr1 + j * 3;
+                    int b = color[color_index];
+                    int g = color[color_index + 1];
+                    int r = color[color_index + 2];
+                    px[0] = b / 2 + px[0] / 2;
+                    px[1] = g / 2 + px[1] / 2;
+                    px[2] = r / 2 + px[2] / 2;
+                }
+            }
+        }
+    }
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Entry point: reads the image named on the command line, runs the Pelee
+// detector/segmenter on it and displays boxes plus the segmentation overlay.
+// Returns -1 on a usage error or when the image cannot be read, 0 otherwise.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* image_file = argv[1];
+
+    cv::Mat frame = cv::imread(image_file, 1);
+    if (frame.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_file);
+        return -1;
+    }
+
+    ncnn::Mat seg_map;
+    std::vector<Object> detections;
+    detect_peleenet(frame, detections, seg_map);
+
+    draw_objects(frame, detections, seg_map);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/retinaface.cpp
+++ b/3rdparty/ncnn/examples/retinaface.cpp
@ -0,0 +1,434 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+// A detected face: bounding box, five landmark points and the detection score.
+struct FaceObject
+{
+    cv::Rect_<float> rect;   // face bounding box
+    cv::Point2f landmark[5]; // five landmark points decoded together with the box
+    float prob;              // detection score; proposals are sorted by this value
+};
+
+// Returns the area of the region shared by the bounding boxes of a and b.
+static inline float intersection_area(const FaceObject& a, const FaceObject& b)
+{
+    return (a.rect & b.rect).area();
+}
+
+// Sorts faceobjects[left..right] (both bounds inclusive) in place by
+// descending prob, using a quicksort partition around the score of the
+// middle element. The two partitions are recursed into from separate OpenMP
+// sections; the pragmas only take effect when the file is built with OpenMP.
+// The caller must pass a non-empty range with valid indices.
+static void qsort_descent_inplace(std::vector<FaceObject>& faceobjects, int left, int right)
+{
+    int i = left;
+    int j = right;
+    float p = faceobjects[(left + right) / 2].prob; // pivot score
+
+    while (i <= j)
+    {
+        while (faceobjects[i].prob > p) // already on the high-score side
+            i++;
+
+        while (faceobjects[j].prob < p) // already on the low-score side
+            j--;
+
+        if (i <= j)
+        {
+            // swap
+            std::swap(faceobjects[i], faceobjects[j]);
+
+            i++;
+            j--;
+        }
+    }
+
+    // after partitioning, [left..j] and [i..right] are sorted independently
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < j) qsort_descent_inplace(faceobjects, left, j);
+        }
+        #pragma omp section
+        {
+            if (i < right) qsort_descent_inplace(faceobjects, i, right);
+        }
+    }
+}
+
+// Sorts the whole vector in place by descending prob; does nothing when the
+// vector is empty.
+static void qsort_descent_inplace(std::vector<FaceObject>& faceobjects)
+{
+    if (!faceobjects.empty())
+    {
+        const int last = faceobjects.size() - 1;
+        qsort_descent_inplace(faceobjects, 0, last);
+    }
+}
+
+// Greedy non-maximum suppression.
+//
+// faceobjects   - candidates, expected to be sorted by descending score so
+//                 that earlier entries win over later overlapping ones
+// picked        - cleared, then filled with the indices of the kept entries
+//                 in ascending order
+// nms_threshold - a candidate is dropped when its IoU with any already kept
+//                 entry is greater than this value
+static void nms_sorted_bboxes(const std::vector<FaceObject>& faceobjects, std::vector<int>& picked, float nms_threshold)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+
+    // box areas are reused for every pair, so compute them once
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+    {
+        areas[i] = faceobjects[i].rect.area();
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        const FaceObject& a = faceobjects[i];
+
+        int keep = 1;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const FaceObject& b = faceobjects[picked[j]];
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[picked[j]] - inter_area;
+            //             float IoU = inter_area / union_area
+            if (inter_area / union_area > nms_threshold)
+            {
+                keep = 0;
+                // one overlap is enough to suppress; further comparisons
+                // cannot change the outcome
+                break;
+            }
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+// copy from src/layer/proposal.cpp
+//
+// Builds the base anchor boxes for every (ratio, scale) combination.
+// Returns a Mat with one row of 4 floats (x0, y0, x1, y1) per anchor, all
+// centred on (base_size / 2, base_size / 2). Rows are ordered ratio-major:
+// row = ratio_index * number_of_scales + scale_index.
+static ncnn::Mat generate_anchors(int base_size, const ncnn::Mat& ratios, const ncnn::Mat& scales)
+{
+    const int ratio_count = ratios.w;
+    const int scale_count = scales.w;
+
+    ncnn::Mat result;
+    result.create(4, ratio_count * scale_count);
+
+    const float center_x = base_size * 0.5f;
+    const float center_y = base_size * 0.5f;
+
+    int row_index = 0;
+    for (int ri = 0; ri < ratio_count; ri++)
+    {
+        const float aspect = ratios[ri];
+
+        // the width is rounded first and the height is derived from that
+        // rounded width
+        const int ratio_w = round(base_size / sqrt(aspect));
+        const int ratio_h = round(ratio_w * aspect); //round(base_size * sqrt(ar));
+
+        for (int si = 0; si < scale_count; si++, row_index++)
+        {
+            const float scaled_w = ratio_w * scales[si];
+            const float scaled_h = ratio_h * scales[si];
+            const float half_w = scaled_w * 0.5f;
+            const float half_h = scaled_h * 0.5f;
+
+            float* box = result.row(row_index);
+
+            box[0] = center_x - half_w;
+            box[1] = center_y - half_h;
+            box[2] = center_x + half_w;
+            box[3] = center_y + half_h;
+        }
+    }
+
+    return result;
+}
+
+// Decodes one feature-map level into face proposals.
+//
+// anchors        - one row of 4 floats (x0, y0, x1, y1) per base anchor
+// feat_stride    - amount the anchor is shifted per feature-map cell
+// score_blob     - scores; channel (q + num_anchors) is read for anchor q
+// bbox_blob      - 4 channels (dx, dy, dw, dh) per anchor
+// landmark_blob  - 10 channels (x/y offsets of 5 landmarks) per anchor
+// prob_threshold - cells scoring below this value are skipped
+// faceobjects    - decoded proposals are appended; existing entries are kept
+static void generate_proposals(const ncnn::Mat& anchors, int feat_stride, const ncnn::Mat& score_blob, const ncnn::Mat& bbox_blob, const ncnn::Mat& landmark_blob, float prob_threshold, std::vector<FaceObject>& faceobjects)
+{
+    int w = score_blob.w;
+    int h = score_blob.h;
+
+    // generate face proposal from bbox deltas and shifted anchors
+    const int num_anchors = anchors.h;
+
+    for (int q = 0; q < num_anchors; q++)
+    {
+        const float* anchor = anchors.row(q);
+
+        // per-anchor views into the three blobs
+        const ncnn::Mat score = score_blob.channel(q + num_anchors);
+        const ncnn::Mat bbox = bbox_blob.channel_range(q * 4, 4);
+        const ncnn::Mat landmark = landmark_blob.channel_range(q * 10, 10);
+
+        // shifted anchor
+        float anchor_y = anchor[1];
+
+        float anchor_w = anchor[2] - anchor[0];
+        float anchor_h = anchor[3] - anchor[1];
+
+        for (int i = 0; i < h; i++)
+        {
+            // restart at the left edge for every feature-map row
+            float anchor_x = anchor[0];
+
+            for (int j = 0; j < w; j++)
+            {
+                int index = i * w + j;
+
+                float prob = score[index];
+
+                if (prob >= prob_threshold)
+                {
+                    // apply center size
+                    float dx = bbox.channel(0)[index];
+                    float dy = bbox.channel(1)[index];
+                    float dw = bbox.channel(2)[index];
+                    float dh = bbox.channel(3)[index];
+
+                    float cx = anchor_x + anchor_w * 0.5f;
+                    float cy = anchor_y + anchor_h * 0.5f;
+
+                    // centre is shifted by a fraction of the anchor size
+                    float pb_cx = cx + anchor_w * dx;
+                    float pb_cy = cy + anchor_h * dy;
+
+                    // size deltas are applied as exponential scale factors
+                    float pb_w = anchor_w * exp(dw);
+                    float pb_h = anchor_h * exp(dh);
+
+                    float x0 = pb_cx - pb_w * 0.5f;
+                    float y0 = pb_cy - pb_h * 0.5f;
+                    float x1 = pb_cx + pb_w * 0.5f;
+                    float y1 = pb_cy + pb_h * 0.5f;
+
+                    FaceObject obj;
+                    obj.rect.x = x0;
+                    obj.rect.y = y0;
+                    obj.rect.width = x1 - x0 + 1;
+                    obj.rect.height = y1 - y0 + 1;
+                    // landmarks are offsets from the anchor centre, scaled by (anchor size + 1)
+                    obj.landmark[0].x = cx + (anchor_w + 1) * landmark.channel(0)[index];
+                    obj.landmark[0].y = cy + (anchor_h + 1) * landmark.channel(1)[index];
+                    obj.landmark[1].x = cx + (anchor_w + 1) * landmark.channel(2)[index];
+                    obj.landmark[1].y = cy + (anchor_h + 1) * landmark.channel(3)[index];
+                    obj.landmark[2].x = cx + (anchor_w + 1) * landmark.channel(4)[index];
+                    obj.landmark[2].y = cy + (anchor_h + 1) * landmark.channel(5)[index];
+                    obj.landmark[3].x = cx + (anchor_w + 1) * landmark.channel(6)[index];
+                    obj.landmark[3].y = cy + (anchor_h + 1) * landmark.channel(7)[index];
+                    obj.landmark[4].x = cx + (anchor_w + 1) * landmark.channel(8)[index];
+                    obj.landmark[4].y = cy + (anchor_h + 1) * landmark.channel(9)[index];
+                    obj.prob = prob;
+
+                    faceobjects.push_back(obj);
+                }
+
+                anchor_x += feat_stride;
+            }
+
+            anchor_y += feat_stride;
+        }
+    }
+}
+
+// Extracts the score / bbox / landmark blobs of one feature level and appends
+// the decoded proposals to `faceobjects`.
+//
+// ex             - extractor that already has its input set
+// feat_stride    - stride of the level; also the numeric suffix of the blob names
+// scale_first,
+// scale_second   - the two anchor scales used at this level, in anchor order
+// prob_threshold - minimum score for a proposal to be kept
+static void collect_stride_proposals(ncnn::Extractor& ex, int feat_stride, float scale_first, float scale_second, float prob_threshold, std::vector<FaceObject>& faceobjects)
+{
+    char score_name[64];
+    char bbox_name[64];
+    char landmark_name[64];
+    sprintf(score_name, "face_rpn_cls_prob_reshape_stride%d", feat_stride);
+    sprintf(bbox_name, "face_rpn_bbox_pred_stride%d", feat_stride);
+    sprintf(landmark_name, "face_rpn_landmark_pred_stride%d", feat_stride);
+
+    ncnn::Mat score_blob, bbox_blob, landmark_blob;
+    ex.extract(score_name, score_blob);
+    ex.extract(bbox_name, bbox_blob);
+    ex.extract(landmark_name, landmark_blob);
+
+    const int base_size = 16;
+    ncnn::Mat ratios(1);
+    ratios[0] = 1.f;
+    ncnn::Mat scales(2);
+    scales[0] = scale_first;
+    scales[1] = scale_second;
+    ncnn::Mat anchors = generate_anchors(base_size, ratios, scales);
+
+    // generate_proposals() appends, so earlier levels stay in place
+    generate_proposals(anchors, feat_stride, score_blob, bbox_blob, landmark_blob, prob_threshold, faceobjects);
+}
+
+// Detects faces in `bgr` with the RetinaFace network.
+//
+// bgr         - input image in BGR byte order; fed to the network at its
+//               original size after conversion to RGB
+// faceobjects - replaced with the detections that survive NMS, sorted by
+//               descending score and clipped to the image
+//
+// Returns 0 on success, -1 when the model files cannot be loaded.
+static int detect_retinaface(const cv::Mat& bgr, std::vector<FaceObject>& faceobjects)
+{
+    ncnn::Net retinaface;
+
+    retinaface.opt.use_vulkan_compute = true;
+
+    // model is converted from
+    // https://github.com/deepinsight/insightface/tree/master/RetinaFace#retinaface-pretrained-models
+    // https://github.com/deepinsight/insightface/issues/669
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    //     retinaface.load_param("retinaface-R50.param");
+    //     retinaface.load_model("retinaface-R50.bin");
+    //
+    // both loaders return non-zero on failure; stop here instead of running
+    // inference on an empty network
+    if (retinaface.load_param("mnet.25-opt.param") != 0 || retinaface.load_model("mnet.25-opt.bin") != 0)
+    {
+        fprintf(stderr, "failed to load mnet.25-opt.param / mnet.25-opt.bin\n");
+        faceobjects.clear();
+        return -1;
+    }
+
+    const float prob_threshold = 0.8f;
+    const float nms_threshold = 0.4f;
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    ncnn::Mat in = ncnn::Mat::from_pixels(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h);
+
+    ncnn::Extractor ex = retinaface.create_extractor();
+
+    ex.input("data", in);
+
+    // gather proposals from the three feature levels, coarsest first
+    std::vector<FaceObject> faceproposals;
+    collect_stride_proposals(ex, 32, 32.f, 16.f, prob_threshold, faceproposals);
+    collect_stride_proposals(ex, 16, 8.f, 4.f, prob_threshold, faceproposals);
+    collect_stride_proposals(ex, 8, 2.f, 1.f, prob_threshold, faceproposals);
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(faceproposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(faceproposals, picked, nms_threshold);
+
+    int face_count = picked.size();
+
+    faceobjects.resize(face_count);
+    for (int i = 0; i < face_count; i++)
+    {
+        faceobjects[i] = faceproposals[picked[i]];
+
+        // clip to image size
+        float x0 = faceobjects[i].rect.x;
+        float y0 = faceobjects[i].rect.y;
+        float x1 = x0 + faceobjects[i].rect.width;
+        float y1 = y0 + faceobjects[i].rect.height;
+
+        x0 = std::max(std::min(x0, (float)img_w - 1), 0.f);
+        y0 = std::max(std::min(y0, (float)img_h - 1), 0.f);
+        x1 = std::max(std::min(x1, (float)img_w - 1), 0.f);
+        y1 = std::max(std::min(y1, (float)img_h - 1), 0.f);
+
+        faceobjects[i].rect.x = x0;
+        faceobjects[i].rect.y = y0;
+        faceobjects[i].rect.width = x1 - x0;
+        faceobjects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+// Draws every face (box, five landmark dots and a score label) on a copy of
+// `bgr`, logs each detection to stderr, then shows the image and waits for a
+// key press.
+static void draw_faceobjects(const cv::Mat& bgr, const std::vector<FaceObject>& faceobjects)
+{
+    const cv::Scalar box_color(0, 255, 0);
+    const cv::Scalar landmark_color(0, 255, 255);
+    const cv::Scalar label_background(255, 255, 255);
+    const cv::Scalar label_foreground(0, 0, 0);
+
+    cv::Mat canvas = bgr.clone();
+
+    for (size_t n = 0; n < faceobjects.size(); n++)
+    {
+        const FaceObject& face = faceobjects[n];
+
+        fprintf(stderr, "%.5f at %.2f %.2f %.2f x %.2f\n", face.prob,
+                face.rect.x, face.rect.y, face.rect.width, face.rect.height);
+
+        cv::rectangle(canvas, face.rect, box_color);
+
+        for (int k = 0; k < 5; k++)
+        {
+            cv::circle(canvas, face.landmark[k], 2, landmark_color, -1);
+        }
+
+        char caption[256];
+        sprintf(caption, "%.1f%%", face.prob * 100);
+
+        int baseLine = 0;
+        cv::Size caption_size = cv::getTextSize(caption, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // place the caption above the box, pulled back inside the image if needed
+        int caption_x = face.rect.x;
+        int caption_y = face.rect.y - caption_size.height - baseLine;
+        if (caption_y < 0)
+            caption_y = 0;
+        if (caption_x + caption_size.width > canvas.cols)
+            caption_x = canvas.cols - caption_size.width;
+
+        const cv::Rect caption_box(cv::Point(caption_x, caption_y), cv::Size(caption_size.width, caption_size.height + baseLine));
+        cv::rectangle(canvas, caption_box, label_background, -1);
+
+        cv::putText(canvas, caption, cv::Point(caption_x, caption_y + caption_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, label_foreground);
+    }
+
+    cv::imshow("image", canvas);
+    cv::waitKey(0);
+}
+
+// Entry point: reads the image named on the command line, runs RetinaFace on
+// it and displays the detected faces.
+// Returns -1 on a usage error or when the image cannot be read, 0 otherwise.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* image_file = argv[1];
+
+    cv::Mat frame = cv::imread(image_file, 1);
+    if (frame.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_file);
+        return -1;
+    }
+
+    std::vector<FaceObject> faces;
+    detect_retinaface(frame, faces);
+
+    draw_faceobjects(frame, faces);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/rfcn.cpp
+++ b/3rdparty/ncnn/examples/rfcn.cpp
@ -0,0 +1,357 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#include <math.h>
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+
+// One detection produced by detect_rfcn().
+struct Object
+{
+    cv::Rect_<float> rect; // bounding box in source-image coordinates
+    int label;             // class id; used as an index into class_names when drawing
+    float prob;            // score of the winning class
+};
+
+// Returns the area of the region shared by the bounding boxes of a and b.
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    return (a.rect & b.rect).area();
+}
+
+// Sorts objects[left..right] (both bounds inclusive) in place by descending
+// prob, using a quicksort partition around the score of the middle element.
+// The two partitions are recursed into from separate OpenMP sections; the
+// pragmas only take effect when the file is built with OpenMP.
+// The caller must pass a non-empty range with valid indices.
+static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
+{
+    int i = left;
+    int j = right;
+    float p = objects[(left + right) / 2].prob; // pivot score
+
+    while (i <= j)
+    {
+        while (objects[i].prob > p) // already on the high-score side
+            i++;
+
+        while (objects[j].prob < p) // already on the low-score side
+            j--;
+
+        if (i <= j)
+        {
+            // swap
+            std::swap(objects[i], objects[j]);
+
+            i++;
+            j--;
+        }
+    }
+
+    // after partitioning, [left..j] and [i..right] are sorted independently
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < j) qsort_descent_inplace(objects, left, j);
+        }
+        #pragma omp section
+        {
+            if (i < right) qsort_descent_inplace(objects, i, right);
+        }
+    }
+}
+
+// Sorts the whole vector in place by descending prob; does nothing when the
+// vector is empty.
+static void qsort_descent_inplace(std::vector<Object>& objects)
+{
+    if (!objects.empty())
+    {
+        const int last = objects.size() - 1;
+        qsort_descent_inplace(objects, 0, last);
+    }
+}
+
+// Greedy non-maximum suppression.
+//
+// objects       - candidates, expected to be sorted by descending score so
+//                 that earlier entries win over later overlapping ones
+// picked        - cleared, then filled with the indices of the kept entries
+//                 in ascending order
+// nms_threshold - a candidate is dropped when its IoU with any already kept
+//                 entry is greater than this value
+static void nms_sorted_bboxes(const std::vector<Object>& objects, std::vector<int>& picked, float nms_threshold)
+{
+    picked.clear();
+
+    const int n = objects.size();
+
+    // box areas are reused for every pair, so compute them once
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+    {
+        areas[i] = objects[i].rect.area();
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = objects[i];
+
+        int keep = 1;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const Object& b = objects[picked[j]];
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[picked[j]] - inter_area;
+            //             float IoU = inter_area / union_area
+            if (inter_area / union_area > nms_threshold)
+            {
+                keep = 0;
+                // one overlap is enough to suppress; further comparisons
+                // cannot change the outcome
+                break;
+            }
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+// Detects objects in `bgr` with the R-FCN network.
+//
+// The image is resized so that its shorter side becomes 224 pixels. A first
+// pass extracts the shared feature blobs and all rois; each roi is then
+// classified in its own pass. Per-class candidates go through NMS, and the
+// merged result is sorted by score and capped at 100 entries.
+//
+// bgr     - input image in BGR byte order
+// objects - replaced with the final detections, boxes in `bgr` coordinates
+//
+// Returns 0 on success, -1 when the model files cannot be loaded.
+static int detect_rfcn(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    ncnn::Net rfcn;
+
+    rfcn.opt.use_vulkan_compute = true;
+
+    // original pretrained model from https://github.com/YuwenXiong/py-R-FCN
+    // https://github.com/YuwenXiong/py-R-FCN/blob/master/models/pascal_voc/ResNet-50/rfcn_end2end/test_agnostic.prototxt
+    // https://1drv.ms/u/s!AoN7vygOjLIQqUWHpY67oaC7mopf
+    // resnet50_rfcn_final.caffemodel
+    //
+    // both loaders return non-zero on failure; stop here instead of running
+    // inference on an empty network
+    if (rfcn.load_param("rfcn_end2end.param") != 0 || rfcn.load_model("rfcn_end2end.bin") != 0)
+    {
+        fprintf(stderr, "failed to load rfcn_end2end.param / rfcn_end2end.bin\n");
+        objects.clear();
+        return -1;
+    }
+
+    const int target_size = 224;
+
+    const int max_per_image = 100;
+    const float confidence_thresh = 0.6f; // CONF_THRESH
+
+    const float nms_threshold = 0.3f; // NMS_THRESH
+
+    // scale to target detect size
+    int w = bgr.cols;
+    int h = bgr.rows;
+    float scale = 1.f;
+    if (w < h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, w, h);
+
+    const float mean_vals[3] = {102.9801f, 115.9465f, 122.7717f};
+    in.substract_mean_normalize(mean_vals, 0);
+
+    // im_info carries the resized height, width and the scale factor
+    ncnn::Mat im_info(3);
+    im_info[0] = h;
+    im_info[1] = w;
+    im_info[2] = scale;
+
+    // step1, extract feature and all rois
+    ncnn::Extractor ex1 = rfcn.create_extractor();
+
+    ex1.input("data", in);
+    ex1.input("im_info", im_info);
+
+    ncnn::Mat rfcn_cls;
+    ncnn::Mat rfcn_bbox;
+    ncnn::Mat rois; // all rois
+    ex1.extract("rfcn_cls", rfcn_cls);
+    ex1.extract("rfcn_bbox", rfcn_bbox);
+    ex1.extract("rois", rois);
+
+    // step2, extract bbox and score for each roi
+    std::vector<std::vector<Object> > class_candidates;
+    for (int i = 0; i < rois.c; i++)
+    {
+        ncnn::Extractor ex2 = rfcn.create_extractor();
+
+        ncnn::Mat roi = rois.channel(i); // get single roi
+        ex2.input("rfcn_cls", rfcn_cls);
+        ex2.input("rfcn_bbox", rfcn_bbox);
+        ex2.input("rois", roi);
+
+        ncnn::Mat bbox_pred;
+        ncnn::Mat cls_prob;
+        ex2.extract("bbox_pred", bbox_pred);
+        ex2.extract("cls_prob", cls_prob);
+
+        int num_class = cls_prob.w;
+        class_candidates.resize(num_class);
+
+        // find class id with highest score
+        int label = 0;
+        float score = 0.f;
+        for (int k = 0; k < num_class; k++)
+        {
+            float class_score = cls_prob[k];
+            if (class_score > score)
+            {
+                label = k;
+                score = class_score;
+            }
+        }
+
+        // ignore background or low score
+        if (label == 0 || score <= confidence_thresh)
+            continue;
+
+        //         fprintf(stderr, "%d = %f\n", label, score);
+
+        // unscale to image size
+        float x1 = roi[0] / scale;
+        float y1 = roi[1] / scale;
+        float x2 = roi[2] / scale;
+        float y2 = roi[3] / scale;
+
+        float pb_w = x2 - x1 + 1;
+        float pb_h = y2 - y1 + 1;
+
+        // apply bbox regression
+        // the deltas are read from elements 4..7 of bbox_pred regardless of the winning class
+        float dx = bbox_pred[4];
+        float dy = bbox_pred[4 + 1];
+        float dw = bbox_pred[4 + 2];
+        float dh = bbox_pred[4 + 3];
+
+        float cx = x1 + pb_w * 0.5f;
+        float cy = y1 + pb_h * 0.5f;
+
+        float obj_cx = cx + pb_w * dx;
+        float obj_cy = cy + pb_h * dy;
+
+        float obj_w = pb_w * exp(dw);
+        float obj_h = pb_h * exp(dh);
+
+        float obj_x1 = obj_cx - obj_w * 0.5f;
+        float obj_y1 = obj_cy - obj_h * 0.5f;
+        float obj_x2 = obj_cx + obj_w * 0.5f;
+        float obj_y2 = obj_cy + obj_h * 0.5f;
+
+        // clip
+        obj_x1 = std::max(std::min(obj_x1, (float)(bgr.cols - 1)), 0.f);
+        obj_y1 = std::max(std::min(obj_y1, (float)(bgr.rows - 1)), 0.f);
+        obj_x2 = std::max(std::min(obj_x2, (float)(bgr.cols - 1)), 0.f);
+        obj_y2 = std::max(std::min(obj_y2, (float)(bgr.rows - 1)), 0.f);
+
+        // append object
+        Object obj;
+        obj.rect = cv::Rect_<float>(obj_x1, obj_y1, obj_x2 - obj_x1 + 1, obj_y2 - obj_y1 + 1);
+        obj.label = label;
+        obj.prob = score;
+
+        class_candidates[label].push_back(obj);
+    }
+
+    // post process
+    objects.clear();
+    for (int i = 0; i < (int)class_candidates.size(); i++)
+    {
+        std::vector<Object>& candidates = class_candidates[i];
+
+        qsort_descent_inplace(candidates);
+
+        std::vector<int> picked;
+        nms_sorted_bboxes(candidates, picked, nms_threshold);
+
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            int z = picked[j];
+            objects.push_back(candidates[z]);
+        }
+    }
+
+    qsort_descent_inplace(objects);
+
+    // keep only the best max_per_image detections
+    if (max_per_image > 0 && objects.size() > (size_t)max_per_image)
+    {
+        objects.resize(max_per_image);
+    }
+
+    return 0;
+}
+
+// Draws every detection (box and "<class> <score>%" label) on a copy of
+// `bgr`, logs each detection to stderr, then shows the image and waits for a
+// key press. Labels outside class_names are shown as "unknown" instead of
+// indexing past the table.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {"background",
+                                        "aeroplane", "bicycle", "bird", "boat",
+                                        "bottle", "bus", "car", "cat", "chair",
+                                        "cow", "diningtable", "dog", "horse",
+                                        "motorbike", "person", "pottedplant",
+                                        "sheep", "sofa", "train", "tvmonitor"
+                                       };
+    const int class_count = sizeof(class_names) / sizeof(class_names[0]);
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        // the label depends on the loaded model, so validate it before using
+        // it as an array index
+        const char* class_name = "unknown";
+        if (obj.label >= 0 && obj.label < class_count)
+            class_name = class_names[obj.label];
+
+        char text[256];
+        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // place the label above the box, pulled back inside the image if needed
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Entry point: reads the image named on the command line, runs R-FCN on it
+// and displays the detected objects.
+// Returns -1 on a usage error or when the image cannot be read, 0 otherwise.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* image_file = argv[1];
+
+    cv::Mat frame = cv::imread(image_file, 1);
+    if (frame.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_file);
+        return -1;
+    }
+
+    std::vector<Object> detections;
+    detect_rfcn(frame, detections);
+
+    draw_objects(frame, detections);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/rvm.cpp
+++ b/3rdparty/ncnn/examples/rvm.cpp
@ -0,0 +1,132 @@
+#include "net.h"
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdlib.h>
+#include <float.h>
+#include <stdio.h>
+#include <vector>
+
+// Blend the predicted foreground over a flat background colour using the
+// predicted alpha matte, then display the matte, the foreground and the blend.
+//
+// bgr: source image; it is resized to the matte size to create the output
+//      canvas, whose pixels are then all overwritten by the blend.
+// fgr: foreground image, multiplied by 255 when converted to 8-bit 3-channel.
+// pha: alpha matte, multiplied by 255 when converted to 8-bit 1-channel.
+static void draw_objects(const cv::Mat& bgr, const cv::Mat& fgr, const cv::Mat& pha)
+{
+    // per-channel background value that shows through where alpha is low
+    static const int background[3] = {155, 255, 120};
+
+    cv::Mat foreground8;
+    fgr.convertTo(foreground8, CV_8UC3, 255.0, 0);
+    cv::Mat alpha8;
+    pha.convertTo(alpha8, CV_8UC1, 255.0, 0);
+
+    cv::Mat composite;
+    cv::resize(bgr, composite, pha.size(), 0, 0, 1);
+
+    for (int row = 0; row < alpha8.rows; row++)
+    {
+        for (int col = 0; col < alpha8.cols; col++)
+        {
+            // map the 8-bit matte value back to a 0..1 blend weight
+            const float alpha = (float)alpha8.at<uchar>(row, col) / 255;
+
+            const cv::Vec3b& src = foreground8.at<cv::Vec3b>(row, col);
+            cv::Vec3b& dst = composite.at<cv::Vec3b>(row, col);
+
+            for (int c = 0; c < 3; c++)
+            {
+                dst[c] = src[c] * alpha + (1 - alpha) * background[c];
+            }
+        }
+    }
+
+    cv::imshow("pha", alpha8);
+    cv::imshow("fgr", foreground8);
+    cv::imshow("comp", composite);
+    cv::waitKey(0);
+}
+// Run the RobustVideoMatting network on a single image.
+//
+// bgr: 8-bit BGR input image (converted to RGB when fed to the network).
+// pha: receives the alpha matte as a CV_32FC1 image at the network's output size.
+// fgr: receives the foreground as a CV_32FC3 image; network channel 0 is stored
+//      in element 2 and channel 2 in element 0, i.e. the planes are reversed.
+// Returns 0 on success, -1 when the model files cannot be loaded or an output
+// blob cannot be extracted (pha and fgr are left untouched in that case).
+static int detect_rvm(const cv::Mat& bgr, cv::Mat& pha, cv::Mat& fgr)
+{
+    const float downsample_ratio = 0.5f;
+    const int target_width = 512;
+    const int target_height = 512;
+
+    ncnn::Net net;
+    net.opt.use_vulkan_compute = false;
+    // original pretrained model from https://github.com/PeterL1n/RobustVideoMatting
+    // ncnn model https://pan.baidu.com/s/11iEY2RGfzWFtce8ue7T3JQ password: d9t6
+    if (net.load_param("rvm_512.param") != 0 || net.load_model("rvm_512.bin") != 0)
+    {
+        fprintf(stderr, "failed to load rvm_512.param / rvm_512.bin\n");
+        return -1;
+    }
+
+    // recurrent state inputs, zeroed for the first (and here only) frame;
+    // if you use another input size, please change these shapes accordingly
+    ncnn::Mat r1i = ncnn::Mat(128, 128, 16);
+    ncnn::Mat r2i = ncnn::Mat(64, 64, 20);
+    ncnn::Mat r3i = ncnn::Mat(32, 32, 40);
+    ncnn::Mat r4i = ncnn::Mat(16, 16, 64);
+    r1i.fill(0.0f);
+    r2i.fill(0.0f);
+    r3i.fill(0.0f);
+    r4i.fill(0.0f);
+
+    ncnn::Extractor ex = net.create_extractor();
+    const float mean_vals1[3] = {123.675f, 116.28f, 103.53f};
+    const float norm_vals1[3] = {0.01712475f, 0.0175f, 0.01742919f};
+    const float mean_vals2[3] = {0, 0, 0};
+    const float norm_vals2[3] = {1 / 255.0, 1 / 255.0, 1 / 255.0};
+
+    // src2 is the full-size input, src1 the downsampled one; sizes are integral pixel counts
+    const int small_width = static_cast<int>(target_width * downsample_ratio);
+    const int small_height = static_cast<int>(target_height * downsample_ratio);
+    ncnn::Mat ncnn_in2 = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, bgr.cols, bgr.rows, target_width, target_height);
+    ncnn::Mat ncnn_in1 = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, bgr.cols, bgr.rows, small_width, small_height);
+
+    ncnn_in1.substract_mean_normalize(mean_vals1, norm_vals1);
+    ncnn_in2.substract_mean_normalize(mean_vals2, norm_vals2);
+
+    ex.input("src1", ncnn_in1);
+    ex.input("src2", ncnn_in2);
+    ex.input("r1i", r1i);
+    ex.input("r2i", r2i);
+    ex.input("r3i", r3i);
+    ex.input("r4i", r4i);
+
+    // for video matting, these outputs would be the inputs of the next inference
+    ex.extract("r4o", r4i);
+    ex.extract("r3o", r3i);
+    ex.extract("r2o", r2i);
+    ex.extract("r1o", r1i);
+
+    ncnn::Mat pha_;
+    ncnn::Mat fgr_;
+    if (ex.extract("pha", pha_) != 0 || ex.extract("fgr", fgr_) != 0 || fgr_.c < 3)
+    {
+        fprintf(stderr, "failed to extract pha / fgr\n");
+        return -1;
+    }
+
+    // wraps the matte plane without copying; copyTo below gives pha its own pixels
+    cv::Mat cv_pha = cv::Mat(pha_.h, pha_.w, CV_32FC1, static_cast<float*>(pha_.data));
+    cv::Mat cv_fgr = cv::Mat(fgr_.h, fgr_.w, CV_32FC3);
+
+    // Address every plane through channel(): the distance between ncnn channel
+    // planes is cstep, which is not guaranteed to equal h * w.
+    const float* plane0 = fgr_.channel(0);
+    const float* plane1 = fgr_.channel(1);
+    const float* plane2 = fgr_.channel(2);
+    for (int i = 0; i < fgr_.h; i++)
+    {
+        for (int j = 0; j < fgr_.w; j++)
+        {
+            const int offset = i * fgr_.w + j;
+            cv::Vec3f& px = cv_fgr.at<cv::Vec3f>(i, j);
+
+            // interleave the planar output, reversing the channel order
+            px[2] = plane0[offset];
+            px[1] = plane1[offset];
+            px[0] = plane2[offset];
+        }
+    }
+
+    cv_pha.copyTo(pha);
+    cv_fgr.copyTo(fgr);
+
+    return 0;
+}
+// Entry point: reads the image named by the single command-line argument,
+// runs detect_rvm on it and displays the matting result.
+// Returns -1 on a usage error, an unreadable image or a failed detection.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    cv::Mat m = cv::imread(imagepath, 1);
+    if (m.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    cv::Mat fgr, pha;
+    // do not try to display outputs when detection reports an error
+    if (detect_rvm(m, pha, fgr) != 0)
+    {
+        fprintf(stderr, "detect_rvm failed\n");
+        return -1;
+    }
+    draw_objects(m, fgr, pha);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/scrfd.cpp
+++ b/3rdparty/ncnn/examples/scrfd.cpp
@ -0,0 +1,434 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+// One detection: a bounding box together with its score.
+struct FaceObject
+{
+    cv::Rect_<float> rect; // box as x, y, width, height in floating point
+    float prob;            // detection score used for thresholding and sorting
+};
+
+// Area of the overlap between the boxes of a and b.
+static inline float intersection_area(const FaceObject& a, const FaceObject& b)
+{
+    return (a.rect & b.rect).area();
+}
+
+// Sort faceobjects[left..right] (both bounds inclusive) in place by descending
+// prob, partitioning around the score of the middle element and recursing into
+// the two halves (as OpenMP sections when OpenMP is enabled).
+static void qsort_descent_inplace(std::vector<FaceObject>& faceobjects, int left, int right)
+{
+    const float pivot = faceobjects[(left + right) / 2].prob;
+    int lo = left;
+    int hi = right;
+
+    while (lo <= hi)
+    {
+        // skip entries that already sit on the correct side of the pivot
+        for (; faceobjects[lo].prob > pivot; lo++) {}
+        for (; faceobjects[hi].prob < pivot; hi--) {}
+
+        if (lo > hi)
+            break;
+
+        std::swap(faceobjects[lo], faceobjects[hi]);
+        lo++;
+        hi--;
+    }
+
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < hi)
+            {
+                qsort_descent_inplace(faceobjects, left, hi);
+            }
+        }
+        #pragma omp section
+        {
+            if (lo < right)
+            {
+                qsort_descent_inplace(faceobjects, lo, right);
+            }
+        }
+    }
+}
+
+// Sort the whole vector in place by descending prob; an empty vector is left alone.
+static void qsort_descent_inplace(std::vector<FaceObject>& faceobjects)
+{
+    if (!faceobjects.empty())
+    {
+        qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
+    }
+}
+
+// Greedy non-maximum suppression over boxes that are already sorted best-first.
+//
+// faceobjects: candidate boxes, visited in index order.
+// picked: cleared, then filled with the indices of the boxes that are kept.
+// nms_threshold: a box is dropped when its intersection-over-union with any
+//                already kept box exceeds this value.
+static void nms_sorted_bboxes(const std::vector<FaceObject>& faceobjects, std::vector<int>& picked, float nms_threshold)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+
+    // box areas are reused by every pairwise comparison, so compute them once
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+    {
+        areas[i] = faceobjects[i].rect.area();
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        const FaceObject& a = faceobjects[i];
+
+        int keep = 1;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const FaceObject& b = faceobjects[picked[j]];
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[picked[j]] - inter_area;
+            if (inter_area / union_area > nms_threshold)
+            {
+                keep = 0;
+                // one overlapping kept box settles the verdict; stop comparing
+                break;
+            }
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+// insightface/detection/scrfd/mmdet/core/anchor/anchor_generator.py gen_single_level_base_anchors()
+//
+// Build the base anchors for one feature level. The result has one row of four
+// floats (x0, y0, x1, y1) per ratio/scale pair, every box centred on the origin;
+// rows are ordered ratio-major, scale-minor.
+static ncnn::Mat generate_anchors(int base_size, const ncnn::Mat& ratios, const ncnn::Mat& scales)
+{
+    const int ratio_count = ratios.w;
+    const int scale_count = scales.w;
+
+    const float cx = 0;
+    const float cy = 0;
+
+    ncnn::Mat anchors;
+    anchors.create(4, ratio_count * scale_count);
+
+    for (int ri = 0; ri < ratio_count; ri++)
+    {
+        const float ar = ratios[ri];
+
+        // integer base extent for this aspect ratio
+        const int r_w = round(base_size / sqrt(ar));
+        const int r_h = round(r_w * ar); //round(base_size * sqrt(ar));
+
+        for (int si = 0; si < scale_count; si++)
+        {
+            const float scale = scales[si];
+
+            const float half_w = r_w * scale * 0.5f;
+            const float half_h = r_h * scale * 0.5f;
+
+            float* box = anchors.row(ri * scale_count + si);
+            box[0] = cx - half_w;
+            box[1] = cy - half_h;
+            box[2] = cx + half_w;
+            box[3] = cy + half_h;
+        }
+    }
+
+    return anchors;
+}
+
+// Decode the proposals of one feature level.
+//
+// anchors: one row (x0, y0, x1, y1) per base anchor.
+// feat_stride: step between neighbouring anchor positions; the regressed
+//              distances are multiplied by it as well.
+// score_blob: one channel of scores per base anchor.
+// bbox_blob: four consecutive channels of distances per base anchor.
+// prob_threshold: minimum score for a location to yield a proposal.
+// faceobjects: proposals are appended; existing entries are kept.
+static void generate_proposals(const ncnn::Mat& anchors, int feat_stride, const ncnn::Mat& score_blob, const ncnn::Mat& bbox_blob, float prob_threshold, std::vector<FaceObject>& faceobjects)
+{
+    const int grid_w = score_blob.w;
+    const int grid_h = score_blob.h;
+    const int anchor_count = anchors.h;
+
+    for (int a = 0; a < anchor_count; a++)
+    {
+        const float* base = anchors.row(a);
+        const float base_w = base[2] - base[0];
+        const float base_h = base[3] - base[1];
+
+        const ncnn::Mat scores = score_blob.channel(a);
+        const ncnn::Mat deltas = bbox_blob.channel_range(a * 4, 4);
+        const float* left_dist = deltas.channel(0);
+        const float* top_dist = deltas.channel(1);
+        const float* right_dist = deltas.channel(2);
+        const float* bottom_dist = deltas.channel(3);
+
+        // slide the base anchor over the grid, one feat_stride per cell
+        float anchor_y = base[1];
+        for (int gy = 0; gy < grid_h; gy++, anchor_y += feat_stride)
+        {
+            float anchor_x = base[0];
+            for (int gx = 0; gx < grid_w; gx++, anchor_x += feat_stride)
+            {
+                const int index = gy * grid_w + gx;
+                const float prob = scores[index];
+
+                if (!(prob >= prob_threshold))
+                    continue;
+
+                // insightface/detection/scrfd/mmdet/models/dense_heads/scrfd_head.py _get_bboxes_single()
+                const float dx = left_dist[index] * feat_stride;
+                const float dy = top_dist[index] * feat_stride;
+                const float dw = right_dist[index] * feat_stride;
+                const float dh = bottom_dist[index] * feat_stride;
+
+                // insightface/detection/scrfd/mmdet/core/bbox/transforms.py distance2bbox()
+                const float cx = anchor_x + base_w * 0.5f;
+                const float cy = anchor_y + base_h * 0.5f;
+
+                const float x0 = cx - dx;
+                const float y0 = cy - dy;
+                const float x1 = cx + dw;
+                const float y1 = cy + dh;
+
+                FaceObject obj;
+                obj.rect.x = x0;
+                obj.rect.y = y0;
+                obj.rect.width = x1 - x0 + 1;
+                obj.rect.height = y1 - y0 + 1;
+                obj.prob = prob;
+
+                faceobjects.push_back(obj);
+            }
+        }
+    }
+}
+
+// Extract the score/bbox blobs of one detection head and append the proposals
+// decoded from them to proposals. Returns 0 on success, -1 if either blob
+// cannot be extracted.
+static int extract_stride_proposals(ncnn::Extractor& ex, const char* score_name, const char* bbox_name,
+                                    int base_size, int feat_stride, float prob_threshold,
+                                    std::vector<FaceObject>& proposals)
+{
+    ncnn::Mat score_blob, bbox_blob;
+    if (ex.extract(score_name, score_blob) != 0 || ex.extract(bbox_name, bbox_blob) != 0)
+    {
+        fprintf(stderr, "failed to extract %s / %s\n", score_name, bbox_name);
+        return -1;
+    }
+
+    // every head uses a single aspect ratio and two scales
+    ncnn::Mat ratios(1);
+    ratios[0] = 1.f;
+    ncnn::Mat scales(2);
+    scales[0] = 1.f;
+    scales[1] = 2.f;
+    ncnn::Mat anchors = generate_anchors(base_size, ratios, scales);
+
+    generate_proposals(anchors, feat_stride, score_blob, bbox_blob, prob_threshold, proposals);
+
+    return 0;
+}
+
+// Detect faces in an image with the scrfd_500m-opt2 model.
+//
+// bgr: 8-bit BGR input image.
+// faceobjects: resized and filled with the detections that survive NMS, boxes
+//              expressed in the coordinates of bgr and clipped to its bounds.
+// Returns 0 on success, -1 if the model files cannot be loaded or a network
+// output cannot be extracted (faceobjects is left untouched in that case).
+static int detect_scrfd(const cv::Mat& bgr, std::vector<FaceObject>& faceobjects)
+{
+    ncnn::Net scrfd;
+
+    scrfd.opt.use_vulkan_compute = true;
+
+    // model is converted from
+    // https://github.com/deepinsight/insightface/tree/master/detection/scrfd
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (scrfd.load_param("scrfd_500m-opt2.param") != 0 || scrfd.load_model("scrfd_500m-opt2.bin") != 0)
+    {
+        fprintf(stderr, "failed to load scrfd_500m-opt2.param / scrfd_500m-opt2.bin\n");
+        return -1;
+    }
+
+    int width = bgr.cols;
+    int height = bgr.rows;
+
+    // insightface/detection/scrfd/configs/scrfd/scrfd_500m.py
+    const int target_size = 640;
+    const float prob_threshold = 0.3f;
+    const float nms_threshold = 0.45f;
+
+    // scale the longer side to target_size, keeping the aspect ratio
+    int w = width;
+    int h = height;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, width, height, w, h);
+
+    // pad both sides up to the next multiple of 32, splitting the border evenly
+    int wpad = (w + 31) / 32 * 32 - w;
+    int hpad = (h + 31) / 32 * 32 - h;
+    ncnn::Mat in_pad;
+    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 0.f);
+
+    const float mean_vals[3] = {127.5f, 127.5f, 127.5f};
+    const float norm_vals[3] = {1 / 128.f, 1 / 128.f, 1 / 128.f};
+    in_pad.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = scrfd.create_extractor();
+
+    ex.input("input.1", in_pad);
+
+    // one entry per detection head, visited in this order
+    struct Head
+    {
+        const char* score_name;
+        const char* bbox_name;
+        int base_size;
+        int feat_stride;
+    };
+    static const Head heads[] = {
+        {"412", "415", 16, 8},   // stride 8
+        {"474", "477", 64, 16},  // stride 16
+        {"536", "539", 256, 32}, // stride 32
+    };
+
+    std::vector<FaceObject> faceproposals;
+    for (size_t k = 0; k < sizeof(heads) / sizeof(heads[0]); k++)
+    {
+        const Head& head = heads[k];
+        if (extract_stride_proposals(ex, head.score_name, head.bbox_name, head.base_size, head.feat_stride, prob_threshold, faceproposals) != 0)
+            return -1;
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(faceproposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(faceproposals, picked, nms_threshold);
+
+    int face_count = picked.size();
+
+    faceobjects.resize(face_count);
+    for (int i = 0; i < face_count; i++)
+    {
+        faceobjects[i] = faceproposals[picked[i]];
+
+        // undo the padding offset and the resize to get back to bgr coordinates
+        float x0 = (faceobjects[i].rect.x - (wpad / 2)) / scale;
+        float y0 = (faceobjects[i].rect.y - (hpad / 2)) / scale;
+        float x1 = (faceobjects[i].rect.x + faceobjects[i].rect.width - (wpad / 2)) / scale;
+        float y1 = (faceobjects[i].rect.y + faceobjects[i].rect.height - (hpad / 2)) / scale;
+
+        // clip the corners to the image
+        x0 = std::max(std::min(x0, (float)width - 1), 0.f);
+        y0 = std::max(std::min(y0, (float)height - 1), 0.f);
+        x1 = std::max(std::min(x1, (float)width - 1), 0.f);
+        y1 = std::max(std::min(y1, (float)height - 1), 0.f);
+
+        faceobjects[i].rect.x = x0;
+        faceobjects[i].rect.y = y0;
+        faceobjects[i].rect.width = x1 - x0;
+        faceobjects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+// Log every detection to stderr, draw its box and a score label on a copy of
+// bgr, then show the annotated copy in a window and wait for a key press.
+static void draw_faceobjects(const cv::Mat& bgr, const std::vector<FaceObject>& faceobjects)
+{
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < faceobjects.size(); i++)
+    {
+        const FaceObject& obj = faceobjects[i];
+
+        fprintf(stderr, "%.5f at %.2f %.2f %.2f x %.2f\n", obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cv::Scalar(0, 255, 0));
+
+        // bounded formatting so the label can never overrun the buffer
+        char text[256];
+        snprintf(text, sizeof(text), "%.1f%%", obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // place the label just above the box, pulled back inside the image
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+        // a label wider than the image would otherwise start left of the canvas
+        if (x < 0)
+            x = 0;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Entry point: reads the image named by the single command-line argument,
+// runs detect_scrfd on it and displays the detections.
+// Returns -1 on a usage error, an unreadable image or a failed detection.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    cv::Mat m = cv::imread(imagepath, 1);
+    if (m.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    std::vector<FaceObject> faceobjects;
+    // do not display anything when detection reports an error
+    if (detect_scrfd(m, faceobjects) != 0)
+    {
+        fprintf(stderr, "detect_scrfd failed\n");
+        return -1;
+    }
+
+    draw_faceobjects(m, faceobjects);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/scrfd_crowdhuman.cpp
+++ b/3rdparty/ncnn/examples/scrfd_crowdhuman.cpp
@ -0,0 +1,471 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+// One detection: a bounding box together with its score.
+struct FaceObject
+{
+    cv::Rect_<float> rect; // box as x, y, width, height in floating point
+    float prob;            // detection score used for thresholding and sorting
+};
+
+// Area of the overlap between the boxes of a and b.
+static inline float intersection_area(const FaceObject& a, const FaceObject& b)
+{
+    return (a.rect & b.rect).area();
+}
+
+// Sort faceobjects[left..right] (both bounds inclusive) in place by descending
+// prob, partitioning around the score of the middle element and recursing into
+// the two halves (as OpenMP sections when OpenMP is enabled).
+static void qsort_descent_inplace(std::vector<FaceObject>& faceobjects, int left, int right)
+{
+    const float pivot = faceobjects[(left + right) / 2].prob;
+    int lo = left;
+    int hi = right;
+
+    while (lo <= hi)
+    {
+        // skip entries that already sit on the correct side of the pivot
+        for (; faceobjects[lo].prob > pivot; lo++) {}
+        for (; faceobjects[hi].prob < pivot; hi--) {}
+
+        if (lo > hi)
+            break;
+
+        std::swap(faceobjects[lo], faceobjects[hi]);
+        lo++;
+        hi--;
+    }
+
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < hi)
+            {
+                qsort_descent_inplace(faceobjects, left, hi);
+            }
+        }
+        #pragma omp section
+        {
+            if (lo < right)
+            {
+                qsort_descent_inplace(faceobjects, lo, right);
+            }
+        }
+    }
+}
+
+// Sort the whole vector in place by descending prob; an empty vector is left alone.
+static void qsort_descent_inplace(std::vector<FaceObject>& faceobjects)
+{
+    if (!faceobjects.empty())
+    {
+        qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
+    }
+}
+
+// Greedy non-maximum suppression over boxes that are already sorted best-first.
+//
+// faceobjects: candidate boxes, visited in index order.
+// picked: cleared, then filled with the indices of the boxes that are kept.
+// nms_threshold: a box is dropped when its intersection-over-union with any
+//                already kept box exceeds this value.
+static void nms_sorted_bboxes(const std::vector<FaceObject>& faceobjects, std::vector<int>& picked, float nms_threshold)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+
+    // box areas are reused by every pairwise comparison, so compute them once
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+    {
+        areas[i] = faceobjects[i].rect.area();
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        const FaceObject& a = faceobjects[i];
+
+        int keep = 1;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const FaceObject& b = faceobjects[picked[j]];
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[picked[j]] - inter_area;
+            if (inter_area / union_area > nms_threshold)
+            {
+                keep = 0;
+                // one overlapping kept box settles the verdict; stop comparing
+                break;
+            }
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+// insightface/detection/scrfd/mmdet/core/anchor/anchor_generator.py gen_single_level_base_anchors()
+//
+// Build the base anchors for one feature level. The result has one row of four
+// floats (x0, y0, x1, y1) per ratio/scale pair, every box centred on the origin;
+// rows are ordered ratio-major, scale-minor.
+static ncnn::Mat generate_anchors(int base_size, const ncnn::Mat& ratios, const ncnn::Mat& scales)
+{
+    const int ratio_count = ratios.w;
+    const int scale_count = scales.w;
+
+    const float cx = 0;
+    const float cy = 0;
+
+    ncnn::Mat anchors;
+    anchors.create(4, ratio_count * scale_count);
+
+    for (int ri = 0; ri < ratio_count; ri++)
+    {
+        const float ar = ratios[ri];
+
+        // integer base extent for this aspect ratio
+        const int r_w = round(base_size / sqrt(ar));
+        const int r_h = round(r_w * ar); //round(base_size * sqrt(ar));
+
+        for (int si = 0; si < scale_count; si++)
+        {
+            const float scale = scales[si];
+
+            const float half_w = r_w * scale * 0.5f;
+            const float half_h = r_h * scale * 0.5f;
+
+            float* box = anchors.row(ri * scale_count + si);
+            box[0] = cx - half_w;
+            box[1] = cy - half_h;
+            box[2] = cx + half_w;
+            box[3] = cy + half_h;
+        }
+    }
+
+    return anchors;
+}
+
+// Decode the proposals of one feature level.
+//
+// anchors: one row (x0, y0, x1, y1) per base anchor.
+// feat_stride: step between neighbouring anchor positions; the regressed
+//              distances are multiplied by it as well.
+// score_blob: one channel of scores per base anchor.
+// bbox_blob: four consecutive channels of distances per base anchor.
+// prob_threshold: minimum score for a location to yield a proposal.
+// faceobjects: proposals are appended; existing entries are kept.
+static void generate_proposals(const ncnn::Mat& anchors, int feat_stride, const ncnn::Mat& score_blob, const ncnn::Mat& bbox_blob, float prob_threshold, std::vector<FaceObject>& faceobjects)
+{
+    const int grid_w = score_blob.w;
+    const int grid_h = score_blob.h;
+    const int anchor_count = anchors.h;
+
+    for (int a = 0; a < anchor_count; a++)
+    {
+        const float* base = anchors.row(a);
+        const float base_w = base[2] - base[0];
+        const float base_h = base[3] - base[1];
+
+        const ncnn::Mat scores = score_blob.channel(a);
+        const ncnn::Mat deltas = bbox_blob.channel_range(a * 4, 4);
+        const float* left_dist = deltas.channel(0);
+        const float* top_dist = deltas.channel(1);
+        const float* right_dist = deltas.channel(2);
+        const float* bottom_dist = deltas.channel(3);
+
+        // slide the base anchor over the grid, one feat_stride per cell
+        float anchor_y = base[1];
+        for (int gy = 0; gy < grid_h; gy++, anchor_y += feat_stride)
+        {
+            float anchor_x = base[0];
+            for (int gx = 0; gx < grid_w; gx++, anchor_x += feat_stride)
+            {
+                const int index = gy * grid_w + gx;
+                const float prob = scores[index];
+
+                if (!(prob >= prob_threshold))
+                    continue;
+
+                // insightface/detection/scrfd/mmdet/models/dense_heads/scrfd_head.py _get_bboxes_single()
+                const float dx = left_dist[index] * feat_stride;
+                const float dy = top_dist[index] * feat_stride;
+                const float dw = right_dist[index] * feat_stride;
+                const float dh = bottom_dist[index] * feat_stride;
+
+                // insightface/detection/scrfd/mmdet/core/bbox/transforms.py distance2bbox()
+                const float cx = anchor_x + base_w * 0.5f;
+                const float cy = anchor_y + base_h * 0.5f;
+
+                const float x0 = cx - dx;
+                const float y0 = cy - dy;
+                const float x1 = cx + dw;
+                const float y1 = cy + dh;
+
+                FaceObject obj;
+                obj.rect.x = x0;
+                obj.rect.y = y0;
+                obj.rect.width = x1 - x0 + 1;
+                obj.rect.height = y1 - y0 + 1;
+                obj.prob = prob;
+
+                faceobjects.push_back(obj);
+            }
+        }
+    }
+}
+
+// Extract the score/bbox blobs of one detection head and append the proposals
+// decoded from them to proposals. Returns 0 on success, -1 if either blob
+// cannot be extracted.
+static int extract_stride_proposals(ncnn::Extractor& ex, const char* score_name, const char* bbox_name,
+                                    int base_size, int feat_stride, float prob_threshold,
+                                    std::vector<FaceObject>& proposals)
+{
+    ncnn::Mat score_blob, bbox_blob;
+    if (ex.extract(score_name, score_blob) != 0 || ex.extract(bbox_name, bbox_blob) != 0)
+    {
+        fprintf(stderr, "failed to extract %s / %s\n", score_name, bbox_name);
+        return -1;
+    }
+
+    // every head uses a single aspect ratio and a single scale
+    ncnn::Mat ratios(1);
+    ratios[0] = 2.f;
+    ncnn::Mat scales(1);
+    scales[0] = 3.f;
+    ncnn::Mat anchors = generate_anchors(base_size, ratios, scales);
+
+    generate_proposals(anchors, feat_stride, score_blob, bbox_blob, prob_threshold, proposals);
+
+    return 0;
+}
+
+// Run the scrfd_crowdhuman detector on an image.
+//
+// bgr: 8-bit BGR input image.
+// faceobjects: resized and filled with the detections that survive NMS, boxes
+//              expressed in the coordinates of bgr and clipped to its bounds.
+// Returns 0 on success, -1 if the model files cannot be loaded or a network
+// output cannot be extracted (faceobjects is left untouched in that case).
+static int detect_scrfd(const cv::Mat& bgr, std::vector<FaceObject>& faceobjects)
+{
+    ncnn::Net scrfd;
+
+    scrfd.opt.use_vulkan_compute = true;
+
+    // InsightFace does not provide a trained scrfd_crowdhuman model,
+    // but I have one for detecting cat faces; you can try it here:
+    // https://drive.google.com/file/d/1JogkKa0f_09HkENbCnXy9hRYxm35wKTn
+
+    if (scrfd.load_param("scrfd_crowdhuman.param") != 0 || scrfd.load_model("scrfd_crowdhuman.bin") != 0)
+    {
+        fprintf(stderr, "failed to load scrfd_crowdhuman.param / scrfd_crowdhuman.bin\n");
+        return -1;
+    }
+
+    int width = bgr.cols;
+    int height = bgr.rows;
+
+    const int target_size = 640;
+    const float prob_threshold = 0.3f;
+    const float nms_threshold = 0.45f;
+
+    // scale the longer side to target_size, keeping the aspect ratio
+    int w = width;
+    int h = height;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, width, height, w, h);
+
+    // pad both sides up to the next multiple of 32, splitting the border evenly
+    int wpad = (w + 31) / 32 * 32 - w;
+    int hpad = (h + 31) / 32 * 32 - h;
+    ncnn::Mat in_pad;
+    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 0.f);
+
+    const float mean_vals[3] = {127.5f, 127.5f, 127.5f};
+    const float norm_vals[3] = {1 / 128.f, 1 / 128.f, 1 / 128.f};
+    in_pad.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = scrfd.create_extractor();
+
+    ex.input("input.1", in_pad);
+
+    // one entry per detection head, visited in this order
+    struct Head
+    {
+        const char* score_name;
+        const char* bbox_name;
+        int base_size;
+        int feat_stride;
+    };
+    static const Head heads[] = {
+        {"490", "493", 8, 8},     // stride 8
+        {"510", "513", 16, 16},   // stride 16
+        {"530", "533", 32, 32},   // stride 32
+        {"550", "553", 64, 64},   // stride 64
+        {"570", "573", 128, 128}, // stride 128
+    };
+
+    std::vector<FaceObject> faceproposals;
+    for (size_t k = 0; k < sizeof(heads) / sizeof(heads[0]); k++)
+    {
+        const Head& head = heads[k];
+        if (extract_stride_proposals(ex, head.score_name, head.bbox_name, head.base_size, head.feat_stride, prob_threshold, faceproposals) != 0)
+            return -1;
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(faceproposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(faceproposals, picked, nms_threshold);
+
+    int face_count = picked.size();
+
+    faceobjects.resize(face_count);
+    for (int i = 0; i < face_count; i++)
+    {
+        faceobjects[i] = faceproposals[picked[i]];
+
+        // undo the padding offset and the resize to get back to bgr coordinates
+        float x0 = (faceobjects[i].rect.x - (wpad / 2)) / scale;
+        float y0 = (faceobjects[i].rect.y - (hpad / 2)) / scale;
+        float x1 = (faceobjects[i].rect.x + faceobjects[i].rect.width - (wpad / 2)) / scale;
+        float y1 = (faceobjects[i].rect.y + faceobjects[i].rect.height - (hpad / 2)) / scale;
+
+        // clip the corners to the image
+        x0 = std::max(std::min(x0, (float)width - 1), 0.f);
+        y0 = std::max(std::min(y0, (float)height - 1), 0.f);
+        x1 = std::max(std::min(x1, (float)width - 1), 0.f);
+        y1 = std::max(std::min(y1, (float)height - 1), 0.f);
+
+        faceobjects[i].rect.x = x0;
+        faceobjects[i].rect.y = y0;
+        faceobjects[i].rect.width = x1 - x0;
+        faceobjects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+// Log every detection to stderr, draw its box and a score label on a copy of
+// bgr, then show the annotated copy in a window and wait for a key press.
+static void draw_faceobjects(const cv::Mat& bgr, const std::vector<FaceObject>& faceobjects)
+{
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < faceobjects.size(); i++)
+    {
+        const FaceObject& obj = faceobjects[i];
+
+        fprintf(stderr, "%.5f at %.2f %.2f %.2f x %.2f\n", obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cv::Scalar(0, 255, 0));
+
+        // bounded formatting so the label can never overrun the buffer
+        char text[256];
+        snprintf(text, sizeof(text), "%.1f%%", obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // place the label just above the box, pulled back inside the image
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+        // a label wider than the image would otherwise start left of the canvas
+        if (x < 0)
+            x = 0;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Entry point: reads the image named by the single command-line argument,
+// runs detect_scrfd on it and displays the detections.
+// Returns -1 on a usage error, an unreadable image or a failed detection.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    cv::Mat m = cv::imread(imagepath, 1);
+    if (m.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    std::vector<FaceObject> faceobjects;
+    // do not display anything when detection reports an error
+    if (detect_scrfd(m, faceobjects) != 0)
+    {
+        fprintf(stderr, "detect_scrfd failed\n");
+        return -1;
+    }
+
+    draw_faceobjects(m, faceobjects);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/shufflenetv2.cpp
+++ b/3rdparty/ncnn/examples/shufflenetv2.cpp
@ -0,0 +1,123 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#include <algorithm>
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+// Classify an image with the ShuffleNet v2 x0.5 model.
+//
+// bgr         input image; its pixel buffer is read as packed BGR and resized to 224x224
+// cls_scores  receives the softmax of the "fc" blob, flattened to one value per element;
+//             left empty when the function fails
+//
+// Returns 0 on success and -1 when the model files cannot be loaded, the "fc"
+// blob cannot be extracted, or the Softmax layer cannot be created.
+static int detect_shufflenetv2(const cv::Mat& bgr, std::vector<float>& cls_scores)
+{
+    cls_scores.clear();
+
+    ncnn::Net shufflenetv2;
+
+    shufflenetv2.opt.use_vulkan_compute = true;
+
+    // https://github.com/miaow1988/ShuffleNet_V2_pytorch_caffe
+    // models can be downloaded from https://github.com/miaow1988/ShuffleNet_V2_pytorch_caffe/releases
+    // load_param / load_model report failure with a non-zero return value
+    if (shufflenetv2.load_param("shufflenet_v2_x0.5.param") != 0
+            || shufflenetv2.load_model("shufflenet_v2_x0.5.bin") != 0)
+    {
+        fprintf(stderr, "detect_shufflenetv2: failed to load shufflenet_v2_x0.5 param/bin\n");
+        return -1;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, 224, 224);
+
+    // no mean subtraction, only scale every channel from [0,255] to [0,1]
+    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
+    in.substract_mean_normalize(0, norm_vals);
+
+    ncnn::Extractor ex = shufflenetv2.create_extractor();
+
+    ex.input("data", in);
+
+    ncnn::Mat out;
+    if (ex.extract("fc", out) != 0)
+    {
+        fprintf(stderr, "detect_shufflenetv2: extracting blob fc failed\n");
+        return -1;
+    }
+
+    // manually call softmax on the fc output
+    // convert result into probability
+    // skip if your model already has softmax operation
+    {
+        ncnn::Layer* softmax = ncnn::create_layer("Softmax");
+        if (!softmax)
+        {
+            fprintf(stderr, "detect_shufflenetv2: Softmax layer is not available\n");
+            return -1;
+        }
+
+        // an empty ParamDict leaves the layer with its default parameters
+        ncnn::ParamDict pd;
+        softmax->load_param(pd);
+
+        softmax->forward_inplace(out, shufflenetv2.opt);
+
+        delete softmax;
+    }
+
+    // flatten to 1-D so out.w covers every element
+    out = out.reshape(out.w * out.h * out.c);
+
+    cls_scores.resize(out.w);
+    for (int j = 0; j < out.w; j++)
+    {
+        cls_scores[j] = out[j];
+    }
+
+    return 0;
+}
+
+// Print the topk highest scores to stderr, one "index = score" line each,
+// in descending score order.
+//
+// cls_scores  scores to rank; the position in the vector is printed as the index
+// topk        number of entries to print; clamped to [0, cls_scores.size()]
+//
+// Always returns 0.
+static int print_topk(const std::vector<float>& cls_scores, int topk)
+{
+    const int size = (int)cls_scores.size();
+
+    // vec.begin() + topk must not pass vec.end(): an empty or short score
+    // vector (e.g. after a failed inference) would otherwise be read out of range
+    if (topk > size)
+        topk = size;
+    if (topk < 0)
+        topk = 0;
+
+    // partial sort topk with index
+    std::vector<std::pair<float, int> > vec(size);
+    for (int i = 0; i < size; i++)
+    {
+        vec[i] = std::make_pair(cls_scores[i], i);
+    }
+
+    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
+                      std::greater<std::pair<float, int> >());
+
+    // print topk and score
+    for (int i = 0; i < topk; i++)
+    {
+        float score = vec[i].first;
+        int index = vec[i].second;
+        fprintf(stderr, "%d = %f\n", index, score);
+    }
+
+    return 0;
+}
+
+// Command-line driver: reads the image named by the single argument, passes
+// it to detect_shufflenetv2 and prints the three best scores via print_topk.
+// Returns 0 on success, -1 on a usage error or when the image cannot be read.
+int main(int argc, char** argv)
+{
+    const char* program = argv[0];
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", program);
+        return -1;
+    }
+
+    const char* image_file = argv[1];
+
+    // second argument 1 is the imread flag used by all ncnn examples
+    cv::Mat input = cv::imread(image_file, 1);
+    if (input.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_file);
+        return -1;
+    }
+
+    std::vector<float> scores;
+    detect_shufflenetv2(input, scores);
+    print_topk(scores, 3);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/simplepose.cpp
+++ b/3rdparty/ncnn/examples/simplepose.cpp
@ -0,0 +1,165 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#include <algorithm>
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+// One joint located by detect_posenet.
+struct KeyPoint
+{
+    // position of the heatmap peak, scaled to the source image size
+    cv::Point2f p;
+    // heatmap value at that peak; draw_pose hides joints below 0.2
+    float prob;
+};
+
+// Run the simple-pose network on an image and locate one keypoint per
+// output heatmap channel.
+//
+// bgr        input image; its pixel buffer is read as packed BGR, converted to
+//            RGB and resized to 192x256
+// keypoints  receives one entry per channel of the "conv3_fwd" blob, with the
+//            position scaled back to the size of bgr; left empty on failure
+//
+// Returns 0 on success and -1 when the model files cannot be loaded or the
+// output blob cannot be extracted.
+static int detect_posenet(const cv::Mat& bgr, std::vector<KeyPoint>& keypoints)
+{
+    keypoints.clear();
+
+    ncnn::Net posenet;
+
+    posenet.opt.use_vulkan_compute = true;
+
+    // the simple baseline human pose estimation from gluon-cv
+    // https://gluon-cv.mxnet.io/build/examples_pose/demo_simple_pose.html
+    // mxnet model exported via
+    //      pose_net.hybridize()
+    //      pose_net.export('pose')
+    // then mxnet2ncnn
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    // load_param / load_model report failure with a non-zero return value
+    if (posenet.load_param("pose.param") != 0 || posenet.load_model("pose.bin") != 0)
+    {
+        fprintf(stderr, "detect_posenet: failed to load pose.param / pose.bin\n");
+        return -1;
+    }
+
+    int w = bgr.cols;
+    int h = bgr.rows;
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, w, h, 192, 256);
+
+    // transforms.ToTensor(),
+    // transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+    // R' = (R / 255 - 0.485) / 0.229 = (R - 0.485 * 255) / 0.229 / 255
+    // G' = (G / 255 - 0.456) / 0.224 = (G - 0.456 * 255) / 0.224 / 255
+    // B' = (B / 255 - 0.406) / 0.225 = (B - 0.406 * 255) / 0.225 / 255
+    const float mean_vals[3] = {0.485f * 255.f, 0.456f * 255.f, 0.406f * 255.f};
+    const float norm_vals[3] = {1 / 0.229f / 255.f, 1 / 0.224f / 255.f, 1 / 0.225f / 255.f};
+    in.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = posenet.create_extractor();
+
+    ex.input("data", in);
+
+    ncnn::Mat out;
+    if (ex.extract("conv3_fwd", out) != 0)
+    {
+        fprintf(stderr, "detect_posenet: extracting blob conv3_fwd failed\n");
+        return -1;
+    }
+
+    // resolve point from heatmap
+    for (int p = 0; p < out.c; p++)
+    {
+        const ncnn::Mat m = out.channel(p);
+
+        // the search starts at 0, so a channel without any positive value
+        // yields position (0, 0) with prob 0
+        float max_prob = 0.f;
+        int max_x = 0;
+        int max_y = 0;
+        for (int y = 0; y < out.h; y++)
+        {
+            const float* ptr = m.row(y);
+            for (int x = 0; x < out.w; x++)
+            {
+                float prob = ptr[x];
+                if (prob > max_prob)
+                {
+                    max_prob = prob;
+                    max_x = x;
+                    max_y = y;
+                }
+            }
+        }
+
+        // map heatmap coordinates back to source-image coordinates
+        KeyPoint keypoint;
+        keypoint.p = cv::Point2f(max_x * w / (float)out.w, max_y * h / (float)out.h);
+        keypoint.prob = max_prob;
+
+        keypoints.push_back(keypoint);
+    }
+
+    return 0;
+}
+
+// Draw the detected skeleton on a copy of the input image and display it.
+//
+// bgr        source image; it is cloned, so the caller's pixels are not modified
+// keypoints  joints to draw; the bone table below addresses indices 0..16, and
+//            bones whose joints are missing from the vector are skipped
+//
+// Joints and bones with prob below 0.2 are not drawn. Every keypoint is also
+// logged to stderr. Blocks in cv::waitKey(0) after showing the window.
+static void draw_pose(const cv::Mat& bgr, const std::vector<KeyPoint>& keypoints)
+{
+    cv::Mat image = bgr.clone();
+
+    // draw bone
+    static const int joint_pairs[16][2] = {
+        {0, 1}, {1, 3}, {0, 2}, {2, 4}, {5, 6}, {5, 7}, {7, 9}, {6, 8}, {8, 10}, {5, 11}, {6, 12}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}
+    };
+
+    const size_t num_keypoints = keypoints.size();
+
+    for (int i = 0; i < 16; i++)
+    {
+        const size_t a = joint_pairs[i][0];
+        const size_t b = joint_pairs[i][1];
+
+        // the detector may deliver fewer than 17 keypoints (none at all when
+        // inference failed); never index past the end of the vector
+        if (a >= num_keypoints || b >= num_keypoints)
+            continue;
+
+        const KeyPoint& p1 = keypoints[a];
+        const KeyPoint& p2 = keypoints[b];
+
+        if (p1.prob < 0.2f || p2.prob < 0.2f)
+            continue;
+
+        cv::line(image, p1.p, p2.p, cv::Scalar(255, 0, 0), 2);
+    }
+
+    // draw joint
+    for (size_t i = 0; i < num_keypoints; i++)
+    {
+        const KeyPoint& keypoint = keypoints[i];
+
+        fprintf(stderr, "%.2f %.2f = %.5f\n", keypoint.p.x, keypoint.p.y, keypoint.prob);
+
+        if (keypoint.prob < 0.2f)
+            continue;
+
+        cv::circle(image, keypoint.p, 3, cv::Scalar(0, 255, 0), -1);
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Command-line driver: reads the image named by the single argument, passes
+// it to detect_posenet and hands the keypoints to draw_pose.
+// Returns 0 on success, -1 on a usage error or when the image cannot be read.
+int main(int argc, char** argv)
+{
+    const char* program = argv[0];
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", program);
+        return -1;
+    }
+
+    const char* image_file = argv[1];
+
+    // second argument 1 is the imread flag used by all ncnn examples
+    cv::Mat input = cv::imread(image_file, 1);
+    if (input.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_file);
+        return -1;
+    }
+
+    std::vector<KeyPoint> joints;
+    detect_posenet(input, joints);
+    draw_pose(input, joints);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/squeezencnn/README.md
+++ b/3rdparty/ncnn/examples/squeezencnn/README.md
@ -0,0 +1 @@
+The squeezenet android example project has been moved to https://github.com/nihui/ncnn-android-squeezenet
--- a/3rdparty/ncnn/examples/squeezenet.cpp
+++ b/3rdparty/ncnn/examples/squeezenet.cpp
@ -0,0 +1,106 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#include <algorithm>
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+// Classify an image with the SqueezeNet v1.1 model.
+//
+// bgr         input image; its pixel buffer is read as packed BGR and resized to 227x227
+// cls_scores  receives the first out.w values of the "prob" blob; left empty
+//             when the function fails
+//
+// Returns 0 on success and -1 when the model files cannot be loaded or the
+// "prob" blob cannot be extracted.
+static int detect_squeezenet(const cv::Mat& bgr, std::vector<float>& cls_scores)
+{
+    cls_scores.clear();
+
+    ncnn::Net squeezenet;
+
+    squeezenet.opt.use_vulkan_compute = true;
+
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    // load_param / load_model report failure with a non-zero return value
+    if (squeezenet.load_param("squeezenet_v1.1.param") != 0
+            || squeezenet.load_model("squeezenet_v1.1.bin") != 0)
+    {
+        fprintf(stderr, "detect_squeezenet: failed to load squeezenet_v1.1 param/bin\n");
+        return -1;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, 227, 227);
+
+    // per-channel mean subtraction only, no scaling
+    const float mean_vals[3] = {104.f, 117.f, 123.f};
+    in.substract_mean_normalize(mean_vals, 0);
+
+    ncnn::Extractor ex = squeezenet.create_extractor();
+
+    ex.input("data", in);
+
+    ncnn::Mat out;
+    if (ex.extract("prob", out) != 0)
+    {
+        fprintf(stderr, "detect_squeezenet: extracting blob prob failed\n");
+        return -1;
+    }
+
+    cls_scores.resize(out.w);
+    for (int j = 0; j < out.w; j++)
+    {
+        cls_scores[j] = out[j];
+    }
+
+    return 0;
+}
+
+// Print the topk highest scores to stderr, one "index = score" line each,
+// in descending score order.
+//
+// cls_scores  scores to rank; the position in the vector is printed as the index
+// topk        number of entries to print; clamped to [0, cls_scores.size()]
+//
+// Always returns 0.
+static int print_topk(const std::vector<float>& cls_scores, int topk)
+{
+    const int size = (int)cls_scores.size();
+
+    // vec.begin() + topk must not pass vec.end(): an empty or short score
+    // vector (e.g. after a failed inference) would otherwise be read out of range
+    if (topk > size)
+        topk = size;
+    if (topk < 0)
+        topk = 0;
+
+    // partial sort topk with index
+    std::vector<std::pair<float, int> > vec(size);
+    for (int i = 0; i < size; i++)
+    {
+        vec[i] = std::make_pair(cls_scores[i], i);
+    }
+
+    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
+                      std::greater<std::pair<float, int> >());
+
+    // print topk and score
+    for (int i = 0; i < topk; i++)
+    {
+        float score = vec[i].first;
+        int index = vec[i].second;
+        fprintf(stderr, "%d = %f\n", index, score);
+    }
+
+    return 0;
+}
+
+// Command-line driver: reads the image named by the single argument, passes
+// it to detect_squeezenet and prints the three best scores via print_topk.
+// Returns 0 on success, -1 on a usage error or when the image cannot be read.
+int main(int argc, char** argv)
+{
+    const char* program = argv[0];
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", program);
+        return -1;
+    }
+
+    const char* image_file = argv[1];
+
+    // second argument 1 is the imread flag used by all ncnn examples
+    cv::Mat input = cv::imread(image_file, 1);
+    if (input.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_file);
+        return -1;
+    }
+
+    std::vector<float> scores;
+    detect_squeezenet(input, scores);
+    print_topk(scores, 3);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/squeezenet_c_api.cpp
+++ b/3rdparty/ncnn/examples/squeezenet_c_api.cpp
@ -0,0 +1,121 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "c_api.h"
+
+#include <algorithm>
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+// Classify an image with the SqueezeNet v1.1 model through the ncnn C API.
+//
+// bgr         input image; its pixel buffer is read as packed BGR with a row
+//             stride of bgr.cols * 3 bytes and resized to 227x227
+// cls_scores  receives the first out_w values of the "prob" blob; left empty
+//             when the function fails
+//
+// Returns 0 on success and -1 when the model files cannot be loaded or the
+// "prob" blob cannot be extracted. Every ncnn handle created here is
+// destroyed before returning, on both the success and the failure path.
+static int detect_squeezenet(const cv::Mat& bgr, std::vector<float>& cls_scores)
+{
+    cls_scores.clear();
+
+    ncnn_net_t squeezenet = ncnn_net_create();
+
+    ncnn_option_t opt = ncnn_option_create();
+    ncnn_option_set_use_vulkan_compute(opt, 1);
+
+    ncnn_net_set_option(squeezenet, opt);
+
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    // both loaders report failure with a non-zero return value
+    if (ncnn_net_load_param(squeezenet, "squeezenet_v1.1.param") != 0
+            || ncnn_net_load_model(squeezenet, "squeezenet_v1.1.bin") != 0)
+    {
+        fprintf(stderr, "detect_squeezenet: failed to load squeezenet_v1.1 param/bin\n");
+        ncnn_option_destroy(opt);
+        ncnn_net_destroy(squeezenet);
+        return -1;
+    }
+
+    ncnn_mat_t in = ncnn_mat_from_pixels_resize(bgr.data, NCNN_MAT_PIXEL_BGR, bgr.cols, bgr.rows, bgr.cols * 3, 227, 227, NULL);
+
+    // per-channel mean subtraction only, no scaling
+    const float mean_vals[3] = {104.f, 117.f, 123.f};
+    ncnn_mat_substract_mean_normalize(in, mean_vals, 0);
+
+    ncnn_extractor_t ex = ncnn_extractor_create(squeezenet);
+
+    ncnn_extractor_input(ex, "data", in);
+
+    ncnn_mat_t out = NULL;
+    const int ret = ncnn_extractor_extract(ex, "prob", &out);
+
+    if (ret == 0)
+    {
+        const int out_w = ncnn_mat_get_w(out);
+        const float* out_data = (const float*)ncnn_mat_get_data(out);
+
+        cls_scores.resize(out_w);
+        for (int j = 0; j < out_w; j++)
+        {
+            cls_scores[j] = out_data[j];
+        }
+    }
+    else
+    {
+        fprintf(stderr, "detect_squeezenet: extracting blob prob failed\n");
+    }
+
+    ncnn_mat_destroy(in);
+    if (out)
+        ncnn_mat_destroy(out);
+
+    ncnn_extractor_destroy(ex);
+
+    ncnn_option_destroy(opt);
+
+    ncnn_net_destroy(squeezenet);
+
+    return ret == 0 ? 0 : -1;
+}
+
+// Print the topk highest scores to stderr, one "index = score" line each,
+// in descending score order.
+//
+// cls_scores  scores to rank; the position in the vector is printed as the index
+// topk        number of entries to print; clamped to [0, cls_scores.size()]
+//
+// Always returns 0.
+static int print_topk(const std::vector<float>& cls_scores, int topk)
+{
+    const int size = (int)cls_scores.size();
+
+    // vec.begin() + topk must not pass vec.end(): an empty or short score
+    // vector (e.g. after a failed inference) would otherwise be read out of range
+    if (topk > size)
+        topk = size;
+    if (topk < 0)
+        topk = 0;
+
+    // partial sort topk with index
+    std::vector<std::pair<float, int> > vec(size);
+    for (int i = 0; i < size; i++)
+    {
+        vec[i] = std::make_pair(cls_scores[i], i);
+    }
+
+    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
+                      std::greater<std::pair<float, int> >());
+
+    // print topk and score
+    for (int i = 0; i < topk; i++)
+    {
+        float score = vec[i].first;
+        int index = vec[i].second;
+        fprintf(stderr, "%d = %f\n", index, score);
+    }
+
+    return 0;
+}
+
+// Command-line driver: reads the image named by the single argument, passes
+// it to detect_squeezenet and prints the three best scores via print_topk.
+// Returns 0 on success, -1 on a usage error or when the image cannot be read.
+int main(int argc, char** argv)
+{
+    const char* program = argv[0];
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", program);
+        return -1;
+    }
+
+    const char* image_file = argv[1];
+
+    // second argument 1 is the imread flag used by all ncnn examples
+    cv::Mat input = cv::imread(image_file, 1);
+    if (input.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_file);
+        return -1;
+    }
+
+    std::vector<float> scores;
+    detect_squeezenet(input, scores);
+    print_topk(scores, 3);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/squeezenet_v1.1.bin
+++ b/3rdparty/ncnn/examples/squeezenet_v1.1.bin
--- a/3rdparty/ncnn/examples/squeezenet_v1.1.caffemodel
+++ b/3rdparty/ncnn/examples/squeezenet_v1.1.caffemodel
--- a/3rdparty/ncnn/examples/squeezenet_v1.1.param
+++ b/3rdparty/ncnn/examples/squeezenet_v1.1.param
@ -0,0 +1,77 @@
+7767517
+75 83
+Input            data             0 1 data 0=227 1=227 2=3
+Convolution      conv1            1 1 data conv1 0=64 1=3 2=1 3=2 4=0 5=1 6=1728
+ReLU             relu_conv1       1 1 conv1 conv1_relu_conv1 0=0.000000
+Pooling          pool1            1 1 conv1_relu_conv1 pool1 0=0 1=3 2=2 3=0 4=0
+Convolution      fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=1024
+ReLU             fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1 0=0.000000
+Split            splitncnn_0      1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1
+Convolution      fire2/expand1x1  1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024
+ReLU             fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/expand1x1_fire2/relu_expand1x1 0=0.000000
+Convolution      fire2/expand3x3  1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216
+ReLU             fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/expand3x3_fire2/relu_expand3x3 0=0.000000
+Concat           fire2/concat     2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat 0=0
+Convolution      fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=2048
+ReLU             fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1 0=0.000000
+Split            splitncnn_1      1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1
+Convolution      fire3/expand1x1  1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024
+ReLU             fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/expand1x1_fire3/relu_expand1x1 0=0.000000
+Convolution      fire3/expand3x3  1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216
+ReLU             fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/expand3x3_fire3/relu_expand3x3 0=0.000000
+Concat           fire3/concat     2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat 0=0
+Pooling          pool3            1 1 fire3/concat pool3 0=0 1=3 2=2 3=0 4=0
+Convolution      fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=4096
+ReLU             fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1 0=0.000000
+Split            splitncnn_2      1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1
+Convolution      fire4/expand1x1  1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096
+ReLU             fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/expand1x1_fire4/relu_expand1x1 0=0.000000
+Convolution      fire4/expand3x3  1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864
+ReLU             fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/expand3x3_fire4/relu_expand3x3 0=0.000000
+Concat           fire4/concat     2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat 0=0
+Convolution      fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=8192
+ReLU             fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1 0=0.000000
+Split            splitncnn_3      1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1
+Convolution      fire5/expand1x1  1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096
+ReLU             fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/expand1x1_fire5/relu_expand1x1 0=0.000000
+Convolution      fire5/expand3x3  1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864
+ReLU             fire5/relu_expand3x3 1 1 fire5/expand3x3 fire5/expand3x3_fire5/relu_expand3x3 0=0.000000
+Concat           fire5/concat     2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat 0=0
+Pooling          pool5            1 1 fire5/concat pool5 0=0 1=3 2=2 3=0 4=0
+Convolution      fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=12288
+ReLU             fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1 0=0.000000
+Split            splitncnn_4      1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1
+Convolution      fire6/expand1x1  1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216
+ReLU             fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/expand1x1_fire6/relu_expand1x1 0=0.000000
+Convolution      fire6/expand3x3  1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944
+ReLU             fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/expand3x3_fire6/relu_expand3x3 0=0.000000
+Concat           fire6/concat     2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat 0=0
+Convolution      fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=18432
+ReLU             fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1 0=0.000000
+Split            splitncnn_5      1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1
+Convolution      fire7/expand1x1  1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216
+ReLU             fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/expand1x1_fire7/relu_expand1x1 0=0.000000
+Convolution      fire7/expand3x3  1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944
+ReLU             fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/expand3x3_fire7/relu_expand3x3 0=0.000000
+Concat           fire7/concat     2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat 0=0
+Convolution      fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=24576
+ReLU             fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1 0=0.000000
+Split            splitncnn_6      1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1
+Convolution      fire8/expand1x1  1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384
+ReLU             fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/expand1x1_fire8/relu_expand1x1 0=0.000000
+Convolution      fire8/expand3x3  1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456
+ReLU             fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/expand3x3_fire8/relu_expand3x3 0=0.000000
+Concat           fire8/concat     2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat 0=0
+Convolution      fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=32768
+ReLU             fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1 0=0.000000
+Split            splitncnn_7      1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1
+Convolution      fire9/expand1x1  1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384
+ReLU             fire9/relu_expand1x1 1 1 fire9/expand1x1 fire9/expand1x1_fire9/relu_expand1x1 0=0.000000
+Convolution      fire9/expand3x3  1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456
+ReLU             fire9/relu_expand3x3 1 1 fire9/expand3x3 fire9/expand3x3_fire9/relu_expand3x3 0=0.000000
+Concat           fire9/concat     2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat 0=0
+Dropout          drop9            1 1 fire9/concat fire9/concat_drop9
+Convolution      conv10           1 1 fire9/concat_drop9 conv10 0=1000 1=1 2=1 3=1 4=1 5=1 6=512000
+ReLU             relu_conv10      1 1 conv10 conv10_relu_conv10 0=0.000000
+Pooling          pool10           1 1 conv10_relu_conv10 pool10 0=1 1=0 2=1 3=0 4=1
+Softmax          prob             1 1 pool10 prob 0=0
--- a/3rdparty/ncnn/examples/squeezenet_v1.1.param.bin
+++ b/3rdparty/ncnn/examples/squeezenet_v1.1.param.bin
--- a/3rdparty/ncnn/examples/squeezenet_v1.1.prototxt
+++ b/3rdparty/ncnn/examples/squeezenet_v1.1.prototxt
@ -0,0 +1,548 @@
+name: "squeezenet_v1.1_deploy"
+
+layer {
+  name: "data"
+  type: "Input"
+  top: "data"
+  input_param { shape: { dim: 1 dim: 3 dim: 227 dim: 227 } }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  convolution_param {
+    num_output: 64
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "relu_conv1"
+  type: "ReLU"
+  bottom: "conv1"
+  top: "conv1"
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "conv1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fire2/squeeze1x1"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "fire2/squeeze1x1"
+  convolution_param {
+    num_output: 16
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire2/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire2/squeeze1x1"
+  top: "fire2/squeeze1x1"
+}
+layer {
+  name: "fire2/expand1x1"
+  type: "Convolution"
+  bottom: "fire2/squeeze1x1"
+  top: "fire2/expand1x1"
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire2/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire2/expand1x1"
+  top: "fire2/expand1x1"
+}
+layer {
+  name: "fire2/expand3x3"
+  type: "Convolution"
+  bottom: "fire2/squeeze1x1"
+  top: "fire2/expand3x3"
+  convolution_param {
+    num_output: 64
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire2/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire2/expand3x3"
+  top: "fire2/expand3x3"
+}
+layer {
+  name: "fire2/concat"
+  type: "Concat"
+  bottom: "fire2/expand1x1"
+  bottom: "fire2/expand3x3"
+  top: "fire2/concat"
+}
+layer {
+  name: "fire3/squeeze1x1"
+  type: "Convolution"
+  bottom: "fire2/concat"
+  top: "fire3/squeeze1x1"
+  convolution_param {
+    num_output: 16
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire3/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire3/squeeze1x1"
+  top: "fire3/squeeze1x1"
+}
+layer {
+  name: "fire3/expand1x1"
+  type: "Convolution"
+  bottom: "fire3/squeeze1x1"
+  top: "fire3/expand1x1"
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire3/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire3/expand1x1"
+  top: "fire3/expand1x1"
+}
+layer {
+  name: "fire3/expand3x3"
+  type: "Convolution"
+  bottom: "fire3/squeeze1x1"
+  top: "fire3/expand3x3"
+  convolution_param {
+    num_output: 64
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire3/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire3/expand3x3"
+  top: "fire3/expand3x3"
+}
+layer {
+  name: "fire3/concat"
+  type: "Concat"
+  bottom: "fire3/expand1x1"
+  bottom: "fire3/expand3x3"
+  top: "fire3/concat"
+}
+layer {
+  name: "pool3"
+  type: "Pooling"
+  bottom: "fire3/concat"
+  top: "pool3"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fire4/squeeze1x1"
+  type: "Convolution"
+  bottom: "pool3"
+  top: "fire4/squeeze1x1"
+  convolution_param {
+    num_output: 32
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire4/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire4/squeeze1x1"
+  top: "fire4/squeeze1x1"
+}
+layer {
+  name: "fire4/expand1x1"
+  type: "Convolution"
+  bottom: "fire4/squeeze1x1"
+  top: "fire4/expand1x1"
+  convolution_param {
+    num_output: 128
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire4/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire4/expand1x1"
+  top: "fire4/expand1x1"
+}
+layer {
+  name: "fire4/expand3x3"
+  type: "Convolution"
+  bottom: "fire4/squeeze1x1"
+  top: "fire4/expand3x3"
+  convolution_param {
+    num_output: 128
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire4/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire4/expand3x3"
+  top: "fire4/expand3x3"
+}
+layer {
+  name: "fire4/concat"
+  type: "Concat"
+  bottom: "fire4/expand1x1"
+  bottom: "fire4/expand3x3"
+  top: "fire4/concat"
+}
+layer {
+  name: "fire5/squeeze1x1"
+  type: "Convolution"
+  bottom: "fire4/concat"
+  top: "fire5/squeeze1x1"
+  convolution_param {
+    num_output: 32
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire5/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire5/squeeze1x1"
+  top: "fire5/squeeze1x1"
+}
+layer {
+  name: "fire5/expand1x1"
+  type: "Convolution"
+  bottom: "fire5/squeeze1x1"
+  top: "fire5/expand1x1"
+  convolution_param {
+    num_output: 128
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire5/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire5/expand1x1"
+  top: "fire5/expand1x1"
+}
+layer {
+  name: "fire5/expand3x3"
+  type: "Convolution"
+  bottom: "fire5/squeeze1x1"
+  top: "fire5/expand3x3"
+  convolution_param {
+    num_output: 128
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire5/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire5/expand3x3"
+  top: "fire5/expand3x3"
+}
+layer {
+  name: "fire5/concat"
+  type: "Concat"
+  bottom: "fire5/expand1x1"
+  bottom: "fire5/expand3x3"
+  top: "fire5/concat"
+}
+layer {
+  name: "pool5"
+  type: "Pooling"
+  bottom: "fire5/concat"
+  top: "pool5"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fire6/squeeze1x1"
+  type: "Convolution"
+  bottom: "pool5"
+  top: "fire6/squeeze1x1"
+  convolution_param {
+    num_output: 48
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire6/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire6/squeeze1x1"
+  top: "fire6/squeeze1x1"
+}
+layer {
+  name: "fire6/expand1x1"
+  type: "Convolution"
+  bottom: "fire6/squeeze1x1"
+  top: "fire6/expand1x1"
+  convolution_param {
+    num_output: 192
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire6/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire6/expand1x1"
+  top: "fire6/expand1x1"
+}
+layer {
+  name: "fire6/expand3x3"
+  type: "Convolution"
+  bottom: "fire6/squeeze1x1"
+  top: "fire6/expand3x3"
+  convolution_param {
+    num_output: 192
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire6/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire6/expand3x3"
+  top: "fire6/expand3x3"
+}
+layer {
+  name: "fire6/concat"
+  type: "Concat"
+  bottom: "fire6/expand1x1"
+  bottom: "fire6/expand3x3"
+  top: "fire6/concat"
+}
+layer {
+  name: "fire7/squeeze1x1"
+  type: "Convolution"
+  bottom: "fire6/concat"
+  top: "fire7/squeeze1x1"
+  convolution_param {
+    num_output: 48
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire7/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire7/squeeze1x1"
+  top: "fire7/squeeze1x1"
+}
+layer {
+  name: "fire7/expand1x1"
+  type: "Convolution"
+  bottom: "fire7/squeeze1x1"
+  top: "fire7/expand1x1"
+  convolution_param {
+    num_output: 192
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire7/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire7/expand1x1"
+  top: "fire7/expand1x1"
+}
+layer {
+  name: "fire7/expand3x3"
+  type: "Convolution"
+  bottom: "fire7/squeeze1x1"
+  top: "fire7/expand3x3"
+  convolution_param {
+    num_output: 192
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire7/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire7/expand3x3"
+  top: "fire7/expand3x3"
+}
+layer {
+  name: "fire7/concat"
+  type: "Concat"
+  bottom: "fire7/expand1x1"
+  bottom: "fire7/expand3x3"
+  top: "fire7/concat"
+}
+layer {
+  name: "fire8/squeeze1x1"
+  type: "Convolution"
+  bottom: "fire7/concat"
+  top: "fire8/squeeze1x1"
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire8/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire8/squeeze1x1"
+  top: "fire8/squeeze1x1"
+}
+layer {
+  name: "fire8/expand1x1"
+  type: "Convolution"
+  bottom: "fire8/squeeze1x1"
+  top: "fire8/expand1x1"
+  convolution_param {
+    num_output: 256
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire8/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire8/expand1x1"
+  top: "fire8/expand1x1"
+}
+layer {
+  name: "fire8/expand3x3"
+  type: "Convolution"
+  bottom: "fire8/squeeze1x1"
+  top: "fire8/expand3x3"
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire8/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire8/expand3x3"
+  top: "fire8/expand3x3"
+}
+layer {
+  name: "fire8/concat"
+  type: "Concat"
+  bottom: "fire8/expand1x1"
+  bottom: "fire8/expand3x3"
+  top: "fire8/concat"
+}
+layer {
+  name: "fire9/squeeze1x1"
+  type: "Convolution"
+  bottom: "fire8/concat"
+  top: "fire9/squeeze1x1"
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire9/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire9/squeeze1x1"
+  top: "fire9/squeeze1x1"
+}
+layer {
+  name: "fire9/expand1x1"
+  type: "Convolution"
+  bottom: "fire9/squeeze1x1"
+  top: "fire9/expand1x1"
+  convolution_param {
+    num_output: 256
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire9/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire9/expand1x1"
+  top: "fire9/expand1x1"
+}
+layer {
+  name: "fire9/expand3x3"
+  type: "Convolution"
+  bottom: "fire9/squeeze1x1"
+  top: "fire9/expand3x3"
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire9/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire9/expand3x3"
+  top: "fire9/expand3x3"
+}
+layer {
+  name: "fire9/concat"
+  type: "Concat"
+  bottom: "fire9/expand1x1"
+  bottom: "fire9/expand3x3"
+  top: "fire9/concat"
+}
+layer {
+  name: "drop9"
+  type: "Dropout"
+  bottom: "fire9/concat"
+  top: "fire9/concat"
+  dropout_param {
+    dropout_ratio: 0.5
+  }
+}
+layer {
+  name: "conv10"
+  type: "Convolution"
+  bottom: "fire9/concat"
+  top: "conv10"
+  convolution_param {
+    num_output: 1000
+    pad: 1
+    kernel_size: 1
+  }
+}
+layer {
+  name: "relu_conv10"
+  type: "ReLU"
+  bottom: "conv10"
+  top: "conv10"
+}
+layer {
+  name: "pool10"
+  type: "Pooling"
+  bottom: "conv10"
+  top: "pool10"
+  pooling_param {
+    pool: AVE
+    global_pooling: true
+  }
+}
+layer {
+  name: "prob"
+  type: "Softmax"
+  bottom: "pool10"
+  top: "prob"
+}
--- a/3rdparty/ncnn/examples/squeezenetssd.cpp
+++ b/3rdparty/ncnn/examples/squeezenetssd.cpp
@ -0,0 +1,152 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+struct Object // one detection produced by detect_squeezenet
+{
+    cv::Rect_<float> rect; // box; coordinates are the network values scaled by the source image width/height
+    int label;             // class id read from the first value of a network output row
+    float prob;            // score read from the second value of a network output row
+};
+
+// Runs the SqueezeNet-SSD model on a BGR image and stores every reported
+// detection in `objects`.
+//
+// bgr      input image; it is resized to 300x300 before inference
+// objects  cleared first, then filled with one entry per output row; the box
+//          values of each row are scaled by the size of `bgr`
+//
+// Returns 0 on success and -1 when the model files cannot be loaded (in that
+// case `objects` is left empty).
+static int detect_squeezenet(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    // clear up front so that every exit path hands back a well-defined result
+    objects.clear();
+
+    ncnn::Net squeezenet;
+
+    squeezenet.opt.use_vulkan_compute = true;
+
+    // original pretrained model from https://github.com/chuanqi305/SqueezeNet-SSD
+    // squeezenet_ssd_voc_deploy.prototxt
+    // https://drive.google.com/open?id=0B3gersZ2cHIxdGpyZlZnbEQ5Snc
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (squeezenet.load_param("squeezenet_ssd_voc.param") != 0)
+    {
+        fprintf(stderr, "load_param squeezenet_ssd_voc.param failed\n");
+        return -1;
+    }
+    if (squeezenet.load_model("squeezenet_ssd_voc.bin") != 0)
+    {
+        fprintf(stderr, "load_model squeezenet_ssd_voc.bin failed\n");
+        return -1;
+    }
+
+    const int target_size = 300;
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, target_size, target_size);
+
+    // only the per-channel mean is subtracted, no scaling is applied
+    const float mean_vals[3] = {104.f, 117.f, 123.f};
+    in.substract_mean_normalize(mean_vals, 0);
+
+    ncnn::Extractor ex = squeezenet.create_extractor();
+
+    ex.input("data", in);
+
+    ncnn::Mat out;
+    ex.extract("detection_out", out);
+
+    //     printf("%d %d %d\n", out.w, out.h, out.c);
+    for (int i = 0; i < out.h; i++)
+    {
+        // row layout as consumed below: label, score, then four box values
+        // that are treated as left/top/right/bottom fractions of the image
+        const float* values = out.row(i);
+
+        Object object;
+        object.label = (int)values[0];
+        object.prob = values[1];
+        object.rect.x = values[2] * img_w;
+        object.rect.y = values[3] * img_h;
+        object.rect.width = values[4] * img_w - object.rect.x;
+        object.rect.height = values[5] * img_h - object.rect.y;
+
+        objects.push_back(object);
+    }
+
+    return 0;
+}
+
+// Prints every detection to stderr, draws its box and a "<class> <score>%"
+// caption on a copy of `bgr`, then hands the picture to cv::imshow and waits
+// in cv::waitKey(0).
+//
+// bgr      image the detections refer to; it is not modified
+// objects  detections to draw; a label outside the class table is captioned
+//          as "unknown" instead of being used as an array index
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {"background",
+                                        "aeroplane", "bicycle", "bird", "boat",
+                                        "bottle", "bus", "car", "cat", "chair",
+                                        "cow", "diningtable", "dog", "horse",
+                                        "motorbike", "person", "pottedplant",
+                                        "sheep", "sofa", "train", "tvmonitor"
+                                       };
+    const int num_classes = (int)(sizeof(class_names) / sizeof(class_names[0]));
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        // the label comes straight from the network output, so validate it
+        // before it is used to index the class table
+        const char* class_name = "unknown";
+        if (obj.label >= 0 && obj.label < num_classes)
+            class_name = class_names[obj.label];
+
+        char text[256];
+        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // place the caption above the box, pulled back inside the image when
+        // it would stick out at the top or on the right
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        // white filled background behind the black caption text
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Entry point: loads the image named on the command line, runs the detector
+// and displays the result.
+//
+// Returns 0 on success and -1 on wrong usage, an unreadable image or a
+// detector failure.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    // flag 1 asks cv::imread for a 3-channel colour image
+    cv::Mat m = cv::imread(imagepath, 1);
+    if (m.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    std::vector<Object> objects;
+    // do not open a result window when the detector reports a failure
+    if (detect_squeezenet(m, objects) != 0)
+    {
+        fprintf(stderr, "detect_squeezenet failed\n");
+        return -1;
+    }
+
+    draw_objects(m, objects);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/synset_words.txt
+++ b/3rdparty/ncnn/examples/synset_words.txt
--- a/3rdparty/ncnn/examples/yolact.cpp
+++ b/3rdparty/ncnn/examples/yolact.cpp
@ -0,0 +1,544 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+struct Object // one instance detection produced by detect_yolact
+{
+    cv::Rect_<float> rect;       // box in source-image pixels, clipped to the image by detect_yolact
+    int label;                   // index of the best-scoring class (index 0 is skipped as background)
+    float prob;                  // score of that class
+    std::vector<float> maskdata; // mask coefficients copied from this prior's row of the "818" blob
+    cv::Mat mask;                // image-sized CV_8UC1 mask: 255 inside the box where the resized mask exceeds 0.5, else 0
+};
+
+// Area shared by the boxes of two detections, i.e. the area of the rectangle
+// obtained by intersecting a.rect with b.rect.
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    return (a.rect & b.rect).area();
+}
+
+// Sorts objects[left..right] (both ends inclusive) in place by descending
+// score. The range is partitioned around the score of its middle element and
+// the two halves are then sorted recursively, each in its own OpenMP section.
+static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
+{
+    const float pivot = objects[(left + right) / 2].prob;
+
+    int lo = left;
+    int hi = right;
+
+    for (;;)
+    {
+        if (lo > hi)
+            break;
+
+        // skip entries that already sit on the correct side of the pivot
+        for (; objects[lo].prob > pivot; lo++)
+        {
+        }
+        for (; objects[hi].prob < pivot; hi--)
+        {
+        }
+
+        if (lo > hi)
+            break;
+
+        // both cursors stopped on misplaced entries: exchange and move on
+        std::swap(objects[lo], objects[hi]);
+        lo++;
+        hi--;
+    }
+
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < hi) qsort_descent_inplace(objects, left, hi);
+        }
+        #pragma omp section
+        {
+            if (lo < right) qsort_descent_inplace(objects, lo, right);
+        }
+    }
+}
+
+// Sorts the whole vector in place by descending score; an empty vector is
+// left untouched.
+static void qsort_descent_inplace(std::vector<Object>& objects)
+{
+    if (!objects.empty())
+    {
+        qsort_descent_inplace(objects, 0, objects.size() - 1);
+    }
+}
+
+// Non-maximum suppression over `objects`, visited in the given order.
+//
+// objects        candidates (the callers in this file sort them by
+//                descending score first)
+// picked         output: indices into `objects` of the boxes that are kept
+// nms_threshold  a candidate is dropped when its intersection-over-union
+//                with any already kept box is greater than this value
+static void nms_sorted_bboxes(const std::vector<Object>& objects, std::vector<int>& picked, float nms_threshold)
+{
+    picked.clear();
+
+    const int count = objects.size();
+
+    // every box area is needed many times, so compute each one once
+    std::vector<float> areas(count);
+    for (int k = 0; k < count; k++)
+        areas[k] = objects[k].rect.area();
+
+    for (int cand = 0; cand < count; cand++)
+    {
+        const Object& current = objects[cand];
+
+        bool suppressed = false;
+        for (int k = 0; k < (int)picked.size(); k++)
+        {
+            const int kept = picked[k];
+
+            // intersection over union against an already accepted box
+            const float overlap = intersection_area(current, objects[kept]);
+            const float combined = areas[cand] + areas[kept] - overlap;
+            if (overlap / combined > nms_threshold)
+                suppressed = true;
+        }
+
+        if (!suppressed)
+            picked.push_back(cand);
+    }
+}
+
+// Runs the YOLACT instance-segmentation model on a BGR image.
+//
+// bgr      input image; it is resized to 550x550 and converted to RGB
+// objects  cleared first, then filled with at most 200 detections sorted by
+//          descending score, each with a box in source-image pixels and an
+//          image-sized binary mask
+//
+// Returns 0 on success and -1 when the model cannot be loaded or does not
+// deliver the expected output blobs (`objects` is left empty in that case).
+static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    // clear up front so that every exit path hands back a well-defined result
+    objects.clear();
+
+    ncnn::Net yolact;
+
+    yolact.opt.use_vulkan_compute = true;
+
+    // original model converted from https://github.com/dbolya/yolact
+    // yolact_resnet50_54_800000.pth
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (yolact.load_param("yolact.param") != 0)
+    {
+        fprintf(stderr, "load_param yolact.param failed\n");
+        return -1;
+    }
+    if (yolact.load_model("yolact.bin") != 0)
+    {
+        fprintf(stderr, "load_model yolact.bin failed\n");
+        return -1;
+    }
+
+    const int target_size = 550;
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, target_size, target_size);
+
+    const float mean_vals[3] = {123.68f, 116.78f, 103.94f};
+    const float norm_vals[3] = {1.0 / 58.40f, 1.0 / 57.12f, 1.0 / 57.38f};
+    in.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = yolact.create_extractor();
+
+    ex.input("input.1", in);
+
+    ncnn::Mat maskmaps;
+    ncnn::Mat location;
+    ncnn::Mat mask;
+    ncnn::Mat confidence;
+
+    ex.extract("619", maskmaps); // 138x138 x 32
+
+    ex.extract("816", location);   // 4 x 19248
+    ex.extract("818", mask);       // maskdim 32 x 19248
+    ex.extract("820", confidence); // 81 x 19248
+
+    // without these blobs the prior box table below would have no storage
+    // and the code filling it would write through a null pointer
+    if (maskmaps.empty() || location.empty() || mask.empty() || confidence.empty())
+    {
+        fprintf(stderr, "yolact output blobs are missing\n");
+        return -1;
+    }
+
+    int num_class = confidence.w;
+    int num_priors = confidence.h;
+
+    // make priorbox
+    ncnn::Mat priorbox(4, num_priors);
+    {
+        const int conv_ws[5] = {69, 35, 18, 9, 5};
+        const int conv_hs[5] = {69, 35, 18, 9, 5};
+
+        const float aspect_ratios[3] = {1.f, 0.5f, 2.f};
+        const float scales[5] = {24.f, 48.f, 96.f, 192.f, 384.f};
+
+        float* pb = priorbox;
+
+        for (int p = 0; p < 5; p++)
+        {
+            int conv_w = conv_ws[p];
+            int conv_h = conv_hs[p];
+
+            float scale = scales[p];
+
+            for (int i = 0; i < conv_h; i++)
+            {
+                for (int j = 0; j < conv_w; j++)
+                {
+                    // +0.5 because priors are in center-size notation
+                    float cx = (j + 0.5f) / conv_w;
+                    float cy = (i + 0.5f) / conv_h;
+
+                    for (int k = 0; k < 3; k++)
+                    {
+                        float ar = aspect_ratios[k];
+
+                        ar = sqrt(ar);
+
+                        float w = scale * ar / 550;
+                        float h = scale / ar / 550;
+
+                        // This is for backward compatibility with a bug where I made everything square by accident
+                        // cfg.backbone.use_square_anchors:
+                        h = w;
+
+                        pb[0] = cx;
+                        pb[1] = cy;
+                        pb[2] = w;
+                        pb[3] = h;
+
+                        pb += 4;
+                    }
+                }
+            }
+        }
+    }
+
+    const float confidence_thresh = 0.05f;
+    const float nms_threshold = 0.5f;
+    const int keep_top_k = 200;
+
+    // candidates are collected per class so that suppression only compares
+    // boxes of the same class
+    std::vector<std::vector<Object> > class_candidates;
+    class_candidates.resize(num_class);
+
+    for (int i = 0; i < num_priors; i++)
+    {
+        const float* conf = confidence.row(i);
+        const float* loc = location.row(i);
+        const float* pb = priorbox.row(i);
+        const float* maskdata = mask.row(i);
+
+        // find class id with highest score
+        // start from 1 to skip background
+        int label = 0;
+        float score = 0.f;
+        for (int j = 1; j < num_class; j++)
+        {
+            float class_score = conf[j];
+            if (class_score > score)
+            {
+                label = j;
+                score = class_score;
+            }
+        }
+
+        // ignore background or low score
+        if (label == 0 || score <= confidence_thresh)
+            continue;
+
+        // CENTER_SIZE
+        float var[4] = {0.1f, 0.1f, 0.2f, 0.2f};
+
+        float pb_cx = pb[0];
+        float pb_cy = pb[1];
+        float pb_w = pb[2];
+        float pb_h = pb[3];
+
+        // decode the predicted offsets relative to the prior box
+        float bbox_cx = var[0] * loc[0] * pb_w + pb_cx;
+        float bbox_cy = var[1] * loc[1] * pb_h + pb_cy;
+        float bbox_w = (float)(exp(var[2] * loc[2]) * pb_w);
+        float bbox_h = (float)(exp(var[3] * loc[3]) * pb_h);
+
+        float obj_x1 = bbox_cx - bbox_w * 0.5f;
+        float obj_y1 = bbox_cy - bbox_h * 0.5f;
+        float obj_x2 = bbox_cx + bbox_w * 0.5f;
+        float obj_y2 = bbox_cy + bbox_h * 0.5f;
+
+        // clip
+        obj_x1 = std::max(std::min(obj_x1 * bgr.cols, (float)(bgr.cols - 1)), 0.f);
+        obj_y1 = std::max(std::min(obj_y1 * bgr.rows, (float)(bgr.rows - 1)), 0.f);
+        obj_x2 = std::max(std::min(obj_x2 * bgr.cols, (float)(bgr.cols - 1)), 0.f);
+        obj_y2 = std::max(std::min(obj_y2 * bgr.rows, (float)(bgr.rows - 1)), 0.f);
+
+        // append object
+        Object obj;
+        obj.rect = cv::Rect_<float>(obj_x1, obj_y1, obj_x2 - obj_x1 + 1, obj_y2 - obj_y1 + 1);
+        obj.label = label;
+        obj.prob = score;
+        obj.maskdata = std::vector<float>(maskdata, maskdata + mask.w);
+
+        class_candidates[label].push_back(obj);
+    }
+
+    // per class: sort by score, suppress overlapping boxes, keep the rest
+    for (int i = 0; i < (int)class_candidates.size(); i++)
+    {
+        std::vector<Object>& candidates = class_candidates[i];
+
+        qsort_descent_inplace(candidates);
+
+        std::vector<int> picked;
+        nms_sorted_bboxes(candidates, picked, nms_threshold);
+
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            int z = picked[j];
+            objects.push_back(candidates[z]);
+        }
+    }
+
+    qsort_descent_inplace(objects);
+
+    // keep_top_k
+    if (keep_top_k < (int)objects.size())
+    {
+        objects.resize(keep_top_k);
+    }
+
+    // generate mask
+    for (int i = 0; i < (int)objects.size(); i++)
+    {
+        Object& obj = objects[i];
+
+        // note: this cv::Mat intentionally hides the ncnn blob named `mask`,
+        // which is no longer needed at this point
+        cv::Mat mask(maskmaps.h, maskmaps.w, CV_32FC1);
+        {
+            mask = cv::Scalar(0.f);
+
+            for (int p = 0; p < maskmaps.c; p++)
+            {
+                const float* maskmap = maskmaps.channel(p);
+                float coeff = obj.maskdata[p];
+                float* mp = (float*)mask.data;
+
+                // mask += m * coeff
+                for (int j = 0; j < maskmaps.w * maskmaps.h; j++)
+                {
+                    mp[j] += maskmap[j] * coeff;
+                }
+            }
+        }
+
+        cv::Mat mask2;
+        cv::resize(mask, mask2, cv::Size(img_w, img_h));
+
+        // crop obj box and binarize
+        obj.mask = cv::Mat(img_h, img_w, CV_8UC1);
+        {
+            obj.mask = cv::Scalar(0);
+
+            for (int y = 0; y < img_h; y++)
+            {
+                if (y < obj.rect.y || y > obj.rect.y + obj.rect.height)
+                    continue;
+
+                const float* mp2 = mask2.ptr<const float>(y);
+                uchar* bmp = obj.mask.ptr<uchar>(y);
+
+                for (int x = 0; x < img_w; x++)
+                {
+                    if (x < obj.rect.x || x > obj.rect.x + obj.rect.width)
+                        continue;
+
+                    bmp[x] = mp2[x] > 0.5f ? 255 : 0;
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+// Prints every detection with a score of at least 0.15 to stderr and draws
+// its box, a "<class> <score>%" caption and a half-transparent mask overlay
+// on a copy of `bgr`. The picture is written to "result.png", handed to
+// cv::imshow and the function then waits in cv::waitKey(0).
+//
+// bgr      image the detections refer to; it is not modified
+// objects  detections to draw; each mask is read row by row with the
+//          dimensions of `bgr`. A label outside the class table is captioned
+//          as "unknown" instead of being used as an array index.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {"background",
+                                        "person", "bicycle", "car", "motorcycle", "airplane", "bus",
+                                        "train", "truck", "boat", "traffic light", "fire hydrant",
+                                        "stop sign", "parking meter", "bench", "bird", "cat", "dog",
+                                        "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
+                                        "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+                                        "skis", "snowboard", "sports ball", "kite", "baseball bat",
+                                        "baseball glove", "skateboard", "surfboard", "tennis racket",
+                                        "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
+                                        "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
+                                        "hot dog", "pizza", "donut", "cake", "chair", "couch",
+                                        "potted plant", "bed", "dining table", "toilet", "tv", "laptop",
+                                        "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
+                                        "toaster", "sink", "refrigerator", "book", "clock", "vase",
+                                        "scissors", "teddy bear", "hair drier", "toothbrush"
+                                       };
+    const int num_classes = (int)(sizeof(class_names) / sizeof(class_names[0]));
+
+    static const unsigned char colors[81][3] = {
+        {56, 0, 255},
+        {226, 255, 0},
+        {0, 94, 255},
+        {0, 37, 255},
+        {0, 255, 94},
+        {255, 226, 0},
+        {0, 18, 255},
+        {255, 151, 0},
+        {170, 0, 255},
+        {0, 255, 56},
+        {255, 0, 75},
+        {0, 75, 255},
+        {0, 255, 169},
+        {255, 0, 207},
+        {75, 255, 0},
+        {207, 0, 255},
+        {37, 0, 255},
+        {0, 207, 255},
+        {94, 0, 255},
+        {0, 255, 113},
+        {255, 18, 0},
+        {255, 0, 56},
+        {18, 0, 255},
+        {0, 255, 226},
+        {170, 255, 0},
+        {255, 0, 245},
+        {151, 255, 0},
+        {132, 255, 0},
+        {75, 0, 255},
+        {151, 0, 255},
+        {0, 151, 255},
+        {132, 0, 255},
+        {0, 255, 245},
+        {255, 132, 0},
+        {226, 0, 255},
+        {255, 37, 0},
+        {207, 255, 0},
+        {0, 255, 207},
+        {94, 255, 0},
+        {0, 226, 255},
+        {56, 255, 0},
+        {255, 94, 0},
+        {255, 113, 0},
+        {0, 132, 255},
+        {255, 0, 132},
+        {255, 170, 0},
+        {255, 0, 188},
+        {113, 255, 0},
+        {245, 0, 255},
+        {113, 0, 255},
+        {255, 188, 0},
+        {0, 113, 255},
+        {255, 0, 0},
+        {0, 56, 255},
+        {255, 0, 113},
+        {0, 255, 188},
+        {255, 0, 94},
+        {255, 0, 18},
+        {18, 255, 0},
+        {0, 255, 132},
+        {0, 188, 255},
+        {0, 245, 255},
+        {0, 169, 255},
+        {37, 255, 0},
+        {255, 0, 151},
+        {188, 0, 255},
+        {0, 255, 37},
+        {0, 255, 0},
+        {255, 0, 170},
+        {255, 0, 37},
+        {255, 75, 0},
+        {0, 0, 255},
+        {255, 207, 0},
+        {255, 0, 226},
+        {255, 245, 0},
+        {188, 255, 0},
+        {0, 255, 18},
+        {0, 255, 75},
+        {0, 255, 151},
+        {255, 56, 0},
+        {245, 255, 0}
+    };
+
+    cv::Mat image = bgr.clone();
+
+    // colours are handed out in drawing order, not per class
+    int color_index = 0;
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        if (obj.prob < 0.15)
+            continue;
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        const unsigned char* color = colors[color_index % 81];
+        color_index++;
+
+        cv::rectangle(image, obj.rect, cv::Scalar(color[0], color[1], color[2]));
+
+        // validate the label before it is used to index the class table
+        const char* class_name = "unknown";
+        if (obj.label >= 0 && obj.label < num_classes)
+            class_name = class_names[obj.label];
+
+        char text[256];
+        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // place the caption above the box, pulled back inside the image when
+        // it would stick out at the top or on the right
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+
+        // draw mask: blend the object colour 50/50 into every masked pixel
+        for (int row = 0; row < image.rows; row++)
+        {
+            const uchar* mp = obj.mask.ptr(row);
+            uchar* p = image.ptr(row);
+            for (int col = 0; col < image.cols; col++)
+            {
+                if (mp[col] == 255)
+                {
+                    p[0] = cv::saturate_cast<uchar>(p[0] * 0.5 + color[0] * 0.5);
+                    p[1] = cv::saturate_cast<uchar>(p[1] * 0.5 + color[1] * 0.5);
+                    p[2] = cv::saturate_cast<uchar>(p[2] * 0.5 + color[2] * 0.5);
+                }
+                p += 3;
+            }
+        }
+    }
+
+    cv::imwrite("result.png", image);
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Entry point: loads the image named on the command line, runs the YOLACT
+// detector and displays the result.
+//
+// Returns 0 on success and -1 on wrong usage, an unreadable image or a
+// detector failure.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    // flag 1 asks cv::imread for a 3-channel colour image
+    cv::Mat m = cv::imread(imagepath, 1);
+    if (m.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    std::vector<Object> objects;
+    // do not open a result window when the detector reports a failure
+    if (detect_yolact(m, objects) != 0)
+    {
+        fprintf(stderr, "detect_yolact failed\n");
+        return -1;
+    }
+
+    draw_objects(m, objects);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/yolov2.cpp
+++ b/3rdparty/ncnn/examples/yolov2.cpp
@ -0,0 +1,156 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+struct Object // one detection produced by detect_yolov2
+{
+    cv::Rect_<float> rect; // box; coordinates are the network values scaled by the source image width/height
+    int label;             // class id read from the first value of a network output row
+    float prob;            // score read from the second value of a network output row
+};
+
+// Runs the MobileNet-YOLOv2 model on a BGR image and stores every reported
+// detection in `objects`.
+//
+// bgr      input image; it is resized to 416x416 before inference
+// objects  cleared first, then filled with one entry per output row; the box
+//          values of each row are scaled by the size of `bgr`
+//
+// Returns 0 on success and -1 when the model files cannot be loaded (in that
+// case `objects` is left empty).
+static int detect_yolov2(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    // clear up front so that every exit path hands back a well-defined result
+    objects.clear();
+
+    ncnn::Net yolov2;
+
+    yolov2.opt.use_vulkan_compute = true;
+
+    // original pretrained model from https://github.com/eric612/MobileNet-YOLO
+    // https://github.com/eric612/MobileNet-YOLO/blob/master/models/yolov2/mobilenet_yolo_deploy.prototxt
+    // https://github.com/eric612/MobileNet-YOLO/blob/master/models/yolov2/mobilenet_yolo_deploy_iter_80000.caffemodel
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (yolov2.load_param("mobilenet_yolo.param") != 0)
+    {
+        fprintf(stderr, "load_param mobilenet_yolo.param failed\n");
+        return -1;
+    }
+    if (yolov2.load_model("mobilenet_yolo.bin") != 0)
+    {
+        fprintf(stderr, "load_model mobilenet_yolo.bin failed\n");
+        return -1;
+    }
+
+    const int target_size = 416;
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, target_size, target_size);
+
+    // the Caffe-YOLOv2-Windows style
+    // X' = X * scale - mean
+    // done in two passes: the first call only scales, the second only
+    // subtracts the mean from the already scaled values
+    const float mean_vals[3] = {1.0f, 1.0f, 1.0f};
+    const float norm_vals[3] = {0.007843f, 0.007843f, 0.007843f};
+    in.substract_mean_normalize(0, norm_vals);
+    in.substract_mean_normalize(mean_vals, 0);
+
+    ncnn::Extractor ex = yolov2.create_extractor();
+
+    ex.input("data", in);
+
+    ncnn::Mat out;
+    ex.extract("detection_out", out);
+
+    //     printf("%d %d %d\n", out.w, out.h, out.c);
+    for (int i = 0; i < out.h; i++)
+    {
+        // row layout as consumed below: label, score, then four box values
+        // that are treated as left/top/right/bottom fractions of the image
+        const float* values = out.row(i);
+
+        Object object;
+        object.label = (int)values[0];
+        object.prob = values[1];
+        object.rect.x = values[2] * img_w;
+        object.rect.y = values[3] * img_h;
+        object.rect.width = values[4] * img_w - object.rect.x;
+        object.rect.height = values[5] * img_h - object.rect.y;
+
+        objects.push_back(object);
+    }
+
+    return 0;
+}
+
+// Prints every detection to stderr, draws its box and a "<class> <score>%"
+// caption on a copy of `bgr`, then hands the picture to cv::imshow and waits
+// in cv::waitKey(0).
+//
+// bgr      image the detections refer to; it is not modified
+// objects  detections to draw; a label outside the class table is captioned
+//          as "unknown" instead of being used as an array index
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {"background",
+                                        "aeroplane", "bicycle", "bird", "boat",
+                                        "bottle", "bus", "car", "cat", "chair",
+                                        "cow", "diningtable", "dog", "horse",
+                                        "motorbike", "person", "pottedplant",
+                                        "sheep", "sofa", "train", "tvmonitor"
+                                       };
+    const int num_classes = (int)(sizeof(class_names) / sizeof(class_names[0]));
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        // the label comes straight from the network output, so validate it
+        // before it is used to index the class table
+        const char* class_name = "unknown";
+        if (obj.label >= 0 && obj.label < num_classes)
+            class_name = class_names[obj.label];
+
+        char text[256];
+        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // place the caption above the box, pulled back inside the image when
+        // it would stick out at the top or on the right
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        // white filled background behind the black caption text
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Entry point: loads the image named on the command line, runs the YOLOv2
+// detector and displays the result.
+//
+// Returns 0 on success and -1 on wrong usage, an unreadable image or a
+// detector failure.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    // flag 1 asks cv::imread for a 3-channel colour image
+    cv::Mat m = cv::imread(imagepath, 1);
+    if (m.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    std::vector<Object> objects;
+    // do not open a result window when the detector reports a failure
+    if (detect_yolov2(m, objects) != 0)
+    {
+        fprintf(stderr, "detect_yolov2 failed\n");
+        return -1;
+    }
+
+    draw_objects(m, objects);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/yolov3.cpp
+++ b/3rdparty/ncnn/examples/yolov3.cpp
@ -0,0 +1,153 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+struct Object // one detection produced by detect_yolov3
+{
+    cv::Rect_<float> rect; // box; coordinates are the network values scaled by the source image width/height
+    int label;             // class id read from the first value of a network output row
+    float prob;            // score read from the second value of a network output row
+};
+
+// Runs the MobileNetV2-YOLOv3 model on a BGR image and stores every reported
+// detection in `objects`.
+//
+// bgr      input image; it is resized to 352x352 before inference
+// objects  cleared first, then filled with one entry per output row; the box
+//          values of each row are scaled by the size of `bgr`
+//
+// Returns 0 on success and -1 when the model files cannot be loaded (in that
+// case `objects` is left empty).
+static int detect_yolov3(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    // clear up front so that every exit path hands back a well-defined result
+    objects.clear();
+
+    ncnn::Net yolov3;
+
+    yolov3.opt.use_vulkan_compute = true;
+
+    // original pretrained model from https://github.com/eric612/MobileNet-YOLO
+    // param : https://drive.google.com/open?id=1V9oKHP6G6XvXZqhZbzNKL6FI_clRWdC-
+    // bin : https://drive.google.com/open?id=1DBcuFCr-856z3FRQznWL_S5h-Aj3RawA
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (yolov3.load_param("mobilenetv2_yolov3.param") != 0)
+    {
+        fprintf(stderr, "load_param mobilenetv2_yolov3.param failed\n");
+        return -1;
+    }
+    if (yolov3.load_model("mobilenetv2_yolov3.bin") != 0)
+    {
+        fprintf(stderr, "load_model mobilenetv2_yolov3.bin failed\n");
+        return -1;
+    }
+
+    const int target_size = 352;
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, target_size, target_size);
+
+    // mean subtraction and scaling are applied in a single call here
+    const float mean_vals[3] = {127.5f, 127.5f, 127.5f};
+    const float norm_vals[3] = {0.007843f, 0.007843f, 0.007843f};
+    in.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = yolov3.create_extractor();
+
+    ex.input("data", in);
+
+    ncnn::Mat out;
+    ex.extract("detection_out", out);
+
+    //     printf("%d %d %d\n", out.w, out.h, out.c);
+    for (int i = 0; i < out.h; i++)
+    {
+        // row layout as consumed below: label, score, then four box values
+        // that are treated as left/top/right/bottom fractions of the image
+        const float* values = out.row(i);
+
+        Object object;
+        object.label = (int)values[0];
+        object.prob = values[1];
+        object.rect.x = values[2] * img_w;
+        object.rect.y = values[3] * img_h;
+        object.rect.width = values[4] * img_w - object.rect.x;
+        object.rect.height = values[5] * img_h - object.rect.y;
+
+        objects.push_back(object);
+    }
+
+    return 0;
+}
+
+// Prints every detection to stderr, draws its box and a "<class> <score>%"
+// caption on a copy of `bgr`, then hands the picture to cv::imshow and waits
+// in cv::waitKey(0).
+//
+// bgr      image the detections refer to; it is not modified
+// objects  detections to draw; a label outside the class table is captioned
+//          as "unknown" instead of being used as an array index
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {"background",
+                                        "aeroplane", "bicycle", "bird", "boat",
+                                        "bottle", "bus", "car", "cat", "chair",
+                                        "cow", "diningtable", "dog", "horse",
+                                        "motorbike", "person", "pottedplant",
+                                        "sheep", "sofa", "train", "tvmonitor"
+                                       };
+    const int num_classes = (int)(sizeof(class_names) / sizeof(class_names[0]));
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        // the label comes straight from the network output, so validate it
+        // before it is used to index the class table
+        const char* class_name = "unknown";
+        if (obj.label >= 0 && obj.label < num_classes)
+            class_name = class_names[obj.label];
+
+        char text[256];
+        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // place the caption above the box, pulled back inside the image when
+        // it would stick out at the top or on the right
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        // white filled background behind the black caption text
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Entry point: loads the image named on the command line, runs the YOLOv3
+// detector and displays the result.
+//
+// Returns 0 on success and -1 on wrong usage, an unreadable image or a
+// detector failure.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    // flag 1 asks cv::imread for a 3-channel colour image
+    cv::Mat m = cv::imread(imagepath, 1);
+    if (m.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    std::vector<Object> objects;
+    // do not open a result window when the detector reports a failure
+    if (detect_yolov3(m, objects) != 0)
+    {
+        fprintf(stderr, "detect_yolov3 failed\n");
+        return -1;
+    }
+
+    draw_objects(m, objects);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/yolov4.cpp
+++ b/3rdparty/ncnn/examples/yolov4.cpp
@ -0,0 +1,311 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#if CV_MAJOR_VERSION >= 3
+#include <opencv2/videoio/videoio.hpp>
+#endif
+
+#include <vector>
+
+#include <stdio.h>
+
+#define NCNN_PROFILING // when defined, main() prints timings for model load, capture, detection and drawing
+#define YOLOV4_TINY // when defined, init_yolov4 loads the yolov4-tiny files (input size 416); undefine to load the full yolov4 files (input size 608)
+
+#ifdef NCNN_PROFILING
+#include "benchmark.h" // included only for profiling; presumably declares ncnn::get_current_time() used by main() — confirm
+#endif
+
+struct Object // one detection: bounding box, class index and score
+{
+    cv::Rect_<float> rect; // box as x, y, width, height; detect_yolov4 fills it by scaling the network output by the source image width/height
+    int label; // class index; draw_objects uses it to index its class_names table
+    float prob; // detection score; draw_objects prints it multiplied by 100 as a percentage
+};
+
+// Configure `yolov4` for inference and load its parameter and weight files.
+//
+// yolov4      - network to configure and load.
+// target_size - receives the square input size used with the selected model:
+//               416 when YOLOV4_TINY is defined, 608 otherwise. It is written
+//               before the files are loaded, so it is set even on failure.
+//
+// Returns 0 on success; otherwise the non-zero status from load_param() or
+// load_model(), whichever failed first.
+static int init_yolov4(ncnn::Net* yolov4, int* target_size)
+{
+    // original pretrained model from https://github.com/AlexeyAB/darknet
+    // the ncnn model https://drive.google.com/drive/folders/1YzILvh0SKQPS_lrb33dmGNq7aVTKPWS0?usp=sharing
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+#ifdef YOLOV4_TINY
+    const char* param_path = "yolov4-tiny-opt.param";
+    const char* weights_path = "yolov4-tiny-opt.bin";
+    *target_size = 416;
+#else
+    const char* param_path = "yolov4-opt.param";
+    const char* weights_path = "yolov4-opt.bin";
+    *target_size = 608;
+#endif
+
+    /* --> Set the params you need for the ncnn inference <-- */
+
+    // You need to compile with libgomp for multi thread support
+    yolov4->opt.num_threads = 4;
+
+    // You need to compile with libvulkan for gpu support
+    yolov4->opt.use_vulkan_compute = true;
+
+    yolov4->opt.use_winograd_convolution = true;
+    yolov4->opt.use_sgemm_convolution = true;
+    yolov4->opt.use_fp16_packed = true;
+    yolov4->opt.use_fp16_storage = true;
+    yolov4->opt.use_fp16_arithmetic = true;
+    yolov4->opt.use_packing_layout = true;
+    yolov4->opt.use_shader_pack8 = false;
+    yolov4->opt.use_image_storage = false;
+
+    /* --> End of setting params <-- */
+
+    // The options above are set before either file is loaded.
+    const int param_status = yolov4->load_param(param_path);
+    if (param_status != 0)
+        return param_status;
+
+    // Whatever load_model() reports (0 on success) is the final result.
+    return yolov4->load_model(weights_path);
+}
+
+// Run the already-loaded network on `bgr` and replace the contents of
+// `objects` with one entry per row of the network's "output" blob.
+//
+// bgr         - source image in BGR channel order.
+// objects     - cleared, then filled with the detections.
+// target_size - the image is resized to target_size x target_size before
+//               inference.
+// yolov4      - network prepared by init_yolov4().
+//
+// Always returns 0; the status codes of input()/extract() are not checked.
+static int detect_yolov4(const cv::Mat& bgr, std::vector<Object>& objects, int target_size, ncnn::Net* yolov4)
+{
+    const int src_width = bgr.cols;
+    const int src_height = bgr.rows;
+
+    // Convert BGR -> RGB and resize to the network input in one step.
+    ncnn::Mat input = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, src_width, src_height, target_size, target_size);
+
+    // Zero mean, scale each channel by 1/255.
+    const float mean_vals[3] = {0, 0, 0};
+    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
+    input.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor extractor = yolov4->create_extractor();
+    extractor.input("data", input);
+
+    ncnn::Mat detections;
+    extractor.extract("output", detections);
+
+    objects.clear();
+    for (int row = 0; row < detections.h; row++)
+    {
+        // Row layout as consumed here: [0] label, [1] score, [2..5] box edges.
+        // The edges are multiplied by the image size, so they appear to be
+        // fractions of the image width/height.
+        const float* det = detections.row(row);
+
+        const float left = det[2] * src_width;
+        const float top = det[3] * src_height;
+
+        Object detection;
+        detection.label = static_cast<int>(det[0]);
+        detection.prob = det[1];
+        detection.rect.x = left;
+        detection.rect.y = top;
+        detection.rect.width = det[4] * src_width - left;
+        detection.rect.height = det[5] * src_height - top;
+
+        objects.push_back(detection);
+    }
+
+    return 0;
+}
+
+// Draw the detections on a copy of `bgr`, log each one to stderr and show the
+// result in a window named "image".
+//
+// bgr          - source frame; it is cloned and never modified.
+// objects      - detections to draw. `label` is looked up in class_names; a
+//                label outside the table is shown as "unknown" instead of
+//                reading past the end of the array.
+// is_streaming - non-zero: cv::waitKey(1), so a capture loop keeps running;
+//                zero: cv::waitKey(0), i.e. wait for a key press.
+//
+// Always returns 0.
+static int draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects, int is_streaming)
+{
+    static const char* class_names[] = {"background", "person", "bicycle",
+                                        "car", "motorbike", "aeroplane", "bus", "train", "truck",
+                                        "boat", "traffic light", "fire hydrant", "stop sign",
+                                        "parking meter", "bench", "bird", "cat", "dog", "horse",
+                                        "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
+                                        "backpack", "umbrella", "handbag", "tie", "suitcase",
+                                        "frisbee", "skis", "snowboard", "sports ball", "kite",
+                                        "baseball bat", "baseball glove", "skateboard", "surfboard",
+                                        "tennis racket", "bottle", "wine glass", "cup", "fork",
+                                        "knife", "spoon", "bowl", "banana", "apple", "sandwich",
+                                        "orange", "broccoli", "carrot", "hot dog", "pizza", "donut",
+                                        "cake", "chair", "sofa", "pottedplant", "bed", "diningtable",
+                                        "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard",
+                                        "cell phone", "microwave", "oven", "toaster", "sink",
+                                        "refrigerator", "book", "clock", "vase", "scissors",
+                                        "teddy bear", "hair drier", "toothbrush"
+                                       };
+    const int num_classes = static_cast<int>(sizeof(class_names) / sizeof(class_names[0]));
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        // The label comes straight from the network output, so validate it
+        // before using it as an array index.
+        const bool label_in_range = obj.label >= 0 && obj.label < num_classes;
+        const char* class_name = label_in_range ? class_names[obj.label] : "unknown";
+
+        char text[256];
+        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // Place the caption just above the box, clamped to the top edge and
+        // shifted left if it would run past the right edge.
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        // White filled background, then black text on top of it.
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+
+    if (is_streaming)
+    {
+        cv::waitKey(1);
+    }
+    else
+    {
+        cv::waitKey(0);
+    }
+
+    return 0;
+}
+
+// Entry point. argv[1] is either an image file or a capture device: a path
+// containing "/dev/video" is opened with cv::VideoCapture and processed frame
+// by frame until a capture fails; any other path is read as one still image.
+//
+// Returns 0 after a still image has been processed, -1 on a usage error, a
+// model load failure, or any image/capture failure.
+int main(int argc, char** argv)
+{
+    cv::Mat frame;
+    std::vector<Object> objects;
+
+    cv::VideoCapture cap;
+
+    ncnn::Net yolov4;
+
+    const char* devicepath;
+
+    int target_size = 0;
+    int is_streaming = 0;
+
+    if (argc < 2)
+    {
+        fprintf(stderr, "Usage: %s [v4l input device or image]\n", argv[0]);
+        return -1;
+    }
+
+    devicepath = argv[1];
+
+#ifdef NCNN_PROFILING
+    double t_load_start = ncnn::get_current_time();
+#endif
+
+    int ret = init_yolov4(&yolov4, &target_size); //We load model and param first!
+    if (ret != 0)
+    {
+        // Newline-terminated so the message does not run into later output.
+        fprintf(stderr, "Failed to load model or param, error %d\n", ret);
+        return -1;
+    }
+
+#ifdef NCNN_PROFILING
+    double t_load_end = ncnn::get_current_time();
+    fprintf(stdout, "NCNN Init time %.02lfms\n", t_load_end - t_load_start);
+#endif
+
+    if (strstr(devicepath, "/dev/video") == NULL)
+    {
+        // Still image.
+        frame = cv::imread(argv[1], 1);
+        if (frame.empty())
+        {
+            fprintf(stderr, "Failed to read image %s.\n", argv[1]);
+            return -1;
+        }
+    }
+    else
+    {
+        // Capture device.
+        cap.open(devicepath);
+
+        if (!cap.isOpened())
+        {
+            fprintf(stderr, "Failed to open %s\n", devicepath);
+            return -1;
+        }
+
+        // Probe read: only checks that the device delivers frames. The loop
+        // below grabs a fresh frame before running detection.
+        cap >> frame;
+
+        if (frame.empty())
+        {
+            fprintf(stderr, "Failed to read from device %s.\n", devicepath);
+            return -1;
+        }
+
+        is_streaming = 1;
+    }
+
+    // Runs exactly once for a still image, and until a capture fails (which
+    // returns -1 from inside the loop) when streaming.
+    do
+    {
+        if (is_streaming)
+        {
+#ifdef NCNN_PROFILING
+            double t_capture_start = ncnn::get_current_time();
+#endif
+
+            cap >> frame;
+
+#ifdef NCNN_PROFILING
+            double t_capture_end = ncnn::get_current_time();
+            fprintf(stdout, "NCNN OpenCV capture time %.02lfms\n", t_capture_end - t_capture_start);
+#endif
+            if (frame.empty())
+            {
+                fprintf(stderr, "OpenCV Failed to Capture from device %s\n", devicepath);
+                return -1;
+            }
+        }
+
+#ifdef NCNN_PROFILING
+        double t_detect_start = ncnn::get_current_time();
+#endif
+
+        detect_yolov4(frame, objects, target_size, &yolov4); //Create an extractor and run detection
+
+#ifdef NCNN_PROFILING
+        double t_detect_end = ncnn::get_current_time();
+        fprintf(stdout, "NCNN detection time %.02lfms\n", t_detect_end - t_detect_start);
+#endif
+
+#ifdef NCNN_PROFILING
+        double t_draw_start = ncnn::get_current_time();
+#endif
+
+        draw_objects(frame, objects, is_streaming); //Draw detection results on opencv image
+
+#ifdef NCNN_PROFILING
+        double t_draw_end = ncnn::get_current_time();
+        fprintf(stdout, "NCNN OpenCV draw result time %.02lfms\n", t_draw_end - t_draw_start);
+#endif
+    } while (is_streaming);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/yolov5.cpp
+++ b/3rdparty/ncnn/examples/yolov5.cpp
@ -0,0 +1,503 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "layer.h"
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <float.h>
+#include <stdio.h>
+#include <vector>
+
+#define YOLOV5_V60 1 // 1 = build for the YOLOv5 v6.0 model files; 0 = build for the older model files, which need the custom YoloV5Focus layer defined below
+
+#if YOLOV5_V60
+#define MAX_STRIDE 64 // detect_yolov5 pads the resized input up to a multiple of this value
+#else
+#define MAX_STRIDE 32 // detect_yolov5 pads the resized input up to a multiple of this value
+class YoloV5Focus : public ncnn::Layer // custom layer: folds each 2x2 pixel neighbourhood into channels, (w, h, c) -> (w/2, h/2, 4c)
+{
+public:
+    YoloV5Focus()
+    {
+        one_blob_only = true; // NOTE(review): presumably selects the single-input/single-output forward() overload below — confirm against ncnn::Layer
+    }
+    // Fills top_blob from every second pixel of every second row of bottom_blob; returns 0 on success, -100 if top_blob is empty after create().
+    virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const
+    {
+        int w = bottom_blob.w;
+        int h = bottom_blob.h;
+        int channels = bottom_blob.c;
+
+        int outw = w / 2;
+        int outh = h / 2;
+        int outc = channels * 4; // four output channels per input channel, one per position inside a 2x2 cell
+
+        top_blob.create(outw, outh, outc, 4u, 1, opt.blob_allocator); // NOTE(review): 4u / 1 look like element size in bytes and elempack — confirm against ncnn::Mat::create
+        if (top_blob.empty())
+            return -100;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p = 0; p < outc; p++) // output channel p reads input channel p % channels, starting at row (p / channels) % 2 and column (p / channels) / 2
+        {
+            const float* ptr = bottom_blob.channel(p % channels).row((p / channels) % 2) + ((p / channels) / 2);
+            float* outptr = top_blob.channel(p);
+
+            for (int i = 0; i < outh; i++)
+            {
+                for (int j = 0; j < outw; j++)
+                {
+                    *outptr = *ptr;
+
+                    outptr += 1;
+                    ptr += 2; // take every second input pixel along the row
+                }
+
+                ptr += w; // the inner loop advanced ptr by 2 * outw; adding w skips the next input row (assumes w is even and rows are stored back to back — TODO confirm)
+            }
+        }
+
+        return 0;
+    }
+};
+
+DEFINE_LAYER_CREATOR(YoloV5Focus)
+#endif //YOLOV5_V60
+
+struct Object // one detection: bounding box, class index and score
+{
+    cv::Rect_<float> rect; // box as x, y, width, height; generate_proposals writes it in padded network-input pixels, detect_yolov5 later rescales and clips it to the source image
+    int label; // index of the best-scoring class; draw_objects uses it to index its class_names table
+    float prob; // generate_proposals stores sigmoid(box score) * sigmoid(best class score) here
+};
+
+// Area of the overlap between the boxes of `a` and `b`; the overlap rectangle
+// is whatever cv::Rect_'s operator& produces for the two rects.
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    return (a.rect & b.rect).area();
+}
+
+// Sort faceobjects[left..right] (both ends inclusive) in place by descending
+// prob. Recursive quicksort that partitions around the prob of the middle
+// element; the two sub-ranges are handed to separate OpenMP sections.
+static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
+{
+    const float pivot = faceobjects[(left + right) / 2].prob;
+    int lo = left;
+    int hi = right;
+
+    for (;;)
+    {
+        if (lo > hi)
+            break;
+
+        // Skip elements that are already on the correct side of the pivot.
+        while (faceobjects[lo].prob > pivot)
+            lo++;
+
+        while (faceobjects[hi].prob < pivot)
+            hi--;
+
+        // Cursors crossed: partitioning is finished.
+        if (lo > hi)
+            break;
+
+        std::swap(faceobjects[lo], faceobjects[hi]);
+        lo++;
+        hi--;
+    }
+
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < hi) qsort_descent_inplace(faceobjects, left, hi);
+        }
+        #pragma omp section
+        {
+            if (lo < right) qsort_descent_inplace(faceobjects, lo, right);
+        }
+    }
+}
+
+// Sort the whole vector in place by descending prob. An empty vector is left
+// untouched, because the range overload needs a valid last index.
+static void qsort_descent_inplace(std::vector<Object>& faceobjects)
+{
+    if (!faceobjects.empty())
+    {
+        const int last = faceobjects.size() - 1;
+        qsort_descent_inplace(faceobjects, 0, last);
+    }
+}
+
+// Greedy non-maximum suppression over boxes visited in index order.
+//
+// faceobjects   - candidate boxes; detect_yolov5 sorts them by descending
+//                 prob before calling.
+// picked        - cleared, then filled with the indices of the boxes kept.
+// nms_threshold - a box is dropped when its IoU with any box kept so far is
+//                 greater than this value. Labels are not compared, so boxes
+//                 of different classes can suppress each other.
+static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold)
+{
+    picked.clear();
+
+    const int count = faceobjects.size();
+
+    // Compute each box area once; it is reused for every IoU below.
+    std::vector<float> areas(count);
+    for (int idx = 0; idx < count; idx++)
+        areas[idx] = faceobjects[idx].rect.area();
+
+    for (int cand = 0; cand < count; cand++)
+    {
+        const Object& candidate = faceobjects[cand];
+
+        bool suppressed = false;
+        for (int k = 0; k < (int)picked.size(); k++)
+        {
+            const int kept_idx = picked[k];
+
+            // intersection over union
+            const float inter_area = intersection_area(candidate, faceobjects[kept_idx]);
+            const float union_area = areas[cand] + areas[kept_idx] - inter_area;
+            if (inter_area / union_area > nms_threshold)
+                suppressed = true;
+        }
+
+        if (!suppressed)
+            picked.push_back(cand);
+    }
+}
+
+static inline float sigmoid(float x) // logistic function: returns 1 / (1 + e^(-x))
+{
+    return static_cast<float>(1.f / (1.f + exp(-x))); // the cast narrows the result back to float in case exp() is evaluated in double
+}
+
+// Decode one output head into candidate boxes and append them to `objects`.
+//
+// anchors        - flat list of (width, height) pairs, one pair per anchor.
+// stride         - pixel stride of this head relative to the padded input.
+// in_pad         - padded network input; only its w/h are read, to recover
+//                  the grid dimensions.
+// feat_blob      - head output: one channel per anchor, one row per grid
+//                  cell, each row holding x, y, w, h, box score, then one
+//                  score per class (all pre-sigmoid).
+// prob_threshold - minimum box confidence and minimum final confidence.
+// objects        - receives the boxes, in padded-input pixel coordinates.
+static void generate_proposals(const ncnn::Mat& anchors, int stride, const ncnn::Mat& in_pad, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<Object>& objects)
+{
+    const int num_grid = feat_blob.h;
+
+    // The rows are a flattened grid: take the longer side's cell count from
+    // the input size and derive the other side from the total row count.
+    const bool wider_than_tall = in_pad.w > in_pad.h;
+    const int num_grid_x = wider_than_tall ? in_pad.w / stride : num_grid / (in_pad.h / stride);
+    const int num_grid_y = wider_than_tall ? num_grid / num_grid_x : in_pad.h / stride;
+
+    const int num_class = feat_blob.w - 5;
+    const int num_anchors = anchors.w / 2;
+
+    for (int q = 0; q < num_anchors; q++)
+    {
+        const float anchor_w = anchors[q * 2];
+        const float anchor_h = anchors[q * 2 + 1];
+
+        const ncnn::Mat feat = feat_blob.channel(q);
+
+        for (int gy = 0; gy < num_grid_y; gy++)
+        {
+            for (int gx = 0; gx < num_grid_x; gx++)
+            {
+                const float* cell = feat.row(gy * num_grid_x + gx);
+
+                // Cheap rejection on the box score before scanning classes.
+                const float box_confidence = sigmoid(cell[4]);
+                if (!(box_confidence >= prob_threshold))
+                    continue;
+
+                // find class index with max class score
+                int class_index = 0;
+                float class_score = -FLT_MAX;
+                for (int k = 0; k < num_class; k++)
+                {
+                    const float score = cell[5 + k];
+                    if (score > class_score)
+                    {
+                        class_index = k;
+                        class_score = score;
+                    }
+                }
+
+                const float confidence = box_confidence * sigmoid(class_score);
+                if (!(confidence >= prob_threshold))
+                    continue;
+
+                // yolov5/models/yolo.py Detect forward
+                // y = x[i].sigmoid()
+                // y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i]  # xy
+                // y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
+
+                const float dx = sigmoid(cell[0]);
+                const float dy = sigmoid(cell[1]);
+                const float dw = sigmoid(cell[2]);
+                const float dh = sigmoid(cell[3]);
+
+                const float pb_cx = (dx * 2.f - 0.5f + gx) * stride;
+                const float pb_cy = (dy * 2.f - 0.5f + gy) * stride;
+
+                const float pb_w = pow(dw * 2.f, 2) * anchor_w;
+                const float pb_h = pow(dh * 2.f, 2) * anchor_h;
+
+                // Centre/size -> corner form.
+                const float x0 = pb_cx - pb_w * 0.5f;
+                const float y0 = pb_cy - pb_h * 0.5f;
+                const float x1 = pb_cx + pb_w * 0.5f;
+                const float y1 = pb_cy + pb_h * 0.5f;
+
+                Object proposal;
+                proposal.rect.x = x0;
+                proposal.rect.y = y0;
+                proposal.rect.width = x1 - x0;
+                proposal.rect.height = y1 - y0;
+                proposal.label = class_index;
+                proposal.prob = confidence;
+
+                objects.push_back(proposal);
+            }
+        }
+    }
+}
+
+// Run YOLOv5s on `bgr` and store the final detections in `objects`.
+//
+// Steps: load the model files selected by YOLOV5_V60, letterbox the image
+// (resize so the longer side is 640, pad each side up to a multiple of
+// MAX_STRIDE with the value 114), run inference, decode the three output
+// heads, sort by score, apply NMS, then map the surviving boxes back to the
+// source image and clip them to it.
+//
+// bgr     - source image in BGR channel order.
+// objects - replaced with the detections, in source-image pixel coordinates.
+//
+// Returns 0 on success. Returns -1, with `objects` emptied and a message on
+// stderr, when the parameter or weight file cannot be loaded. The status
+// codes of input()/extract() are not checked.
+static int detect_yolov5(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    ncnn::Net yolov5;
+
+    yolov5.opt.use_vulkan_compute = true;
+    // yolov5.opt.use_bf16_storage = true;
+
+    // original pretrained model from https://github.com/ultralytics/yolov5
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+#if YOLOV5_V60
+    const char* param_path = "yolov5s_6.0.param";
+    const char* model_path = "yolov5s_6.0.bin";
+    const char* stride16_blob = "376";
+    const char* stride32_blob = "401";
+#else
+    yolov5.register_custom_layer("YoloV5Focus", YoloV5Focus_layer_creator);
+
+    const char* param_path = "yolov5s.param";
+    const char* model_path = "yolov5s.bin";
+    const char* stride16_blob = "781";
+    const char* stride32_blob = "801";
+#endif
+
+    // Both loaders report 0 on success. Without a loaded model every extract
+    // below would yield an empty blob, so stop here with a clear error
+    // instead of silently returning no detections.
+    if (yolov5.load_param(param_path) != 0 || yolov5.load_model(model_path) != 0)
+    {
+        fprintf(stderr, "detect_yolov5: failed to load %s / %s\n", param_path, model_path);
+        objects.clear();
+        return -1;
+    }
+
+    const int target_size = 640;
+    const float prob_threshold = 0.25f;
+    const float nms_threshold = 0.45f;
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    // letterbox pad to multiple of MAX_STRIDE
+    int w = img_w;
+    int h = img_h;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);
+
+    // pad to target_size rectangle
+    // yolov5/utils/datasets.py letterbox
+    int wpad = (w + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - w;
+    int hpad = (h + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - h;
+    ncnn::Mat in_pad;
+    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);
+
+    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
+    in_pad.substract_mean_normalize(0, norm_vals);
+
+    ncnn::Extractor ex = yolov5.create_extractor();
+
+    ex.input("images", in_pad);
+
+    std::vector<Object> proposals;
+
+    // One entry per output head: blob name, stride, and three (w, h) anchors.
+    // anchor setting from yolov5/models/yolov5s.yaml
+    const int num_heads = 3;
+    const char* const head_blobs[num_heads] = {"output", stride16_blob, stride32_blob};
+    const int head_strides[num_heads] = {8, 16, 32};
+    const float head_anchors[num_heads][6] = {
+        {10.f, 13.f, 16.f, 30.f, 33.f, 23.f},
+        {30.f, 61.f, 62.f, 45.f, 59.f, 119.f},
+        {116.f, 90.f, 156.f, 198.f, 373.f, 326.f}
+    };
+
+    for (int head = 0; head < num_heads; head++)
+    {
+        ncnn::Mat out;
+        ex.extract(head_blobs[head], out);
+
+        ncnn::Mat anchors(6);
+        for (int k = 0; k < 6; k++)
+            anchors[k] = head_anchors[head][k];
+
+        std::vector<Object> head_objects;
+        generate_proposals(anchors, head_strides[head], in_pad, out, prob_threshold, head_objects);
+
+        proposals.insert(proposals.end(), head_objects.begin(), head_objects.end());
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(proposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(proposals, picked, nms_threshold);
+
+    int count = picked.size();
+
+    objects.resize(count);
+    for (int i = 0; i < count; i++)
+    {
+        objects[i] = proposals[picked[i]];
+
+        // adjust offset to original unpadded
+        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
+        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
+        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
+        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;
+
+        // clip
+        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
+        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
+        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
+        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
+
+        objects[i].rect.x = x0;
+        objects[i].rect.y = y0;
+        objects[i].rect.width = x1 - x0;
+        objects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+// Draw the detections on a copy of `bgr`, log each one to stderr, show the
+// result in a window named "image" and wait for a key press.
+//
+// bgr     - source image; it is cloned and never modified.
+// objects - detections to draw. `label` is looked up in class_names; a label
+//           outside the table is shown as "unknown" instead of reading past
+//           the end of the array.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"
+    };
+    const int num_classes = static_cast<int>(sizeof(class_names) / sizeof(class_names[0]));
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        // The label is derived from the network output (its range depends on
+        // the loaded model), so validate it before using it as an array index.
+        const bool label_in_range = obj.label >= 0 && obj.label < num_classes;
+        const char* class_name = label_in_range ? class_names[obj.label] : "unknown";
+
+        char text[256];
+        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // Place the caption just above the box, clamped to the top edge and
+        // shifted left if it would run past the right edge.
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        // White filled background, then black text on top of it.
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Entry point: read the image named by the single command-line argument, run
+// detect_yolov5() on it and hand the detections to draw_objects().
+// Returns 0 on success, -1 on a usage error or when the image cannot be read.
+int main(int argc, char** argv)
+{
+    const bool has_single_argument = (argc == 2);
+    if (!has_single_argument)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* image_file = argv[1];
+    cv::Mat input_image = cv::imread(image_file, 1);
+    if (input_image.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_file);
+        return -1;
+    }
+
+    // The detector's return value is not inspected here; whatever it put in
+    // `detections` is drawn.
+    std::vector<Object> detections;
+    detect_yolov5(input_image, detections);
+    draw_objects(input_image, detections);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/yolov5_pnnx.cpp
+++ b/3rdparty/ncnn/examples/yolov5_pnnx.cpp
@ -0,0 +1,422 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "layer.h"
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <float.h>
+#include <stdio.h>
+#include <vector>
+
+struct Object // one detection: bounding box, class index and score
+{
+    cv::Rect_<float> rect; // box as x, y, width, height; generate_proposals derives it from grid position, stride and anchor size
+    int label; // index of the best-scoring class, as chosen by generate_proposals
+    float prob; // generate_proposals stores sigmoid(box score) * sigmoid(best class score) here
+};
+
+// Area of the overlap between the boxes of `a` and `b`; the overlap rectangle
+// is whatever cv::Rect_'s operator& produces for the two rects.
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    return (a.rect & b.rect).area();
+}
+
+// Sort faceobjects[left..right] (both ends inclusive) in place by descending
+// prob. Recursive quicksort that partitions around the prob of the middle
+// element; the two sub-ranges are handed to separate OpenMP sections.
+static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
+{
+    const float pivot = faceobjects[(left + right) / 2].prob;
+    int lo = left;
+    int hi = right;
+
+    for (;;)
+    {
+        if (lo > hi)
+            break;
+
+        // Skip elements that are already on the correct side of the pivot.
+        while (faceobjects[lo].prob > pivot)
+            lo++;
+
+        while (faceobjects[hi].prob < pivot)
+            hi--;
+
+        // Cursors crossed: partitioning is finished.
+        if (lo > hi)
+            break;
+
+        std::swap(faceobjects[lo], faceobjects[hi]);
+        lo++;
+        hi--;
+    }
+
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < hi) qsort_descent_inplace(faceobjects, left, hi);
+        }
+        #pragma omp section
+        {
+            if (lo < right) qsort_descent_inplace(faceobjects, lo, right);
+        }
+    }
+}
+
+// Sort the whole vector in place by descending prob. An empty vector is left
+// untouched, because the range overload needs a valid last index.
+static void qsort_descent_inplace(std::vector<Object>& faceobjects)
+{
+    if (!faceobjects.empty())
+    {
+        const int last = faceobjects.size() - 1;
+        qsort_descent_inplace(faceobjects, 0, last);
+    }
+}
+
+// Greedy non-maximum suppression over boxes visited in index order.
+//
+// faceobjects   - candidate boxes; the name implies they are already sorted
+//                 by descending prob (see qsort_descent_inplace).
+// picked        - cleared, then filled with the indices of the boxes kept.
+// nms_threshold - a box is dropped when its IoU with any box kept so far is
+//                 greater than this value. Labels are not compared, so boxes
+//                 of different classes can suppress each other.
+static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold)
+{
+    picked.clear();
+
+    const int count = faceobjects.size();
+
+    // Compute each box area once; it is reused for every IoU below.
+    std::vector<float> areas(count);
+    for (int idx = 0; idx < count; idx++)
+        areas[idx] = faceobjects[idx].rect.area();
+
+    for (int cand = 0; cand < count; cand++)
+    {
+        const Object& candidate = faceobjects[cand];
+
+        bool suppressed = false;
+        for (int k = 0; k < (int)picked.size(); k++)
+        {
+            const int kept_idx = picked[k];
+
+            // intersection over union
+            const float inter_area = intersection_area(candidate, faceobjects[kept_idx]);
+            const float union_area = areas[cand] + areas[kept_idx] - inter_area;
+            if (inter_area / union_area > nms_threshold)
+                suppressed = true;
+        }
+
+        if (!suppressed)
+            picked.push_back(cand);
+    }
+}
+
+static inline float sigmoid(float x) // logistic function: returns 1 / (1 + e^(-x))
+{
+    return static_cast<float>(1.f / (1.f + exp(-x))); // the cast narrows the result back to float in case exp() is evaluated in double
+}
+
+// Decode one output head into candidate boxes and append them to `objects`.
+//
+// anchors        - flat list of (width, height) pairs, one pair per anchor.
+// stride         - pixel stride of this head relative to the network input.
+// in_pad         - not read by this function; kept for the call signature.
+// feat_blob      - head output, indexed here as w = grid columns, h = grid
+//                  rows, and 85 channels per anchor: x, y, w, h, box score,
+//                  then 80 class scores (all pre-sigmoid).
+// prob_threshold - minimum final confidence for a box to be emitted.
+// objects        - receives the boxes, in network-input pixel coordinates.
+static void generate_proposals(const ncnn::Mat& anchors, int stride, const ncnn::Mat& in_pad, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<Object>& objects)
+{
+    const int num_grid_x = feat_blob.w;
+    const int num_grid_y = feat_blob.h;
+
+    const int num_anchors = anchors.w / 2;
+
+    const int num_class = 80;
+
+    // 4 box terms + 1 box score + num_class class scores = 85 channels.
+    const int channels_per_anchor = 5 + num_class;
+
+    for (int q = 0; q < num_anchors; q++)
+    {
+        const float anchor_w = anchors[q * 2];
+        const float anchor_h = anchors[q * 2 + 1];
+
+        // First channel belonging to anchor q.
+        const int base = q * channels_per_anchor;
+
+        for (int gy = 0; gy < num_grid_y; gy++)
+        {
+            for (int gx = 0; gx < num_grid_x; gx++)
+            {
+                // find class index with max class score
+                int class_index = 0;
+                float class_score = -FLT_MAX;
+                for (int k = 0; k < num_class; k++)
+                {
+                    const float score = feat_blob.channel(base + 5 + k).row(gy)[gx];
+                    if (score > class_score)
+                    {
+                        class_index = k;
+                        class_score = score;
+                    }
+                }
+
+                const float box_score = feat_blob.channel(base + 4).row(gy)[gx];
+
+                const float confidence = sigmoid(box_score) * sigmoid(class_score);
+                if (!(confidence >= prob_threshold))
+                    continue;
+
+                // yolov5/models/yolo.py Detect forward
+                // y = x[i].sigmoid()
+                // y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i]  # xy
+                // y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
+
+                const float dx = sigmoid(feat_blob.channel(base + 0).row(gy)[gx]);
+                const float dy = sigmoid(feat_blob.channel(base + 1).row(gy)[gx]);
+                const float dw = sigmoid(feat_blob.channel(base + 2).row(gy)[gx]);
+                const float dh = sigmoid(feat_blob.channel(base + 3).row(gy)[gx]);
+
+                const float pb_cx = (dx * 2.f - 0.5f + gx) * stride;
+                const float pb_cy = (dy * 2.f - 0.5f + gy) * stride;
+
+                const float pb_w = pow(dw * 2.f, 2) * anchor_w;
+                const float pb_h = pow(dh * 2.f, 2) * anchor_h;
+
+                // Centre/size -> corner form.
+                const float x0 = pb_cx - pb_w * 0.5f;
+                const float y0 = pb_cy - pb_h * 0.5f;
+                const float x1 = pb_cx + pb_w * 0.5f;
+                const float y1 = pb_cy + pb_h * 0.5f;
+
+                Object proposal;
+                proposal.rect.x = x0;
+                proposal.rect.y = y0;
+                proposal.rect.width = x1 - x0;
+                proposal.rect.height = y1 - y0;
+                proposal.label = class_index;
+                proposal.prob = confidence;
+
+                objects.push_back(proposal);
+            }
+        }
+    }
+}
+
+// Runs the YOLOv5s ncnn model on a BGR image and stores the detections, expressed
+// in original image coordinates, in `objects`.
+//
+// The image is resized so that its longer side equals target_size, padded with the
+// constant 114 up to a multiple of max_stride, scaled by 1/255 and fed to the
+// network. The three output blobs (stride 8/16/32) are decoded by
+// generate_proposals(), merged, sorted by score and filtered with NMS.
+//
+// Returns 0 on success. Returns -1 (leaving `objects` empty) when the model files
+// cannot be loaded or an output blob cannot be extracted.
+static int detect_yolov5(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    objects.clear();
+
+    ncnn::Net yolov5;
+
+    yolov5.opt.use_vulkan_compute = true;
+    // yolov5.opt.use_bf16_storage = true;
+
+    // original pretrained model from https://github.com/ultralytics/yolov5
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    // load_param/load_model report failure through a non-zero return value
+    if (yolov5.load_param("yolov5s.ncnn.param") != 0)
+    {
+        fprintf(stderr, "load_param yolov5s.ncnn.param failed\n");
+        return -1;
+    }
+    if (yolov5.load_model("yolov5s.ncnn.bin") != 0)
+    {
+        fprintf(stderr, "load_model yolov5s.ncnn.bin failed\n");
+        return -1;
+    }
+
+    const int target_size = 640;
+    const float prob_threshold = 0.25f;
+    const float nms_threshold = 0.45f;
+
+    const int img_w = bgr.cols;
+    const int img_h = bgr.rows;
+
+    // yolov5/models/common.py DetectMultiBackend
+    const int max_stride = 64;
+
+    // resize so that the longer side becomes target_size, keeping the aspect ratio
+    int w = img_w;
+    int h = img_h;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);
+
+    // letterbox: pad each side up to the next multiple of max_stride
+    // yolov5/utils/datasets.py letterbox
+    const int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
+    const int hpad = (h + max_stride - 1) / max_stride * max_stride - h;
+    const int pad_top = hpad / 2;
+    const int pad_left = wpad / 2;
+    ncnn::Mat in_pad;
+    ncnn::copy_make_border(in, in_pad, pad_top, hpad - pad_top, pad_left, wpad - pad_left, ncnn::BORDER_CONSTANT, 114.f);
+
+    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
+    in_pad.substract_mean_normalize(0, norm_vals);
+
+    ncnn::Extractor ex = yolov5.create_extractor();
+
+    ex.input("in0", in_pad);
+
+    // one entry per detection head: output blob name, stride and the three
+    // (width, height) anchor pairs from yolov5/models/yolov5s.yaml
+    struct HeadSpec
+    {
+        const char* blob_name;
+        int stride;
+        float anchors[6];
+    };
+    static const HeadSpec heads[] = {
+        {"out0", 8, {10.f, 13.f, 16.f, 30.f, 33.f, 23.f}},
+        {"out1", 16, {30.f, 61.f, 62.f, 45.f, 59.f, 119.f}},
+        {"out2", 32, {116.f, 90.f, 156.f, 198.f, 373.f, 326.f}}
+    };
+
+    std::vector<Object> proposals;
+
+    for (size_t k = 0; k < sizeof(heads) / sizeof(heads[0]); k++)
+    {
+        ncnn::Mat out;
+        if (ex.extract(heads[k].blob_name, out) != 0)
+        {
+            fprintf(stderr, "extract %s failed\n", heads[k].blob_name);
+            return -1;
+        }
+
+        ncnn::Mat anchors(6);
+        for (int a = 0; a < 6; a++)
+        {
+            anchors[a] = heads[k].anchors[a];
+        }
+
+        // decode into a fresh vector per head, then merge
+        std::vector<Object> head_objects;
+        generate_proposals(anchors, heads[k].stride, in_pad, out, prob_threshold, head_objects);
+
+        proposals.insert(proposals.end(), head_objects.begin(), head_objects.end());
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(proposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(proposals, picked, nms_threshold);
+
+    const int count = picked.size();
+
+    objects.resize(count);
+    for (int i = 0; i < count; i++)
+    {
+        objects[i] = proposals[picked[i]];
+
+        // undo the letterbox offset and the resize to get original image coordinates
+        float x0 = (objects[i].rect.x - pad_left) / scale;
+        float y0 = (objects[i].rect.y - pad_top) / scale;
+        float x1 = (objects[i].rect.x + objects[i].rect.width - pad_left) / scale;
+        float y1 = (objects[i].rect.y + objects[i].rect.height - pad_top) / scale;
+
+        // clip to the image
+        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
+        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
+        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
+        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
+
+        objects[i].rect.x = x0;
+        objects[i].rect.y = y0;
+        objects[i].rect.width = x1 - x0;
+        objects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+// Logs every detection to stderr, then draws each box together with a
+// "<class name> <score>%" caption on a copy of `bgr` and shows the result in a
+// window until a key is pressed. The input image itself is not modified.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    // display names, indexed by Object::label
+    static const char* class_names[] = {
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"
+    };
+
+    cv::Mat canvas = bgr.clone();
+
+    for (std::vector<Object>::const_iterator it = objects.begin(); it != objects.end(); ++it)
+    {
+        const Object& det = *it;
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", det.label, det.prob,
+                det.rect.x, det.rect.y, det.rect.width, det.rect.height);
+
+        // box outline
+        cv::rectangle(canvas, det.rect, cv::Scalar(255, 0, 0));
+
+        char caption[256];
+        sprintf(caption, "%s %.1f%%", class_names[det.label], det.prob * 100);
+
+        int baseline = 0;
+        const cv::Size caption_size = cv::getTextSize(caption, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseline);
+
+        // put the caption just above the box, pulled back inside the image at
+        // the top and right edges
+        int caption_x = det.rect.x;
+        int caption_y = det.rect.y - caption_size.height - baseline;
+        caption_y = (caption_y < 0) ? 0 : caption_y;
+        caption_x = (caption_x + caption_size.width > canvas.cols) ? canvas.cols - caption_size.width : caption_x;
+
+        // filled white background behind the black text
+        const cv::Point origin(caption_x, caption_y);
+        const cv::Size background(caption_size.width, caption_size.height + baseline);
+        cv::rectangle(canvas, cv::Rect(origin, background), cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(canvas, caption, cv::Point(caption_x, caption_y + caption_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", canvas);
+    cv::waitKey(0);
+}
+
+// Entry point: reads the image named on the command line, runs YOLOv5 detection
+// on it and displays the annotated result.
+//
+// Returns 0 on success and -1 on wrong usage, an unreadable image or a failed
+// detection run.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    cv::Mat m = cv::imread(imagepath, 1);
+    if (m.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    std::vector<Object> objects;
+    // do not display anything if the detector reports an error
+    if (detect_yolov5(m, objects) != 0)
+    {
+        fprintf(stderr, "detect_yolov5 failed\n");
+        return -1;
+    }
+
+    draw_objects(m, objects);
+
+    return 0;
+}
--- a/3rdparty/ncnn/examples/yolox.cpp
+++ b/3rdparty/ncnn/examples/yolox.cpp
@ -0,0 +1,418 @@
+// This file is written based on the following file:
+// https://github.com/Tencent/ncnn/blob/master/examples/yolov5.cpp
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+// ------------------------------------------------------------------------------
+// Copyright (C) 2020-2021, Megvii Inc. All rights reserved.
+
+#include "layer.h"
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <float.h>
+#include <stdio.h>
+#include <vector>
+
+// Tunable parameters used by detect_yolox().
+#define YOLOX_NMS_THRESH  0.45 // IoU threshold handed to nms_sorted_bboxes
+#define YOLOX_CONF_THRESH 0.25 // minimum objectness * class score for a box to become a proposal
+#define YOLOX_TARGET_SIZE 640  // side length of the padded square network input, might use 416 for small model
+
+// YOLOX uses the same Focus layer as yolov5.
+//
+// Custom ncnn layer that halves width and height and quadruples the channel
+// count: output channel p copies input channel (p % channels), sampled at every
+// second row and column, starting at row offset ((p / channels) % 2) and column
+// offset ((p / channels) / 2).
+class YoloV5Focus : public ncnn::Layer
+{
+public:
+    YoloV5Focus()
+    {
+        one_blob_only = true;
+    }
+
+    // Writes the rearranged copy of bottom_blob into top_blob.
+    // Returns 0 on success, -100 if the output blob could not be allocated.
+    // NOTE(review): the input is read as plain float data; presumably the layer
+    // only ever receives fp32, elempack-1 blobs -- confirm against the model.
+    virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const
+    {
+        const int channels = bottom_blob.c;
+
+        const int outw = bottom_blob.w / 2;
+        const int outh = bottom_blob.h / 2;
+        const int outc = channels * 4;
+
+        top_blob.create(outw, outh, outc, 4u, 1, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p = 0; p < outc; p++)
+        {
+            const ncnn::Mat src = bottom_blob.channel(p % channels);
+            const int row_offset = (p / channels) % 2;
+            const int col_offset = (p / channels) / 2;
+            float* outptr = top_blob.channel(p);
+
+            for (int i = 0; i < outh; i++)
+            {
+                // address every source row explicitly so that an odd input
+                // width cannot shift the sampling position from row to row
+                const float* ptr = src.row(2 * i + row_offset) + col_offset;
+
+                for (int j = 0; j < outw; j++)
+                {
+                    outptr[j] = ptr[2 * j];
+                }
+
+                outptr += outw;
+            }
+        }
+
+        return 0;
+    }
+};
+
+DEFINE_LAYER_CREATOR(YoloV5Focus)
+
+// One detection: a bounding box plus its class and score.
+struct Object
+{
+    // bounding box as x, y, width, height
+    cv::Rect_<float> rect;
+    // class index; draw_objects uses it to look up the class name
+    int label;
+    // score of the box (objectness multiplied by the class score)
+    float prob;
+};
+
+// One cell of a detection grid, as produced by generate_grids_and_stride.
+struct GridAndStride
+{
+    // cell index along x (added to the predicted x offset when decoding)
+    int grid0;
+    // cell index along y (added to the predicted y offset when decoding)
+    int grid1;
+    // stride of the grid this cell belongs to; scales offsets and sizes
+    int stride;
+};
+
+// Area of the overlap between the bounding boxes of a and b.
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    return (a.rect & b.rect).area();
+}
+
+// Sorts faceobjects[left..right] (both ends inclusive) in place by descending
+// prob, using a quicksort whose two halves are handed to OpenMP sections.
+static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
+{
+    // pivot score taken from the middle element before anything is moved
+    const float pivot = faceobjects[(left + right) / 2].prob;
+
+    int lo = left;
+    int hi = right;
+
+    while (lo <= hi)
+    {
+        // skip elements that already sit on the correct side of the pivot
+        for (; faceobjects[lo].prob > pivot; lo++)
+        {
+        }
+        for (; faceobjects[hi].prob < pivot; hi--)
+        {
+        }
+
+        if (lo > hi)
+            break;
+
+        std::swap(faceobjects[lo], faceobjects[hi]);
+        lo++;
+        hi--;
+    }
+
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < hi) qsort_descent_inplace(faceobjects, left, hi);
+        }
+        #pragma omp section
+        {
+            if (lo < right) qsort_descent_inplace(faceobjects, lo, right);
+        }
+    }
+}
+
+// Sorts all objects in place from highest to lowest prob; does nothing for an
+// empty vector.
+static void qsort_descent_inplace(std::vector<Object>& objects)
+{
+    if (!objects.empty())
+    {
+        const int last = objects.size() - 1;
+        qsort_descent_inplace(objects, 0, last);
+    }
+}
+
+// Greedy non-maximum suppression.
+//
+// faceobjects   candidate boxes; callers sort them by descending score first
+// picked        output: indices into faceobjects of the boxes that are kept
+//               (any previous content is discarded)
+// nms_threshold a box is dropped when its IoU with an already kept box is
+//               greater than this value
+static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+
+    // box areas are needed for every IoU, so compute them once
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+    {
+        areas[i] = faceobjects[i].rect.area();
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = faceobjects[i];
+
+        bool keep = true;
+        for (size_t j = 0; j < picked.size(); j++)
+        {
+            const Object& b = faceobjects[picked[j]];
+
+            // intersection over union
+            const float inter_area = intersection_area(a, b);
+            const float union_area = areas[i] + areas[picked[j]] - inter_area;
+            // float IoU = inter_area / union_area
+            if (inter_area / union_area > nms_threshold)
+            {
+                // one overlapping kept box is enough; no need to test the rest
+                keep = false;
+                break;
+            }
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+// For every stride, appends one GridAndStride entry per cell of a square grid
+// with (target_size / stride) cells per side. Cells are emitted row by row
+// (grid1 outer, grid0 inner) and strides in the order given.
+static void generate_grids_and_stride(const int target_size, std::vector<int>& strides, std::vector<GridAndStride>& grid_strides)
+{
+    for (std::vector<int>::const_iterator it = strides.begin(); it != strides.end(); ++it)
+    {
+        const int step = *it;
+        const int cells_per_side = target_size / step;
+
+        for (int row = 0; row < cells_per_side; row++)
+        {
+            for (int col = 0; col < cells_per_side; col++)
+            {
+                GridAndStride cell;
+                cell.grid0 = col;
+                cell.grid1 = row;
+                cell.stride = step;
+                grid_strides.push_back(cell);
+            }
+        }
+    }
+}
+
+// Decodes the raw YOLOX output into box proposals.
+//
+// grid_strides   one entry per output row, in the order produced by
+//                generate_grids_and_stride
+// feat_blob      network output; each row holds x, y, w, h, objectness followed
+//                by one score per class
+// prob_threshold a (box, class) pair is kept when objectness * class score is
+//                greater than this value
+// objects        output: proposals are appended, existing content is kept
+static void generate_yolox_proposals(const std::vector<GridAndStride>& grid_strides, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<Object>& objects)
+{
+    const int num_class = feat_blob.w - 5;
+    if (num_class <= 0)
+        return; // row too short to hold a box plus at least one class score
+
+    // never walk past the rows the network actually produced, even if the grid
+    // list was generated for a different input size
+    int num_anchors = grid_strides.size();
+    if (num_anchors > feat_blob.h)
+        num_anchors = feat_blob.h;
+
+    const float* feat_ptr = feat_blob.channel(0);
+    for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++)
+    {
+        const int grid0 = grid_strides[anchor_idx].grid0;
+        const int grid1 = grid_strides[anchor_idx].grid1;
+        const int stride = grid_strides[anchor_idx].stride;
+
+        // yolox/models/yolo_head.py decode logic
+        //  outputs[..., :2] = (outputs[..., :2] + grids) * strides
+        //  outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
+        float x_center = (feat_ptr[0] + grid0) * stride;
+        float y_center = (feat_ptr[1] + grid1) * stride;
+        float w = exp(feat_ptr[2]) * stride;
+        float h = exp(feat_ptr[3]) * stride;
+        float x0 = x_center - w * 0.5f;
+        float y0 = y_center - h * 0.5f;
+
+        float box_objectness = feat_ptr[4];
+        for (int class_idx = 0; class_idx < num_class; class_idx++)
+        {
+            float box_cls_score = feat_ptr[5 + class_idx];
+            float box_prob = box_objectness * box_cls_score;
+            if (box_prob > prob_threshold)
+            {
+                Object obj;
+                obj.rect.x = x0;
+                obj.rect.y = y0;
+                obj.rect.width = w;
+                obj.rect.height = h;
+                obj.label = class_idx;
+                obj.prob = box_prob;
+
+                objects.push_back(obj);
+            }
+
+        } // class loop
+
+        // advance to the next output row
+        feat_ptr += feat_blob.w;
+
+    } // point anchor loop
+}
+
+// Runs the YOLOX ncnn model on a BGR image and stores the detections, expressed
+// in original image coordinates, in `objects`.
+//
+// The image is resized so that its longer side equals YOLOX_TARGET_SIZE, padded
+// on the bottom and right to a square, and fed to the network. The single output
+// blob is decoded by generate_yolox_proposals(), sorted by score and filtered
+// with NMS.
+//
+// Returns 0 on success. Returns -1 (leaving `objects` empty) when the model files
+// cannot be loaded or the output blob cannot be extracted.
+static int detect_yolox(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    objects.clear();
+
+    ncnn::Net yolox;
+
+    yolox.opt.use_vulkan_compute = true;
+    // yolox.opt.use_bf16_storage = true;
+
+    // Focus in yolov5
+    yolox.register_custom_layer("YoloV5Focus", YoloV5Focus_layer_creator);
+
+    // original pretrained model from https://github.com/Megvii-BaseDetection/YOLOX
+    // ncnn model param: https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_s_ncnn.tar.gz
+    // NOTE that the newest version of YOLOX removed the normalization of the model input
+    // (subtract mean, then divide by std); feeding a model from the other convention
+    // may turn the outputs into a total mess, so please check carefully.
+    // load_param/load_model report failure through a non-zero return value
+    if (yolox.load_param("yolox.param") != 0)
+    {
+        fprintf(stderr, "load_param yolox.param failed\n");
+        return -1;
+    }
+    if (yolox.load_model("yolox.bin") != 0)
+    {
+        fprintf(stderr, "load_model yolox.bin failed\n");
+        return -1;
+    }
+
+    const int img_w = bgr.cols;
+    const int img_h = bgr.rows;
+
+    // resize so that the longer side becomes YOLOX_TARGET_SIZE, keeping the aspect ratio
+    int w = img_w;
+    int h = img_h;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)YOLOX_TARGET_SIZE / w;
+        w = YOLOX_TARGET_SIZE;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)YOLOX_TARGET_SIZE / h;
+        h = YOLOX_TARGET_SIZE;
+        w = w * scale;
+    }
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, w, h);
+
+    // pad to YOLOX_TARGET_SIZE rectangle
+    const int wpad = YOLOX_TARGET_SIZE - w;
+    const int hpad = YOLOX_TARGET_SIZE - h;
+    ncnn::Mat in_pad;
+    // different from yolov5, yolox only pads on the bottom and right side,
+    // which means no extra padding offset is needed to decode the box coordinates.
+    ncnn::copy_make_border(in, in_pad, 0, hpad, 0, wpad, ncnn::BORDER_CONSTANT, 114.f);
+
+    ncnn::Extractor ex = yolox.create_extractor();
+
+    ex.input("images", in_pad);
+
+    std::vector<Object> proposals;
+
+    {
+        ncnn::Mat out;
+        if (ex.extract("output", out) != 0)
+        {
+            fprintf(stderr, "extract output failed\n");
+            return -1;
+        }
+
+        static const int stride_arr[] = {8, 16, 32}; // might have stride=64 in YOLOX
+        std::vector<int> strides(stride_arr, stride_arr + sizeof(stride_arr) / sizeof(stride_arr[0]));
+        std::vector<GridAndStride> grid_strides;
+        generate_grids_and_stride(YOLOX_TARGET_SIZE, strides, grid_strides);
+        generate_yolox_proposals(grid_strides, out, YOLOX_CONF_THRESH, proposals);
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(proposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(proposals, picked, YOLOX_NMS_THRESH);
+
+    const int count = picked.size();
+
+    objects.resize(count);
+    for (int i = 0; i < count; i++)
+    {
+        objects[i] = proposals[picked[i]];
+
+        // undo the resize; there is no offset because padding is bottom/right only
+        float x0 = (objects[i].rect.x) / scale;
+        float y0 = (objects[i].rect.y) / scale;
+        float x1 = (objects[i].rect.x + objects[i].rect.width) / scale;
+        float y1 = (objects[i].rect.y + objects[i].rect.height) / scale;
+
+        // clip to the image
+        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
+        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
+        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
+        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
+
+        objects[i].rect.x = x0;
+        objects[i].rect.y = y0;
+        objects[i].rect.width = x1 - x0;
+        objects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+// Logs every detection to stderr, then draws each box together with a
+// "<class name> <score>%" caption on a copy of `bgr` and shows the result in a
+// window until a key is pressed. The input image itself is not modified.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    // display names, indexed by Object::label
+    static const char* class_names[] = {
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"
+    };
+
+    cv::Mat canvas = bgr.clone();
+
+    for (std::vector<Object>::const_iterator it = objects.begin(); it != objects.end(); ++it)
+    {
+        const Object& det = *it;
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", det.label, det.prob,
+                det.rect.x, det.rect.y, det.rect.width, det.rect.height);
+
+        // box outline
+        cv::rectangle(canvas, det.rect, cv::Scalar(255, 0, 0));
+
+        char caption[256];
+        sprintf(caption, "%s %.1f%%", class_names[det.label], det.prob * 100);
+
+        int baseline = 0;
+        const cv::Size caption_size = cv::getTextSize(caption, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseline);
+
+        // put the caption just above the box, pulled back inside the image at
+        // the top and right edges
+        int caption_x = det.rect.x;
+        int caption_y = det.rect.y - caption_size.height - baseline;
+        caption_y = (caption_y < 0) ? 0 : caption_y;
+        caption_x = (caption_x + caption_size.width > canvas.cols) ? canvas.cols - caption_size.width : caption_x;
+
+        // filled white background behind the black text
+        const cv::Point origin(caption_x, caption_y);
+        const cv::Size background(caption_size.width, caption_size.height + baseline);
+        cv::rectangle(canvas, cv::Rect(origin, background), cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(canvas, caption, cv::Point(caption_x, caption_y + caption_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", canvas);
+    cv::waitKey(0);
+}
+
+// Entry point: reads the image named on the command line, runs YOLOX detection
+// on it and displays the annotated result.
+//
+// Returns 0 on success and -1 on wrong usage, an unreadable image or a failed
+// detection run.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    cv::Mat m = cv::imread(imagepath, 1);
+    if (m.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    std::vector<Object> objects;
+    // do not display anything if the detector reports an error
+    if (detect_yolox(m, objects) != 0)
+    {
+        fprintf(stderr, "detect_yolox failed\n");
+        return -1;
+    }
+
+    draw_objects(m, objects);
+
+    return 0;
+}
				`@ -0,0 +1 @@`
				`The squeezenet android example project has been moved to https://github.com/nihui/ncnn-android-squeezenet`