feat: switch the backend to PaddleOCR-NCNN and the project to CMake

1. The project backend has been fully migrated to the PaddleOCR-NCNN algorithm and has passed basic compatibility tests.
2. The project is now organized with CMake; to better accommodate third-party libraries going forward, a QMake project is no longer provided (see the CMake sketch below).
3. Reorganized the rights/license declaration files and the code layout to minimize the risk of infringement.

Log: switch the backend to PaddleOCR-NCNN and the project to CMake
Change-Id: I4d5d2c5d37505a4a24b389b1a4c5d12f17bfa38c
wangzhengyang
2022-05-10 09:54:44 +08:00
parent ecdd171c6f
commit 718c41634f
10018 changed files with 3593797 additions and 186748 deletions
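
As a rough illustration of the QMake-to-CMake move mentioned above (a minimal sketch only, not part of this diff; the project name, source list, and dependency lookup are hypothetical assumptions):

cmake_minimum_required(VERSION 3.10)
project(ocr_demo CXX)                                  # hypothetical project name
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Dependency lookup is an assumption; ncnn ships a CMake config package.
find_package(OpenCV REQUIRED)
find_package(ncnn REQUIRED)
add_executable(ocr_demo src/main.cpp)                  # hypothetical target and sources
target_link_libraries(ocr_demo PRIVATE ${OpenCV_LIBS} ncnn)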


@@ -0,0 +1,141 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2020, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
#include "layers_common.hpp"
namespace cv { namespace dnn {
class AccumLayerImpl CV_FINAL : public AccumLayer
{
public:
AccumLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
top_height = params.get<int>("top_height", 0);
top_width = params.get<int>("top_width", 0);
divisor = params.get<int>("size_divisible_by", 0);
have_reference = params.get<String>("have_reference", "false") == "true";
}
virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
std::vector<int> outShape;
int batch = inputs[0][0];
outShape.push_back(batch);
if (have_reference)
{
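// The last input serves as a spatial reference: channels are summed over
// all preceding inputs, while height/width are taken from the reference blob.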
CV_Assert(inputs.size() >= 2);
int totalchannels = 0;
for (int i = 0; i < inputs.size() - 1; i++) {
CV_Assert(inputs[i][0] == batch);
totalchannels += inputs[i][1];
}
outShape.push_back(totalchannels);
int height = inputs.back()[2];
int width = inputs.back()[3];
outShape.push_back(height);
outShape.push_back(width);
}
else
{
int maxwidth = -1;
int maxheight = -1;
int totalchannels = 0;
// Find largest blob size and count total channels
for (int i = 0; i < inputs.size(); ++i)
{
totalchannels += inputs[i][1];
maxheight = std::max(maxheight, inputs[i][2]);
maxwidth = std::max(maxwidth, inputs[i][3]);
CV_Assert(inputs[i][0] == batch);
}
outShape.push_back(totalchannels);
int out_h = divisor ? static_cast<int>(ceil(maxheight / static_cast<float>(divisor)) * divisor) : top_height;
int out_w = divisor ? static_cast<int>(ceil(maxwidth / static_cast<float>(divisor)) * divisor) : top_width;
// Layer can specify custom top size which is larger than default
if (out_h <= maxheight || out_w <= maxwidth)
{
out_h = maxheight;
out_w = maxwidth;
}
outShape.push_back(out_h);
outShape.push_back(out_w);
}
outputs.assign(1, outShape);
return false;
}
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
{
LayerParams resizeParams;
resizeParams.set("interpolation", "bilinear");
resizeParams.set("align_corners", true);
resize = ResizeLayer::create(resizeParams);
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
const int out_h = outputs[0].size[2];
const int out_w = outputs[0].size[3];
float* out_data = outputs[0].ptr<float>();
std::vector<int> sizes(&outputs[0].size[0], &outputs[0].size[0] + outputs[0].size.dims());
for (int i = 0; i < inputs.size() - have_reference; i++)
{
sizes[1] = inputs[i].size[1];
Mat outSlice(sizes, CV_32F, out_data);
if (out_h == inputs[i].size[2] && out_w == inputs[i].size[3])
{
inputs[i].copyTo(outSlice);
}
else
{
std::vector<Mat> inp_slices, out_slices;
inp_slices.push_back(inputs[i]);
out_slices.push_back(outSlice);
resize->finalize(inp_slices, out_slices);
resize->forward(inp_slices, out_slices, internals_arr);
}
out_data += outSlice.total(1);
}
}
private:
int top_height;
int top_width;
int divisor;
bool have_reference;
Ptr<ResizeLayer> resize;
};
Ptr<AccumLayer> AccumLayer::create(const LayerParams& params)
{
return Ptr<AccumLayer>(new AccumLayerImpl(params));
}
}} // namespace cv::dnn


@@ -0,0 +1,447 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2016, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
/*
Implementation of Batch Normalization layer.
*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/batch_norm.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class BatchNormLayerImpl CV_FINAL : public BatchNormLayer
{
public:
Mat origin_weights, origin_bias;
Mat weights_, bias_;
UMat umat_weight, umat_bias;
mutable int dims;
BatchNormLayerImpl(const LayerParams& params)
: dims(-1)
{
setParamsFrom(params);
CV_Assert(blobs.size() >= 2);
hasWeights = params.get<bool>("has_weight", false);
hasBias = params.get<bool>("has_bias", false);
useGlobalStats = params.get<bool>("use_global_stats", true);
if(params.get<bool>("scale_bias", false))
hasWeights = hasBias = true;
epsilon = params.get<float>("eps", 1E-5);
size_t n = blobs[0].total();
CV_Assert(blobs[1].total() == n &&
blobs[0].isContinuous() && blobs[1].isContinuous() &&
blobs[0].type() == CV_32F && blobs[1].type() == CV_32F);
float varMeanScale = 1.f;
if (!hasWeights && !hasBias && blobs.size() > 2 && useGlobalStats) {
CV_Assert(blobs.size() == 3); CV_CheckTypeEQ(blobs[2].type(), CV_32FC1, "");
varMeanScale = blobs[2].at<float>(0);
if (varMeanScale != 0)
varMeanScale = 1/varMeanScale;
}
const int biasBlobIndex = blobs.size() - 1;
const int weightsBlobIndex = biasBlobIndex - hasBias;
if( hasWeights )
{
CV_Assert((size_t)weightsBlobIndex < blobs.size());
const Mat& w = blobs[weightsBlobIndex];
CV_Assert(w.isContinuous() && w.type() == CV_32F && w.total() == (size_t)n);
}
if( hasBias )
{
CV_Assert((size_t)biasBlobIndex < blobs.size());
const Mat& b = blobs[biasBlobIndex];
CV_Assert(b.isContinuous() && b.type() == CV_32F && b.total() == (size_t)n);
}
const float* meanData = blobs[0].ptr<float>();
const float* stdData = blobs[1].ptr<float>();
const float* weightsData = hasWeights ? blobs[weightsBlobIndex].ptr<float>() : 0;
const float* biasData = hasBias ? blobs[biasBlobIndex].ptr<float>() : 0;
origin_weights.create(1, (int)n, CV_32F);
origin_bias.create(1, (int)n, CV_32F);
float* dstWeightsData = origin_weights.ptr<float>();
float* dstBiasData = origin_bias.ptr<float>();
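// Fold the normalization into a single per-channel affine transform:
// w = gamma / sqrt(var * varMeanScale + eps), b = beta - w * mean * varMeanScale.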
for (size_t i = 0; i < n; ++i)
{
float w = (hasWeights ? weightsData[i] : 1.0f) / sqrt(stdData[i] * varMeanScale + epsilon);
dstWeightsData[i] = w;
dstBiasData[i] = (hasBias ? biasData[i] : 0.0f) - w * meanData[i] * varMeanScale;
}
}
virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE
{
origin_weights.reshape(1, 1).copyTo(weights_);
origin_bias.reshape(1, 1).copyTo(bias_);
}
void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE
{
scale = weights_;
shift = bias_;
}
virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
{
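// Try to absorb a following scale/shift layer into this one:
// weights_ *= w and bias_ = bias_ * w + b.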
Mat w, b;
top->getScaleShift(w, b);
if (w.empty() && b.empty())
return false;
const int numChannels = weights_.total();
const int numFusedWeights = w.total();
const int numFusedBias = b.total();
if ((numFusedWeights != numChannels && numFusedWeights != 1 && !w.empty()) ||
(numFusedBias != numChannels && numFusedBias != 1 && !b.empty()))
return false;
if (!w.empty())
{
w = w.reshape(1, 1);
if (numFusedWeights == 1)
{
multiply(weights_, w.at<float>(0), weights_);
multiply(bias_, w.at<float>(0), bias_);
}
else
{
multiply(weights_, w, weights_);
multiply(bias_, w, bias_);
}
}
if (!b.empty())
{
b = b.reshape(1, 1);
if (numFusedBias == 1)
add(bias_, b.at<float>(0), bias_);
else
add(bias_, b.reshape(1, 1), bias_);
}
return true;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
dims = inputs[0].size();
if (!useGlobalStats && inputs[0][0] != 1)
CV_Error(Error::StsNotImplemented, "Batch normalization in training mode with batch size > 1");
Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
return true;
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return (backendId == DNN_BACKEND_OPENCV) ||
backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_HALIDE && haveHalide()) ||
((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && haveInfEngine() && (preferableTarget == DNN_TARGET_CPU || dims == 4));
}
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
bool use_half = (inputs_.depth() == CV_16S);
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
CV_Assert(blobs.size() >= 2);
CV_Assert(inputs.size() == 1);
if (use_half && inputs[0].dims == 2)
return false;
if (umat_weight.empty())
{
weights_.copyTo(umat_weight);
bias_.copyTo(umat_bias);
}
UMat &inpBlob = inputs[0];
int groups = inpBlob.size[0];
int channels = inpBlob.size[1];
int planeSize = 1;
for (size_t i = 2; i < inpBlob.dims; i++) {
planeSize *= inpBlob.size[i];
}
String opts = (use_half) ? " -DDtype=half" : " -DDtype=float";
for (size_t ii = 0; ii < outputs.size(); ii++)
{
if (inpBlob.dims == 2)
{
UMat& src = inputs[ii];
UMat& dst = outputs[ii];
multiply(src, weights_, dst);
add(dst, bias_, dst);
}
else
{
MatShape s = shape(groups * channels, planeSize);
UMat src = inputs[ii].reshape(1, s.size(), &s[0]);
UMat dst = outputs[ii].reshape(1, s.size(), &s[0]);
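// Pick the widest vectorization (8, 4 or 1) that divides the row length
// and build the matching batch_norm kernel variant.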
int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
String buildopt = format("-DNUM=%d", number) + opts;
String kname = format("batch_norm%d", number);
if (number == 1)
buildopt += format(" -Dconvert_T=convert_%s", use_half ? "half" : "float");
else
buildopt += format(" -Dconvert_T=convert_%s%d", use_half ? "half" : "float", number);
ocl::Kernel kernel(kname.c_str(), ocl::dnn::batchnorm_oclsrc, buildopt);
if (kernel.empty())
return false;
size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) };
kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
kernel.set(1, (int)s[0]);
kernel.set(2, (int)s[1]);
kernel.set(3, (int)channels);
kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_weight));
kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_bias));
kernel.set(6, ocl::KernelArg::PtrWriteOnly(dst));
bool ret = kernel.run_(2, global, NULL, false);
if (!ret)
return false;
}
}
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
CV_Assert(blobs.size() >= 2);
CV_Assert(inputs.size() == 1);
Mat &inpBlob = inputs[0];
int planeSize = 1;
for (size_t i = 2; i < inpBlob.dims; i++) {
planeSize *= inpBlob.size[i];
}
for (size_t ii = 0; ii < outputs.size(); ii++)
{
Mat &outBlob = outputs[ii];
for(int num = 0; num < outBlob.size[0]; num++)
{
for (int n = 0; n < outBlob.size[1]; n++)
{
float w = weights_.at<float>(n);
float b = bias_.at<float>(n);
Mat inpBlobPlane(1, planeSize, CV_32F, inpBlob.ptr<float>(num, n));
Mat outBlobPlane(1, planeSize, CV_32F, outBlob.ptr<float>(num, n));
inpBlobPlane.convertTo(outBlobPlane, CV_32F, w, b);
}
}
}
}
void forwardSlice(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const CV_OVERRIDE
{
for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
{
int i = 0;
float w = weights_.at<float>(cn);
float b = bias_.at<float>(cn);
#if CV_SIMD128
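// Vectorized path: apply y = w*x + b to 16 floats per iteration.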
v_float32x4 wV = v_setall_f32(w), bV = v_setall_f32(b);
for( ; i <= len - 16; i += 16 )
{
v_float32x4 x0 = v_load(srcptr + i);
v_float32x4 x1 = v_load(srcptr + i + 4);
v_float32x4 x2 = v_load(srcptr + i + 8);
v_float32x4 x3 = v_load(srcptr + i + 12);
x0 = v_muladd(x0, wV, bV);
x1 = v_muladd(x1, wV, bV);
x2 = v_muladd(x2, wV, bV);
x3 = v_muladd(x3, wV, bV);
v_store(dstptr + i, x0);
v_store(dstptr + i + 4, x1);
v_store(dstptr + i + 8, x2);
v_store(dstptr + i + 12, x3);
}
#endif
for( ; i < len; i++ )
dstptr[i] = w * srcptr[i] + b;
}
}
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
return make_cuda_node<cuda4dnn::BatchNormOp>(preferableTarget, std::move(context->stream), weights_, bias_);
}
#endif
virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node) CV_OVERRIDE
{
switch (node->backendId)
{
case DNN_BACKEND_HALIDE:
{
#ifdef HAVE_HALIDE
auto base = node.dynamicCast<HalideBackendNode>();
Halide::Func& input = base->funcs.back();
Halide::Var x("x"), y("y"), c("c"), n("n");
Halide::Func top = attachHalide(input(x, y, c, n));
return Ptr<BackendNode>(new HalideBackendNode(base, top));
#endif // HAVE_HALIDE
break;
}
}
return Ptr<BackendNode>();
}
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
#ifdef HAVE_HALIDE
Halide::Buffer<float> input = halideBuffer(inputs[0]);
Halide::Var x("x"), y("y"), c("c"), n("n");
Halide::Func top = attachHalide(input(x, y, c, n));
return Ptr<BackendNode>(new HalideBackendNode(top));
#endif // HAVE_HALIDE
return Ptr<BackendNode>();
}
#ifdef HAVE_HALIDE
// attachHalide can work both with Halide::Buffer and Halide::Func. In the
// second case it will be a fusion.
Halide::Func attachHalide(const Halide::Expr& input)
{
Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
Halide::Var x("x"), y("y"), c("c"), n("n");
const int numChannels = weights_.total();
auto weights = wrapToHalideBuffer(weights_, {numChannels});
auto bias = wrapToHalideBuffer(bias_, {numChannels});
top(x, y, c, n) = input * weights(c) + bias(c);
return top;
}
#endif // HAVE_HALIDE
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
InferenceEngine::Builder::Layer ieLayer = InferenceEngine::Builder::ScaleShiftLayer(name);
const size_t numChannels = weights_.total();
addConstantData("weights", wrapToInfEngineBlob(weights_, {numChannels}, InferenceEngine::Layout::C), ieLayer);
addConstantData("biases", wrapToInfEngineBlob(bias_, {numChannels}, InferenceEngine::Layout::C), ieLayer);
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
}
#endif // HAVE_DNN_IE_NN_BUILDER_2019
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
std::vector<size_t> shape(ieInpNode->get_shape().size(), 1);
shape[1] = weights_.total();
auto weight = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), weights_.data);
auto bias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), bias_.data);
#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2021_2)
auto scale_node = std::make_shared<ngraph::op::v1::Multiply>(ieInpNode, weight, ngraph::op::AutoBroadcastType::NUMPY);
#else
auto scale_node = std::make_shared<ngraph::op::v0::Multiply>(ieInpNode, weight, ngraph::op::AutoBroadcastType::NUMPY);
#endif
auto scale_shift = std::make_shared<ngraph::op::v1::Add>(scale_node, bias, ngraph::op::AutoBroadcastType::NUMPY);
return Ptr<BackendNode>(new InfEngineNgraphNode(scale_shift));
}
#endif // HAVE_DNN_NGRAPH
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
params.set("input_scale", scales[0][0]);
params.set("input_zeropoint", zeropoints[0][0]);
params.blobs.clear();
params.blobs.push_back(origin_weights);
params.blobs.push_back(origin_bias);
return true;
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
CV_UNUSED(outputs); // suppress unused variable warning
int64 flops = 0;
for(int i = 0; i < inputs.size(); i++)
{
flops += 3*total(inputs[i]);
}
return flops;
}
private:
bool useGlobalStats;
};
Ptr<BatchNormLayer> BatchNormLayer::create(const LayerParams& params)
{
return Ptr<BatchNormLayer>(new BatchNormLayerImpl(params));
}
} // namespace dnn
} // namespace cv


@@ -0,0 +1,198 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/reshape.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class BlankLayerImpl CV_FINAL : public BlankLayer
{
public:
BlankLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_CUDA ||
((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && haveInfEngine());
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
return true;
}
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
for (int i = 0, n = outputs.size(); i < n; ++i)
{
void *src_handle = inputs[i].handle(ACCESS_READ);
void *dst_handle = outputs[i].handle(ACCESS_WRITE);
if (src_handle != dst_handle)
inputs[i].copyTo(outputs[i]);
}
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
for (int i = 0, n = outputs.size(); i < n; ++i)
if (outputs[i].data != inputs[i].data)
inputs[i].copyTo(outputs[i]);
}
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
{
InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
std::vector<size_t> dims = input->getDims();
CV_Assert(!dims.empty());
InferenceEngine::Builder::Layer ieLayer(name);
ieLayer.setName(name);
if (preferableTarget == DNN_TARGET_MYRIAD || preferableTarget == DNN_TARGET_HDDL)
{
ieLayer.setType("Copy");
}
else
{
ieLayer.setType("Split");
ieLayer.getParameters()["axis"] = dims.size() - 1;
ieLayer.getParameters()["out_sizes"] = dims[0];
}
ieLayer.setInputPorts({InferenceEngine::Port(dims)});
ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
}
#endif // HAVE_DNN_IE_NN_BUILDER_2019
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
ngraph::OutputVector inp{ieInpNode};
auto blank = std::make_shared<ngraph::op::Concat>(inp, 0);
return Ptr<BackendNode>(new InfEngineNgraphNode(blank));
}
#endif // HAVE_DNN_NGRAPH
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
return make_cuda_node<cuda4dnn::ReshapeOp>(preferableTarget, std::move(context->stream));
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
};
Ptr<Layer> BlankLayer::create(const LayerParams& params)
{
// In case of Caffe's Dropout layer from Faster-RCNN framework,
// https://github.com/rbgirshick/caffe-fast-rcnn/tree/faster-rcnn
// return Power layer.
if (!params.get<bool>("scale_train", true))
{
float scale = 1 - params.get<float>("dropout_ratio", 0.5f);
CV_Assert(scale > 0);
LayerParams powerParams;
powerParams.name = params.name;
powerParams.type = "Power";
powerParams.set("scale", scale);
return PowerLayer::create(powerParams);
}
else
return Ptr<BlankLayer>(new BlankLayerImpl(params));
}
}
}


@@ -0,0 +1,419 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include "../op_vkcom.hpp"
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/concat.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class ConcatLayerImpl CV_FINAL : public ConcatLayer
{
public:
ConcatLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
axis = params.get<int>("axis", 1);
padding = params.get<bool>("padding", false);
paddingValue = params.get<int>("padding_value", 0);
}
virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() > 0);
outputs.resize(1, inputs[0]);
int cAxis = normalize_axis(axis, inputs[0]);
int axisSum = 0;
for (size_t i = 0; i < inputs.size(); i++)
{
MatShape curShape = inputs[i];
if (padding)
{
for (int curAxis = 0; curAxis < outputs[0].size(); curAxis++)
{
outputs[0][curAxis] = std::max(outputs[0][curAxis], curShape[curAxis]);
}
}
else
{
CV_Assert(curShape.size() == outputs[0].size());
for (int curAxis = 0; curAxis < outputs[0].size(); curAxis++)
{
if (curAxis != cAxis && outputs[0][curAxis] != curShape[curAxis])
CV_Error(Error::StsBadSize, "Inconsistent shape for ConcatLayer");
}
}
axisSum += curShape[cAxis];
}
outputs[0][cAxis] = axisSum;
return false;
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1 && !padding) || // By channels
(backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && haveInfEngine() && !padding) ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ||
(backendId == DNN_BACKEND_VKCOM && haveVulkan() && !padding);
}
template <class T>
class ChannelConcatInvoker : public ParallelLoopBody
{
public:
std::vector<Mat>* inputs;
Mat* output;
int nstripes;
std::vector<const T*> chptrs;
static void run(std::vector<Mat>& inputs, Mat& output, int nstripes)
{
ChannelConcatInvoker cc;
cc.inputs = &inputs;
cc.output = &output;
cc.nstripes = nstripes;
size_t i, ninputs = inputs.size();
int nchannels = 0, batchsz = output.size[0];
for( i = 0; i < ninputs; i++ )
{
Mat& inp = inputs[i];
CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S || inp.type() == CV_8S) &&
inp.dims == 4 && inp.size[0] == output.size[0] &&
inp.size[2] == output.size[2] &&
inp.size[3] == output.size[3] );
nchannels += inp.size[1];
}
CV_Assert( nchannels == output.size[1] );
CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S || output.type() == CV_8S) );
cc.chptrs.resize(nchannels*batchsz);
int ofs = 0;
for( i = 0; i < ninputs; i++)
{
Mat& inp = inputs[i];
for( int j = 0; j < batchsz; j++ )
for( int k = 0; k < inp.size[1]; k++ )
{
const T* ptr = inp.ptr<T>(j, k);
cc.chptrs[ofs + j*nchannels + k] = ptr;
}
ofs += inp.size[1];
}
parallel_for_(Range(0, nstripes), cc, nstripes);
}
ChannelConcatInvoker() : inputs(0), output(0), nstripes(0) {}
void operator()(const Range& r) const CV_OVERRIDE
{
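// The output is a flat sequence of batch*channel planes; split the total
// element count into stripes and copy each plane slice with memcpy in
// blocks of at most 2^16 elements.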
size_t planeSize = (size_t)output->size[2]*output->size[3];
size_t nch = chptrs.size();
size_t total = nch*planeSize;
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = std::min(total, r.end*stripeSize);
const T** ptrs = (const T**)&chptrs[0];
T* outptr = output->ptr<T>();
size_t blockSize0 = 1 << 16;
for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
{
size_t ch = ofs0/planeSize;
size_t ofs = ofs0 - ch*planeSize;
size_t blockSize = std::min(blockSize0, planeSize - ofs);
memcpy(outptr + ofs0, ptrs[ch] + ofs, blockSize*sizeof(outptr[0]));
ofs0 += blockSize;
}
}
};
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
int cAxis = normalize_axis(axis, inputs[0].dims);
if (padding)
return false;
int bottom_concat_axis;
int concat_size = total(shape(inputs[0]), cAxis + 1);
int top_concat_axis = outputs[0].size[cAxis];
int num_concats = total(shape(inputs[0]), 0, cAxis);
int offset_concat_axis = 0;
UMat& outMat = outputs[0];
String buildopt = format(" -DDtype=%s", (use_half) ? "half" : "float");
String kname = format("concat_%s", use_half ? "half" : "float");
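// Launch one concat kernel per input, advancing offset_concat_axis so each
// input lands in its own slice of the output along the concat axis.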
for (size_t i = 0; i < inputs.size(); i++)
{
ocl::Kernel kernel(kname.c_str(), ocl::dnn::concat_oclsrc, buildopt);
if (kernel.empty())
return false;
UMat& inpMat = inputs[i];
bottom_concat_axis = inputs[i].size[cAxis];
size_t nthreads = inputs[i].total();
kernel.set(0, (int)nthreads);
kernel.set(1, ocl::KernelArg::PtrReadOnly(inpMat));
kernel.set(2, (int)num_concats);
kernel.set(3, (int)concat_size);
kernel.set(4, (int)top_concat_axis);
kernel.set(5, (int)bottom_concat_axis);
kernel.set(6, (int)offset_concat_axis);
kernel.set(7, ocl::KernelArg::PtrWriteOnly(outMat));
if (!kernel.run(1, &nthreads, NULL, false))
return false;
offset_concat_axis += bottom_concat_axis;
}
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
inputs_arr.depth() != CV_8S,
forward_ocl(inputs_arr, outputs_arr, internals_arr))
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
int cAxis = normalize_axis(axis, inputs[0].dims);
Mat& outMat = outputs[0];
if (padding)
outMat.setTo(paddingValue);
if( cAxis == 1 && outMat.dims == 4 && !padding)
{
int nstripes = getNumThreads();
if (outMat.type() == CV_8S)
ChannelConcatInvoker<int8_t>::run(inputs, outMat, nstripes);
else
ChannelConcatInvoker<float>::run(inputs, outMat, nstripes);
}
else
{
std::vector<Range> ranges(outputs[0].dims, Range::all());
ranges[cAxis].start = 0;
for (size_t i = 0; i < inputs.size(); i++)
{
ranges[cAxis].end = ranges[cAxis].start + inputs[i].size[cAxis];
for (int j = 0; j < outMat.dims; ++j)
{
if (j == cAxis) continue;
ranges[j].start = (outMat.size[j] - inputs[i].size[j]) / 2;
ranges[j].end = ranges[j].start + inputs[i].size[j];
}
inputs[i].copyTo(outMat(&ranges[0]));
ranges[cAxis].start = ranges[cAxis].end;
}
}
}
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
auto concat_axis = normalize_axis(axis, input_wrapper->getRank());
return make_cuda_node<cuda4dnn::ConcatOp>(preferableTarget, std::move(context->stream), concat_axis, padding);
}
#endif
virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
{
#ifdef HAVE_VULKAN
vkcom::Tensor in = VkComTensor(input[0]);
int cAxis = normalize_axis(axis, in.dimNum());
std::shared_ptr<vkcom::OpBase> op(new vkcom::OpConcat(cAxis));
return Ptr<BackendNode>(new VkComBackendNode(input, op));
#endif // HAVE_VULKAN
return Ptr<BackendNode>();
}
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
{
#ifdef HAVE_HALIDE
std::vector<Halide::Buffer<> > inputBuffers = halideBuffers(input);
Halide::Var x("x"), y("y"), c("c"), n("n");
Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
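// Chain select() over the channel coordinate so that each input's channel
// range maps onto its offset interval in the concatenated output.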
int offset = inputBuffers[0].channels();
Halide::Expr topExpr = select(c < offset,
inputBuffers[0](x, y, c, n),
inputBuffers[1](x, y, c - offset, n));
for (int i = 2; i < input.size(); ++i)
{
offset += inputBuffers[i - 1].channels();
topExpr = select(c < offset, topExpr,
inputBuffers[i](x, y, c - offset, n));
}
top(x, y, c, n) = topExpr;
return Ptr<BackendNode>(new HalideBackendNode(top));
#endif // HAVE_HALIDE
return Ptr<BackendNode>();
}
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
{
InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
InferenceEngine::Builder::ConcatLayer ieLayer(name);
ieLayer.setAxis(normalize_axis(axis, input->getDims().size()));
ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(inputs.size()));
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
}
#endif // HAVE_DNN_IE_NN_BUILDER_2019
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
InferenceEngine::DataPtr data = ngraphDataNode(inputs[0]);
const int numDims = data->getDims().size();
const int cAxis = normalize_axis(axis, numDims);
std::vector<size_t> maxDims(numDims, 0);
CV_Assert(inputs.size() == nodes.size());
ngraph::OutputVector inp_nodes;
for (int i = 0; i < nodes.size(); ++i)
{
inp_nodes.push_back(nodes[i].dynamicCast<InfEngineNgraphNode>()->node);
std::vector<size_t> inpShape = ngraphDataNode(inputs[i])->getDims();
for (int i = 0; i < numDims; ++i)
maxDims[i] = std::max(maxDims[i], inpShape[i]);
}
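// Symmetrically pad any input whose non-axis dimensions are smaller than
// the per-dimension maximum before concatenation.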
for (int i = 0; i < inp_nodes.size(); ++i)
{
bool needPadding = false;
std::vector<size_t> inpShape = ngraphDataNode(inputs[i])->getDims();
std::vector<int64_t> begins(inpShape.size(), 0), ends(inpShape.size(), 0);
for (int j = 0; j < inpShape.size(); ++j)
{
if (j != cAxis && inpShape[j] != maxDims[j])
{
needPadding = true;
begins[j] = static_cast<int64_t>((maxDims[j] - inpShape[j]) / 2);
ends[j] = static_cast<int64_t>(maxDims[j] - inpShape[j] - begins[j]);
}
}
if (needPadding)
{
inp_nodes[i] = std::make_shared<ngraph::op::v1::Pad>(
inp_nodes[i],
std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{begins.size()}, begins.data()),
std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{ends.size()}, ends.data()),
ngraph::op::PadMode::CONSTANT);
}
}
auto concat = std::make_shared<ngraph::op::Concat>(inp_nodes, cAxis);
return Ptr<BackendNode>(new InfEngineNgraphNode(concat));
}
#endif // HAVE_DNN_NGRAPH
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
if (padding)
params.set("padding_value", zeropoints[1][0]);
return true;
}
};
Ptr<ConcatLayer> ConcatLayer::create(const LayerParams& params)
{
return Ptr<ConcatLayer>(new ConcatLayerImpl(params));
}
}
}


@@ -0,0 +1,131 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2018, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
#include "../op_inf_engine.hpp"
#include "../op_cuda.hpp"
#include "layers_common.hpp"
#include "../ie_ngraph.hpp"
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/const.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv { namespace dnn {
class ConstLayerImpl CV_FINAL : public ConstLayer
{
public:
ConstLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
CV_Assert(blobs.size() == 1);
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ||
backendId == DNN_BACKEND_CUDA;
}
virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.empty());
outputs.assign(1, shape(blobs[0]));
return false;
}
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
std::vector<UMat> outputs;
outs.getUMatVector(outputs);
if (outs.depth() == CV_16S)
convertFp16(blobs[0], outputs[0]);
else
blobs[0].copyTo(outputs[0]);
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
std::vector<Mat> outputs;
outputs_arr.getMatVector(outputs);
blobs[0].copyTo(outputs[0]);
}
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
InferenceEngine::Builder::ConstLayer ieLayer(name);
ieLayer.setData(wrapToInfEngineBlob(blobs[0]));
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
}
#endif // HAVE_DNN_IE_NN_BUILDER_2019
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
getShape<size_t>(blobs[0]),
blobs[0].data);
return Ptr<BackendNode>(new InfEngineNgraphNode(node));
}
#endif // HAVE_DNN_NGRAPH
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
CV_Assert(blobs.size() == 1);
return make_cuda_node<cuda4dnn::ConstOp>(preferableTarget, std::move(context->stream), blobs[0]);
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
Mat quantizedBlob;
blobs[0].convertTo(quantizedBlob, CV_8S, 1.f/scales[1][0], zeropoints[1][0]);
params.blobs.clear();
params.blobs.push_back(quantizedBlob);
return true;
}
};
Ptr<Layer> ConstLayer::create(const LayerParams& params)
{
return Ptr<Layer>(new ConstLayerImpl(params));
}
}} // namespace cv::dnn

File diff suppressed because it is too large.


@@ -0,0 +1,207 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2020, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
#include "layers_common.hpp"
namespace cv { namespace dnn {
class CorrelationLayerImpl CV_FINAL : public CorrelationLayer
{
public:
CorrelationLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
pad = params.get<int>("pad", 0);
CV_Assert_N(params.has("kernel_size"), params.has("max_displacement"));
max_displacement = params.get<int>("max_displacement");
kernel = params.get<int>("kernel_size");
if (kernel % 2 == 0)
CV_Error(Error::StsNotImplemented, "Odd kernel size required.");
stride_1 = params.get<int>("stride_1", 1);
stride_2 = params.get<int>("stride_2", 1);
}
virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert_N(inputs.size() == 2, inputs[0].size() == 4, inputs[1].size() == 4);
int padded_height = inputs[0][2] + 2 * pad;
int padded_width = inputs[0][3] + 2 * pad;
int kernel_radius = (kernel - 1) / 2;
int border_size = max_displacement + kernel_radius;
int neighborhood_grid_radius = max_displacement / stride_2;
int neighborhood_grid_width = neighborhood_grid_radius * 2 + 1;
std::vector<int> outShape;
int num = inputs[0][0];
outShape.push_back(num);
int out_c = neighborhood_grid_width * neighborhood_grid_width;
outShape.push_back(out_c);
int out_h = ceil(static_cast<float>(padded_height - border_size * 2) / stride_1);
int out_w = ceil(static_cast<float>(padded_width - border_size * 2) / stride_1);
CV_Assert_N(out_h >= 1, out_w >= 1);
outShape.push_back(out_h);
outShape.push_back(out_w);
outputs.assign(1, outShape);
return false;
}
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
{
std::vector<Mat> inputs;
inputs_arr.getMatVector(inputs);
int padded_height = inputs[0].size[2] + 2 * pad;
int padded_width = inputs[0].size[3] + 2 * pad;
int size[] = {inputs[0].size[0], padded_height, padded_width, inputs[0].size[1]};
rbot0 = Mat(4, &size[0], CV_32F, float(0));
rbot1 = Mat(4, &size[0], CV_32F, float(0));
}
void blobRearrangeKernel2(const Mat& input, Mat& output)
{
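// Repack the NCHW input into a zero-padded NHWC buffer so that each pixel's
// channel vector is contiguous for the correlation inner product.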
const int num = input.size[0];
const int channels = input.size[1];
const int height = input.size[2];
const int width = input.size[3];
const int area = height * width;
const int pad_area = (width + 2 * pad) * (height + 2 * pad);
const float* in = input.ptr<float>();
float* out = output.ptr<float>();
for (int n = 0; n < num; n++)
{
for (int ch = 0; ch < channels; ch++)
{
for (int xy = 0; xy < area; xy++)
{
float value = in[(n * channels + ch) * area + xy];
int xpad = (xy % width + pad);
int ypad = (xy / width + pad);
int xypad = ypad * (width + 2 * pad) + xpad;
out[(n * pad_area + xypad) * channels + ch] = value;
}
}
}
}
void correlationKernelSubtraction(const Mat& input0, const Mat& input1, Mat& output, int item)
{
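// For every output pixel, correlate a kernel-sized patch of input0 with a
// patch of input1 displaced by (s2o, s2p); each output channel is one
// displacement in the neighborhood grid, normalized by the patch size.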
const int inp_h = input0.size[1];
const int inp_w = input0.size[2];
const int inp_c = input0.size[3];
const int out_c = output.size[1];
const int out_h = output.size[2];
const int out_w = output.size[3];
int topcount = output.total(1);
int neighborhood_grid_radius = max_displacement / stride_2;
int neighborhood_grid_width = neighborhood_grid_radius * 2 + 1;
const float* inp0_data = input0.ptr<float>();
const float* inp1_data = input1.ptr<float>();
float* out_data = output.ptr<float>();
int sumelems = kernel * kernel * inp_c;
std::vector<float> patch_data(sumelems, 0);
for (int y = 0; y < out_h; y++)
{
for (int x = 0; x < out_w; x++)
{
int x1 = x * stride_1 + max_displacement;
int y1 = y * stride_1 + max_displacement;
for (int j = 0; j < kernel; j++)
{
for (int i = 0; i < kernel; i++)
{
int ji_off = ((j * kernel) + i) * inp_c;
for (int ch = 0; ch < inp_c; ch++)
{
int idx1 = ((item * inp_h + y1 + j) * inp_w + x1 + i) * inp_c + ch;
int idxPatchData = ji_off + ch;
patch_data[idxPatchData] = inp0_data[idx1];
}
}
}
for (int out_ch = 0; out_ch < out_c; out_ch++)
{
float sum = 0;
int s2o = (out_ch % neighborhood_grid_width - neighborhood_grid_radius) * stride_2;
int s2p = (out_ch / neighborhood_grid_width - neighborhood_grid_radius) * stride_2;
int x2 = x1 + s2o;
int y2 = y1 + s2p;
for (int j = 0; j < kernel; j++)
{
for (int i = 0; i < kernel; i++)
{
int ji_off = ((j * kernel) + i) * inp_c;
for (int ch = 0; ch < inp_c; ch++)
{
int idxPatchData = ji_off + ch;
int idx2 = ((item * inp_h + y2 + j) * inp_w + x2 + i) * inp_c + ch;
sum += patch_data[idxPatchData] * inp1_data[idx2];
}
}
}
int index = ((out_ch * out_h + y) * out_w) + x;
out_data[index + item * topcount] = static_cast<float>(sum) / sumelems;
}
}
}
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
std::vector<Mat> inputs, outputs, internals;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
internals_arr.getMatVector(internals);
blobRearrangeKernel2(inputs[0], rbot0);
blobRearrangeKernel2(inputs[1], rbot1);
for (int i = 0; i < inputs[0].size[0]; i++)
{
correlationKernelSubtraction(rbot0, rbot1, outputs[0], i);
}
}
private:
int pad;
int kernel;
int max_displacement;
int stride_1;
int stride_2;
Mat rbot0;
Mat rbot1;
};
Ptr<CorrelationLayer> CorrelationLayer::create(const LayerParams& params)
{
return Ptr<CorrelationLayer>(new CorrelationLayerImpl(params));
}
}} // namespace cv::dnn


@@ -0,0 +1,186 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2018, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
#include "../ie_ngraph.hpp"
#include "layers_common.hpp"
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/crop_and_resize.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv { namespace dnn {
class CropAndResizeLayerImpl CV_FINAL : public CropAndResizeLayer
{
public:
CropAndResizeLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
CV_Assert_N(params.has("width"), params.has("height"));
outWidth = params.get<float>("width");
outHeight = params.get<float>("height");
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV
|| backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH
|| backendId == DNN_BACKEND_CUDA
;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert_N(inputs.size() == 2, inputs[0].size() == 4);
if (inputs[0][0] != 1)
CV_Error(Error::StsNotImplemented, "");
outputs.resize(1, MatShape(4));
outputs[0][0] = inputs[1][2]; // Number of bounding boxes.
outputs[0][1] = inputs[0][1]; // Number of channels.
outputs[0][2] = outHeight;
outputs[0][3] = outWidth;
return false;
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
Mat& inp = inputs[0];
Mat& out = outputs[0];
Mat boxes = inputs[1].reshape(1, inputs[1].total() / 7);
const int numChannels = inp.size[1];
const int inpHeight = inp.size[2];
const int inpWidth = inp.size[3];
const int inpSpatialSize = inpHeight * inpWidth;
const int outSpatialSize = outHeight * outWidth;
CV_Assert_N(inp.isContinuous(), out.isContinuous());
for (int b = 0; b < boxes.rows; ++b)
{
float* outDataBox = out.ptr<float>(b);
float left = boxes.at<float>(b, 3);
float top = boxes.at<float>(b, 4);
float right = boxes.at<float>(b, 5);
float bottom = boxes.at<float>(b, 6);
float boxWidth = right - left;
float boxHeight = bottom - top;
float heightScale = boxHeight * static_cast<float>(inpHeight - 1) / (outHeight - 1);
float widthScale = boxWidth * static_cast<float>(inpWidth - 1) / (outWidth - 1);
for (int y = 0; y < outHeight; ++y)
{
float input_y = top * (inpHeight - 1) + y * heightScale;
int y0 = static_cast<int>(input_y);
const float* inpData_row0 = inp.ptr<float>(0, 0, y0);
const float* inpData_row1 = (y0 + 1 < inpHeight) ? (inpData_row0 + inpWidth) : inpData_row0;
for (int x = 0; x < outWidth; ++x)
{
float input_x = left * (inpWidth - 1) + x * widthScale;
int x0 = static_cast<int>(input_x);
int x1 = std::min(x0 + 1, inpWidth - 1);
float* outData = outDataBox + y * outWidth + x;
const float* inpData_row0_c = inpData_row0;
const float* inpData_row1_c = inpData_row1;
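// Bilinear interpolation between the four neighboring input pixels
// (x0, y0), (x1, y0), (x0, y1), (x1, y1), accumulated per channel.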
for (int c = 0; c < numChannels; ++c)
{
*outData = inpData_row0_c[x0] +
(input_y - y0) * (inpData_row1_c[x0] - inpData_row0_c[x0]) +
(input_x - x0) * (inpData_row0_c[x1] - inpData_row0_c[x0] +
(input_y - y0) * (inpData_row1_c[x1] - inpData_row0_c[x1] - inpData_row1_c[x0] + inpData_row0_c[x0]));
inpData_row0_c += inpSpatialSize;
inpData_row1_c += inpSpatialSize;
outData += outSpatialSize;
}
}
}
}
if (boxes.rows < out.size[0])
{
// left = top = right = bottom = 0
std::vector<cv::Range> dstRanges(4, Range::all());
dstRanges[0] = Range(boxes.rows, out.size[0]);
out(dstRanges).setTo(inp.ptr<float>(0, 0, 0)[0]);
}
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
// Slice second input: from 1x1xNx7 to 1x1xNx5
auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
auto rois = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
auto rois_shape = rois->get_shape();
std::vector<int64_t> dims(rois_shape.begin(), rois_shape.end()), offsets(4, 0);
offsets[3] = 2;
dims[3] = 7;
auto lower_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
ngraph::Shape{offsets.size()}, offsets.data());
auto upper_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
ngraph::Shape{dims.size()}, dims.data());
auto strides = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
ngraph::Shape{dims.size()}, std::vector<int64_t>((int64_t)dims.size(), 1));
auto slice = std::make_shared<ngraph::op::v1::StridedSlice>(rois,
lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});
// Reshape rois from 4D to 2D
std::vector<int64_t> shapeData = {dims[2], 5};
auto shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, shapeData.data());
auto reshape = std::make_shared<ngraph::op::v1::Reshape>(slice, shape, true);
auto roiPooling =
std::make_shared<ngraph::op::v0::ROIPooling>(input, reshape,
ngraph::Shape{(size_t)outHeight, (size_t)outWidth},
1.0f, "bilinear");
return Ptr<BackendNode>(new InfEngineNgraphNode(roiPooling));
}
#endif // HAVE_DNN_NGRAPH
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
return make_cuda_node<cuda4dnn::CropAndResizeOp>(preferableTarget, std::move(context->stream));
}
#endif
private:
int outWidth, outHeight;
};
Ptr<Layer> CropAndResizeLayer::create(const LayerParams& params)
{
return Ptr<CropAndResizeLayer>(new CropAndResizeLayerImpl(params));
}
} // namespace dnn
} // namespace cv


@@ -0,0 +1,131 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../precomp.hpp"
#include "layers_common.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
{
namespace dnn
{
class CumSumLayerImpl CV_FINAL : public CumSumLayer
{
public:
CumSumLayerImpl(const LayerParams &params)
{
axis_raw = params.get<int>("axis", 0);
exclusive_raw = params.get<int>("exclusive", 0);
reverse_raw = params.get<int>("reverse", 0);
setParamsFrom(params);
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
return true;
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs, internals;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
// Get x tensor.
const auto &src_mat = inputs[0];
const auto *src_ptr = src_mat.ptr<float>();
// Get axis.
const int axis = normalize_axis(axis_raw, src_mat.dims);
// Get y tensor.
auto &dst_mat = outputs[0];
src_mat.copyTo(dst_mat);
auto *dst_ptr = dst_mat.ptr<float>();
// Get flags.
const auto exclusive = exclusive_raw == 1;
const auto reverse = reverse_raw == 1;
// Get parameters to iterate outer dimension.
const size_t outer_size = src_mat.total(0, axis);
const size_t outer_step_length = src_mat.total(axis);
// Get parameters to iterate inner dimension.
const size_t inner_size = src_mat.size[axis];
if (!inner_size)
return;
const size_t inner_step_length = src_mat.total(axis + 1);
const int inner_step = (reverse ? -1 : 1) * inner_step_length;
const int inner_start = reverse ? inner_size - 1 : 0;
const int inner_stop = reverse ? -1 : inner_size;
const int inner_delta = reverse ? -1 : 1;
// Get parameters to populate channels.
const size_t num_channels = src_mat.total(axis + 1);
for (size_t outer_dim = 0; outer_dim < outer_size; outer_dim++)
{
const size_t outer_offset = outer_dim * outer_step_length;
size_t src_offset = outer_offset + inner_start * inner_step_length;
// Populate first element of inner dimension.
for (size_t channel = 0; channel < num_channels; channel++)
{
if (exclusive)
{
dst_ptr[src_offset + channel] = 0.0f;
}
else
{
dst_ptr[src_offset + channel] = src_ptr[src_offset + channel];
src_offset += inner_step;
}
}
// Populate remaining elements of inner dimension.
for (int inner_dim = inner_start + inner_delta; inner_dim != inner_stop; inner_dim += inner_delta)
{
const size_t dst_offset = outer_offset + inner_dim * inner_step_length;
for (size_t channel = 0; channel < num_channels; channel++)
{
const size_t previous_dst_offset = dst_offset - inner_step;
dst_ptr[dst_offset + channel] = dst_ptr[previous_dst_offset + channel] +
src_ptr[src_offset + channel];
src_offset += inner_step;
}
}
}
}
int axis_raw;
int exclusive_raw;
int reverse_raw;
};
Ptr<CumSumLayer> CumSumLayer::create(const LayerParams& params)
{
return Ptr<CumSumLayer>(new CumSumLayerImpl(params));
}
}
}
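
A hedged usage sketch for the layer above (not part of this commit): it assumes the public CumSumLayer factory declared in opencv2/dnn/all_layers.hpp and uses an illustrative 1x4 input.

#include <opencv2/dnn.hpp>
#include <opencv2/dnn/all_layers.hpp>

int main()
{
    cv::dnn::LayerParams lp;
    lp.set("axis", 1);        // accumulate along the last axis of a 1x4 input
    lp.set("exclusive", 0);   // include the current element in each sum
    lp.set("reverse", 0);
    cv::Ptr<cv::dnn::CumSumLayer> layer = cv::dnn::CumSumLayer::create(lp);

    cv::Mat src = (cv::Mat_<float>(1, 4) << 1, 2, 3, 4);
    std::vector<cv::Mat> inputs{src}, outputs{cv::Mat(1, 4, CV_32F)}, internals;
    layer->forward(inputs, outputs, internals);
    // outputs[0] now holds the running sums {1, 3, 6, 10}
    return 0;
}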

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -0,0 +1,965 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/eltwise.hpp"
#include "../cuda4dnn/primitives/shortcut.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class EltwiseLayerImpl CV_FINAL : public EltwiseLayer
{
public:
enum EltwiseOp
{
PROD = 0,
SUM = 1,
MAX = 2,
DIV = 3,
MIN = 4,
} op;
std::vector<float> coeffs;
enum OutputChannelsMode
{
ELTWISE_CHANNNELS_SAME = 0, //!< number of channels from inputs must be the same and equal to output's number of channels
ELTWISE_CHANNNELS_INPUT_0, //!< number of channels from inputs may be different,
//!< output's number of channels is equal to number of channels of first input
//!< number of channels of other inputs should not be greater than number of channels of first input
ELTWISE_CHANNNELS_INPUT_0_TRUNCATE, //!< number of channels from inputs may be different,
//!< output's number of channels is equal to number of channels of first input
//!< there is no restriction on the number of channels of other inputs
//!< extra channels of other inputs are ignored
ELTWISE_CHANNNELS_USE_MAX, //!< number of channels from inputs may be different,
//!< output's number of channels is equal to maximal number of input channels
//!< @note supported operation: `SUM`
} channelsModeInput;
mutable OutputChannelsMode channelsMode; //!< "optimized" channels mode (switch to ELTWISE_CHANNNELS_SAME if number of input channels are equal)
mutable /*size_t*/int outputChannels;
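// Illustration with two inputs shaped [N,64,H,W] and [N,32,H,W]:
//   ELTWISE_CHANNNELS_SAME             - rejected: channel counts differ
//   ELTWISE_CHANNNELS_INPUT_0          - output [N,64,H,W]; for SUM with unit coefficients,
//                                        channels 32..63 carry input 0 unchanged
//   ELTWISE_CHANNNELS_INPUT_0_TRUNCATE - output [N,64,H,W]; a later input with more than
//                                        64 channels would have its extra channels dropped
//   ELTWISE_CHANNNELS_USE_MAX          - output [N,max(64,32),H,W] = [N,64,H,W] (SUM only)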
EltwiseLayerImpl(const LayerParams& params)
: outputChannels(0)
{
setParamsFrom(params);
hasVecInput = false;
op = SUM;
if (params.has("operation"))
{
String operation = toLowerCase(params.get<String>("operation"));
if (operation == "prod")
op = PROD;
else if (operation == "sum")
op = SUM;
else if (operation == "max")
op = MAX;
else if (operation == "min")
op = MIN;
else if (operation == "div")
op = DIV;
else
CV_Error(cv::Error::StsBadArg, "Unknown operation type \"" + operation + "\"");
}
if (params.has("coeff"))
{
DictValue paramCoeff = params.get("coeff");
int i, n = paramCoeff.size();
coeffs.resize(n);
for (i = 0; i < n; i++)
{
coeffs[i] = paramCoeff.get<float>(i);
}
}
channelsModeInput = ELTWISE_CHANNNELS_SAME;
if (params.has("output_channels_mode"))
{
String v = toLowerCase(params.get<String>("output_channels_mode"));
if (v == "same")
{
channelsModeInput = ELTWISE_CHANNNELS_SAME;
}
else if (v == "input_0")
{
channelsModeInput = ELTWISE_CHANNNELS_INPUT_0;
}
else if (v == "input_0_truncate")
{
channelsModeInput = ELTWISE_CHANNNELS_INPUT_0_TRUNCATE;
}
else if (v == "max_input_channels")
{
channelsModeInput = ELTWISE_CHANNNELS_USE_MAX;
if (op != SUM)
CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") 'max' channels mode is limited to SUM operation only");
}
else
CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") unknown channels mode: \"" + v + "\"");
}
channelsMode = channelsModeInput;
// TODO Must have checks for other unknown options
}
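// Minimal configuration sketch (hypothetical values, mirroring the parsing above):
//   LayerParams lp;
//   lp.type = "Eltwise";
//   lp.set("operation", "sum");                // prod | sum | max | min | div
//   lp.set("output_channels_mode", "input_0"); // optional, defaults to "same"
//   Ptr<EltwiseLayer> layer = EltwiseLayer::create(lp);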
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
if (hasVecInput && channelsModeInput == ELTWISE_CHANNNELS_SAME)
return backendId == DNN_BACKEND_OPENCV;
if (backendId == DNN_BACKEND_CUDA)
{
if(channelsModeInput == ELTWISE_CHANNNELS_INPUT_0 || channelsModeInput == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
return op == SUM && coeffs.empty();
return channelsModeInput == ELTWISE_CHANNNELS_SAME;
}
return backendId == DNN_BACKEND_OPENCV ||
(backendId == DNN_BACKEND_HALIDE && op != DIV) || // TODO: not implemented, see PR #15811
((((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (preferableTarget != DNN_TARGET_OPENCL || coeffs.empty()))
|| backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && channelsMode == ELTWISE_CHANNNELS_SAME));
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() >= 2);
CV_Assert(inputs[0].size() >= 2);
CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
CV_Assert(op == SUM || coeffs.size() == 0);
int dims = inputs[0].size();
// Number of channels in output shape is determined by the first input tensor.
bool variableChannels = false;
int numChannels = inputs[0][1];
for (size_t i = 1; i < inputs.size(); i++)
{
CV_Assert(inputs[0][0] == inputs[i][0]); // batch sizes are equal
int input_channels = inputs[i][1];
if (numChannels != input_channels)
variableChannels = true;
if (channelsModeInput == ELTWISE_CHANNNELS_SAME)
{
CV_Assert(numChannels == input_channels);
}
else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0)
{
CV_Assert(numChannels >= input_channels);
}
else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
{
// nothing to check
}
else if (channelsModeInput == ELTWISE_CHANNNELS_USE_MAX)
{
numChannels = std::max(numChannels, input_channels);
}
else
{
CV_Assert(0 && "Internal error");
}
}
channelsMode = variableChannels ? channelsModeInput : ELTWISE_CHANNNELS_SAME;
outputChannels = numChannels;
outputs.assign(1, inputs[0]);
outputs[0][1] = numChannels;
if (dims > 2)
{
size_t vecIdx = 0;
bool isVecFound = false;
for (size_t i = 0; i < inputs.size(); i++)
{
bool allOnes = isAllOnes(inputs[i], 2, dims);
if (!allOnes && !isVecFound)
{
vecIdx = i;
isVecFound = true;
}
if (!allOnes && i != vecIdx)
{
for (size_t j = 2; j < dims; j++)
{
CV_Assert(inputs[vecIdx][j] == inputs[i][j]);
}
}
}
if (channelsModeInput == ELTWISE_CHANNNELS_SAME && isVecFound)
{
for (size_t j = 2; j < dims; j++)
{
outputs[0][j] = inputs[vecIdx][j];
}
}
}
return false;
}
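// Shape example: inputs [N,32,H,W] and [N,64,H,W] under "max_input_channels"
// yield an output of [N,64,H,W]; under the default "same" mode the same pair
// fails the CV_Assert above.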
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
{
std::vector<Mat> inputs;
inputs_arr.getMatVector(inputs);
for (size_t i = 0; i < inputs.size(); i++)
{
MatShape inpShape = shape(inputs[i].size);
if (isAllOnes(inpShape, 2, inputs[i].dims))
{
hasVecInput = true;
return;
}
}
}
class EltwiseInvoker : public ParallelLoopBody
{
EltwiseLayerImpl& self;
std::vector<const Mat*> srcs;
std::vector<int> srcNumChannels;
int nsrcs;
Mat* dst;
std::vector<float> coeffs;
int nstripes;
const ActivationLayer* activ;
int channels;
size_t planeSize;
EltwiseInvoker(EltwiseLayerImpl& self_)
: self(self_)
, nsrcs(0), dst(0), nstripes(0), activ(0), channels(0)
, planeSize(0)
{}
public:
static void run(EltwiseLayerImpl& self,
const Mat* srcs, int nsrcs, Mat& dst,
int nstripes)
{
const EltwiseOp op = self.op;
CV_Check(dst.dims, 1 < dst.dims && dst.dims <= 5, "");
CV_CheckTypeEQ(dst.type(), CV_32FC1, "");
CV_Assert(dst.isContinuous());
CV_Assert(self.coeffs.empty() || self.coeffs.size() == (size_t)nsrcs);
CV_CheckGE(nsrcs, 2, "");
CV_Assert(self.outputChannels == dst.size[1]);
EltwiseInvoker p(self);
p.srcs.resize(nsrcs);
p.srcNumChannels.resize(nsrcs);
p.coeffs = self.coeffs; // can be sorted
bool sortInputs = false;
for( int i = 0; i < nsrcs; i++ )
{
p.srcs[i] = &srcs[i];
CV_CheckEQ(srcs[i].dims, dst.dims, "");
CV_Assert(srcs[i].isContinuous());
CV_Assert(srcs[i].type() == dst.type());
p.srcNumChannels[i] = (srcs[i].dims >= 4) ? srcs[i].size[1] : 1;
if (self.channelsMode == ELTWISE_CHANNNELS_SAME)
{
CV_Assert(srcs[i].size == dst.size);
}
else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0)
{
if (i == 0)
CV_Assert(srcs[0].size == dst.size);
CV_Assert(self.outputChannels >= p.srcNumChannels[i]);
sortInputs = true;
}
else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
{
if (i == 0)
CV_Assert(srcs[0].size == dst.size);
sortInputs = true;
}
else if (self.channelsMode == ELTWISE_CHANNNELS_USE_MAX)
{
CV_Assert(op == SUM);
CV_Assert(self.outputChannels >= p.srcNumChannels[i]);
sortInputs = true;
}
else
{
CV_Assert(0 && "Internal error");
}
if (sortInputs)
{
// Sort srcs and coefficients in descending order by number of channels
for (int j = i; j >= 1; j--)
{
if (std::min(self.outputChannels, p.srcs[j - 1]->size[1]) < std::min(self.outputChannels, p.srcs[j]->size[1]))
{
std::swap(p.srcs[j - 1], p.srcs[j]);
std::swap(p.srcNumChannels[j - 1], p.srcNumChannels[j]);
if (!p.coeffs.empty())
std::swap(p.coeffs[j - 1], p.coeffs[j]);
}
else
break;
}
}
}
p.nsrcs = nsrcs;
p.dst = &dst;
p.nstripes = nstripes;
p.channels = (dst.dims >= 4 ? dst.size[1] : 1);
p.planeSize = dst.total(dst.dims >= 4 ? 2 : 1);
CV_CheckEQ(dst.total(), dst.size[0] * p.channels * p.planeSize, "");
bool simpleCoeffs = true;
if (op == SUM && !p.coeffs.empty())
{
CV_CheckEQ(p.coeffs.size(), (size_t)nsrcs, "");
for (size_t i = 0; i < p.coeffs.size(); i++)
{
if (p.coeffs[i] != 1)
{
simpleCoeffs = false;
break;
}
}
}
if (simpleCoeffs)
p.coeffs.clear();
p.activ = self.activ.get();
parallel_for_(Range(0, nstripes), p, nstripes);
}
void operator()(const Range& r) const CV_OVERRIDE
{
const EltwiseOp op = self.op;
size_t total = dst->size[0]*planeSize;
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = std::min(r.end*stripeSize, total);
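// e.g. total = 10, nstripes = 4 -> stripeSize = 3, so the stripes
// cover [0,3), [3,6), [6,9) and [9,10).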
const float* coeffsptr = !coeffs.empty() ? &coeffs[0] : 0;
float* dstptr0 = dst->ptr<float>();
int blockSize0 = 1 << 12;
for (size_t ofs = stripeStart; ofs < stripeEnd; )
{
int sampleIdx = (int)(ofs / planeSize);
int delta = (int)(ofs - sampleIdx * planeSize);
int blockSize = std::min(blockSize0, std::min((int)(stripeEnd - ofs), (int)planeSize - delta));
if( blockSize <= 0 )
break;
ofs += blockSize;
for (int c = 0; c < channels; c++)
{
size_t dstIdx = delta + (sampleIdx*channels + c)*planeSize;
float* dstptr = dstptr0 + dstIdx;
// process first two inputs
{
const float* srcptr0 = srcs[0]->ptr<float>() + dstIdx;
const int inputIdx = 1;
int src1_channels = srcNumChannels[inputIdx];
if (c >= src1_channels)
{
// no data from second input
if (!coeffsptr || coeffsptr[0] == 1.0f)
{
for (int j = 0; j < blockSize; j++)
{
dstptr[j] = srcptr0[j];
}
}
else
{
float c0 = coeffsptr[0];
for (int j = 0; j < blockSize; j++)
{
dstptr[j] = c0*srcptr0[j];
}
}
}
else
{
size_t srcIdx = delta + (sampleIdx * src1_channels + c) * planeSize;
const float* srcptrI = srcs[inputIdx]->ptr<float>() + srcIdx;
if (op == PROD)
{
for (int j = 0; j < blockSize; j++)
{
dstptr[j] = srcptr0[j] * srcptrI[j];
}
}
else if (op == DIV)
{
for (int j = 0; j < blockSize; j++)
{
dstptr[j] = srcptr0[j] / srcptrI[j];
}
}
else if (op == MAX)
{
for (int j = 0; j < blockSize; j++)
{
dstptr[j] = std::max(srcptr0[j], srcptrI[j]);
}
}
else if (op == MIN)
{
for (int j = 0; j < blockSize; j++)
{
dstptr[j] = std::min(srcptr0[j], srcptrI[j]);
}
}
else if (op == SUM)
{
if (!coeffsptr || (coeffsptr[0] == 1.0f && coeffsptr[1] == 1.0f))
{
for (int j = 0; j < blockSize; j++)
{
dstptr[j] = srcptr0[j] + srcptrI[j];
}
}
else
{
float c0 = coeffsptr[0];
float c1 = coeffsptr[1];
for (int j = 0; j < blockSize; j++)
{
dstptr[j] = c0*srcptr0[j] + c1*srcptrI[j];
}
}
}
else
CV_Error(Error::StsInternal, "");
}
}
// aggregate other inputs (3+)
for (size_t inputIdx = 2; inputIdx < nsrcs; inputIdx++)
{
int srcI_channels = srcNumChannels[inputIdx];
if (c >= srcI_channels)
continue; // no data from this input
size_t srcIdx = delta + (sampleIdx * srcI_channels + c) * planeSize;
const float* srcptrI = srcs[inputIdx]->ptr<float>() + srcIdx;
if (op == PROD)
{
for (int j = 0; j < blockSize; j++)
{
dstptr[j] *= srcptrI[j];
}
}
else if (op == DIV)
{
for (int j = 0; j < blockSize; j++)
{
dstptr[j] /= srcptrI[j];
}
}
else if (op == MAX)
{
for (int j = 0; j < blockSize; j++)
{
dstptr[j] = std::max(dstptr[j], srcptrI[j]);
}
}
else if (op == MIN)
{
for (int j = 0; j < blockSize; j++)
{
dstptr[j] = std::min(dstptr[j], srcptrI[j]);
}
}
else if (op == SUM)
{
if (!coeffsptr || coeffsptr[inputIdx] == 1.0f)
{
for (int j = 0; j < blockSize; j++)
{
dstptr[j] += srcptrI[j];
}
}
else
{
float cI = coeffsptr[inputIdx];
for (int j = 0; j < blockSize; j++)
{
dstptr[j] += cI * srcptrI[j];
}
}
}
else
CV_Error(Error::StsInternal, "");
}
}
if( activ )
{
float* ptr = dstptr0 + delta + sampleIdx*channels*planeSize;
activ->forwardSlice(ptr, ptr, blockSize, planeSize, 0, channels);
}
}
}
};
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
if ((inputs_.depth() == CV_16S && op != SUM) || (channelsMode != ELTWISE_CHANNNELS_SAME))
return false;
if (hasVecInput)
return false; // TODO not implemented yet: https://github.com/opencv/opencv/pull/19477
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
switch (op)
{
case SUM:
{
int channels = total(shape(outputs[0]), 0, 2);
int plane_size = total(shape(outputs[0]), 2);
if (channels % 4 == 0 && plane_size % 4 == 0)
{
size_t localsize[] = { 128 };
size_t globalsize[] = { (size_t)channels / 4 * localsize[0] };
String opts;
if (inputs_.depth() == CV_16S)
opts = " -DDtype=half -DDtype4=half4 -DDtype8=half8";
else
opts = " -DDtype=float -DDtype4=float4 -DDtype8=float8";
for (int i = 0; i < (inputs.size() - 1); ++i)
{
String buildopt = format("-DLOOP=%d", i) + opts;
ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt);
int idx = 0;
UMat inpMat = (i == 0) ? inputs[0] : UMat();
float coeff1 = (coeffs.empty() || i > 0) ? 1.0f : coeffs[i];
float coeff2 = coeffs.empty() ? 1.0f : coeffs[i + 1];
kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[0]));
kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[1]));
kernel.set(idx++, (int)plane_size);
kernel.set(idx++, (float)coeff1);
kernel.set(idx++, (float)coeff2);
kernel.set(idx++, ocl::KernelArg::PtrReadWrite(outputs[0]));
bool ret = kernel.run(1, globalsize, localsize, false);
if (!ret)
return false;
}
}
else
{
if (inputs_.depth() == CV_16S)
return false;
float coeff1 = coeffs.empty() ? 1.f : coeffs[0];
float coeff2 = coeffs.empty() ? 1.f : coeffs[1];
UMat mul0, mul1;
multiply(coeff1, inputs[0], mul0);
multiply(coeff2, inputs[1], mul1);
add(mul0, mul1, outputs[0]);
for (int i = 2; i < inputs.size(); ++i)
{
float coeff = coeffs.empty() ? 1.f : coeffs[i];
multiply(coeff, inputs[i], mul0);
add(mul0, outputs[0], outputs[0]);
}
}
}
break;
case PROD:
multiply(inputs[0], inputs[1], outputs[0]);
for (int i = 2; i < inputs.size(); ++i)
multiply(inputs[i], outputs[0], outputs[0]);
break;
case DIV:
divide(inputs[0], inputs[1], outputs[0]);
for (int i = 2; i < inputs.size(); ++i)
divide(outputs[0], inputs[i], outputs[0]);
break;
case MAX:
max(inputs[0], inputs[1], outputs[0]);
for (int i = 2; i < inputs.size(); ++i)
max(inputs[i], outputs[0], outputs[0]);
break;
case MIN:
min(inputs[0], inputs[1], outputs[0]);
for (int i = 2; i < inputs.size(); ++i)
min(inputs[i], outputs[0], outputs[0]);
break;
default:
return false;
}
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
CV_Assert(outputs.size() == 1);
const int nstripes = getNumThreads();
if (channelsModeInput == ELTWISE_CHANNNELS_SAME && inputs[0].dims > 2)
{
for (size_t i = 0; i < inputs.size(); i++)
{
MatShape inpShape = shape(inputs[i].size);
bool allOnes = isAllOnes(inpShape, 2, inputs[i].dims);
if (allOnes)
{
Mat tmpInput = inputs[i];
MatShape outShape = shape(outputs[0].size);
size_t xSize = outShape[2];
for (size_t j = 3; j < outShape.size(); j++)
xSize *= outShape[j];
int dimVec[3] = {outShape[0], outShape[1], (int) xSize};
std::vector<int> matSizesVec(&dimVec[0], &dimVec[0] + 3);
inputs[i] = Mat(matSizesVec, tmpInput.type());
std::vector<int> idx(outShape.size(), 0);
std::vector<int> outIdx(inpShape.size(), 0);
for (size_t j = 0; j < outShape[0]; j++)
{
outIdx[0] = idx[0] = j;
for(size_t k = 0; k < outShape[1]; k++)
{
outIdx[1] = idx[1] = k;
for (size_t x = 0; x < xSize; x++)
{
outIdx[2] = x;
inputs[i].at<float>(outIdx.data()) = tmpInput.at<float>(idx.data());
}
}
}
inputs[i] = inputs[i].reshape(0, outShape);
}
}
}
EltwiseInvoker::run(*this,
&inputs[0], (int)inputs.size(), outputs[0],
nstripes);
}
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
CV_Assert(channelsModeInput == ELTWISE_CHANNNELS_INPUT_0 ||
channelsModeInput == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE ||
channelsModeInput == ELTWISE_CHANNNELS_SAME);
if(channelsModeInput == ELTWISE_CHANNNELS_INPUT_0 || channelsModeInput == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
{
auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
for (int i = 1; i < inputs.size(); i++)
{
auto from_wrapper = inputs[i].dynamicCast<CUDABackendWrapper>();
if (input_wrapper->getShape()[1] != from_wrapper->getShape()[1])
{
CV_Assert(op == SUM);
CV_Assert(coeffs.empty());
return make_cuda_node<cuda4dnn::ShortcutOp>(preferableTarget, std::move(context->stream));
}
}
}
auto op_ = [this] {
switch (op) {
case MAX: return cuda4dnn::EltwiseOpType::MAX;
case MIN: return cuda4dnn::EltwiseOpType::MIN;
case SUM: return cuda4dnn::EltwiseOpType::SUM;
case PROD: return cuda4dnn::EltwiseOpType::PRODUCT;
case DIV: return cuda4dnn::EltwiseOpType::DIV;
}
return cuda4dnn::EltwiseOpType::SUM;
}();
return make_cuda_node<cuda4dnn::EltwiseOp>(preferableTarget, std::move(context->stream), op_, coeffs);
}
#endif
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
{
#ifdef HAVE_HALIDE
Halide::Var x("x"), y("y"), c("c"), n("n");
Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
Halide::Expr topExpr;
std::vector<Halide::Buffer<> > inputBuffers = halideBuffers(input);
switch (op)
{
case SUM:
if (coeffs.empty())
{
topExpr = inputBuffers[0](x, y, c, n) +
inputBuffers[1](x, y, c, n);
for (int i = 2; i < inputBuffers.size(); ++i)
topExpr += inputBuffers[i](x, y, c, n);
}
else
{
topExpr = coeffs[0] * inputBuffers[0](x, y, c, n) +
coeffs[1] * inputBuffers[1](x, y, c, n);
for (int i = 2; i < inputBuffers.size(); ++i)
topExpr += coeffs[i] * inputBuffers[i](x, y, c, n);
}
break;
case PROD:
topExpr = inputBuffers[0](x, y, c, n) *
inputBuffers[1](x, y, c, n);
for (int i = 2; i < inputBuffers.size(); ++i)
topExpr *= inputBuffers[i](x, y, c, n);
break;
case DIV:
topExpr = inputBuffers[0](x, y, c, n) /
inputBuffers[1](x, y, c, n);
for (int i = 2; i < inputBuffers.size(); ++i)
topExpr /= inputBuffers[i](x, y, c, n);
break;
case MAX:
topExpr = max(inputBuffers[0](x, y, c, n),
inputBuffers[1](x, y, c, n));
for (int i = 2; i < inputBuffers.size(); ++i)
topExpr = max(topExpr, inputBuffers[i](x, y, c, n));
break;
case MIN:
topExpr = min(inputBuffers[0](x, y, c, n),
inputBuffers[1](x, y, c, n));
for (int i = 2; i < inputBuffers.size(); ++i)
topExpr = min(topExpr, inputBuffers[i](x, y, c, n));
break;
default:
return Ptr<BackendNode>();
}
top(x, y, c, n) = topExpr;
return Ptr<BackendNode>(new HalideBackendNode(top));
#endif // HAVE_HALIDE
return Ptr<BackendNode>();
}
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
{
InferenceEngine::Builder::EltwiseLayer ieLayer(name);
ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(inputs.size()));
if (op == SUM)
ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::SUM);
else if (op == PROD)
ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MUL);
else if (op == DIV)
ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::DIV);
else if (op == MAX)
ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MAX);
else if (op == MIN)
ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MIN);
else
CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
InferenceEngine::Builder::Layer l = ieLayer;
if (!coeffs.empty())
l.getParameters()["coeff"] = coeffs;
return Ptr<BackendNode>(new InfEngineBackendNode(l));
}
#endif // HAVE_DNN_IE_NN_BUILDER_2019
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto curr_node = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
if (!coeffs.empty()) {
auto coeff = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &coeffs[0]);
curr_node = std::make_shared<ngraph::op::v1::Multiply>(curr_node, coeff, ngraph::op::AutoBroadcastType::NUMPY);
}
for (size_t i = 1; i < nodes.size(); i++)
{
auto next_node = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
if (!coeffs.empty()) {
auto coeff = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &coeffs[i]);
next_node = std::make_shared<ngraph::op::v1::Multiply>(next_node, coeff, ngraph::op::AutoBroadcastType::NUMPY);
}
switch (op) {
case SUM: curr_node = std::make_shared<ngraph::op::v1::Add>(curr_node, next_node); break;
case PROD: curr_node = std::make_shared<ngraph::op::v1::Multiply>(curr_node, next_node); break;
case DIV: curr_node = std::make_shared<ngraph::op::v1::Divide>(curr_node, next_node); break;
case MAX: curr_node = std::make_shared<ngraph::op::v1::Maximum>(curr_node, next_node); break;
case MIN: curr_node = std::make_shared<ngraph::op::v1::Minimum>(curr_node, next_node); break;
default: CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
}
}
return Ptr<BackendNode>(new InfEngineNgraphNode(curr_node));
}
#endif // HAVE_DNN_NGRAPH
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
if (op == SUM)
{
std::vector<float> newCoeffs;
float offset = zeropoints[1][0];
float out_sc = scales[1][0];
for (int i = 0; i < scales[0].size(); i++)
{
float coeff = coeffs.empty() ? 1.f : coeffs[i];
float newcoeff = (scales[0][i] * coeff) / out_sc;
newCoeffs.push_back(newcoeff);
offset -= (newcoeff * zeropoints[0][i]);
}
params.set("coeff", DictValue::arrayReal(newCoeffs.data(), newCoeffs.size()));
params.set("offset", offset);
return true;
}
else if (op == PROD)
{
std::vector<float> newCoeffs = scales[0];
newCoeffs[0] /= scales[1][0];
params.set("coeff", DictValue::arrayReal(newCoeffs.data(), newCoeffs.size()));
params.set("offset", zeropoints[1][0]);
params.set("input_zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size()));
return true;
}
return op == MAX;
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
CV_UNUSED(outputs); // suppress unused variable warning
CV_Assert(inputs.size());
// FIXIT: handle inputs with different number of channels
long flops = inputs.size() * total(inputs[0]);
return flops;
}
bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
{
if (activ.empty() || layer.empty())
{
activ = layer;
return !activ.empty();
}
else
return false;
}
Ptr<ActivationLayer> activ;
private:
bool hasVecInput;
};
Ptr<EltwiseLayer> EltwiseLayer::create(const LayerParams& params)
{
return Ptr<EltwiseLayer>(new EltwiseLayerImpl(params));
}
}
}

View File

@ -0,0 +1,246 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include <float.h>
#include <algorithm>
#include <opencv2/dnn/shape_utils.hpp>
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/reshape.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class FlattenLayerImpl CV_FINAL : public FlattenLayer
{
public:
FlattenLayerImpl(const LayerParams &params)
{
_startAxis = params.get<int>("axis", 1);
_endAxis = params.get<int>("end_axis", -1);
setParamsFrom(params);
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_CUDA ||
((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && haveInfEngine());
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() > 0);
for (size_t i = 1; i < inputs.size(); i++)
{
CV_Assert(inputs[i] == inputs[0]);
}
int numAxes = inputs[0].size();
int startAxis = normalize_axis(_startAxis, numAxes);
int endAxis = normalize_axis(_endAxis, numAxes);
CV_Assert(startAxis >= 0);
CV_Assert(endAxis >= startAxis && endAxis < (int)numAxes);
size_t flattenedDimensionSize = total(inputs[0], startAxis, endAxis + 1);
MatShape outputShapeVec;
for (int i = 0; i < startAxis; i++)
{
outputShapeVec.push_back(inputs[0][i]);
}
outputShapeVec.push_back(flattenedDimensionSize);
for (size_t i = endAxis + 1; i < numAxes; i++)
{
outputShapeVec.push_back(inputs[0][i]);
}
CV_Assert(outputShapeVec.size() <= 4);
outputs.resize(inputs.size(), outputShapeVec);
return true;
}
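// Shape example: input [2,3,4,5] with axis = 1 and end_axis = -1 flattens
// to [2,60]; with axis = 1 and end_axis = 2 it becomes [2,12,5].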
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
{
std::vector<Mat> inputs;
inputs_arr.getMatVector(inputs);
int numAxes = inputs[0].dims;
_startAxis = normalize_axis(_startAxis, numAxes);
_endAxis = normalize_axis(_endAxis, numAxes);
}
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
{
std::vector<UMat> inpvec;
std::vector<UMat> outputs;
inputs_arr.getUMatVector(inpvec);
outputs_arr.getUMatVector(outputs);
std::vector<UMat*> inputs(inpvec.size());
for (int i = 0; i < inpvec.size(); i++)
inputs[i] = &inpvec[i];
for (size_t i = 0; i < inputs.size(); i++)
{
MatShape outShape = shape(outputs[i]);
UMat& output = outputs_arr.getUMatRef(i);
output = inputs[i]->reshape(1, (int)outShape.size(), &outShape[0]);
}
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
outputs_arr.isUMatVector(),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
for (size_t i = 0; i < inputs.size(); i++)
{
MatShape outShape = shape(outputs[i]);
if (inputs[i].data != outputs[i].data)
{
inputs[i].reshape(1, (int)outShape.size(), &outShape[0]).copyTo(outputs[i]);
}
}
}
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
{
InferenceEngine::Builder::Layer ieLayer(name);
ieLayer.setName(name);
ieLayer.setType("Flatten");
ieLayer.getParameters()["axis"] = (size_t)_startAxis;
ieLayer.getParameters()["end_axis"] = _endAxis; // Do not cast to size_t because it might be negative.
ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(1));
ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
}
#endif // HAVE_DNN_IE_NN_BUILDER_2019
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
std::vector<size_t> dims = ieInpNode->get_shape();
int numAxes = dims.size();
int startAxis = normalize_axis(_startAxis, numAxes);
int endAxis = normalize_axis(_endAxis, numAxes);
CV_Assert(startAxis >= 0);
CV_Assert(endAxis >= startAxis && endAxis < numAxes);
int64_t flattenedDimensionSize = std::accumulate(dims.begin() + startAxis,
dims.begin() + endAxis + 1, 1, std::multiplies<size_t>());
std::vector<int64_t> outputShapeVec(dims.begin(), dims.begin() + startAxis);
outputShapeVec.push_back(flattenedDimensionSize);
outputShapeVec.insert(outputShapeVec.end(), dims.begin() + endAxis + 1, dims.end());
auto shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
ngraph::Shape({outputShapeVec.size()}), outputShapeVec.data());
auto reshape = std::make_shared<ngraph::op::v1::Reshape>(ieInpNode, shape, true);
return Ptr<BackendNode>(new InfEngineNgraphNode(reshape));
}
#endif // HAVE_DNN_NGRAPH
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
return make_cuda_node<cuda4dnn::ReshapeOp>(preferableTarget, std::move(context->stream));
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
int _startAxis;
int _endAxis;
};
Ptr<FlattenLayer> FlattenLayer::create(const LayerParams& params)
{
return Ptr<FlattenLayer>(new FlattenLayerImpl(params));
}
}
}

View File

@ -0,0 +1,117 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2020, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
#include "layers_common.hpp"
namespace cv { namespace dnn {
class FlowWarpLayerImpl CV_FINAL : public FlowWarpLayer
{
public:
FlowWarpLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
String fill_string = toLowerCase(params.get<String>("FillParameter", "ZERO"));
if (fill_string != "zero")
CV_Error(Error::StsNotImplemented, "Only zero filling supported.");
fill_value = 0;
}
virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() == 2);
CV_Assert_N(inputs[0][0] == inputs[1][0], inputs[1][1] == 2,
inputs[0][2] == inputs[1][2], inputs[0][3] == inputs[1][3]);
outputs.assign(1, inputs[0]);
return false;
}
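// The second input is a dense flow field shaped [N,2,H,W]: for every output
// pixel (x, y) the layer samples the first input at (x + flow_x, y + flow_y)
// with bilinear interpolation, e.g. a flow of (0.5, 0) reads halfway between
// a pixel and its right-hand neighbour.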
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
const int out_n = outputs[0].size[0];
const int out_c = outputs[0].size[1];
const int out_h = outputs[0].size[2];
const int out_w = outputs[0].size[3];
const int area = out_w * out_h;
const int total = area * out_c;
const float* image_data = inputs[0].ptr<float>();
const float* flow_data = inputs[1].ptr<float>();
float* out_data = outputs[0].ptr<float>();
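// NCHW addressing: element (n, c, y, x) lives at flat offset
// n * out_c * area + c * area + y * out_w + x.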
for (int n = 0; n < out_n; n++)
{
int off = total * n;
for (int x = 0; x < out_w; x++)
{
for (int y = 0; y < out_h; y++)
{
int idx = 2 * area * n + y * out_w + x;
float fx = flow_data[idx];
float fy = flow_data[idx + area];
float x2 = x + fx;
float y2 = y + fy;
if (x2 >= 0 && y2 >= 0 && x2 < out_w && y2 < out_h)
{
int ix2_L = x2;
float alpha = x2 - ix2_L;
int iy2_T = y2;
float beta = y2 - iy2_T;
int ix2_R = std::min(ix2_L + 1, out_w - 1);
int iy2_B = std::min(iy2_T + 1, out_h - 1);
for (int c = 0; c < out_c; c++)
{
float TL = image_data[off + c * area + iy2_T * out_w + ix2_L];
float TR = image_data[off + c * area + iy2_T * out_w + ix2_R];
float BL = image_data[off + c * area + iy2_B * out_w + ix2_L];
float BR = image_data[off + c * area + iy2_B * out_w + ix2_R];
out_data[off + c * area + y * out_w + x] = (1 - alpha) * (1 - beta) * TL +
(1 - alpha) * beta * BL +
alpha * (1 - beta) * TR +
alpha * beta * BR;
}
}
else
{
for (int c = 0; c < out_c; c++)
out_data[off + c * area + y * out_w + x] = fill_value;
}
}
}
}
}
private:
float fill_value;
};
Ptr<FlowWarpLayer> FlowWarpLayer::create(const LayerParams& params)
{
return Ptr<FlowWarpLayer>(new FlowWarpLayerImpl(params));
}
}} // namespace cv::dnn

View File

@ -0,0 +1,687 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
using namespace cv::dnn::ocl4dnn;
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/matmul.hpp"
#include "../cuda4dnn/primitives/inner_product.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
{
public:
enum { VEC_ALIGN = 8 };
#ifdef HAVE_OPENCL
Ptr<OCL4DNNInnerProduct<float> > innerProductOp;
std::vector<UMat> umat_blobs;
std::vector<UMat> half_blobs;
#endif
FullyConnectedLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
bias = params.get<bool>("bias_term", true);
axis = params.get<int>("axis", 1);
if (!blobs.empty())
{
CV_Assert(1 <= blobs.size() && blobs.size() <= 2);
int numOutput = params.get<int>("num_output");
int innerSize = (int)blobs[0].total() / numOutput;
CV_Assert(blobs[0].dims >= 2 && (size_t)(innerSize * numOutput) == blobs[0].total());
CV_Assert(!bias || (blobs.size() == 2 && (size_t)numOutput == blobs[1].total()));
weightsMat = blobs[0] = blobs[0].reshape(1, numOutput);
int vecsize = weightsMat.cols;
if (vecsize % VEC_ALIGN != 0)
{
int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
Mat weightsBuf(weightsMat.rows, vecsize_aligned, weightsMat.type());
Mat wpadding = weightsBuf.colRange(vecsize, vecsize_aligned);
wpadding.setTo(Scalar::all(0.));
weightsMat = weightsBuf.colRange(0, vecsize);
blobs[0].copyTo(weightsMat);
}
if (bias)
biasMat = blobs[1] = blobs[1].reshape(1, 1);
else
biasMat = Mat::zeros(1, numOutput, weightsMat.type());
}
}
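// Padding example: with VEC_ALIGN = 8 and vecsize = 10, each weight row is
// backed by a 16-column buffer whose last 6 columns are zero, so the SIMD
// kernels below may safely load full registers past the logical row end.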
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &) const CV_OVERRIDE
{
int numOutput, cAxis;
if (blobs.empty())
{
CV_CheckEQ(inputs.size(), (size_t)2, "");
numOutput = inputs[1].back();
cAxis = inputs[0].size() - 1;
int dims = inputs[0].size();
CV_CheckEQ(inputs[1].size(), (size_t)dims, "");
CV_CheckGE(dims, 2, "");
for (int i = 0; i < dims - 2; i++)
CV_CheckEQ(inputs[0][i], inputs[1][i], "");
CV_CheckEQ(inputs[0].back(), inputs[1][dims - 2], "");
}
else
{
CV_CheckEQ(inputs.size(), (size_t)1, "");
CV_CheckEQ(blobs[0].dims, 2, "");
numOutput = blobs[0].size[0];
CV_Assert(!bias || (size_t)numOutput == blobs[1].total());
cAxis = normalize_axis(axis, inputs[0]);
}
MatShape outShape(cAxis + 1);
for (int i = 0; i < cAxis; ++i)
outShape[i] = inputs[0][i];
outShape.back() = numOutput;
outputs.resize(1, outShape);
return false;
}
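// Shape examples: with constant weights [numOutput, innerSize] and axis = 1,
// an input [N, innerSize] maps to [N, numOutput]; in the two-input (MatMul)
// branch, inputs [B, M, N] and [B, N, K] map to [B, M, K].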
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1) ||
(((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && !blobs.empty()) ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && axis == 1);
}
virtual bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
{
if (activ.empty() || layer.empty())
{
activ = layer;
return !activ.empty();
}
else
return false;
}
class FullyConnected : public ParallelLoopBody
{
public:
FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false) {}
static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat,
Mat& dstMat, const ActivationLayer* activ, int nstripes)
{
CV_Assert( srcMat.dims == 2 && srcMat.cols == weights.cols &&
dstMat.rows == srcMat.rows && dstMat.cols == weights.rows &&
srcMat.type() == weights.type() && weights.type() == dstMat.type() &&
srcMat.type() == CV_32F &&
(biasMat.empty() || (biasMat.type() == srcMat.type() &&
biasMat.isContinuous() && (int)biasMat.total() == dstMat.cols)) );
FullyConnected p;
p.srcMat = &srcMat;
p.weights = &weights;
p.biasMat = &biasMat;
p.dstMat = &dstMat;
p.nstripes = nstripes;
p.activ = activ;
p.useAVX = checkHardwareSupport(CPU_AVX);
p.useAVX2 = checkHardwareSupport(CPU_AVX2);
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
p.useRVV = checkHardwareSupport(CPU_RVV);
parallel_for_(Range(0, nstripes), p, nstripes);
}
void operator()(const Range& r) const CV_OVERRIDE
{
int valign = FullyConnectedLayerImpl::VEC_ALIGN;
int nsamples = srcMat->rows;
int nw0 = weights->rows;
int k, vecsize = srcMat->cols;
int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
size_t total = (size_t)nsamples*nw0;
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total);
size_t wstep = weights->step1();
AutoBuffer<float> srcbuf(vecsize_aligned + valign);
float* sptr = alignPtr(srcbuf.data(), (int)(valign*sizeof(float)));
for( k = vecsize; k < vecsize_aligned; k++ )
sptr[k] = 0.f;
for( size_t ofs = stripeStart; ofs < stripeEnd; )
{
int sampleIdx = (int)(ofs / nw0);
int delta = (int)(ofs - (size_t)sampleIdx*nw0);
const float* sptr_ = srcMat->ptr<float>(sampleIdx);
const float* wptr = weights->ptr<float>(delta);
float* dptr = dstMat->ptr<float>(sampleIdx) + delta;
const float* biasptr = biasMat->ptr<float>() + delta;
int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs));
memcpy(sptr, sptr_, vecsize*sizeof(sptr[0]));
#if CV_TRY_AVX512_SKX
if( useAVX512 )
opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
else
#endif
#if CV_TRY_AVX2
if( useAVX2 )
opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
else
#endif
#if CV_TRY_AVX
if( useAVX )
opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
else
#endif
#if CV_TRY_RVV
if( useRVV )
opt_RVV::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
else
#endif
{
int i = 0;
#if CV_SIMD128
for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
{
v_float32x4 vs0 = v_setall_f32(0.f);
v_float32x4 vs1 = v_setall_f32(0.f);
v_float32x4 vs2 = v_setall_f32(0.f);
v_float32x4 vs3 = v_setall_f32(0.f);
for( k = 0; k < vecsize; k += 4 )
{
v_float32x4 v = v_load_aligned(sptr + k);
vs0 = v_fma(v, v_load_aligned(wptr + k), vs0);
vs1 = v_fma(v, v_load_aligned(wptr + wstep + k), vs1);
vs2 = v_fma(v, v_load_aligned(wptr + wstep*2 + k), vs2);
vs3 = v_fma(v, v_load_aligned(wptr + wstep*3 + k), vs3);
}
v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);
s += v_load(biasptr + i);
v_store(dptr + i, s);
}
#endif
for( ; i < nw; i++, wptr += wstep )
{
float s0=biasptr[i];
for( k = 0; k < vecsize; k++ )
{
float v = sptr[k];
s0 += v*wptr[k];
}
dptr[i] = s0;
}
}
if(activ)
activ->forwardSlice(dptr, dptr, 1, 1, delta, delta + nw);
ofs += nw;
}
}
const Mat *srcMat, *weights, *biasMat;
const ActivationLayer* activ;
Mat* dstMat;
int nstripes;
bool useAVX;
bool useAVX2;
bool useAVX512;
bool useRVV;
};
#ifdef HAVE_OPENCL
virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE
{
innerProductOp.release();
umat_blobs.clear();
half_blobs.clear();
}
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
if (inputs.size() == 2)
{
int dims = outputs[0].dims;
int m = inputs[0].size[dims - 2];
int n = inputs[0].size[dims - 1];
int k = inputs[1].size[dims - 1];
int rows = inputs[0].total() / (m * n);
MatShape sh_A = shape(rows, m * n);
MatShape sh_B = shape(rows, n * k);
MatShape sh_C = shape(rows, m * k);
UMat inp = inputs[0].reshape(1, sh_A.size(), &sh_A[0]);
UMat weight = inputs[1].reshape(1, sh_B.size(), &sh_B[0]);
UMat out = outputs[0].reshape(1, sh_C.size(), &sh_C[0]);
UMat A, B, C, A_fp32, B_fp32, C_fp32;
for (int i = 0; i < rows; ++i)
{
A = inp.row(i).reshape(1, m);
B = weight.row(i).reshape(1, n);
C = out.row(i).reshape(1, m);
if (use_half)
{
convertFp16(A, A_fp32);
convertFp16(B, B_fp32);
convertFp16(C, C_fp32);
}
else
{
A_fp32 = A;
B_fp32 = B;
C_fp32 = C;
}
cv::gemm(A_fp32, B_fp32, 1, noArray(), 0, C_fp32);
if (use_half)
{
convertFp16(A_fp32, A);
convertFp16(B_fp32, B);
convertFp16(C_fp32, C);
}
}
return true;
}
int axisCan = normalize_axis(axis, inputs[0].dims);
int numOutput = blobs[0].size[0];
int innerSize = blobs[0].size[1];
int outerSize = total(shape(inputs[0]), 0, axisCan);
bool ret = true;
if (innerProductOp.empty())
{
size_t n = blobs.size();
umat_blobs.resize(n);
for (int i = 0; i < n; i++) blobs[i].copyTo(umat_blobs[i]);
OCL4DNNInnerProductConfig config;
config.num_output = numOutput;
config.bias_term = bias;
config.M = outerSize;
config.K = innerSize;
config.use_half = use_half;
if (use_half)
{
half_blobs.resize(umat_blobs.size());
for (int i = 0; i < umat_blobs.size(); i++)
{
if (!umat_blobs[i].empty())
convertFp16(umat_blobs[i], half_blobs[i]);
}
}
innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));
}
for (size_t i = 0; i < inputs.size(); i++)
{
MatShape inshape, outshape;
inshape = shape(outerSize, innerSize);
outshape = shape(outerSize, numOutput);
UMat srcMat, dstMat;
srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);
dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);
if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0],
(bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(),
dstMat))
{
ret = false;
break;
}
if (!use_half && bias && (outerSize > 1))
{
UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
UMat& biases = umat_blobs[1];
cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
}
}
if (ret) return true;
UMat& weights = umat_blobs[0];
for (size_t i = 0; i < inputs.size(); i++)
{
MatShape inshape, outshape;
inshape = shape(outerSize, innerSize);
outshape = shape(outerSize, numOutput);
UMat srcMat, dstMat, srcMat_fp32, dstMat_fp32;
srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);
dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);
if (use_half)
{
convertFp16(srcMat, srcMat_fp32);
convertFp16(dstMat, dstMat_fp32);
}
else
{
srcMat_fp32 = srcMat;
dstMat_fp32 = dstMat;
}
cv::gemm(srcMat_fp32, weights, 1, noArray(), 0, dstMat_fp32, GEMM_2_T);
if (bias)
{
UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
UMat& biases = umat_blobs[1];
cv::gemm(biasOnesMat, biases, 1, dstMat_fp32, 1, dstMat_fp32, 0);
}
if (use_half)
{
convertFp16(srcMat_fp32, srcMat);
convertFp16(dstMat_fp32, dstMat);
}
}
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> input, output;
inputs_arr.getMatVector(input);
outputs_arr.getMatVector(output);
if (!blobs.empty())
{
int axisCan = normalize_axis(axis, input[0].dims);
int outerSize = input[0].total(0, axisCan);
for (size_t i = 0; i < input.size(); i++)
{
Mat srcMat = input[i].reshape(1, outerSize);
Mat dstMat = output[i].reshape(1, outerSize);
const int nstripes = getNumThreads();
FullyConnected::run(srcMat, weightsMat, biasMat, dstMat, activ.get(), nstripes);
}
}
else
{
float* inpData = input[0].ptr<float>();
float* weightData = input[1].ptr<float>();
float* outData = output[0].ptr<float>();
int dims = output[0].dims;
int numSlice = output[0].total() / output[0].total(dims - 2);
int m = input[0].size[dims - 2];
int n = input[0].size[dims - 1];
int k = input[1].size[dims - 1];
for (int i = 0; i < numSlice; i++)
{
Mat inpSlice(m, n, CV_32F, inpData);
Mat weightSlice(n, k, CV_32F, weightData);
Mat outSlice(m, k, CV_32F, outData);
outSlice = inpSlice * weightSlice;
inpData += inpSlice.total();
weightData += weightSlice.total();
outData += outSlice.total();
}
}
}
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
if (weightsMat.empty())
{
CV_Assert(!bias);
return make_cuda_node<cuda4dnn::MatMulOp>(preferableTarget, std::move(context->stream), std::move(context->cublas_handle));
}
auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
auto flatten_start_axis = normalize_axis(axis, input_wrapper->getRank());
auto biasMat_ = bias ? biasMat : Mat();
return make_cuda_node<cuda4dnn::InnerProductOp>(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), flatten_start_axis, weightsMat, biasMat_);
}
#endif
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
#ifdef HAVE_HALIDE
int inW, inH, inC, inN, outC = blobs[0].size[0];
Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
getCanonicalSize(inputBuffer, &inW, &inH, &inC, &inN);
auto weights = wrapToHalideBuffer(blobs[0], {inW, inH, inC, outC});
Halide::Var x("x"), y("y"), c("c"), n("n");
Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
Halide::RDom r(0, inW, 0, inH, 0, inC);
Halide::Expr topExpr = sum(inputBuffer(r.x, r.y, r.z, n) *
weights(r.x, r.y, r.z, c));
if (bias)
{
Halide::Buffer<float> bias = wrapToHalideBuffer(blobs[1], {outC});
topExpr += bias(c);
}
top(x, y, c, n) = topExpr;
return Ptr<BackendNode>(new HalideBackendNode(top));
#endif // HAVE_HALIDE
return Ptr<BackendNode>();
}
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
InferenceEngine::Builder::FullyConnectedLayer ieLayer(name);
const int outNum = blobs[0].size[0];
ieLayer.setOutputNum(outNum);
InferenceEngine::Builder::Layer l = ieLayer;
addConstantData("weights", wrapToInfEngineBlob(blobs[0], {(size_t)blobs[0].size[0], (size_t)blobs[0].size[1], 1, 1}, InferenceEngine::Layout::OIHW), l);
if (bias)
addConstantData("biases", wrapToInfEngineBlob(blobs[1], {(size_t)outNum}, InferenceEngine::Layout::C), l);
return Ptr<BackendNode>(new InfEngineBackendNode(l));
}
#endif // HAVE_DNN_IE_NN_BUILDER_2019
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
std::shared_ptr<ngraph::Node> matmul;
if (nodes.size() == 2)
{
auto& inp2 = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
matmul = std::make_shared<ngraph::op::MatMul>(ieInpNode, inp2, false, false);
}
else
{
std::vector<int64_t> data = {(int64_t)ieInpNode->get_shape()[0], (int64_t)blobs[0].size[1]};
auto new_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, data.data());
auto inp = std::make_shared<ngraph::op::v1::Reshape>(ieInpNode, new_shape, true);
std::vector<size_t> weight_shape{(size_t)blobs[0].size[0], (size_t)blobs[0].size[1]};
auto ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, weight_shape, blobs[0].data);
matmul = std::make_shared<ngraph::op::MatMul>(inp, ieWeights, false, true);
}
if (bias) {
auto bias_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
ngraph::Shape{(size_t)blobs[1].size[1]}, blobs[1].data);
matmul = std::make_shared<ngraph::op::v1::Add>(matmul, bias_node, ngraph::op::AutoBroadcastType::NUMPY);
}
return Ptr<BackendNode>(new InfEngineNgraphNode(matmul));
}
#endif // HAVE_DNN_NGRAPH
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
if (blobs.empty())
return false;
int numOutput = blobs[0].size[0];
float inputScale = scales[0][0], outputScale = scales[1][0];
int inputZp = zeropoints[0][0];
Mat weightsQuantized(weightsMat.rows, weightsMat.cols, CV_8S);
Mat biasQuantized(1, numOutput, CV_32S);
Mat outputMultiplier(1, numOutput, CV_32F);
double realMin, realMax, weightsScale;
for( int i = 0; i < numOutput; i++ )
{
// Quantize weights
cv::minMaxIdx(weightsMat.row(i), &realMin, &realMax);
realMin = std::min(realMin, 0.0);
realMax = std::max(realMax, 0.0);
weightsScale = (realMax == realMin) ? 1.0 : std::max(-realMin, realMax)/127;
weightsMat.row(i).convertTo(weightsQuantized.row(i), CV_8S, 1.f/weightsScale);
// Quantize biases
float biasScale = inputScale * weightsScale;
biasQuantized.at<int>(i) = (int)std::round(biasMat.at<float>(i)/biasScale) - inputZp*(cv::sum(weightsQuantized.row(i))[0]);
// Store multiplier
outputMultiplier.at<float>(i) = biasScale / outputScale;
}
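// Numeric example: if row i of weightsMat spans [-1.27, 0.5], then
// weightsScale = 1.27 / 127 = 0.01 and the weight 0.5 quantizes to 50;
// biasScale = inputScale * weightsScale, and outputMultiplier stores
// biasScale / outputScale for the int8 output rescaling.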
params.blobs.clear();
params.blobs.push_back(weightsQuantized.reshape(1, shape(blobs[0])));
params.blobs.push_back(biasQuantized);
params.blobs.push_back(outputMultiplier);
return true;
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
CV_UNUSED(inputs); // suppress unused variable warning
long flops = 0;
int innerSize = blobs[0].size[1];
for(int i = 0; i < outputs.size(); i++)
{
flops += CV_BIG_INT(3)*innerSize*total(outputs[i]);
}
return flops;
}
bool bias;
Mat weightsMat, biasMat;
Ptr<ActivationLayer> activ;
};
Ptr<InnerProductLayer> InnerProductLayer::create(const LayerParams& params)
{
return Ptr<InnerProductLayer>(new FullyConnectedLayerImpl(params));
}
}
}

View File

@ -0,0 +1,254 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
namespace cv
{
namespace dnn
{
namespace util
{
std::string makeName(const std::string& str1, const std::string& str2)
{
return str1 + str2;
}
bool getParameter(const LayerParams &params, const std::string& nameBase, const std::string& nameAll,
std::vector<size_t>& parameter, bool hasDefault = false, const std::vector<size_t>& defaultValue = std::vector<size_t>(2, 0))
{
std::string nameH = makeName(nameBase, std::string("_h"));
std::string nameW = makeName(nameBase, std::string("_w"));
std::string nameAll_ = nameAll;
if (nameAll_ == "")
nameAll_ = nameBase;
if (params.has(nameH) && params.has(nameW))
{
CV_Assert(params.get<int>(nameH) >= 0 && params.get<int>(nameW) >= 0);
parameter.push_back(params.get<int>(nameH));
parameter.push_back(params.get<int>(nameW));
return true;
}
else
{
if (params.has(nameAll_))
{
DictValue param = params.get(nameAll_);
for (int i = 0; i < param.size(); i++) {
CV_Assert(param.get<int>(i) >= 0);
parameter.push_back(param.get<int>(i));
}
if (parameter.size() == 1)
parameter.resize(2, parameter[0]);
return true;
}
else
{
if (hasDefault)
{
parameter = defaultValue;
return true;
}
else
{
return false;
}
}
}
}
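// Illustrative note (not in the original source): getParameter() resolves a
// value in this order: the explicit "<name>_h"/"<name>_w" pair, then the
// aggregate key (e.g. "kernel_size"), then the caller-supplied default.
// E.g. {kernel_h=3, kernel_w=5} yields {3, 5}; a scalar kernel_size=3 is
// broadcast to {3, 3}.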
void getKernelSize(const LayerParams &params, std::vector<size_t>& kernel)
{
if (!util::getParameter(params, "kernel", "kernel_size", kernel))
CV_Error(cv::Error::StsBadArg, "kernel_size (or kernel_h and kernel_w) not specified");
for (int i = 0; i < kernel.size(); i++)
CV_Assert(kernel[i] > 0);
}
void getStrideAndPadding(const LayerParams &params, std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end,
std::vector<size_t>& strides, cv::String& padMode, size_t kernel_size = 2)
{
if (params.has("pad_l") && params.has("pad_t") && params.has("pad_r") && params.has("pad_b")) {
CV_Assert(params.get<int>("pad_t") >= 0 && params.get<int>("pad_l") >= 0 &&
params.get<int>("pad_b") >= 0 && params.get<int>("pad_r") >= 0);
pads_begin.push_back(params.get<int>("pad_t"));
pads_begin.push_back(params.get<int>("pad_l"));
pads_end.push_back(params.get<int>("pad_b"));
pads_end.push_back(params.get<int>("pad_r"));
}
else {
util::getParameter(params, "pad", "pad", pads_begin, true, std::vector<size_t>(kernel_size, 0));
if (pads_begin.size() < 4)
pads_end = pads_begin;
else
{
pads_end = std::vector<size_t>(pads_begin.begin() + pads_begin.size() / 2, pads_begin.end());
pads_begin.resize(pads_begin.size() / 2);
}
CV_Assert(pads_begin.size() == pads_end.size());
}
util::getParameter(params, "stride", "stride", strides, true, std::vector<size_t>(kernel_size, 1));
padMode = "";
if (params.has("pad_mode"))
{
padMode = params.get<String>("pad_mode");
}
for (int i = 0; i < strides.size(); i++)
CV_Assert(strides[i] > 0);
}
}
void getPoolingKernelParams(const LayerParams &params, std::vector<size_t>& kernel, std::vector<bool>& globalPooling,
std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end,
std::vector<size_t>& strides, cv::String &padMode)
{
bool is_global = params.get<bool>("global_pooling", false);
globalPooling.resize(3);
globalPooling[0] = params.get<bool>("global_pooling_d", is_global);
globalPooling[1] = params.get<bool>("global_pooling_h", is_global);
globalPooling[2] = params.get<bool>("global_pooling_w", is_global);
if (globalPooling[0] || globalPooling[1] || globalPooling[2])
{
util::getStrideAndPadding(params, pads_begin, pads_end, strides, padMode);
if ((globalPooling[0] && params.has("kernel_d")) ||
(globalPooling[1] && params.has("kernel_h")) ||
(globalPooling[2] && params.has("kernel_w")) ||
params.has("kernel_size")) {
CV_Error(cv::Error::StsBadArg, "In global_pooling mode, kernel_size (or kernel_h and kernel_w) cannot be specified");
}
kernel.resize(3);
kernel[0] = params.get<int>("kernel_d", 1);
kernel[1] = params.get<int>("kernel_h", 1);
kernel[2] = params.get<int>("kernel_w", 1);
for (int i = 0, j = globalPooling.size() - pads_begin.size(); i < pads_begin.size(); i++, j++) {
if ((pads_begin[i] != 0 || pads_end[i] != 0) && globalPooling[j])
CV_Error(cv::Error::StsBadArg, "In global_pooling mode, pads must be = 0");
}
for (int i = 0, j = globalPooling.size() - strides.size(); i < strides.size(); i++, j++) {
if (strides[i] != 1 && globalPooling[j])
CV_Error(cv::Error::StsBadArg, "In global_pooling mode, strides must be = 1");
}
}
else
{
util::getKernelSize(params, kernel);
util::getStrideAndPadding(params, pads_begin, pads_end, strides, padMode, kernel.size());
}
}
void getConvolutionKernelParams(const LayerParams &params, std::vector<size_t>& kernel, std::vector<size_t>& pads_begin,
std::vector<size_t>& pads_end, std::vector<size_t>& strides,
std::vector<size_t>& dilations, cv::String &padMode, std::vector<size_t>& adjust_pads)
{
util::getKernelSize(params, kernel);
util::getStrideAndPadding(params, pads_begin, pads_end, strides, padMode, kernel.size());
util::getParameter(params, "dilation", "dilation", dilations, true, std::vector<size_t>(kernel.size(), 1));
util::getParameter(params, "adj", "adj", adjust_pads, true, std::vector<size_t>(kernel.size(), 0));
for (int i = 0; i < dilations.size(); i++)
CV_Assert(dilations[i] > 0);
}
// From TensorFlow code:
// Total padding on rows and cols is
// Pr = (R' - 1) * S + Kr - R
// Pc = (C' - 1) * S + Kc - C
// where (R', C') are output dimensions, (R, C) are input dimensions, S
// is stride, (Kr, Kc) are filter dimensions.
// We pad Pr/2 on the top and Pr - Pr/2 on the bottom, Pc/2 on the left
// and Pc - Pc/2 on the right. When Pr or Pc is odd, this means
// we pad more on the bottom and right than on the top and left.
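// Worked example (illustrative): R = 5, S = 2, Kr = 3 gives R' = ceil(5/2) = 3,
// so Pr = (3 - 1)*2 + 3 - 5 = 2: pad one row on top and one on the bottom.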
void getConvPoolOutParams(const std::vector<int>& inp, const std::vector<size_t>& kernel,
const std::vector<size_t>& stride, const String &padMode,
const std::vector<size_t>& dilation, std::vector<int>& out)
{
if (padMode == "VALID")
{
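// out = floor((in - dilation*(k - 1) - 1) / stride) + 1, with "+ stride"
// folded in to stay in integer arithmetic.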
for (int i = 0; i < inp.size(); i++)
out.push_back((inp[i] - dilation[i] * (kernel[i] - 1) - 1 + stride[i]) / stride[i]);
}
else if (padMode == "SAME")
{
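// out = ceil(in / stride), computed as (in - 1 + stride) / stride.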
for (int i = 0; i < inp.size(); i++)
out.push_back((inp[i] - 1 + stride[i]) / stride[i]);
}
else
{
CV_Error(Error::StsError, "Unsupported padding mode");
}
}
void getConvPoolPaddings(const std::vector<int>& inp, const std::vector<size_t>& kernel,
const std::vector<size_t>& strides, const String &padMode,
std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end)
{
if (padMode == "SAME" || padMode == "VALID")
{
pads_begin.assign(kernel.size(), 0);
pads_end.assign(kernel.size(), 0);
}
if (padMode == "SAME")
{
CV_Assert_N(kernel.size() == strides.size(), kernel.size() == inp.size());
for (int i = 0; i < pads_begin.size(); i++) {
// There are test cases with stride > kernel.
if (strides[i] <= kernel[i])
{
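// Equivalent to Pr/2 from the TensorFlow note above:
// (K - 1 - (inp - 1) % S) / 2 == ((ceil(inp/S) - 1)*S + K - inp) / 2.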
int pad = (kernel[i] - 1 - (inp[i] - 1 + strides[i]) % strides[i]) / 2;
pads_begin[i] = pads_end[i] = pad;
}
}
}
}
}
}

View File

@ -0,0 +1,79 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
#define __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
#include <opencv2/dnn.hpp>
#include <opencv2/dnn/shape_utils.hpp>
#define CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
// dispatched AVX/AVX2 optimizations
#include "./layers_common.simd.hpp"
#include "layers/layers_common.simd_declarations.hpp"
#undef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#ifdef HAVE_OPENCL
#include "../ocl4dnn/include/ocl4dnn.hpp"
#endif
namespace cv
{
namespace dnn
{
void getConvolutionKernelParams(const LayerParams &params, std::vector<size_t>& kernel, std::vector<size_t>& pads_begin,
std::vector<size_t>& pads_end, std::vector<size_t>& strides, std::vector<size_t>& dilations,
cv::String &padMode, std::vector<size_t>& adjust_pads);
void getPoolingKernelParams(const LayerParams &params, std::vector<size_t>& kernel, std::vector<bool>& globalPooling,
std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end, std::vector<size_t>& strides, cv::String &padMode);
void getConvPoolOutParams(const std::vector<int>& inp, const std::vector<size_t>& kernel,
const std::vector<size_t>& stride, const String &padMode,
const std::vector<size_t>& dilation, std::vector<int>& out);
void getConvPoolPaddings(const std::vector<int>& inp, const std::vector<size_t>& kernel,
const std::vector<size_t>& strides, const String &padMode,
std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end);
}
}
#endif

File diff suppressed because it is too large

View File

@ -0,0 +1,530 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include "../op_vkcom.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/dnn/shape_utils.hpp"
#include "opencv2/core/hal/hal.hpp"
#include <algorithm>
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
using namespace cv::dnn::ocl4dnn;
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/lrn.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class LRNLayerImpl CV_FINAL : public LRNLayer
{
public:
LRNLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
type = -1;
String nrmType = params.get<String>("norm_region", "ACROSS_CHANNELS");
if (nrmType == "ACROSS_CHANNELS")
type = CHANNEL_NRM;
else if (nrmType == "WITHIN_CHANNEL")
type = SPATIAL_NRM;
else
CV_Error(Error::StsBadArg, "Unknown region type \"" + nrmType + "\"");
size = params.get<int>("local_size", 5);
if (size % 2 != 1 || size <= 0)
CV_Error(Error::StsBadArg, "LRN layer supports only positive odd values for local_size");
alpha = params.get<double>("alpha", 1);
beta = params.get<double>("beta", 0.75);
bias = params.get<double>("bias", 1);
normBySize = params.get<bool>("norm_by_size", true);
}
#ifdef HAVE_OPENCL
Ptr<OCL4DNNLRN<float> > lrnOp;
#endif
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
    backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
{
    return bias == (int)bias;
}
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_CUDA ||
backendId == DNN_BACKEND_HALIDE ||
(backendId == DNN_BACKEND_VKCOM && haveVulkan() && (size % 2 == 1) && (type == CHANNEL_NRM));
}
#ifdef HAVE_OPENCL
virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE
{
lrnOp.release();
}
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
if (lrnOp.empty())
{
OCL4DNNLRNConfig config;
config.lrn_type = type == CHANNEL_NRM ?
LRNParameter_NormRegion_ACROSS_CHANNELS :
LRNParameter_NormRegion_WITHIN_CHANNEL;
CHECK_EQ(size % 2, 1) << "LRN only supports odd values for local_size";
config.local_size = size;
config.alpha = alpha;
config.beta = beta;
config.k = bias;
CHECK_EQ(4, inputs[0].dims) << "Input must have 4 axes, "
<< "corresponding to (num, channels, height, width)";
config.batch_size = inputs[0].size[0];
config.channels = inputs[0].size[1];
config.height = inputs[0].size[2];
config.width = inputs[0].size[3];
config.norm_by_size = normBySize;
config.use_half = use_half;
lrnOp = Ptr<OCL4DNNLRN<float> >(new OCL4DNNLRN<float>(config));
}
if (!lrnOp->Forward(inputs[0], outputs[0]))
return false;
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_Assert(inputs_arr.total() == outputs_arr.total());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
CV_Assert(inputs.size() == outputs.size());
for (int i = 0; i < inputs.size(); i++)
{
CV_Assert(inputs[i].dims == 4);
Mat &src = inputs[i];
Mat &dst = outputs[i];
switch (type)
{
case CHANNEL_NRM:
channelNormalization(src, dst);
break;
case SPATIAL_NRM:
spatialNormalization(src, dst);
break;
default:
CV_Error(Error::StsNotImplemented, "Unimplemented mode of LRN layer");
break;
}
}
}
class ChannelLRN : public ParallelLoopBody
{
public:
ChannelLRN(const float* src, float* dst, int channels, int ksize,
float alpha1, float bias1, float beta1,
size_t planeSize, int nsamples, int nstripes)
{
src_ = src; dst_ = dst;
channels_ = channels;
ksize_ = ksize;
alpha1_ = alpha1; bias1_ = bias1; beta1_ = beta1;
planeSize_ = planeSize; nsamples_ = nsamples; nstripes_ = nstripes;
}
void operator()(const Range& r) const CV_OVERRIDE
{
int nsamples = nsamples_, nstripes = nstripes_;
size_t planeSize = planeSize_, planeSize_n = planeSize * nsamples;
size_t elemsPerStripe = (planeSize_n + nstripes - 1)/nstripes;
size_t rstart = r.start*elemsPerStripe;
size_t rend = r.end == nstripes ? planeSize_n : r.end*elemsPerStripe;
rstart = std::min(rstart, planeSize_n);
rend = std::min(rend, planeSize_n);
float alpha1 = alpha1_, bias1 = bias1_, beta1 = beta1_;
int k, channels = channels_, ksize = ksize_;
AutoBuffer<float> buf_((channels + ksize + 1)*2);
float* acc = buf_.data();
float* buf = acc + channels + ksize + 1;
for( k = 0; k <= ksize; k++ )
buf[-k-1] = buf[channels + k] = 0.f;
for( size_t ofs = rstart; ofs < rend; )
{
int sampleIdx = (int)(ofs/planeSize);
if( sampleIdx >= nsamples )
break;
size_t ofs0 = ofs - sampleIdx*planeSize;
size_t ofs1 = std::min(planeSize - ofs0, rend - ofs) + ofs;
const float* src = src_ + sampleIdx*planeSize*channels + ofs0;
float* dst = dst_ + sampleIdx*planeSize*channels + ofs0;
for( ; ofs < ofs1; ofs++, src++, dst++ )
{
for( k = 0; k < channels; k++ )
buf[k] = src[k*planeSize];
float s = 0;
for( k = 0; k < ksize; k++ )
s += buf[k]*buf[k];
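// Slide the squared-sum window one channel at a time; adding x1^2 and
// removing x0^2 is fused via the identity x1^2 - x0^2 = (x1 + x0)*(x1 - x0).
// The log/mul/exp sequence below then evaluates acc^beta1 for all channels
// in one vectorized pass (beta1 = -beta, see channelNormalization).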
for( k = 0; k < channels; k++ )
{
float x1 = buf[k + ksize];
float x0 = buf[k - ksize - 1];
s = std::max(s + (x1 + x0)*(x1 - x0), 0.f);
acc[k] = (float)(alpha1*s + bias1);
}
hal::log32f(acc, acc, channels);
for( k = 0; k < channels; k++ )
acc[k] *= beta1;
hal::exp32f(acc, acc, channels);
for( k = 0; k < channels; k++ )
dst[k*planeSize] = buf[k]*acc[k];
}
}
}
const float* src_;
float* dst_;
float alpha1_, bias1_, beta1_;
size_t planeSize_;
int channels_, ksize_, nsamples_, nstripes_;
};
void channelNormalization(Mat &srcBlob, Mat &dstBlob)
{
int num = srcBlob.size[0];
int channels = srcBlob.size[1];
int ksize = (size - 1) / 2;
int sizeNormFactor = normBySize ? size : 1;
size_t planeSize = srcBlob.size[2]*srcBlob.size[3];
int nstripes = std::max(getNumThreads(), 1);
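// beta is negated so the worker computes acc^(-beta), i.e.
// dst = src / (bias + (alpha/sizeNormFactor) * sum(src^2))^beta.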
ChannelLRN clrn(srcBlob.ptr<float>(), dstBlob.ptr<float>(), channels,
ksize, alpha/sizeNormFactor, bias, -beta, planeSize, num, nstripes);
parallel_for_(Range(0, nstripes), clrn, nstripes);
}
void sqrBoxFilter_(const Mat &src, Mat &dst)
{
Mat srcRawWrapper(src.rows, src.cols, src.type(), src.data, src.step[0]);
cv::sqrBoxFilter(srcRawWrapper, dst, dst.depth(), Size(size, size), Point(-1, -1), false, BORDER_CONSTANT);
}
void spatialNormalization(Mat &srcBlob, Mat &dstBlob)
{
int num = srcBlob.size[0];
int channels = srcBlob.size[1];
int sizeNormFactor = normBySize ? size*size : 1;
Mat srcMat = srcBlob;
Mat dstMat = dstBlob;
for (int n = 0; n < num; n++)
{
for (int cn = 0; cn < channels; cn++)
{
Mat src = getPlane(srcMat, n, cn);
Mat dst = getPlane(dstMat, n, cn);
sqrBoxFilter_(src, dst);
dst.convertTo(dst, dst.type(), alpha/sizeNormFactor, bias);
cv::pow(dst, beta, dst);
cv::divide(src, dst, dst);
}
}
}
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
cuda4dnn::LRNType type_;
if (type == CHANNEL_NRM)
type_ = cuda4dnn::LRNType::ACROSS_CHANNELS;
else if (type == SPATIAL_NRM)
type_ = cuda4dnn::LRNType::WITHIN_CHANNEL;
else
CV_Error(Error::StsNotImplemented, "Unknown normalization region");
float alphaSize = alpha;
if (!normBySize) {
switch (type) {
case CHANNEL_NRM: alphaSize = alpha * size; break;
case SPATIAL_NRM: alphaSize = alpha * size * size; break;
}
}
std::size_t largestInputSize = 0;
for(auto& wrapper : inputs) {
auto input_wrapper = wrapper.dynamicCast<CUDABackendWrapper>();
auto shape = input_wrapper->getShape();
largestInputSize = std::max<std::size_t>(
largestInputSize,
std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<int>())
);
}
return make_cuda_node<cuda4dnn::LRNOp>(preferableTarget,
std::move(context->cudnn_handle), type_, size, alphaSize, beta, bias, largestInputSize);
}
#endif
virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
#ifdef HAVE_VULKAN
std::shared_ptr<vkcom::OpBase> op(new vkcom::OpLRN(size / 2, bias, alpha, beta, normBySize));
return Ptr<BackendNode>(new VkComBackendNode(inputs, op));
#endif
return Ptr<BackendNode>();
}
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
#ifdef HAVE_HALIDE
float alphaSize = alpha;
if (normBySize)
alphaSize /= (type == CHANNEL_NRM ? size : size * size);
int width, height, channels, numImgs;
Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
getCanonicalSize(inputBuffer, &width, &height, &channels, &numImgs);
Halide::Var x("x"), y("y"), c("c"), n("n");
Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
Halide::Func padded_sq(name + "_padded_sq");
Halide::Func sq("sq");
sq(x, y, c, n) = inputBuffer(x, y, c, n) * inputBuffer(x, y, c, n);
Halide::Func bounded =
Halide::BoundaryConditions::constant_exterior(sq, 0, 0, width,
0, height,
0, channels,
0, numImgs);
padded_sq(x, y, c, n) = bounded(x, y, c, n);
Halide::Expr base;
if (type == CHANNEL_NRM)
{
Halide::RDom r((1 - size) / 2, size);
base = alphaSize * sum(padded_sq(x, y, c + r, n));
}
else // SPATIAL_NRM
{
Halide::RDom r((1 - size) / 2, size, (1 - size) / 2, size);
base = alphaSize * sum(padded_sq(x + r.x, y + r.y, c, n));
}
base += static_cast<float>(bias);
top(x, y, c, n) = inputBuffer(x, y, c, n) / pow(base, beta);
return Ptr<BackendNode>(new HalideBackendNode({ padded_sq, top }));
#endif // HAVE_HALIDE
return Ptr<BackendNode>();
}
virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs,
int targetId) const CV_OVERRIDE
{
#ifdef HAVE_HALIDE
if (targetId != DNN_TARGET_CPU)
{
Layer::applyHalideScheduler(node, inputs, outputs, targetId);
return;
}
int outW, outH, outC, outN;
getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
Halide::Var x("x"), y("y"), c("c"), n("n"), yo("yo"), yi("yi"), tile("tile");
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs[1];
Halide::Func& padded_sq = node.dynamicCast<HalideBackendNode>()->funcs[0];
if (outW < 8 || outH <= 2)
return;
top.reorder(x, c, y, n)
.split(y, yo, yi, 2)
.fuse(yo, n, tile)
.parallel(tile)
.unroll(yi)
.vectorize(x, 8);
padded_sq.store_at(top, tile)
.compute_at(top, yi);
#endif // HAVE_HALIDE
}
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
float alphaSize = alpha;
if (!normBySize)
alphaSize *= (type == SPATIAL_NRM ? size*size : size);
InferenceEngine::Builder::NormLayer ieLayer(name);
ieLayer.setSize(size);
ieLayer.setAlpha(alphaSize);
ieLayer.setBeta(beta);
ieLayer.setAcrossMaps(type == CHANNEL_NRM);
InferenceEngine::Builder::Layer l = ieLayer;
l.getParameters()["k"] = bias;
return Ptr<BackendNode>(new InfEngineBackendNode(l));
}
#endif // HAVE_DNN_IE_NN_BUILDER_2019
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
float alphaSize = alpha;
if (!normBySize)
alphaSize *= (type == SPATIAL_NRM ? size*size : size);
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
std::vector<int64_t> axes;
if (type != SPATIAL_NRM) {
axes = {1};
} else {
axes.resize(ieInpNode->get_shape().size() - 2);
std::iota(axes.begin(), axes.end(), 2);
}
auto ngraph_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes.size()}, axes.data());
auto lrn = std::make_shared<ngraph::op::LRN>(ieInpNode, ngraph_axes, alphaSize, beta, bias, size);
return Ptr<BackendNode>(new InfEngineNgraphNode(lrn));
}
#endif // HAVE_DNN_NGRAPH
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
CV_UNUSED(outputs); // suppress unused variable warning
CV_Assert(inputs.size() > 0);
long flops = 0;
for(int i = 0; i < inputs.size(); i++)
{
if (type == CHANNEL_NRM)
{
int channels = inputs[i][1];
int ksize = (size - 1) / 2;
flops += inputs[i][0]*(std::min(ksize, channels)*2*total(inputs[i], 2) + channels*4*total(inputs[i], 2));
if (ksize < channels)
{
flops += (size + 2*(channels - size))*total(inputs[i], 2);
}
}
else
{
flops += total(inputs[i])*(2*size*size + 2);
}
}
return flops;
}
private:
enum Type
{
CHANNEL_NRM,
SPATIAL_NRM
};
};
Ptr<LRNLayer> LRNLayer::create(const LayerParams& params)
{
return Ptr<LRNLayer>(new LRNLayerImpl(params));
}
}
}

View File

@ -0,0 +1,196 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2016, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
/*
Implementation of Batch Normalization layer.
*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/max_unpooling.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class MaxUnpoolLayerImpl CV_FINAL : public MaxUnpoolLayer
{
public:
MaxUnpoolLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
poolKernel = Size(params.get<int>("pool_k_w"), params.get<int>("pool_k_h"));
poolPad = Size(params.get<int>("pool_pad_w"), params.get<int>("pool_pad_h"));
poolStride = Size(params.get<int>("pool_stride_w"), params.get<int>("pool_stride_h"));
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_HALIDE && haveHalide() && !poolPad.width && !poolPad.height);
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() == 2 || inputs.size() == 3);
CV_Assert(total(inputs[0]) == total(inputs[1]));
MatShape outShape;
if (inputs.size() == 2)
{
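// Invert the pooling shape formula: out = (in - 1)*stride + kernel - 2*pad.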
outShape = inputs[0];
outShape[2] = (outShape[2] - 1) * poolStride.height + poolKernel.height - 2 * poolPad.height;
outShape[3] = (outShape[3] - 1) * poolStride.width + poolKernel.width - 2 * poolPad.width;
}
else
outShape = inputs[2];
outputs.clear();
outputs.push_back(outShape);
return false;
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
CV_Assert(inputs.size() == 2 || inputs.size() == 3);
Mat& input = inputs[0];
Mat& indices = inputs[1];
CV_Assert(input.total() == indices.total());
CV_Assert(input.size[0] == 1);
CV_Assert(input.isContinuous());
for(int i_n = 0; i_n < outputs.size(); i_n++)
{
Mat& outBlob = outputs[i_n];
outBlob.setTo(0);
CV_Assert(input.size[1] == outBlob.size[1]);
int outPlaneTotal = outBlob.size[2]*outBlob.size[3];
for (int i_c = 0; i_c < input.size[1]; i_c++)
{
Mat outPlane = getPlane(outBlob, 0, i_c);
int wh_area = input.size[2]*input.size[3];
const float* inptr = input.ptr<float>(0, i_c);
const float* idxptr = indices.ptr<float>(0, i_c);
float* outptr = outPlane.ptr<float>();
for(int i_wh = 0; i_wh < wh_area; i_wh++)
{
int index = idxptr[i_wh];
if (!(0 <= index && index < outPlaneTotal))
{
std::cerr
<< "i_n=" << i_n << std::endl
<< "i_c=" << i_c << std::endl
<< "i_wh=" << i_wh << std::endl
<< "index=" << index << std::endl
<< "maxval=" << inptr[i_wh] << std::endl
<< "outPlaneTotal=" << outPlaneTotal << std::endl
<< "input.size=" << input.size << std::endl
<< "indices.size=" << indices.size << std::endl
<< "outBlob=" << outBlob.size << std::endl
;
CV_Assert(0 <= index && index < outPlaneTotal);
}
outptr[index] = inptr[i_wh];
}
}
}
}
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
cuda4dnn::MaxUnpoolingConfiguration config;
auto& window_size = config.window_size;
window_size.resize(2);
window_size[0] = poolKernel.height;
window_size[1] = poolKernel.width;
auto& strides = config.strides;
strides.resize(2);
strides[0] = poolStride.height;
strides[1] = poolStride.width;
auto& pads_begin = config.pads_begin;
pads_begin.resize(2);
pads_begin[0] = poolPad.height;
pads_begin[1] = poolPad.width;
return make_cuda_node<cuda4dnn::MaxUnpoolingOp>(preferableTarget, std::move(context->stream), config);
}
#endif
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
{
#ifdef HAVE_HALIDE
// Meaningless when kernel != stride: if kernel > stride the result is
// not deterministic, and if kernel < stride part of the input data is
// simply skipped (consider changing the model).
if (poolKernel.width != poolStride.width ||
    poolKernel.height != poolStride.height)
    CV_Error(cv::Error::StsNotImplemented,
             "Halide backend for maximum unpooling "
             "does not support cases when kernel != stride");
Halide::Var x("x"), y("y"), c("c"), n("n");
Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
Halide::Buffer<float> inputBuffer = halideBuffer(input[0]);
Halide::Buffer<float> indices = halideBuffer(input[1]);
Halide::Expr pooledX = x / poolKernel.width;
Halide::Expr pooledY = y / poolKernel.height;
const int outW = inputBuffer.width() * poolKernel.width;
top(x, y, c, n) = select(y * outW + x == indices(pooledX, pooledY, c, n),
inputBuffer(pooledX, pooledY, c, n), 0.0f);
return Ptr<BackendNode>(new HalideBackendNode(top));
#endif // HAVE_HALIDE
return Ptr<BackendNode>();
}
};
Ptr<MaxUnpoolLayer> MaxUnpoolLayer::create(const LayerParams& params)
{
return Ptr<MaxUnpoolLayer>(new MaxUnpoolLayerImpl(params));
}
}
}

View File

@ -0,0 +1,463 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include "../op_cuda.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#ifdef HAVE_OPENCL
#include "../ocl4dnn/include/math_functions.hpp"
#include "opencl_kernels_dnn.hpp"
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/mvn.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class MVNLayerImpl CV_FINAL : public MVNLayer
{
public:
MVNLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
normVariance = params.get<bool>("normalize_variance", true);
acrossChannels = params.get<bool>("across_channels", false);
eps = params.get<double>("eps", 1e-9);
fuse_batch_norm = false;
fuse_relu = false;
relu_slope = 0.f;
zeroDev = false;
}
Mat scale, shift;
#ifdef HAVE_OPENCL
UMat umat_scale, umat_shift;
#endif
bool fuse_batch_norm;
Ptr<ReLULayer> activ_relu;
float relu_slope;
bool fuse_relu;
bool zeroDev; // TODO: Not taken into account in Intel's Inference Engine backend.
bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
{
if (!layer.empty() && !fuse_relu && !fuse_batch_norm)
{
layer->getScaleShift(scale, shift);
fuse_batch_norm = !scale.empty() || !shift.empty();
return fuse_batch_norm;
}
if (!layer.empty() && preferableTarget == DNN_TARGET_OPENCL)
{
activ_relu = layer.dynamicCast<ReLULayer>();
if( !activ_relu.empty() )
relu_slope = activ_relu->negativeSlope;
}
fuse_relu = !activ_relu.empty();
return fuse_relu;
}
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
{
std::vector<Mat> inputs;
inputs_arr.getMatVector(inputs);
int splitDim = (acrossChannels) ? 1 : 2;
int i, newRows = 1;
for( i = 0; i < splitDim; i++ )
newRows *= inputs[0].size[i];
zeroDev = inputs[0].total() == newRows;
#ifdef HAVE_OPENCL
umat_scale.release();
umat_shift.release();
#endif
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
{
bool isMyriad = preferableTarget == DNN_TARGET_MYRIAD || preferableTarget == DNN_TARGET_HDDL;
return !zeroDev && (!isMyriad || eps <= 1e-7f);
}
#endif
#ifdef HAVE_DNN_NGRAPH
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
return true;
#endif
return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
}
#ifdef HAVE_OPENCL
bool fast_forward_ocl(std::vector<UMat> &inputs, std::vector<UMat> &outputs)
{
if (umat_scale.empty() && !scale.empty())
scale.copyTo(umat_scale);
if (umat_shift.empty() && !shift.empty())
shift.copyTo(umat_shift);
UMat& bnorm_weight = umat_scale;
UMat& bnorm_bias = umat_shift;
const unsigned LOCAL_SIZE = 128;
bool use_half = (inputs[0].depth() == CV_16S);
String opts = format(" -DT=%s -DT4=%s -Dconvert_T=%s -DLOCAL_SIZE=%u", use_half ? "half" : "float",
use_half ? "half4" : "float4", use_half ? "convert_half4" : "convert_float4",
LOCAL_SIZE
);
int splitDim = (acrossChannels) ? 1 : 2;
for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
{
UMat &inpMat = inputs[inpIdx];
UMat &outMat = outputs[inpIdx];
int newRows = total(shape(inpMat), 0, splitDim);
CV_Assert(newRows != 0);
MatShape s = shape(newRows, inpMat.total() / newRows);
UMat meanMat = UMat(s[0], 1, (use_half) ? CV_16S : CV_32F);
UMat tmpMat = UMat(s[0], s[1], CV_32F);
float alpha = 1.0f / s[1];
String buildopt = "-DNUM=4" + opts;
ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt + " -DKERNEL_MEAN_FUSE");
size_t localsize[] = { LOCAL_SIZE };
size_t globalsize[] = { (size_t)s[0] / 4 * localsize[0] };
int argId = 0;
k.set(argId++, ocl::KernelArg::PtrReadOnly(inpMat));
k.set(argId++, (int)s[1]);
k.set(argId++, alpha);
k.set(argId++, ocl::KernelArg::PtrWriteOnly(meanMat));
k.set(argId++, ocl::KernelArg::PtrWriteOnly(tmpMat));
bool ret = k.run(1, globalsize, localsize, false);
if (!ret)
return false;
buildopt += format(" %s %s", (fuse_batch_norm) ? "-DFUSE_BATCH_NORM" : "",
(fuse_relu) ? "-DFUSE_RELU" : "");
ocl::Kernel k1("mvn_fuse4", ocl::dnn::mvn_oclsrc, buildopt + " -DKERNEL_MVN_FUSE");
argId = 0;
k1.set(argId++, ocl::KernelArg::PtrReadOnly(tmpMat));
k1.set(argId++, ocl::KernelArg::PtrReadOnly(inpMat));
k1.set(argId++, ocl::KernelArg::PtrReadOnly(meanMat));
k1.set(argId++, (int)s[1]);
k1.set(argId++, (float)alpha);
k1.set(argId++, (float)eps);
k1.set(argId++, (float)relu_slope);
k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight));
k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias));
k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat));
ret = k1.run_(1, globalsize, localsize, false);
if (!ret)
return false;
}
return true;
}
bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{
if (umat_scale.empty() && !scale.empty())
scale.copyTo(umat_scale);
if (umat_shift.empty() && !shift.empty())
shift.copyTo(umat_shift);
UMat& bnorm_weight = umat_scale;
UMat& bnorm_bias = umat_shift;
std::vector<UMat> inputs;
std::vector<UMat> outputs;
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
int splitDim = (acrossChannels) ? 1 : 2;
int row_size = total(shape(inputs[0]), 0, splitDim);
int plane_size = total(shape(inputs[0]), splitDim);
if (normVariance && (row_size % 4 == 0) && (plane_size % 4 == 0))
return fast_forward_ocl(inputs, outputs);
if (inputs[0].depth() == CV_16S)
return false;
String opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4");
for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
{
UMat &inpMat = inputs[inpIdx];
UMat &outMat = outputs[inpIdx];
int newRows = total(shape(inpMat), 0, splitDim);
CV_Assert(newRows != 0);
MatShape s = shape(newRows, inpMat.total() / newRows);
UMat oneMat = UMat::ones(s[1], 1, CV_32F);
UMat meanMat = UMat(s[0], 1, CV_32F);
UMat devMat = UMat(s[0], 1, CV_32F);
UMat tmpMat = UMat(s[0], s[1], CV_32F);
float alpha = 1.0f / s[1];
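// meanMat = alpha * inpMat * ones: the per-row mean expressed as a GEMV.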
bool ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, s[0], s[1], alpha,
inpMat, 0, oneMat, 0, 0.0f, meanMat, 0);
if (!ret)
return false;
int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) };
String buildopt = format("-DNUM=%d", number) + opts;
if (normVariance)
{
String kname = format("calc_mean%d", number);
ocl::Kernel kernel(kname.c_str(), ocl::dnn::mvn_oclsrc, buildopt + " -DKERNEL_MEAN");
if (kernel.empty())
return false;
kernel.set(0, ocl::KernelArg::PtrReadOnly(inpMat));
kernel.set(1, (int)s[0]);
kernel.set(2, (int)s[1]);
kernel.set(3, ocl::KernelArg::PtrReadOnly(meanMat));
kernel.set(4, ocl::KernelArg::PtrWriteOnly(tmpMat));
ret = kernel.run(2, global, NULL, false);
if (!ret)
return false;
ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, s[0], s[1], alpha,
tmpMat, 0, oneMat, 0, 0.0f, devMat, 0);
if (!ret)
return false;
}
String kname = format("mvn%d", number);
buildopt += format("%s%s%s -DKERNEL_MVN", (normVariance) ? " -DNORM_VARIANCE" : "",
(fuse_batch_norm) ? " -DFUSE_BATCH_NORM" : "",
(fuse_relu) ? " -DFUSE_RELU" : "");
ocl::Kernel kernel1(kname.c_str(), ocl::dnn::mvn_oclsrc, buildopt);
if (kernel1.empty())
return false;
kernel1.set(0, ocl::KernelArg::PtrReadOnly(inpMat));
kernel1.set(1, (int)s[0]);
kernel1.set(2, (int)s[1]);
kernel1.set(3, (float)eps);
kernel1.set(4, ocl::KernelArg::PtrReadOnly(meanMat));
kernel1.set(5, ocl::KernelArg::PtrReadOnly(devMat));
kernel1.set(6, ocl::KernelArg::PtrReadOnly(bnorm_weight));
kernel1.set(7, ocl::KernelArg::PtrReadOnly(bnorm_bias));
kernel1.set(8, (int)inpMat.size[1]);
kernel1.set(9, (float)relu_slope);
kernel1.set(10, ocl::KernelArg::PtrWriteOnly(outMat));
ret = kernel1.run(2, global, NULL, false);
if (!ret)
return false;
}
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs, internals;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
internals_arr.getMatVector(internals);
for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
{
Mat &inpBlob = inputs[inpIdx];
Mat &outBlob = outputs[inpIdx];
int splitDim = (acrossChannels) ? 1 : 2;
int i, newRows = 1;
for( i = 0; i < splitDim; i++ )
newRows *= inpBlob.size[i];
Mat inpMat = inpBlob.reshape(1, newRows);
Mat outMat = outBlob.reshape(1, newRows);
if ( inpBlob.total() == newRows )
{
// Degenerate case: each row holds a single value, so mean subtraction
// zeroes the output (plus the fused shift, if any).
if (shift.empty())
{
outBlob.setTo(0);
}
else
{
for ( i = 0; i < newRows; i++ )
{
outMat.row(i).setTo(((float*)shift.data)[i]);
}
}
return;
}
Scalar mean, dev;
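// Per-row MVN: out = (x - mean) / sqrt(var + eps); with a fused batch-norm
// this becomes out = weight * (x - mean) / sqrt(var + eps) + bias.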
for ( i = 0; i < newRows; i++)
{
Mat inpRow = inpMat.row(i);
Mat outRow = outMat.row(i);
float weight = 1.f;
float bias = 0.f;
if (fuse_batch_norm)
{
weight = i < scale.cols ? ((float*)scale.data)[i] : weight;
bias = i < shift.cols ? ((float*)shift.data)[i] : bias;
}
cv::meanStdDev(inpRow, mean, (normVariance) ? dev : noArray());
double alpha = 1;
if (normVariance)
{
alpha = 1 / std::sqrt(eps + dev[0]*dev[0]);
}
double normalizationScale = 1.0;
double normalizationShift = 0.0;
if (fuse_batch_norm)
{
normalizationScale = alpha * weight;
normalizationShift = -mean[0] * normalizationScale + bias;
}
else
{
normalizationScale = alpha;
normalizationShift = -mean[0] * alpha;
}
inpRow.convertTo(outRow, outRow.type(), normalizationScale, normalizationShift);
}
}
}
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
InferenceEngine::Builder::MVNLayer ieLayer(name);
ieLayer.setAcrossChannels(acrossChannels);
ieLayer.setNormalize(normVariance);
ieLayer.setEpsilon(eps);
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
}
#endif // HAVE_DNN_IE_NN_BUILDER_2019
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2021_2)
auto mvn = std::make_shared<ngraph::op::MVN>(ieInpNode, acrossChannels, normVariance, eps);
#else
int64_t start_axis = acrossChannels ? 1 : 2;
std::vector<int64_t> axes_v(ieInpNode->get_shape().size() - start_axis);
std::iota(axes_v.begin(), axes_v.end(), start_axis);
auto axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes_v.size()}, axes_v.data());
auto mvn = std::make_shared<ngraph::op::v6::MVN>(ieInpNode, axes, normVariance, eps, ngraph::op::MVNEpsMode::INSIDE_SQRT);
#endif
return Ptr<BackendNode>(new InfEngineNgraphNode(mvn));
}
#endif // HAVE_DNN_NGRAPH
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
cuda4dnn::MVNConfiguration config;
config.split_axis = acrossChannels ? 1 : 2;
config.normalize_variance = normVariance;
config.epsilon = eps;
config.input_shapes.resize(inputs.size());
for (int i = 0; i < inputs.size(); i++)
{
auto wrapper = inputs[i].dynamicCast<CUDABackendWrapper>();
auto shape = wrapper->getShape();
config.input_shapes[i].assign(std::begin(shape), std::end(shape));
}
return make_cuda_node<cuda4dnn::MVNOp>(preferableTarget, std::move(context->stream), config);
}
#endif
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
CV_UNUSED(outputs); // suppress unused variable warning
long flops = 0;
for(int i = 0; i < inputs.size(); i++)
{
flops += 6*total(inputs[i]) + 3*total(inputs[i], 0, normVariance ? 2 : 1);
}
return flops;
}
};
Ptr<MVNLayer> MVNLayer::create(const LayerParams& params)
{
return Ptr<MVNLayer>(new MVNLayerImpl(params));
}
}
}

View File

@ -0,0 +1,402 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/normalize_bbox.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv { namespace dnn {
class NormalizeBBoxLayerImpl CV_FINAL : public NormalizeBBoxLayer
{
public:
NormalizeBBoxLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
pnorm = params.get<float>("p", 2);
epsilon = params.get<float>("eps", 1e-10f);
acrossSpatial = params.get<bool>("across_spatial", true);
startAxis = params.get<int>("start_axis", 1);
CV_Assert(!params.has("across_spatial") || !params.has("end_axis"));
endAxis = params.get<int>("end_axis", acrossSpatial ? -1 : startAxis);
CV_Assert(pnorm > 0);
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
{
if (pnorm != 2)
return false;
bool isMyriad = preferableTarget == DNN_TARGET_MYRIAD || preferableTarget == DNN_TARGET_HDDL;
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && isMyriad)
return !acrossSpatial;
return startAxis == 1;
}
return backendId == DNN_BACKEND_OPENCV ||
(backendId == DNN_BACKEND_CUDA && (pnorm == 1 || pnorm == 2));
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() == 1);
Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
internals.resize(1, inputs[0]);
internals[0][0] = 1; // Batch size.
return true;
}
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
{
std::vector<Mat> inputs;
inputs_arr.getMatVector(inputs);
CV_Assert(inputs.size() == 1);
endAxis = endAxis == -1 ? (inputs[0].dims - 1) : endAxis;
startAxis = startAxis == -1 ? (inputs[0].dims - 1) : startAxis;
acrossSpatial = (startAxis == 1 && endAxis == inputs[0].dims - 1);
}
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
std::vector<UMat> internals;
if (inputs_.depth() == CV_16S)
return false;
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals);
CV_Assert(inputs.size() == 1 && outputs.size() == 1);
CV_Assert(inputs[0].total() == outputs[0].total());
const UMat& inp0 = inputs[0];
UMat& buffer = internals[0];
startAxis = normalize_axis(startAxis, inp0.dims);
endAxis = normalize_axis(endAxis, inp0.dims);
size_t num = total(shape(inp0.size), 0, startAxis);
size_t numPlanes = total(shape(inp0.size), startAxis, endAxis + 1);
size_t planeSize = inp0.total() / (num * numPlanes);
MatShape s = shape(1, inputs[0].total());
UMat inp = inputs[0].reshape(1, s.size(), &s[0]).reshape(1, num);
UMat out = outputs[0].reshape(1, s.size(), &s[0]).reshape(1, num);
for (size_t i = 0; i < num; ++i)
{
s = shape(numPlanes, planeSize);
UMat src = inp.row(i).reshape(1, s.size(), &s[0]);
UMat dst = out.row(i).reshape(1, s.size(), &s[0]);
UMat abs_mat;
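// absdiff against zero yields |src| element-wise.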
absdiff(src, cv::Scalar::all(0), abs_mat);
pow(abs_mat, pnorm, buffer);
if (planeSize == 1)
{
// add eps to avoid division by zero
float absSum = sum(buffer)[0] + epsilon;
float norm = pow(absSum, 1.0f / pnorm);
multiply(src, 1.0f / norm, dst);
}
else
{
Mat norm;
reduce(buffer, norm, 0, REDUCE_SUM);
norm += epsilon;
// compute the inverse norm so we can multiply instead of divide
cv::pow(norm, -1.0f / pnorm, norm);
repeat(norm, numPlanes, 1, buffer);
multiply(src, buffer, dst);
}
if (!blobs.empty())
{
// scale the output
Mat scale = blobs[0];
if (scale.total() == 1)
{
// _scale: 1 x 1
multiply(dst, scale.at<float>(0, 0), dst);
}
else
{
// _scale: _channels x 1
CV_Assert(scale.total() == numPlanes);
repeat(scale, 1, dst.cols, buffer);
multiply(dst, buffer, dst);
}
}
}
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs, internals;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
internals_arr.getMatVector(internals);
CV_Assert(inputs.size() == 1 && outputs.size() == 1);
CV_Assert(inputs[0].total() == outputs[0].total());
const Mat& inp0 = inputs[0];
Mat& buffer = internals[0];
startAxis = normalize_axis(startAxis, inp0.dims);
endAxis = normalize_axis(endAxis, inp0.dims);
const float* inpData = inp0.ptr<float>();
float* outData = outputs[0].ptr<float>();
size_t num = total(shape(inp0.size), 0, startAxis);
size_t numPlanes = total(shape(inp0.size), startAxis, endAxis + 1);
CV_Assert(num * numPlanes != 0);
size_t planeSize = inp0.total() / (num * numPlanes);
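// Lp normalization over axes [startAxis, endAxis]:
// dst = src / (eps + sum |src|^p)^(1/p), applied to each outer slice.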
for (size_t n = 0; n < num; ++n)
{
Mat src = Mat(numPlanes, planeSize, CV_32F, (void*)inpData);
Mat dst = Mat(numPlanes, planeSize, CV_32F, (void*)outData);
cv::pow(abs(src), pnorm, buffer);
if (planeSize == 1)
{
// add eps to avoid division by zero
float absSum = sum(buffer)[0] + epsilon;
float norm = pow(absSum, 1.0f / pnorm);
multiply(src, 1.0f / norm, dst);
}
else
{
Mat norm;
reduce(buffer, norm, 0, REDUCE_SUM);
norm += epsilon;
// compute the inverse norm so we can multiply instead of divide
cv::pow(norm, -1.0f / pnorm, norm);
repeat(norm, numPlanes, 1, buffer);
multiply(src, buffer, dst);
}
if (!blobs.empty())
{
// scale the output
Mat scale = blobs[0];
if (scale.total() == 1)
{
// _scale: 1 x 1
dst *= scale.at<float>(0, 0);
}
else
{
// _scale: _channels x 1
CV_Assert(scale.total() == numPlanes);
repeat(scale, 1, dst.cols, buffer);
multiply(dst, buffer, dst);
}
}
inpData += numPlanes * planeSize;
outData += numPlanes * planeSize;
}
}
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
{
InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
std::vector<size_t> dims = input->getDims();
if (dims.size() == 4)
{
InferenceEngine::Builder::NormalizeLayer ieLayer(name);
ieLayer.setChannelShared(false);
ieLayer.setAcrossMaps(acrossSpatial);
ieLayer.setEpsilon(epsilon);
InferenceEngine::Builder::Layer l = ieLayer;
const int numChannels = dims[1];
InferenceEngine::Blob::Ptr weights;
if (blobs.empty())
{
weights = InferenceEngine::make_shared_blob<float>({
InferenceEngine::Precision::FP32,
{(size_t)numChannels}, InferenceEngine::Layout::C
});
weights->allocate();
Mat weightsMat = infEngineBlobToMat(weights).reshape(1, numChannels);
Mat(numChannels, 1, CV_32F, Scalar(1)).copyTo(weightsMat);
l.getParameters()["channel_shared"] = false;
}
else
{
CV_Assert(numChannels == blobs[0].total());
weights = wrapToInfEngineBlob(blobs[0], {(size_t)numChannels}, InferenceEngine::Layout::C);
l.getParameters()["channel_shared"] = blobs[0].total() == 1;
}
addConstantData("weights", weights, l);
l.getParameters()["across_spatial"] = acrossSpatial;
return Ptr<BackendNode>(new InfEngineBackendNode(l));
}
else
{
InferenceEngine::Builder::GRNLayer ieLayer(name);
ieLayer.setBeta(epsilon);
InferenceEngine::Builder::Layer l = ieLayer;
l.getParameters()["bias"] = epsilon;
return Ptr<BackendNode>(new InfEngineBackendNode(l));
}
}
#endif // HAVE_DNN_IE_NN_BUILDER_2019
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
const size_t batch = ieInpNode->get_shape()[0];
const size_t numChannels = ieInpNode->get_shape()[1];
std::vector<int64_t> axes_data;
if (!acrossSpatial) {
axes_data.push_back(1);
} else {
axes_data.resize(ieInpNode->get_shape().size() - 1);
std::iota(axes_data.begin(), axes_data.end(), 1);
}
auto axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes_data.size()}, axes_data);
auto norm = std::make_shared<ngraph::op::v0::NormalizeL2>(ieInpNode, axes, epsilon, ngraph::op::EpsMode::ADD);
CV_Assert(blobs.empty() || numChannels == blobs[0].total());
std::vector<size_t> shape(ieInpNode->get_shape().size(), 1);
shape[0] = blobs.empty() ? 1 : batch;
shape[1] = numChannels;
if (!blobs.empty())
{
auto weight = std::make_shared<ngraph::op::Constant>(
ngraph::element::f32, ngraph::Shape(shape), blobs[0].data);
#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2021_2)
auto mul = std::make_shared<ngraph::op::v1::Multiply>(norm, weight, ngraph::op::AutoBroadcastType::NUMPY);
#else
auto mul = std::make_shared<ngraph::op::v0::Multiply>(norm, weight, ngraph::op::AutoBroadcastType::NUMPY);
#endif
return Ptr<BackendNode>(new InfEngineNgraphNode(mul));
}
return Ptr<BackendNode>(new InfEngineNgraphNode(norm));
}
#endif // HAVE_DNN_NGRAPH
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
if(pnorm != 1 && pnorm != 2)
CV_Error(Error::StsNotImplemented, "Unsupported normalization mode");
auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
auto input_shape = input_wrapper->getShape();
NormalizeConfiguration<float> config;
config.input_shape.assign(std::begin(input_shape), std::end(input_shape));
config.axis_start = normalize_axis(startAxis, input_shape.size());
config.axis_end = normalize_axis(endAxis, input_shape.size()) + 1; /* +1 because NormalizeOp follows [start, end) convention */
config.norm = pnorm;
config.eps = epsilon;
const auto& weightsMat = blobs.empty() ? Mat() : blobs[0];
return make_cuda_node<cuda4dnn::NormalizeOp>(preferableTarget, std::move(context->stream), weightsMat, config);
}
#endif
private:
int startAxis, endAxis;
};
Ptr<NormalizeBBoxLayer> NormalizeBBoxLayer::create(const LayerParams &params)
{
return Ptr<NormalizeBBoxLayer>(new NormalizeBBoxLayerImpl(params));
}
}
}

View File

@ -0,0 +1,194 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../precomp.hpp"
#include "../dnn_common.hpp"
namespace cv { namespace dnn {
CV__DNN_INLINE_NS_BEGIN
namespace detail {
class NotImplementedImpl CV_FINAL : public NotImplemented
{
public:
NotImplementedImpl(const LayerParams& params)
{
setParamsFrom(params);
CV_Assert(params.has("type"));
std::stringstream ss;
ss << "Node for layer '" << params.name << "' of type '" << params.get("type") << "' wasn't initialized.";
msg = ss.str();
}
CV_DEPRECATED_EXTERNAL
virtual void finalize(const std::vector<Mat*> &input, std::vector<Mat> &output) CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
virtual void finalize(InputArrayOfArrays inputs, OutputArrayOfArrays outputs) CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
CV_DEPRECATED_EXTERNAL
virtual void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals) CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
virtual void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
void forward_fallback(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals)
{
CV_Error(Error::StsNotImplemented, msg);
}
CV_DEPRECATED_EXTERNAL
void finalize(const std::vector<Mat> &inputs, CV_OUT std::vector<Mat> &outputs)
{
CV_Error(Error::StsNotImplemented, msg);
}
CV_DEPRECATED std::vector<Mat> finalize(const std::vector<Mat> &inputs)
{
CV_Error(Error::StsNotImplemented, msg);
}
CV_DEPRECATED void run(const std::vector<Mat> &inputs,
CV_OUT std::vector<Mat> &outputs,
CV_IN_OUT std::vector<Mat> &internals)
{
CV_Error(Error::StsNotImplemented, msg);
}
virtual int inputNameToIndex(String inputName) CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
virtual int outputNameToIndex(const String& outputName) CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
virtual Ptr<BackendNode> initCUDA(
void *context,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
virtual void applyHalideScheduler(Ptr<BackendNode>& node,
const std::vector<Mat*> &inputs,
const std::vector<Mat> &outputs,
int targetId) const CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node) CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
virtual bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
virtual void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
virtual void unsetAttached() CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
virtual bool updateMemoryShapes(const std::vector<MatShape> &inputs) CV_OVERRIDE
{
CV_Error(Error::StsNotImplemented, msg);
}
private:
std::string msg;
};
Ptr<Layer> NotImplemented::create(const LayerParams& params)
{
return makePtr<NotImplementedImpl>(params);
}
Ptr<Layer> notImplementedRegisterer(LayerParams &params)
{
return detail::NotImplemented::create(params);
}
void NotImplemented::Register()
{
LayerFactory::registerLayer("NotImplemented", detail::notImplementedRegisterer);
}
void NotImplemented::unRegister()
{
LayerFactory::unregisterLayer("NotImplemented");
}
} // namespace detail
CV__DNN_INLINE_NS_END
}} // namespace cv::dnn

View File

@ -0,0 +1,293 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
/*
Implementation of padding layer, which adds paddings to input blob.
*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include <vector>
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/padding.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class PaddingLayerImpl CV_FINAL : public PaddingLayer
{
public:
PaddingLayerImpl(const LayerParams &params)
{
setParamsFrom(params);
paddingValue = params.get<float>("value", 0);
inputDims = params.get<int>("input_dims", -1);
paddingType = params.get<String>("type", "constant");
CV_Assert(params.has("paddings"));
const DictValue& paddingsParam = params.get("paddings");
CV_Assert((paddingsParam.size() & 1) == 0);
paddings.resize(paddingsParam.size() / 2);
for (int i = 0; i < paddings.size(); ++i)
{
paddings[i].first = paddingsParam.get<int>(i * 2); // Pad before.
paddings[i].second = paddingsParam.get<int>(i * 2 + 1); // Pad after.
CV_Assert_N(paddings[i].first >= 0, paddings[i].second >= 0);
}
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() == 1);
const MatShape& inpShape = inputs[0];
CV_Assert(inpShape.size() >= paddings.size());
CV_Assert(inputDims == -1 || inpShape.size() == inputDims || inpShape.size() > paddings.size());
outputs.resize(1, inpShape);
int offset = (inputDims == -1 ? 0 : (inpShape.size() > inputDims ? 1 : 0));
for (int i = 0; i < paddings.size(); ++i)
{
outputs[0][offset + i] = inpShape[offset + i] + paddings[i].first + paddings[i].second;
}
return false;
}
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
{
std::vector<Mat> inputs;
inputs_arr.getMatVector(inputs);
// Compute dstRanges.
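// dstRanges marks where the input lands inside the padded output, e.g.
// paddings {(1,1),(2,2)} on an HxW blob give ranges [1, 1+H) x [2, 2+W).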
const MatSize& inpShape = inputs[0].size;
if (inputDims != -1 && inputs[0].dims != inputDims)
{
paddings.insert(paddings.begin(), std::make_pair(0, 0));
}
dstRanges.resize(paddings.size());
for (int i = 0; i < paddings.size(); ++i)
{
dstRanges[i].start = paddings[i].first;
dstRanges[i].end = paddings[i].first + inpShape[i];
}
// Add the rest of dimensions.
for (int i = dstRanges.size(); i < inputs[0].dims; ++i)
{
dstRanges.push_back(Range::all());
paddings.push_back(std::make_pair(0, 0));
}
inputDims = -1; // From now on, paddings are specified for all dimensions.
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
#ifdef HAVE_INF_ENGINE
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
{
bool isMyriad = preferableTarget == DNN_TARGET_MYRIAD || preferableTarget == DNN_TARGET_HDDL;
if (INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1) && isMyriad)
return dstRanges.size() == 4 && paddings[0].first == 0 && paddings[0].second == 0;
return (dstRanges.size() <= 4 || !isArmComputePlugin());
}
#endif
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_HALIDE && haveHalide() && dstRanges.size() == 4);
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
if (paddingType == "constant")
{
if (inputs_arr.depth() == CV_16S)
{
std::vector<float> paddingValue_fp32(1, paddingValue);
std::vector<int16_t> paddingValue_fp16(1);
cv::convertFp16(paddingValue_fp32, paddingValue_fp16);
outputs[0].setTo(paddingValue_fp16[0]);
}
else if (inputs_arr.depth() == CV_8S)
outputs[0].setTo(saturate_cast<int8_t>(paddingValue));
else
outputs[0].setTo(paddingValue);
inputs[0].copyTo(outputs[0](dstRanges));
}
else if (paddingType == "reflect")
{
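// Reflection padding mirrors the border without repeating the edge pixel
// (BORDER_REFLECT_101): a row |1 2 3| padded by 2 on the left becomes |3 2 1 2 3|.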
CV_Assert(inputs.size() == 1);
CV_Assert(outputs.size() == 1);
CV_Assert(inputs[0].dims == 4);
CV_Assert(outputs[0].dims == 4);
if (inputs[0].size[0] != outputs[0].size[0] || inputs[0].size[1] != outputs[0].size[1])
CV_Error(Error::StsNotImplemented, "Only spatial reflection padding is supported.");
const int inpHeight = inputs[0].size[2];
const int inpWidth = inputs[0].size[3];
const int outHeight = outputs[0].size[2];
const int outWidth = outputs[0].size[3];
const int padTop = dstRanges[2].start;
const int padBottom = outHeight - dstRanges[2].end;
const int padLeft = dstRanges[3].start;
const int padRight = outWidth - dstRanges[3].end;
CV_CheckLT(padTop, inpHeight, ""); CV_CheckLT(padBottom, inpHeight, "");
CV_CheckLT(padLeft, inpWidth, ""); CV_CheckLT(padRight, inpWidth, "");
for (size_t n = 0; n < inputs[0].size[0]; ++n)
{
for (size_t ch = 0; ch < inputs[0].size[1]; ++ch)
{
copyMakeBorder(getPlane(inputs[0], n, ch),
getPlane(outputs[0], n, ch),
padTop, padBottom, padLeft, padRight,
BORDER_REFLECT_101);
}
}
}
else
CV_Error(Error::StsNotImplemented, "Unknown padding type: " + paddingType);
}
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
cuda4dnn::PaddingType ptype;
if (paddingType == "constant")
ptype = PaddingType::CONSTANT;
else if (paddingType == "reflect")
ptype = PaddingType::REFLECTION101;
else
CV_Error(Error::StsNotImplemented, "Unsupported padding mode");
return make_cuda_node<cuda4dnn::PaddingOp>(preferableTarget, std::move(context->stream), ptype, paddingValue, dstRanges);
}
#endif
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
#ifdef HAVE_HALIDE
int inW, inH, inC, inN;
int minN = std::max(dstRanges[0].start, 0);
int minC = std::max(dstRanges[1].start, 0);
int minY = std::max(dstRanges[2].start, 0);
int minX = std::max(dstRanges[3].start, 0);
Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
getCanonicalSize(inputBuffer, &inW, &inH, &inC, &inN);
Halide::Var x("x"), y("y"), c("c"), n("n");
Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
Halide::Func padded =
Halide::BoundaryConditions::constant_exterior(inputBuffer, paddingValue);
top(x, y, c, n) = padded(x - minX, y - minY, c - minC, n - minN);
return Ptr<BackendNode>(new HalideBackendNode(top));
#endif // HAVE_HALIDE
return Ptr<BackendNode>();
}
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
InferenceEngine::Builder::Layer ieLayer(name);
ieLayer.setName(name);
ieLayer.setType("Pad");
std::vector<int> begins(paddings.size(), 0), ends(paddings.size(), 0);
for (int i = 0; i < paddings.size(); ++i)
{
begins[i] = paddings[i].first;
ends[i] = paddings[i].second;
}
ieLayer.getParameters()["pads_begin"] = begins;
ieLayer.getParameters()["pads_end"] = ends;
ieLayer.getParameters()["pad_mode"] = paddingType;
if (paddingType == "constant")
ieLayer.getParameters()["pad_value"] = paddingValue;
ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(1));
ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
}
#endif
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
std::vector<int64_t> begins(paddings.size(), 0), ends(paddings.size(), 0);
for (int i = 0; i < paddings.size(); ++i)
{
begins[i] = static_cast<int64_t>(paddings[i].first);
ends[i] = static_cast<int64_t>(paddings[i].second);
}
auto padding_below = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{begins.size()}, begins.data());
auto padding_above = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{ends.size()}, ends.data());
auto pad_mode = paddingType == "constant" ? ngraph::op::PadMode::CONSTANT : ngraph::op::PadMode::REFLECT; // SYMMETRIC
auto arg_pad_value = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{}, &paddingValue);
auto pad = paddingType == "constant" ?
std::make_shared<ngraph::op::v1::Pad>(ieInpNode, padding_below, padding_above, arg_pad_value, pad_mode) :
std::make_shared<ngraph::op::v1::Pad>(ieInpNode, padding_below, padding_above, pad_mode);
return Ptr<BackendNode>(new InfEngineNgraphNode(pad));
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
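// Requantize the padding value into the output domain:
// q = zero_point + round(value / scale), matching the output quantization.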
float outputScale = scales[1][0];
int outputZp = zeropoints[1][0];
float padValue = outputZp + std::round(params.get<float>("value", 0)/outputScale);
params.set("value", padValue);
return true;
}
private:
std::vector<std::pair<int, int> > paddings; // Pairs pad before, pad after.
std::vector<Range> dstRanges;
int inputDims;
float paddingValue;
std::string paddingType;
};
Ptr<PaddingLayer> PaddingLayer::create(const LayerParams &params)
{
return Ptr<PaddingLayer>(new PaddingLayerImpl(params));
}
}
}

View File

@ -0,0 +1,494 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include "../op_vkcom.hpp"
#include <float.h>
#include <algorithm>
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/permute.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class PermuteLayerImpl CV_FINAL : public PermuteLayer
{
public:
void checkNeedForPermutation()
{
_needsPermute = false;
for (size_t i = 0; i < _numAxes; ++i)
{
if (_order[i] != i)
{
_needsPermute = true;
break;
}
}
}
PermuteLayerImpl(const LayerParams &params)
: _count(0), _needsPermute(false), _numAxes(0)
{
if (!params.has("order"))
{
return;
}
DictValue paramOrder = params.get("order");
_numAxes = paramOrder.size();
for (size_t i = 0; i < _numAxes; i++)
{
int currentOrder = paramOrder.get<int>(i);
if (currentOrder < 0 || currentOrder > (int)_numAxes - 1)
{
CV_Error(Error::StsBadArg,
format("Orders of dimensions in Permute layer parameter "
"must be in [0...%zu]", _numAxes - 1));
}
if (std::find(_order.begin(), _order.end(), currentOrder) != _order.end())
{
CV_Error(Error::StsBadArg,
"Permute layer parameter contains duplicated orders.");
}
_order.push_back(currentOrder);
}
setParamsFrom(params);
checkNeedForPermutation();
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
#ifdef HAVE_INF_ENGINE
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && preferableTarget == DNN_TARGET_CPU)
return _order.size() <= 4 || !isArmComputePlugin();
#endif
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_CUDA ||
((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && haveInfEngine()) ||
(backendId == DNN_BACKEND_VKCOM && haveVulkan());
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
if(!_needsPermute)
{
Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
return true;
}
CV_Assert(inputs.size() > 0);
CV_Assert((int)_numAxes == inputs[0].size());
MatShape shapeBefore = inputs[0], shapeAfter;
for (size_t i = 0; i < _numAxes; i++)
{
shapeAfter.push_back(shapeBefore[_order[i]]);
}
outputs.clear();
for (size_t i = 0; i < inputs.size(); i++)
{
CV_Assert(total(inputs[i]) == total(shapeAfter));
outputs.push_back(shapeAfter);
}
return false;
}
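// Strides are element counts, not bytes. For example, permuting a 2x3x4x5
// blob with order {0, 2, 3, 1} yields a 2x4x5x3 output with old strides
// {60, 20, 5, 1} and new strides {60, 15, 3, 1}.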
void computeStrides(const MatShape &shapeBefore, const MatShape &shapeAfter)
{
_oldStride.resize(_numAxes);
_newStride.resize(_numAxes);
_oldStride[_numAxes - 1] = 1;
_newStride[_numAxes - 1] = 1;
for(int i = _numAxes - 2; i >= 0; i--)
{
_oldStride[i] = _oldStride[i + 1] * shapeBefore[i + 1];
_newStride[i] = _newStride[i + 1] * shapeAfter[i + 1];
}
_count = _oldStride[0] * shapeBefore[0];
}
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
{
if(!_needsPermute)
{
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
CV_Assert(inputs.size() > 0);
const Mat& inp0 = inputs[0];
CV_Assert((int)_numAxes == inp0.dims);
computeStrides(shape(inputs[0]), shape(outputs[0]));
#ifdef HAVE_OPENCL
uorder.release();
uold_stride.release();
unew_stride.release();
#endif
}
template <class T>
class PermuteInvoker : public ParallelLoopBody
{
public:
const Mat* inp;
Mat* out;
const std::vector<size_t>* order;
int nstripes;
static void run(const Mat& inp, Mat& out, const std::vector<size_t>& order, int nstripes)
{
PermuteInvoker p;
p.inp = &inp;
p.out = &out;
p.order = &order;
p.nstripes = nstripes;
CV_Assert( out.size[0] == inp.size[order[0]] &&
out.size[1] == inp.size[order[1]] &&
out.size[2] == inp.size[order[2]] &&
out.size[3] == inp.size[order[3]]);
parallel_for_(Range(0, nstripes), p, nstripes);
}
PermuteInvoker() : inp(0), out(0), order(0), nstripes(0) {}
void operator()(const Range& r) const CV_OVERRIDE
{
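// Each stripe covers a contiguous range of the n0*n1*n2 output rows; the
// innermost output dimension n3 is copied element-wise with the input stride
// taken from the permuted axis order.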
int n0 = out->size[0], n1 = out->size[1], n2 = out->size[2], n3 = out->size[3];
size_t orows = (size_t)n0*n1*n2;
size_t stripeSize = (orows + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = std::min(r.end*stripeSize, orows);
const size_t esz = sizeof(T);
size_t ostep0 = out->step[0]/esz, ostep1 = out->step[1]/esz, ostep2 = out->step[2]/esz;
const size_t* ord = &order->at(0);
size_t istep0 = inp->step[ord[0]]/esz, istep1 = inp->step[ord[1]]/esz,
istep2 = inp->step[ord[2]]/esz, istep3 = inp->step[ord[3]]/esz;
size_t val = stripeStart;
int i2 = (int)(val % n2);
val /= n2;
int i1 = (int)(val % n1);
int i0 = (int)(val / n1);
const T* inptr_orig = inp->ptr<T>();
T* outptr_orig = out->ptr<T>();
for( size_t ofs = stripeStart; ofs < stripeEnd; ofs++ )
{
const T* inptr = inptr_orig + i0*istep0 + i1*istep1 + i2*istep2;
T* outptr = outptr_orig + i0*ostep0 + i1*ostep1 + i2*ostep2;
for( int i3 = 0; i3 < n3; i3++ )
outptr[i3] = inptr[i3*istep3];
if( ++i2 >= n2 )
{
i2 = 0;
if( ++i1 >= n1 )
{
i1 = 0;
if( ++i0 >= n0 )
break;
}
}
}
}
};
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
if (!_needsPermute)
return false;
if (uorder.empty())
{
std::vector<int> orderVec(_order.begin(), _order.end());
Mat morder(1, orderVec.size(), CV_32SC1, &orderVec[0]);
std::vector<int> oldStrideVec(_oldStride.begin(), _oldStride.end());
Mat mold_stride(1, oldStrideVec.size(), CV_32SC1, &oldStrideVec[0]);
std::vector<int> newStrideVec(_newStride.begin(), _newStride.end());
Mat mnew_stride(1, newStrideVec.size(), CV_32SC1, &newStrideVec[0]);
morder.copyTo(uorder);
mold_stride.copyTo(uold_stride);
mnew_stride.copyTo(unew_stride);
}
bool use_half = (inps.depth() == CV_16S);
String opts = format("-DDtype=%s", use_half ? "half" : "float");
for (size_t i = 0; i < inputs.size(); i++)
{
ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc, opts);
kernel.set(0, (int)_count);
kernel.set(1, ocl::KernelArg::PtrReadOnly(inputs[i]));
kernel.set(2, ocl::KernelArg::PtrReadOnly(uorder));
kernel.set(3, ocl::KernelArg::PtrReadOnly(uold_stride));
kernel.set(4, ocl::KernelArg::PtrReadOnly(unew_stride));
kernel.set(5, (int)_numAxes);
kernel.set(6, ocl::KernelArg::PtrWriteOnly(outputs[i]));
if (!kernel.run(1, &_count, NULL, false))
return false;
}
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
inputs_arr.depth() != CV_8S,
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
size_t k, ninputs = inputs.size();
if(!_needsPermute)
{
for (k = 0; k < ninputs; k++)
{
CV_Assert(outputs[k].total() == inputs[k].total());
if (outputs[k].data != inputs[k].data)
inputs[k].copyTo(outputs[k]);
}
}
else
{
size_t i, j, count = _count, numAxes = _numAxes;
const size_t* newStride = &_newStride[0];
const size_t* oldStride = &_oldStride[0];
const size_t* order = &_order[0];
for (k = 0; k < ninputs; k++)
{
const Mat& inp = inputs[k];
Mat& out = outputs[k];
CV_Assert(inp.dims == numAxes && inp.size == inputs[0].size);
CV_Assert(out.dims == numAxes && out.size == outputs[0].size);
CV_Assert(inp.isContinuous() && out.isContinuous());
// CV_Assert(inp.type() == CV_32F && out.type() == CV_32F);
if( numAxes == 4 )
{
int nstripes = getNumThreads();
if (inp.type() == CV_8S)
PermuteInvoker<int8_t>::run(inp, out, _order, nstripes);
else
PermuteInvoker<float>::run(inp, out, _order, nstripes);
}
else
{
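// Generic N-D permutation: decompose the flat output index using the new
// strides into N-D coordinates, then recompose a flat input index from the
// old strides of the permuted axes.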
if (inp.type() == CV_8S)
{
const int8_t *srcData = inp.ptr<int8_t>();
int8_t *dstData = out.ptr<int8_t>();
for (i = 0; i < count; ++i)
{
size_t oldPosition = 0;
size_t newPosition = i;
for (j = 0; j < numAxes; ++j)
{
oldPosition += (newPosition / newStride[j]) * oldStride[order[j]];
newPosition %= newStride[j];
}
dstData[i] = srcData[oldPosition];
}
}
else
{
const float *srcData = inp.ptr<float>();
float *dstData = out.ptr<float>();
for (i = 0; i < count; ++i)
{
size_t oldPosition = 0;
size_t newPosition = i;
for (j = 0; j < numAxes; ++j)
{
oldPosition += (newPosition / newStride[j]) * oldStride[order[j]];
newPosition %= newStride[j];
}
dstData[i] = srcData[oldPosition];
}
}
}
}
}
}
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
InferenceEngine::Builder::PermuteLayer ieLayer(name);
ieLayer.setOrder(_order);
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
}
#endif // HAVE_DNN_IE_NN_BUILDER_2019
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
std::vector<int64_t> order(_order.begin(), _order.end());
auto tr_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
ngraph::Shape({order.size()}), order.data());
auto transpose = std::make_shared<ngraph::op::Transpose>(ieInpNode, tr_axes);
return Ptr<BackendNode>(new InfEngineNgraphNode(transpose));
}
#endif // HAVE_DNN_NGRAPH
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
return make_cuda_node<cuda4dnn::PermuteOp>(preferableTarget, std::move(context->stream), _order);
}
#endif
#ifdef HAVE_VULKAN
virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
{
CV_Assert(!_order.empty());
std::shared_ptr<vkcom::OpBase> op(new vkcom::OpPermute(_order));
return Ptr<BackendNode>(new VkComBackendNode(input, op));
}
#endif // HAVE_VULKAN
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
size_t _count;
std::vector<size_t> _order;
std::vector<int> _oldDimensionSize;
std::vector<int> _newDimensionSize;
std::vector<size_t> _oldStride;
std::vector<size_t> _newStride;
bool _needsPermute;
#ifdef HAVE_OPENCL
UMat uorder, uold_stride, unew_stride;
#endif
size_t _numAxes;
};
Ptr<PermuteLayer> PermuteLayer::create(const LayerParams &params)
{
return Ptr<PermuteLayer>(new PermuteLayerImpl(params));
}
}
}

File diff suppressed because it is too large

View File

@ -0,0 +1,765 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#ifdef HAVE_DNN_NGRAPH
#include "../ie_ngraph.hpp"
#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
#include <ngraph/op/prior_box.hpp>
#include <ngraph/op/prior_box_clustered.hpp>
#else
#include <ngraph/op/experimental/layers/prior_box.hpp>
#include <ngraph/op/experimental/layers/prior_box_clustered.hpp>
#endif
#endif
#include "../op_vkcom.hpp"
#include <float.h>
#include <algorithm>
#include <cmath>
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/prior_box.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class PriorBoxLayerImpl CV_FINAL : public PriorBoxLayer
{
public:
static bool getParameterDict(const LayerParams &params,
const std::string &parameterName,
DictValue& result)
{
if (!params.has(parameterName))
{
return false;
}
result = params.get(parameterName);
return true;
}
template<typename T>
T getParameter(const LayerParams &params,
const std::string &parameterName,
const size_t &idx=0,
const bool required=true,
const T& defaultValue=T())
{
DictValue dictValue;
bool success = getParameterDict(params, parameterName, dictValue);
if(!success)
{
if(required)
{
std::string message = _layerName;
message += " layer parameter does not contain ";
message += parameterName;
message += " parameter.";
CV_Error(Error::StsBadArg, message);
}
else
{
return defaultValue;
}
}
return dictValue.get<T>(idx);
}
void getAspectRatios(const LayerParams &params)
{
DictValue aspectRatioParameter;
bool aspectRatioRetrieved = getParameterDict(params, "aspect_ratio", aspectRatioParameter);
if (!aspectRatioRetrieved)
return;
for (int i = 0; i < aspectRatioParameter.size(); ++i)
{
float aspectRatio = aspectRatioParameter.get<float>(i);
bool alreadyExists = fabs(aspectRatio - 1.f) < 1e-6f;
for (size_t j = 0; j < _aspectRatios.size() && !alreadyExists; ++j)
{
alreadyExists = fabs(aspectRatio - _aspectRatios[j]) < 1e-6;
}
if (!alreadyExists)
{
_aspectRatios.push_back(aspectRatio);
if (_flip)
{
_aspectRatios.push_back(1./aspectRatio);
}
}
}
}
static void getParams(const std::string& name, const LayerParams &params,
std::vector<float>* values)
{
DictValue dict;
if (getParameterDict(params, name, dict))
{
values->resize(dict.size());
for (int i = 0; i < dict.size(); ++i)
{
(*values)[i] = dict.get<float>(i);
}
}
else
values->clear();
}
void getVariance(const LayerParams &params)
{
DictValue varianceParameter;
bool varianceParameterRetrieved = getParameterDict(params, "variance", varianceParameter);
CV_Assert(varianceParameterRetrieved);
int varianceSize = varianceParameter.size();
if (varianceSize > 1)
{
// Exactly four variance values must be provided.
CV_Assert(varianceSize == 4);
for (int i = 0; i < varianceSize; ++i)
{
float variance = varianceParameter.get<float>(i);
CV_Assert(variance > 0);
_variance.push_back(variance);
}
}
else
{
if (varianceSize == 1)
{
float variance = varianceParameter.get<float>(0);
CV_Assert(variance > 0);
_variance.push_back(variance);
}
else
{
// Set default to 0.1.
_variance.push_back(0.1f);
}
}
}
PriorBoxLayerImpl(const LayerParams &params)
{
setParamsFrom(params);
_flip = getParameter<bool>(params, "flip", 0, false, true);
_clip = getParameter<bool>(params, "clip", 0, false, true);
_bboxesNormalized = getParameter<bool>(params, "normalized_bbox", 0, false, true);
getParams("min_size", params, &_minSize);
getAspectRatios(params);
getVariance(params);
if (params.has("max_size"))
{
getParams("max_size", params, &_maxSize);
CV_Assert(_minSize.size() == _maxSize.size());
for (int i = 0; i < _maxSize.size(); i++)
CV_Assert(_minSize[i] < _maxSize[i]);
}
std::vector<float> widths, heights;
getParams("width", params, &widths);
getParams("height", params, &heights);
_explicitSizes = !widths.empty();
CV_Assert(widths.size() == heights.size());
if (_explicitSizes)
{
CV_Assert(_aspectRatios.empty());
CV_Assert(!params.has("min_size"));
CV_Assert(!params.has("max_size"));
_boxWidths = widths;
_boxHeights = heights;
}
else
{
CV_Assert(!_minSize.empty());
for (int i = 0; i < _minSize.size(); ++i)
{
float minSize = _minSize[i];
CV_Assert(minSize > 0);
_boxWidths.push_back(minSize);
_boxHeights.push_back(minSize);
if (_maxSize.size() > 0)
{
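// Add one extra prior whose scale is the geometric mean of the min and max
// sizes, following the SSD prior-box scheme.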
float size = sqrt(minSize * _maxSize[i]);
_boxWidths.push_back(size);
_boxHeights.push_back(size);
}
// rest of priors
for (size_t r = 0; r < _aspectRatios.size(); ++r)
{
float arSqrt = sqrt(_aspectRatios[r]);
_boxWidths.push_back(minSize * arSqrt);
_boxHeights.push_back(minSize / arSqrt);
}
}
}
CV_Assert(_boxWidths.size() == _boxHeights.size());
_numPriors = _boxWidths.size();
if (params.has("step_h") || params.has("step_w")) {
CV_Assert(!params.has("step"));
_stepY = getParameter<float>(params, "step_h");
CV_Assert(_stepY > 0.);
_stepX = getParameter<float>(params, "step_w");
CV_Assert(_stepX > 0.);
} else if (params.has("step")) {
const float step = getParameter<float>(params, "step");
CV_Assert(step > 0);
_stepY = step;
_stepX = step;
} else {
_stepY = 0;
_stepX = 0;
}
if (params.has("offset_h") || params.has("offset_w"))
{
CV_Assert_N(!params.has("offset"), params.has("offset_h"), params.has("offset_w"));
getParams("offset_h", params, &_offsetsY);
getParams("offset_w", params, &_offsetsX);
CV_Assert(_offsetsX.size() == _offsetsY.size());
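// Shifted anchor grids densify the priors: with k > 1 offset pairs the
// number of priors per location is multiplied by 2 * (k - 1).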
_numPriors *= std::max((size_t)1, 2 * (_offsetsX.size() - 1));
}
else
{
float offset = getParameter<float>(params, "offset", 0, false, 0.5);
_offsetsX.assign(1, offset);
_offsetsY.assign(1, offset);
}
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
#ifdef HAVE_DNN_NGRAPH
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
return _explicitSizes || _stepX == _stepY;
#endif
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && haveInfEngine() &&
( _explicitSizes || (_minSize.size() == 1 && _maxSize.size() <= 1)))
|| (backendId == DNN_BACKEND_VKCOM && haveVulkan());
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(!inputs.empty());
int layerHeight = inputs[0][2];
int layerWidth = inputs[0][3];
// Since all images in a batch have the same height and width, we only need to
// generate one set of priors which can be shared across all images.
size_t outNum = 1;
// 2 channels. First channel stores the mean of each prior coordinate.
// Second channel stores the variance of each prior coordinate.
size_t outChannels = 2;
outputs.resize(1, shape(outNum, outChannels,
layerHeight * layerWidth * _numPriors * 4));
return false;
}
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
{
std::vector<Mat> inputs;
inputs_arr.getMatVector(inputs);
CV_CheckGT(inputs.size(), (size_t)1, "");
CV_CheckEQ(inputs[0].dims, 4, ""); CV_CheckEQ(inputs[1].dims, 4, "");
int layerWidth = inputs[0].size[3];
int layerHeight = inputs[0].size[2];
int imageWidth = inputs[1].size[3];
int imageHeight = inputs[1].size[2];
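// Default step is the ratio between the image size and the feature map size,
// so priors are laid out on a grid aligned with the feature map cells.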
_stepY = _stepY == 0 ? (static_cast<float>(imageHeight) / layerHeight) : _stepY;
_stepX = _stepX == 0 ? (static_cast<float>(imageWidth) / layerWidth) : _stepX;
}
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
bool use_half = (inps.depth() == CV_16S);
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
int _layerWidth = inputs[0].size[3];
int _layerHeight = inputs[0].size[2];
int _imageWidth = inputs[1].size[3];
int _imageHeight = inputs[1].size[2];
if (umat_offsetsX.empty())
{
Mat offsetsX(1, _offsetsX.size(), CV_32FC1, &_offsetsX[0]);
Mat offsetsY(1, _offsetsY.size(), CV_32FC1, &_offsetsY[0]);
Mat variance(1, _variance.size(), CV_32FC1, &_variance[0]);
Mat widths(1, _boxWidths.size(), CV_32FC1, &_boxWidths[0]);
Mat heights(1, _boxHeights.size(), CV_32FC1, &_boxHeights[0]);
offsetsX.copyTo(umat_offsetsX);
offsetsY.copyTo(umat_offsetsY);
variance.copyTo(umat_variance);
widths.copyTo(umat_widths);
heights.copyTo(umat_heights);
}
String opts;
if (use_half)
opts = "-DDtype=half -DDtype4=half4 -Dconvert_T=convert_half4";
else
opts = "-DDtype=float -DDtype4=float4 -Dconvert_T=convert_float4";
size_t nthreads = _layerHeight * _layerWidth;
ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc, opts);
kernel.set(0, (int)nthreads);
kernel.set(1, (float)_stepX);
kernel.set(2, (float)_stepY);
kernel.set(3, ocl::KernelArg::PtrReadOnly(umat_offsetsX));
kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_offsetsY));
kernel.set(5, (int)_offsetsX.size());
kernel.set(6, ocl::KernelArg::PtrReadOnly(umat_widths));
kernel.set(7, ocl::KernelArg::PtrReadOnly(umat_heights));
kernel.set(8, (int)_boxWidths.size());
kernel.set(9, ocl::KernelArg::PtrWriteOnly(outputs[0]));
kernel.set(10, (int)_layerHeight);
kernel.set(11, (int)_layerWidth);
kernel.set(12, (int)_imageHeight);
kernel.set(13, (int)_imageWidth);
kernel.run(1, &nthreads, NULL, false);
// clip the prior's coordinate such that it is within [0, 1]
if (_clip)
{
ocl::Kernel kernel("clip", ocl::dnn::prior_box_oclsrc, opts);
size_t nthreads = _layerHeight * _layerWidth * _numPriors * 4;
if (!kernel.args((int)nthreads, ocl::KernelArg::PtrReadWrite(outputs[0]))
.run(1, &nthreads, NULL, false))
return false;
}
// set the variance.
{
ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc, opts);
int offset = total(shape(outputs[0]), 2);
size_t nthreads = _layerHeight * _layerWidth * _numPriors;
kernel.set(0, (int)nthreads);
kernel.set(1, (int)offset);
kernel.set(2, (int)_variance.size());
kernel.set(3, ocl::KernelArg::PtrReadOnly(umat_variance));
kernel.set(4, ocl::KernelArg::PtrWriteOnly(outputs[0]));
if (!kernel.run(1, &nthreads, NULL, false))
return false;
}
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
CV_Assert(inputs.size() == 2);
int _layerWidth = inputs[0].size[3];
int _layerHeight = inputs[0].size[2];
int _imageWidth = inputs[1].size[3];
int _imageHeight = inputs[1].size[2];
float* outputPtr = outputs[0].ptr<float>();
float _boxWidth, _boxHeight;
for (size_t h = 0; h < _layerHeight; ++h)
{
for (size_t w = 0; w < _layerWidth; ++w)
{
for (size_t i = 0; i < _boxWidths.size(); ++i)
{
_boxWidth = _boxWidths[i];
_boxHeight = _boxHeights[i];
for (int j = 0; j < _offsetsX.size(); ++j)
{
float center_x = (w + _offsetsX[j]) * _stepX;
float center_y = (h + _offsetsY[j]) * _stepY;
outputPtr = addPrior(center_x, center_y, _boxWidth, _boxHeight, _imageWidth,
_imageHeight, _bboxesNormalized, outputPtr);
}
}
}
}
// clip the prior's coordinate such that it is within [0, 1]
if (_clip)
{
int _outChannelSize = _layerHeight * _layerWidth * _numPriors * 4;
outputPtr = outputs[0].ptr<float>();
for (size_t d = 0; d < _outChannelSize; ++d)
{
outputPtr[d] = std::min<float>(std::max<float>(outputPtr[d], 0.), 1.);
}
}
// set the variance.
outputPtr = outputs[0].ptr<float>(0, 1);
if(_variance.size() == 1)
{
Mat secondChannel(1, outputs[0].size[2], CV_32F, outputPtr);
secondChannel.setTo(Scalar::all(_variance[0]));
}
else
{
int count = 0;
for (size_t h = 0; h < _layerHeight; ++h)
{
for (size_t w = 0; w < _layerWidth; ++w)
{
for (size_t i = 0; i < _numPriors; ++i)
{
for (int j = 0; j < 4; ++j)
{
outputPtr[count] = _variance[j];
++count;
}
}
}
}
}
}
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
if (_explicitSizes)
{
InferenceEngine::Builder::PriorBoxClusteredLayer ieLayer(name);
ieLayer.setSteps({_stepY, _stepX});
CV_CheckEQ(_offsetsX.size(), (size_t)1, ""); CV_CheckEQ(_offsetsY.size(), (size_t)1, ""); CV_CheckEQ(_offsetsX[0], _offsetsY[0], "");
ieLayer.setOffset(_offsetsX[0]);
ieLayer.setClip(_clip);
ieLayer.setFlip(false); // We already flipped aspect ratios.
InferenceEngine::Builder::Layer l = ieLayer;
CV_Assert_N(!_boxWidths.empty(), !_boxHeights.empty(), !_variance.empty());
CV_Assert(_boxWidths.size() == _boxHeights.size());
l.getParameters()["width"] = _boxWidths;
l.getParameters()["height"] = _boxHeights;
l.getParameters()["variance"] = _variance;
return Ptr<BackendNode>(new InfEngineBackendNode(l));
}
else
{
InferenceEngine::Builder::PriorBoxLayer ieLayer(name);
CV_Assert(!_explicitSizes);
ieLayer.setMinSize(_minSize[0]);
if (!_maxSize.empty())
ieLayer.setMaxSize(_maxSize[0]);
CV_CheckEQ(_offsetsX.size(), (size_t)1, ""); CV_CheckEQ(_offsetsY.size(), (size_t)1, ""); CV_CheckEQ(_offsetsX[0], _offsetsY[0], "");
ieLayer.setOffset(_offsetsX[0]);
ieLayer.setClip(_clip);
ieLayer.setFlip(false); // We already flipped aspect ratios.
InferenceEngine::Builder::Layer l = ieLayer;
if (_stepX == _stepY)
{
l.getParameters()["step"] = _stepX;
l.getParameters()["step_h"] = 0.0f;
l.getParameters()["step_w"] = 0.0f;
}
else
{
l.getParameters()["step"] = 0.0f;
l.getParameters()["step_h"] = _stepY;
l.getParameters()["step_w"] = _stepX;
}
if (!_aspectRatios.empty())
{
l.getParameters()["aspect_ratio"] = _aspectRatios;
}
CV_Assert(!_variance.empty());
l.getParameters()["variance"] = _variance;
return Ptr<BackendNode>(new InfEngineBackendNode(l));
}
}
#endif // HAVE_DNN_IE_NN_BUILDER_2019
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
CV_Assert(nodes.size() == 2);
auto layer = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
auto image = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
auto layer_shape = std::make_shared<ngraph::op::ShapeOf>(layer);
auto image_shape = std::make_shared<ngraph::op::ShapeOf>(image);
auto lower_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{1}, std::vector<int64_t>{2});
auto upper_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{1}, std::vector<int64_t>{4});
auto strides = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{1}, std::vector<int64_t>{1});
auto slice_layer = std::make_shared<ngraph::op::v1::StridedSlice>(layer_shape,
lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});
auto slice_image = std::make_shared<ngraph::op::v1::StridedSlice>(image_shape,
lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});
if (_explicitSizes)
{
CV_Assert_N(!_boxWidths.empty(), !_boxHeights.empty(), !_variance.empty());
CV_Assert(_boxWidths.size() == _boxHeights.size());
ngraph::op::PriorBoxClusteredAttrs attrs;
attrs.widths = _boxWidths;
attrs.heights = _boxHeights;
attrs.clip = _clip;
CV_CheckEQ(_offsetsX.size(), (size_t)1, ""); CV_CheckEQ(_offsetsY.size(), (size_t)1, ""); CV_CheckEQ(_offsetsX[0], _offsetsY[0], "");
attrs.offset = _offsetsX[0];
attrs.step_heights = _stepY;
attrs.step_widths = _stepX;
attrs.variances = _variance;
auto priorBox = std::make_shared<ngraph::op::PriorBoxClustered>(slice_layer, slice_image, attrs);
auto axis = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{1}, std::vector<int64_t>{0});
auto unsqueeze = std::make_shared<ngraph::op::v0::Unsqueeze>(priorBox, axis);
return Ptr<BackendNode>(new InfEngineNgraphNode(unsqueeze));
}
else
{
ngraph::op::PriorBoxAttrs attrs;
attrs.min_size = _minSize;
attrs.max_size = _maxSize;
// ngraph PriorBox does not accept an empty aspect_ratio list, so default to {1.0f}
attrs.aspect_ratio = !_aspectRatios.empty()? _aspectRatios : std::vector<float>{1.0f};
attrs.clip = _clip;
attrs.flip = false;
attrs.variance = _variance;
CV_CheckEQ(_offsetsX.size(), (size_t)1, ""); CV_CheckEQ(_offsetsY.size(), (size_t)1, ""); CV_CheckEQ(_offsetsX[0], _offsetsY[0], "");
attrs.offset = _offsetsX[0];
attrs.step = _stepX;
attrs.scale_all_sizes = !_aspectRatios.empty();
auto priorBox = std::make_shared<ngraph::op::PriorBox>(slice_layer, slice_image, attrs);
auto axis = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{1}, std::vector<int64_t>{0});
auto unsqueeze = std::make_shared<ngraph::op::v0::Unsqueeze>(priorBox, axis);
return Ptr<BackendNode>(new InfEngineNgraphNode(unsqueeze));
}
}
#endif // HAVE_DNN_NGRAPH
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
auto feature_map_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
auto feature_map_shape = feature_map_wrapper->getShape();
auto image_wrapper = inputs[1].dynamicCast<CUDABackendWrapper>();
auto image_shape = image_wrapper->getShape();
PriorBoxConfiguration config;
config.feature_map_width = feature_map_shape.rbegin()[0];
config.feature_map_height = feature_map_shape.rbegin()[1];
config.image_width = image_shape.rbegin()[0];
config.image_height = image_shape.rbegin()[1];
config.num_priors = _numPriors;
config.box_widths = _boxWidths;
config.box_heights = _boxHeights;
config.offsets_x = _offsetsX;
config.offsets_y = _offsetsY;
config.stepX = _stepX;
config.stepY = _stepY;
config.variance = _variance;
config.clip = _clip;
config.normalize = _bboxesNormalized;
return make_cuda_node<cuda4dnn::PriorBoxOp>(preferableTarget, std::move(context->stream), config);
}
#endif
#ifdef HAVE_VULKAN
virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
{
std::shared_ptr<vkcom::OpBase> op(new vkcom::OpPriorBox(_stepX, _stepY,
_clip, _numPriors,
_variance, _offsetsX,
_offsetsY, _boxWidths,
_boxHeights));
return Ptr<BackendNode>(new VkComBackendNode(input, op));
}
#endif // HAVE_VULKAN
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
CV_UNUSED(outputs); // suppress unused variable warning
long flops = 0;
for (int i = 0; i < inputs.size(); i++)
{
flops += total(inputs[i], 2) * _numPriors * 4;
}
return flops;
}
private:
std::vector<float> _minSize;
std::vector<float> _maxSize;
float _stepX, _stepY;
std::vector<float> _aspectRatios;
std::vector<float> _variance;
std::vector<float> _offsetsX;
std::vector<float> _offsetsY;
// Precomputed final widths and heights based on aspect ratios or explicit sizes.
std::vector<float> _boxWidths;
std::vector<float> _boxHeights;
#ifdef HAVE_OPENCL
UMat umat_offsetsX;
UMat umat_offsetsY;
UMat umat_widths;
UMat umat_heights;
UMat umat_variance;
#endif
bool _flip;
bool _clip;
bool _explicitSizes;
bool _bboxesNormalized;
size_t _numPriors;
static const size_t _numAxes = 4;
static const std::string _layerName;
static float* addPrior(float center_x, float center_y, float width, float height,
float imgWidth, float imgHeight, bool normalized, float* dst)
{
if (normalized)
{
dst[0] = (center_x - width * 0.5f) / imgWidth; // xmin
dst[1] = (center_y - height * 0.5f) / imgHeight; // ymin
dst[2] = (center_x + width * 0.5f) / imgWidth; // xmax
dst[3] = (center_y + height * 0.5f) / imgHeight; // ymax
}
else
{
dst[0] = center_x - width * 0.5f; // xmin
dst[1] = center_y - height * 0.5f; // ymin
dst[2] = center_x + width * 0.5f - 1.0f; // xmax
dst[3] = center_y + height * 0.5f - 1.0f; // ymax
}
return dst + 4;
}
};
const std::string PriorBoxLayerImpl::_layerName = std::string("PriorBox");
Ptr<PriorBoxLayer> PriorBoxLayer::create(const LayerParams &params)
{
return Ptr<PriorBoxLayer>(new PriorBoxLayerImpl(params));
}
}
}

View File

@ -0,0 +1,451 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_inf_engine.hpp"
#ifdef HAVE_DNN_NGRAPH
#include "../ie_ngraph.hpp"
#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
#include <ngraph/op/proposal.hpp>
#else
#include <ngraph/op/experimental/layers/proposal.hpp>
#endif
#endif
namespace cv { namespace dnn {
class ProposalLayerImpl CV_FINAL : public ProposalLayer
{
public:
ProposalLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
featStride = params.get<uint32_t>("feat_stride", 16);
baseSize = params.get<uint32_t>("base_size", 16);
// uint32_t minSize = params.get<uint32_t>("min_size", 16);
keepTopBeforeNMS = params.get<uint32_t>("pre_nms_topn", 6000);
keepTopAfterNMS = params.get<uint32_t>("post_nms_topn", 300);
nmsThreshold = params.get<float>("nms_thresh", 0.7);
ratios = params.get("ratio");
scales = params.get("scale");
{
LayerParams lp;
lp.set("step", featStride);
lp.set("flip", false);
lp.set("clip", false);
lp.set("normalized_bbox", false);
lp.set("offset", 0.5 * baseSize / featStride);
// Unused values.
float variance[] = {0.1f, 0.1f, 0.2f, 0.2f};
lp.set("variance", DictValue::arrayReal<float*>(&variance[0], 4));
// Compute widths and heights explicitly.
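// e.g. baseSize = 16, ratio = 0.5: width = floor(16 / sqrt(0.5) + 0.5) = 23,
// height = floor(23 * 0.5 + 0.5) = 12; with scale = 8 this gives a 184x96 anchor.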
std::vector<float> widths, heights;
widths.reserve(ratios.size() * scales.size());
heights.reserve(ratios.size() * scales.size());
for (int i = 0; i < ratios.size(); ++i)
{
float ratio = ratios.get<float>(i);
float width = std::floor(baseSize / sqrt(ratio) + 0.5f);
float height = std::floor(width * ratio + 0.5f);
for (int j = 0; j < scales.size(); ++j)
{
float scale = scales.get<float>(j);
widths.push_back(scale * width);
heights.push_back(scale * height);
}
}
lp.set("width", DictValue::arrayReal<float*>(&widths[0], widths.size()));
lp.set("height", DictValue::arrayReal<float*>(&heights[0], heights.size()));
priorBoxLayer = PriorBoxLayer::create(lp);
}
{
int order[] = {0, 2, 3, 1};
LayerParams lp;
lp.set("order", DictValue::arrayInt<int*>(&order[0], 4));
deltasPermute = PermuteLayer::create(lp);
scoresPermute = PermuteLayer::create(lp);
}
{
LayerParams lp;
lp.set("code_type", "CENTER_SIZE");
lp.set("num_classes", 1);
lp.set("share_location", true);
lp.set("background_label_id", 1); // We won't pass background scores so set it out of range [0, num_classes)
lp.set("variance_encoded_in_target", true);
lp.set("keep_top_k", keepTopAfterNMS);
lp.set("top_k", keepTopBeforeNMS);
lp.set("nms_threshold", nmsThreshold);
lp.set("normalized_bbox", false);
lp.set("clip", true);
detectionOutputLayer = DetectionOutputLayer::create(lp);
}
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
#ifdef HAVE_INF_ENGINE
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
{
bool isMyriad = preferableTarget == DNN_TARGET_MYRIAD || preferableTarget == DNN_TARGET_HDDL;
return !isMyriad;
}
#endif
return backendId == DNN_BACKEND_OPENCV;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
// We need to allocate the following blobs:
// - output priors from PriorBoxLayer
// - permuted priors
// - permuted scores
CV_Assert(inputs.size() == 3);
const MatShape& scores = inputs[0];
const MatShape& bboxDeltas = inputs[1];
std::vector<MatShape> layerInputs, layerOutputs, layerInternals;
// Prior boxes layer.
layerInputs.assign(1, scores);
priorBoxLayer->getMemoryShapes(layerInputs, 1, layerOutputs, layerInternals);
CV_Assert(layerOutputs.size() == 1);
CV_Assert(layerInternals.empty());
internals.push_back(layerOutputs[0]);
// Scores permute layer.
CV_Assert(scores.size() == 4);
MatShape objectScores = scores;
CV_Assert((scores[1] & 1) == 0); // Number of channels is even.
objectScores[1] /= 2;
layerInputs.assign(1, objectScores);
scoresPermute->getMemoryShapes(layerInputs, 1, layerOutputs, layerInternals);
CV_Assert(layerOutputs.size() == 1);
CV_Assert(layerInternals.empty());
internals.push_back(layerOutputs[0]);
// BBox predictions permute layer.
layerInputs.assign(1, bboxDeltas);
deltasPermute->getMemoryShapes(layerInputs, 1, layerOutputs, layerInternals);
CV_Assert(layerOutputs.size() == 1);
CV_Assert(layerInternals.empty());
internals.push_back(layerOutputs[0]);
// Detections layer.
internals.push_back(shape(1, 1, keepTopAfterNMS, 7));
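// outputs[0] holds [batchId, x1, y1, x2, y2] for each kept proposal;
// outputs[1] holds the corresponding objectness scores.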
outputs.resize(2);
outputs[0] = shape(keepTopAfterNMS, 5);
outputs[1] = shape(keepTopAfterNMS, 1);
return false;
}
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
{
std::vector<Mat> inputs;
inputs_arr.getMatVector(inputs);
std::vector<Mat> layerInputs;
std::vector<Mat> layerOutputs;
// Scores permute layer.
Mat scores = getObjectScores(inputs[0]);
layerInputs.assign(1, scores);
layerOutputs.assign(1, Mat(shape(scores.size[0], scores.size[2],
scores.size[3], scores.size[1]), CV_32FC1));
scoresPermute->finalize(layerInputs, layerOutputs);
// BBox predictions permute layer.
const Mat& bboxDeltas = inputs[1];
CV_Assert(bboxDeltas.dims == 4);
layerInputs.assign(1, bboxDeltas);
layerOutputs.assign(1, Mat(shape(bboxDeltas.size[0], bboxDeltas.size[2],
bboxDeltas.size[3], bboxDeltas.size[1]), CV_32FC1));
deltasPermute->finalize(layerInputs, layerOutputs);
}
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
std::vector<UMat> internals;
if (inputs_.depth() == CV_16S)
return false;
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals);
CV_Assert(inputs.size() == 3);
CV_Assert(internals.size() == 4);
const UMat& scores = inputs[0];
const UMat& bboxDeltas = inputs[1];
const UMat& imInfo = inputs[2];
UMat& priorBoxes = internals[0];
UMat& permuttedScores = internals[1];
UMat& permuttedDeltas = internals[2];
UMat& detections = internals[3];
CV_Assert(imInfo.total() >= 2);
// The fake image blob uses CV_8UC1, the smallest type, since only its shape matters.
Mat szMat;
imInfo.copyTo(szMat);
int rows = (int)szMat.at<float>(0);
int cols = (int)szMat.at<float>(1);
umat_fakeImageBlob.create(shape(1, 1, rows, cols), CV_8UC1);
umat_fakeImageBlob.setTo(0);
// Generate prior boxes.
std::vector<UMat> layerInputs(2), layerOutputs(1, priorBoxes);
layerInputs[0] = scores;
layerInputs[1] = umat_fakeImageBlob;
priorBoxLayer->forward(layerInputs, layerOutputs, internals);
// Permute scores.
layerInputs.assign(1, getObjectScores(scores));
layerOutputs.assign(1, permuttedScores);
scoresPermute->forward(layerInputs, layerOutputs, internals);
// Permute deltas.
layerInputs.assign(1, bboxDeltas);
layerOutputs.assign(1, permuttedDeltas);
deltasPermute->forward(layerInputs, layerOutputs, internals);
// Sort predictions by scores and apply NMS. DetectionOutputLayer allocates
// output internally because of different number of objects after NMS.
layerInputs.resize(4);
layerInputs[0] = permuttedDeltas;
layerInputs[1] = permuttedScores;
layerInputs[2] = priorBoxes;
layerInputs[3] = umat_fakeImageBlob;
layerOutputs[0] = detections;
detectionOutputLayer->forward(layerInputs, layerOutputs, internals);
// DetectionOutputLayer produces a 1x1xNx7 output where N may be less than or
// equal to keepTopAfterNMS. The remaining rows are filled with zeros.
const int numDets = layerOutputs[0].total() / 7;
CV_Assert(numDets <= keepTopAfterNMS);
MatShape s = shape(numDets, 7);
layerOutputs[0] = layerOutputs[0].reshape(1, s.size(), &s[0]);
// The boxes.
UMat dst = outputs[0].rowRange(0, numDets);
layerOutputs[0].colRange(3, 7).copyTo(dst.colRange(1, 5));
dst.col(0).setTo(0); // First column are batch ids. Keep it zeros too.
// The scores.
dst = outputs[1].rowRange(0, numDets);
layerOutputs[0].col(2).copyTo(dst);
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
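// Proposal pipeline: generate anchors via PriorBoxLayer, permute scores and
// deltas to channel-last order, then decode boxes and apply NMS via
// DetectionOutputLayer.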
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
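// CV_16S is the storage type OpenCV DNN uses for FP16 blobs; defer to the
// generic fallback, which runs the computation in FP32.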
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs, internals;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
internals_arr.getMatVector(internals);
CV_Assert(inputs.size() == 3);
CV_Assert(internals.size() == 4);
const Mat& scores = inputs[0];
const Mat& bboxDeltas = inputs[1];
const Mat& imInfo = inputs[2];
Mat& priorBoxes = internals[0];
Mat& permuttedScores = internals[1];
Mat& permuttedDeltas = internals[2];
Mat& detections = internals[3];
CV_Assert(imInfo.total() >= 2);
// Use the smallest data type because only the blob's shape matters here.
// No memory is allocated (the data pointer is NULL); just the shape must be correct.
Mat fakeImageBlob(shape(1, 1, imInfo.at<float>(0), imInfo.at<float>(1)), CV_8UC1, NULL);
// Generate prior boxes.
std::vector<Mat> layerInputs(2), layerOutputs(1, priorBoxes);
layerInputs[0] = scores;
layerInputs[1] = fakeImageBlob;
priorBoxLayer->forward(layerInputs, layerOutputs, internals);
// Permute scores.
layerInputs.assign(1, getObjectScores(scores));
layerOutputs.assign(1, permuttedScores);
scoresPermute->forward(layerInputs, layerOutputs, internals);
// Permute deltas.
layerInputs.assign(1, bboxDeltas);
layerOutputs.assign(1, permuttedDeltas);
deltasPermute->forward(layerInputs, layerOutputs, internals);
// Sort predictions by score and apply NMS. DetectionOutputLayer allocates
// its output internally because the number of objects after NMS varies.
layerInputs.resize(4);
layerInputs[0] = permuttedDeltas;
layerInputs[1] = permuttedScores;
layerInputs[2] = priorBoxes;
layerInputs[3] = fakeImageBlob;
layerOutputs[0] = detections;
detectionOutputLayer->forward(layerInputs, layerOutputs, internals);
// DetectionOutputLayer produces a 1x1xNx7 output where N is less than or
// equal to keepTopAfterNMS. The rest is filled with zeros.
const int numDets = layerOutputs[0].total() / 7;
CV_Assert(numDets <= keepTopAfterNMS);
// The boxes.
layerOutputs[0] = layerOutputs[0].reshape(1, numDets);
Mat dst = outputs[0].rowRange(0, numDets);
layerOutputs[0].colRange(3, 7).copyTo(dst.colRange(1, 5));
dst.col(0).setTo(0); // The first column holds batch ids; keep it zeroed as well.
// The scores.
dst = outputs[1].rowRange(0, numDets);
layerOutputs[0].col(2).copyTo(dst);
}
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
InferenceEngine::Builder::ProposalLayer ieLayer(name);
ieLayer.setBaseSize(baseSize);
ieLayer.setFeatStride(featStride);
ieLayer.setMinSize(16);
ieLayer.setNMSThresh(nmsThreshold);
ieLayer.setPostNMSTopN(keepTopAfterNMS);
ieLayer.setPreNMSTopN(keepTopBeforeNMS);
std::vector<float> scalesVec(scales.size());
for (int i = 0; i < scales.size(); ++i)
scalesVec[i] = scales.get<float>(i);
ieLayer.setScale(scalesVec);
std::vector<float> ratiosVec(ratios.size());
for (int i = 0; i < ratios.size(); ++i)
ratiosVec[i] = ratios.get<float>(i);
ieLayer.setRatio(ratiosVec);
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
}
#endif // HAVE_DNN_IE_NN_BUILDER_2019
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
CV_Assert(nodes.size() == 3);
ngraph::op::ProposalAttrs attr;
attr.base_size = baseSize;
attr.nms_thresh = nmsThreshold;
attr.feat_stride = featStride;
attr.min_size = 16;
attr.pre_nms_topn = keepTopBeforeNMS;
attr.post_nms_topn = keepTopAfterNMS;
std::vector<float> ratiosVec(ratios.size());
for (int i = 0; i < ratios.size(); ++i)
ratiosVec[i] = ratios.get<float>(i);
attr.ratio = ratiosVec;
std::vector<float> scalesVec(scales.size());
for (int i = 0; i < scales.size(); ++i)
scalesVec[i] = scales.get<float>(i);
attr.scale = scalesVec;
auto& class_probs = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
auto& class_logits = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
auto& image_shape = nodes[2].dynamicCast<InfEngineNgraphNode>()->node;
CV_Assert_N(image_shape->get_shape().size() == 2, image_shape->get_shape().front() == 1);
auto shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
ngraph::Shape{1},
std::vector<int64_t>{(int64_t)image_shape->get_shape().back()});
auto reshape = std::make_shared<ngraph::op::v1::Reshape>(image_shape, shape, true);
auto proposal = std::make_shared<ngraph::op::Proposal>(class_probs, class_logits, reshape, attr);
return Ptr<BackendNode>(new InfEngineNgraphNode(proposal));
}
#endif // HAVE_DNN_NGRAPH
private:
// The first half of the channels holds background scores; we need only the second half.
static Mat getObjectScores(const Mat& m)
{
CV_Assert(m.dims == 4);
CV_Assert(m.size[0] == 1);
int channels = m.size[1];
CV_Assert((channels & 1) == 0);
return slice(m, Range::all(), Range(channels / 2, channels));
}
#ifdef HAVE_OPENCL
static UMat getObjectScores(const UMat& m)
{
CV_Assert(m.dims == 4);
CV_Assert(m.size[0] == 1);
int channels = m.size[1];
CV_Assert((channels & 1) == 0);
Range r = Range(channels / 2, channels);
Range ranges[4] = { Range::all(), r, Range::all(), Range::all() };
return m(&ranges[0]);
}
#endif
Ptr<PriorBoxLayer> priorBoxLayer;
Ptr<DetectionOutputLayer> detectionOutputLayer;
Ptr<PermuteLayer> deltasPermute;
Ptr<PermuteLayer> scoresPermute;
uint32_t keepTopBeforeNMS, keepTopAfterNMS, featStride, baseSize;
float nmsThreshold;
DictValue ratios, scales;
#ifdef HAVE_OPENCL
UMat umat_fakeImageBlob;
#endif
};
Ptr<ProposalLayer> ProposalLayer::create(const LayerParams& params)
{
return Ptr<ProposalLayer>(new ProposalLayerImpl(params));
}
} // namespace dnn
} // namespace cv

View File

@ -0,0 +1,813 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include <iostream>
#include <iterator>
#include <cmath>
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
{
namespace dnn
{
template<typename Dtype>
static void tanh(const Mat &src, Mat &dst)
{
MatConstIterator_<Dtype> itSrc = src.begin<Dtype>();
MatIterator_<Dtype> itDst = dst.begin<Dtype>();
for (; itSrc != src.end<Dtype>(); itSrc++, itDst++)
*itDst = std::tanh(*itSrc);
}
// TODO: move into a shared utility method
static void tanh(const Mat &src, Mat &dst)
{
dst.create(src.dims, (const int*)src.size, src.type());
if (src.type() == CV_32F)
tanh<float>(src, dst);
else if (src.type() == CV_64F)
tanh<double>(src, dst);
else
CV_Error(Error::StsUnsupportedFormat, "Function supports only floating point types");
}
static void sigmoid(const Mat &src, Mat &dst)
{
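// sigmoid(x) = 1 / (1 + exp(-x)), computed with two vectorized ops:
// first dst = exp(-src), then dst = (1 + dst)^-1.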
cv::exp(-src, dst);
cv::pow(1 + dst, -1, dst);
}
typedef void (*ActivationFunction)(const Mat &src, Mat &dst);
static ActivationFunction get_activation_function(const String& activation) {
// The most common activations in PyTorch and TF exports are Tanh and Sigmoid.
// If more activations need to be supported, switch to a std::map lookup.
if (activation == "Tanh")
{
return tanh;
}
else if (activation == "Sigmoid")
{
return sigmoid;
}
else
{
CV_Error(Error::StsNotImplemented,
cv::format("Activation function [%s] for layer LSTM is not supported", activation.c_str()));
}
}
class LSTMLayerImpl CV_FINAL : public LSTMLayer
{
int numTimeStamps, numSamples;
bool allocated;
MatShape outTailShape; //shape of single output sample
MatShape outTsShape; //shape of N output samples
bool useTimestampDim;
bool produceCellOutput;
float forgetBias, cellClip;
bool useCellClip, usePeephole;
bool reverse; // If true, go in negative direction along the time axis
bool bidirectional; // If true, produces both forward and reversed directions along time axis
ActivationFunction f_activation;
ActivationFunction g_activation;
ActivationFunction h_activation;
public:
LSTMLayerImpl(const LayerParams& params)
: numTimeStamps(0), numSamples(0)
{
setParamsFrom(params);
bidirectional = params.get<bool>("bidirectional", false);
if (!blobs.empty())
{
CV_Assert(blobs.size() >= 3);
blobs[2] = blobs[2].reshape(1, 1);
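// Expected blob layout (per the checks below): Wh is (dirs*4*numOut x numOut),
// Wx is (dirs*4*numOut x numInp), bias has dirs*4*numOut elements, and h_0/c_0
// hold the initial hidden and cell states; dirs is 2 when bidirectional.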
const Mat& Wh = blobs[0];
const Mat& Wx = blobs[1];
const Mat& bias = blobs[2];
const Mat& hInternal = blobs[3];
const Mat& cInternal = blobs[4];
CV_CheckEQ(Wh.dims, 2, "");
CV_CheckEQ(Wx.dims, 2, "");
CV_CheckEQ(Wh.rows, Wx.rows, "");
CV_CheckEQ(Wh.rows, (1 + static_cast<int>(bidirectional))*4*Wh.cols, "");
CV_CheckEQ(Wh.rows, (int)bias.total(), "");
CV_CheckEQ(hInternal.cols, Wh.cols, "");
CV_CheckEQ(hInternal.cols, cInternal.cols, "");
CV_CheckEQ(hInternal.rows, cInternal.rows, "");
CV_Assert(Wh.type() == Wx.type() && Wx.type() == bias.type());
// Peephole weights.
if (blobs.size() > 5)
{
CV_Assert(blobs.size() == 8);
const int N = Wh.cols;
for (int i = 5; i < 8; ++i)
{
CV_Assert(blobs[i].rows == N && blobs[i].cols == N);
CV_Assert(blobs[i].type() == bias.type());
}
}
}
useTimestampDim = params.get<bool>("use_timestamp_dim", true);
produceCellOutput = params.get<bool>("produce_cell_output", false);
forgetBias = params.get<float>("forget_bias", 0.0f);
cellClip = params.get<float>("cell_clip", 0.0f);
useCellClip = params.get<bool>("use_cell_clip", false);
usePeephole = params.get<bool>("use_peephole", false);
reverse = params.get<bool>("reverse", false);
CV_Assert(!reverse || !bidirectional);
// read activations
DictValue activations = params.get<DictValue>("activations", "");
if (activations.size() == 1) // a single (empty) entry means "activations" wasn't specified; use defaults
{
f_activation = sigmoid;
g_activation = tanh;
h_activation = tanh;
} else {
CV_Assert(activations.size() == 3);
f_activation = get_activation_function(activations.getStringValue(0));
g_activation = get_activation_function(activations.getStringValue(1));
h_activation = get_activation_function(activations.getStringValue(2));
}
allocated = false;
outTailShape.clear();
}
void setUseTimstampsDim(bool use) CV_OVERRIDE
{
CV_Assert(!allocated);
useTimestampDim = use;
}
void setProduceCellOutput(bool produce) CV_OVERRIDE
{
CV_Assert(!allocated);
produceCellOutput = produce;
}
void setOutShape(const MatShape &outTailShape_) CV_OVERRIDE
{
CV_Assert(!allocated || total(outTailShape) == total(outTailShape_));
outTailShape = outTailShape_;
}
void setWeights(const Mat &Wh, const Mat &Wx, const Mat &bias) CV_OVERRIDE
{
CV_Assert(Wh.dims == 2 && Wx.dims == 2);
CV_Assert(Wh.rows == Wx.rows);
CV_Assert(Wh.rows == 4*Wh.cols);
CV_Assert(Wh.rows == (int)bias.total());
CV_Assert(Wh.type() == Wx.type() && Wx.type() == bias.type());
blobs.resize(3);
blobs[0] = Mat(Wh.clone());
blobs[1] = Mat(Wx.clone());
blobs[2] = Mat(bias.clone()).reshape(1, 1);
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert((!usePeephole && blobs.size() == 5) || (usePeephole && blobs.size() == 8));
CV_Assert(inputs.size() == 1);
const MatShape& inp0 = inputs[0];
const Mat &Wh = blobs[0], &Wx = blobs[1];
int _numOut = Wh.size[1];
int _numInp = Wx.size[1];
MatShape outTailShape_(outTailShape), outResShape;
if (!outTailShape_.empty())
CV_Assert(total(outTailShape_) == _numOut);
else
outTailShape_.assign(1, _numOut);
int _numSamples;
if (useTimestampDim)
{
CV_Assert(inp0.size() >= 2 && total(inp0, 2) == _numInp);
_numSamples = inp0[1];
outResShape.push_back(inp0[0]);
}
else
{
CV_Assert(inp0.size() >= 2 && total(inp0, 1) == _numInp);
_numSamples = inp0[0];
}
outResShape.push_back(_numSamples);
outResShape.insert(outResShape.end(), outTailShape_.begin(), outTailShape_.end());
outResShape.back() *= (1 + static_cast<int>(bidirectional));
size_t noutputs = produceCellOutput ? 2 : 1;
outputs.assign(noutputs, outResShape);
internals.assign(1, shape(_numSamples, _numOut)); // hInternal
internals.push_back(shape(_numSamples, _numOut)); // cInternal
internals.push_back(shape(_numSamples, 1)); // dummyOnes
internals.push_back(shape(_numSamples, 4*_numOut)); // gates
return false;
}
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
{
std::vector<Mat> input;
inputs_arr.getMatVector(input);
CV_Assert((!usePeephole && blobs.size() == 5) || (usePeephole && blobs.size() == 8));
CV_Assert(input.size() == 1);
const Mat& inp0 = input[0];
Mat &Wh = blobs[0], &Wx = blobs[1];
int numOut = Wh.size[1];
int numInp = Wx.size[1];
if (!outTailShape.empty())
CV_Assert(total(outTailShape) == numOut);
else
outTailShape.assign(1, numOut);
if (useTimestampDim)
{
CV_Assert(inp0.dims >= 2 && (int)inp0.total(2) == numInp);
numTimeStamps = inp0.size[0];
numSamples = inp0.size[1];
}
else
{
CV_Assert(inp0.dims >= 2 && (int)inp0.total(1) == numInp);
numTimeStamps = 1;
numSamples = inp0.size[0];
}
outTsShape.clear();
outTsShape.push_back(numSamples);
outTsShape.insert(outTsShape.end(), outTailShape.begin(), outTailShape.end());
outTsShape.back() *= (1 + static_cast<int>(bidirectional));
allocated = true;
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> input, output, internals;
inputs_arr.getMatVector(input);
outputs_arr.getMatVector(output);
internals_arr.getMatVector(internals);
const int numDirs = 1 + static_cast<int>(bidirectional);
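// The stacked weight blobs store the forward direction first and, when
// bidirectional, the reverse direction after it; each iteration slices out
// one direction's parameters.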
for (int i = 0; i < numDirs; ++i)
{
const Mat &Wh = blobs[0].rowRange(i * blobs[0].rows / numDirs, (i + 1) * blobs[0].rows / numDirs);
const Mat &Wx = blobs[1].rowRange(i * blobs[1].rows / numDirs, (i + 1) * blobs[1].rows / numDirs);
const Mat &bias = blobs[2].colRange(i * blobs[2].cols / numDirs, (i + 1) * blobs[2].cols / numDirs);
const Mat &h_0 = blobs[3].rowRange(i * blobs[3].rows / numDirs, (i + 1) * blobs[3].rows / numDirs);
const Mat &c_0 = blobs[4].rowRange(i * blobs[4].rows / numDirs, (i + 1) * blobs[4].rows / numDirs);
int numOut = Wh.size[1];
Mat hInternal = internals[0], cInternal = internals[1],
dummyOnes = internals[2], gates = internals[3];
h_0.copyTo(hInternal);
c_0.copyTo(cInternal);
dummyOnes.setTo(1.);
int numSamplesTotal = numTimeStamps*numSamples;
Mat xTs = input[0].reshape(1, numSamplesTotal);
Mat hOutTs = output[0].reshape(1, numSamplesTotal);
hOutTs = hOutTs.colRange(i * hOutTs.cols / numDirs, (i + 1) * hOutTs.cols / numDirs);
Mat cOutTs = produceCellOutput ? output[1].reshape(1, numSamplesTotal) : Mat();
int tsStart, tsEnd, tsInc;
if (reverse || i == 1) {
tsStart = numTimeStamps - 1;
tsEnd = -1;
tsInc = -1;
}
else {
tsStart = 0;
tsEnd = numTimeStamps;
tsInc = 1;
}
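// One LSTM time step, with gates laid out as [i, f, o, g] in `gates`:
//   i_t = f(Wx_i x_t + Wh_i h_{t-1} + b_i)   f_t = f(Wx_f x_t + Wh_f h_{t-1} + b_f)
//   o_t = f(Wx_o x_t + Wh_o h_{t-1} + b_o)   g_t = g(Wx_g x_t + Wh_g h_{t-1} + b_g)
//   c_t = f_t (*) c_{t-1} + i_t (*) g_t      h_t = o_t (*) h(c_t)
// where f/g/h are the configured activations, (*) is the elementwise product,
// and peephole connections add c-dependent terms when enabled.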
for (int ts = tsStart; ts != tsEnd; ts += tsInc)
{
Range curRowRange(ts*numSamples, (ts + 1)*numSamples);
Mat xCurr = xTs.rowRange(curRowRange);
gemm(xCurr, Wx, 1, gates, 0, gates, GEMM_2_T); // Wx * x_t
gemm(hInternal, Wh, 1, gates, 1, gates, GEMM_2_T); //+Wh * h_{t-1}
gemm(dummyOnes, bias, 1, gates, 1, gates); //+b
Mat gateI = gates.colRange(0*numOut, 1*numOut);
Mat gateF = gates.colRange(1*numOut, 2*numOut);
Mat gateO = gates.colRange(2*numOut, 3*numOut);
Mat gateG = gates.colRange(3*numOut, 4*numOut);
if (forgetBias)
add(gateF, forgetBias, gateF);
if (usePeephole)
{
Mat gatesIF = gates.colRange(0, 2*numOut);
gemm(cInternal, blobs[5], 1, gateI, 1, gateI);
gemm(cInternal, blobs[6], 1, gateF, 1, gateF);
f_activation(gatesIF, gatesIF);
}
else
{
Mat gatesIFO = gates.colRange(0, 3*numOut);
f_activation(gatesIFO, gatesIFO);
}
g_activation(gateG, gateG);
//compute c_t
multiply(gateF, cInternal, gateF); // f_t (*) c_{t-1}
multiply(gateI, gateG, gateI); // i_t (*) g_t
add(gateF, gateI, cInternal); // c_t = f_t (*) c_{t-1} + i_t (*) g_t
if (useCellClip)
{
min(cInternal, cellClip, cInternal);
max(cInternal, -cellClip, cInternal);
}
if (usePeephole)
{
gemm(cInternal, blobs[7], 1, gateO, 1, gateO);
f_activation(gateO, gateO);
}
//compute h_t
h_activation(cInternal, hInternal);
multiply(gateO, hInternal, hInternal);
//save results in output blobs
hInternal.copyTo(hOutTs.rowRange(curRowRange));
if (produceCellOutput)
cInternal.copyTo(cOutTs.rowRange(curRowRange));
}
}
}
};
Ptr<LSTMLayer> LSTMLayer::create(const LayerParams& params)
{
return Ptr<LSTMLayer>(new LSTMLayerImpl(params));
}
int LSTMLayer::inputNameToIndex(String inputName)
{
if (toLowerCase(inputName) == "x")
return 0;
return -1;
}
int LSTMLayer::outputNameToIndex(const String& outputName)
{
if (toLowerCase(outputName) == "h")
return 0;
else if (toLowerCase(outputName) == "c")
return 1;
return -1;
}
class RNNLayerImpl : public RNNLayer
{
int numX, numH, numO;
int numSamples, numTimestamps, numSamplesTotal;
int dtype;
Mat Whh, Wxh, bh;
Mat Who, bo;
bool produceH;
public:
RNNLayerImpl(const LayerParams& params)
: numX(0), numH(0), numO(0), numSamples(0), numTimestamps(0), numSamplesTotal(0), dtype(0)
{
setParamsFrom(params);
type = "RNN";
produceH = false;
}
void setProduceHiddenOutput(bool produce = false) CV_OVERRIDE
{
produceH = produce;
}
void setWeights(const Mat &W_xh, const Mat &b_h, const Mat &W_hh, const Mat &W_ho, const Mat &b_o) CV_OVERRIDE
{
CV_Assert(W_hh.dims == 2 && W_xh.dims == 2);
CV_Assert(W_hh.size[0] == W_xh.size[0] && W_hh.size[0] == W_hh.size[1] && (int)b_h.total() == W_xh.size[0]);
CV_Assert(W_ho.size[0] == (int)b_o.total());
CV_Assert(W_ho.size[1] == W_hh.size[1]);
blobs.resize(5);
blobs[0] = Mat(W_xh.clone());
blobs[1] = Mat(b_h.clone());
blobs[2] = Mat(W_hh.clone());
blobs[3] = Mat(W_ho.clone());
blobs[4] = Mat(b_o.clone());
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() >= 1 && inputs.size() <= 2);
Mat Who_ = blobs[3];
Mat Wxh_ = blobs[0];
int numTimestamps_ = inputs[0][0];
int numSamples_ = inputs[0][1];
int numO_ = Who_.rows;
int numH_ = Wxh_.rows;
outputs.clear();
int dims[] = {numTimestamps_, numSamples_, numO_};
outputs.push_back(shape(dims, 3));
dims[2] = numH_;
if (produceH)
outputs.push_back(shape(dims, 3));
internals.assign(2, shape(numSamples_, numH_));
internals.push_back(shape(numSamples_, 1));
return false;
}
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
{
std::vector<Mat> input, outputs;
inputs_arr.getMatVector(input);
CV_Assert(input.size() >= 1 && input.size() <= 2);
Wxh = blobs[0];
bh = blobs[1];
Whh = blobs[2];
Who = blobs[3];
bo = blobs[4];
numH = Wxh.rows;
numX = Wxh.cols;
numO = Who.rows;
const Mat& inp0 = input[0];
CV_Assert(inp0.dims >= 2);
CV_Assert(inp0.total(2) == numX);
dtype = CV_32F;
CV_Assert(inp0.type() == dtype);
numTimestamps = inp0.size[0];
numSamples = inp0.size[1];
numSamplesTotal = numTimestamps * numSamples;
bh = bh.reshape(1, 1); // now a 1 x numH row vector
bo = bo.reshape(1, 1); // now a 1 x numO row vector
}
void reshapeOutput(std::vector<Mat> &output)
{
output.resize(produceH ? 2 : 1);
int sz0[] = { numTimestamps, numSamples, numO };
output[0].create(3, sz0, dtype);
if (produceH)
{
int sz1[] = { numTimestamps, numSamples, numH };
output[1].create(3, sz1, dtype);
}
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> input, output, internals;
inputs_arr.getMatVector(input);
outputs_arr.getMatVector(output);
internals_arr.getMatVector(internals);
Mat xTs = input[0].reshape(1, numSamplesTotal);
Mat oTs = output[0].reshape(1, numSamplesTotal);
Mat hTs = produceH ? output[1].reshape(1, numSamplesTotal) : Mat();
Mat hCurr = internals[0];
Mat hPrev = internals[1];
Mat dummyBiasOnes = internals[2];
hPrev.setTo(0.);
dummyBiasOnes.setTo(1.);
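// Elman RNN recurrence implemented below:
//   h_t = tanh(W_xh x_t + W_hh h_{t-1} + b_h)
//   o_t = tanh(W_ho h_t + b_o)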
for (int ts = 0; ts < numTimestamps; ts++)
{
Range curRowRange = Range(ts * numSamples, (ts + 1) * numSamples);
Mat xCurr = xTs.rowRange(curRowRange);
gemm(hPrev, Whh, 1, hCurr, 0, hCurr, GEMM_2_T); // W_{hh} * h_{prev}
gemm(xCurr, Wxh, 1, hCurr, 1, hCurr, GEMM_2_T); //+W_{xh} * x_{curr}
gemm(dummyBiasOnes, bh, 1, hCurr, 1, hCurr); //+bh
tanh(hCurr, hPrev);
Mat oCurr = oTs.rowRange(curRowRange);
gemm(hPrev, Who, 1, oCurr, 0, oCurr, GEMM_2_T); // W_{ho} * h_{prev}
gemm(dummyBiasOnes, bo, 1, oCurr, 1, oCurr); //+b_o
tanh(oCurr, oCurr);
if (produceH)
hPrev.copyTo(hTs.rowRange(curRowRange));
}
}
};
CV_EXPORTS_W Ptr<RNNLayer> RNNLayer::create(const LayerParams& params)
{
return Ptr<RNNLayer>(new RNNLayerImpl(params));
}
class GRULayerImpl CV_FINAL : public GRULayer
{
int numTimeStamps, numSamples;
bool allocated;
MatShape outTailShape; //shape of single output sample
MatShape outTsShape; //shape of N output samples
bool bidirectional; // If true, produces both forward and reversed directions along time axis
public:
GRULayerImpl(const LayerParams& params) : numTimeStamps(0), numSamples(0)
{
setParamsFrom(params);
bidirectional = params.get<bool>("bidirectional", false);
if (!blobs.empty())
{
CV_Assert(blobs.size() >= 3);
blobs[2] = blobs[2].reshape(1, 1);
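// Expected blob layout (per the checks below): Wh is (dirs*3*numOut x numOut),
// Wx is (dirs*3*numOut x numInp), bias concatenates the input and hidden
// biases (2 * dirs*3*numOut elements), and h_0 is the initial hidden state.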
const Mat& Wh = blobs[0];
const Mat& Wx = blobs[1];
const Mat& bias = blobs[2];
const Mat& hInternal = blobs[3];
CV_CheckEQ(Wh.dims, 2, "");
CV_CheckEQ(Wx.dims, 2, "");
CV_CheckEQ(Wh.rows, Wx.rows, "");
CV_CheckEQ(Wh.rows, (1 + static_cast<int>(bidirectional)) * 3 * Wh.cols, "");
CV_CheckEQ(Wh.rows * 2, (int)bias.total(), "");
CV_CheckEQ(hInternal.cols, Wh.cols, "");
CV_CheckTypeEQ(Wh.type(), Wx.type(), "");
CV_CheckTypeEQ(Wx.type(), bias.type(), "");
}
allocated = false;
outTailShape.clear();
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() == 1);
const MatShape& inp0 = inputs[0];
const Mat &Wh = blobs[0], &Wx = blobs[1];
int _numOut = Wh.size[1];
int _numInp = Wx.size[1];
MatShape outTailShape_(outTailShape), outResShape;
if (!outTailShape_.empty())
CV_Assert(total(outTailShape_) == _numOut);
else
outTailShape_.assign(1, _numOut);
int _numSamples;
CV_Assert(inp0.size() >= 2 && total(inp0, 2) == _numInp);
_numSamples = inp0[1];
outResShape.push_back(inp0[0]);
outResShape.push_back(_numSamples);
outResShape.insert(outResShape.end(), outTailShape_.begin(), outTailShape_.end());
outResShape.back() *= (1 + static_cast<int>(bidirectional));
outputs.assign(1, outResShape);
internals.assign(1, shape(_numSamples, _numOut)); // hInternal
internals.push_back(shape(_numSamples, 1)); // dummyOnes
internals.push_back(shape(_numSamples, 2 * _numOut)); // gates
internals.push_back(shape(_numSamples, 2 * _numOut)); // gates_b
internals.push_back(shape(_numSamples, 1 * _numOut)); // h_linear
internals.push_back(shape(_numSamples, _numOut)); // ones
return false;
}
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
{
std::vector<Mat> input;
inputs_arr.getMatVector(input);
CV_Assert(input.size() == 1);
const Mat& inp0 = input[0];
Mat &Wh = blobs[0], &Wx = blobs[1];
int numOut = Wh.size[1];
int numInp = Wx.size[1];
if (!outTailShape.empty())
CV_Assert(total(outTailShape) == numOut);
else
outTailShape.assign(1, numOut);
CV_Assert(inp0.dims >= 2 && (int)inp0.total(2) == numInp);
numTimeStamps = inp0.size[0];
numSamples = inp0.size[1];
outTsShape.clear();
outTsShape.push_back(numSamples);
outTsShape.insert(outTsShape.end(), outTailShape.begin(), outTailShape.end());
outTsShape.back() *= (1 + static_cast<int>(bidirectional));
allocated = true;
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> input, output, internals;
inputs_arr.getMatVector(input);
outputs_arr.getMatVector(output);
internals_arr.getMatVector(internals);
const int numDirs = 1 + static_cast<int>(bidirectional);
for (int i = 0; i < numDirs; ++i)
{
const Mat &Wh = blobs[0].rowRange(i * blobs[0].rows / numDirs, (i + 1) * blobs[0].rows / numDirs);
const Mat &Wx = blobs[1].rowRange(i * blobs[1].rows / numDirs, (i + 1) * blobs[1].rows / numDirs);
const Mat &bias = blobs[2].colRange(i * blobs[2].cols / numDirs, (i + 1) * blobs[2].cols / numDirs);
const Mat &h_0 = blobs[3].rowRange(i * blobs[3].rows / numDirs, (i + 1) * blobs[3].rows / numDirs);
const Mat &bx = bias.colRange(0, bias.cols / 2);
const Mat &bh = bias.colRange(bias.cols / 2, bias.cols);
Mat hInternal = internals[0], dummyOnes = internals[1], gates = internals[2],
b_rz = internals[3], n_t = internals[4], ones = internals[5];
h_0.copyTo(hInternal);
dummyOnes.setTo(1.);
ones.setTo(1.);
int numOut = Wh.size[1];
const Mat& wx_rz = Wx.rowRange(0, 2 * numOut);
const Mat& wh_rz = Wh.rowRange(0, 2 * numOut);
b_rz = bx.colRange(0, 2 * numOut) + bh.colRange(0, 2 * numOut);
const Mat& wx_n = Wx.rowRange(2 * numOut, 3 * numOut);
const Mat& wh_n = Wh.rowRange(2 * numOut, 3 * numOut);
const Mat& b_in = bx.colRange(2 * numOut, 3 * numOut);
const Mat& b_hn = bh.colRange(2 * numOut, 3 * numOut);
int numSamplesTotal = numTimeStamps * numSamples;
Mat xTs = input[0].reshape(1, numSamplesTotal);
Mat hOutTs = output[0].reshape(1, numSamplesTotal);
hOutTs = hOutTs.colRange(i * hOutTs.cols / numDirs, (i + 1) * hOutTs.cols / numDirs);
Mat cOutTs = Mat();
int tsStart, tsEnd, tsInc;
if (i == 1) {
tsStart = numTimeStamps - 1;
tsEnd = -1;
tsInc = -1;
}
else {
tsStart = 0;
tsEnd = numTimeStamps;
tsInc = 1;
}
for (int ts = tsStart; ts != tsEnd; ts += tsInc)
{
Range curRowRange(ts * numSamples, (ts + 1) * numSamples);
Mat xCurr = xTs.rowRange(curRowRange);
// calculate r_t = sigmoid(x * Wx_r + h_(t-1) * Wh_r + b_r)
// calculate z_t = sigmoid(x * Wx_z + h_(t-1) * Wh_z + b_z)
gemm(xCurr, wx_rz, 1, gates, 0, gates, GEMM_2_T); // x * Wx_rz
gemm(hInternal, wh_rz, 1, gates, 1, gates, GEMM_2_T); // + h_(t-1) * Wh_rz
gemm(dummyOnes, b_rz, 1, gates, 1, gates); // + b_rz
sigmoid(gates, gates); // sigmoid()
Mat z = gates.colRange(0, gates.cols / 2);
Mat r = gates.colRange(gates.cols / 2, gates.cols);
// calculate n_t = tanh(r (*) (h_(t-1) * Wh_n + b_hn) + x * Wx_n + b_in)
gemm(hInternal, wh_n, 1, n_t, 0, n_t, GEMM_2_T); // h_(t-1) * Wh_n
gemm(dummyOnes, b_hn, 1, n_t, 1, n_t); // + b_hn
multiply(r, n_t, n_t); // r (*) (h_(t-1) * Wh_n + b_hn)
gemm(xCurr, wx_n, 1, n_t, 1, n_t, GEMM_2_T); // + x * Wx_n
gemm(dummyOnes, b_in, 1, n_t, 1, n_t); // + b_in
tanh(n_t, n_t); // tanh()
//compute next h_t = z (*) h_(t-1) + (1 - z) (*) n_t
multiply(z, hInternal, hInternal); // z (*) h_{t-1}
subtract(ones, z, z); // 1 - z
multiply(z, n_t, z); // (1 - z) * n
add(z, hInternal, hInternal); // z (*) h_(t-1) + (1 - z) (*) n_t
//save results in output blobs
hInternal.copyTo(hOutTs.rowRange(curRowRange));
}
}
}
};
Ptr<GRULayer> GRULayer::create(const LayerParams &params) {
return Ptr<GRULayer>(new GRULayerImpl(params));
}
} // namespace dnn
} // namespace cv

View File

@ -0,0 +1,660 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "../op_cuda.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#include <opencv2/dnn/all_layers.hpp>
#include "../nms.inl.hpp"
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
#ifdef HAVE_DNN_NGRAPH
#include "../ie_ngraph.hpp"
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/region.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class RegionLayerImpl CV_FINAL : public RegionLayer
{
public:
int coords, classes, anchors, classfix;
float thresh, scale_x_y;
int new_coords;
bool useSoftmax, useLogistic;
#ifdef HAVE_OPENCL
UMat blob_umat;
#endif
RegionLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
CV_Assert(blobs.size() == 1);
thresh = params.get<float>("thresh", 0.2);
coords = params.get<int>("coords", 4);
classes = params.get<int>("classes", 0);
anchors = params.get<int>("anchors", 5);
classfix = params.get<int>("classfix", 0);
useSoftmax = params.get<bool>("softmax", false);
useLogistic = params.get<bool>("logistic", false);
nmsThreshold = params.get<float>("nms_threshold", 0.4);
scale_x_y = params.get<float>("scale_x_y", 1.0); // Yolov4
new_coords = params.get<int>("new_coords", 0); // Yolov4x-mish
CV_Assert(nmsThreshold >= 0.);
CV_Assert(coords == 4);
CV_Assert(classes >= 1);
CV_Assert(anchors >= 1);
CV_Assert(useLogistic || useSoftmax);
if (params.get<bool>("softmax_tree", false))
CV_Error(cv::Error::StsNotImplemented, "Yolo9000 is not implemented");
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() > 0);
// channels == cell_size*anchors
CV_Assert(inputs[0][3] == (1 + coords + classes)*anchors);
int batch_size = inputs[0][0];
if(batch_size > 1)
outputs = std::vector<MatShape>(1, shape(batch_size, inputs[0][1] * inputs[0][2] * anchors, inputs[0][3] / anchors));
else
outputs = std::vector<MatShape>(1, shape(inputs[0][1] * inputs[0][2] * anchors, inputs[0][3] / anchors));
return false;
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
#ifdef HAVE_DNN_NGRAPH
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
return INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2020_2) && preferableTarget != DNN_TARGET_MYRIAD && new_coords == 0;
#endif
#ifdef HAVE_CUDA
if (backendId == DNN_BACKEND_CUDA)
return true;
#endif
return backendId == DNN_BACKEND_OPENCV;
}
float logistic_activate(float x) { return 1.F / (1.F + exp(-x)); }
void softmax_activate(const float* input, const int n, const float temp, float* output)
{
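// Numerically stable softmax: subtract the running max before exponentiating
// so exp() cannot overflow, then normalize by the sum.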
int i;
float sum = 0;
float largest = -FLT_MAX;
for (i = 0; i < n; ++i) {
if (input[i] > largest) largest = input[i];
}
for (i = 0; i < n; ++i) {
float e = exp((input[i] - largest) / temp);
sum += e;
output[i] = e;
}
for (i = 0; i < n; ++i) {
output[i] /= sum;
}
}
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
if (blob_umat.empty())
blobs[0].copyTo(blob_umat);
std::vector<UMat> inputs;
std::vector<UMat> outputs;
// TODO: implement logistic activation for the classification scores.
if (useLogistic || inps.depth() == CV_16S)
return false;
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
CV_Assert(inputs.size() >= 1);
int const cell_size = classes + coords + 1;
for (size_t ii = 0; ii < outputs.size(); ii++)
{
UMat& inpBlob = inputs[ii];
UMat& outBlob = outputs[ii];
int batch_size = inpBlob.size[0];
int rows = inpBlob.size[1];
int cols = inpBlob.size[2];
// channels == cell_size*anchors (see the assert in getMemoryShapes)
int sample_size = cell_size*rows*cols*anchors;
ocl::Kernel logistic_kernel("logistic_activ", ocl::dnn::region_oclsrc);
size_t nanchors = rows*cols*anchors*batch_size;
logistic_kernel.set(0, (int)nanchors);
logistic_kernel.set(1, ocl::KernelArg::PtrReadOnly(inpBlob));
logistic_kernel.set(2, (int)cell_size);
logistic_kernel.set(3, ocl::KernelArg::PtrWriteOnly(outBlob));
logistic_kernel.run(1, &nanchors, NULL, false);
if (useSoftmax)
{
// Yolo v2
// softmax activation of the class probabilities, for each grid cell (X x Y x Anchor-index)
ocl::Kernel softmax_kernel("softmax_activ", ocl::dnn::region_oclsrc);
size_t nanchors = rows*cols*anchors*batch_size;
softmax_kernel.set(0, (int)nanchors);
softmax_kernel.set(1, ocl::KernelArg::PtrReadOnly(inpBlob));
softmax_kernel.set(2, ocl::KernelArg::PtrReadOnly(blob_umat));
softmax_kernel.set(3, (int)cell_size);
softmax_kernel.set(4, (int)classes);
softmax_kernel.set(5, (int)classfix);
softmax_kernel.set(6, (int)rows);
softmax_kernel.set(7, (int)cols);
softmax_kernel.set(8, (int)anchors);
softmax_kernel.set(9, (float)thresh);
softmax_kernel.set(10, ocl::KernelArg::PtrWriteOnly(outBlob));
if (!softmax_kernel.run(1, &nanchors, NULL, false))
return false;
}
if (nmsThreshold > 0) {
Mat mat = outBlob.getMat(ACCESS_WRITE);
float *dstData = mat.ptr<float>();
for (int b = 0; b < batch_size; ++b)
do_nms_sort(dstData + b*sample_size, rows*cols*anchors, thresh, nmsThreshold);
}
}
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs, internals;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
internals_arr.getMatVector(internals);
CV_Assert(inputs.size() >= 1);
CV_Assert(outputs.size() == 1);
int const cell_size = classes + coords + 1;
const float* biasData = blobs[0].ptr<float>();
for (size_t ii = 0; ii < outputs.size(); ii++)
{
Mat &inpBlob = inputs[ii];
Mat &outBlob = outputs[ii];
int batch_size = inpBlob.size[0];
int rows = inpBlob.size[1];
int cols = inpBlob.size[2];
// number of elements for one image in the batch, both for input and output
int sample_size = cell_size*rows*cols*anchors;
// assert that the comment above is true
CV_Assert(sample_size*batch_size == inpBlob.total());
CV_Assert(sample_size*batch_size == outBlob.total());
CV_Assert(inputs.size() < 2 || inputs[1].dims == 4);
int hNorm = inputs.size() > 1 ? inputs[1].size[2] : rows;
int wNorm = inputs.size() > 1 ? inputs[1].size[3] : cols;
const float *srcData = inpBlob.ptr<float>();
float *dstData = outBlob.ptr<float>();
if (new_coords == 0) {
// logistic activation for t0, for each grid cell (X x Y x Anchor-index)
for (int i = 0; i < batch_size*rows*cols*anchors; ++i) {
int index = cell_size*i;
float x = srcData[index + 4];
dstData[index + 4] = logistic_activate(x); // logistic activation
}
if (useSoftmax) { // Yolo v2
for (int i = 0; i < batch_size*rows*cols*anchors; ++i) {
int index = cell_size*i;
softmax_activate(srcData + index + 5, classes, 1, dstData + index + 5);
}
}
else if (useLogistic) { // Yolo v3
for (int i = 0; i < batch_size*rows*cols*anchors; ++i){
int index = cell_size*i;
const float* input = srcData + index + 5;
float* output = dstData + index + 5;
for (int c = 0; c < classes; ++c)
output[c] = logistic_activate(input[c]);
}
}
}
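// Decode every anchor box: (x, y) come from the activated offsets within the
// grid cell, (w, h) rescale the anchor priors stored in blobs[0], and each
// class score is gated by the objectness t0 and zeroed when below thresh.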
for (int b = 0; b < batch_size; ++b)
for (int x = 0; x < cols; ++x)
for(int y = 0; y < rows; ++y)
for (int a = 0; a < anchors; ++a) {
// relative start address for image b within the batch data
int index_sample_offset = sample_size*b;
int index = (y*cols + x)*anchors + a; // index for each grid-cell & anchor
int p_index = index_sample_offset + index * cell_size + 4;
float scale = dstData[p_index];
if (classfix == -1 && scale < .5)
{
scale = 0; // if(t0 < 0.5) t0 = 0;
}
int box_index = index_sample_offset + index * cell_size;
if (new_coords == 1) {
float x_tmp = (srcData[box_index + 0] - 0.5f) * scale_x_y + 0.5f;
float y_tmp = (srcData[box_index + 1] - 0.5f) * scale_x_y + 0.5f;
dstData[box_index + 0] = (x + x_tmp) / cols;
dstData[box_index + 1] = (y + y_tmp) / rows;
dstData[box_index + 2] = (srcData[box_index + 2]) * (srcData[box_index + 2]) * 4 * biasData[2 * a] / wNorm;
dstData[box_index + 3] = (srcData[box_index + 3]) * (srcData[box_index + 3]) * 4 * biasData[2 * a + 1] / hNorm;
scale = srcData[p_index];
if (classfix == -1 && scale < thresh)
{
scale = 0; // if(t0 < thresh) t0 = 0;
}
int class_index = index_sample_offset + index * cell_size + 5;
for (int j = 0; j < classes; ++j) {
float prob = scale*srcData[class_index + j]; // prob = IoU(box, object) = t0 * class-probability
dstData[class_index + j] = (prob > thresh) ? prob : 0; // if (IoU < threshold) IoU = 0;
}
}
else
{
float x_tmp = (logistic_activate(srcData[box_index + 0]) - 0.5f) * scale_x_y + 0.5f;
float y_tmp = (logistic_activate(srcData[box_index + 1]) - 0.5f) * scale_x_y + 0.5f;
dstData[box_index + 0] = (x + x_tmp) / cols;
dstData[box_index + 1] = (y + y_tmp) / rows;
dstData[box_index + 2] = exp(srcData[box_index + 2]) * biasData[2 * a] / wNorm;
dstData[box_index + 3] = exp(srcData[box_index + 3]) * biasData[2 * a + 1] / hNorm;
int class_index = index_sample_offset + index * cell_size + 5;
for (int j = 0; j < classes; ++j) {
float prob = scale*dstData[class_index + j]; // prob = IoU(box, object) = t0 * class-probability
dstData[class_index + j] = (prob > thresh) ? prob : 0; // if (IoU < threshold) IoU = 0;
}
}
}
if (nmsThreshold > 0) {
for (int b = 0; b < batch_size; ++b){
do_nms_sort(dstData+b*sample_size, rows*cols*anchors, thresh, nmsThreshold);
}
}
}
}
void do_nms_sort(float *detections, int total, float score_thresh, float nms_thresh)
{
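// Per-class NMS: boxes are converted from center/size to top-left/size form;
// for each class the scores are moved out, suppressed via NMSBoxes, and only
// the surviving indices get their scores written back.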
std::vector<Rect2d> boxes(total);
std::vector<float> scores(total);
for (int i = 0; i < total; ++i)
{
Rect2d &b = boxes[i];
int box_index = i * (classes + coords + 1);
b.width = detections[box_index + 2];
b.height = detections[box_index + 3];
b.x = detections[box_index + 0] - b.width / 2;
b.y = detections[box_index + 1] - b.height / 2;
}
std::vector<int> indices;
for (int k = 0; k < classes; ++k)
{
for (int i = 0; i < total; ++i)
{
int box_index = i * (classes + coords + 1);
int class_index = box_index + 5;
scores[i] = detections[class_index + k];
detections[class_index + k] = 0;
}
NMSBoxes(boxes, scores, score_thresh, nms_thresh, indices);
for (int i = 0, n = indices.size(); i < n; ++i)
{
int box_index = indices[i] * (classes + coords + 1);
int class_index = box_index + 5;
detections[class_index + k] = scores[indices[i]];
}
}
}
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
if (coords != 4)
CV_Error(Error::StsNotImplemented, "Only upright rectangular boxes are supported in RegionLayer.");
std::size_t height_norm, width_norm;
if (inputs.size() == 1)
{
auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
auto input_shape = input_wrapper->getShape();
height_norm = input_shape[1];
width_norm = input_shape[2];
}
else
{
auto input_wrapper = inputs[1].dynamicCast<CUDABackendWrapper>();
auto input_shape = input_wrapper->getShape();
CV_Assert(input_shape.size() == 4);
height_norm = input_shape[2];
width_norm = input_shape[3];
}
cuda4dnn::SquashMethod squash_method;
if(useLogistic)
squash_method = cuda4dnn::SquashMethod::SIGMOID;
else if (useSoftmax)
squash_method = cuda4dnn::SquashMethod::SOFTMAX;
/* exactly one must be true */
CV_Assert((useLogistic || useSoftmax) && !(useLogistic && useSoftmax));
cuda4dnn::RegionConfiguration<float> config;
config.squash_method = squash_method;
config.classes = classes;
config.boxes_per_cell = anchors;
config.height_norm = height_norm;
config.width_norm = width_norm;
config.scale_x_y = scale_x_y;
config.object_prob_cutoff = (classfix == -1) ? thresh : 0.f;
config.class_prob_cutoff = thresh;
config.nms_iou_threshold = nmsThreshold;
config.new_coords = (new_coords == 1);
return make_cuda_node<cuda4dnn::RegionOp>(preferableTarget, std::move(context->stream), blobs[0], config);
}
#endif
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
CV_UNUSED(outputs); // suppress unused variable warning
int64 flops = 0;
for(int i = 0; i < inputs.size(); i++)
{
flops += 60*total(inputs[i]);
}
return flops;
}
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto& input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
auto parent_shape = input->get_shape();
int64_t b = parent_shape[0];
int64_t h = parent_shape[1];
int64_t w = parent_shape[2];
int64_t c = parent_shape[3];
int64_t cols = b * h * w * anchors;
int64_t rows = c / anchors;
auto shape_node = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{cols, rows});
auto tr_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{1, 0});
std::shared_ptr<ngraph::Node> input2d;
{
input2d = std::make_shared<ngraph::op::v1::Reshape>(input, shape_node, true);
input2d = std::make_shared<ngraph::op::Transpose>(input2d, tr_axes);
}
std::shared_ptr<ngraph::Node> region;
{
auto new_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{4}, std::vector<int64_t>{0, 3, 1, 2});
auto tr_input = std::make_shared<ngraph::op::Transpose>(input, new_axes);
std::vector<float> anchors_vec(blobs[0].ptr<float>(), blobs[0].ptr<float>() + blobs[0].total());
std::vector<int64_t> mask(anchors, 1);
region = std::make_shared<ngraph::op::RegionYolo>(tr_input, coords, classes, anchors, useSoftmax, mask, 1, 3, anchors_vec);
auto tr_shape = tr_input->get_shape();
auto shape_as_inp = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
ngraph::Shape{tr_shape.size()},
std::vector<int64_t>(tr_shape.begin(), tr_shape.end()));
region = std::make_shared<ngraph::op::v1::Reshape>(region, shape_as_inp, true);
new_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{4}, std::vector<int64_t>{0, 2, 3, 1});
region = std::make_shared<ngraph::op::Transpose>(region, new_axes);
region = std::make_shared<ngraph::op::v1::Reshape>(region, shape_node, true);
region = std::make_shared<ngraph::op::Transpose>(region, tr_axes);
}
auto strides = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{1, 1});
std::vector<int64_t> boxes_shape{b, anchors, h, w};
auto shape_3d = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{boxes_shape.size()}, boxes_shape.data());
ngraph::Shape box_broad_shape{1, (size_t)anchors, (size_t)h, (size_t)w};
auto scale_x_y_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &scale_x_y);
auto shift_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, std::vector<float>{0.5});
auto axis = ngraph::op::Constant::create<int64_t>(ngraph::element::i64, ngraph::Shape{}, {0});
auto splits = ngraph::op::Constant::create<int64_t>(ngraph::element::i64, ngraph::Shape{5}, {1, 1, 1, 1, rows - 4});
auto split = std::make_shared<ngraph::op::v1::VariadicSplit>(input2d, axis, splits);
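// Rebuild the same box decoding as the CPU path as an ngraph subgraph:
// sigmoid with the scale_x_y shift plus grid-cell offsets for x/y, and
// exp scaled by the anchor priors for w/h.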
std::shared_ptr<ngraph::Node> box_x;
{
box_x = std::make_shared<ngraph::op::Sigmoid>(split->output(0));
box_x = std::make_shared<ngraph::op::v1::Subtract>(box_x, shift_node, ngraph::op::AutoBroadcastType::NUMPY);
box_x = std::make_shared<ngraph::op::v1::Multiply>(box_x, scale_x_y_node, ngraph::op::AutoBroadcastType::NUMPY);
box_x = std::make_shared<ngraph::op::v1::Add>(box_x, shift_node, ngraph::op::AutoBroadcastType::NUMPY);
box_x = std::make_shared<ngraph::op::v1::Reshape>(box_x, shape_3d, true);
std::vector<float> x_indices(w * h * anchors);
auto begin = x_indices.begin();
for (int i = 0; i < h; i++)
{
std::fill(begin + i * anchors, begin + (i + 1) * anchors, i);
}
for (int j = 1; j < w; j++)
{
std::copy(begin, begin + h * anchors, begin + j * h * anchors);
}
auto horiz = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, box_broad_shape, x_indices.data());
box_x = std::make_shared<ngraph::op::v1::Add>(box_x, horiz, ngraph::op::AutoBroadcastType::NUMPY);
auto cols_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, std::vector<float>{float(w)});
box_x = std::make_shared<ngraph::op::v1::Divide>(box_x, cols_node, ngraph::op::AutoBroadcastType::NUMPY);
}
std::shared_ptr<ngraph::Node> box_y;
{
box_y = std::make_shared<ngraph::op::Sigmoid>(split->output(1));
box_y = std::make_shared<ngraph::op::v1::Subtract>(box_y, shift_node, ngraph::op::AutoBroadcastType::NUMPY);
box_y = std::make_shared<ngraph::op::v1::Multiply>(box_y, scale_x_y_node, ngraph::op::AutoBroadcastType::NUMPY);
box_y = std::make_shared<ngraph::op::v1::Add>(box_y, shift_node, ngraph::op::AutoBroadcastType::NUMPY);
box_y = std::make_shared<ngraph::op::v1::Reshape>(box_y, shape_3d, true);
std::vector<float> y_indices(h * anchors);
for (int i = 0; i < h; i++)
{
std::fill(y_indices.begin() + i * anchors, y_indices.begin() + (i + 1) * anchors, i);
}
auto vert = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1, (size_t)anchors, (size_t)h, 1}, y_indices.data());
box_y = std::make_shared<ngraph::op::v1::Add>(box_y, vert, ngraph::op::AutoBroadcastType::NUMPY);
auto rows_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, std::vector<float>{float(h)});
box_y = std::make_shared<ngraph::op::v1::Divide>(box_y, rows_node, ngraph::op::AutoBroadcastType::NUMPY);
}
std::shared_ptr<ngraph::Node> box_w, box_h;
{
int hNorm, wNorm;
if (nodes.size() > 1)
{
auto node_1_shape = nodes[1].dynamicCast<InfEngineNgraphNode>()->node->get_shape();
hNorm = node_1_shape[2];
wNorm = node_1_shape[3];
}
else
{
hNorm = h;
wNorm = w;
}
std::vector<float> anchors_w(anchors), anchors_h(anchors);
for (size_t a = 0; a < anchors; ++a)
{
anchors_w[a] = blobs[0].at<float>(0, 2 * a) / wNorm;
anchors_h[a] = blobs[0].at<float>(0, 2 * a + 1) / hNorm;
}
std::vector<float> bias_w(w * h * anchors), bias_h(w * h * anchors);
for (int j = 0; j < h; j++)
{
std::copy(anchors_w.begin(), anchors_w.end(), bias_w.begin() + j * anchors);
std::copy(anchors_h.begin(), anchors_h.end(), bias_h.begin() + j * anchors);
}
for (int i = 1; i < w; i++)
{
std::copy(bias_w.begin(), bias_w.begin() + h * anchors, bias_w.begin() + i * h * anchors);
std::copy(bias_h.begin(), bias_h.begin() + h * anchors, bias_h.begin() + i * h * anchors);
}
box_w = std::make_shared<ngraph::op::v0::Exp>(split->output(2));
box_w = std::make_shared<ngraph::op::v1::Reshape>(box_w, shape_3d, true);
auto anchor_w_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, box_broad_shape, bias_w.data());
box_w = std::make_shared<ngraph::op::v1::Multiply>(box_w, anchor_w_node, ngraph::op::AutoBroadcastType::NUMPY);
box_h = std::make_shared<ngraph::op::v0::Exp>(split->output(3));
box_h = std::make_shared<ngraph::op::v1::Reshape>(box_h, shape_3d, true);
auto anchor_h_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, box_broad_shape, bias_h.data());
box_h = std::make_shared<ngraph::op::v1::Multiply>(box_h, anchor_h_node, ngraph::op::AutoBroadcastType::NUMPY);
}
auto region_splits = ngraph::op::Constant::create<int64_t>(ngraph::element::i64, ngraph::Shape{3}, {4, 1, rows - 5});
auto region_split = std::make_shared<ngraph::op::v1::VariadicSplit>(region, axis, region_splits);
std::shared_ptr<ngraph::Node> scale;
{
float thr = classfix == -1 ? 0.5 : 0;
auto thresh_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, std::vector<float>{thr});
auto mask = std::make_shared<ngraph::op::v1::Less>(region_split->output(1), thresh_node);
auto zero_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, mask->get_shape(), std::vector<float>(cols, 0));
scale = std::make_shared<ngraph::op::v1::Select>(mask, zero_node, region_split->output(1));
}
std::shared_ptr<ngraph::Node> probs;
{
probs = std::make_shared<ngraph::op::v1::Multiply>(region_split->output(2), scale, ngraph::op::AutoBroadcastType::NUMPY);
auto thresh_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &thresh);
auto mask = std::make_shared<ngraph::op::v1::Greater>(probs, thresh_node);
auto zero_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, mask->get_shape(), std::vector<float>((rows - 5) * cols, 0));
probs = std::make_shared<ngraph::op::v1::Select>(mask, probs, zero_node);
}
auto concat_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{1, cols});
box_x = std::make_shared<ngraph::op::v1::Reshape>(box_x, concat_shape, true);
box_y = std::make_shared<ngraph::op::v1::Reshape>(box_y, concat_shape, true);
box_w = std::make_shared<ngraph::op::v1::Reshape>(box_w, concat_shape, true);
box_h = std::make_shared<ngraph::op::v1::Reshape>(box_h, concat_shape, true);
ngraph::NodeVector inp_nodes{box_x, box_y, box_w, box_h, scale, probs};
std::shared_ptr<ngraph::Node> result = std::make_shared<ngraph::op::Concat>(inp_nodes, 0);
result = std::make_shared<ngraph::op::Transpose>(result, tr_axes);
if (b > 1)
{
std::vector<int64_t> sizes{b, static_cast<int64_t>(result->get_shape()[0]) / b, static_cast<int64_t>(result->get_shape()[1])};
auto shape_node = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{sizes.size()}, sizes.data());
result = std::make_shared<ngraph::op::v1::Reshape>(result, shape_node, true);
}
return Ptr<BackendNode>(new InfEngineNgraphNode(result));
}
#endif // HAVE_DNN_NGRAPH
};
Ptr<RegionLayer> RegionLayer::create(const LayerParams& params)
{
return Ptr<RegionLayer>(new RegionLayerImpl(params));
}
} // namespace dnn
} // namespace cv

View File

@ -0,0 +1,264 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#include <opencv2/dnn/all_layers.hpp>
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
#include "../op_inf_engine.hpp"
#ifdef HAVE_DNN_NGRAPH
#include "../ie_ngraph.hpp"
#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
#include <ngraph/op/reorg_yolo.hpp>
#else
#include <ngraph/op/experimental/layers/reorg_yolo.hpp>
#endif
#endif
#include "../op_cuda.hpp"
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/reorg.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class ReorgLayerImpl CV_FINAL : public ReorgLayer
{
int reorgStride;
public:
ReorgLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
reorgStride = params.get<int>("reorg_stride", 2);
CV_Assert(reorgStride > 0);
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() > 0);
outputs = std::vector<MatShape>(inputs.size(), shape(
inputs[0][0],
inputs[0][1] * reorgStride * reorgStride,
inputs[0][2] / reorgStride,
inputs[0][3] / reorgStride));
CV_Assert(outputs[0][0] > 0 && outputs[0][1] > 0 && outputs[0][2] > 0 && outputs[0][3] > 0);
CV_Assert(total(outputs[0]) == total(inputs[0]));
return false;
}
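// Reorg (space-to-depth with stride r) rearranges an NxCxHxW input into
// Nx(C*r*r)x(H/r)x(W/r), e.g. 1x64x26x26 -> 1x256x13x13 for r = 2. It is
// implemented below as a reshape followed by a precomputed permutation.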
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
{
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
Mat inp = inputs[0];
Mat out = outputs[0];
int batchSize = inp.size[0];
LayerParams permParams;
if (batchSize == 1)
{
int order[] = {1, 3, 0, 2};
permParams.set("order", DictValue::arrayInt(&order[0], 4));
permuteInpShape.resize(4);
permuteInpShape[0] = inp.size[1] * inp.size[2] / (reorgStride * reorgStride); // (channels*height)/(r*r)
permuteInpShape[1] = reorgStride;
permuteInpShape[2] = inp.size[3]; // width
permuteInpShape[3] = reorgStride;
permuteOutShape.resize(4);
for (int i = 0; i < 4; ++i)
permuteOutShape[i] = permuteInpShape[order[i]];
}
else
{
int order[] = {0, 2, 4, 1, 3};
permParams.set("order", DictValue::arrayInt(&order[0], 5));
permuteInpShape.resize(5);
permuteInpShape[0] = batchSize;
permuteInpShape[1] = inp.size[1] * inp.size[2] / (reorgStride * reorgStride); // (channels*height)/(r*r)
permuteInpShape[2] = reorgStride;
permuteInpShape[3] = inp.size[3]; // width
permuteInpShape[4] = reorgStride;
permuteOutShape.resize(5);
for (int i = 0; i < 5; ++i)
permuteOutShape[i] = permuteInpShape[order[i]];
}
permute = PermuteLayer::create(permParams);
std::vector<Mat> permuteInputs(1, inp.reshape(1, permuteInpShape));
std::vector<Mat> permuteOutputs(1, out.reshape(1, permuteOutShape));
permute->finalize(permuteInputs, permuteOutputs);
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_CUDA ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
}
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
inputs[0] = inputs[0].reshape(1, permuteInpShape.size(), &permuteInpShape[0]);
outputs[0] = outputs[0].reshape(1, permuteOutShape.size(), &permuteOutShape[0]);
permute->preferableTarget = preferableTarget;
permute->forward(inputs, outputs, internals);
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
inputs[0] = inputs[0].reshape(1, permuteInpShape);
outputs[0] = outputs[0].reshape(1, permuteOutShape);
permute->forward(inputs, outputs, internals_arr);
}
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
InferenceEngine::Builder::ReorgYoloLayer ieLayer(name);
ieLayer.setStride(reorgStride);
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
}
#endif // HAVE_DNN_IE_NN_BUILDER_2019
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
auto reorg = std::make_shared<ngraph::op::ReorgYolo>(ieInpNode, ngraph::Strides{(size_t)reorgStride});
return Ptr<BackendNode>(new InfEngineNgraphNode(reorg));
}
#endif // HAVE_DNN_NGRAPH
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
return make_cuda_node<cuda4dnn::ReorgOp>(preferableTarget, std::move(context->stream), reorgStride);
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
CV_UNUSED(outputs); // suppress unused variable warning
int64 flops = 0;
for(int i = 0; i < inputs.size(); i++)
{
flops += 21*total(inputs[i]);
}
return flops;
}
private:
Ptr<PermuteLayer> permute;
std::vector<int> permuteInpShape, permuteOutShape;
};
Ptr<ReorgLayer> ReorgLayer::create(const LayerParams& params)
{
return Ptr<ReorgLayer>(new ReorgLayerImpl(params));
}
} // namespace dnn
} // namespace cv

View File

@ -0,0 +1,367 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/reshape.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
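// Computes a Reshape output shape from the input shape and the "dim" mask.
// Mask entries: a positive value sets that dimension explicitly, 0 copies
// the corresponding input dimension, and a single -1 is inferred so that
// the total number of elements is preserved. E.g. (a sketch)
// srcShape = {2, 3, 4, 5} with mask {0, -1} gives dstShape = {2, 60}.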
static void computeShapeByReshapeMask(const MatShape &srcShape,
const MatShape &maskShape,
Range srcRange /*= Range::all()*/,
MatShape& dstShape)
{
int srcShapeSize = (int)srcShape.size();
int maskShapeSize = (int)maskShape.size();
srcRange = normalize_axis_range(srcRange, srcShapeSize);
bool explicitMask = !maskShape.empty(); // All mask values are positive.
for (int i = 0, n = maskShape.size(); i < n && explicitMask; ++i)
{
explicitMask = maskShape[i] > 0;
}
// The working range of the source shape is the range where total(src) == total(mask).
if (explicitMask)
{
int maskTotal = total(maskShape);
// Walk from the end of the mask until we collect the required total.
bool matched = false;
for (int i = srcRange.end - 1; i >= srcRange.start; --i)
{
if (matched)
{
if (total(srcShape, i, srcRange.end) != maskTotal)
{
srcRange.start = i + 1;
break;
}
else if (i == 0)
{
srcRange.start = 0;
break;
}
}
else
{
matched = total(srcShape, i, srcRange.end) == maskTotal;
}
}
while (total(srcShape, srcRange.start, srcRange.end) != maskTotal && srcRange.start > 0)
{
srcRange.start -= 1;
}
CV_Assert(total(srcShape, srcRange.start, srcRange.end) == maskTotal);
}
CV_Assert(0 <= srcRange.start && srcRange.start <= srcRange.end && srcRange.end <= srcShapeSize);
int dstShapeSize = srcShapeSize - srcRange.size() + maskShapeSize;
dstShape.resize(dstShapeSize);
std::copy(srcShape.begin(), srcShape.begin() + srcRange.start, dstShape.begin());
std::copy(srcShape.begin() + srcRange.end, srcShape.begin() + srcShapeSize, dstShape.begin() + srcRange.start + maskShapeSize);
int inferDim = -1;
for (int i = 0; i < maskShapeSize; i++)
{
if (maskShape[i] > 0)
{
dstShape[srcRange.start + i] = maskShape[i];
}
else if (maskShape[i] == 0)
{
if (srcRange.start + i >= srcShapeSize)
CV_Error(Error::StsBadArg, format("Copy dim[%d] (which has zero size) is out of the source shape bounds", srcRange.start + i));
dstShape[srcRange.start + i] = srcShape[srcRange.start + i];
}
else if (maskShape[i] == -1)
{
if (inferDim != -1)
CV_Error(Error::StsAssert, "Duplicate of inferred dim (which is denoted by -1)");
inferDim = srcRange.start + i;
dstShape[inferDim] = 1;
}
else
CV_Error(Error::StsBadArg, "maskShape[i] >= -1");
}
size_t srcTotal = total(srcShape);
size_t dstTotal = total(dstShape);
CV_Assert(dstTotal != 0);
if (inferDim != -1)
{
if (srcTotal % dstTotal != 0)
CV_Error(Error::StsBackTrace, "Can't infer a dim denoted by -1");
dstShape[inferDim] = (int)(srcTotal / dstTotal);
}
else
{
CV_Assert(srcTotal == dstTotal);
}
}
class ReshapeLayerImpl CV_FINAL : public ReshapeLayer
{
public:
ReshapeLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
int axis = params.get<int>("axis", 0);
int numAxes = params.get<int>("num_axes", -1);
hasDynamicShapes = params.get<bool>("has_dynamic_shapes", false);
shapesInitialized = !hasDynamicShapes;
CV_Assert(numAxes >= -1);
newShapeRange = (numAxes == -1) ? Range(axis, INT_MAX) : Range(axis, axis + numAxes);
newShapeDesc.clear();
if (params.has("dim"))
{
const DictValue &paramShape = params.get("dim");
int i, dims = paramShape.size();
newShapeDesc.resize(dims);
for (i = 0; i < dims; i++)
newShapeDesc[i] = paramShape.get<int>(i);
}
if (hasDynamicShapes)
{
dynamicShapes.clear();
inputIndices.clear();
if (params.has("dynamic_axes")) {
CV_Assert(params.has("input_indices"));
const DictValue &dynamicAxes = params.get("dynamic_axes");
const DictValue &dynamicInputShapes = params.get("input_indices");
int i, dims = dynamicAxes.size();
CV_Assert(dims == dynamicInputShapes.size());
CV_Assert(dims > 0);
dynamicShapes.resize(dims);
inputIndices.resize(dims);
for (i = 0; i < dims; i++) {
dynamicShapes[i] = dynamicAxes.get<int>(i);
inputIndices[i] = dynamicInputShapes.get<int>(i);
}
}
}
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_CUDA ||
((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && haveInfEngine());
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
if (inputs.size() == 1 || inputs.size() == requiredOutputs)
{
outputs.clear();
for (size_t i = 0; i < inputs.size(); i++)
{
if (hasDynamicShapes && !shapesInitialized)
{
outputs.push_back(newShapeDesc);
}
else
{
outputs.push_back(MatShape());
computeShapeByReshapeMask(inputs[i], newShapeDesc, newShapeRange, outputs.back());
}
}
}
else
{
CV_Assert_N(inputs.size() == 2, total(inputs[0]) == total(inputs[1]));
outputs.assign(1, inputs[1]);
}
return true;
}
bool updateMemoryShapes(const std::vector<MatShape> &inputs) CV_OVERRIDE
{
if (hasDynamicShapes)
{
for (int i = 0; i < dynamicShapes.size(); ++i)
{
newShapeDesc[dynamicShapes[i]] = inputs[0][inputIndices[i]];
}
}
shapesInitialized = true;
return true;
}
void finalize(InputArrayOfArrays, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
{
std::vector<Mat> outputs;
outputs_arr.getMatVector(outputs);
CV_Assert(!outputs.empty());
outShapes.resize(outputs.size());
for (int i = 0; i < outputs.size(); ++i)
outShapes[i] = shape(outputs[i]);
}
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
for (size_t i = 0; i < outputs.size(); i++)
{
UMat srcBlob = inputs[i];
void *src_handle = inputs[i].handle(ACCESS_READ);
void *dst_handle = outputs[i].handle(ACCESS_WRITE);
if (src_handle != dst_handle)
{
UMat umat = srcBlob.reshape(1, (int)outShapes[i].size(), &outShapes[i][0]);
umat.copyTo(outputs[i]);
}
}
outs.assign(outputs);
return true;
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
for (size_t i = 0; i < outputs.size(); i++)
{
Mat srcBlob = inputs[i];
if (outputs[i].data != srcBlob.data)
srcBlob.reshape(1, shape(outputs[i])).copyTo(outputs[i]);
}
}
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
{
InferenceEngine::Builder::ReshapeLayer ieLayer(name);
CV_Assert(outShapes.size() == 1);
ieLayer.setDims(outShapes[0]);
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
}
#endif // HAVE_DNN_IE_NN_BUILDER_2019
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
CV_Assert(outShapes.size() == 1);
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
std::vector<int64_t> out(outShapes[0].begin(), outShapes[0].end());
auto shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
ngraph::Shape{out.size()}, out.data());
auto reshape = std::make_shared<ngraph::op::v1::Reshape>(ieInpNode, shape, true);
return Ptr<BackendNode>(new InfEngineNgraphNode(reshape));
}
#endif // HAVE_DNN_NGRAPH
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
return make_cuda_node<cuda4dnn::ReshapeOp>(preferableTarget, std::move(context->stream));
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
private:
std::vector<MatShape> outShapes;
std::vector<int> dynamicShapes; // Which axes shapes are dynamic and require reinitialization with new input
std::vector<int> inputIndices; // Which axes from input are needed to compute correct output shape
bool hasDynamicShapes;
bool shapesInitialized;
};
Ptr<ReshapeLayer> ReshapeLayer::create(const LayerParams& params)
{
return Ptr<ReshapeLayer>(new ReshapeLayerImpl(params));
}
}
}

View File

@ -0,0 +1,486 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include <opencv2/imgproc.hpp>
#ifdef HAVE_DNN_NGRAPH
#include "../ie_ngraph.hpp"
#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
#include <ngraph/op/interpolate.hpp>
#else
#include <ngraph/op/experimental/layers/interpolate.hpp>
#endif
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/resize.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv { namespace dnn {
class ResizeLayerImpl : public ResizeLayer
{
public:
ResizeLayerImpl(const LayerParams& params) : zoomFactorWidth(params.get<float>("zoom_factor_x", params.get<float>("zoom_factor", 0))),
zoomFactorHeight(params.get<float>("zoom_factor_y", params.get<float>("zoom_factor", 0))),
scaleWidth(0), scaleHeight(0)
{
setParamsFrom(params);
outWidth = params.get<float>("width", 0);
outHeight = params.get<float>("height", 0);
if (params.has("zoom_factor"))
{
CV_Assert(!params.has("zoom_factor_x") && !params.has("zoom_factor_y"));
}
else if (params.has("zoom_factor_x") || params.has("zoom_factor_y"))
{
CV_Assert(params.has("zoom_factor_x") && params.has("zoom_factor_y"));
}
interpolation = params.get<String>("interpolation");
CV_Check(interpolation, interpolation == "nearest" || interpolation == "opencv_linear" || interpolation == "bilinear", "");
alignCorners = params.get<bool>("align_corners", false);
halfPixelCenters = params.get<bool>("half_pixel_centers", false);
if (interpolation == "opencv_linear")
halfPixelCenters = true;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert_N(inputs.size() == 1 || inputs.size() == 2, inputs[0].size() == 4);
outputs.resize(1, inputs[0]);
if (inputs.size() == 1) {
outputs[0][2] = zoomFactorHeight > 0 ? (outputs[0][2] * zoomFactorHeight) : outHeight;
outputs[0][3] = zoomFactorWidth > 0 ? (outputs[0][3] * zoomFactorWidth) : outWidth;
} else {
outputs[0][2] = inputs[1][2];
outputs[0][3] = inputs[1][3];
}
// We can work in-place (do nothing) if input shape == output shape.
return (outputs[0][2] == inputs[0][2]) && (outputs[0][3] == inputs[0][3]);
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
if (backendId == DNN_BACKEND_CUDA)
return interpolation == "nearest" || interpolation == "bilinear" || interpolation == "opencv_linear";
#ifdef HAVE_INF_ENGINE
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
{
return (interpolation == "nearest" && scaleWidth == scaleHeight) ||
(interpolation == "bilinear");
}
#endif
return backendId == DNN_BACKEND_OPENCV;
}
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
{
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
outHeight = outputs[0].size[2];
outWidth = outputs[0].size[3];
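// Precompute the output-to-input coordinate scale. With align_corners the
// corner pixels of input and output coincide, giving
// scale = (in - 1) / (out - 1); otherwise scale = in / out.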
if (alignCorners && outHeight > 1)
scaleHeight = static_cast<float>(inputs[0].size[2] - 1) / (outHeight - 1);
else
scaleHeight = static_cast<float>(inputs[0].size[2]) / outHeight;
if (alignCorners && outWidth > 1)
scaleWidth = static_cast<float>(inputs[0].size[3] - 1) / (outWidth - 1);
else
scaleWidth = static_cast<float>(inputs[0].size[3]) / outWidth;
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs, internals;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
internals_arr.getMatVector(internals);
if (outHeight == inputs[0].size[2] && outWidth == inputs[0].size[3])
{
// outputs[0] = inputs[0] doesn't work due to BlobManager optimizations
if (inputs[0].data != outputs[0].data)
{
inputs[0].copyTo(outputs[0]);
}
return;
}
Mat& inp = inputs[0];
Mat& out = outputs[0];
int depth = inp.depth();
if ((interpolation == "nearest" && !alignCorners && !halfPixelCenters) || (interpolation == "opencv_linear" && depth != CV_8S) ||
(interpolation == "bilinear" && halfPixelCenters && depth != CV_8S))
{
// INTER_LINEAR Resize mode does not support INT8 inputs
InterpolationFlags mode = interpolation == "nearest" ? INTER_NEAREST : INTER_LINEAR;
for (size_t n = 0; n < inputs[0].size[0]; ++n)
{
for (size_t ch = 0; ch < inputs[0].size[1]; ++ch)
{
resize(getPlane(inp, n, ch), getPlane(out, n, ch),
Size(outWidth, outHeight), 0, 0, mode);
}
}
}
else if (interpolation == "nearest")
{
const int inpHeight = inp.size[2];
const int inpWidth = inp.size[3];
const int inpSpatialSize = inpHeight * inpWidth;
const int outSpatialSize = outHeight * outWidth;
const int numPlanes = inp.size[0] * inp.size[1];
CV_Assert_N(inp.isContinuous(), out.isContinuous());
Mat inpPlanes = inp.reshape(1, numPlanes * inpHeight);
Mat outPlanes = out.reshape(1, numPlanes * outHeight);
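// Nearest-neighbour mapping: each output coordinate is projected into the
// input as y * scaleHeight (x * scaleWidth); with half_pixel_centers a
// half-source-pixel offset is added and the result is floored, otherwise
// it is rounded. Indices are clamped to the input border.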
float heightOffset = 0.0f;
float widthOffset = 0.0f;
if (halfPixelCenters)
{
heightOffset = 0.5f * scaleHeight;
widthOffset = 0.5f * scaleWidth;
}
if (depth == CV_8S)
{
for (int y = 0; y < outHeight; ++y)
{
float input_y = y * scaleHeight + heightOffset;
int y0 = halfPixelCenters ? std::floor(input_y) : lroundf(input_y);
y0 = std::min(y0, inpHeight - 1);
const int8_t* inpData_row = inpPlanes.ptr<int8_t>(y0);
for (int x = 0; x < outWidth; ++x)
{
float input_x = x * scaleWidth + widthOffset;
int x0 = halfPixelCenters ? std::floor(input_x) : lroundf(input_x);
x0 = std::min(x0, inpWidth - 1);
int8_t* outData = outPlanes.ptr<int8_t>(y, x);
const int8_t* inpData_row_c = inpData_row;
for (int c = 0; c < numPlanes; ++c)
{
*outData = inpData_row_c[x0];
inpData_row_c += inpSpatialSize;
outData += outSpatialSize;
}
}
}
}
else
{
for (int y = 0; y < outHeight; ++y)
{
float input_y = y * scaleHeight + heightOffset;
int y0 = halfPixelCenters ? std::floor(input_y) : lroundf(input_y);
y0 = std::min(y0, inpHeight - 1);
const float* inpData_row = inpPlanes.ptr<float>(y0);
for (int x = 0; x < outWidth; ++x)
{
float input_x = x * scaleWidth + widthOffset;
int x0 = halfPixelCenters ? std::floor(input_x) : lroundf(input_x);
x0 = std::min(x0, inpWidth - 1);
float* outData = outPlanes.ptr<float>(y, x);
const float* inpData_row_c = inpData_row;
for (int c = 0; c < numPlanes; ++c)
{
*outData = inpData_row_c[x0];
inpData_row_c += inpSpatialSize;
outData += outSpatialSize;
}
}
}
}
}
else if (interpolation == "bilinear" || interpolation == "opencv_linear")
{
const int inpHeight = inp.size[2];
const int inpWidth = inp.size[3];
const int inpSpatialSize = inpHeight * inpWidth;
const int outSpatialSize = outHeight * outWidth;
const int numPlanes = inp.size[0] * inp.size[1];
CV_Assert_N(inp.isContinuous(), out.isContinuous());
Mat inpPlanes = inp.reshape(1, numPlanes * inpHeight);
Mat outPlanes = out.reshape(1, numPlanes * outHeight);
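// Each output pixel is a bilinear blend of the 2x2 input neighbourhood
// around (input_x, input_y). The expression below is the expanded form of
// f = (1-dy)(1-dx)*f00 + (1-dy)dx*f01 + dy(1-dx)*f10 + dy*dx*f11
// with dy = input_y - y0, dx = input_x - x0, f00 = row0[x0], f01 = row0[x1],
// f10 = row1[x0], f11 = row1[x1], factored to reuse common terms.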
if (depth == CV_8S)
{
for (int y = 0; y < outHeight; ++y)
{
float input_y = halfPixelCenters ? std::max((y + 0.5f) * scaleHeight - 0.5f, 0.0f) : y * scaleHeight;
int y0 = static_cast<int>(input_y);
const int8_t* inpData_row0 = inpPlanes.ptr<int8_t>(y0);
const int8_t* inpData_row1 = inpPlanes.ptr<int8_t>(std::min(y0 + 1, inpHeight - 1));
for (int x = 0; x < outWidth; ++x)
{
float input_x = halfPixelCenters ? std::max((x + 0.5f) * scaleWidth - 0.5f, 0.0f) : x * scaleWidth;
int x0 = static_cast<int>(input_x);
int x1 = std::min(x0 + 1, inpWidth - 1);
int8_t* outData = outPlanes.ptr<int8_t>(y, x);
const int8_t* inpData_row0_c = inpData_row0;
const int8_t* inpData_row1_c = inpData_row1;
for (int c = 0; c < numPlanes; ++c)
{
*outData = static_cast<int8_t>(inpData_row0_c[x0] +
(input_y - y0) * (inpData_row1_c[x0] - inpData_row0_c[x0]) +
(input_x - x0) * (inpData_row0_c[x1] - inpData_row0_c[x0] +
(input_y - y0) * (inpData_row1_c[x1] - inpData_row0_c[x1] - inpData_row1_c[x0] + inpData_row0_c[x0])));
inpData_row0_c += inpSpatialSize;
inpData_row1_c += inpSpatialSize;
outData += outSpatialSize;
}
}
}
}
else
{
for (int y = 0; y < outHeight; ++y)
{
float input_y = y * scaleHeight;
int y0 = static_cast<int>(input_y);
const float* inpData_row0 = inpPlanes.ptr<float>(y0);
const float* inpData_row1 = inpPlanes.ptr<float>(std::min(y0 + 1, inpHeight - 1));
for (int x = 0; x < outWidth; ++x)
{
float input_x = x * scaleWidth;
int x0 = static_cast<int>(input_x);
int x1 = std::min(x0 + 1, inpWidth - 1);
float* outData = outPlanes.ptr<float>(y, x);
const float* inpData_row0_c = inpData_row0;
const float* inpData_row1_c = inpData_row1;
for (int c = 0; c < numPlanes; ++c)
{
*outData = inpData_row0_c[x0] +
(input_y - y0) * (inpData_row1_c[x0] - inpData_row0_c[x0]) +
(input_x - x0) * (inpData_row0_c[x1] - inpData_row0_c[x0] +
(input_y - y0) * (inpData_row1_c[x1] - inpData_row0_c[x1] - inpData_row1_c[x0] + inpData_row0_c[x0]));
inpData_row0_c += inpSpatialSize;
inpData_row1_c += inpSpatialSize;
outData += outSpatialSize;
}
}
}
}
}
else
CV_Error(Error::StsNotImplemented, "Unknown interpolation: " + interpolation);
}
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
InferenceEngine::Builder::Layer ieLayer(name);
ieLayer.setName(name);
if (interpolation == "nearest")
{
ieLayer.setType("Resample");
ieLayer.getParameters()["type"] = std::string("caffe.ResampleParameter.NEAREST");
ieLayer.getParameters()["antialias"] = false;
if (scaleWidth != scaleHeight)
CV_Error(Error::StsNotImplemented, "resample with sw != sh");
ieLayer.getParameters()["factor"] = 1.0f / scaleWidth;
}
else if (interpolation == "bilinear")
{
ieLayer.setType("Interp");
ieLayer.getParameters()["pad_beg"] = 0;
ieLayer.getParameters()["pad_end"] = 0;
ieLayer.getParameters()["align_corners"] = alignCorners;
}
else
CV_Error(Error::StsNotImplemented, "Unsupported interpolation: " + interpolation);
ieLayer.getParameters()["width"] = outWidth;
ieLayer.getParameters()["height"] = outHeight;
ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(1));
ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
}
#endif // HAVE_DNN_IE_NN_BUILDER_2019
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2021_2)
ngraph::op::InterpolateAttrs attrs;
attrs.pads_begin.push_back(0);
attrs.pads_end.push_back(0);
attrs.axes = ngraph::AxisSet{2, 3};
attrs.align_corners = alignCorners;
if (interpolation == "nearest") {
attrs.mode = "nearest";
attrs.antialias = false;
} else if (interpolation == "bilinear") {
attrs.mode = "linear";
} else {
CV_Error(Error::StsNotImplemented, "Unsupported interpolation: " + interpolation);
}
std::vector<int64_t> shape = {outHeight, outWidth};
auto out_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, shape.data());
auto interp = std::make_shared<ngraph::op::Interpolate>(ieInpNode, out_shape, attrs);
#else
ngraph::op::v4::Interpolate::InterpolateAttrs attrs;
if (interpolation == "nearest") {
attrs.mode = ngraph::op::v4::Interpolate::InterpolateMode::nearest;
attrs.coordinate_transformation_mode = ngraph::op::v4::Interpolate::CoordinateTransformMode::half_pixel;
} else if (interpolation == "bilinear") {
attrs.mode = ngraph::op::v4::Interpolate::InterpolateMode::linear_onnx;
attrs.coordinate_transformation_mode = ngraph::op::v4::Interpolate::CoordinateTransformMode::asymmetric;
} else {
CV_Error(Error::StsNotImplemented, format("Unsupported interpolation: %s", interpolation.c_str()));
}
attrs.shape_calculation_mode = ngraph::op::v4::Interpolate::ShapeCalcMode::sizes;
if (alignCorners) {
attrs.coordinate_transformation_mode = ngraph::op::v4::Interpolate::CoordinateTransformMode::align_corners;
}
attrs.nearest_mode = ngraph::op::v4::Interpolate::NearestMode::round_prefer_floor;
std::vector<int64_t> shape = {outHeight, outWidth};
auto out_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, shape.data());
auto& input_shape = ieInpNode->get_shape();
CV_Assert_N(input_shape[2] != 0, input_shape[3] != 0);
std::vector<float> scales = {static_cast<float>(outHeight) / input_shape[2], static_cast<float>(outWidth) / input_shape[3]};
auto scales_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{2}, scales.data());
auto axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{2, 3});
auto interp = std::make_shared<ngraph::op::v4::Interpolate>(ieInpNode, out_shape, scales_shape, axes, attrs);
#endif
return Ptr<BackendNode>(new InfEngineNgraphNode(interp));
}
#endif // HAVE_DNN_NGRAPH
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
cuda4dnn::ResizeConfiguration config;
if (interpolation == "nearest")
{
config.type = InterpolationType::NEAREST_NEIGHBOUR;
config.align_corners = alignCorners;
config.half_pixel_centers = halfPixelCenters;
}
else if (interpolation == "bilinear")
{
config.type = InterpolationType::BILINEAR;
config.align_corners = alignCorners;
config.half_pixel_centers = halfPixelCenters;
}
else if (interpolation == "opencv_linear")
{
config.type = InterpolationType::BILINEAR;
config.align_corners = false;
config.half_pixel_centers = true;
}
else
CV_Error(Error::StsNotImplemented, "Requested interpolation mode is not available in resize layer.");
return make_cuda_node<cuda4dnn::ResizeOp>(preferableTarget, std::move(context->stream), config);
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
protected:
int outWidth, outHeight;
const float zoomFactorWidth, zoomFactorHeight;
String interpolation;
float scaleWidth, scaleHeight;
bool alignCorners;
bool halfPixelCenters;
};
Ptr<ResizeLayer> ResizeLayer::create(const LayerParams& params)
{
return Ptr<ResizeLayer>(new ResizeLayerImpl(params));
}
class InterpLayerImpl CV_FINAL : public ResizeLayerImpl
{
public:
InterpLayerImpl(const LayerParams& params) : ResizeLayerImpl(params) {}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert_N(inputs.size() == 1, inputs[0].size() == 4);
outputs.resize(1, inputs[0]);
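// Caffe-style Interp: with a zoom factor the output extent is
// 1 + zoom * (in - 1), which keeps corner pixels aligned (create() below
// forces align_corners = true).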
outputs[0][2] = zoomFactorHeight > 0 ? (1 + zoomFactorHeight * (outputs[0][2] - 1)) : outHeight;
outputs[0][3] = zoomFactorWidth > 0 ? (1 + zoomFactorWidth * (outputs[0][3] - 1)) : outWidth;
// We can work in-place (do nothing) if input shape == output shape.
return (outputs[0][2] == inputs[0][2]) && (outputs[0][3] == inputs[0][3]);
}
};
Ptr<Layer> InterpLayer::create(const LayerParams& params)
{
LayerParams lp(params);
lp.set("interpolation", "bilinear");
lp.set("align_corners", true);
return Ptr<Layer>(new InterpLayerImpl(lp));
}
} // namespace dnn
} // namespace cv

View File

@ -0,0 +1,507 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2016, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
/*
Implementation of Scale layer.
*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include <opencv2/imgproc.hpp>
#include <opencv2/dnn/shape_utils.hpp>
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/scale_shift.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class ScaleLayerImpl CV_FINAL : public ScaleLayer
{
public:
ScaleLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
hasBias = params.get<bool>("bias_term", false);
axis = params.get<int>("axis", 1);
hasWeights = false;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
outputs.assign(1, inputs[0]);
return true;
}
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
{
std::vector<Mat> inputs;
inputs_arr.getMatVector(inputs);
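// Two blobs always mean scale + bias. With at most one blob, that blob (or
// the second input when blobs are empty) serves as the bias if bias_term is
// set, and as the scale otherwise.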
hasWeights = blobs.size() == 2 || (blobs.size() <= 1 && !hasBias);
CV_Assert((inputs.size() == 2 && blobs.empty()) || blobs.size() == (int)hasWeights + (int)hasBias);
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_CUDA ||
backendId == DNN_BACKEND_HALIDE ||
(backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && axis == 1 && !blobs.empty()) ||
(backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && axis > 0);
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
CV_Assert_N(outputs.size() == 1, !blobs.empty() || inputs.size() == 2);
Mat &inpBlob = inputs[0];
Mat &outBlob = outputs[0];
// There is a mode in which the first blob is multiplied by the second one
// instead of by trainable weights.
Mat weights = hasWeights ? (blobs.empty() ? inputs[1] : blobs[0]).reshape(1, 1) : Mat();
Mat bias = hasBias ? (blobs.empty() ? inputs[1] : blobs.back()).reshape(1, 1) : Mat();
MatShape inpShape = shape(inpBlob);
const int numWeights = !weights.empty() ? weights.total() : bias.total();
CV_Assert(numWeights != 0);
if (hasWeights && hasBias)
CV_CheckEQ(weights.total(), bias.total(), "Incompatible weights/bias blobs");
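// Find the axis range [axis, endAxis) whose total size equals the number of
// weights; the weights then broadcast over the remaining trailing axes.
// E.g. (a sketch) inpShape = {2, 3, 4, 5}, axis = 1 and 3 weights give
// endAxis = 2: one weight per channel, applied across each 4x5 plane.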
int endAxis;
for (endAxis = axis + 1; endAxis <= inpBlob.dims; ++endAxis)
{
if (total(inpShape, axis, endAxis) == numWeights)
break;
}
CV_Assert(total(inpShape, axis, endAxis) == numWeights);
CV_Assert(!hasBias || numWeights == bias.total());
CV_CheckTypeEQ(inpBlob.type(), CV_32FC1, ""); CV_CheckTypeEQ(outBlob.type(), CV_32FC1, "");
int numSlices = total(inpShape, 0, axis);
float* inpData = (float*)inpBlob.data;
float* outData = (float*)outBlob.data;
if (endAxis != inpBlob.dims)
{
float* weightsData = !weights.empty() ? (float*)weights.data : 0;
float* biasesData = hasBias ? (float*)bias.data : 0;
int spatialSize = total(inpShape, endAxis); // spatialSize != 1
for (int i = 0; i < numSlices; ++i)
{
for (int j = 0; j < numWeights; ++j)
{
float w = weightsData ? weightsData[j] : 1;
float b = biasesData ? biasesData[j] : 0;
Mat inpSlice(1, spatialSize, CV_32F, inpData);
Mat outSlice(1, spatialSize, CV_32F, outData);
inpSlice.convertTo(outSlice, CV_32F, w, b);
inpData += spatialSize;
outData += spatialSize;
}
}
}
else
{
for (int i = 0; i < numSlices; ++i)
{
Mat inpSlice(1, numWeights, CV_32F, inpData);
Mat outSlice(1, numWeights, CV_32F, outData);
if (!weights.empty())
{
multiply(inpSlice, weights, outSlice);
if (hasBias)
add(outSlice, bias, outSlice);
}
else if (hasBias)
add(inpSlice, bias, outSlice);
inpData += numWeights;
outData += numWeights;
}
}
}
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
CV_Assert(!blobs.empty() || inputs.size() == 2);
auto weightsMat = Mat(), biasMat = Mat();
cuda4dnn::ScaleShiftConfiguration config;
if (hasWeights)
{
if (blobs.empty())
{
config.scaleMode = cuda4dnn::ScaleShiftConfiguration::OpMode::UNTRAINABLE;
}
else
{
weightsMat = blobs[0];
config.scaleMode = cuda4dnn::ScaleShiftConfiguration::OpMode::TRAINABLE;
}
}
else
{
config.scaleMode = cuda4dnn::ScaleShiftConfiguration::OpMode::NONE;
}
if (hasBias)
{
if(blobs.empty())
{
config.shiftMode = cuda4dnn::ScaleShiftConfiguration::OpMode::UNTRAINABLE;
}
else
{
/* if the weights are provided, bias will be in blobs[1]; otherwise, it will be in blobs[0]
* in either case, it is at the end of the blobs vector => bias = blobs.back()
*/
biasMat = blobs.back();
config.shiftMode = cuda4dnn::ScaleShiftConfiguration::OpMode::TRAINABLE;
}
}
else
{
config.shiftMode = cuda4dnn::ScaleShiftConfiguration::OpMode::NONE;
}
config.axis = axis;
return make_cuda_node<cuda4dnn::ScaleShiftOp>(preferableTarget, std::move(context->stream), config, weightsMat, biasMat);
}
#endif
virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node) CV_OVERRIDE
{
switch (node->backendId)
{
case DNN_BACKEND_HALIDE:
{
#ifdef HAVE_HALIDE
auto base = node.dynamicCast<HalideBackendNode>();
Halide::Func& input = base->funcs.back();
Halide::Var x("x"), y("y"), c("c"), n("n");
Halide::Func top = attachHalide(input(x, y, c, n));
return Ptr<BackendNode>(new HalideBackendNode(base, top));
#endif // HAVE_HALIDE
break;
}
}
return Ptr<BackendNode>();
}
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
#ifdef HAVE_HALIDE
Halide::Buffer<float> input = halideBuffer(inputs[0]);
Halide::Var x("x"), y("y"), c("c"), n("n");
Halide::Func top = attachHalide(input(x, y, c, n));
return Ptr<BackendNode>(new HalideBackendNode(top));
#endif // HAVE_HALIDE
return Ptr<BackendNode>();
}
#ifdef HAVE_HALIDE
// attachHalide can work with both Halide::Buffer and Halide::Func. In the
// second case the scale/shift is fused into the preceding function.
Halide::Func attachHalide(const Halide::Expr& input)
{
Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
Halide::Var x("x"), y("y"), c("c"), n("n");
const int numChannels = blobs[0].total();
Halide::Expr topExpr = input;
if (hasWeights)
{
auto weights = wrapToHalideBuffer(blobs[0], {numChannels});
topExpr *= weights(c);
}
if (hasBias)
{
auto bias = wrapToHalideBuffer(blobs.back(), {numChannels});
topExpr += bias(c);
}
top(x, y, c, n) = topExpr;
return top;
}
#endif // HAVE_HALIDE
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
InferenceEngine::Builder::Layer l = InferenceEngine::Builder::ScaleShiftLayer(name);
CV_Assert(!blobs.empty());
const size_t numChannels = blobs[0].total();
if (hasWeights)
{
addConstantData("weights", wrapToInfEngineBlob(blobs[0], {numChannels}, InferenceEngine::Layout::C), l);
}
else
{
auto weights = InferenceEngine::make_shared_blob<float>({
InferenceEngine::Precision::FP32, {(size_t)numChannels},
InferenceEngine::Layout::C
});
weights->allocate();
float* buf = weights->buffer().as<float*>();
std::fill(buf, buf + numChannels, 1);
addConstantData("weights", weights, l);
}
if (hasBias)
addConstantData("biases", wrapToInfEngineBlob(blobs.back(), {numChannels}, InferenceEngine::Layout::C), l);
return Ptr<BackendNode>(new InfEngineBackendNode(l));
}
#endif // HAVE_DNN_IE_NN_BUILDER_2019
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto ieInpNode0 = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
auto ieInpNode1 = nodes.size() > 1 ? nodes[1].dynamicCast<InfEngineNgraphNode>()->node : nullptr;
size_t numChannels = 1;
if (blobs.empty())
{
for (const size_t& dim : ieInpNode1->get_shape())
numChannels *= dim;
}
else
numChannels = blobs[0].total();
std::vector<size_t> shape(ieInpNode0->get_shape().size(), 1);
int cAxis = normalize_axis(axis, shape.size());
shape[cAxis] = numChannels;
auto node = ieInpNode0;
if (hasWeights)
{
auto weight = blobs.empty() ? ieInpNode1 :
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), blobs[0].data);
#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2021_2)
node = std::make_shared<ngraph::op::v1::Multiply>(node, weight, ngraph::op::AutoBroadcastType::NUMPY);
#else
node = std::make_shared<ngraph::op::v0::Multiply>(node, weight, ngraph::op::AutoBroadcastType::NUMPY);
#endif
}
if (hasBias || !hasWeights)
{
std::shared_ptr<ngraph::Node> bias;
if (hasBias)
{
bias = blobs.empty() ? ieInpNode1 :
std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
ngraph::Shape(shape), blobs.back().data);
}
else
bias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
ngraph::Shape(shape), std::vector<float>(numChannels, 0).data());
node = std::make_shared<ngraph::op::v1::Add>(node, bias, ngraph::op::AutoBroadcastType::NUMPY);
}
return Ptr<BackendNode>(new InfEngineNgraphNode(node));
}
#endif // HAVE_DNN_NGRAPH
void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE
{
scale = (hasWeights && !blobs.empty()) ? blobs[0] : Mat();
shift = (hasBias && !blobs.empty()) ? blobs.back() : Mat();
}
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
params.set("input_scales", DictValue::arrayReal(scales[0].data(), scales[0].size()));
params.set("input_zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size()));
return true;
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
CV_UNUSED(outputs); // suppress unused variable warning
int64 flops = 0;
for(int i = 0; i < inputs.size(); i++)
{
flops += 2*total(inputs[i]);
}
return flops;
}
private:
bool hasWeights;
};
Ptr<ScaleLayer> ScaleLayer::create(const LayerParams& params)
{
return Ptr<ScaleLayer>(new ScaleLayerImpl(params));
}
Ptr<Layer> ShiftLayer::create(const LayerParams& params)
{
LayerParams scaleParams;
scaleParams.name = params.name;
scaleParams.type = "Scale";
scaleParams.blobs = params.blobs;
scaleParams.set("bias_term", true);
scaleParams.set("axis", 0);
return Ptr<ScaleLayer>(new ScaleLayerImpl(scaleParams));
}
class DataAugmentationLayerImpl CV_FINAL : public DataAugmentationLayer
{
public:
DataAugmentationLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
recompute_mean = params.get<int>("recompute_mean", 1);
CV_CheckGT(recompute_mean, 0, "");
mean_per_pixel = params.get<bool>("mean_per_pixel", false);
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert_N(inputs.size() == 1, blobs.size() == 3);
CV_Assert_N(blobs[0].total() == 1,
blobs[2].total() == inputs[0][1]);
outputs.assign(1, inputs[0]);
return true;
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
CV_Assert_N(outputs.size() == 1, blobs.size() == 3, inputs.size() == 1);
int num_iter = 0;
float* inpData = inputs[0].ptr<float>();
float* outData = outputs[0].ptr<float>();
Mat data_mean_cpu = blobs[1].clone();
Mat mean_resize = Mat(inputs[0].size[3], inputs[0].size[2], CV_32FC3);
Mat mean_3d = Mat(data_mean_cpu.size[3], data_mean_cpu.size[2], CV_32FC3, data_mean_cpu.ptr<float>(0));
resize(mean_3d, mean_resize, Size(inputs[0].size[3], inputs[0].size[2]));
int new_size[] = {1, mean_resize.channels(), mean_resize.cols, mean_resize.rows};
Mat data_mean_cpu_resize = mean_resize.reshape(1, *new_size);
Mat data_mean_per_channel_cpu = blobs[2].clone();
const int numWeights = data_mean_cpu_resize.total();
CV_Assert(numWeights != 0);
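// For the first recompute_mean iterations, update a running mean over
// batches: mean_n = ((n - 1) * mean_{n-1} + batch_mean) / n, then collapse
// it to a per-channel mean by averaging over the spatial area.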
++num_iter;
if (num_iter <= recompute_mean)
{
data_mean_cpu_resize *= (num_iter - 1);
const int batch = inputs[0].size[0];
float alpha = 1.0f / batch;
for (int i = 0; i < batch; ++i)
{
Mat inpSlice(1, numWeights, CV_32F, inpData);
inpSlice = alpha * inpSlice;
add(data_mean_cpu_resize.reshape(1, 1), inpSlice, data_mean_cpu_resize.reshape(1, 1));
inpData += numWeights;
}
data_mean_cpu_resize *= (1.0 / num_iter);
int newsize[] = {inputs[0].size[1], (int)inputs[0].total(2)};
reduce(data_mean_cpu_resize.reshape(1, 2, &newsize[0]), data_mean_per_channel_cpu, 1, REDUCE_SUM, CV_32F);
int area = inputs[0].total(2);
data_mean_per_channel_cpu *= (1.0 / area);
}
MatShape inpShape = shape(inputs[0]);
inpData = inputs[0].ptr<float>();
if (mean_per_pixel)
{
int numSlices = inputs[0].size[0];
for (int i = 0; i < numSlices; ++i)
{
Mat inpSlice(1, numWeights, CV_32F, inpData);
Mat outSlice(1, numWeights, CV_32F, outData);
add(inpSlice, (-1) * data_mean_cpu_resize, outSlice);
inpData += numWeights;
outData += numWeights;
}
}
else
{
int numSlices = inpShape[1];
int count = numWeights / numSlices;
for (int i = 0; i < numSlices; ++i)
{
Mat inpSlice(1, count, CV_32F, inpData);
Mat outSlice(1, count, CV_32F, outData);
float coeff = data_mean_per_channel_cpu.reshape(1, 1).at<float>(0, i);
outSlice = inpSlice - coeff;
inpData += count;
outData += count;
}
}
}
private:
int recompute_mean;
bool mean_per_pixel;
};
Ptr<DataAugmentationLayer> DataAugmentationLayer::create(const LayerParams& params)
{
return Ptr<DataAugmentationLayer>(new DataAugmentationLayerImpl(params));
}
} // namespace dnn
} // namespace cv

View File

@ -0,0 +1,167 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2018, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
#include "../op_cuda.hpp"
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/shuffle_channel.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv { namespace dnn {
class ShuffleChannelLayerImpl CV_FINAL : public ShuffleChannelLayer
{
public:
ShuffleChannelLayerImpl(const LayerParams& params)
{
group = params.get<int>("group", 1);
setParamsFrom(params);
}
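// Channel shuffle regroups channels across groups: view [N, C, H, W] as
// [N, group, C/group, H*W], swap the two middle axes, and flatten back.
// E.g. with group = 2 and channels [a0 a1 a2 b0 b1 b2] the result is
// [a0 b0 a1 b1 a2 b2].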
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_CUDA;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() == 1 && inputs[0].size() == 4);
CV_Assert(inputs[0][1] % group == 0);
Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
return group == 1;
}
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
{
if (group != 1)
{
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
LayerParams lp;
float order[] = {0, 2, 1, 3};
lp.set("order", DictValue::arrayInt(&order[0], 4));
permute = PermuteLayer::create(lp);
const Mat& inp = inputs[0];
const Mat& out = outputs[0];
permuteInpShape.resize(4);
permuteInpShape[0] = inp.size[0];
permuteInpShape[1] = group;
permuteInpShape[2] = inp.size[1] / group;
permuteInpShape[3] = inp.size[2]*inp.size[3];
permuteOutShape.resize(4);
permuteOutShape[0] = permuteInpShape[0];
permuteOutShape[1] = permuteInpShape[2];
permuteOutShape[2] = permuteInpShape[1];
permuteOutShape[3] = permuteInpShape[3];
std::vector<Mat> permuteInputs(1, inp.reshape(1, permuteInpShape));
std::vector<Mat> permuteOutputs(1, out.reshape(1, permuteOutShape));
permute->finalize(permuteInputs, permuteOutputs);
}
}
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
inps.getUMatVector(inputs);
outs.getUMatVector(outputs);
if (inputs[0].u != outputs[0].u)
{
if (!permute.empty())
{
inputs[0] = inputs[0].reshape(1, permuteInpShape.size(), &permuteInpShape[0]);
outputs[0] = outputs[0].reshape(1, permuteOutShape.size(), &permuteOutShape[0]);
permute->preferableTarget = preferableTarget;
permute->forward(inputs, outputs, internals);
}
else
inputs[0].copyTo(outputs[0]);
}
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs, internals;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
internals_arr.getMatVector(internals);
Mat inp = inputs[0];
Mat out = outputs[0];
if (inp.data != out.data)
{
if (!permute.empty())
{
inp = inp.reshape(1, permuteInpShape);
out = out.reshape(1, permuteOutShape);
std::vector<Mat> permuteInputs(1, inp);
std::vector<Mat> permuteOutputs(1, out);
permute->forward(permuteInputs, permuteOutputs, internals);
}
else
inp.copyTo(out);
}
}
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
return make_cuda_node<cuda4dnn::ShuffleChannelOp>(preferableTarget, std::move(context->stream), group);
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
private:
Ptr<PermuteLayer> permute;
std::vector<int> permuteInpShape, permuteOutShape;
};
Ptr<Layer> ShuffleChannelLayer::create(const LayerParams& params)
{
return Ptr<Layer>(new ShuffleChannelLayerImpl(params));
}
} // namespace dnn
} // namespace cv

View File

@ -0,0 +1,821 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "../op_cuda.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include "layers_common.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#include <opencv2/core/utils/logger.hpp>
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/slice.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
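// Normalizes slice parameters against a concrete input shape: a negative
// axis is wrapped to a positive index and each range list is padded in
// front with Range::all(); a negative start in the trailing range is
// converted modulo the axis size (e.g. start = -1 on an axis of size 5
// becomes 4).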
void sliceRangesFromShape(const MatShape& inpShape, int& axis, std::vector<std::vector<cv::Range> >& sliceRanges)
{
CV_Assert(inpShape.size() > 0);
bool axisNeg = (axis < 0);
axis = (axis + static_cast<int>(inpShape.size())) % inpShape.size();
int n = inpShape[axis];
for (size_t i = 0; i < sliceRanges.size(); ++i)
{
std::vector<Range>& ranges = sliceRanges[i];
if (axisNeg)
{
ranges.insert(ranges.begin(), axis, Range::all());
}
Range& range = ranges.back();
if (range.start >= 0)
{
continue;
}
CV_Assert(n != 0);
range.start = (n + range.start) % n;
}
}
class SliceLayerImpl : public SliceLayer
{
public:
SliceLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
hasSteps = false;
axis = params.get<int>("axis", 1);
num_split = params.get<int>("num_split", 0);
hasDynamicShapes = params.get<bool>("has_dynamic_shapes", false);
shapesInitialized = !hasDynamicShapes;
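// Two parameterizations are supported: Caffe-style "slice_point", which
// cuts the target axis at the listed indices into consecutive outputs, and
// ONNX/TensorFlow-style "begin" with either "size" or "end" (and optional
// "steps") describing a single output range per axis.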
if (params.has("slice_point"))
{
CV_Assert(!params.has("begin") && !params.has("size") && !params.has("end"));
const DictValue &indicesValue = params.get("slice_point");
int size = axis > 0 ? axis + 1 : 1;
sliceRanges.resize(indicesValue.size() + 1,
std::vector<Range>(size, Range::all()));
int prevSlice = 0;
for (int i = 0; i < indicesValue.size(); ++i)
{
sliceRanges[i][size - 1].start = prevSlice;
sliceRanges[i][size - 1].end = indicesValue.get<int>(i);
prevSlice = sliceRanges[i][size - 1].end;
}
sliceRanges.back()[size - 1].start = prevSlice;
}
else if (params.has("begin"))
{
CV_Assert(params.has("size") ^ params.has("end"));
const DictValue &begins = params.get("begin");
const DictValue &sizesOrEnds = params.has("size") ? params.get("size") : params.get("end");
CV_Assert(begins.size() == sizesOrEnds.size());
sliceRanges.resize(1);
sliceRanges[0].resize(begins.size(), Range::all());
for (int i = 0; i < begins.size(); ++i)
{
int start = begins.get<int>(i);
int sizeOrEnd = sizesOrEnds.get<int>(i); // It may be negative for reverse indexing.
sliceRanges[0][i].start = start;
if (params.has("size"))
{
int size = sizeOrEnd;
CV_Assert(size == -1 || size > 0); // -1 value means range [start, axis_size).
sliceRanges[0][i].end = size > 0 ? (start + size) : -1; // We'll finalize a negative value later.
}
else
{
int end = sizeOrEnd;
CV_Assert(end < 0 || end > start); // End index is excluded.
sliceRanges[0][i].end = end; // We'll finalize a negative value later.
}
}
if (params.has("steps"))
{
const DictValue &steps = params.get("steps");
sliceSteps.resize(1);
sliceSteps[0].resize(steps.size());
for (int i = 0; i < steps.size(); ++i)
{
int step = steps.get<int>(i);
CV_Assert(step >= 1);
if (step > 1)
hasSteps = true;
sliceSteps[0][i] = step;
}
}
}
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
return INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1) &&
sliceRanges.size() == 1 && sliceRanges[0].size() == 4 && !hasSteps;
#endif
#ifdef HAVE_DNN_NGRAPH
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
return sliceRanges.size() == 1 && !hasSteps;
#endif
#ifdef HAVE_CUDA
if (backendId == DNN_BACKEND_CUDA)
return !hasSteps;
#endif
return backendId == DNN_BACKEND_OPENCV;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() == 1);
MatShape inpShape = inputs[0];
int axis_rw = axis;
std::vector<std::vector<cv::Range> > sliceRanges_rw = sliceRanges;
sliceRangesFromShape(inpShape, axis_rw, sliceRanges_rw);
if (!sliceRanges_rw.empty())
{
outputs.resize(sliceRanges_rw.size(), inpShape);
for (int i = 0; i < outputs.size(); ++i)
{
CV_Assert(sliceRanges_rw[i].size() <= inpShape.size());
for (int j = 0; j < sliceRanges_rw[i].size(); ++j)
{
if (shapesInitialized || inpShape[j] > 0)
outputs[i][j] = normalize_axis_range(sliceRanges_rw[i][j], inpShape[j]).size();
if (!sliceSteps.empty() && (i < sliceSteps.size()) && (j < sliceSteps[i].size()) && (sliceSteps[i][j] > 1))
outputs[i][j] = (outputs[i][j] + sliceSteps[i][j] - 1) / sliceSteps[i][j];
}
}
}
else // Divide the input blob into equal parts along the axis.
{
CV_Assert(0 <= axis_rw && axis_rw < inpShape.size());
int splits = num_split ? num_split : requiredOutputs;
CV_Assert(splits > 0 && inpShape[axis_rw] % splits == 0);
inpShape[axis_rw] /= splits;
outputs.resize(splits, inpShape);
}
return false;
}
bool updateMemoryShapes(const std::vector<MatShape> &inputs) CV_OVERRIDE
{
shapesInitialized = true;
return true;
}
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
{
#ifdef HAVE_OPENCL
ocl_exec_cache.clear();
#endif
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
CV_Assert(inputs.size() == 1);
const MatSize& inpShape = inputs[0].size;
sliceRangesFromShape(shape(inputs[0]), axis, sliceRanges);
finalSliceRanges = sliceRanges;
if (sliceRanges.empty())
{
// Divide the input blob into equal parts along the axis.
int outAxisSize = inpShape[axis] / outputs.size();
finalSliceRanges.resize(outputs.size(),
std::vector<Range>(axis + 1, Range::all()));
int prevSlice = 0;
for (int i = 0; i < outputs.size(); ++i)
{
finalSliceRanges[i][axis].start = prevSlice;
finalSliceRanges[i][axis].end = finalSliceRanges[i][axis].start + outAxisSize;
prevSlice = finalSliceRanges[i][axis].end;
}
}
else
CV_Assert(outputs.size() == sliceRanges.size());
for (int i = 0; i < outputs.size(); ++i)
{
CV_Assert(finalSliceRanges[i].size() <= inpShape.dims());
// Fill the rest of ranges.
for (int j = finalSliceRanges[i].size(); j < inpShape.dims(); ++j)
{
finalSliceRanges[i].push_back(Range::all());
}
// Clamp.
for (int j = 0; j < finalSliceRanges[i].size(); ++j)
{
finalSliceRanges[i][j] = normalize_axis_range(finalSliceRanges[i][j], inpShape[j]);
}
}
if (!sliceSteps.empty() && sliceSteps[0].size() != inputs[0].dims)
sliceSteps[0].resize(inputs[0].dims, 1);
#if 0
std::cout << "DEBUG: DNN/Slice: " << outputs.size() << " inpShape=" << inpShape << std::endl;
for (int i = 0; i < outputs.size(); ++i)
{
for (int j = 0; j < finalSliceRanges[i].size(); ++j)
{
std::cout << finalSliceRanges[i][j];
}
std::cout << std::endl;
}
#endif
}
#ifdef HAVE_OPENCL
struct OpenCLExecInfo
{
std::string kernel_name;
std::string build_opts;
size_t local_size[2];
size_t global_size[2];
OpenCLExecInfo()
{
local_size[0] = local_size[1] = 0;
global_size[0] = global_size[1] = 0;
}
};
std::vector<OpenCLExecInfo> ocl_exec_cache;
void ocl_prepare(const std::vector<UMat>& inputs, const std::vector<UMat>& outputs)
{
CV_TRACE_FUNCTION();
CV_Assert(outputs.size() == finalSliceRanges.size());
ocl_exec_cache.resize(outputs.size());
const UMat& input = inputs[0];
const int dims = input.dims;
size_t WSZ = 128;
const int elemSize = (int)input.elemSize();
String opts0 = cv::format(
"-DDIMS=%d -DELEMSIZE=%d",
dims, elemSize
);
for (int d = 0; d < dims; d++)
{
opts0 += cv::format(" -DSRC_STEP_%d=%d", d, (int)input.step[dims - 1 - d]);
}
for (size_t i = 0; i < outputs.size(); i++)
{
OpenCLExecInfo& ocl = ocl_exec_cache[i];
const UMat& output = outputs[i];
const std::vector<Range>& range = finalSliceRanges[i];
String opts = opts0;
CV_CheckEQ(output.dims, dims, "");
for (int d = 0; d < dims; d++)
{
opts += cv::format(" -DDST_STEP_%d=%d -DDST_SZ_%d=%d -DSRC_START_%d=%d",
d, (int)output.step[dims - 1 - d],
d, (int)output.size[dims - 1 - d],
d, (int)range[dims - 1 - d].start
);
CV_CheckEQ(range[d].size(), (int)output.size[d], "");
}
const size_t param_LIMIT_BLOCK_SIZE_PER_WG = WSZ * 64;
int block_dims = 0;
size_t block_size = elemSize;
for (int i = dims - 1; i >= 0; --i)
{
if (input.step[i] != output.step[i])
break;
block_size *= output.size[i];
block_dims++;
if (block_size >= param_LIMIT_BLOCK_SIZE_PER_WG)
break;
}
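// Worked example (hypothetical sizes): for a float output whose three
// innermost dims are 2x8x16 with matching input/output steps, block_size
// grows as 4 -> 64 -> 512 -> 1024 bytes and block_dims becomes 3.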
const size_t total = output.total() * elemSize;
size_t num_blocks = total / block_size;
if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= param_LIMIT_BLOCK_SIZE_PER_WG))
{
// use 1D copy mode
opts += cv::format(" -DUSE_COPY_1D=1");
opts += cv::format(" -DBLOCK_DIMS=%d", block_dims);
opts += cv::format(" -DBLOCK_DIMS_CONTIGUOUS=%d", block_dims);
opts += cv::format(" -DBLOCK_SIZE=%d", (int)block_size);
opts += cv::format(" -DBLOCK_COLS=%d", (int)block_size);
}
else
{
// use 2D copy mode
int block_cols = block_size;
int block_dims_contiguous = block_dims;
size_t input_base_step = input.step[dims - 1 - block_dims_contiguous];
size_t output_base_step = output.step[dims - 1 - block_dims_contiguous];
size_t block_rows = 1;
for (int i = dims - 1 - block_dims_contiguous; i >= 0; --i)
{
if (input.step[i] * output_base_step != output.step[i] * input_base_step)
break;
block_rows *= output.size[i];
block_dims++;
}
block_size *= block_rows;
num_blocks = total / block_size;
if (block_rows > 1)
{
opts += cv::format(" -DBLOCK_DIMS=%d", block_dims);
opts += cv::format(" -DBLOCK_DIMS_CONTIGUOUS=%d", block_dims_contiguous);
opts += cv::format(" -DBLOCK_SIZE=%d", (int)block_size);
opts += cv::format(" -DBLOCK_COLS=%d", (int)block_cols);
opts += cv::format(" -DBLOCK_ROWS=%d", (int)block_rows);
opts += cv::format(" -DBLOCK_SRC_STRIDE=%d", (int)input_base_step);
}
else
{
// use 1D copy mode
opts += cv::format(" -DUSE_COPY_1D=1");
opts += cv::format(" -DBLOCK_DIMS=%d", block_dims_contiguous);
opts += cv::format(" -DBLOCK_DIMS_CONTIGUOUS=%d", block_dims_contiguous);
opts += cv::format(" -DBLOCK_SIZE=%d", (int)block_size);
opts += cv::format(" -DBLOCK_COLS=%d", (int)block_size);
}
}
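// At this point one of two copy modes is configured: USE_COPY_1D moves
// num_blocks contiguous runs of block_size bytes, while the 2D mode moves
// block_rows strided rows of block_cols bytes per block.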
const size_t MIN_WORK_ITEMS = 16;
if (block_size <= 4 * MIN_WORK_ITEMS)
WSZ = 4;
else if (block_size <= 8 * MIN_WORK_ITEMS)
WSZ = 8;
else if (block_size <= 16 * MIN_WORK_ITEMS)
WSZ = 16;
else if (block_size <= 32 * MIN_WORK_ITEMS)
WSZ = 32;
else if (block_size <= 64 * MIN_WORK_ITEMS)
WSZ = 64;
opts += cv::format(" -DWSZ=%d", (int)WSZ);
std::ostringstream kernel_suffix;
kernel_suffix << dims << 'x' << elemSize << "_bsz" << block_size;
kernel_suffix << "__src_";
for (int d = 0; d < dims; d++)
{
kernel_suffix << input.size[dims - 1 - d] << '_';
}
kernel_suffix << '_';
/*for (int d = 0; d < dims; d++)
{
kernel_suffix << input.step[dims - 1 - d] << '_';
}
kernel_suffix << '_';*/
kernel_suffix << "dst_";
for (int d = 0; d < dims; d++)
{
kernel_suffix << output.size[dims - 1 - d] << '_';
}
/*kernel_suffix << '_';
for (int d = 0; d < dims; d++)
{
kernel_suffix << output.step[dims - 1 - d] << '_';
}*/
kernel_suffix << "_slice_";
for (int d = 0; d < dims; d++)
{
kernel_suffix << range[dims - 1 - d].start << '_';
}
for (int d = 0; d < dims; d++)
{
kernel_suffix << '_' << range[dims - 1 - d].end;
}
std::string kernel_suffix_str = kernel_suffix.str();
opts += cv::format(" -DSLICE_KERNEL_SUFFIX=%s", kernel_suffix_str.c_str());
ocl.kernel_name = cv::format("slice_%s", kernel_suffix_str.c_str());
ocl.build_opts = opts;
ocl.local_size[0] = WSZ;
ocl.local_size[1] = 1;
ocl.global_size[0] = WSZ;
ocl.global_size[1] = num_blocks;
} // for outputs.size()
} // ocl_prepare
bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{
CV_TRACE_FUNCTION();
if (hasSteps)
return false; // TODO not implemented yet: https://github.com/opencv/opencv/pull/19546
std::vector<UMat> inputs;
std::vector<UMat> outputs;
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
CV_Assert(outputs.size() == finalSliceRanges.size());
const UMat& input = inputs[0];
const int dims = input.dims;
if (dims > 5)
{
CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << dims << ". Fallback to CPU");
return false;
}
if (ocl_exec_cache.empty())
{
ocl_prepare(inputs, outputs);
}
CV_CheckEQ(ocl_exec_cache.size(), outputs.size(), "");
for (size_t i = 0; i < outputs.size(); i++)
{
const OpenCLExecInfo& ocl = ocl_exec_cache[i];
UMat& output = outputs[i];
ocl::Kernel kernel(ocl.kernel_name.c_str(), ocl::dnn::slice_oclsrc, ocl.build_opts);
if (kernel.empty())
return false;
bool ret = kernel.args(
ocl::KernelArg::PtrReadOnly(input),
ocl::KernelArg::PtrWriteOnly(output)
)
.run_(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
if (!ret)
return false;
} // for outputs.size()
return true;
} // forward_ocl
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
const Mat& inpMat = inputs[0];
CV_Assert(outputs.size() == finalSliceRanges.size());
if (!hasSteps)
{
for (size_t i = 0; i < outputs.size(); i++)
{
inpMat(finalSliceRanges[i]).copyTo(outputs[i]);
}
}
else
{
int dimsNum = inpMat.dims;
for (size_t i = 0; i < outputs.size(); i++)
{
std::vector<int> inpIdx(dimsNum, 0);
std::vector<int> outIdx(dimsNum, 0);
if (inpMat.type() == CV_16S)
getSliceRecursive<int16_t>(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx);
else if (inpMat.type() == CV_8S)
getSliceRecursive<int8_t>(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx);
else
getSliceRecursive<float>(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx);
}
}
}
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1)
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
{
CV_Assert_N(finalSliceRanges.size() == 1, inputs.size() <= 2);
std::vector<size_t> axes, offsets, dims;
int from, to, step;
int numDims = finalSliceRanges[0].size();
if (preferableTarget == DNN_TARGET_MYRIAD || preferableTarget == DNN_TARGET_HDDL)
{
from = axis;
to = numDims;
step = 1;
}
else
{
from = numDims - 1;
to = axis - 1;
step = -1;
}
for (int i = from; i != to; i += step)
{
axes.push_back(i);
offsets.push_back(finalSliceRanges[0][i].start);
dims.push_back(finalSliceRanges[0][i].size());
}
InferenceEngine::Builder::Layer ieLayer(name);
ieLayer.setName(name);
ieLayer.setType("Crop");
ieLayer.getParameters()["axis"] = axes;
ieLayer.getParameters()["dim"] = dims;
ieLayer.getParameters()["offset"] = offsets;
ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(2));
ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
if (inputs.size() != 2)
{
std::vector<size_t> outShape(numDims);
for (int i = 0; i < numDims; ++i)
outShape[i] = finalSliceRanges[0][i].size();
ieLayer.getInputPorts()[1].setParameter("type", "weights");
auto shapeSource = InferenceEngine::make_shared_blob<float>({
InferenceEngine::Precision::FP32, outShape,
InferenceEngine::Layout::ANY
});
shapeSource->allocate();
addConstantData("weights", shapeSource, ieLayer);
}
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
}
#endif
#endif
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
CV_Assert_N(nodes.size() <= 2);
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
CV_Assert(finalSliceRanges[0].size() == ieInpNode->get_shape().size());
std::vector<int64_t> offsets, dims;
for (int i = 0; i < finalSliceRanges[0].size(); ++i)
{
offsets.push_back(finalSliceRanges[0][i].start);
dims.push_back(finalSliceRanges[0][i].end);
}
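// The end values in finalSliceRanges are already exclusive upper bounds,
// so they map directly onto StridedSlice's upper_bounds; strides are fixed
// to 1 because supportBackend() rejects stepped slices for this backend.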
auto lower_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
ngraph::Shape{offsets.size()}, offsets.data());
auto upper_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
ngraph::Shape{dims.size()}, dims.data());
auto strides = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
ngraph::Shape{dims.size()}, std::vector<int64_t>((int64_t)dims.size(), 1));
auto slice = std::make_shared<ngraph::op::v1::StridedSlice>(ieInpNode,
lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});
return Ptr<BackendNode>(new InfEngineNgraphNode(slice));
}
#endif // HAVE_DNN_NGRAPH
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
std::vector<std::vector<std::size_t>> offsets;
for (const auto& ranges : finalSliceRanges)
{
std::vector<std::size_t> offsets_i;
for (const auto& range : ranges)
offsets_i.push_back(range.start);
offsets.push_back(std::move(offsets_i));
}
return make_cuda_node<cuda4dnn::SliceOp>(preferableTarget, std::move(context->stream), std::move(offsets));
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
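// Slicing is quantization-transparent only if every output keeps the
// input's scale; returning false below rejects int8 execution when the
// scales disagree.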
const int numOutputs = scales[1].size();
for (int i = 0; i < numOutputs; i++)
{
if (scales[1][i] != scales[0][0])
return false;
}
return true;
}
private:
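// Copies inpMat restricted to sliceRanges (with the given per-axis steps)
// into outputs by recursing over one dimension per call. For example
// (hypothetical), range [0, 5) with step 2 maps input indices {0, 2, 4}
// to output indices {0, 1, 2}.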
template <typename T>
void getSliceRecursive(const Mat &inpMat, std::vector<int> &inpIdx,
const std::vector<Range> &sliceRanges,
const std::vector<int> &sliceSteps, int dim, int dimsNum,
Mat &outputs, std::vector<int> &outIdx)
{
int begin = sliceRanges[dim].start;
int end = sliceRanges[dim].end;
int step = !sliceSteps.empty() ? sliceSteps[dim] : 1;
// TODO: optimize (at least the 2D tail case)
for (int k = begin, j = 0; k < end; k += step, j++)
{
inpIdx[dim] = k;
outIdx[dim] = j;
if (dim + 1 < dimsNum)
getSliceRecursive<T>(inpMat, inpIdx, sliceRanges, sliceSteps, dim + 1, dimsNum, outputs, outIdx);
else
outputs.at<T>(outIdx.data()) = inpMat.at<T>(inpIdx.data());
}
}
protected:
// The actual non-negative values determined from @p sliceRanges depend on the input size.
std::vector<std::vector<Range> > finalSliceRanges;
bool hasDynamicShapes;
bool shapesInitialized;
bool hasSteps;
};
class CropLayerImpl CV_FINAL : public SliceLayerImpl
{
public:
CropLayerImpl(const LayerParams& params) : SliceLayerImpl(LayerParams())
{
setParamsFrom(params);
axis = params.get<int>("axis", 2);
const DictValue *paramOffset = params.ptr("offset");
if (paramOffset)
{
for (int i = 0; i < paramOffset->size(); i++)
offset.push_back(paramOffset->get<int>(i));
}
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() == 2);
MatShape dstShape = inputs[0];
int start = normalize_axis(axis, dstShape);
for (int i = start; i < dstShape.size(); i++)
{
dstShape[i] = inputs[1][i];
}
outputs.resize(1, dstShape);
return false;
}
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
{
std::vector<Mat> inputs;
inputs_arr.getMatVector(inputs);
CV_Assert(2 == inputs.size());
const Mat &inpBlob = inputs[0];
const Mat &inpSzBlob = inputs[1];
int dims = inpBlob.dims;
int start_axis = normalize_axis(axis, dims);
std::vector<int> offset_final(dims, 0);
if (offset.size() == 1)
{
for (int i = start_axis; i < dims; i++)
offset_final[i] = offset[0];
}
else if (offset.size() > 1)
{
if ((int)offset.size() != dims - start_axis)
CV_Error(Error::StsBadArg, "number of offset values specified must be "
"equal to the number of dimensions following axis.");
for (int i = start_axis; i < dims; i++)
offset_final[i] = offset[i - start_axis];
}
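// Illustrative example (hypothetical values): with axis = 2 and
// offset = [1, 2] on a 4-D blob, cropping starts at row 1 and column 2;
// a single offset value would be broadcast to every cropped dimension.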
finalSliceRanges.resize(1);
finalSliceRanges[0].resize(dims);
for (int i = 0; i < start_axis; i++)
{
finalSliceRanges[0][i] = Range(0, inpBlob.size[i]);
}
for (int i = start_axis; i < dims; i++)
{
if (offset_final[i] < 0 || offset_final[i] + inpSzBlob.size[i] > inpBlob.size[i])
CV_Error(Error::StsBadArg, "invalid crop parameters or blob sizes");
finalSliceRanges[0][i] = Range(offset_final[i], offset_final[i] + inpSzBlob.size[i]);
}
}
private:
std::vector<int> offset;
};
Ptr<SliceLayer> SliceLayer::create(const LayerParams& params)
{
return Ptr<SliceLayer>(new SliceLayerImpl(params));
}
Ptr<Layer> CropLayer::create(const LayerParams& params)
{
return Ptr<Layer>(new CropLayerImpl(params));
}
}
}


@@ -0,0 +1,416 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include "../op_vkcom.hpp"
#include <algorithm>
#include <stdlib.h>
using std::max;
#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
using namespace cv::dnn::ocl4dnn;
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/softmax.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class SoftMaxLayerImpl CV_FINAL : public SoftmaxLayer
{
public:
SoftMaxLayerImpl(const LayerParams& params)
{
axisRaw = params.get<int>("axis", 1);
logSoftMax = params.get<bool>("log_softmax", false);
setParamsFrom(params);
}
#ifdef HAVE_OPENCL
Ptr<OCL4DNNSoftmax<float> > softmaxOp;
#endif
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
bool inplace = Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
MatShape shape = inputs[0];
int cAxis = normalize_axis(axisRaw, shape.size());
shape[cAxis] = 1;
internals.assign(1, shape);
return inplace;
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_CUDA ||
(backendId == DNN_BACKEND_HALIDE && haveHalide() && axisRaw == 1) ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ||
(backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && haveInfEngine() && !logSoftMax) ||
(backendId == DNN_BACKEND_VKCOM && haveVulkan());
}
#ifdef HAVE_OPENCL
virtual void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs) CV_OVERRIDE
{
softmaxOp.release();
}
bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{
std::vector<UMat> inputs;
std::vector<UMat> outputs;
std::vector<UMat> internals;
bool use_half = (inputs_.depth() == CV_16S);
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
internals_.getUMatVector(internals);
UMat& src = inputs[0];
UMat& dstMat = outputs[0];
int axis = normalize_axis(axisRaw, src.dims);
if (softmaxOp.empty())
{
OCL4DNNSoftmaxConfig config;
config.in_shape = shape(inputs[0]);
config.axis = axis;
config.channels = inputs[0].size[axis];
config.logsoftmax = logSoftMax;
config.use_half = use_half;
softmaxOp = Ptr<OCL4DNNSoftmax<float> >(new OCL4DNNSoftmax<float>(config));
}
if (softmaxOp->Forward(src, dstMat))
return true;
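// Fallback path: compute softmax with the generic OpenCL kernels below,
// one kernel each for the per-channel max, max subtraction, summation and
// the final normalization.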
UMat& bufMat = internals[0];
MatShape s = shape(src);
size_t outerSize = total(s, 0, axis);
size_t channels = src.size[axis];
size_t innerSize = total(s, axis + 1);
String buildOpts = format("-DT=%s", use_half ? "half" : "float");
ocl::Kernel kmax, ksub, ksum, kdiv;
if (!kmax.create("kernel_channel_max", ocl::dnn::softmax_oclsrc, buildOpts))
return false;
if (!ksub.create("kernel_channel_subtract", ocl::dnn::softmax_oclsrc, buildOpts))
return false;
if (!ksum.create("kernel_channel_sum", ocl::dnn::softmax_oclsrc, buildOpts))
return false;
if (logSoftMax) buildOpts += " -DLOG_SOFTMAX ";
if (!kdiv.create("kernel_channel_div", ocl::dnn::softmax_oclsrc, buildOpts))
return false;
size_t bufSize = internals[0].total();
size_t totalSize = src.total();
size_t internal_globalSize[1] = { bufSize };
size_t total_globalSize[1] = { totalSize };
kmax.args((int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrReadWrite(bufMat));
if (!kmax.run(1, internal_globalSize, NULL, false))
return false;
ksub.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(bufMat),
ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrWriteOnly(dstMat));
if (!ksub.run(1, total_globalSize, NULL, false))
return false;
ksum.args((int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat));
if (!ksum.run(1, internal_globalSize, NULL, false))
return false;
kdiv.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat));
if (!kdiv.run(1, total_globalSize, NULL, false))
return false;
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)
{
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs, internals;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
internals_arr.getMatVector(internals);
const Mat &src = inputs[0];
Mat &dst = outputs[0];
int axis = normalize_axis(axisRaw, src.dims);
size_t outerSize = src.total(0, axis), channels = src.size[axis],
innerSize = src.total(axis + 1);
CV_Assert(src.type() == CV_32F);
CV_Assert(src.isContinuous() && dst.isContinuous());
const float *srcPtr = src.ptr<float>();
float *dstPtr = dst.ptr<float>();
float *bufPtr = internals[0].ptr<float>();
size_t outerStep = src.total(axis);
size_t cnStep = src.total(axis + 1);
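// Computes softmax(x)_c = exp(x_c - m) / sum_k exp(x_k - m) with
// m = max_k x_k, independently for every (outer, inner) position;
// subtracting the maximum first keeps exp() from overflowing. When
// logSoftMax is set, log() is applied to the result afterwards.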
// compute max along axis
for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
{
size_t srcOffset = outerDim * outerStep;
size_t bufOffset = outerDim * cnStep;
memcpy(bufPtr + bufOffset, srcPtr + srcOffset, innerSize * sizeof(float));
for (size_t cnDim = 1; cnDim < channels; cnDim++)
{
for (size_t i = 0; i < innerSize; i++)
bufPtr[bufOffset + i] = std::max(bufPtr[bufOffset + i], srcPtr[srcOffset + cnDim * cnStep + i]);
}
}
// subtract max
for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
{
size_t srcOffset = outerDim * outerStep;
size_t bufOffset = outerDim * cnStep;
for (size_t cnDim = 0; cnDim < channels; cnDim++)
{
const int offset = srcOffset + cnDim * cnStep;
for (size_t i = 0; i < innerSize; i++)
dstPtr[offset + i] = srcPtr[offset + i] - bufPtr[bufOffset + i];
}
}
cv::exp(dst, dst);
for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
{
size_t srcOffset = outerDim * outerStep;
size_t bufOffset = outerDim * cnStep;
// sum exp along axis
for (size_t i = 0; i < innerSize; i++)
bufPtr[bufOffset + i] = 0.f;
for (size_t cnDim = 0; cnDim < channels; cnDim++)
{
const int offset = srcOffset + cnDim * cnStep;
for (size_t i = 0; i < innerSize; i++)
bufPtr[bufOffset + i] += dstPtr[offset + i];
}
// divide by computed sum
for (size_t cnDim = 0; cnDim < channels; cnDim++)
{
const int offset = srcOffset + cnDim * cnStep;
for (size_t i = 0; i < innerSize; i++)
dstPtr[offset + i] /= bufPtr[bufOffset + i];
}
if (logSoftMax)
{
for (size_t cnDim = 0; cnDim < channels; cnDim++)
{
const int offset = srcOffset + cnDim * cnStep;
for (size_t i = 0; i < innerSize; i++)
dstPtr[offset + i] = log(dstPtr[offset + i]);
}
}
}
}
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
auto channel_axis = normalize_axis(axisRaw, input_wrapper->getRank());
return make_cuda_node<cuda4dnn::SoftmaxOp>(preferableTarget, std::move(context->cudnn_handle), channel_axis, logSoftMax);
}
#endif
virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
#ifdef HAVE_VULKAN
vkcom::Tensor in = VkComTensor(inputs[0]);
int cAxis = normalize_axis(axisRaw, in.dimNum());
std::shared_ptr<vkcom::OpBase> op(new vkcom::OpSoftmax(cAxis, logSoftMax));
return Ptr<BackendNode>(new VkComBackendNode(inputs, op));
#endif // HAVE_VULKAN
return Ptr<BackendNode>();
}
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
{
#ifdef HAVE_HALIDE
Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
int inW, inH, inC, inN;
getCanonicalSize(inputBuffer, &inW, &inH, &inC, &inN);
if (inW != 1 || inH != 1)
CV_Error(cv::Error::StsNotImplemented,
"Halide backend for SoftMax with spatial size "
"more than 1x1 is not implemented");
Halide::Var x("x"), y("y"), c("c"), n("n");
Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
Halide::Func expInput("expInput");
Halide::RDom r(0, inW, 0, inH, 0, inC);
expInput(x, y, c, n) = exp(inputBuffer(x, y, c, n));
Halide::Expr globalSum = sum(expInput(r.x, r.y, r.z, n));
top(x, y, c, n) = expInput(x, y, c, n) / globalSum;
return Ptr<BackendNode>(new HalideBackendNode(top));
#endif // HAVE_HALIDE
return Ptr<BackendNode>();
}
#ifdef HAVE_DNN_IE_NN_BUILDER_2019
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
{
InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
InferenceEngine::Builder::SoftMaxLayer ieLayer(name);
ieLayer.setAxis(normalize_axis(axisRaw, input->getDims().size()));
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
}
#endif // HAVE_DNN_IE_NN_BUILDER_2019
#ifdef HAVE_DNN_NGRAPH
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
int axis = normalize_axis(axisRaw, ieInpNode->get_shape().size());
auto softmax = std::make_shared<ngraph::op::v1::Softmax>(ieInpNode, axis);
if (logSoftMax)
return Ptr<BackendNode>(new InfEngineNgraphNode(std::make_shared<ngraph::op::v0::Log>(softmax)));
return Ptr<BackendNode>(new InfEngineNgraphNode(softmax));
}
#endif // HAVE_DNN_NGRAPH
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
float inpScale = scales[0][0];
Mat lookUpTable(1, 256, CV_32F);
float* table = lookUpTable.ptr<float>();
for (int i = -128; i < 128; i++)
{
float x = inpScale*(i - 127); // ensures exp(x) is always in (0, 1]
table[i+128] = std::exp(x);
}
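// The table maps a quantized input value v in [-128, 127] (stored at
// index v + 128) to exp(inpScale * (v - 127)); the stored blob presumably
// lets the int8 softmax kernel evaluate exp() by lookup.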
params.blobs.clear();
params.blobs.push_back(lookUpTable);
return true;
}
int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
CV_UNUSED(outputs); // suppress unused variable warning
int64 flops = 0;
for (int i = 0; i < inputs.size(); i++)
{
flops += 4*total(inputs[i]);
}
return flops;
}
int axisRaw;
};
Ptr<SoftmaxLayer> SoftmaxLayer::create(const LayerParams& params)
{
return Ptr<SoftmaxLayer>(new SoftMaxLayerImpl(params));
}
}
}


@@ -0,0 +1,139 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#include "../op_cuda.hpp"
#include "layers_common.hpp"
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/split.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
namespace cv
{
namespace dnn
{
class SplitLayerImpl CV_FINAL : public SplitLayer
{
public:
SplitLayerImpl(const LayerParams &params)
{
setParamsFrom(params);
// TODO: the "top_count" param may be redundant, since it can be derived from the number of output connections
if (params.has("top_count"))
{
outputsCount = params.get<int>("top_count");
CV_Assert(outputsCount >= 0);
}
else
{
outputsCount = -1;
}
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV ||
backendId == DNN_BACKEND_CUDA;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() == 1);
Layer::getMemoryShapes(inputs, max(1, outputsCount >= 0 ? outputsCount : requiredOutputs),
outputs, internals);
return false;
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
for (size_t i = 0; i < outputs.size(); i++)
{
CV_Assert(inputs[0].total() == outputs[i].total());
inputs[0].copyTo(outputs[i]);
}
}
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(
void *context_,
const std::vector<Ptr<BackendWrapper>>& inputs,
const std::vector<Ptr<BackendWrapper>>& outputs
) override
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
return make_cuda_node<cuda4dnn::SplitOp>(preferableTarget, std::move(context->stream));
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
const int numOutputs = scales[1].size();
for (int i = 0; i < numOutputs; i++)
{
if (scales[1][i] != scales[0][0])
return false;
}
return true;
}
};
Ptr<SplitLayer> SplitLayer::create(const LayerParams& params)
{
return Ptr<SplitLayer>(new SplitLayerImpl(params));
}
}
}