feat: switch backend to PaddleOCR-NCNN, switch project to CMake

1. Migrated the entire project backend to the PaddleOCR-NCNN algorithm; it has passed basic compatibility tests
2. The project is now organized with CMake; to better accommodate third-party libraries going forward, a QMake project is no longer provided
3. Reorganized the rights/license declaration files and the code layout to minimize infringement risk

Log: switch backend to PaddleOCR-NCNN, switch project to CMake
Change-Id: I4d5d2c5d37505a4a24b389b1a4c5d12f17bfa38c
wangzhengyang
2022-05-10 09:54:44 +08:00
parent ecdd171c6f
commit 718c41634f
10018 changed files with 3593797 additions and 186748 deletions


@@ -0,0 +1,200 @@
if(WINRT)
ocv_module_disable(dnn)
endif()
if(NOT HAVE_PROTOBUF)
ocv_module_disable(opencv_dnn)
endif()
set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass")
ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV)
ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX)
ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js)
ocv_option(OPENCV_DNN_OPENCL "Build with OpenCL support" HAVE_OPENCL AND NOT APPLE)
if(HAVE_TENGINE)
add_definitions(-DHAVE_TENGINE=1)
endif()
if(OPENCV_DNN_OPENCL AND HAVE_OPENCL)
add_definitions(-DCV_OCL4DNN=1)
endif()
ocv_option(OPENCV_DNN_CUDA "Build with CUDA support"
HAVE_CUDA
AND HAVE_CUBLAS
AND HAVE_CUDNN
)
if(OPENCV_DNN_CUDA)
if(HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
add_definitions(-DCV_CUDA4DNN=1)
else()
if(NOT HAVE_CUDA)
message(SEND_ERROR "DNN: CUDA backend requires CUDA Toolkit. Please resolve dependency or disable OPENCV_DNN_CUDA=OFF")
elseif(NOT HAVE_CUBLAS)
message(SEND_ERROR "DNN: CUDA backend requires cuBLAS. Please resolve dependency or disable OPENCV_DNN_CUDA=OFF")
elseif(NOT HAVE_CUDNN)
message(SEND_ERROR "DNN: CUDA backend requires cuDNN. Please resolve dependency or disable OPENCV_DNN_CUDA=OFF")
endif()
endif()
endif()
ocv_cmake_hook_append(INIT_MODULE_SOURCES_opencv_dnn "${CMAKE_CURRENT_LIST_DIR}/cmake/hooks/INIT_MODULE_SOURCES_opencv_dnn.cmake")
if(MSVC)
add_definitions( -D_CRT_SECURE_NO_WARNINGS=1 )
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4244 /wd4267 /wd4018 /wd4355 /wd4800 /wd4251 /wd4996 /wd4146
/wd4305 /wd4127 /wd4100 /wd4512 /wd4125 /wd4389 /wd4510 /wd4610
/wd4702 /wd4456 /wd4457 /wd4065 /wd4310 /wd4661 /wd4506
)
else()
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-deprecated -Wmissing-prototypes -Wmissing-declarations -Wshadow
-Wunused-parameter -Wsign-compare
)
endif()
if(HAVE_CUDA)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
endif()
if(NOT HAVE_CXX11)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-undef) # LANG_CXX11 from protobuf files
endif()
if(APPLE_FRAMEWORK)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshorten-64-to-32)
endif()
if(ANDROID)
add_definitions(-DDISABLE_POSIX_MEMALIGN -DTH_DISABLE_HEAP_TRACKING)
endif()
if(NOT BUILD_PROTOBUF)
add_definitions(-DOPENCV_DNN_EXTERNAL_PROTOBUF=1)
endif()
add_definitions(-DHAVE_PROTOBUF=1)
#suppress warnings in autogenerated caffe.pb.* files
ocv_warnings_disable(CMAKE_CXX_FLAGS
/wd4125 /wd4267 /wd4127 /wd4244 /wd4512 /wd4702
/wd4456 /wd4510 /wd4610 /wd4800
/wd4701 /wd4703 # potentially uninitialized local/pointer variable 'value' used
/wd4505 # unreferenced local function has been removed
/wd4458 # declaration of 'x' hides class member. GCC still works, MSVC bug is here: https://developercommunity.visualstudio.com/content/problem/219311/c-c4458-declaration-hides-class-member-warning-iss.html
-wd858 -wd2196
-Winvalid-offsetof # Apple Clang (attr_value.pb.cc)
)
set(include_dirs "")
set(libs "")
if(PROTOBUF_UPDATE_FILES)
file(GLOB proto_files "${CMAKE_CURRENT_LIST_DIR}/src/tensorflow/*.proto" "${CMAKE_CURRENT_LIST_DIR}/src/caffe/opencv-caffe.proto" "${CMAKE_CURRENT_LIST_DIR}/src/onnx/opencv-onnx.proto")
set(PROTOBUF_GENERATE_CPP_APPEND_PATH ON) # required for tensorflow
protobuf_generate_cpp(fw_srcs fw_hdrs ${proto_files})
else()
file(GLOB fw_srcs "${CMAKE_CURRENT_LIST_DIR}/misc/tensorflow/*.cc" "${CMAKE_CURRENT_LIST_DIR}/misc/caffe/opencv-caffe.pb.cc" "${CMAKE_CURRENT_LIST_DIR}/misc/onnx/opencv-onnx.pb.cc")
file(GLOB fw_hdrs "${CMAKE_CURRENT_LIST_DIR}/misc/tensorflow/*.h" "${CMAKE_CURRENT_LIST_DIR}/misc/caffe/opencv-caffe.pb.h" "${CMAKE_CURRENT_LIST_DIR}/misc/onnx/opencv-onnx.pb.h")
set(fw_inc "${CMAKE_CURRENT_LIST_DIR}/misc/caffe" "${CMAKE_CURRENT_LIST_DIR}/misc/tensorflow" "${CMAKE_CURRENT_LIST_DIR}/misc/onnx")
endif()
list(APPEND include_dirs ${fw_inc})
list(APPEND libs ${Protobuf_LIBRARIES})
if(NOT BUILD_PROTOBUF)
list(APPEND include_dirs ${Protobuf_INCLUDE_DIRS})
endif()
set(sources_options "")
list(APPEND libs ${LAPACK_LIBRARIES})
if(OPENCV_DNN_OPENCL AND HAVE_OPENCL)
list(APPEND include_dirs ${OPENCL_INCLUDE_DIRS})
else()
set(sources_options EXCLUDE_OPENCL)
endif()
if(OPENCV_DNN_CUDA AND HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
list(APPEND include_dirs ${CUDA_TOOLKIT_INCLUDE} ${CUDNN_INCLUDE_DIRS})
set(CC_LIST ${CUDA_ARCH_BIN})
separate_arguments(CC_LIST)
foreach(cc ${CC_LIST})
if(cc VERSION_LESS 3.0)
message(FATAL_ERROR "CUDA backend for DNN module requires CC 3.0 or higher. Please remove unsupported architectures from CUDA_ARCH_BIN option or disable OPENCV_DNN_CUDA=OFF.")
endif()
endforeach()
unset(CC_LIST)
else()
set(sources_options ${sources_options} EXCLUDE_CUDA)
endif()
if(HAVE_TENGINE)
list(APPEND include_dirs ${TENGINE_INCLUDE_DIRS})
list(APPEND libs -Wl,--whole-archive ${TENGINE_LIBRARIES} -Wl,--no-whole-archive)
endif()
ocv_module_include_directories(${include_dirs})
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
ocv_append_source_files_cxx_compiler_options(fw_srcs "-Wno-suggest-override") # GCC
ocv_append_source_files_cxx_compiler_options(fw_srcs "-Wno-array-bounds") # GCC 9.3.0 (Ubuntu 20.04)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
ocv_append_source_files_cxx_compiler_options(fw_srcs "-Wno-inconsistent-missing-override") # Clang
endif()
set(dnn_runtime_libs "")
if(INF_ENGINE_TARGET)
set(use_nn_builder OFF)
if(TARGET inference_engine_nn_builder OR # custom imported target
TARGET IE::inference_engine_nn_builder OR # default imported target via InferenceEngineConfig.cmake
INF_ENGINE_RELEASE VERSION_LESS "2020000000") # compatibility with older versions on IE
set(use_nn_builder ON)
endif()
ocv_option(OPENCV_DNN_IE_NN_BUILDER_2019 "Build with Inference Engine NN Builder API support" ${use_nn_builder}) # future: NOT HAVE_NGRAPH
if(OPENCV_DNN_IE_NN_BUILDER_2019)
message(STATUS "DNN: Enabling Inference Engine NN Builder API support")
add_definitions(-DHAVE_DNN_IE_NN_BUILDER_2019=1)
endif()
list(APPEND dnn_runtime_libs ${INF_ENGINE_TARGET})
endif()
if(HAVE_NGRAPH)
message(STATUS "DNN: Enabling Inference Engine nGraph API support")
add_definitions(-DHAVE_DNN_NGRAPH)
list(APPEND dnn_runtime_libs ngraph::ngraph)
endif()
ocv_glob_module_sources(${sources_options} SOURCES ${fw_srcs})
ocv_create_module(${libs} ${dnn_runtime_libs})
ocv_add_samples()
ocv_add_accuracy_tests(${dnn_runtime_libs})
set(perf_path "${CMAKE_CURRENT_LIST_DIR}/perf")
file(GLOB_RECURSE perf_srcs "${perf_path}/*.cpp")
file(GLOB_RECURSE perf_hdrs "${perf_path}/*.hpp" "${perf_path}/*.h")
ocv_add_perf_tests(${INF_ENGINE_TARGET}
FILES test_common "${CMAKE_CURRENT_LIST_DIR}/test/test_common.hpp" "${CMAKE_CURRENT_LIST_DIR}/test/test_common.impl.hpp"
FILES Src ${perf_srcs}
FILES Include ${perf_hdrs}
)
ocv_option(OPENCV_DNN_PERF_CAFFE "Add performance tests of Caffe framework" OFF)
ocv_option(OPENCV_DNN_PERF_CLCAFFE "Add performance tests of clCaffe framework" OFF)
if(BUILD_PERF_TESTS)
if (OPENCV_DNN_PERF_CAFFE
OR ${the_module}_PERF_CAFFE # compatibility for deprecated option
)
find_package(Caffe QUIET)
if (Caffe_FOUND)
add_definitions(-DHAVE_CAFFE=1)
ocv_target_link_libraries(opencv_perf_dnn caffe)
endif()
elseif(OPENCV_DNN_PERF_CLCAFFE
OR ${the_module}_PERF_CAFFE # compatibility for deprecated option
)
find_package(Caffe QUIET)
if (Caffe_FOUND)
add_definitions(-DHAVE_CLCAFFE=1)
ocv_target_link_libraries(opencv_perf_dnn caffe)
endif()
endif()
endif()


@@ -0,0 +1,11 @@
if(NOT (OPENCV_DNN_OPENCL AND HAVE_OPENCL))
message(STATUS "opencv_dnn: filter out ocl4dnn source code")
ocv_list_filterout(OPENCV_MODULE_${the_module}_SOURCES "/ocl4dnn/")
ocv_list_filterout(OPENCV_MODULE_${the_module}_HEADERS "/ocl4dnn/")
endif()
if(NOT (OPENCV_DNN_CUDA AND HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN))
message(STATUS "opencv_dnn: filter out cuda4dnn source code")
ocv_list_filterout(OPENCV_MODULE_${the_module}_SOURCES "/cuda4dnn/")
ocv_list_filterout(OPENCV_MODULE_${the_module}_HEADERS "/cuda4dnn/")
endif()


@@ -0,0 +1,78 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_DNN_HPP
#define OPENCV_DNN_HPP
// This is an umbrella header to include into your project.
// We are free to change the header layout in the dnn subfolder, so please include
// this header for future compatibility
/** @defgroup dnn Deep Neural Network module
@{
This module contains:
- an API for creating new layers, the building bricks of neural networks;
- a set of the most useful built-in Layers;
- an API to construct and modify comprehensive neural networks from layers;
- functionality for loading serialized network models from different frameworks.
The functionality of this module is designed only for forward pass computations (i.e. network testing).
Network training is in principle not supported.
@}
*/
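/** A minimal usage sketch (added for illustration; the model and image file names are
 *  placeholder assumptions, not part of this header):
 *  @code
 *  #include <opencv2/dnn.hpp>
 *  #include <opencv2/imgcodecs.hpp>
 *
 *  cv::dnn::Net net = cv::dnn::readNet("model.onnx");            // load a serialized model
 *  cv::Mat blob = cv::dnn::blobFromImage(cv::imread("input.jpg"),
 *                                        1.0 / 255.0, cv::Size(224, 224));
 *  net.setInput(blob);
 *  cv::Mat prob = net.forward();                                  // forward pass only
 *  @endcode
 */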
/** @example samples/dnn/classification.cpp
Check @ref tutorial_dnn_googlenet "the corresponding tutorial" for more details
*/
/** @example samples/dnn/colorization.cpp
*/
/** @example samples/dnn/object_detection.cpp
Check @ref tutorial_dnn_yolo "the corresponding tutorial" for more details
*/
/** @example samples/dnn/openpose.cpp
*/
/** @example samples/dnn/segmentation.cpp
*/
/** @example samples/dnn/text_detection.cpp
*/
#include <opencv2/dnn/dnn.hpp>
#endif /* OPENCV_DNN_HPP */


@@ -0,0 +1,832 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_DNN_DNN_ALL_LAYERS_HPP
#define OPENCV_DNN_DNN_ALL_LAYERS_HPP
#include <opencv2/dnn.hpp>
namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN
//! @addtogroup dnn
//! @{
/** @defgroup dnnLayerList Partial List of Implemented Layers
@{
This subsection of the dnn module contains information about the built-in layers and their descriptions.
The classes listed here provide the C++ API for creating instances of built-in layers.
In addition to this way of instantiating layers, there is a more general factory API (see @ref dnnLayerFactory) that allows creating layers dynamically (by name) and registering new ones.
You can use both APIs, but the factory API is less convenient for native C++ programming and is mainly designed for use inside importers (see @ref readNetFromCaffe(), @ref readNetFromTorch(), @ref readNetFromTensorflow()).
Built-in layers partially reproduce the functionality of the corresponding Caffe and Torch7 layers.
In particular, the following layers and the Caffe importer were tested to reproduce <a href="http://caffe.berkeleyvision.org/tutorial/layers.html">Caffe</a> functionality:
- Convolution
- Deconvolution
- Pooling
- InnerProduct
- TanH, ReLU, Sigmoid, BNLL, Power, AbsVal
- Softmax
- Reshape, Flatten, Slice, Split
- LRN
- MVN
- Dropout (since it does nothing on the forward pass)
*/
class CV_EXPORTS BlankLayer : public Layer
{
public:
static Ptr<Layer> create(const LayerParams &params);
};
/**
* Constant layer produces the same data blob at every forward pass.
*/
class CV_EXPORTS ConstLayer : public Layer
{
public:
static Ptr<Layer> create(const LayerParams &params);
};
//! LSTM recurrent layer
class CV_EXPORTS LSTMLayer : public Layer
{
public:
/** Creates instance of LSTM layer */
static Ptr<LSTMLayer> create(const LayerParams& params);
/** @deprecated Use LayerParams::blobs instead.
@brief Set trained weights for LSTM layer.
LSTM behavior at each step is defined by the current input, previous output, previous cell state and learned weights.
Let @f$x_t@f$ be the current input, @f$h_t@f$ the current output, and @f$c_t@f$ the current state.
Then the current output and current cell state are computed as follows:
@f{eqnarray*}{
h_t &= o_t \odot tanh(c_t), \\
c_t &= f_t \odot c_{t-1} + i_t \odot g_t, \\
@f}
where @f$\odot@f$ is the per-element multiplication and @f$i_t, f_t, o_t, g_t@f$ are internal gates computed using learned weights.
Gates are computed as follows:
@f{eqnarray*}{
i_t &= sigmoid&(W_{xi} x_t + W_{hi} h_{t-1} + b_i), \\
f_t &= sigmoid&(W_{xf} x_t + W_{hf} h_{t-1} + b_f), \\
o_t &= sigmoid&(W_{xo} x_t + W_{ho} h_{t-1} + b_o), \\
g_t &= tanh &(W_{xg} x_t + W_{hg} h_{t-1} + b_g), \\
@f}
where @f$W_{x?}@f$, @f$W_{h?}@f$ and @f$b_{?}@f$ are learned weights represented as matrices:
@f$W_{x?} \in R^{N_h \times N_x}@f$, @f$W_{h?} \in R^{N_h \times N_h}@f$, @f$b_? \in R^{N_h}@f$.
For simplicity and performance purposes we use @f$ W_x = [W_{xi}; W_{xf}; W_{xo}, W_{xg}] @f$
(i.e. @f$W_x@f$ is vertical concatenation of @f$ W_{x?} @f$), @f$ W_x \in R^{4N_h \times N_x} @f$.
The same for @f$ W_h = [W_{hi}; W_{hf}; W_{ho}, W_{hg}], W_h \in R^{4N_h \times N_h} @f$
and for @f$ b = [b_i; b_f, b_o, b_g]@f$, @f$b \in R^{4N_h} @f$.
@param Wh is the matrix defining how the previous output is transformed to the internal gates (i.e. @f$ W_h @f$ in the notation above)
@param Wx is the matrix defining how the current input is transformed to the internal gates (i.e. @f$ W_x @f$ in the notation above)
@param b is the bias vector (i.e. @f$ b @f$ in the notation above)
*/
CV_DEPRECATED virtual void setWeights(const Mat &Wh, const Mat &Wx, const Mat &b) = 0;
/** @brief Specifies shape of output blob which will be [[`T`], `N`] + @p outTailShape.
* @details If this parameter is empty or unset then @p outTailShape = [`Wh`.size(0)] will be used,
* where `Wh` is parameter from setWeights().
*/
virtual void setOutShape(const MatShape &outTailShape = MatShape()) = 0;
/** @deprecated Use flag `use_timestamp_dim` in LayerParams.
* @brief Specifies whether to interpret the first dimension of the input blob as the timestamp dimension or as the sample dimension.
*
* If the flag is set to true, the shape of the input blob will be interpreted as [`T`, `N`, `[data dims]`], where `T` is the number of timestamps and `N` is the number of independent streams.
* In this case each forward() call will iterate through `T` timestamps and update the layer's state `T` times.
*
* If the flag is set to false, the shape of the input blob will be interpreted as [`N`, `[data dims]`].
* In this case each forward() call will make one iteration and produce one timestamp with shape [`N`, `[out dims]`].
*/
CV_DEPRECATED virtual void setUseTimstampsDim(bool use = true) = 0;
/** @deprecated Use flag `produce_cell_output` in LayerParams.
* @brief If this flag is set to true then the layer will produce @f$ c_t @f$ as a second output.
* @details Shape of the second output is the same as first output.
*/
CV_DEPRECATED virtual void setProduceCellOutput(bool produce = false) = 0;
/* In the common case it uses a single input with @f$x_t@f$ values to compute the output(s) @f$h_t@f$ (and @f$c_t@f$).
* @param input should contain packed values @f$x_t@f$
* @param output contains computed outputs: @f$h_t@f$ (and @f$c_t@f$ if setProduceCellOutput() flag was set to true).
*
* If setUseTimstampsDim() is set to true then @p input[0] should have at least two dimensions with the following shape: [`T`, `N`, `[data dims]`],
* where `T` specifies number of timestamps, `N` is number of independent streams (i.e. @f$ x_{t_0 + t}^{stream} @f$ is stored inside @p input[0][t, stream, ...]).
*
* If setUseTimstampsDim() is set to false then @p input[0] should contain a single timestamp, and its shape should have the form [`N`, `[data dims]`] with at least one dimension.
* (i.e. @f$ x_{t}^{stream} @f$ is stored inside @p input[0][stream, ...]).
*/
int inputNameToIndex(String inputName) CV_OVERRIDE;
int outputNameToIndex(const String& outputName) CV_OVERRIDE;
};
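/** Illustrative sketch of the weight layout described in setWeights() above (not part of
 *  the original header; N_x and N_h are assumed input and hidden sizes):
 *  @code
 *  const int N_x = 64, N_h = 128;
 *  cv::Mat Wx(4 * N_h, N_x, CV_32F);   // stacked W_{xi}; W_{xf}; W_{xo}; W_{xg}
 *  cv::Mat Wh(4 * N_h, N_h, CV_32F);   // stacked W_{hi}; W_{hf}; W_{ho}; W_{hg}
 *  cv::Mat b (4 * N_h, 1,   CV_32F);   // stacked b_i; b_f; b_o; b_g
 *  cv::Ptr<cv::dnn::LSTMLayer> lstm = cv::dnn::LSTMLayer::create(cv::dnn::LayerParams());
 *  lstm->setWeights(Wh, Wx, b);        // deprecated path; LayerParams::blobs is preferred
 *  @endcode
 */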
/** @brief One-layer GRU recurrent layer
*
* Accepts input sequence and computes the final hidden state for each element in the batch.
*
* - input[0] containing the features of the input sequence.
* input[0] should have shape [`T`, `N`, `data_dims`] where `T` is sequence length, `N` is batch size, `data_dims` is input size
* - output would have shape [`T`, `N`, `D` * `hidden_size`] where `D = 2` if layer is bidirectional otherwise `D = 1`
*
* Depends on the following attributes:
* - hidden_size - Number of neurons in the hidden layer
* - direction - RNN could be bidirectional or forward
*
* The final hidden state @f$ h_t @f$ is computed by the following formulas:
*
@f{eqnarray*}{
r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{(t-1)}+ b_{hn})) \\
h_t = (1 - z_t) \odot n_t + z_t \odot h_{(t-1)} \\
@f}
* Where @f$x_t@f$ is current input, @f$h_{(t-1)}@f$ is previous or initial hidden state.
*
* @f$W_{x?}@f$, @f$W_{h?}@f$ and @f$b_{?}@f$ are learned weights represented as matrices:
* @f$W_{x?} \in R^{N_h \times N_x}@f$, @f$W_{h?} \in R^{N_h \times N_h}@f$, @f$b_? \in R^{N_h}@f$.
*
* @f$\odot@f$ is per-element multiply operation.
*/
class CV_EXPORTS GRULayer : public Layer
{
public:
/** Creates instance of GRU layer */
static Ptr<GRULayer> create(const LayerParams& params);
};
/** @brief Classical recurrent layer
Accepts two inputs @f$x_t@f$ and @f$h_{t-1}@f$ and computes two outputs @f$o_t@f$ and @f$h_t@f$.
- input: should contain packed input @f$x_t@f$.
- output: should contain output @f$o_t@f$ (and @f$h_t@f$ if setProduceHiddenOutput() is set to true).
input[0] should have shape [`T`, `N`, `data_dims`] where `T` and `N` are the number of timestamps and the number of independent samples of @f$x_t@f$, respectively.
output[0] will have shape [`T`, `N`, @f$N_o@f$], where @f$N_o@f$ is the number of rows in the @f$ W_{xo} @f$ matrix.
If setProduceHiddenOutput() is set to true then @p output[1] will contain a Mat with shape [`T`, `N`, @f$N_h@f$], where @f$N_h@f$ is the number of rows in the @f$ W_{hh} @f$ matrix.
*/
class CV_EXPORTS RNNLayer : public Layer
{
public:
/** Creates instance of RNNLayer */
static Ptr<RNNLayer> create(const LayerParams& params);
/** Sets up the learned weights.
Recurrent-layer behavior at each step is defined by the current input @f$ x_t @f$, the previous state @f$ h_{t-1} @f$ and the learned weights as follows:
@f{eqnarray*}{
h_t &= tanh&(W_{hh} h_{t-1} + W_{xh} x_t + b_h), \\
o_t &= tanh&(W_{ho} h_t + b_o),
@f}
@param Wxh is @f$ W_{xh} @f$ matrix
@param bh is @f$ b_{h} @f$ vector
@param Whh is @f$ W_{hh} @f$ matrix
@param Who is @f$ W_{xo} @f$ matrix
@param bo is @f$ b_{o} @f$ vector
*/
virtual void setWeights(const Mat &Wxh, const Mat &bh, const Mat &Whh, const Mat &Who, const Mat &bo) = 0;
/** @brief If this flag is set to true then layer will produce @f$ h_t @f$ as second output.
* @details Shape of the second output is the same as first output.
*/
virtual void setProduceHiddenOutput(bool produce = false) = 0;
};
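/** Illustrative sketch of the shapes implied by the formulas above (not part of the
 *  original header; N_x, N_h and N_o are assumed input, hidden and output sizes):
 *  @code
 *  const int N_x = 64, N_h = 128, N_o = 32;
 *  cv::Mat Wxh(N_h, N_x, CV_32F), bh(N_h, 1, CV_32F);   // input  -> hidden
 *  cv::Mat Whh(N_h, N_h, CV_32F);                        // hidden -> hidden
 *  cv::Mat Who(N_o, N_h, CV_32F), bo(N_o, 1, CV_32F);    // hidden -> output
 *  cv::Ptr<cv::dnn::RNNLayer> rnn = cv::dnn::RNNLayer::create(cv::dnn::LayerParams());
 *  rnn->setWeights(Wxh, bh, Whh, Who, bo);
 *  @endcode
 */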
class CV_EXPORTS BaseConvolutionLayer : public Layer
{
public:
CV_DEPRECATED_EXTERNAL Size kernel, stride, pad, dilation, adjustPad;
std::vector<size_t> adjust_pads;
std::vector<size_t> kernel_size, strides, dilations;
std::vector<size_t> pads_begin, pads_end;
String padMode;
int numOutput;
};
class CV_EXPORTS ConvolutionLayer : public BaseConvolutionLayer
{
public:
static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
};
class CV_EXPORTS ConvolutionLayerInt8 : public BaseConvolutionLayer
{
public:
int input_zp, output_zp;
float output_sc;
static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
};
class CV_EXPORTS DeconvolutionLayer : public BaseConvolutionLayer
{
public:
static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
};
class CV_EXPORTS LRNLayer : public Layer
{
public:
int type;
int size;
float alpha, beta, bias;
bool normBySize;
static Ptr<LRNLayer> create(const LayerParams& params);
};
class CV_EXPORTS PoolingLayer : public Layer
{
public:
int type;
std::vector<size_t> kernel_size, strides;
std::vector<size_t> pads_begin, pads_end;
bool globalPooling; //!< Flag is true if at least one of the axes is global pooled.
std::vector<bool> isGlobalPooling;
bool computeMaxIdx;
String padMode;
bool ceilMode;
// If true, for average pooling with padding divide every output region
// by the whole kernel area. Otherwise exclude zero-padded values and divide
// by the number of real values.
bool avePoolPaddedArea;
// ROIPooling parameters.
Size pooledSize;
float spatialScale;
// PSROIPooling parameters.
int psRoiOutChannels;
static Ptr<PoolingLayer> create(const LayerParams& params);
};
class CV_EXPORTS PoolingLayerInt8 : public PoolingLayer
{
public:
int input_zp, output_zp;
static Ptr<PoolingLayerInt8> create(const LayerParams& params);
};
class CV_EXPORTS SoftmaxLayer : public Layer
{
public:
bool logSoftMax;
static Ptr<SoftmaxLayer> create(const LayerParams& params);
};
class CV_EXPORTS SoftmaxLayerInt8 : public SoftmaxLayer
{
public:
float output_sc;
int output_zp;
static Ptr<SoftmaxLayerInt8> create(const LayerParams& params);
};
class CV_EXPORTS InnerProductLayer : public Layer
{
public:
int axis;
static Ptr<InnerProductLayer> create(const LayerParams& params);
};
class CV_EXPORTS InnerProductLayerInt8 : public InnerProductLayer
{
public:
int output_zp;
static Ptr<InnerProductLayerInt8> create(const LayerParams& params);
};
class CV_EXPORTS MVNLayer : public Layer
{
public:
float eps;
bool normVariance, acrossChannels;
static Ptr<MVNLayer> create(const LayerParams& params);
};
/* Reshaping */
class CV_EXPORTS ReshapeLayer : public Layer
{
public:
MatShape newShapeDesc;
Range newShapeRange;
static Ptr<ReshapeLayer> create(const LayerParams& params);
};
class CV_EXPORTS FlattenLayer : public Layer
{
public:
static Ptr<FlattenLayer> create(const LayerParams &params);
};
class CV_EXPORTS QuantizeLayer : public Layer
{
public:
float scale;
int zeropoint;
static Ptr<QuantizeLayer> create(const LayerParams &params);
};
class CV_EXPORTS DequantizeLayer : public Layer
{
public:
float scale;
int zeropoint;
static Ptr<DequantizeLayer> create(const LayerParams &params);
};
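/** The scale / zeropoint members above follow the usual affine quantization convention;
 *  the worked example below is an assumption added for clarity, not taken from this header:
 *  @code
 *  float scale = 0.02f;                  // quantization step
 *  int   zeropoint = -5;                 // quantized value that represents 0.0f
 *  float x = 0.40f;
 *  int8_t q    = cv::saturate_cast<int8_t>(cvRound(x / scale) + zeropoint);  // q == 15
 *  float  xHat = scale * (q - zeropoint);                                    // xHat ~ 0.40
 *  @endcode
 */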
class CV_EXPORTS RequantizeLayer : public Layer
{
public:
float scale, shift;
static Ptr<RequantizeLayer> create(const LayerParams &params);
};
class CV_EXPORTS ConcatLayer : public Layer
{
public:
int axis;
/**
* @brief Add zero padding in case of concatenation of blobs with different
* spatial sizes.
*
* Details: https://github.com/torch/nn/blob/master/doc/containers.md#depthconcat
*/
bool padding;
int paddingValue;
static Ptr<ConcatLayer> create(const LayerParams &params);
};
class CV_EXPORTS SplitLayer : public Layer
{
public:
int outputsCount; //!< Number of copies that will be produced (is ignored when negative).
static Ptr<SplitLayer> create(const LayerParams &params);
};
/**
* Slice layer has several modes:
* 1. Caffe mode
* @param[in] axis Axis of split operation
* @param[in] slice_point Array of split points
*
* The number of output blobs equals the number of split points plus one. The
* first blob is a slice of the input from 0 to @p slice_point[0] - 1 along @p axis,
* the second output blob is a slice of the input from @p slice_point[0] to
* @p slice_point[1] - 1 along @p axis, and the last output blob is a slice of the
* input from @p slice_point[-1] up to the end of the @p axis size.
*
* 2. TensorFlow mode
* @param begin Vector of start indices
* @param size Vector of sizes
*
* A more convenient numpy-like slice. The one and only output blob
* is the slice `input[begin[0]:begin[0]+size[0], begin[1]:begin[1]+size[1], ...]`
*
* 3. Torch mode
* @param axis Axis of split operation
*
* Split the input blob into equal parts along @p axis.
*/
class CV_EXPORTS SliceLayer : public Layer
{
public:
/**
* @brief Vector of slice ranges.
*
* The first dimension equals the number of output blobs.
* Each inner vector holds slice ranges for the leading dimensions of the input.
*/
std::vector<std::vector<Range> > sliceRanges;
std::vector<std::vector<int> > sliceSteps;
int axis;
int num_split;
static Ptr<SliceLayer> create(const LayerParams &params);
};
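/** Illustrative sketch (an assumption, not from this header) of configuring the
 *  TensorFlow-style mode described above through the public sliceRanges member:
 *  @code
 *  cv::dnn::LayerParams lp;
 *  lp.type = "Slice";
 *  cv::Ptr<cv::dnn::SliceLayer> slice = cv::dnn::SliceLayer::create(lp);
 *  slice->sliceRanges.resize(1);                        // one output blob
 *  slice->sliceRanges[0].push_back(cv::Range(2, 6));    // dim 0: begin = 2, size = 4
 *  slice->sliceRanges[0].push_back(cv::Range(0, 3));    // dim 1: begin = 0, size = 3
 *  @endcode
 */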
class CV_EXPORTS PermuteLayer : public Layer
{
public:
static Ptr<PermuteLayer> create(const LayerParams& params);
};
/**
* Permute channels of 4-dimensional input blob.
* @param group Number of groups to split input channels and pick in turns
* into output blob.
*
* \f[ groupSize = \frac{number\ of\ channels}{group} \f]
* \f[ output(n, c, h, w) = input(n, groupSize \times (c \% group) + \lfloor \frac{c}{group} \rfloor, h, w) \f]
* Read more at https://arxiv.org/pdf/1707.01083.pdf
*/
class CV_EXPORTS ShuffleChannelLayer : public Layer
{
public:
static Ptr<Layer> create(const LayerParams& params);
int group;
};
/**
* @brief Adds extra values for specific axes.
* @param paddings Vector of paddings in format
* @code
* [ pad_before, pad_after, // [0]th dimension
* pad_before, pad_after, // [1]st dimension
* ...
* pad_before, pad_after ] // [n]th dimension
* @endcode
* that represents number of padded values at every dimension
* starting from the first one. The rest of dimensions won't
* be padded.
* @param value Value to be padded. Defaults to zero.
* @param type Padding type: 'constant', 'reflect'
* @param input_dims Torch's parameter. If @p input_dims is not equal to the
* actual input dimensionality then the `[0]th` dimension
* is considered a batch dimension and @p paddings are shifted
* by one dimension. Defaults to `-1`, which means padding
* corresponds directly to @p paddings.
*/
class CV_EXPORTS PaddingLayer : public Layer
{
public:
static Ptr<PaddingLayer> create(const LayerParams& params);
};
/* Activations */
class CV_EXPORTS ActivationLayer : public Layer
{
public:
virtual void forwardSlice(const float* src, float* dst, int len,
size_t outPlaneSize, int cn0, int cn1) const {};
virtual void forwardSlice(const int* src, const int* lut, int* dst, int len,
size_t outPlaneSize, int cn0, int cn1) const {};
virtual void forwardSlice(const int8_t* src, const int8_t* lut, int8_t* dst, int len,
size_t outPlaneSize, int cn0, int cn1) const {};
};
class CV_EXPORTS ReLULayer : public ActivationLayer
{
public:
float negativeSlope;
static Ptr<ReLULayer> create(const LayerParams &params);
};
class CV_EXPORTS ReLU6Layer : public ActivationLayer
{
public:
float minValue, maxValue;
static Ptr<ReLU6Layer> create(const LayerParams &params);
};
class CV_EXPORTS ChannelsPReLULayer : public ActivationLayer
{
public:
static Ptr<Layer> create(const LayerParams& params);
};
class CV_EXPORTS ELULayer : public ActivationLayer
{
public:
static Ptr<ELULayer> create(const LayerParams &params);
};
class CV_EXPORTS TanHLayer : public ActivationLayer
{
public:
static Ptr<TanHLayer> create(const LayerParams &params);
};
class CV_EXPORTS SwishLayer : public ActivationLayer
{
public:
static Ptr<SwishLayer> create(const LayerParams &params);
};
class CV_EXPORTS MishLayer : public ActivationLayer
{
public:
static Ptr<MishLayer> create(const LayerParams &params);
};
class CV_EXPORTS SigmoidLayer : public ActivationLayer
{
public:
static Ptr<SigmoidLayer> create(const LayerParams &params);
};
class CV_EXPORTS BNLLLayer : public ActivationLayer
{
public:
static Ptr<BNLLLayer> create(const LayerParams &params);
};
class CV_EXPORTS AbsLayer : public ActivationLayer
{
public:
static Ptr<AbsLayer> create(const LayerParams &params);
};
class CV_EXPORTS PowerLayer : public ActivationLayer
{
public:
float power, scale, shift;
static Ptr<PowerLayer> create(const LayerParams &params);
};
class CV_EXPORTS ExpLayer : public ActivationLayer
{
public:
float base, scale, shift;
static Ptr<ExpLayer> create(const LayerParams &params);
};
class CV_EXPORTS ActivationLayerInt8 : public ActivationLayer
{
public:
static Ptr<ActivationLayerInt8> create(const LayerParams &params);
};
/* Layers used in semantic segmentation */
class CV_EXPORTS CropLayer : public Layer
{
public:
static Ptr<Layer> create(const LayerParams &params);
};
/** @brief Element wise operation on inputs
Extra optional parameters:
- "operation" as string. Values are "sum" (default), "prod", "max", "div", "min"
- "coeff" as float array. Specify weights of inputs for SUM operation
- "output_channels_mode" as string. Values are "same" (default, all input must have the same layout), "input_0", "input_0_truncate", "max_input_channels"
*/
class CV_EXPORTS EltwiseLayer : public Layer
{
public:
static Ptr<EltwiseLayer> create(const LayerParams &params);
};
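/** Illustrative sketch (an assumption, not from this header) of setting the optional
 *  parameters listed above through LayerParams:
 *  @code
 *  cv::dnn::LayerParams lp;
 *  lp.type = "Eltwise";
 *  lp.set("operation", cv::String("sum"));
 *  double coeffs[] = {0.5, 0.5};                        // weights of the two inputs
 *  lp.set("coeff", cv::dnn::DictValue::arrayReal(coeffs, 2));
 *  cv::Ptr<cv::dnn::EltwiseLayer> elt = cv::dnn::EltwiseLayer::create(lp);
 *  @endcode
 */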
class CV_EXPORTS EltwiseLayerInt8 : public Layer
{
public:
static Ptr<EltwiseLayerInt8> create(const LayerParams &params);
};
class CV_EXPORTS BatchNormLayer : public ActivationLayer
{
public:
bool hasWeights, hasBias;
float epsilon;
static Ptr<BatchNormLayer> create(const LayerParams &params);
};
class CV_EXPORTS BatchNormLayerInt8 : public BatchNormLayer
{
public:
float input_sc, output_sc;
int input_zp, output_zp;
static Ptr<BatchNormLayerInt8> create(const LayerParams &params);
};
class CV_EXPORTS MaxUnpoolLayer : public Layer
{
public:
Size poolKernel;
Size poolPad;
Size poolStride;
static Ptr<MaxUnpoolLayer> create(const LayerParams &params);
};
class CV_EXPORTS ScaleLayer : public Layer
{
public:
bool hasBias;
int axis;
static Ptr<ScaleLayer> create(const LayerParams& params);
};
class CV_EXPORTS ScaleLayerInt8 : public ScaleLayer
{
public:
float output_sc;
int output_zp;
static Ptr<ScaleLayerInt8> create(const LayerParams &params);
};
class CV_EXPORTS ShiftLayer : public Layer
{
public:
static Ptr<Layer> create(const LayerParams& params);
};
class CV_EXPORTS ShiftLayerInt8 : public Layer
{
public:
static Ptr<Layer> create(const LayerParams& params);
};
class CV_EXPORTS DataAugmentationLayer : public Layer
{
public:
static Ptr<DataAugmentationLayer> create(const LayerParams& params);
};
class CV_EXPORTS CorrelationLayer : public Layer
{
public:
static Ptr<CorrelationLayer> create(const LayerParams& params);
};
class CV_EXPORTS AccumLayer : public Layer
{
public:
static Ptr<AccumLayer> create(const LayerParams& params);
};
class CV_EXPORTS FlowWarpLayer : public Layer
{
public:
static Ptr<FlowWarpLayer> create(const LayerParams& params);
};
class CV_EXPORTS PriorBoxLayer : public Layer
{
public:
static Ptr<PriorBoxLayer> create(const LayerParams& params);
};
class CV_EXPORTS ReorgLayer : public Layer
{
public:
static Ptr<ReorgLayer> create(const LayerParams& params);
};
class CV_EXPORTS RegionLayer : public Layer
{
public:
float nmsThreshold;
static Ptr<RegionLayer> create(const LayerParams& params);
};
/**
* @brief Detection output layer.
*
* The layer size is: @f$ (1 \times 1 \times N \times 7) @f$
* where N is the [keep_top_k] parameter multiplied by the batch size. Each row is:
* [image_id, label, confidence, xmin, ymin, xmax, ymax]
* where image_id is the index of the image input in the batch.
*/
class CV_EXPORTS DetectionOutputLayer : public Layer
{
public:
static Ptr<DetectionOutputLayer> create(const LayerParams& params);
};
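/** Illustrative sketch (an assumption, not from this header) of reading the
 *  @f$ 1 \times 1 \times N \times 7 @f$ output described above; `net` is assumed to be a
 *  cv::dnn::Net built from a detection model:
 *  @code
 *  cv::Mat out = net.forward();
 *  cv::Mat det(out.size[2], 7, CV_32F, out.ptr<float>());
 *  for (int i = 0; i < det.rows; i++)
 *  {
 *      int   label      = (int)det.at<float>(i, 1);
 *      float confidence = det.at<float>(i, 2);
 *      // det.at<float>(i, 3) .. det.at<float>(i, 6) hold xmin, ymin, xmax, ymax
 *  }
 *  @endcode
 */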
/**
* @brief \f$ L_p \f$ - normalization layer.
* @param p Normalization factor. The most common values are `p = 1` for \f$ L_1 \f$
* normalization and `p = 2` for \f$ L_2 \f$ normalization; a custom value is also possible.
* @param eps Parameter \f$ \epsilon \f$ to prevent a division by zero.
* @param across_spatial If true, normalize the input across all non-batch dimensions.
* Otherwise normalize every channel separately.
*
* Across spatial:
* @f[
* norm = \sqrt[p]{\epsilon + \sum_{x, y, c} |src(x, y, c)|^p } \\
* dst(x, y, c) = \frac{ src(x, y, c) }{norm}
* @f]
*
* Channel wise normalization:
* @f[
* norm(c) = \sqrt[p]{\epsilon + \sum_{x, y} |src(x, y, c)|^p } \\
* dst(x, y, c) = \frac{ src(x, y, c) }{norm(c)}
* @f]
*
* Where `x, y` - spatial coordinates, `c` - channel.
*
* Every sample in the batch is normalized separately. Optionally,
* the output is scaled by the trained parameters.
*/
class CV_EXPORTS NormalizeBBoxLayer : public Layer
{
public:
float pnorm, epsilon;
CV_DEPRECATED_EXTERNAL bool acrossSpatial;
static Ptr<NormalizeBBoxLayer> create(const LayerParams& params);
};
/**
* @brief Resize input 4-dimensional blob by nearest neighbor or bilinear strategy.
*
* Layer is used to support TensorFlow's resize_nearest_neighbor and resize_bilinear ops.
*/
class CV_EXPORTS ResizeLayer : public Layer
{
public:
static Ptr<ResizeLayer> create(const LayerParams& params);
};
/**
* @brief Bilinear resize layer from https://github.com/cdmh/deeplab-public-ver2
*
* It differs from @ref ResizeLayer in output shape and resize scales computations.
*/
class CV_EXPORTS InterpLayer : public Layer
{
public:
static Ptr<Layer> create(const LayerParams& params);
};
class CV_EXPORTS ProposalLayer : public Layer
{
public:
static Ptr<ProposalLayer> create(const LayerParams& params);
};
class CV_EXPORTS CropAndResizeLayer : public Layer
{
public:
static Ptr<Layer> create(const LayerParams& params);
};
class CV_EXPORTS CumSumLayer : public Layer
{
public:
int exclusive;
int reverse;
static Ptr<CumSumLayer> create(const LayerParams& params);
};
//! @}
//! @}
CV__DNN_INLINE_NS_END
}
}
#endif


@@ -0,0 +1,160 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include <opencv2/core.hpp>
#include <map>
#include <ostream>
#include <opencv2/dnn/dnn.hpp>
#ifndef OPENCV_DNN_DNN_DICT_HPP
#define OPENCV_DNN_DNN_DICT_HPP
namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN
//! @addtogroup dnn
//! @{
/** @brief This struct stores a scalar value (or array) of one of the following types: double, cv::String or int64.
* @todo Maybe int64 is useless because double type exactly stores at least 2^52 integers.
*/
struct CV_EXPORTS_W DictValue
{
DictValue(const DictValue &r);
DictValue(bool i) : type(Param::INT), pi(new AutoBuffer<int64,1>) { (*pi)[0] = i ? 1 : 0; } //!< Constructs integer scalar
DictValue(int64 i = 0) : type(Param::INT), pi(new AutoBuffer<int64,1>) { (*pi)[0] = i; } //!< Constructs integer scalar
CV_WRAP DictValue(int i) : type(Param::INT), pi(new AutoBuffer<int64,1>) { (*pi)[0] = i; } //!< Constructs integer scalar
DictValue(unsigned p) : type(Param::INT), pi(new AutoBuffer<int64,1>) { (*pi)[0] = p; } //!< Constructs integer scalar
CV_WRAP DictValue(double p) : type(Param::REAL), pd(new AutoBuffer<double,1>) { (*pd)[0] = p; } //!< Constructs floating point scalar
CV_WRAP DictValue(const String &s) : type(Param::STRING), ps(new AutoBuffer<String,1>) { (*ps)[0] = s; } //!< Constructs string scalar
DictValue(const char *s) : type(Param::STRING), ps(new AutoBuffer<String,1>) { (*ps)[0] = s; } //!< @overload
template<typename TypeIter>
static DictValue arrayInt(TypeIter begin, int size); //!< Constructs integer array
template<typename TypeIter>
static DictValue arrayReal(TypeIter begin, int size); //!< Constructs floating point array
template<typename TypeIter>
static DictValue arrayString(TypeIter begin, int size); //!< Constructs array of strings
template<typename T>
T get(int idx = -1) const; //!< Tries to convert the array element with the specified index to the requested type and returns it.
int size() const;
CV_WRAP bool isInt() const;
CV_WRAP bool isString() const;
CV_WRAP bool isReal() const;
CV_WRAP int getIntValue(int idx = -1) const;
CV_WRAP double getRealValue(int idx = -1) const;
CV_WRAP String getStringValue(int idx = -1) const;
DictValue &operator=(const DictValue &r);
friend std::ostream &operator<<(std::ostream &stream, const DictValue &dictv);
~DictValue();
private:
Param type;
union
{
AutoBuffer<int64, 1> *pi;
AutoBuffer<double, 1> *pd;
AutoBuffer<String, 1> *ps;
void *pv;
};
DictValue(Param _type, void *_p) : type(_type), pv(_p) {}
void release();
};
/** @brief This class implements a name-value dictionary whose values are instances of DictValue. */
class CV_EXPORTS Dict
{
typedef std::map<String, DictValue> _Dict;
_Dict dict;
public:
//! Checks for the presence of the @p key in the dictionary.
bool has(const String &key) const;
//! If the @p key is in the dictionary, returns a pointer to its value; otherwise returns NULL.
DictValue *ptr(const String &key);
/** @overload */
const DictValue *ptr(const String &key) const;
//! If the @p key is in the dictionary, returns its value; otherwise an error is generated.
const DictValue &get(const String &key) const;
/** @overload */
template <typename T>
T get(const String &key) const;
//! If the @p key is in the dictionary, returns its value; otherwise returns @p defaultValue.
template <typename T>
T get(const String &key, const T &defaultValue) const;
//! Sets a new @p value for the @p key, or adds a new key-value pair to the dictionary.
template<typename T>
const T &set(const String &key, const T &value);
//! Erase @p key from the dictionary.
void erase(const String &key);
friend std::ostream &operator<<(std::ostream &stream, const Dict &dict);
std::map<String, DictValue>::const_iterator begin() const;
std::map<String, DictValue>::const_iterator end() const;
};
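/** Minimal usage sketch (illustrative, not part of the original header):
 *  @code
 *  cv::dnn::Dict d;
 *  d.set("kernel_size", 3);                                  // scalar int
 *  d.set("eps", 1e-5);                                       // scalar double
 *  double coeffs[] = {1.0, 0.5};
 *  d.set("coeff", cv::dnn::DictValue::arrayReal(coeffs, 2)); // array value
 *  int    k   = d.get<int>("kernel_size");                   // 3
 *  double mom = d.get("momentum", 0.9);                      // key absent -> default value
 *  bool   has = d.has("coeff");                              // true
 *  @endcode
 */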
//! @}
CV__DNN_INLINE_NS_END
}
}
#endif

File diff suppressed because it is too large


@@ -0,0 +1,412 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_DNN_DNN_INL_HPP
#define OPENCV_DNN_DNN_INL_HPP
#include <opencv2/dnn.hpp>
namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN
template<typename TypeIter>
DictValue DictValue::arrayInt(TypeIter begin, int size)
{
DictValue res(Param::INT, new AutoBuffer<int64, 1>(size));
for (int j = 0; j < size; begin++, j++)
(*res.pi)[j] = *begin;
return res;
}
template<typename TypeIter>
DictValue DictValue::arrayReal(TypeIter begin, int size)
{
DictValue res(Param::REAL, new AutoBuffer<double, 1>(size));
for (int j = 0; j < size; begin++, j++)
(*res.pd)[j] = *begin;
return res;
}
template<typename TypeIter>
DictValue DictValue::arrayString(TypeIter begin, int size)
{
DictValue res(Param::STRING, new AutoBuffer<String, 1>(size));
for (int j = 0; j < size; begin++, j++)
(*res.ps)[j] = *begin;
return res;
}
template<>
inline DictValue DictValue::get<DictValue>(int idx) const
{
CV_Assert(idx == -1);
return *this;
}
template<>
inline int64 DictValue::get<int64>(int idx) const
{
CV_Assert((idx == -1 && size() == 1) || (idx >= 0 && idx < size()));
idx = (idx == -1) ? 0 : idx;
if (type == Param::INT)
{
return (*pi)[idx];
}
else if (type == Param::REAL)
{
double doubleValue = (*pd)[idx];
double fracpart, intpart;
fracpart = std::modf(doubleValue, &intpart);
CV_Assert(fracpart == 0.0);
return (int64)doubleValue;
}
else if (type == Param::STRING)
{
return std::atoi((*ps)[idx].c_str());
}
else
{
CV_Assert(isInt() || isReal() || isString());
return 0;
}
}
template<>
inline int DictValue::get<int>(int idx) const
{
return (int)get<int64>(idx);
}
inline int DictValue::getIntValue(int idx) const
{
return (int)get<int64>(idx);
}
template<>
inline unsigned DictValue::get<unsigned>(int idx) const
{
return (unsigned)get<int64>(idx);
}
template<>
inline bool DictValue::get<bool>(int idx) const
{
return (get<int64>(idx) != 0);
}
template<>
inline double DictValue::get<double>(int idx) const
{
CV_Assert((idx == -1 && size() == 1) || (idx >= 0 && idx < size()));
idx = (idx == -1) ? 0 : idx;
if (type == Param::REAL)
{
return (*pd)[idx];
}
else if (type == Param::INT)
{
return (double)(*pi)[idx];
}
else if (type == Param::STRING)
{
return std::atof((*ps)[idx].c_str());
}
else
{
CV_Assert(isReal() || isInt() || isString());
return 0;
}
}
inline double DictValue::getRealValue(int idx) const
{
return get<double>(idx);
}
template<>
inline float DictValue::get<float>(int idx) const
{
return (float)get<double>(idx);
}
template<>
inline String DictValue::get<String>(int idx) const
{
CV_Assert(isString());
CV_Assert((idx == -1 && ps->size() == 1) || (idx >= 0 && idx < (int)ps->size()));
return (*ps)[(idx == -1) ? 0 : idx];
}
inline String DictValue::getStringValue(int idx) const
{
return get<String>(idx);
}
inline void DictValue::release()
{
switch (type)
{
case Param::INT:
delete pi;
break;
case Param::STRING:
delete ps;
break;
case Param::REAL:
delete pd;
break;
case Param::BOOLEAN:
case Param::MAT:
case Param::MAT_VECTOR:
case Param::ALGORITHM:
case Param::FLOAT:
case Param::UNSIGNED_INT:
case Param::UINT64:
case Param::UCHAR:
case Param::SCALAR:
break; // unhandled
}
}
inline DictValue::~DictValue()
{
release();
}
inline DictValue & DictValue::operator=(const DictValue &r)
{
if (&r == this)
return *this;
if (r.type == Param::INT)
{
AutoBuffer<int64, 1> *tmp = new AutoBuffer<int64, 1>(*r.pi);
release();
pi = tmp;
}
else if (r.type == Param::STRING)
{
AutoBuffer<String, 1> *tmp = new AutoBuffer<String, 1>(*r.ps);
release();
ps = tmp;
}
else if (r.type == Param::REAL)
{
AutoBuffer<double, 1> *tmp = new AutoBuffer<double, 1>(*r.pd);
release();
pd = tmp;
}
type = r.type;
return *this;
}
inline DictValue::DictValue(const DictValue &r)
: pv(NULL)
{
type = r.type;
if (r.type == Param::INT)
pi = new AutoBuffer<int64, 1>(*r.pi);
else if (r.type == Param::STRING)
ps = new AutoBuffer<String, 1>(*r.ps);
else if (r.type == Param::REAL)
pd = new AutoBuffer<double, 1>(*r.pd);
}
inline bool DictValue::isString() const
{
return (type == Param::STRING);
}
inline bool DictValue::isInt() const
{
return (type == Param::INT);
}
inline bool DictValue::isReal() const
{
return (type == Param::REAL || type == Param::INT);
}
inline int DictValue::size() const
{
switch (type)
{
case Param::INT:
return (int)pi->size();
case Param::STRING:
return (int)ps->size();
case Param::REAL:
return (int)pd->size();
case Param::BOOLEAN:
case Param::MAT:
case Param::MAT_VECTOR:
case Param::ALGORITHM:
case Param::FLOAT:
case Param::UNSIGNED_INT:
case Param::UINT64:
case Param::UCHAR:
case Param::SCALAR:
break; // unhandled
}
CV_Error_(Error::StsInternal, ("Unhandled type (%d)", static_cast<int>(type)));
}
inline std::ostream &operator<<(std::ostream &stream, const DictValue &dictv)
{
int i;
if (dictv.isInt())
{
for (i = 0; i < dictv.size() - 1; i++)
stream << dictv.get<int64>(i) << ", ";
stream << dictv.get<int64>(i);
}
else if (dictv.isReal())
{
for (i = 0; i < dictv.size() - 1; i++)
stream << dictv.get<double>(i) << ", ";
stream << dictv.get<double>(i);
}
else if (dictv.isString())
{
for (i = 0; i < dictv.size() - 1; i++)
stream << "\"" << dictv.get<String>(i) << "\", ";
stream << dictv.get<String>(i);
}
return stream;
}
/////////////////////////////////////////////////////////////////
inline bool Dict::has(const String &key) const
{
return dict.count(key) != 0;
}
inline DictValue *Dict::ptr(const String &key)
{
_Dict::iterator i = dict.find(key);
return (i == dict.end()) ? NULL : &i->second;
}
inline const DictValue *Dict::ptr(const String &key) const
{
_Dict::const_iterator i = dict.find(key);
return (i == dict.end()) ? NULL : &i->second;
}
inline const DictValue &Dict::get(const String &key) const
{
_Dict::const_iterator i = dict.find(key);
if (i == dict.end())
CV_Error(Error::StsObjectNotFound, "Required argument \"" + key + "\" not found into dictionary");
return i->second;
}
template <typename T>
inline T Dict::get(const String &key) const
{
return this->get(key).get<T>();
}
template <typename T>
inline T Dict::get(const String &key, const T &defaultValue) const
{
_Dict::const_iterator i = dict.find(key);
if (i != dict.end())
return i->second.get<T>();
else
return defaultValue;
}
template<typename T>
inline const T &Dict::set(const String &key, const T &value)
{
_Dict::iterator i = dict.find(key);
if (i != dict.end())
i->second = DictValue(value);
else
dict.insert(std::make_pair(key, DictValue(value)));
return value;
}
inline void Dict::erase(const String &key)
{
dict.erase(key);
}
inline std::ostream &operator<<(std::ostream &stream, const Dict &dict)
{
Dict::_Dict::const_iterator it;
for (it = dict.dict.begin(); it != dict.dict.end(); it++)
stream << it->first << " : " << it->second << "\n";
return stream;
}
inline std::map<String, DictValue>::const_iterator Dict::begin() const
{
return dict.begin();
}
inline std::map<String, DictValue>::const_iterator Dict::end() const
{
return dict.end();
}
CV__DNN_INLINE_NS_END
}
}
#endif


@@ -0,0 +1,78 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
#ifndef OPENCV_DNN_LAYER_DETAILS_HPP
#define OPENCV_DNN_LAYER_DETAILS_HPP
#include <opencv2/dnn/layer.hpp>
namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN
/** @brief Registers a layer constructor at runtime.
* @param type string containing the type name of the layer.
* @param constructorFunc pointer to a function of type LayerRegister::Constructor, which creates the layer.
* @details This macro must be placed inside function code.
*/
#define CV_DNN_REGISTER_LAYER_FUNC(type, constructorFunc) \
cv::dnn::LayerFactory::registerLayer(#type, constructorFunc);
/** @brief Registers a layer class at runtime.
* @param type string containing the type name of the layer.
* @param class C++ class derived from Layer.
* @details This macro must be placed inside function code.
*/
#define CV_DNN_REGISTER_LAYER_CLASS(type, class) \
cv::dnn::LayerFactory::registerLayer(#type, cv::dnn::details::_layerDynamicRegisterer<class>);
/** @brief Registers a layer constructor at module load time.
* @param type string containing the type name of the layer.
* @param constructorFunc pointer to a function of type LayerRegister::Constructor, which creates the layer.
* @details This macro must be placed outside function code.
*/
#define CV_DNN_REGISTER_LAYER_FUNC_STATIC(type, constructorFunc) \
static cv::dnn::details::_LayerStaticRegisterer __LayerStaticRegisterer_##type(#type, constructorFunc);
/** @brief Registers a layer class at module load time.
* @param type string containing the type name of the layer.
* @param class C++ class derived from Layer.
* @details This macro must be placed outside function code.
*/
#define CV_DNN_REGISTER_LAYER_CLASS_STATIC(type, class) \
Ptr<Layer> __LayerStaticRegisterer_func_##type(LayerParams &params) \
{ return Ptr<Layer>(new class(params)); } \
static cv::dnn::details::_LayerStaticRegisterer __LayerStaticRegisterer_##type(#type, __LayerStaticRegisterer_func_##type);
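/** Illustrative sketch (not part of the original header). MyLayer and MyOtherLayer are
 *  hypothetical classes derived from cv::dnn::Layer; the runtime macro expects a static
 *  create(LayerParams&) method, while the *_STATIC variant constructs the class directly
 *  from LayerParams:
 *  @code
 *  void initCustomLayers()
 *  {
 *      CV_DNN_REGISTER_LAYER_CLASS(MyType, MyLayer)          // runtime registration
 *  }
 *  // at file scope, registered automatically at module load time:
 *  CV_DNN_REGISTER_LAYER_CLASS_STATIC(MyOtherType, MyOtherLayer)
 *  @endcode
 */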
namespace details {
template<typename LayerClass>
Ptr<Layer> _layerDynamicRegisterer(LayerParams &params)
{
return Ptr<Layer>(LayerClass::create(params));
}
// allows a created layer to be registered automatically at module load time
class _LayerStaticRegisterer
{
String type;
public:
_LayerStaticRegisterer(const String &layerType, LayerFactory::Constructor layerConstructor)
{
this->type = layerType;
LayerFactory::registerLayer(layerType, layerConstructor);
}
~_LayerStaticRegisterer()
{
LayerFactory::unregisterLayer(type);
}
};
} // namespace
CV__DNN_INLINE_NS_END
}} // namespace
#endif


@@ -0,0 +1,85 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_DNN_LAYER_HPP
#define OPENCV_DNN_LAYER_HPP
#include <opencv2/dnn.hpp>
namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN
//! @addtogroup dnn
//! @{
//!
//! @defgroup dnnLayerFactory Utilities for New Layers Registration
//! @{
/** @brief %Layer factory allows creating instances of registered layers. */
class CV_EXPORTS LayerFactory
{
public:
//! Each Layer class must provide this function to the factory
typedef Ptr<Layer>(*Constructor)(LayerParams &params);
//! Registers the layer class with typename @p type and specified @p constructor. Thread-safe.
static void registerLayer(const String &type, Constructor constructor);
//! Unregisters registered layer with specified type name. Thread-safe.
static void unregisterLayer(const String &type);
    /** @brief Creates an instance of a registered layer.
     *  @param type type name of the layer to create.
* @param params parameters which will be used for layer initialization.
* @note Thread-safe.
*/
static Ptr<Layer> createLayerInstance(const String &type, LayerParams& params);
private:
LayerFactory();
};
//! @}
//! @}
CV__DNN_INLINE_NS_END
}
}
#endif
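
An illustrative sketch (assumption, not part of this header) of creating a built-in layer through the factory; "ReLU" is one of the type names registered by the dnn module itself:

#include <opencv2/dnn.hpp>

using namespace cv;
using namespace cv::dnn;

int main()
{
    LayerParams lp;
    lp.type = "ReLU";      // registered type name
    lp.name = "my_relu";
    Ptr<Layer> layer = LayerFactory::createLayerInstance(lp.type, lp);
    CV_Assert(!layer.empty());
    return 0;
}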

View File

@@ -0,0 +1,29 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_LAYER_REG_HPP
#define OPENCV_DNN_LAYER_REG_HPP
#include <opencv2/dnn.hpp>
namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN
//! @addtogroup dnn
//! @{
typedef std::map<std::string, std::vector<LayerFactory::Constructor> > LayerFactory_Impl;
//! Returns the registry of layer types known to the DNN module.
//!
//! @note To access the factory in a thread-safe way, see the getLayerFactoryMutex() function.
LayerFactory_Impl& getLayerFactoryImpl();
//! Get the mutex guarding @ref LayerFactory_Impl, see getLayerFactoryImpl() function.
Mutex& getLayerFactoryMutex();
//! @}
CV__DNN_INLINE_NS_END
}
}
#endif
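
A short sketch of the intended locking pattern when walking the registry (assumption: the header above is available as <opencv2/dnn/layer_reg.private.hpp>):

#include <opencv2/dnn/layer_reg.private.hpp>
#include <opencv2/core/utility.hpp>   // cv::AutoLock
#include <cstdio>

void dumpRegisteredLayerTypes()
{
    cv::AutoLock lock(cv::dnn::getLayerFactoryMutex());   // guard the registry
    const cv::dnn::LayerFactory_Impl& factory = cv::dnn::getLayerFactoryImpl();
    for (const auto& entry : factory)
        std::printf("%s: %d constructor(s)\n", entry.first.c_str(), (int)entry.second.size());
}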

View File

@@ -0,0 +1,259 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_DNN_DNN_SHAPE_UTILS_HPP
#define OPENCV_DNN_DNN_SHAPE_UTILS_HPP
#include <opencv2/dnn/dnn.hpp>
#include <opencv2/core/types_c.h> // CV_MAX_DIM
#include <iostream>
#include <ostream>
#include <sstream>
namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN
//Slicing
struct _Range : public cv::Range
{
_Range(const Range &r) : cv::Range(r) {}
_Range(int start_, int size_ = 1) : cv::Range(start_, start_ + size_) {}
};
static inline Mat slice(const Mat &m, const _Range &r0)
{
Range ranges[CV_MAX_DIM];
for (int i = 1; i < m.dims; i++)
ranges[i] = Range::all();
ranges[0] = r0;
return m(&ranges[0]);
}
static inline Mat slice(const Mat &m, const _Range &r0, const _Range &r1)
{
CV_Assert(m.dims >= 2);
Range ranges[CV_MAX_DIM];
for (int i = 2; i < m.dims; i++)
ranges[i] = Range::all();
ranges[0] = r0;
ranges[1] = r1;
return m(&ranges[0]);
}
static inline Mat slice(const Mat &m, const _Range &r0, const _Range &r1, const _Range &r2)
{
CV_Assert(m.dims >= 3);
Range ranges[CV_MAX_DIM];
for (int i = 3; i < m.dims; i++)
ranges[i] = Range::all();
ranges[0] = r0;
ranges[1] = r1;
ranges[2] = r2;
return m(&ranges[0]);
}
static inline Mat slice(const Mat &m, const _Range &r0, const _Range &r1, const _Range &r2, const _Range &r3)
{
CV_Assert(m.dims >= 4);
Range ranges[CV_MAX_DIM];
for (int i = 4; i < m.dims; i++)
ranges[i] = Range::all();
ranges[0] = r0;
ranges[1] = r1;
ranges[2] = r2;
ranges[3] = r3;
return m(&ranges[0]);
}
static inline Mat getPlane(const Mat &m, int n, int cn)
{
CV_Assert(m.dims > 2);
int sz[CV_MAX_DIM];
for(int i = 2; i < m.dims; i++)
{
sz[i-2] = m.size.p[i];
}
return Mat(m.dims - 2, sz, m.type(), (void*)m.ptr<float>(n, cn));
}
static inline MatShape shape(const int* dims, const int n)
{
MatShape shape;
shape.assign(dims, dims + n);
return shape;
}
static inline MatShape shape(const Mat& mat)
{
return shape(mat.size.p, mat.dims);
}
static inline MatShape shape(const MatSize& sz)
{
return shape(sz.p, sz.dims());
}
static inline MatShape shape(const UMat& mat)
{
return shape(mat.size.p, mat.dims);
}
#if 0 // issues with MatExpr wrapped into InputArray
static inline
MatShape shape(InputArray input)
{
int sz[CV_MAX_DIM];
int ndims = input.sizend(sz);
return shape(sz, ndims);
}
#endif
namespace {inline bool is_neg(int i) { return i < 0; }}
static inline MatShape shape(int a0, int a1=-1, int a2=-1, int a3=-1)
{
int dims[] = {a0, a1, a2, a3};
MatShape s = shape(dims, 4);
s.erase(std::remove_if(s.begin(), s.end(), is_neg), s.end());
return s;
}
static inline int total(const MatShape& shape, int start = -1, int end = -1)
{
if (start == -1) start = 0;
if (end == -1) end = (int)shape.size();
if (shape.empty())
return 0;
int elems = 1;
CV_Assert(start <= (int)shape.size() && end <= (int)shape.size() &&
start <= end);
for(int i = start; i < end; i++)
{
elems *= shape[i];
}
return elems;
}
static inline MatShape concat(const MatShape& a, const MatShape& b)
{
MatShape c = a;
c.insert(c.end(), b.begin(), b.end());
return c;
}
static inline std::string toString(const MatShape& shape, const String& name = "")
{
std::ostringstream ss;
if (!name.empty())
ss << name << ' ';
ss << '[';
for(size_t i = 0, n = shape.size(); i < n; ++i)
ss << ' ' << shape[i];
ss << " ]";
return ss.str();
}
static inline void print(const MatShape& shape, const String& name = "")
{
std::cout << toString(shape, name) << std::endl;
}
static inline std::ostream& operator<<(std::ostream &out, const MatShape& shape)
{
out << toString(shape);
return out;
}
/// @brief Converts axis from `[-dims; dims)` (similar to Python's slice notation) to `[0; dims)` range.
static inline
int normalize_axis(int axis, int dims)
{
CV_Check(axis, axis >= -dims && axis < dims, "");
axis = (axis < 0) ? (dims + axis) : axis;
CV_DbgCheck(axis, axis >= 0 && axis < dims, "");
return axis;
}
static inline
int normalize_axis(int axis, const MatShape& shape)
{
return normalize_axis(axis, (int)shape.size());
}
static inline
Range normalize_axis_range(const Range& r, int axisSize)
{
if (r == Range::all())
return Range(0, axisSize);
CV_CheckGE(r.start, 0, "");
Range clamped(r.start,
r.end > 0 ? std::min(r.end, axisSize) : axisSize + r.end + 1);
CV_DbgCheckGE(clamped.start, 0, "");
CV_CheckLT(clamped.start, clamped.end, "");
CV_CheckLE(clamped.end, axisSize, "");
return clamped;
}
static inline
bool isAllOnes(const MatShape &inputShape, int startPos, int endPos)
{
CV_Assert(!inputShape.empty());
CV_CheckGE((int) inputShape.size(), startPos, "");
CV_CheckGE(startPos, 0, "");
CV_CheckLE(startPos, endPos, "");
CV_CheckLE((size_t)endPos, inputShape.size(), "");
for (size_t i = startPos; i < endPos; i++)
{
if (inputShape[i] != 1)
return false;
}
return true;
}
CV__DNN_INLINE_NS_END
}
}
#endif
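
A minimal sketch (not part of this commit) exercising the shape helpers above on a 4-D NCHW blob:

#include <opencv2/dnn/shape_utils.hpp>
#include <iostream>

using namespace cv;
using namespace cv::dnn;

int main()
{
    Mat blob(shape(1, 3, 4, 5), CV_32F, Scalar(0));     // zero-filled NCHW tensor
    MatShape s = shape(blob);                            // {1, 3, 4, 5}
    CV_Assert(total(s) == 60 && total(s, 2) == 20);      // all elements / H*W only
    Mat channel = slice(blob, _Range(0), _Range(1));     // batch 0, channel 1, full H and W
    CV_Assert(shape(channel) == shape(1, 1, 4, 5));
    CV_Assert(normalize_axis(-1, s) == 3);               // last axis mapped into [0; dims)
    std::cout << toString(s, "blob") << std::endl;
    return 0;
}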

View File

@@ -0,0 +1,24 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_UTILS_DEBUG_UTILS_HPP
#define OPENCV_DNN_UTILS_DEBUG_UTILS_HPP
#include "../dnn.hpp"
namespace cv { namespace dnn {
CV__DNN_INLINE_NS_BEGIN
/**
* @brief Skip model import after diagnostic run in readNet() functions.
* @param[in] skip Indicates whether to skip the import.
*
* This is an internal OpenCV function not intended for users.
*/
CV_EXPORTS void skipModelImport(bool skip);
CV__DNN_INLINE_NS_END
}} // namespace
#endif // OPENCV_DNN_UTILS_DEBUG_UTILS_HPP

View File

@@ -0,0 +1,76 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2018-2019, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#ifndef OPENCV_DNN_UTILS_INF_ENGINE_HPP
#define OPENCV_DNN_UTILS_INF_ENGINE_HPP
#include "../dnn.hpp"
namespace cv { namespace dnn {
CV__DNN_INLINE_NS_BEGIN
/* Values for 'OPENCV_DNN_BACKEND_INFERENCE_ENGINE_TYPE' parameter */
#define CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API "NN_BUILDER"
#define CV_DNN_BACKEND_INFERENCE_ENGINE_NGRAPH "NGRAPH"
/** @brief Returns Inference Engine internal backend API.
*
* See values of `CV_DNN_BACKEND_INFERENCE_ENGINE_*` macros.
*
* Default value is controlled through `OPENCV_DNN_BACKEND_INFERENCE_ENGINE_TYPE` runtime parameter (environment variable).
*/
CV_EXPORTS_W cv::String getInferenceEngineBackendType();
/** @brief Specify Inference Engine internal backend API.
*
* See values of `CV_DNN_BACKEND_INFERENCE_ENGINE_*` macros.
*
* @returns previous value of internal backend API
*/
CV_EXPORTS_W cv::String setInferenceEngineBackendType(const cv::String& newBackendType);
/** @brief Release a Myriad device (bound by OpenCV).
 *
 * A single Myriad device cannot be shared across multiple processes that use
 * Inference Engine's Myriad plugin.
*/
CV_EXPORTS_W void resetMyriadDevice();
/* Values for 'OPENCV_DNN_IE_VPU_TYPE' parameter */
#define CV_DNN_INFERENCE_ENGINE_VPU_TYPE_UNSPECIFIED ""
/// Intel(R) Movidius(TM) Neural Compute Stick, NCS (USB 03e7:2150), Myriad2 (https://software.intel.com/en-us/movidius-ncs)
#define CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_2 "Myriad2"
/// Intel(R) Neural Compute Stick 2, NCS2 (USB 03e7:2485), MyriadX (https://software.intel.com/ru-ru/neural-compute-stick)
#define CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X "MyriadX"
#define CV_DNN_INFERENCE_ENGINE_CPU_TYPE_ARM_COMPUTE "ARM_COMPUTE"
#define CV_DNN_INFERENCE_ENGINE_CPU_TYPE_X86 "X86"
/** @brief Returns Inference Engine VPU type.
*
* See values of `CV_DNN_INFERENCE_ENGINE_VPU_TYPE_*` macros.
*/
CV_EXPORTS_W cv::String getInferenceEngineVPUType();
/** @brief Returns Inference Engine CPU type.
*
 * Specifies which OpenVINO CPU plugin is used: x86 or ARM.
*/
CV_EXPORTS_W cv::String getInferenceEngineCPUType();
/** @brief Release an HDDL plugin.
*/
CV_EXPORTS_W void releaseHDDLPlugin();
CV__DNN_INLINE_NS_END
}} // namespace
#endif // OPENCV_DNN_UTILS_INF_ENGINE_HPP
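
A hedged sketch of the runtime switches above; it assumes OpenCV was built with the Inference Engine (OpenVINO) backend, otherwise these calls raise an error:

#include <opencv2/dnn/utils/inference_engine.hpp>
#include <iostream>

int main()
{
    // Prefer the nGraph API and report the previously active one.
    cv::String previous = cv::dnn::setInferenceEngineBackendType(
        CV_DNN_BACKEND_INFERENCE_ENGINE_NGRAPH);
    std::cout << "previous backend: " << previous << std::endl;
    std::cout << "VPU type: '" << cv::dnn::getInferenceEngineVPUType() << "'" << std::endl;
    cv::dnn::resetMyriadDevice();   // free the Myriad stick for other processes
    return 0;
}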

View File

@@ -0,0 +1,21 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_VERSION_HPP
#define OPENCV_DNN_VERSION_HPP
/// Use with major OpenCV version only.
#define OPENCV_DNN_API_VERSION 20211004
#if !defined CV_DOXYGEN && !defined CV_STATIC_ANALYSIS && !defined CV_DNN_DONT_ADD_INLINE_NS
#define CV__DNN_INLINE_NS __CV_CAT(dnn4_v, OPENCV_DNN_API_VERSION)
#define CV__DNN_INLINE_NS_BEGIN namespace CV__DNN_INLINE_NS {
#define CV__DNN_INLINE_NS_END }
namespace cv { namespace dnn { namespace CV__DNN_INLINE_NS { } using namespace CV__DNN_INLINE_NS; }}
#else
#define CV__DNN_INLINE_NS_BEGIN
#define CV__DNN_INLINE_NS_END
#endif
#endif // OPENCV_DNN_VERSION_HPP
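
Illustrative only (assumes the default build, where CV_DNN_DONT_ADD_INLINE_NS is not defined): the macro above places every dnn symbol into a versioned inline namespace, so the two spellings below name the same type.

#include <opencv2/dnn.hpp>
#include <type_traits>

static_assert(std::is_same<cv::dnn::Net, cv::dnn::dnn4_v20211004::Net>::value,
              "dnn symbols live in the versioned inline namespace");

int main() { return 0; }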

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,196 @@
# This script is used to estimate the accuracy of different face detection models.
# The COCO evaluation tool is used to compute accuracy metrics (Average Precision).
# The script works with different face detection datasets.
import os
import json
from fnmatch import fnmatch
from math import pi
import cv2 as cv
import argparse
import sys
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
parser = argparse.ArgumentParser(
description='Evaluate OpenCV face detection algorithms '
'using COCO evaluation tool, http://cocodataset.org/#detections-eval')
parser.add_argument('--proto', help='Path to .prototxt of Caffe model or .pbtxt of TensorFlow graph')
parser.add_argument('--model', help='Path to .caffemodel trained in Caffe or .pb from TensorFlow')
parser.add_argument('--cascade', help='Optional path to trained Haar cascade as '
'an additional model for evaluation')
parser.add_argument('--ann', help='Path to text file with ground truth annotations')
parser.add_argument('--pics', help='Path to images root directory')
parser.add_argument('--fddb', help='Evaluate FDDB dataset, http://vis-www.cs.umass.edu/fddb/', action='store_true')
parser.add_argument('--wider', help='Evaluate WIDER FACE dataset, http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/', action='store_true')
args = parser.parse_args()
dataset = {}
dataset['images'] = []
dataset['categories'] = [{ 'id': 0, 'name': 'face' }]
dataset['annotations'] = []
def ellipse2Rect(params):
rad_x = params[0]
rad_y = params[1]
angle = params[2] * 180.0 / pi
center_x = params[3]
center_y = params[4]
pts = cv.ellipse2Poly((int(center_x), int(center_y)), (int(rad_x), int(rad_y)),
int(angle), 0, 360, 10)
rect = cv.boundingRect(pts)
left = rect[0]
top = rect[1]
right = rect[0] + rect[2]
bottom = rect[1] + rect[3]
return left, top, right, bottom
def addImage(imagePath):
assert('images' in dataset)
imageId = len(dataset['images'])
dataset['images'].append({
'id': int(imageId),
'file_name': imagePath
})
return imageId
def addBBox(imageId, left, top, width, height):
assert('annotations' in dataset)
dataset['annotations'].append({
'id': len(dataset['annotations']),
'image_id': int(imageId),
'category_id': 0, # Face
'bbox': [int(left), int(top), int(width), int(height)],
'iscrowd': 0,
'area': float(width * height)
})
def addDetection(detections, imageId, left, top, width, height, score):
detections.append({
'image_id': int(imageId),
'category_id': 0, # Face
'bbox': [int(left), int(top), int(width), int(height)],
'score': float(score)
})
def fddb_dataset(annotations, images):
for d in os.listdir(annotations):
if fnmatch(d, 'FDDB-fold-*-ellipseList.txt'):
with open(os.path.join(annotations, d), 'rt') as f:
lines = [line.rstrip('\n') for line in f]
lineId = 0
while lineId < len(lines):
# Image
imgPath = lines[lineId]
lineId += 1
imageId = addImage(os.path.join(images, imgPath) + '.jpg')
img = cv.imread(os.path.join(images, imgPath) + '.jpg')
# Faces
numFaces = int(lines[lineId])
lineId += 1
for i in range(numFaces):
params = [float(v) for v in lines[lineId].split()]
lineId += 1
left, top, right, bottom = ellipse2Rect(params)
addBBox(imageId, left, top, width=right - left + 1,
height=bottom - top + 1)
def wider_dataset(annotations, images):
with open(annotations, 'rt') as f:
lines = [line.rstrip('\n') for line in f]
lineId = 0
while lineId < len(lines):
# Image
imgPath = lines[lineId]
lineId += 1
imageId = addImage(os.path.join(images, imgPath))
# Faces
numFaces = int(lines[lineId])
lineId += 1
for i in range(numFaces):
params = [int(v) for v in lines[lineId].split()]
lineId += 1
left, top, width, height = params[0], params[1], params[2], params[3]
addBBox(imageId, left, top, width, height)
def evaluate():
cocoGt = COCO('annotations.json')
cocoDt = cocoGt.loadRes('detections.json')
cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()
### Convert to COCO annotations format #########################################
assert(args.fddb or args.wider)
if args.fddb:
fddb_dataset(args.ann, args.pics)
elif args.wider:
wider_dataset(args.ann, args.pics)
with open('annotations.json', 'wt') as f:
json.dump(dataset, f)
### Obtain detections ##########################################################
detections = []
if args.proto and args.model:
net = cv.dnn.readNet(args.proto, args.model)
def detect(img, imageId):
imgWidth = img.shape[1]
imgHeight = img.shape[0]
net.setInput(cv.dnn.blobFromImage(img, 1.0, (300, 300), (104., 177., 123.), False, False))
out = net.forward()
for i in range(out.shape[2]):
confidence = out[0, 0, i, 2]
left = int(out[0, 0, i, 3] * img.shape[1])
top = int(out[0, 0, i, 4] * img.shape[0])
right = int(out[0, 0, i, 5] * img.shape[1])
bottom = int(out[0, 0, i, 6] * img.shape[0])
x = max(0, min(left, img.shape[1] - 1))
y = max(0, min(top, img.shape[0] - 1))
w = max(0, min(right - x + 1, img.shape[1] - x))
h = max(0, min(bottom - y + 1, img.shape[0] - y))
addDetection(detections, imageId, x, y, w, h, score=confidence)
elif args.cascade:
cascade = cv.CascadeClassifier(args.cascade)
def detect(img, imageId):
srcImgGray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
faces = cascade.detectMultiScale(srcImgGray)
for rect in faces:
left, top, width, height = rect[0], rect[1], rect[2], rect[3]
addDetection(detections, imageId, left, top, width, height, score=1.0)
for i in range(len(dataset['images'])):
sys.stdout.write('\r%d / %d' % (i + 1, len(dataset['images'])))
sys.stdout.flush()
img = cv.imread(dataset['images'][i]['file_name'])
imageId = int(dataset['images'][i]['id'])
detect(img, imageId)
with open('detections.json', 'wt') as f:
json.dump(detections, f)
evaluate()
def rm(f):
if os.path.exists(f):
os.remove(f)
rm('annotations.json')
rm('detections.json')

View File

@@ -0,0 +1 @@
misc/java/src/cpp/dnn_converters.hpp

View File

@@ -0,0 +1,63 @@
{
"type_dict": {
"MatShape": {
"j_type": "MatOfInt",
"jn_type": "long",
"jni_type": "jlong",
"jni_var": "MatShape %(n)s",
"suffix": "J",
"v_type": "Mat",
"j_import": "org.opencv.core.MatOfInt"
},
"vector_MatShape": {
"j_type": "List<MatOfInt>",
"jn_type": "List<MatOfInt>",
"jni_type": "jobject",
"jni_var": "std::vector< MatShape > %(n)s",
"suffix": "Ljava_util_List",
"v_type": "vector_MatShape",
"j_import": "org.opencv.core.MatOfInt"
},
"vector_size_t": {
"j_type": "MatOfDouble",
"jn_type": "long",
"jni_type": "jlong",
"jni_var": "std::vector<size_t> %(n)s",
"suffix": "J",
"v_type": "Mat",
"j_import": "org.opencv.core.MatOfDouble"
},
"vector_Ptr_Layer": {
"j_type": "List<Layer>",
"jn_type": "List<Layer>",
"jni_type": "jobject",
"jni_var": "std::vector< Ptr<cv::dnn::Layer> > %(n)s",
"suffix": "Ljava_util_List",
"v_type": "vector_Layer",
"j_import": "org.opencv.dnn.Layer"
},
"vector_Target": {
"j_type": "List<Integer>",
"jn_type": "List<Integer>",
"jni_type": "jobject",
"jni_var": "std::vector< cv::dnn::Target > %(n)s",
"suffix": "Ljava_util_List",
"v_type": "vector_Target"
},
"LayerId": {
"j_type": "DictValue",
"jn_type": "long",
"jn_args": [
[
"__int64",
".getNativeObjAddr()"
]
],
"jni_name": "(*(*(Ptr<cv::dnn::DictValue>*)%(n)s_nativeObj))",
"jni_type": "jlong",
"suffix": "J",
"j_import": "org.opencv.dnn.DictValue"
}
}
}

View File

@@ -0,0 +1,102 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
// Author: abratchik
#include "dnn_converters.hpp"
#define LOG_TAG "org.opencv.dnn"
void Mat_to_MatShape(cv::Mat& mat, MatShape& matshape)
{
matshape.clear();
CHECK_MAT(mat.type()==CV_32SC1 && mat.cols==1);
matshape = (MatShape) mat;
}
void MatShape_to_Mat(MatShape& matshape, cv::Mat& mat)
{
mat = cv::Mat(matshape, true);
}
std::vector<MatShape> List_to_vector_MatShape(JNIEnv* env, jobject list)
{
static jclass juArrayList = ARRAYLIST(env);
jmethodID m_size = LIST_SIZE(env, juArrayList);
jmethodID m_get = LIST_GET(env, juArrayList);
static jclass jMatOfInt = MATOFINT(env);
jint len = env->CallIntMethod(list, m_size);
std::vector<MatShape> result;
result.reserve(len);
for (jint i=0; i<len; i++)
{
jobject element = static_cast<jobject>(env->CallObjectMethod(list, m_get, i));
cv::Mat& mat = *((cv::Mat*) GETNATIVEOBJ(env, jMatOfInt, element) );
MatShape matshape = (MatShape) mat;
result.push_back(matshape);
env->DeleteLocalRef(element);
}
return result;
}
jobject vector_Ptr_Layer_to_List(JNIEnv* env, std::vector<cv::Ptr<cv::dnn::Layer> >& vs)
{
static jclass juArrayList = ARRAYLIST(env);
static jmethodID m_create = CONSTRUCTOR(env, juArrayList);
jmethodID m_add = LIST_ADD(env, juArrayList);
static jclass jLayerClass = LAYER(env);
static jmethodID m_create_layer = LAYER_CONSTRUCTOR(env, jLayerClass);
jobject result = env->NewObject(juArrayList, m_create, vs.size());
for (std::vector< cv::Ptr<cv::dnn::Layer> >::iterator it = vs.begin(); it != vs.end(); ++it) {
jobject element = env->NewObject(jLayerClass, m_create_layer, (*it).get());
env->CallBooleanMethod(result, m_add, element);
env->DeleteLocalRef(element);
}
return result;
}
jobject vector_Target_to_List(JNIEnv* env, std::vector<cv::dnn::Target>& vs)
{
static jclass juArrayList = ARRAYLIST(env);
static jmethodID m_create = CONSTRUCTOR(env, juArrayList);
jmethodID m_add = LIST_ADD(env, juArrayList);
static jclass jInteger = env->FindClass("java/lang/Integer");
static jmethodID m_create_Integer = env->GetMethodID(jInteger, "<init>", "(I)V");
jobject result = env->NewObject(juArrayList, m_create, vs.size());
for (size_t i = 0; i < vs.size(); ++i)
{
jobject element = env->NewObject(jInteger, m_create_Integer, vs[i]);
env->CallBooleanMethod(result, m_add, element);
env->DeleteLocalRef(element);
}
return result;
}
std::vector<cv::Ptr<cv::dnn::Layer> > List_to_vector_Ptr_Layer(JNIEnv* env, jobject list)
{
static jclass juArrayList = ARRAYLIST(env);
jmethodID m_size = LIST_SIZE(env, juArrayList);
jmethodID m_get = LIST_GET(env, juArrayList);
static jclass jLayerClass = LAYER(env);
jint len = env->CallIntMethod(list, m_size);
std::vector< cv::Ptr<cv::dnn::Layer> > result;
result.reserve(len);
for (jint i=0; i<len; i++)
{
jobject element = static_cast<jobject>(env->CallObjectMethod(list, m_get, i));
cv::Ptr<cv::dnn::Layer>* layer_ptr = (cv::Ptr<cv::dnn::Layer>*) GETNATIVEOBJ(env, jLayerClass, element) ;
cv::Ptr<cv::dnn::Layer> layer = *(layer_ptr);
result.push_back(layer);
env->DeleteLocalRef(element);
}
return result;
}

View File

@@ -0,0 +1,33 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
// Author: abratchik
#ifndef DNN_CONVERTERS_HPP
#define DNN_CONVERTERS_HPP
#include <jni.h>
#include "opencv_java.hpp"
#include "opencv2/core.hpp"
#include "opencv2/dnn/dnn.hpp"
#define LAYER(ENV) static_cast<jclass>(ENV->NewGlobalRef(ENV->FindClass("org/opencv/dnn/Layer")))
#define LAYER_CONSTRUCTOR(ENV, CLS) ENV->GetMethodID(CLS, "<init>", "(J)V")
using namespace cv::dnn;
void Mat_to_MatShape(cv::Mat& mat, MatShape& matshape);
void MatShape_to_Mat(MatShape& matshape, cv::Mat& mat);
std::vector<MatShape> List_to_vector_MatShape(JNIEnv* env, jobject list);
jobject vector_Ptr_Layer_to_List(JNIEnv* env, std::vector<cv::Ptr<cv::dnn::Layer> >& vs);
std::vector<cv::Ptr<cv::dnn::Layer> > List_to_vector_Ptr_Layer(JNIEnv* env, jobject list);
jobject vector_Target_to_List(JNIEnv* env, std::vector<cv::dnn::Target>& vs);
#endif /* DNN_CONVERTERS_HPP */

View File

@@ -0,0 +1,119 @@
package org.opencv.test.dnn;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.opencv.core.Core;
import org.opencv.core.Mat;
import org.opencv.core.MatOfInt;
import org.opencv.core.MatOfFloat;
import org.opencv.core.MatOfByte;
import org.opencv.core.Scalar;
import org.opencv.core.Size;
import org.opencv.dnn.DictValue;
import org.opencv.dnn.Dnn;
import org.opencv.dnn.Layer;
import org.opencv.dnn.Net;
import org.opencv.imgcodecs.Imgcodecs;
import org.opencv.imgproc.Imgproc;
import org.opencv.test.OpenCVTestCase;
/*
* regression test for #12324,
* testing various java.util.List invocations,
* which use the LIST_GET macro
*/
public class DnnListRegressionTest extends OpenCVTestCase {
private final static String ENV_OPENCV_DNN_TEST_DATA_PATH = "OPENCV_DNN_TEST_DATA_PATH";
private final static String ENV_OPENCV_TEST_DATA_PATH = "OPENCV_TEST_DATA_PATH";
String modelFileName = "";
String sourceImageFile = "";
Net net;
@Override
protected void setUp() throws Exception {
super.setUp();
String envDnnTestDataPath = System.getenv(ENV_OPENCV_DNN_TEST_DATA_PATH);
if(envDnnTestDataPath == null){
isTestCaseEnabled = false;
return;
}
File dnnTestDataPath = new File(envDnnTestDataPath);
modelFileName = new File(dnnTestDataPath, "dnn/tensorflow_inception_graph.pb").toString();
String envTestDataPath = System.getenv(ENV_OPENCV_TEST_DATA_PATH);
if(envTestDataPath == null) throw new Exception(ENV_OPENCV_TEST_DATA_PATH + " has to be defined!");
File testDataPath = new File(envTestDataPath);
File f = new File(testDataPath, "dnn/grace_hopper_227.png");
sourceImageFile = f.toString();
if(!f.exists()) throw new Exception("Test image is missing: " + sourceImageFile);
net = Dnn.readNetFromTensorflow(modelFileName);
Mat image = Imgcodecs.imread(sourceImageFile);
assertNotNull("Loading image from file failed!", image);
Mat inputBlob = Dnn.blobFromImage(image, 1.0, new Size(224, 224), new Scalar(0), true, true);
assertNotNull("Converting image to blob failed!", inputBlob);
net.setInput(inputBlob, "input");
}
public void testSetInputsNames() {
List<String> inputs = new ArrayList();
inputs.add("input");
try {
net.setInputsNames(inputs);
} catch(Exception e) {
fail("Net setInputsNames failed: " + e.getMessage());
}
}
public void testForward() {
List<Mat> outs = new ArrayList();
List<String> outNames = new ArrayList();
outNames.add("softmax2");
try {
net.forward(outs,outNames);
} catch(Exception e) {
fail("Net forward failed: " + e.getMessage());
}
}
public void testGetMemoryConsumption() {
int layerId = 1;
List<MatOfInt> netInputShapes = new ArrayList();
netInputShapes.add(new MatOfInt(1, 3, 224, 224));
long[] weights=null;
long[] blobs=null;
try {
net.getMemoryConsumption(layerId, netInputShapes, weights, blobs);
} catch(Exception e) {
fail("Net getMemoryConsumption failed: " + e.getMessage());
}
}
public void testGetFLOPS() {
int layerId = 1;
List<MatOfInt> netInputShapes = new ArrayList();
netInputShapes.add(new MatOfInt(1, 3, 224, 224));
try {
net.getFLOPS(layerId, netInputShapes);
} catch(Exception e) {
fail("Net getFLOPS failed: " + e.getMessage());
}
}
}

View File

@@ -0,0 +1,149 @@
package org.opencv.test.dnn;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.opencv.core.Core;
import org.opencv.core.Mat;
import org.opencv.core.MatOfFloat;
import org.opencv.core.MatOfByte;
import org.opencv.core.Scalar;
import org.opencv.core.Size;
import org.opencv.dnn.DictValue;
import org.opencv.dnn.Dnn;
import org.opencv.dnn.Layer;
import org.opencv.dnn.Net;
import org.opencv.imgcodecs.Imgcodecs;
import org.opencv.imgproc.Imgproc;
import org.opencv.test.OpenCVTestCase;
public class DnnTensorFlowTest extends OpenCVTestCase {
private final static String ENV_OPENCV_DNN_TEST_DATA_PATH = "OPENCV_DNN_TEST_DATA_PATH";
private final static String ENV_OPENCV_TEST_DATA_PATH = "OPENCV_TEST_DATA_PATH";
String modelFileName = "";
String sourceImageFile = "";
Net net;
private static void normAssert(Mat ref, Mat test) {
final double l1 = 1e-5;
final double lInf = 1e-4;
double normL1 = Core.norm(ref, test, Core.NORM_L1) / ref.total();
double normLInf = Core.norm(ref, test, Core.NORM_INF) / ref.total();
assertTrue(normL1 < l1);
assertTrue(normLInf < lInf);
}
@Override
protected void setUp() throws Exception {
super.setUp();
String envDnnTestDataPath = System.getenv(ENV_OPENCV_DNN_TEST_DATA_PATH);
if(envDnnTestDataPath == null){
isTestCaseEnabled = false;
return;
}
File dnnTestDataPath = new File(envDnnTestDataPath);
modelFileName = new File(dnnTestDataPath, "dnn/tensorflow_inception_graph.pb").toString();
String envTestDataPath = System.getenv(ENV_OPENCV_TEST_DATA_PATH);
if(envTestDataPath == null) throw new Exception(ENV_OPENCV_TEST_DATA_PATH + " has to be defined!");
File testDataPath = new File(envTestDataPath);
File f = new File(testDataPath, "dnn/grace_hopper_227.png");
sourceImageFile = f.toString();
if(!f.exists()) throw new Exception("Test image is missing: " + sourceImageFile);
net = Dnn.readNetFromTensorflow(modelFileName);
}
public void testGetLayerTypes() {
List<String> layertypes = new ArrayList();
net.getLayerTypes(layertypes);
assertFalse("No layer types returned!", layertypes.isEmpty());
}
public void testGetLayer() {
List<String> layernames = net.getLayerNames();
assertFalse("Test net returned no layers!", layernames.isEmpty());
String testLayerName = layernames.get(0);
DictValue layerId = new DictValue(testLayerName);
assertEquals("DictValue did not return the string, which was used in constructor!", testLayerName, layerId.getStringValue());
Layer layer = net.getLayer(layerId);
assertEquals("Layer name does not match the expected value!", testLayerName, layer.get_name());
}
public void checkInceptionNet(Net net)
{
Mat image = Imgcodecs.imread(sourceImageFile);
assertNotNull("Loading image from file failed!", image);
Mat inputBlob = Dnn.blobFromImage(image, 1.0, new Size(224, 224), new Scalar(0), true, true);
assertNotNull("Converting image to blob failed!", inputBlob);
net.setInput(inputBlob, "input");
Mat result = new Mat();
try {
net.setPreferableBackend(Dnn.DNN_BACKEND_OPENCV);
result = net.forward("softmax2");
}
catch (Exception e) {
fail("DNN forward failed: " + e.getMessage());
}
assertNotNull("Net returned no result!", result);
result = result.reshape(1, 1);
Core.MinMaxLocResult minmax = Core.minMaxLoc(result);
assertEquals("Wrong prediction", (int)minmax.maxLoc.x, 866);
Mat top5RefScores = new MatOfFloat(new float[] {
0.63032645f, 0.2561979f, 0.032181446f, 0.015721032f, 0.014785315f
}).reshape(1, 1);
Core.sort(result, result, Core.SORT_DESCENDING);
normAssert(result.colRange(0, 5), top5RefScores);
}
public void testTestNetForward() {
checkInceptionNet(net);
}
public void testReadFromBuffer() {
File modelFile = new File(modelFileName);
byte[] modelBuffer = new byte[ (int)modelFile.length() ];
try {
FileInputStream fis = new FileInputStream(modelFile);
fis.read(modelBuffer);
fis.close();
} catch (IOException e) {
fail("Failed to read a model: " + e.getMessage());
}
net = Dnn.readNetFromTensorflow(new MatOfByte(modelBuffer));
checkInceptionNet(net);
}
public void testGetAvailableTargets() {
List<Integer> targets = Dnn.getAvailableTargets(Dnn.DNN_BACKEND_OPENCV);
assertTrue(targets.contains(Dnn.DNN_TARGET_CPU));
}
}

View File

@@ -0,0 +1,46 @@
{
"func_arg_fix" : {
"Dnn": {
"(Net*)readNetFromCaffe:(NSString*)prototxt caffeModel:(NSString*)caffeModel" : { "readNetFromCaffe" : {"name" : "readNetFromCaffeFile"} },
"(Net*)readNetFromCaffe:(ByteVector*)bufferProto bufferModel:(ByteVector*)bufferModel" : { "readNetFromCaffe" : {"name" : "readNetFromCaffeBuffer"} },
"(Net*)readNetFromDarknet:(NSString*)cfgFile darknetModel:(NSString*)darknetModel" : { "readNetFromDarknet" : {"name" : "readNetFromDarknetFile"} },
"(Net*)readNetFromDarknet:(ByteVector*)bufferCfg bufferModel:(ByteVector*)bufferModel" : { "readNetFromDarknet" : {"name" : "readNetFromDarknetBuffer"} },
"(Net*)readNetFromONNX:(NSString*)onnxFile" : { "readNetFromONNX" : {"name" : "readNetFromONNXFile"} },
"(Net*)readNetFromONNX:(ByteVector*)buffer" : { "readNetFromONNX" : {"name" : "readNetFromONNXBuffer"} },
"(Net*)readNetFromTensorflow:(NSString*)model config:(NSString*)config" : { "readNetFromTensorflow" : {"name" : "readNetFromTensorflowFile"} },
"(Net*)readNetFromTensorflow:(ByteVector*)bufferModel bufferConfig:(ByteVector*)bufferConfig" : { "readNetFromTensorflow" : {"name" : "readNetFromTensorflowBuffer"} }
},
"Net": {
"(void)forward:(NSMutableArray<Mat*>*)outputBlobs outputName:(NSString*)outputName" : { "forward" : {"name" : "forwardOutputBlobs"} },
"(void)forward:(NSMutableArray<Mat*>*)outputBlobs outBlobNames:(NSArray<NSString*>*)outBlobNames" : { "forward" : {"name" : "forwardOutputBlobs"} },
"(void)forwardAndRetrieve:(NSMutableArray<NSMutableArray<Mat*>*>*)outputBlobs outBlobNames:(NSArray<NSString*>*)outBlobNames" : { "forward" : {"swift_name" : "forwardAndRetrieve"} },
"(long)getFLOPS:(IntVector*)netInputShape" : { "getFLOPS" : {"name" : "getFLOPSWithNetInputShape"} },
"(long)getFLOPS:(NSArray<IntVector*>*)netInputShapes" : { "getFLOPS" : {"name" : "getFLOPSWithNetInputShapes"} },
"(long)getFLOPS:(int)layerId netInputShape:(IntVector*)netInputShape" : { "getFLOPS" : {"name" : "getFLOPSWithLayerId"} },
"(long)getFLOPS:(int)layerId netInputShapes:(NSArray<IntVector*>*)netInputShapes" : { "getFLOPS" : {"name" : "getFLOPSWithLayerId"} },
"(void)getLayersShapes:(IntVector*)netInputShape layersIds:(IntVector*)layersIds inLayersShapes:(NSMutableArray<NSMutableArray<IntVector*>*>*)inLayersShapes outLayersShapes:(NSMutableArray<NSMutableArray<IntVector*>*>*)outLayersShapes" : { "getLayersShapes" : {"name" : "getLayersShapesWithNetInputShape"} },
"(void)getLayersShapes:(NSArray<IntVector*>*)netInputShapes layersIds:(IntVector*)layersIds inLayersShapes:(NSMutableArray<NSMutableArray<IntVector*>*>*)inLayersShapes outLayersShapes:(NSMutableArray<NSMutableArray<IntVector*>*>*)outLayersShapes" : { "getLayersShapes" : {"name" : "getLayersShapesWithNetInputShapes"} }
}
},
"type_dict": {
"MatShape": {
"objc_type": "IntVector*",
"to_cpp": "%(n)s.nativeRef",
"from_cpp": "[IntVector fromNative:%(n)s]",
"cast_to": "std::vector<int>"
},
"vector_MatShape": {
"objc_type": "IntVector*",
"v_type": "IntVector"
},
"vector_vector_MatShape": {
"objc_type": "IntVector*",
"v_v_type": "IntVector"
},
"LayerId": {
"objc_type": "DictValue*",
"to_cpp": "*(cv::dnn::DictValue*)(%(n)s.nativePtr)",
"from_cpp": "[DictValue fromNative:%(n)s]"
}
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,219 @@
#ifdef HAVE_OPENCV_DNN
typedef dnn::DictValue LayerId;
typedef std::vector<dnn::MatShape> vector_MatShape;
typedef std::vector<std::vector<dnn::MatShape> > vector_vector_MatShape;
template<>
bool pyopencv_to(PyObject *o, dnn::DictValue &dv, const ArgInfo& info)
{
CV_UNUSED(info);
if (!o || o == Py_None)
return true; //Current state will be used
else if (PyLong_Check(o))
{
dv = dnn::DictValue((int64)PyLong_AsLongLong(o));
return true;
}
else if (PyInt_Check(o))
{
dv = dnn::DictValue((int64)PyInt_AS_LONG(o));
return true;
}
else if (PyFloat_Check(o))
{
dv = dnn::DictValue(PyFloat_AsDouble(o));
return true;
}
else
{
std::string str;
if (getUnicodeString(o, str))
{
dv = dnn::DictValue(str);
return true;
}
}
return false;
}
template<typename T>
PyObject* pyopencv_from(const dnn::DictValue &dv)
{
if (dv.size() > 1)
{
std::vector<T> vec(dv.size());
for (int i = 0; i < dv.size(); ++i)
vec[i] = dv.get<T>(i);
return pyopencv_from_generic_vec(vec);
}
else
return pyopencv_from(dv.get<T>());
}
template<>
PyObject* pyopencv_from(const dnn::DictValue &dv)
{
if (dv.isInt()) return pyopencv_from<int>(dv);
if (dv.isReal()) return pyopencv_from<float>(dv);
if (dv.isString()) return pyopencv_from<String>(dv);
CV_Error(Error::StsNotImplemented, "Unknown value type");
return NULL;
}
template<>
PyObject* pyopencv_from(const dnn::LayerParams& lp)
{
PyObject* dict = PyDict_New();
for (std::map<String, dnn::DictValue>::const_iterator it = lp.begin(); it != lp.end(); ++it)
{
CV_Assert(!PyDict_SetItemString(dict, it->first.c_str(), pyopencv_from(it->second)));
}
return dict;
}
template<>
PyObject* pyopencv_from(const std::vector<dnn::Target> &t)
{
return pyopencv_from(std::vector<int>(t.begin(), t.end()));
}
class pycvLayer CV_FINAL : public dnn::Layer
{
public:
pycvLayer(const dnn::LayerParams &params, PyObject* pyLayer) : Layer(params)
{
PyGILState_STATE gstate;
gstate = PyGILState_Ensure();
PyObject* args = PyTuple_New(2);
CV_Assert(!PyTuple_SetItem(args, 0, pyopencv_from(params)));
CV_Assert(!PyTuple_SetItem(args, 1, pyopencv_from(params.blobs)));
o = PyObject_CallObject(pyLayer, args);
Py_DECREF(args);
PyGILState_Release(gstate);
if (!o)
CV_Error(Error::StsError, "Failed to create an instance of custom layer");
}
static void registerLayer(const std::string& type, PyObject* o)
{
std::map<std::string, std::vector<PyObject*> >::iterator it = pyLayers.find(type);
if (it != pyLayers.end())
it->second.push_back(o);
else
pyLayers[type] = std::vector<PyObject*>(1, o);
}
static void unregisterLayer(const std::string& type)
{
std::map<std::string, std::vector<PyObject*> >::iterator it = pyLayers.find(type);
if (it != pyLayers.end())
{
if (it->second.size() > 1)
it->second.pop_back();
else
pyLayers.erase(it);
}
}
static Ptr<dnn::Layer> create(dnn::LayerParams &params)
{
std::map<std::string, std::vector<PyObject*> >::iterator it = pyLayers.find(params.type);
if (it == pyLayers.end())
CV_Error(Error::StsNotImplemented, "Layer with a type \"" + params.type +
"\" is not implemented");
CV_Assert(!it->second.empty());
return Ptr<dnn::Layer>(new pycvLayer(params, it->second.back()));
}
virtual bool getMemoryShapes(const std::vector<std::vector<int> > &inputs,
const int,
std::vector<std::vector<int> > &outputs,
std::vector<std::vector<int> > &) const CV_OVERRIDE
{
PyGILState_STATE gstate;
gstate = PyGILState_Ensure();
PyObject* args = PyList_New(inputs.size());
for(size_t i = 0; i < inputs.size(); ++i)
PyList_SetItem(args, i, pyopencv_from_generic_vec(inputs[i]));
PyObject* res = PyObject_CallMethodObjArgs(o, PyString_FromString("getMemoryShapes"), args, NULL);
Py_DECREF(args);
PyGILState_Release(gstate);
if (!res)
CV_Error(Error::StsNotImplemented, "Failed to call \"getMemoryShapes\" method");
CV_Assert(pyopencv_to_generic_vec(res, outputs, ArgInfo("", 0)));
return false;
}
virtual void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays) CV_OVERRIDE
{
PyGILState_STATE gstate;
gstate = PyGILState_Ensure();
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
PyObject* args = pyopencv_from(inputs);
PyObject* res = PyObject_CallMethodObjArgs(o, PyString_FromString("forward"), args, NULL);
Py_DECREF(args);
if (!res)
CV_Error(Error::StsNotImplemented, "Failed to call \"forward\" method");
std::vector<Mat> pyOutputs;
CV_Assert(pyopencv_to(res, pyOutputs, ArgInfo("", 0)));
Py_DECREF(res);
PyGILState_Release(gstate);
CV_Assert(pyOutputs.size() == outputs.size());
for (size_t i = 0; i < outputs.size(); ++i)
{
CV_Assert(pyOutputs[i].size == outputs[i].size);
CV_Assert(pyOutputs[i].type() == outputs[i].type());
pyOutputs[i].copyTo(outputs[i]);
}
}
private:
    // Maps layer types to Python classes.
static std::map<std::string, std::vector<PyObject*> > pyLayers;
PyObject* o; // Instance of implemented python layer.
};
std::map<std::string, std::vector<PyObject*> > pycvLayer::pyLayers;
static PyObject *pyopencv_cv_dnn_registerLayer(PyObject*, PyObject *args, PyObject *kw)
{
const char *keywords[] = { "type", "class", NULL };
char* layerType;
PyObject *classInstance;
if (!PyArg_ParseTupleAndKeywords(args, kw, "sO", (char**)keywords, &layerType, &classInstance))
return NULL;
if (!PyCallable_Check(classInstance)) {
PyErr_SetString(PyExc_TypeError, "class must be callable");
return NULL;
}
pycvLayer::registerLayer(layerType, classInstance);
dnn::LayerFactory::registerLayer(layerType, pycvLayer::create);
Py_RETURN_NONE;
}
static PyObject *pyopencv_cv_dnn_unregisterLayer(PyObject*, PyObject *args, PyObject *kw)
{
const char *keywords[] = { "type", NULL };
char* layerType;
if (!PyArg_ParseTupleAndKeywords(args, kw, "s", (char**)keywords, &layerType))
return NULL;
pycvLayer::unregisterLayer(layerType);
dnn::LayerFactory::unregisterLayer(layerType);
Py_RETURN_NONE;
}
#endif // HAVE_OPENCV_DNN

View File

@@ -0,0 +1,415 @@
#!/usr/bin/env python
import os
import cv2 as cv
import numpy as np
from tests_common import NewOpenCVTests, unittest
def normAssert(test, a, b, msg=None, lInf=1e-5):
test.assertLess(np.max(np.abs(a - b)), lInf, msg)
def inter_area(box1, box2):
x_min, x_max = max(box1[0], box2[0]), min(box1[2], box2[2])
y_min, y_max = max(box1[1], box2[1]), min(box1[3], box2[3])
return (x_max - x_min) * (y_max - y_min)
def area(box):
return (box[2] - box[0]) * (box[3] - box[1])
def box2str(box):
left, top = box[0], box[1]
width, height = box[2] - left, box[3] - top
return '[%f x %f from (%f, %f)]' % (width, height, left, top)
def normAssertDetections(test, refClassIds, refScores, refBoxes, testClassIds, testScores, testBoxes,
confThreshold=0.0, scores_diff=1e-5, boxes_iou_diff=1e-4):
matchedRefBoxes = [False] * len(refBoxes)
errMsg = ''
for i in range(len(testBoxes)):
testScore = testScores[i]
if testScore < confThreshold:
continue
testClassId, testBox = testClassIds[i], testBoxes[i]
matched = False
for j in range(len(refBoxes)):
if (not matchedRefBoxes[j]) and testClassId == refClassIds[j] and \
abs(testScore - refScores[j]) < scores_diff:
interArea = inter_area(testBox, refBoxes[j])
iou = interArea / (area(testBox) + area(refBoxes[j]) - interArea)
if abs(iou - 1.0) < boxes_iou_diff:
matched = True
matchedRefBoxes[j] = True
if not matched:
errMsg += '\nUnmatched prediction: class %d score %f box %s' % (testClassId, testScore, box2str(testBox))
for i in range(len(refBoxes)):
if (not matchedRefBoxes[i]) and refScores[i] > confThreshold:
errMsg += '\nUnmatched reference: class %d score %f box %s' % (refClassIds[i], refScores[i], box2str(refBoxes[i]))
if errMsg:
test.fail(errMsg)
def printParams(backend, target):
backendNames = {
cv.dnn.DNN_BACKEND_OPENCV: 'OCV',
cv.dnn.DNN_BACKEND_INFERENCE_ENGINE: 'DLIE'
}
targetNames = {
cv.dnn.DNN_TARGET_CPU: 'CPU',
cv.dnn.DNN_TARGET_OPENCL: 'OCL',
cv.dnn.DNN_TARGET_OPENCL_FP16: 'OCL_FP16',
cv.dnn.DNN_TARGET_MYRIAD: 'MYRIAD'
}
print('%s/%s' % (backendNames[backend], targetNames[target]))
def getDefaultThreshold(target):
if target == cv.dnn.DNN_TARGET_OPENCL_FP16 or target == cv.dnn.DNN_TARGET_MYRIAD:
return 4e-3
else:
return 1e-5
testdata_required = bool(os.environ.get('OPENCV_DNN_TEST_REQUIRE_TESTDATA', False))
g_dnnBackendsAndTargets = None
class dnn_test(NewOpenCVTests):
def setUp(self):
super(dnn_test, self).setUp()
global g_dnnBackendsAndTargets
if g_dnnBackendsAndTargets is None:
g_dnnBackendsAndTargets = self.initBackendsAndTargets()
self.dnnBackendsAndTargets = g_dnnBackendsAndTargets
def initBackendsAndTargets(self):
self.dnnBackendsAndTargets = [
[cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_CPU],
]
if self.checkIETarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_CPU):
self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_CPU])
if self.checkIETarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_MYRIAD):
self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_MYRIAD])
if cv.ocl.haveOpenCL() and cv.ocl.useOpenCL():
self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_OPENCL])
self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_OPENCL_FP16])
if cv.ocl_Device.getDefault().isIntel():
if self.checkIETarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL):
self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL])
if self.checkIETarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL_FP16):
self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL_FP16])
return self.dnnBackendsAndTargets
def find_dnn_file(self, filename, required=True):
if not required:
required = testdata_required
return self.find_file(filename, [os.environ.get('OPENCV_DNN_TEST_DATA_PATH', os.getcwd()),
os.environ['OPENCV_TEST_DATA_PATH']],
required=required)
def checkIETarget(self, backend, target):
proto = self.find_dnn_file('dnn/layers/layer_convolution.prototxt')
model = self.find_dnn_file('dnn/layers/layer_convolution.caffemodel')
net = cv.dnn.readNet(proto, model)
net.setPreferableBackend(backend)
net.setPreferableTarget(target)
inp = np.random.standard_normal([1, 2, 10, 11]).astype(np.float32)
try:
net.setInput(inp)
net.forward()
except BaseException as e:
return False
return True
def test_getAvailableTargets(self):
targets = cv.dnn.getAvailableTargets(cv.dnn.DNN_BACKEND_OPENCV)
self.assertTrue(cv.dnn.DNN_TARGET_CPU in targets)
def test_blobFromImage(self):
np.random.seed(324)
width = 6
height = 7
scale = 1.0/127.5
mean = (10, 20, 30)
# Test arguments names.
img = np.random.randint(0, 255, [4, 5, 3]).astype(np.uint8)
blob = cv.dnn.blobFromImage(img, scale, (width, height), mean, True, False)
blob_args = cv.dnn.blobFromImage(img, scalefactor=scale, size=(width, height),
mean=mean, swapRB=True, crop=False)
normAssert(self, blob, blob_args)
# Test values.
target = cv.resize(img, (width, height), interpolation=cv.INTER_LINEAR)
target = target.astype(np.float32)
target = target[:,:,[2, 1, 0]] # BGR2RGB
target[:,:,0] -= mean[0]
target[:,:,1] -= mean[1]
target[:,:,2] -= mean[2]
target *= scale
target = target.transpose(2, 0, 1).reshape(1, 3, height, width) # to NCHW
normAssert(self, blob, target)
def test_model(self):
img_path = self.find_dnn_file("dnn/street.png")
weights = self.find_dnn_file("dnn/MobileNetSSD_deploy.caffemodel", required=False)
config = self.find_dnn_file("dnn/MobileNetSSD_deploy.prototxt", required=False)
if weights is None or config is None:
raise unittest.SkipTest("Missing DNN test files (dnn/MobileNetSSD_deploy.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
frame = cv.imread(img_path)
model = cv.dnn_DetectionModel(weights, config)
model.setInputParams(size=(300, 300), mean=(127.5, 127.5, 127.5), scale=1.0/127.5)
iouDiff = 0.05
confThreshold = 0.0001
nmsThreshold = 0
scoreDiff = 1e-3
classIds, confidences, boxes = model.detect(frame, confThreshold, nmsThreshold)
refClassIds = (7, 15)
refConfidences = (0.9998, 0.8793)
refBoxes = ((328, 238, 85, 102), (101, 188, 34, 138))
normAssertDetections(self, refClassIds, refConfidences, refBoxes,
classIds, confidences, boxes,confThreshold, scoreDiff, iouDiff)
for box in boxes:
cv.rectangle(frame, box, (0, 255, 0))
cv.rectangle(frame, np.array(box), (0, 255, 0))
cv.rectangle(frame, tuple(box), (0, 255, 0))
cv.rectangle(frame, list(box), (0, 255, 0))
def test_classification_model(self):
img_path = self.find_dnn_file("dnn/googlenet_0.png")
weights = self.find_dnn_file("dnn/squeezenet_v1.1.caffemodel", required=False)
config = self.find_dnn_file("dnn/squeezenet_v1.1.prototxt")
ref = np.load(self.find_dnn_file("dnn/squeezenet_v1.1_prob.npy"))
if weights is None or config is None:
raise unittest.SkipTest("Missing DNN test files (dnn/squeezenet_v1.1.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
frame = cv.imread(img_path)
model = cv.dnn_ClassificationModel(config, weights)
model.setInputSize(227, 227)
model.setInputCrop(True)
out = model.predict(frame)
normAssert(self, out, ref)
def test_textdetection_model(self):
img_path = self.find_dnn_file("dnn/text_det_test1.png")
weights = self.find_dnn_file("dnn/onnx/models/DB_TD500_resnet50.onnx", required=False)
if weights is None:
raise unittest.SkipTest("Missing DNN test files (onnx/models/DB_TD500_resnet50.onnx). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
frame = cv.imread(img_path)
scale = 1.0 / 255.0
size = (736, 736)
mean = (122.67891434, 116.66876762, 104.00698793)
model = cv.dnn_TextDetectionModel_DB(weights)
model.setInputParams(scale, size, mean)
out, _ = model.detect(frame)
self.assertTrue(type(out) == tuple, msg='actual type {}'.format(str(type(out))))
self.assertTrue(np.array(out).shape == (2, 4, 2))
def test_face_detection(self):
proto = self.find_dnn_file('dnn/opencv_face_detector.prototxt')
model = self.find_dnn_file('dnn/opencv_face_detector.caffemodel', required=False)
if proto is None or model is None:
raise unittest.SkipTest("Missing DNN test files (dnn/opencv_face_detector.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
img = self.get_sample('gpu/lbpcascade/er.png')
blob = cv.dnn.blobFromImage(img, mean=(104, 177, 123), swapRB=False, crop=False)
ref = [[0, 1, 0.99520785, 0.80997437, 0.16379407, 0.87996572, 0.26685631],
[0, 1, 0.9934696, 0.2831718, 0.50738752, 0.345781, 0.5985168],
[0, 1, 0.99096733, 0.13629119, 0.24892329, 0.19756334, 0.3310290],
[0, 1, 0.98977017, 0.23901358, 0.09084064, 0.29902688, 0.1769477],
[0, 1, 0.97203469, 0.67965847, 0.06876482, 0.73999709, 0.1513494],
[0, 1, 0.95097077, 0.51901293, 0.45863652, 0.5777427, 0.5347801]]
print('\n')
for backend, target in self.dnnBackendsAndTargets:
printParams(backend, target)
net = cv.dnn.readNet(proto, model)
net.setPreferableBackend(backend)
net.setPreferableTarget(target)
net.setInput(blob)
out = net.forward().reshape(-1, 7)
scoresDiff = 4e-3 if target in [cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD] else 1e-5
iouDiff = 2e-2 if target in [cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD] else 1e-4
ref = np.array(ref, np.float32)
refClassIds, testClassIds = ref[:, 1], out[:, 1]
refScores, testScores = ref[:, 2], out[:, 2]
refBoxes, testBoxes = ref[:, 3:], out[:, 3:]
normAssertDetections(self, refClassIds, refScores, refBoxes, testClassIds,
testScores, testBoxes, 0.5, scoresDiff, iouDiff)
def test_async(self):
timeout = 10*1000*10**6 # in nanoseconds (10 sec)
proto = self.find_dnn_file('dnn/layers/layer_convolution.prototxt')
model = self.find_dnn_file('dnn/layers/layer_convolution.caffemodel')
if proto is None or model is None:
raise unittest.SkipTest("Missing DNN test files (dnn/layers/layer_convolution.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
print('\n')
for backend, target in self.dnnBackendsAndTargets:
if backend != cv.dnn.DNN_BACKEND_INFERENCE_ENGINE:
continue
printParams(backend, target)
netSync = cv.dnn.readNet(proto, model)
netSync.setPreferableBackend(backend)
netSync.setPreferableTarget(target)
netAsync = cv.dnn.readNet(proto, model)
netAsync.setPreferableBackend(backend)
netAsync.setPreferableTarget(target)
# Generate inputs
numInputs = 10
inputs = []
for _ in range(numInputs):
inputs.append(np.random.standard_normal([2, 6, 75, 113]).astype(np.float32))
# Run synchronously
refs = []
for i in range(numInputs):
netSync.setInput(inputs[i])
refs.append(netSync.forward())
# Run asynchronously. To make test more robust, process inputs in the reversed order.
outs = []
for i in reversed(range(numInputs)):
netAsync.setInput(inputs[i])
outs.insert(0, netAsync.forwardAsync())
for i in reversed(range(numInputs)):
ret, result = outs[i].get(timeoutNs=float(timeout))
self.assertTrue(ret)
normAssert(self, refs[i], result, 'Index: %d' % i, 1e-10)
def test_nms(self):
confs = (1, 1)
rects = ((0, 0, 0.4, 0.4), (0, 0, 0.2, 0.4)) # 0.5 overlap
self.assertTrue(all(cv.dnn.NMSBoxes(rects, confs, 0, 0.6).ravel() == (0, 1)))
def test_custom_layer(self):
class CropLayer(object):
def __init__(self, params, blobs):
self.xstart = 0
self.xend = 0
self.ystart = 0
self.yend = 0
            # Our layer receives two inputs. We need to crop the first input blob
            # to match the shape of the second one (keeping batch size and number of channels).
def getMemoryShapes(self, inputs):
inputShape, targetShape = inputs[0], inputs[1]
batchSize, numChannels = inputShape[0], inputShape[1]
height, width = targetShape[2], targetShape[3]
self.ystart = (inputShape[2] - targetShape[2]) // 2
self.xstart = (inputShape[3] - targetShape[3]) // 2
self.yend = self.ystart + height
self.xend = self.xstart + width
return [[batchSize, numChannels, height, width]]
def forward(self, inputs):
return [inputs[0][:,:,self.ystart:self.yend,self.xstart:self.xend]]
cv.dnn_registerLayer('CropCaffe', CropLayer)
proto = '''
name: "TestCrop"
input: "input"
input_shape
{
dim: 1
dim: 2
dim: 5
dim: 5
}
input: "roi"
input_shape
{
dim: 1
dim: 2
dim: 3
dim: 3
}
layer {
name: "Crop"
type: "CropCaffe"
bottom: "input"
bottom: "roi"
top: "Crop"
}'''
net = cv.dnn.readNetFromCaffe(bytearray(proto.encode()))
for backend, target in self.dnnBackendsAndTargets:
if backend != cv.dnn.DNN_BACKEND_OPENCV:
continue
printParams(backend, target)
net.setPreferableBackend(backend)
net.setPreferableTarget(target)
src_shape = [1, 2, 5, 5]
dst_shape = [1, 2, 3, 3]
inp = np.arange(0, np.prod(src_shape), dtype=np.float32).reshape(src_shape)
roi = np.empty(dst_shape, dtype=np.float32)
net.setInput(inp, "input")
net.setInput(roi, "roi")
out = net.forward()
ref = inp[:, :, 1:4, 1:4]
normAssert(self, out, ref)
cv.dnn_unregisterLayer('CropCaffe')
    # Check that the dnn module can work with a 3D tensor as network input.
def test_input_3d(self):
model = self.find_dnn_file('dnn/onnx/models/hidden_lstm.onnx')
input_file = self.find_dnn_file('dnn/onnx/data/input_hidden_lstm.npy')
output_file = self.find_dnn_file('dnn/onnx/data/output_hidden_lstm.npy')
if model is None:
raise unittest.SkipTest("Missing DNN test files (dnn/onnx/models/hidden_lstm.onnx). "
"Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
if input_file is None or output_file is None:
raise unittest.SkipTest("Missing DNN test files (dnn/onnx/data/{input/output}_hidden_lstm.npy). "
"Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
input = np.load(input_file)
        # We have to expand the shape of the input tensor because the Python bindings cut 3D tensors to 2D.
        # This should be fixed in the future, see: https://github.com/opencv/opencv/issues/19091
        # Please remove `expand_dims` once that is fixed.
input = np.expand_dims(input, axis=3)
gold_output = np.load(output_file)
for backend, target in self.dnnBackendsAndTargets:
printParams(backend, target)
net = cv.dnn.readNet(model)
net.setPreferableBackend(backend)
net.setPreferableTarget(target)
net.setInput(input)
real_output = net.forward()
normAssert(self, real_output, gold_output, "", getDefaultThreshold(target))
if __name__ == '__main__':
NewOpenCVTests.bootstrap()

View File

@@ -0,0 +1,365 @@
from __future__ import print_function
import sys
import argparse
import cv2 as cv
import tensorflow as tf
import numpy as np
import struct
if sys.version_info > (3,):
long = int
from tensorflow.python.tools import optimize_for_inference_lib
from tensorflow.tools.graph_transforms import TransformGraph
from tensorflow.core.framework.node_def_pb2 import NodeDef
from google.protobuf import text_format
parser = argparse.ArgumentParser(description="Use this script to create TensorFlow graph "
"with weights from OpenCV's face detection network. "
"Only backbone part of SSD model is converted this way. "
"Look for .pbtxt configuration file at "
"https://github.com/opencv/opencv_extra/tree/master/testdata/dnn/opencv_face_detector.pbtxt")
parser.add_argument('--model', help='Path to .caffemodel weights', required=True)
parser.add_argument('--proto', help='Path to .prototxt Caffe model definition', required=True)
parser.add_argument('--pb', help='Path to output .pb TensorFlow model', required=True)
parser.add_argument('--pbtxt', help='Path to output .pbxt TensorFlow graph', required=True)
parser.add_argument('--quantize', help='Quantize weights to uint8', action='store_true')
parser.add_argument('--fp16', help='Convert weights to half precision floats', action='store_true')
args = parser.parse_args()
assert(not args.quantize or not args.fp16)
dtype = tf.float16 if args.fp16 else tf.float32
################################################################################
cvNet = cv.dnn.readNetFromCaffe(args.proto, args.model)
def dnnLayer(name):
return cvNet.getLayer(long(cvNet.getLayerId(name)))
def scale(x, name):
with tf.variable_scope(name):
layer = dnnLayer(name)
w = tf.Variable(layer.blobs[0].flatten(), dtype=dtype, name='mul')
if len(layer.blobs) > 1:
b = tf.Variable(layer.blobs[1].flatten(), dtype=dtype, name='add')
return tf.nn.bias_add(tf.multiply(x, w), b)
else:
return tf.multiply(x, w, name)
def conv(x, name, stride=1, pad='SAME', dilation=1, activ=None):
with tf.variable_scope(name):
layer = dnnLayer(name)
w = tf.Variable(layer.blobs[0].transpose(2, 3, 1, 0), dtype=dtype, name='weights')
if dilation == 1:
conv = tf.nn.conv2d(x, filter=w, strides=(1, stride, stride, 1), padding=pad)
else:
assert(stride == 1)
conv = tf.nn.atrous_conv2d(x, w, rate=dilation, padding=pad)
if len(layer.blobs) > 1:
b = tf.Variable(layer.blobs[1].flatten(), dtype=dtype, name='bias')
conv = tf.nn.bias_add(conv, b)
return activ(conv) if activ else conv
def batch_norm(x, name):
with tf.variable_scope(name):
# Unfortunately, TensorFlow's batch normalization layer doesn't work with fp16 input.
# Here we do a cast to fp32 but remove it in the frozen graph.
if x.dtype != tf.float32:
x = tf.cast(x, tf.float32)
layer = dnnLayer(name)
assert(len(layer.blobs) >= 3)
mean = layer.blobs[0].flatten()
std = layer.blobs[1].flatten()
scale = layer.blobs[2].flatten()
eps = 1e-5
hasBias = len(layer.blobs) > 3
hasWeights = scale.shape != (1,)
if not hasWeights and not hasBias:
mean /= scale[0]
std /= scale[0]
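# Added note (hedged): in Caffe's BatchNorm layer the third blob usually holds a single
# moving-average scale factor, so the stored mean/std are divided by it above;
# when that blob has more than one element it is treated as the learned gamma instead (below).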
mean = tf.Variable(mean, dtype=tf.float32, name='mean')
std = tf.Variable(std, dtype=tf.float32, name='std')
gamma = tf.Variable(scale if hasWeights else np.ones(mean.shape), dtype=tf.float32, name='gamma')
beta = tf.Variable(layer.blobs[3].flatten() if hasBias else np.zeros(mean.shape), dtype=tf.float32, name='beta')
bn = tf.nn.fused_batch_norm(x, gamma, beta, mean, std, eps,
is_training=False)[0]
if bn.dtype != dtype:
bn = tf.cast(bn, dtype)
return bn
def l2norm(x, name):
with tf.variable_scope(name):
layer = dnnLayer(name)
w = tf.Variable(layer.blobs[0].flatten(), dtype=dtype, name='mul')
return tf.nn.l2_normalize(x, 3, epsilon=1e-10) * w
### Graph definition ###########################################################
inp = tf.placeholder(dtype, [1, 300, 300, 3], 'data')
data_bn = batch_norm(inp, 'data_bn')
data_scale = scale(data_bn, 'data_scale')
# Instead of tf.pad we use tf.space_to_batch_nd layers, which override the convolution's padding strategy with explicit numbers.
# data_scale = tf.pad(data_scale, [[0, 0], [3, 3], [3, 3], [0, 0]])
data_scale = tf.space_to_batch_nd(data_scale, [1, 1], [[3, 3], [3, 3]], name='Pad')
conv1_h = conv(data_scale, stride=2, pad='VALID', name='conv1_h')
conv1_bn_h = batch_norm(conv1_h, 'conv1_bn_h')
conv1_scale_h = scale(conv1_bn_h, 'conv1_scale_h')
conv1_relu = tf.nn.relu(conv1_scale_h)
conv1_pool = tf.layers.max_pooling2d(conv1_relu, pool_size=(3, 3), strides=(2, 2),
padding='SAME', name='conv1_pool')
layer_64_1_conv1_h = conv(conv1_pool, 'layer_64_1_conv1_h')
layer_64_1_bn2_h = batch_norm(layer_64_1_conv1_h, 'layer_64_1_bn2_h')
layer_64_1_scale2_h = scale(layer_64_1_bn2_h, 'layer_64_1_scale2_h')
layer_64_1_relu2 = tf.nn.relu(layer_64_1_scale2_h)
layer_64_1_conv2_h = conv(layer_64_1_relu2, 'layer_64_1_conv2_h')
layer_64_1_sum = layer_64_1_conv2_h + conv1_pool
layer_128_1_bn1_h = batch_norm(layer_64_1_sum, 'layer_128_1_bn1_h')
layer_128_1_scale1_h = scale(layer_128_1_bn1_h, 'layer_128_1_scale1_h')
layer_128_1_relu1 = tf.nn.relu(layer_128_1_scale1_h)
layer_128_1_conv1_h = conv(layer_128_1_relu1, stride=2, name='layer_128_1_conv1_h')
layer_128_1_bn2 = batch_norm(layer_128_1_conv1_h, 'layer_128_1_bn2')
layer_128_1_scale2 = scale(layer_128_1_bn2, 'layer_128_1_scale2')
layer_128_1_relu2 = tf.nn.relu(layer_128_1_scale2)
layer_128_1_conv2 = conv(layer_128_1_relu2, 'layer_128_1_conv2')
layer_128_1_conv_expand_h = conv(layer_128_1_relu1, stride=2, name='layer_128_1_conv_expand_h')
layer_128_1_sum = layer_128_1_conv2 + layer_128_1_conv_expand_h
layer_256_1_bn1 = batch_norm(layer_128_1_sum, 'layer_256_1_bn1')
layer_256_1_scale1 = scale(layer_256_1_bn1, 'layer_256_1_scale1')
layer_256_1_relu1 = tf.nn.relu(layer_256_1_scale1)
# layer_256_1_conv1 = tf.pad(layer_256_1_relu1, [[0, 0], [1, 1], [1, 1], [0, 0]])
layer_256_1_conv1 = tf.space_to_batch_nd(layer_256_1_relu1, [1, 1], [[1, 1], [1, 1]], name='Pad_1')
layer_256_1_conv1 = conv(layer_256_1_conv1, stride=2, pad='VALID', name='layer_256_1_conv1')
layer_256_1_bn2 = batch_norm(layer_256_1_conv1, 'layer_256_1_bn2')
layer_256_1_scale2 = scale(layer_256_1_bn2, 'layer_256_1_scale2')
layer_256_1_relu2 = tf.nn.relu(layer_256_1_scale2)
layer_256_1_conv2 = conv(layer_256_1_relu2, 'layer_256_1_conv2')
layer_256_1_conv_expand = conv(layer_256_1_relu1, stride=2, name='layer_256_1_conv_expand')
layer_256_1_sum = layer_256_1_conv2 + layer_256_1_conv_expand
layer_512_1_bn1 = batch_norm(layer_256_1_sum, 'layer_512_1_bn1')
layer_512_1_scale1 = scale(layer_512_1_bn1, 'layer_512_1_scale1')
layer_512_1_relu1 = tf.nn.relu(layer_512_1_scale1)
layer_512_1_conv1_h = conv(layer_512_1_relu1, 'layer_512_1_conv1_h')
layer_512_1_bn2_h = batch_norm(layer_512_1_conv1_h, 'layer_512_1_bn2_h')
layer_512_1_scale2_h = scale(layer_512_1_bn2_h, 'layer_512_1_scale2_h')
layer_512_1_relu2 = tf.nn.relu(layer_512_1_scale2_h)
layer_512_1_conv2_h = conv(layer_512_1_relu2, dilation=2, name='layer_512_1_conv2_h')
layer_512_1_conv_expand_h = conv(layer_512_1_relu1, 'layer_512_1_conv_expand_h')
layer_512_1_sum = layer_512_1_conv2_h + layer_512_1_conv_expand_h
last_bn_h = batch_norm(layer_512_1_sum, 'last_bn_h')
last_scale_h = scale(last_bn_h, 'last_scale_h')
fc7 = tf.nn.relu(last_scale_h, name='last_relu')
conv6_1_h = conv(fc7, 'conv6_1_h', activ=tf.nn.relu)
conv6_2_h = conv(conv6_1_h, stride=2, name='conv6_2_h', activ=tf.nn.relu)
conv7_1_h = conv(conv6_2_h, 'conv7_1_h', activ=tf.nn.relu)
# conv7_2_h = tf.pad(conv7_1_h, [[0, 0], [1, 1], [1, 1], [0, 0]])
conv7_2_h = tf.space_to_batch_nd(conv7_1_h, [1, 1], [[1, 1], [1, 1]], name='Pad_2')
conv7_2_h = conv(conv7_2_h, stride=2, pad='VALID', name='conv7_2_h', activ=tf.nn.relu)
conv8_1_h = conv(conv7_2_h, pad='SAME', name='conv8_1_h', activ=tf.nn.relu)
conv8_2_h = conv(conv8_1_h, pad='VALID', name='conv8_2_h', activ=tf.nn.relu)
conv9_1_h = conv(conv8_2_h, 'conv9_1_h', activ=tf.nn.relu)
conv9_2_h = conv(conv9_1_h, pad='VALID', name='conv9_2_h', activ=tf.nn.relu)
conv4_3_norm = l2norm(layer_256_1_relu1, 'conv4_3_norm')
### Locations and confidences ##################################################
locations = []
confidences = []
flattenLayersNames = [] # Collect the names of all reshape layers that should be replaced with flattens.
for top, suffix in zip([locations, confidences], ['_mbox_loc', '_mbox_conf']):
for bottom, name in zip([conv4_3_norm, fc7, conv6_2_h, conv7_2_h, conv8_2_h, conv9_2_h],
['conv4_3_norm', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2']):
name += suffix
flat = tf.layers.flatten(conv(bottom, name))
flattenLayersNames.append(flat.name[:flat.name.find(':')])
top.append(flat)
mbox_loc = tf.concat(locations, axis=-1, name='mbox_loc')
mbox_conf = tf.concat(confidences, axis=-1, name='mbox_conf')
total = int(np.prod(mbox_conf.shape[1:]))
mbox_conf_reshape = tf.reshape(mbox_conf, [-1, 2], name='mbox_conf_reshape')
mbox_conf_softmax = tf.nn.softmax(mbox_conf_reshape, name='mbox_conf_softmax')
mbox_conf_flatten = tf.reshape(mbox_conf_softmax, [-1, total], name='mbox_conf_flatten')
flattenLayersNames.append('mbox_conf_flatten')
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
### Check correctness ######################################################
out_nodes = ['mbox_loc', 'mbox_conf_flatten']
inp_nodes = [inp.name[:inp.name.find(':')]]
np.random.seed(2701)
inputData = np.random.standard_normal([1, 3, 300, 300]).astype(np.float32)
cvNet.setInput(inputData)
cvNet.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
outDNN = cvNet.forward(out_nodes)
outTF = sess.run([mbox_loc, mbox_conf_flatten], feed_dict={inp: inputData.transpose(0, 2, 3, 1)})
print('Max diff @ locations: %e' % np.max(np.abs(outDNN[0] - outTF[0])))
print('Max diff @ confidence: %e' % np.max(np.abs(outDNN[1] - outTF[1])))
# Save a graph
graph_def = sess.graph.as_graph_def()
# Freeze the graph: replace variables with constants.
graph_def = tf.graph_util.convert_variables_to_constants(sess, graph_def, out_nodes)
# Optimize graph. Removes training-only ops, unused nodes.
graph_def = optimize_for_inference_lib.optimize_for_inference(graph_def, inp_nodes, out_nodes, dtype.as_datatype_enum)
# Fuse constant operations.
transforms = ["fold_constants(ignore_errors=True)"]
if args.quantize:
transforms += ["quantize_weights(minimum_size=0)"]
transforms += ["sort_by_execution_order"]
graph_def = TransformGraph(graph_def, inp_nodes, out_nodes, transforms)
# By default, float16 weights are stored in the tensor's repeated field called
# `half_val`. It has type int32, with leading zeros for the unused bytes.
# That field is varint-encoded: each byte carries 7 bits of the value and the
# high bit marks whether another byte follows. As a result, a float16 value may
# occupy 1, 2 or 3 bytes depending on its value. To improve compression,
# we move all `half_val` values into `tensor_content`, using exactly 2 bytes per value.
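# For illustration (an added, assumed example): on a little-endian machine
# struct.pack('H' * 2, 0x3C00, 0x4000) yields b'\x00<\x00@', i.e. the float16
# values 1.0 and 2.0 stored in exactly 2 bytes each.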
for node in graph_def.node:
if 'value' in node.attr:
halfs = node.attr["value"].tensor.half_val
if not node.attr["value"].tensor.tensor_content and halfs:
node.attr["value"].tensor.tensor_content = struct.pack('H' * len(halfs), *halfs)
node.attr["value"].tensor.ClearField('half_val')
# Serialize
with tf.gfile.FastGFile(args.pb, 'wb') as f:
f.write(graph_def.SerializeToString())
################################################################################
# Write a text graph representation
################################################################################
def tensorMsg(values):
msg = 'tensor { dtype: DT_FLOAT tensor_shape { dim { size: %d } }' % len(values)
for value in values:
msg += 'float_val: %f ' % value
return msg + '}'
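# For example (added, illustrative): tensorMsg([2, 3]) returns
# 'tensor { dtype: DT_FLOAT tensor_shape { dim { size: 2 } }float_val: 2.000000 float_val: 3.000000 }'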
# Remove Const nodes and unused attributes.
for i in reversed(range(len(graph_def.node))):
if graph_def.node[i].op in ['Const', 'Dequantize']:
del graph_def.node[i]
for attr in ['T', 'data_format', 'Tshape', 'N', 'Tidx', 'Tdim',
'use_cudnn_on_gpu', 'Index', 'Tperm', 'is_training',
'Tpaddings', 'Tblock_shape', 'Tcrops']:
if attr in graph_def.node[i].attr:
del graph_def.node[i].attr[attr]
# Append prior box generators
min_sizes = [30, 60, 111, 162, 213, 264]
max_sizes = [60, 111, 162, 213, 264, 315]
steps = [8, 16, 32, 64, 100, 300]
aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
layers = [conv4_3_norm, fc7, conv6_2_h, conv7_2_h, conv8_2_h, conv9_2_h]
for i in range(6):
priorBox = NodeDef()
priorBox.name = 'PriorBox_%d' % i
priorBox.op = 'PriorBox'
priorBox.input.append(layers[i].name[:layers[i].name.find(':')])
priorBox.input.append(inp_nodes[0]) # data
text_format.Merge('i: %d' % min_sizes[i], priorBox.attr["min_size"])
text_format.Merge('i: %d' % max_sizes[i], priorBox.attr["max_size"])
text_format.Merge('b: true', priorBox.attr["flip"])
text_format.Merge('b: false', priorBox.attr["clip"])
text_format.Merge(tensorMsg(aspect_ratios[i]), priorBox.attr["aspect_ratio"])
text_format.Merge(tensorMsg([0.1, 0.1, 0.2, 0.2]), priorBox.attr["variance"])
text_format.Merge('f: %f' % steps[i], priorBox.attr["step"])
text_format.Merge('f: 0.5', priorBox.attr["offset"])
graph_def.node.extend([priorBox])
# Concatenate prior boxes
concat = NodeDef()
concat.name = 'mbox_priorbox'
concat.op = 'ConcatV2'
for i in range(6):
concat.input.append('PriorBox_%d' % i)
concat.input.append('mbox_loc/axis')
graph_def.node.extend([concat])
# DetectionOutput layer
detectionOut = NodeDef()
detectionOut.name = 'detection_out'
detectionOut.op = 'DetectionOutput'
detectionOut.input.append('mbox_loc')
detectionOut.input.append('mbox_conf_flatten')
detectionOut.input.append('mbox_priorbox')
text_format.Merge('i: 2', detectionOut.attr['num_classes'])
text_format.Merge('b: true', detectionOut.attr['share_location'])
text_format.Merge('i: 0', detectionOut.attr['background_label_id'])
text_format.Merge('f: 0.45', detectionOut.attr['nms_threshold'])
text_format.Merge('i: 400', detectionOut.attr['top_k'])
text_format.Merge('s: "CENTER_SIZE"', detectionOut.attr['code_type'])
text_format.Merge('i: 200', detectionOut.attr['keep_top_k'])
text_format.Merge('f: 0.01', detectionOut.attr['confidence_threshold'])
graph_def.node.extend([detectionOut])
# Replace the L2Normalization subgraph with a single node.
for i in reversed(range(len(graph_def.node))):
if graph_def.node[i].name in ['conv4_3_norm/l2_normalize/Square',
'conv4_3_norm/l2_normalize/Sum',
'conv4_3_norm/l2_normalize/Maximum',
'conv4_3_norm/l2_normalize/Rsqrt']:
del graph_def.node[i]
for node in graph_def.node:
if node.name == 'conv4_3_norm/l2_normalize':
node.op = 'L2Normalize'
node.input.pop()
node.input.pop()
node.input.append(layer_256_1_relu1.name)
node.input.append('conv4_3_norm/l2_normalize/Sum/reduction_indices')
break
softmaxShape = NodeDef()
softmaxShape.name = 'reshape_before_softmax'
softmaxShape.op = 'Const'
text_format.Merge(
'tensor {'
' dtype: DT_INT32'
' tensor_shape { dim { size: 3 } }'
' int_val: 0'
' int_val: -1'
' int_val: 2'
'}', softmaxShape.attr["value"])
graph_def.node.extend([softmaxShape])
for node in graph_def.node:
if node.name == 'mbox_conf_reshape':
node.input[1] = softmaxShape.name
elif node.name == 'mbox_conf_softmax':
text_format.Merge('i: 2', node.attr['axis'])
elif node.name in flattenLayersNames:
node.op = 'Flatten'
inpName = node.input[0]
node.input.pop()
node.input.pop()
node.input.append(inpName)
tf.train.write_graph(graph_def, "", args.pbtxt, as_text=True)
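As a quick sanity check of the exported files, a minimal loading sketch follows. It assumes OpenCV is built with the dnn module; the file names and preprocessing values are placeholders, not values taken from this change.
import cv2 as cv
import numpy as np

# Load the frozen graph together with the text graph written by the script above
# (placeholder names; use whatever --pb / --pbtxt were set to).
net = cv.dnn.readNetFromTensorflow('opencv_face_detector_uint8.pb', 'opencv_face_detector.pbtxt')
frame = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a real BGR image
blob = cv.dnn.blobFromImage(frame, 1.0, (300, 300), (104.0, 177.0, 123.0))
net.setInput(blob)
detections = net.forward()  # DetectionOutput produces a [1, 1, N, 7] array of detections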

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,968 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: graph.proto
#ifndef PROTOBUF_graph_2eproto__INCLUDED
#define PROTOBUF_graph_2eproto__INCLUDED
#include <string>
#include <google/protobuf/stubs/common.h>
#if GOOGLE_PROTOBUF_VERSION < 3005000
#error This file was generated by a newer version of protoc which is
#error incompatible with your Protocol Buffer headers. Please update
#error your headers.
#endif
#if 3005001 < GOOGLE_PROTOBUF_MIN_PROTOC_VERSION
#error This file was generated by an older version of protoc which is
#error incompatible with your Protocol Buffer headers. Please
#error regenerate this file with a newer version of protoc.
#endif
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/arena.h>
#include <google/protobuf/arenastring.h>
#include <google/protobuf/generated_message_table_driven.h>
#include <google/protobuf/generated_message_util.h>
#include <google/protobuf/metadata.h>
#include <google/protobuf/message.h>
#include <google/protobuf/repeated_field.h> // IWYU pragma: export
#include <google/protobuf/extension_set.h> // IWYU pragma: export
#include <google/protobuf/map.h> // IWYU pragma: export
#include <google/protobuf/map_entry.h>
#include <google/protobuf/map_field_inl.h>
#include <google/protobuf/unknown_field_set.h>
#include "attr_value.pb.h"
#include "function.pb.h"
#include "versions.pb.h"
// @@protoc_insertion_point(includes)
namespace protobuf_graph_2eproto {
// Internal implementation detail -- do not use these members.
struct TableStruct {
static const ::google::protobuf::internal::ParseTableField entries[];
static const ::google::protobuf::internal::AuxillaryParseTableField aux[];
static const ::google::protobuf::internal::ParseTable schema[3];
static const ::google::protobuf::internal::FieldMetadata field_metadata[];
static const ::google::protobuf::internal::SerializationTable serialization_table[];
static const ::google::protobuf::uint32 offsets[];
};
void AddDescriptors();
void InitDefaultsGraphDefImpl();
void InitDefaultsGraphDef();
void InitDefaultsNodeDef_AttrEntry_DoNotUseImpl();
void InitDefaultsNodeDef_AttrEntry_DoNotUse();
void InitDefaultsNodeDefImpl();
void InitDefaultsNodeDef();
inline void InitDefaults() {
InitDefaultsGraphDef();
InitDefaultsNodeDef_AttrEntry_DoNotUse();
InitDefaultsNodeDef();
}
} // namespace protobuf_graph_2eproto
namespace opencv_tensorflow {
class GraphDef;
class GraphDefDefaultTypeInternal;
extern GraphDefDefaultTypeInternal _GraphDef_default_instance_;
class NodeDef;
class NodeDefDefaultTypeInternal;
extern NodeDefDefaultTypeInternal _NodeDef_default_instance_;
class NodeDef_AttrEntry_DoNotUse;
class NodeDef_AttrEntry_DoNotUseDefaultTypeInternal;
extern NodeDef_AttrEntry_DoNotUseDefaultTypeInternal _NodeDef_AttrEntry_DoNotUse_default_instance_;
} // namespace opencv_tensorflow
namespace opencv_tensorflow {
// ===================================================================
class GraphDef : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:opencv_tensorflow.GraphDef) */ {
public:
GraphDef();
virtual ~GraphDef();
GraphDef(const GraphDef& from);
inline GraphDef& operator=(const GraphDef& from) {
CopyFrom(from);
return *this;
}
#if LANG_CXX11
GraphDef(GraphDef&& from) noexcept
: GraphDef() {
*this = ::std::move(from);
}
inline GraphDef& operator=(GraphDef&& from) noexcept {
if (GetArenaNoVirtual() == from.GetArenaNoVirtual()) {
if (this != &from) InternalSwap(&from);
} else {
CopyFrom(from);
}
return *this;
}
#endif
inline ::google::protobuf::Arena* GetArena() const PROTOBUF_FINAL {
return GetArenaNoVirtual();
}
inline void* GetMaybeArenaPointer() const PROTOBUF_FINAL {
return MaybeArenaPtr();
}
static const ::google::protobuf::Descriptor* descriptor();
static const GraphDef& default_instance();
static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY
static inline const GraphDef* internal_default_instance() {
return reinterpret_cast<const GraphDef*>(
&_GraphDef_default_instance_);
}
static PROTOBUF_CONSTEXPR int const kIndexInFileMessages =
0;
void UnsafeArenaSwap(GraphDef* other);
void Swap(GraphDef* other);
friend void swap(GraphDef& a, GraphDef& b) {
a.Swap(&b);
}
// implements Message ----------------------------------------------
inline GraphDef* New() const PROTOBUF_FINAL { return New(NULL); }
GraphDef* New(::google::protobuf::Arena* arena) const PROTOBUF_FINAL;
void CopyFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void MergeFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void CopyFrom(const GraphDef& from);
void MergeFrom(const GraphDef& from);
void Clear() PROTOBUF_FINAL;
bool IsInitialized() const PROTOBUF_FINAL;
size_t ByteSizeLong() const PROTOBUF_FINAL;
bool MergePartialFromCodedStream(
::google::protobuf::io::CodedInputStream* input) PROTOBUF_FINAL;
void SerializeWithCachedSizes(
::google::protobuf::io::CodedOutputStream* output) const PROTOBUF_FINAL;
::google::protobuf::uint8* InternalSerializeWithCachedSizesToArray(
bool deterministic, ::google::protobuf::uint8* target) const PROTOBUF_FINAL;
int GetCachedSize() const PROTOBUF_FINAL { return _cached_size_; }
private:
void SharedCtor();
void SharedDtor();
void SetCachedSize(int size) const PROTOBUF_FINAL;
void InternalSwap(GraphDef* other);
protected:
explicit GraphDef(::google::protobuf::Arena* arena);
private:
static void ArenaDtor(void* object);
inline void RegisterArenaDtor(::google::protobuf::Arena* arena);
private:
inline ::google::protobuf::Arena* GetArenaNoVirtual() const {
return _internal_metadata_.arena();
}
inline void* MaybeArenaPtr() const {
return _internal_metadata_.raw_arena_ptr();
}
public:
::google::protobuf::Metadata GetMetadata() const PROTOBUF_FINAL;
// nested types ----------------------------------------------------
// accessors -------------------------------------------------------
// repeated .opencv_tensorflow.NodeDef node = 1;
int node_size() const;
void clear_node();
static const int kNodeFieldNumber = 1;
const ::opencv_tensorflow::NodeDef& node(int index) const;
::opencv_tensorflow::NodeDef* mutable_node(int index);
::opencv_tensorflow::NodeDef* add_node();
::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::NodeDef >*
mutable_node();
const ::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::NodeDef >&
node() const;
// .opencv_tensorflow.FunctionDefLibrary library = 2;
bool has_library() const;
void clear_library();
static const int kLibraryFieldNumber = 2;
private:
void _slow_mutable_library();
public:
const ::opencv_tensorflow::FunctionDefLibrary& library() const;
::opencv_tensorflow::FunctionDefLibrary* release_library();
::opencv_tensorflow::FunctionDefLibrary* mutable_library();
void set_allocated_library(::opencv_tensorflow::FunctionDefLibrary* library);
void unsafe_arena_set_allocated_library(
::opencv_tensorflow::FunctionDefLibrary* library);
::opencv_tensorflow::FunctionDefLibrary* unsafe_arena_release_library();
// .opencv_tensorflow.VersionDef versions = 4;
bool has_versions() const;
void clear_versions();
static const int kVersionsFieldNumber = 4;
private:
void _slow_mutable_versions();
public:
const ::opencv_tensorflow::VersionDef& versions() const;
::opencv_tensorflow::VersionDef* release_versions();
::opencv_tensorflow::VersionDef* mutable_versions();
void set_allocated_versions(::opencv_tensorflow::VersionDef* versions);
void unsafe_arena_set_allocated_versions(
::opencv_tensorflow::VersionDef* versions);
::opencv_tensorflow::VersionDef* unsafe_arena_release_versions();
// int32 version = 3 [deprecated = true];
GOOGLE_PROTOBUF_DEPRECATED_ATTR void clear_version();
GOOGLE_PROTOBUF_DEPRECATED_ATTR static const int kVersionFieldNumber = 3;
GOOGLE_PROTOBUF_DEPRECATED_ATTR ::google::protobuf::int32 version() const;
GOOGLE_PROTOBUF_DEPRECATED_ATTR void set_version(::google::protobuf::int32 value);
// @@protoc_insertion_point(class_scope:opencv_tensorflow.GraphDef)
private:
::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_;
template <typename T> friend class ::google::protobuf::Arena::InternalHelper;
typedef void InternalArenaConstructable_;
typedef void DestructorSkippable_;
::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::NodeDef > node_;
::opencv_tensorflow::FunctionDefLibrary* library_;
::opencv_tensorflow::VersionDef* versions_;
::google::protobuf::int32 version_;
mutable int _cached_size_;
friend struct ::protobuf_graph_2eproto::TableStruct;
friend void ::protobuf_graph_2eproto::InitDefaultsGraphDefImpl();
};
// -------------------------------------------------------------------
class NodeDef_AttrEntry_DoNotUse : public ::google::protobuf::internal::MapEntry<NodeDef_AttrEntry_DoNotUse,
::std::string, ::opencv_tensorflow::AttrValue,
::google::protobuf::internal::WireFormatLite::TYPE_STRING,
::google::protobuf::internal::WireFormatLite::TYPE_MESSAGE,
0 > {
public:
typedef ::google::protobuf::internal::MapEntry<NodeDef_AttrEntry_DoNotUse,
::std::string, ::opencv_tensorflow::AttrValue,
::google::protobuf::internal::WireFormatLite::TYPE_STRING,
::google::protobuf::internal::WireFormatLite::TYPE_MESSAGE,
0 > SuperType;
NodeDef_AttrEntry_DoNotUse();
NodeDef_AttrEntry_DoNotUse(::google::protobuf::Arena* arena);
void MergeFrom(const NodeDef_AttrEntry_DoNotUse& other);
static const NodeDef_AttrEntry_DoNotUse* internal_default_instance() { return reinterpret_cast<const NodeDef_AttrEntry_DoNotUse*>(&_NodeDef_AttrEntry_DoNotUse_default_instance_); }
void MergeFrom(const ::google::protobuf::Message& other) PROTOBUF_FINAL;
::google::protobuf::Metadata GetMetadata() const;
};
// -------------------------------------------------------------------
class NodeDef : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:opencv_tensorflow.NodeDef) */ {
public:
NodeDef();
virtual ~NodeDef();
NodeDef(const NodeDef& from);
inline NodeDef& operator=(const NodeDef& from) {
CopyFrom(from);
return *this;
}
#if LANG_CXX11
NodeDef(NodeDef&& from) noexcept
: NodeDef() {
*this = ::std::move(from);
}
inline NodeDef& operator=(NodeDef&& from) noexcept {
if (GetArenaNoVirtual() == from.GetArenaNoVirtual()) {
if (this != &from) InternalSwap(&from);
} else {
CopyFrom(from);
}
return *this;
}
#endif
inline ::google::protobuf::Arena* GetArena() const PROTOBUF_FINAL {
return GetArenaNoVirtual();
}
inline void* GetMaybeArenaPointer() const PROTOBUF_FINAL {
return MaybeArenaPtr();
}
static const ::google::protobuf::Descriptor* descriptor();
static const NodeDef& default_instance();
static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY
static inline const NodeDef* internal_default_instance() {
return reinterpret_cast<const NodeDef*>(
&_NodeDef_default_instance_);
}
static PROTOBUF_CONSTEXPR int const kIndexInFileMessages =
2;
void UnsafeArenaSwap(NodeDef* other);
void Swap(NodeDef* other);
friend void swap(NodeDef& a, NodeDef& b) {
a.Swap(&b);
}
// implements Message ----------------------------------------------
inline NodeDef* New() const PROTOBUF_FINAL { return New(NULL); }
NodeDef* New(::google::protobuf::Arena* arena) const PROTOBUF_FINAL;
void CopyFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void MergeFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void CopyFrom(const NodeDef& from);
void MergeFrom(const NodeDef& from);
void Clear() PROTOBUF_FINAL;
bool IsInitialized() const PROTOBUF_FINAL;
size_t ByteSizeLong() const PROTOBUF_FINAL;
bool MergePartialFromCodedStream(
::google::protobuf::io::CodedInputStream* input) PROTOBUF_FINAL;
void SerializeWithCachedSizes(
::google::protobuf::io::CodedOutputStream* output) const PROTOBUF_FINAL;
::google::protobuf::uint8* InternalSerializeWithCachedSizesToArray(
bool deterministic, ::google::protobuf::uint8* target) const PROTOBUF_FINAL;
int GetCachedSize() const PROTOBUF_FINAL { return _cached_size_; }
private:
void SharedCtor();
void SharedDtor();
void SetCachedSize(int size) const PROTOBUF_FINAL;
void InternalSwap(NodeDef* other);
protected:
explicit NodeDef(::google::protobuf::Arena* arena);
private:
static void ArenaDtor(void* object);
inline void RegisterArenaDtor(::google::protobuf::Arena* arena);
private:
inline ::google::protobuf::Arena* GetArenaNoVirtual() const {
return _internal_metadata_.arena();
}
inline void* MaybeArenaPtr() const {
return _internal_metadata_.raw_arena_ptr();
}
public:
::google::protobuf::Metadata GetMetadata() const PROTOBUF_FINAL;
// nested types ----------------------------------------------------
// accessors -------------------------------------------------------
// repeated string input = 3;
int input_size() const;
void clear_input();
static const int kInputFieldNumber = 3;
const ::std::string& input(int index) const;
::std::string* mutable_input(int index);
void set_input(int index, const ::std::string& value);
#if LANG_CXX11
void set_input(int index, ::std::string&& value);
#endif
void set_input(int index, const char* value);
void set_input(int index, const char* value, size_t size);
::std::string* add_input();
void add_input(const ::std::string& value);
#if LANG_CXX11
void add_input(::std::string&& value);
#endif
void add_input(const char* value);
void add_input(const char* value, size_t size);
const ::google::protobuf::RepeatedPtrField< ::std::string>& input() const;
::google::protobuf::RepeatedPtrField< ::std::string>* mutable_input();
// map<string, .opencv_tensorflow.AttrValue> attr = 5;
int attr_size() const;
void clear_attr();
static const int kAttrFieldNumber = 5;
const ::google::protobuf::Map< ::std::string, ::opencv_tensorflow::AttrValue >&
attr() const;
::google::protobuf::Map< ::std::string, ::opencv_tensorflow::AttrValue >*
mutable_attr();
// string name = 1;
void clear_name();
static const int kNameFieldNumber = 1;
const ::std::string& name() const;
void set_name(const ::std::string& value);
#if LANG_CXX11
void set_name(::std::string&& value);
#endif
void set_name(const char* value);
void set_name(const char* value, size_t size);
::std::string* mutable_name();
::std::string* release_name();
void set_allocated_name(::std::string* name);
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
::std::string* unsafe_arena_release_name();
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
void unsafe_arena_set_allocated_name(
::std::string* name);
// string op = 2;
void clear_op();
static const int kOpFieldNumber = 2;
const ::std::string& op() const;
void set_op(const ::std::string& value);
#if LANG_CXX11
void set_op(::std::string&& value);
#endif
void set_op(const char* value);
void set_op(const char* value, size_t size);
::std::string* mutable_op();
::std::string* release_op();
void set_allocated_op(::std::string* op);
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
::std::string* unsafe_arena_release_op();
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
void unsafe_arena_set_allocated_op(
::std::string* op);
// string device = 4;
void clear_device();
static const int kDeviceFieldNumber = 4;
const ::std::string& device() const;
void set_device(const ::std::string& value);
#if LANG_CXX11
void set_device(::std::string&& value);
#endif
void set_device(const char* value);
void set_device(const char* value, size_t size);
::std::string* mutable_device();
::std::string* release_device();
void set_allocated_device(::std::string* device);
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
::std::string* unsafe_arena_release_device();
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
void unsafe_arena_set_allocated_device(
::std::string* device);
// @@protoc_insertion_point(class_scope:opencv_tensorflow.NodeDef)
private:
::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_;
template <typename T> friend class ::google::protobuf::Arena::InternalHelper;
typedef void InternalArenaConstructable_;
typedef void DestructorSkippable_;
::google::protobuf::RepeatedPtrField< ::std::string> input_;
::google::protobuf::internal::MapField<
NodeDef_AttrEntry_DoNotUse,
::std::string, ::opencv_tensorflow::AttrValue,
::google::protobuf::internal::WireFormatLite::TYPE_STRING,
::google::protobuf::internal::WireFormatLite::TYPE_MESSAGE,
0 > attr_;
::google::protobuf::internal::ArenaStringPtr name_;
::google::protobuf::internal::ArenaStringPtr op_;
::google::protobuf::internal::ArenaStringPtr device_;
mutable int _cached_size_;
friend struct ::protobuf_graph_2eproto::TableStruct;
friend void ::protobuf_graph_2eproto::InitDefaultsNodeDefImpl();
};
// ===================================================================
// ===================================================================
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif // __GNUC__
// GraphDef
// repeated .opencv_tensorflow.NodeDef node = 1;
inline int GraphDef::node_size() const {
return node_.size();
}
inline void GraphDef::clear_node() {
node_.Clear();
}
inline const ::opencv_tensorflow::NodeDef& GraphDef::node(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.GraphDef.node)
return node_.Get(index);
}
inline ::opencv_tensorflow::NodeDef* GraphDef::mutable_node(int index) {
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.GraphDef.node)
return node_.Mutable(index);
}
inline ::opencv_tensorflow::NodeDef* GraphDef::add_node() {
// @@protoc_insertion_point(field_add:opencv_tensorflow.GraphDef.node)
return node_.Add();
}
inline ::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::NodeDef >*
GraphDef::mutable_node() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.GraphDef.node)
return &node_;
}
inline const ::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::NodeDef >&
GraphDef::node() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.GraphDef.node)
return node_;
}
// .opencv_tensorflow.VersionDef versions = 4;
inline bool GraphDef::has_versions() const {
return this != internal_default_instance() && versions_ != NULL;
}
inline const ::opencv_tensorflow::VersionDef& GraphDef::versions() const {
const ::opencv_tensorflow::VersionDef* p = versions_;
// @@protoc_insertion_point(field_get:opencv_tensorflow.GraphDef.versions)
return p != NULL ? *p : *reinterpret_cast<const ::opencv_tensorflow::VersionDef*>(
&::opencv_tensorflow::_VersionDef_default_instance_);
}
inline ::opencv_tensorflow::VersionDef* GraphDef::release_versions() {
// @@protoc_insertion_point(field_release:opencv_tensorflow.GraphDef.versions)
::opencv_tensorflow::VersionDef* temp = versions_;
if (GetArenaNoVirtual() != NULL) {
temp = ::google::protobuf::internal::DuplicateIfNonNull(temp, NULL);
}
versions_ = NULL;
return temp;
}
inline ::opencv_tensorflow::VersionDef* GraphDef::unsafe_arena_release_versions() {
// @@protoc_insertion_point(field_unsafe_arena_release:opencv_tensorflow.GraphDef.versions)
::opencv_tensorflow::VersionDef* temp = versions_;
versions_ = NULL;
return temp;
}
inline ::opencv_tensorflow::VersionDef* GraphDef::mutable_versions() {
if (versions_ == NULL) {
_slow_mutable_versions();
}
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.GraphDef.versions)
return versions_;
}
inline void GraphDef::set_allocated_versions(::opencv_tensorflow::VersionDef* versions) {
::google::protobuf::Arena* message_arena = GetArenaNoVirtual();
if (message_arena == NULL) {
delete reinterpret_cast< ::google::protobuf::MessageLite*>(versions_);
}
if (versions) {
::google::protobuf::Arena* submessage_arena =
reinterpret_cast< ::google::protobuf::MessageLite*>(versions)->GetArena();
if (message_arena != submessage_arena) {
versions = ::google::protobuf::internal::GetOwnedMessage(
message_arena, versions, submessage_arena);
}
} else {
}
versions_ = versions;
// @@protoc_insertion_point(field_set_allocated:opencv_tensorflow.GraphDef.versions)
}
// int32 version = 3 [deprecated = true];
inline void GraphDef::clear_version() {
version_ = 0;
}
inline ::google::protobuf::int32 GraphDef::version() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.GraphDef.version)
return version_;
}
inline void GraphDef::set_version(::google::protobuf::int32 value) {
version_ = value;
// @@protoc_insertion_point(field_set:opencv_tensorflow.GraphDef.version)
}
// .opencv_tensorflow.FunctionDefLibrary library = 2;
inline bool GraphDef::has_library() const {
return this != internal_default_instance() && library_ != NULL;
}
inline const ::opencv_tensorflow::FunctionDefLibrary& GraphDef::library() const {
const ::opencv_tensorflow::FunctionDefLibrary* p = library_;
// @@protoc_insertion_point(field_get:opencv_tensorflow.GraphDef.library)
return p != NULL ? *p : *reinterpret_cast<const ::opencv_tensorflow::FunctionDefLibrary*>(
&::opencv_tensorflow::_FunctionDefLibrary_default_instance_);
}
inline ::opencv_tensorflow::FunctionDefLibrary* GraphDef::release_library() {
// @@protoc_insertion_point(field_release:opencv_tensorflow.GraphDef.library)
::opencv_tensorflow::FunctionDefLibrary* temp = library_;
if (GetArenaNoVirtual() != NULL) {
temp = ::google::protobuf::internal::DuplicateIfNonNull(temp, NULL);
}
library_ = NULL;
return temp;
}
inline ::opencv_tensorflow::FunctionDefLibrary* GraphDef::unsafe_arena_release_library() {
// @@protoc_insertion_point(field_unsafe_arena_release:opencv_tensorflow.GraphDef.library)
::opencv_tensorflow::FunctionDefLibrary* temp = library_;
library_ = NULL;
return temp;
}
inline ::opencv_tensorflow::FunctionDefLibrary* GraphDef::mutable_library() {
if (library_ == NULL) {
_slow_mutable_library();
}
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.GraphDef.library)
return library_;
}
inline void GraphDef::set_allocated_library(::opencv_tensorflow::FunctionDefLibrary* library) {
::google::protobuf::Arena* message_arena = GetArenaNoVirtual();
if (message_arena == NULL) {
delete reinterpret_cast< ::google::protobuf::MessageLite*>(library_);
}
if (library) {
::google::protobuf::Arena* submessage_arena =
reinterpret_cast< ::google::protobuf::MessageLite*>(library)->GetArena();
if (message_arena != submessage_arena) {
library = ::google::protobuf::internal::GetOwnedMessage(
message_arena, library, submessage_arena);
}
} else {
}
library_ = library;
// @@protoc_insertion_point(field_set_allocated:opencv_tensorflow.GraphDef.library)
}
// -------------------------------------------------------------------
// -------------------------------------------------------------------
// NodeDef
// string name = 1;
inline void NodeDef::clear_name() {
name_.ClearToEmpty(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline const ::std::string& NodeDef::name() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.NodeDef.name)
return name_.Get();
}
inline void NodeDef::set_name(const ::std::string& value) {
name_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value, GetArenaNoVirtual());
// @@protoc_insertion_point(field_set:opencv_tensorflow.NodeDef.name)
}
#if LANG_CXX11
inline void NodeDef::set_name(::std::string&& value) {
name_.Set(
&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_rvalue:opencv_tensorflow.NodeDef.name)
}
#endif
inline void NodeDef::set_name(const char* value) {
GOOGLE_DCHECK(value != NULL);
name_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_char:opencv_tensorflow.NodeDef.name)
}
inline void NodeDef::set_name(const char* value,
size_t size) {
name_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(
reinterpret_cast<const char*>(value), size), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_pointer:opencv_tensorflow.NodeDef.name)
}
inline ::std::string* NodeDef::mutable_name() {
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.NodeDef.name)
return name_.Mutable(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline ::std::string* NodeDef::release_name() {
// @@protoc_insertion_point(field_release:opencv_tensorflow.NodeDef.name)
return name_.Release(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline void NodeDef::set_allocated_name(::std::string* name) {
if (name != NULL) {
} else {
}
name_.SetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), name,
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_allocated:opencv_tensorflow.NodeDef.name)
}
inline ::std::string* NodeDef::unsafe_arena_release_name() {
// @@protoc_insertion_point(field_unsafe_arena_release:opencv_tensorflow.NodeDef.name)
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
return name_.UnsafeArenaRelease(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
GetArenaNoVirtual());
}
inline void NodeDef::unsafe_arena_set_allocated_name(
::std::string* name) {
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
if (name != NULL) {
} else {
}
name_.UnsafeArenaSetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
name, GetArenaNoVirtual());
// @@protoc_insertion_point(field_unsafe_arena_set_allocated:opencv_tensorflow.NodeDef.name)
}
// string op = 2;
inline void NodeDef::clear_op() {
op_.ClearToEmpty(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline const ::std::string& NodeDef::op() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.NodeDef.op)
return op_.Get();
}
inline void NodeDef::set_op(const ::std::string& value) {
op_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value, GetArenaNoVirtual());
// @@protoc_insertion_point(field_set:opencv_tensorflow.NodeDef.op)
}
#if LANG_CXX11
inline void NodeDef::set_op(::std::string&& value) {
op_.Set(
&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_rvalue:opencv_tensorflow.NodeDef.op)
}
#endif
inline void NodeDef::set_op(const char* value) {
GOOGLE_DCHECK(value != NULL);
op_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_char:opencv_tensorflow.NodeDef.op)
}
inline void NodeDef::set_op(const char* value,
size_t size) {
op_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(
reinterpret_cast<const char*>(value), size), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_pointer:opencv_tensorflow.NodeDef.op)
}
inline ::std::string* NodeDef::mutable_op() {
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.NodeDef.op)
return op_.Mutable(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline ::std::string* NodeDef::release_op() {
// @@protoc_insertion_point(field_release:opencv_tensorflow.NodeDef.op)
return op_.Release(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline void NodeDef::set_allocated_op(::std::string* op) {
if (op != NULL) {
} else {
}
op_.SetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), op,
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_allocated:opencv_tensorflow.NodeDef.op)
}
inline ::std::string* NodeDef::unsafe_arena_release_op() {
// @@protoc_insertion_point(field_unsafe_arena_release:opencv_tensorflow.NodeDef.op)
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
return op_.UnsafeArenaRelease(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
GetArenaNoVirtual());
}
inline void NodeDef::unsafe_arena_set_allocated_op(
::std::string* op) {
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
if (op != NULL) {
} else {
}
op_.UnsafeArenaSetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
op, GetArenaNoVirtual());
// @@protoc_insertion_point(field_unsafe_arena_set_allocated:opencv_tensorflow.NodeDef.op)
}
// repeated string input = 3;
inline int NodeDef::input_size() const {
return input_.size();
}
inline void NodeDef::clear_input() {
input_.Clear();
}
inline const ::std::string& NodeDef::input(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.NodeDef.input)
return input_.Get(index);
}
inline ::std::string* NodeDef::mutable_input(int index) {
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.NodeDef.input)
return input_.Mutable(index);
}
inline void NodeDef::set_input(int index, const ::std::string& value) {
// @@protoc_insertion_point(field_set:opencv_tensorflow.NodeDef.input)
input_.Mutable(index)->assign(value);
}
#if LANG_CXX11
inline void NodeDef::set_input(int index, ::std::string&& value) {
// @@protoc_insertion_point(field_set:opencv_tensorflow.NodeDef.input)
input_.Mutable(index)->assign(std::move(value));
}
#endif
inline void NodeDef::set_input(int index, const char* value) {
GOOGLE_DCHECK(value != NULL);
input_.Mutable(index)->assign(value);
// @@protoc_insertion_point(field_set_char:opencv_tensorflow.NodeDef.input)
}
inline void NodeDef::set_input(int index, const char* value, size_t size) {
input_.Mutable(index)->assign(
reinterpret_cast<const char*>(value), size);
// @@protoc_insertion_point(field_set_pointer:opencv_tensorflow.NodeDef.input)
}
inline ::std::string* NodeDef::add_input() {
// @@protoc_insertion_point(field_add_mutable:opencv_tensorflow.NodeDef.input)
return input_.Add();
}
inline void NodeDef::add_input(const ::std::string& value) {
input_.Add()->assign(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.NodeDef.input)
}
#if LANG_CXX11
inline void NodeDef::add_input(::std::string&& value) {
input_.Add(std::move(value));
// @@protoc_insertion_point(field_add:opencv_tensorflow.NodeDef.input)
}
#endif
inline void NodeDef::add_input(const char* value) {
GOOGLE_DCHECK(value != NULL);
input_.Add()->assign(value);
// @@protoc_insertion_point(field_add_char:opencv_tensorflow.NodeDef.input)
}
inline void NodeDef::add_input(const char* value, size_t size) {
input_.Add()->assign(reinterpret_cast<const char*>(value), size);
// @@protoc_insertion_point(field_add_pointer:opencv_tensorflow.NodeDef.input)
}
inline const ::google::protobuf::RepeatedPtrField< ::std::string>&
NodeDef::input() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.NodeDef.input)
return input_;
}
inline ::google::protobuf::RepeatedPtrField< ::std::string>*
NodeDef::mutable_input() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.NodeDef.input)
return &input_;
}
// string device = 4;
inline void NodeDef::clear_device() {
device_.ClearToEmpty(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline const ::std::string& NodeDef::device() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.NodeDef.device)
return device_.Get();
}
inline void NodeDef::set_device(const ::std::string& value) {
device_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value, GetArenaNoVirtual());
// @@protoc_insertion_point(field_set:opencv_tensorflow.NodeDef.device)
}
#if LANG_CXX11
inline void NodeDef::set_device(::std::string&& value) {
device_.Set(
&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_rvalue:opencv_tensorflow.NodeDef.device)
}
#endif
inline void NodeDef::set_device(const char* value) {
GOOGLE_DCHECK(value != NULL);
device_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_char:opencv_tensorflow.NodeDef.device)
}
inline void NodeDef::set_device(const char* value,
size_t size) {
device_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(
reinterpret_cast<const char*>(value), size), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_pointer:opencv_tensorflow.NodeDef.device)
}
inline ::std::string* NodeDef::mutable_device() {
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.NodeDef.device)
return device_.Mutable(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline ::std::string* NodeDef::release_device() {
// @@protoc_insertion_point(field_release:opencv_tensorflow.NodeDef.device)
return device_.Release(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline void NodeDef::set_allocated_device(::std::string* device) {
if (device != NULL) {
} else {
}
device_.SetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), device,
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_allocated:opencv_tensorflow.NodeDef.device)
}
inline ::std::string* NodeDef::unsafe_arena_release_device() {
// @@protoc_insertion_point(field_unsafe_arena_release:opencv_tensorflow.NodeDef.device)
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
return device_.UnsafeArenaRelease(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
GetArenaNoVirtual());
}
inline void NodeDef::unsafe_arena_set_allocated_device(
::std::string* device) {
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
if (device != NULL) {
} else {
}
device_.UnsafeArenaSetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
device, GetArenaNoVirtual());
// @@protoc_insertion_point(field_unsafe_arena_set_allocated:opencv_tensorflow.NodeDef.device)
}
// map<string, .opencv_tensorflow.AttrValue> attr = 5;
inline int NodeDef::attr_size() const {
return attr_.size();
}
inline const ::google::protobuf::Map< ::std::string, ::opencv_tensorflow::AttrValue >&
NodeDef::attr() const {
// @@protoc_insertion_point(field_map:opencv_tensorflow.NodeDef.attr)
return attr_.GetMap();
}
inline ::google::protobuf::Map< ::std::string, ::opencv_tensorflow::AttrValue >*
NodeDef::mutable_attr() {
// @@protoc_insertion_point(field_mutable_map:opencv_tensorflow.NodeDef.attr)
return attr_.MutableMap();
}
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif // __GNUC__
// -------------------------------------------------------------------
// -------------------------------------------------------------------
// @@protoc_insertion_point(namespace_scope)
} // namespace opencv_tensorflow
// @@protoc_insertion_point(global_scope)
#endif // PROTOBUF_graph_2eproto__INCLUDED

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,844 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: tensor.proto
#ifndef PROTOBUF_tensor_2eproto__INCLUDED
#define PROTOBUF_tensor_2eproto__INCLUDED
#include <string>
#include <google/protobuf/stubs/common.h>
#if GOOGLE_PROTOBUF_VERSION < 3005000
#error This file was generated by a newer version of protoc which is
#error incompatible with your Protocol Buffer headers. Please update
#error your headers.
#endif
#if 3005001 < GOOGLE_PROTOBUF_MIN_PROTOC_VERSION
#error This file was generated by an older version of protoc which is
#error incompatible with your Protocol Buffer headers. Please
#error regenerate this file with a newer version of protoc.
#endif
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/arena.h>
#include <google/protobuf/arenastring.h>
#include <google/protobuf/generated_message_table_driven.h>
#include <google/protobuf/generated_message_util.h>
#include <google/protobuf/metadata.h>
#include <google/protobuf/message.h>
#include <google/protobuf/repeated_field.h> // IWYU pragma: export
#include <google/protobuf/extension_set.h> // IWYU pragma: export
#include <google/protobuf/unknown_field_set.h>
#include "tensor_shape.pb.h"
#include "types.pb.h"
// @@protoc_insertion_point(includes)
namespace protobuf_tensor_2eproto {
// Internal implementation detail -- do not use these members.
struct TableStruct {
static const ::google::protobuf::internal::ParseTableField entries[];
static const ::google::protobuf::internal::AuxillaryParseTableField aux[];
static const ::google::protobuf::internal::ParseTable schema[1];
static const ::google::protobuf::internal::FieldMetadata field_metadata[];
static const ::google::protobuf::internal::SerializationTable serialization_table[];
static const ::google::protobuf::uint32 offsets[];
};
void AddDescriptors();
void InitDefaultsTensorProtoImpl();
void InitDefaultsTensorProto();
inline void InitDefaults() {
InitDefaultsTensorProto();
}
} // namespace protobuf_tensor_2eproto
namespace opencv_tensorflow {
class TensorProto;
class TensorProtoDefaultTypeInternal;
extern TensorProtoDefaultTypeInternal _TensorProto_default_instance_;
} // namespace opencv_tensorflow
namespace opencv_tensorflow {
// ===================================================================
class TensorProto : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:opencv_tensorflow.TensorProto) */ {
public:
TensorProto();
virtual ~TensorProto();
TensorProto(const TensorProto& from);
inline TensorProto& operator=(const TensorProto& from) {
CopyFrom(from);
return *this;
}
#if LANG_CXX11
TensorProto(TensorProto&& from) noexcept
: TensorProto() {
*this = ::std::move(from);
}
inline TensorProto& operator=(TensorProto&& from) noexcept {
if (GetArenaNoVirtual() == from.GetArenaNoVirtual()) {
if (this != &from) InternalSwap(&from);
} else {
CopyFrom(from);
}
return *this;
}
#endif
inline ::google::protobuf::Arena* GetArena() const PROTOBUF_FINAL {
return GetArenaNoVirtual();
}
inline void* GetMaybeArenaPointer() const PROTOBUF_FINAL {
return MaybeArenaPtr();
}
static const ::google::protobuf::Descriptor* descriptor();
static const TensorProto& default_instance();
static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY
static inline const TensorProto* internal_default_instance() {
return reinterpret_cast<const TensorProto*>(
&_TensorProto_default_instance_);
}
static PROTOBUF_CONSTEXPR int const kIndexInFileMessages =
0;
void UnsafeArenaSwap(TensorProto* other);
void Swap(TensorProto* other);
friend void swap(TensorProto& a, TensorProto& b) {
a.Swap(&b);
}
// implements Message ----------------------------------------------
inline TensorProto* New() const PROTOBUF_FINAL { return New(NULL); }
TensorProto* New(::google::protobuf::Arena* arena) const PROTOBUF_FINAL;
void CopyFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void MergeFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void CopyFrom(const TensorProto& from);
void MergeFrom(const TensorProto& from);
void Clear() PROTOBUF_FINAL;
bool IsInitialized() const PROTOBUF_FINAL;
size_t ByteSizeLong() const PROTOBUF_FINAL;
bool MergePartialFromCodedStream(
::google::protobuf::io::CodedInputStream* input) PROTOBUF_FINAL;
void SerializeWithCachedSizes(
::google::protobuf::io::CodedOutputStream* output) const PROTOBUF_FINAL;
::google::protobuf::uint8* InternalSerializeWithCachedSizesToArray(
bool deterministic, ::google::protobuf::uint8* target) const PROTOBUF_FINAL;
int GetCachedSize() const PROTOBUF_FINAL { return _cached_size_; }
private:
void SharedCtor();
void SharedDtor();
void SetCachedSize(int size) const PROTOBUF_FINAL;
void InternalSwap(TensorProto* other);
protected:
explicit TensorProto(::google::protobuf::Arena* arena);
private:
static void ArenaDtor(void* object);
inline void RegisterArenaDtor(::google::protobuf::Arena* arena);
private:
inline ::google::protobuf::Arena* GetArenaNoVirtual() const {
return _internal_metadata_.arena();
}
inline void* MaybeArenaPtr() const {
return _internal_metadata_.raw_arena_ptr();
}
public:
::google::protobuf::Metadata GetMetadata() const PROTOBUF_FINAL;
// nested types ----------------------------------------------------
// accessors -------------------------------------------------------
// repeated float float_val = 5 [packed = true];
int float_val_size() const;
void clear_float_val();
static const int kFloatValFieldNumber = 5;
float float_val(int index) const;
void set_float_val(int index, float value);
void add_float_val(float value);
const ::google::protobuf::RepeatedField< float >&
float_val() const;
::google::protobuf::RepeatedField< float >*
mutable_float_val();
// repeated double double_val = 6 [packed = true];
int double_val_size() const;
void clear_double_val();
static const int kDoubleValFieldNumber = 6;
double double_val(int index) const;
void set_double_val(int index, double value);
void add_double_val(double value);
const ::google::protobuf::RepeatedField< double >&
double_val() const;
::google::protobuf::RepeatedField< double >*
mutable_double_val();
// repeated int32 int_val = 7 [packed = true];
int int_val_size() const;
void clear_int_val();
static const int kIntValFieldNumber = 7;
::google::protobuf::int32 int_val(int index) const;
void set_int_val(int index, ::google::protobuf::int32 value);
void add_int_val(::google::protobuf::int32 value);
const ::google::protobuf::RepeatedField< ::google::protobuf::int32 >&
int_val() const;
::google::protobuf::RepeatedField< ::google::protobuf::int32 >*
mutable_int_val();
// repeated bytes string_val = 8;
int string_val_size() const;
void clear_string_val();
static const int kStringValFieldNumber = 8;
const ::std::string& string_val(int index) const;
::std::string* mutable_string_val(int index);
void set_string_val(int index, const ::std::string& value);
#if LANG_CXX11
void set_string_val(int index, ::std::string&& value);
#endif
void set_string_val(int index, const char* value);
void set_string_val(int index, const void* value, size_t size);
::std::string* add_string_val();
void add_string_val(const ::std::string& value);
#if LANG_CXX11
void add_string_val(::std::string&& value);
#endif
void add_string_val(const char* value);
void add_string_val(const void* value, size_t size);
const ::google::protobuf::RepeatedPtrField< ::std::string>& string_val() const;
::google::protobuf::RepeatedPtrField< ::std::string>* mutable_string_val();
// repeated float scomplex_val = 9 [packed = true];
int scomplex_val_size() const;
void clear_scomplex_val();
static const int kScomplexValFieldNumber = 9;
float scomplex_val(int index) const;
void set_scomplex_val(int index, float value);
void add_scomplex_val(float value);
const ::google::protobuf::RepeatedField< float >&
scomplex_val() const;
::google::protobuf::RepeatedField< float >*
mutable_scomplex_val();
// repeated int64 int64_val = 10 [packed = true];
int int64_val_size() const;
void clear_int64_val();
static const int kInt64ValFieldNumber = 10;
::google::protobuf::int64 int64_val(int index) const;
void set_int64_val(int index, ::google::protobuf::int64 value);
void add_int64_val(::google::protobuf::int64 value);
const ::google::protobuf::RepeatedField< ::google::protobuf::int64 >&
int64_val() const;
::google::protobuf::RepeatedField< ::google::protobuf::int64 >*
mutable_int64_val();
// repeated bool bool_val = 11 [packed = true];
int bool_val_size() const;
void clear_bool_val();
static const int kBoolValFieldNumber = 11;
bool bool_val(int index) const;
void set_bool_val(int index, bool value);
void add_bool_val(bool value);
const ::google::protobuf::RepeatedField< bool >&
bool_val() const;
::google::protobuf::RepeatedField< bool >*
mutable_bool_val();
// repeated double dcomplex_val = 12 [packed = true];
int dcomplex_val_size() const;
void clear_dcomplex_val();
static const int kDcomplexValFieldNumber = 12;
double dcomplex_val(int index) const;
void set_dcomplex_val(int index, double value);
void add_dcomplex_val(double value);
const ::google::protobuf::RepeatedField< double >&
dcomplex_val() const;
::google::protobuf::RepeatedField< double >*
mutable_dcomplex_val();
// repeated int32 half_val = 13 [packed = true];
int half_val_size() const;
void clear_half_val();
static const int kHalfValFieldNumber = 13;
::google::protobuf::int32 half_val(int index) const;
void set_half_val(int index, ::google::protobuf::int32 value);
void add_half_val(::google::protobuf::int32 value);
const ::google::protobuf::RepeatedField< ::google::protobuf::int32 >&
half_val() const;
::google::protobuf::RepeatedField< ::google::protobuf::int32 >*
mutable_half_val();
// bytes tensor_content = 4;
void clear_tensor_content();
static const int kTensorContentFieldNumber = 4;
const ::std::string& tensor_content() const;
void set_tensor_content(const ::std::string& value);
#if LANG_CXX11
void set_tensor_content(::std::string&& value);
#endif
void set_tensor_content(const char* value);
void set_tensor_content(const void* value, size_t size);
::std::string* mutable_tensor_content();
::std::string* release_tensor_content();
void set_allocated_tensor_content(::std::string* tensor_content);
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
::std::string* unsafe_arena_release_tensor_content();
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
void unsafe_arena_set_allocated_tensor_content(
::std::string* tensor_content);
// .opencv_tensorflow.TensorShapeProto tensor_shape = 2;
bool has_tensor_shape() const;
void clear_tensor_shape();
static const int kTensorShapeFieldNumber = 2;
private:
void _slow_mutable_tensor_shape();
public:
const ::opencv_tensorflow::TensorShapeProto& tensor_shape() const;
::opencv_tensorflow::TensorShapeProto* release_tensor_shape();
::opencv_tensorflow::TensorShapeProto* mutable_tensor_shape();
void set_allocated_tensor_shape(::opencv_tensorflow::TensorShapeProto* tensor_shape);
void unsafe_arena_set_allocated_tensor_shape(
::opencv_tensorflow::TensorShapeProto* tensor_shape);
::opencv_tensorflow::TensorShapeProto* unsafe_arena_release_tensor_shape();
// .opencv_tensorflow.DataType dtype = 1;
void clear_dtype();
static const int kDtypeFieldNumber = 1;
::opencv_tensorflow::DataType dtype() const;
void set_dtype(::opencv_tensorflow::DataType value);
// int32 version_number = 3;
void clear_version_number();
static const int kVersionNumberFieldNumber = 3;
::google::protobuf::int32 version_number() const;
void set_version_number(::google::protobuf::int32 value);
// @@protoc_insertion_point(class_scope:opencv_tensorflow.TensorProto)
private:
::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_;
template <typename T> friend class ::google::protobuf::Arena::InternalHelper;
typedef void InternalArenaConstructable_;
typedef void DestructorSkippable_;
::google::protobuf::RepeatedField< float > float_val_;
mutable int _float_val_cached_byte_size_;
::google::protobuf::RepeatedField< double > double_val_;
mutable int _double_val_cached_byte_size_;
::google::protobuf::RepeatedField< ::google::protobuf::int32 > int_val_;
mutable int _int_val_cached_byte_size_;
::google::protobuf::RepeatedPtrField< ::std::string> string_val_;
::google::protobuf::RepeatedField< float > scomplex_val_;
mutable int _scomplex_val_cached_byte_size_;
::google::protobuf::RepeatedField< ::google::protobuf::int64 > int64_val_;
mutable int _int64_val_cached_byte_size_;
::google::protobuf::RepeatedField< bool > bool_val_;
mutable int _bool_val_cached_byte_size_;
::google::protobuf::RepeatedField< double > dcomplex_val_;
mutable int _dcomplex_val_cached_byte_size_;
::google::protobuf::RepeatedField< ::google::protobuf::int32 > half_val_;
mutable int _half_val_cached_byte_size_;
::google::protobuf::internal::ArenaStringPtr tensor_content_;
::opencv_tensorflow::TensorShapeProto* tensor_shape_;
int dtype_;
::google::protobuf::int32 version_number_;
mutable int _cached_size_;
friend struct ::protobuf_tensor_2eproto::TableStruct;
friend void ::protobuf_tensor_2eproto::InitDefaultsTensorProtoImpl();
};
// ===================================================================
// ===================================================================
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif // __GNUC__
// TensorProto
// .opencv_tensorflow.DataType dtype = 1;
inline void TensorProto::clear_dtype() {
dtype_ = 0;
}
inline ::opencv_tensorflow::DataType TensorProto::dtype() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.dtype)
return static_cast< ::opencv_tensorflow::DataType >(dtype_);
}
inline void TensorProto::set_dtype(::opencv_tensorflow::DataType value) {
dtype_ = value;
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.dtype)
}
// .opencv_tensorflow.TensorShapeProto tensor_shape = 2;
inline bool TensorProto::has_tensor_shape() const {
return this != internal_default_instance() && tensor_shape_ != NULL;
}
inline const ::opencv_tensorflow::TensorShapeProto& TensorProto::tensor_shape() const {
const ::opencv_tensorflow::TensorShapeProto* p = tensor_shape_;
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.tensor_shape)
return p != NULL ? *p : *reinterpret_cast<const ::opencv_tensorflow::TensorShapeProto*>(
&::opencv_tensorflow::_TensorShapeProto_default_instance_);
}
inline ::opencv_tensorflow::TensorShapeProto* TensorProto::release_tensor_shape() {
// @@protoc_insertion_point(field_release:opencv_tensorflow.TensorProto.tensor_shape)
::opencv_tensorflow::TensorShapeProto* temp = tensor_shape_;
if (GetArenaNoVirtual() != NULL) {
temp = ::google::protobuf::internal::DuplicateIfNonNull(temp, NULL);
}
tensor_shape_ = NULL;
return temp;
}
inline ::opencv_tensorflow::TensorShapeProto* TensorProto::unsafe_arena_release_tensor_shape() {
// @@protoc_insertion_point(field_unsafe_arena_release:opencv_tensorflow.TensorProto.tensor_shape)
::opencv_tensorflow::TensorShapeProto* temp = tensor_shape_;
tensor_shape_ = NULL;
return temp;
}
inline ::opencv_tensorflow::TensorShapeProto* TensorProto::mutable_tensor_shape() {
if (tensor_shape_ == NULL) {
_slow_mutable_tensor_shape();
}
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.TensorProto.tensor_shape)
return tensor_shape_;
}
inline void TensorProto::set_allocated_tensor_shape(::opencv_tensorflow::TensorShapeProto* tensor_shape) {
::google::protobuf::Arena* message_arena = GetArenaNoVirtual();
if (message_arena == NULL) {
delete reinterpret_cast< ::google::protobuf::MessageLite*>(tensor_shape_);
}
if (tensor_shape) {
::google::protobuf::Arena* submessage_arena =
reinterpret_cast< ::google::protobuf::MessageLite*>(tensor_shape)->GetArena();
if (message_arena != submessage_arena) {
tensor_shape = ::google::protobuf::internal::GetOwnedMessage(
message_arena, tensor_shape, submessage_arena);
}
} else {
}
tensor_shape_ = tensor_shape;
// @@protoc_insertion_point(field_set_allocated:opencv_tensorflow.TensorProto.tensor_shape)
}
// int32 version_number = 3;
inline void TensorProto::clear_version_number() {
version_number_ = 0;
}
inline ::google::protobuf::int32 TensorProto::version_number() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.version_number)
return version_number_;
}
inline void TensorProto::set_version_number(::google::protobuf::int32 value) {
version_number_ = value;
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.version_number)
}
// bytes tensor_content = 4;
inline void TensorProto::clear_tensor_content() {
tensor_content_.ClearToEmpty(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline const ::std::string& TensorProto::tensor_content() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.tensor_content)
return tensor_content_.Get();
}
inline void TensorProto::set_tensor_content(const ::std::string& value) {
tensor_content_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value, GetArenaNoVirtual());
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.tensor_content)
}
#if LANG_CXX11
inline void TensorProto::set_tensor_content(::std::string&& value) {
tensor_content_.Set(
&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_rvalue:opencv_tensorflow.TensorProto.tensor_content)
}
#endif
inline void TensorProto::set_tensor_content(const char* value) {
GOOGLE_DCHECK(value != NULL);
tensor_content_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_char:opencv_tensorflow.TensorProto.tensor_content)
}
inline void TensorProto::set_tensor_content(const void* value,
size_t size) {
tensor_content_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(
reinterpret_cast<const char*>(value), size), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_pointer:opencv_tensorflow.TensorProto.tensor_content)
}
inline ::std::string* TensorProto::mutable_tensor_content() {
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.TensorProto.tensor_content)
return tensor_content_.Mutable(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline ::std::string* TensorProto::release_tensor_content() {
// @@protoc_insertion_point(field_release:opencv_tensorflow.TensorProto.tensor_content)
return tensor_content_.Release(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline void TensorProto::set_allocated_tensor_content(::std::string* tensor_content) {
if (tensor_content != NULL) {
} else {
}
tensor_content_.SetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), tensor_content,
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_allocated:opencv_tensorflow.TensorProto.tensor_content)
}
inline ::std::string* TensorProto::unsafe_arena_release_tensor_content() {
// @@protoc_insertion_point(field_unsafe_arena_release:opencv_tensorflow.TensorProto.tensor_content)
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
return tensor_content_.UnsafeArenaRelease(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
GetArenaNoVirtual());
}
inline void TensorProto::unsafe_arena_set_allocated_tensor_content(
::std::string* tensor_content) {
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
if (tensor_content != NULL) {
} else {
}
tensor_content_.UnsafeArenaSetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
tensor_content, GetArenaNoVirtual());
// @@protoc_insertion_point(field_unsafe_arena_set_allocated:opencv_tensorflow.TensorProto.tensor_content)
}
// repeated int32 half_val = 13 [packed = true];
inline int TensorProto::half_val_size() const {
return half_val_.size();
}
inline void TensorProto::clear_half_val() {
half_val_.Clear();
}
inline ::google::protobuf::int32 TensorProto::half_val(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.half_val)
return half_val_.Get(index);
}
inline void TensorProto::set_half_val(int index, ::google::protobuf::int32 value) {
half_val_.Set(index, value);
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.half_val)
}
inline void TensorProto::add_half_val(::google::protobuf::int32 value) {
half_val_.Add(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.half_val)
}
inline const ::google::protobuf::RepeatedField< ::google::protobuf::int32 >&
TensorProto::half_val() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorProto.half_val)
return half_val_;
}
inline ::google::protobuf::RepeatedField< ::google::protobuf::int32 >*
TensorProto::mutable_half_val() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorProto.half_val)
return &half_val_;
}
// repeated float float_val = 5 [packed = true];
inline int TensorProto::float_val_size() const {
return float_val_.size();
}
inline void TensorProto::clear_float_val() {
float_val_.Clear();
}
inline float TensorProto::float_val(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.float_val)
return float_val_.Get(index);
}
inline void TensorProto::set_float_val(int index, float value) {
float_val_.Set(index, value);
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.float_val)
}
inline void TensorProto::add_float_val(float value) {
float_val_.Add(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.float_val)
}
inline const ::google::protobuf::RepeatedField< float >&
TensorProto::float_val() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorProto.float_val)
return float_val_;
}
inline ::google::protobuf::RepeatedField< float >*
TensorProto::mutable_float_val() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorProto.float_val)
return &float_val_;
}
// repeated double double_val = 6 [packed = true];
inline int TensorProto::double_val_size() const {
return double_val_.size();
}
inline void TensorProto::clear_double_val() {
double_val_.Clear();
}
inline double TensorProto::double_val(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.double_val)
return double_val_.Get(index);
}
inline void TensorProto::set_double_val(int index, double value) {
double_val_.Set(index, value);
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.double_val)
}
inline void TensorProto::add_double_val(double value) {
double_val_.Add(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.double_val)
}
inline const ::google::protobuf::RepeatedField< double >&
TensorProto::double_val() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorProto.double_val)
return double_val_;
}
inline ::google::protobuf::RepeatedField< double >*
TensorProto::mutable_double_val() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorProto.double_val)
return &double_val_;
}
// repeated int32 int_val = 7 [packed = true];
inline int TensorProto::int_val_size() const {
return int_val_.size();
}
inline void TensorProto::clear_int_val() {
int_val_.Clear();
}
inline ::google::protobuf::int32 TensorProto::int_val(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.int_val)
return int_val_.Get(index);
}
inline void TensorProto::set_int_val(int index, ::google::protobuf::int32 value) {
int_val_.Set(index, value);
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.int_val)
}
inline void TensorProto::add_int_val(::google::protobuf::int32 value) {
int_val_.Add(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.int_val)
}
inline const ::google::protobuf::RepeatedField< ::google::protobuf::int32 >&
TensorProto::int_val() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorProto.int_val)
return int_val_;
}
inline ::google::protobuf::RepeatedField< ::google::protobuf::int32 >*
TensorProto::mutable_int_val() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorProto.int_val)
return &int_val_;
}
// repeated bytes string_val = 8;
inline int TensorProto::string_val_size() const {
return string_val_.size();
}
inline void TensorProto::clear_string_val() {
string_val_.Clear();
}
inline const ::std::string& TensorProto::string_val(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.string_val)
return string_val_.Get(index);
}
inline ::std::string* TensorProto::mutable_string_val(int index) {
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.TensorProto.string_val)
return string_val_.Mutable(index);
}
inline void TensorProto::set_string_val(int index, const ::std::string& value) {
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.string_val)
string_val_.Mutable(index)->assign(value);
}
#if LANG_CXX11
inline void TensorProto::set_string_val(int index, ::std::string&& value) {
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.string_val)
string_val_.Mutable(index)->assign(std::move(value));
}
#endif
inline void TensorProto::set_string_val(int index, const char* value) {
GOOGLE_DCHECK(value != NULL);
string_val_.Mutable(index)->assign(value);
// @@protoc_insertion_point(field_set_char:opencv_tensorflow.TensorProto.string_val)
}
inline void TensorProto::set_string_val(int index, const void* value, size_t size) {
string_val_.Mutable(index)->assign(
reinterpret_cast<const char*>(value), size);
// @@protoc_insertion_point(field_set_pointer:opencv_tensorflow.TensorProto.string_val)
}
inline ::std::string* TensorProto::add_string_val() {
// @@protoc_insertion_point(field_add_mutable:opencv_tensorflow.TensorProto.string_val)
return string_val_.Add();
}
inline void TensorProto::add_string_val(const ::std::string& value) {
string_val_.Add()->assign(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.string_val)
}
#if LANG_CXX11
inline void TensorProto::add_string_val(::std::string&& value) {
string_val_.Add(std::move(value));
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.string_val)
}
#endif
inline void TensorProto::add_string_val(const char* value) {
GOOGLE_DCHECK(value != NULL);
string_val_.Add()->assign(value);
// @@protoc_insertion_point(field_add_char:opencv_tensorflow.TensorProto.string_val)
}
inline void TensorProto::add_string_val(const void* value, size_t size) {
string_val_.Add()->assign(reinterpret_cast<const char*>(value), size);
// @@protoc_insertion_point(field_add_pointer:opencv_tensorflow.TensorProto.string_val)
}
inline const ::google::protobuf::RepeatedPtrField< ::std::string>&
TensorProto::string_val() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorProto.string_val)
return string_val_;
}
inline ::google::protobuf::RepeatedPtrField< ::std::string>*
TensorProto::mutable_string_val() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorProto.string_val)
return &string_val_;
}
// repeated float scomplex_val = 9 [packed = true];
inline int TensorProto::scomplex_val_size() const {
return scomplex_val_.size();
}
inline void TensorProto::clear_scomplex_val() {
scomplex_val_.Clear();
}
inline float TensorProto::scomplex_val(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.scomplex_val)
return scomplex_val_.Get(index);
}
inline void TensorProto::set_scomplex_val(int index, float value) {
scomplex_val_.Set(index, value);
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.scomplex_val)
}
inline void TensorProto::add_scomplex_val(float value) {
scomplex_val_.Add(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.scomplex_val)
}
inline const ::google::protobuf::RepeatedField< float >&
TensorProto::scomplex_val() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorProto.scomplex_val)
return scomplex_val_;
}
inline ::google::protobuf::RepeatedField< float >*
TensorProto::mutable_scomplex_val() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorProto.scomplex_val)
return &scomplex_val_;
}
// repeated int64 int64_val = 10 [packed = true];
inline int TensorProto::int64_val_size() const {
return int64_val_.size();
}
inline void TensorProto::clear_int64_val() {
int64_val_.Clear();
}
inline ::google::protobuf::int64 TensorProto::int64_val(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.int64_val)
return int64_val_.Get(index);
}
inline void TensorProto::set_int64_val(int index, ::google::protobuf::int64 value) {
int64_val_.Set(index, value);
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.int64_val)
}
inline void TensorProto::add_int64_val(::google::protobuf::int64 value) {
int64_val_.Add(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.int64_val)
}
inline const ::google::protobuf::RepeatedField< ::google::protobuf::int64 >&
TensorProto::int64_val() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorProto.int64_val)
return int64_val_;
}
inline ::google::protobuf::RepeatedField< ::google::protobuf::int64 >*
TensorProto::mutable_int64_val() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorProto.int64_val)
return &int64_val_;
}
// repeated bool bool_val = 11 [packed = true];
inline int TensorProto::bool_val_size() const {
return bool_val_.size();
}
inline void TensorProto::clear_bool_val() {
bool_val_.Clear();
}
inline bool TensorProto::bool_val(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.bool_val)
return bool_val_.Get(index);
}
inline void TensorProto::set_bool_val(int index, bool value) {
bool_val_.Set(index, value);
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.bool_val)
}
inline void TensorProto::add_bool_val(bool value) {
bool_val_.Add(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.bool_val)
}
inline const ::google::protobuf::RepeatedField< bool >&
TensorProto::bool_val() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorProto.bool_val)
return bool_val_;
}
inline ::google::protobuf::RepeatedField< bool >*
TensorProto::mutable_bool_val() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorProto.bool_val)
return &bool_val_;
}
// repeated double dcomplex_val = 12 [packed = true];
inline int TensorProto::dcomplex_val_size() const {
return dcomplex_val_.size();
}
inline void TensorProto::clear_dcomplex_val() {
dcomplex_val_.Clear();
}
inline double TensorProto::dcomplex_val(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.dcomplex_val)
return dcomplex_val_.Get(index);
}
inline void TensorProto::set_dcomplex_val(int index, double value) {
dcomplex_val_.Set(index, value);
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.dcomplex_val)
}
inline void TensorProto::add_dcomplex_val(double value) {
dcomplex_val_.Add(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.dcomplex_val)
}
inline const ::google::protobuf::RepeatedField< double >&
TensorProto::dcomplex_val() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorProto.dcomplex_val)
return dcomplex_val_;
}
inline ::google::protobuf::RepeatedField< double >*
TensorProto::mutable_dcomplex_val() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorProto.dcomplex_val)
return &dcomplex_val_;
}
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif // __GNUC__
// @@protoc_insertion_point(namespace_scope)
} // namespace opencv_tensorflow
// @@protoc_insertion_point(global_scope)
#endif // PROTOBUF_tensor_2eproto__INCLUDED


@@ -0,0 +1,783 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: tensor_shape.proto
#include "tensor_shape.pb.h"
#include <algorithm>
#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/port.h>
#include <google/protobuf/stubs/once.h>
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/wire_format_lite_inl.h>
#include <google/protobuf/descriptor.h>
#include <google/protobuf/generated_message_reflection.h>
#include <google/protobuf/reflection_ops.h>
#include <google/protobuf/wire_format.h>
// This is a temporary google only hack
#ifdef GOOGLE_PROTOBUF_ENFORCE_UNIQUENESS
#include "third_party/protobuf/version.h"
#endif
// @@protoc_insertion_point(includes)
namespace opencv_tensorflow {
class TensorShapeProto_DimDefaultTypeInternal {
public:
::google::protobuf::internal::ExplicitlyConstructed<TensorShapeProto_Dim>
_instance;
} _TensorShapeProto_Dim_default_instance_;
class TensorShapeProtoDefaultTypeInternal {
public:
::google::protobuf::internal::ExplicitlyConstructed<TensorShapeProto>
_instance;
} _TensorShapeProto_default_instance_;
} // namespace opencv_tensorflow
namespace protobuf_tensor_5fshape_2eproto {
void InitDefaultsTensorShapeProto_DimImpl() {
GOOGLE_PROTOBUF_VERIFY_VERSION;
#ifdef GOOGLE_PROTOBUF_ENFORCE_UNIQUENESS
::google::protobuf::internal::InitProtobufDefaultsForceUnique();
#else
::google::protobuf::internal::InitProtobufDefaults();
#endif // GOOGLE_PROTOBUF_ENFORCE_UNIQUENESS
{
void* ptr = &::opencv_tensorflow::_TensorShapeProto_Dim_default_instance_;
new (ptr) ::opencv_tensorflow::TensorShapeProto_Dim();
::google::protobuf::internal::OnShutdownDestroyMessage(ptr);
}
::opencv_tensorflow::TensorShapeProto_Dim::InitAsDefaultInstance();
}
void InitDefaultsTensorShapeProto_Dim() {
static GOOGLE_PROTOBUF_DECLARE_ONCE(once);
::google::protobuf::GoogleOnceInit(&once, &InitDefaultsTensorShapeProto_DimImpl);
}
void InitDefaultsTensorShapeProtoImpl() {
GOOGLE_PROTOBUF_VERIFY_VERSION;
#ifdef GOOGLE_PROTOBUF_ENFORCE_UNIQUENESS
::google::protobuf::internal::InitProtobufDefaultsForceUnique();
#else
::google::protobuf::internal::InitProtobufDefaults();
#endif // GOOGLE_PROTOBUF_ENFORCE_UNIQUENESS
protobuf_tensor_5fshape_2eproto::InitDefaultsTensorShapeProto_Dim();
{
void* ptr = &::opencv_tensorflow::_TensorShapeProto_default_instance_;
new (ptr) ::opencv_tensorflow::TensorShapeProto();
::google::protobuf::internal::OnShutdownDestroyMessage(ptr);
}
::opencv_tensorflow::TensorShapeProto::InitAsDefaultInstance();
}
void InitDefaultsTensorShapeProto() {
static GOOGLE_PROTOBUF_DECLARE_ONCE(once);
::google::protobuf::GoogleOnceInit(&once, &InitDefaultsTensorShapeProtoImpl);
}
::google::protobuf::Metadata file_level_metadata[2];
const ::google::protobuf::uint32 TableStruct::offsets[] GOOGLE_PROTOBUF_ATTRIBUTE_SECTION_VARIABLE(protodesc_cold) = {
~0u, // no _has_bits_
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::TensorShapeProto_Dim, _internal_metadata_),
~0u, // no _extensions_
~0u, // no _oneof_case_
~0u, // no _weak_field_map_
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::TensorShapeProto_Dim, size_),
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::TensorShapeProto_Dim, name_),
~0u, // no _has_bits_
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::TensorShapeProto, _internal_metadata_),
~0u, // no _extensions_
~0u, // no _oneof_case_
~0u, // no _weak_field_map_
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::TensorShapeProto, dim_),
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::TensorShapeProto, unknown_rank_),
};
static const ::google::protobuf::internal::MigrationSchema schemas[] GOOGLE_PROTOBUF_ATTRIBUTE_SECTION_VARIABLE(protodesc_cold) = {
{ 0, -1, sizeof(::opencv_tensorflow::TensorShapeProto_Dim)},
{ 7, -1, sizeof(::opencv_tensorflow::TensorShapeProto)},
};
static ::google::protobuf::Message const * const file_default_instances[] = {
reinterpret_cast<const ::google::protobuf::Message*>(&::opencv_tensorflow::_TensorShapeProto_Dim_default_instance_),
reinterpret_cast<const ::google::protobuf::Message*>(&::opencv_tensorflow::_TensorShapeProto_default_instance_),
};
void protobuf_AssignDescriptors() {
AddDescriptors();
::google::protobuf::MessageFactory* factory = NULL;
AssignDescriptors(
"tensor_shape.proto", schemas, file_default_instances, TableStruct::offsets, factory,
file_level_metadata, NULL, NULL);
}
void protobuf_AssignDescriptorsOnce() {
static GOOGLE_PROTOBUF_DECLARE_ONCE(once);
::google::protobuf::GoogleOnceInit(&once, &protobuf_AssignDescriptors);
}
void protobuf_RegisterTypes(const ::std::string&) GOOGLE_PROTOBUF_ATTRIBUTE_COLD;
void protobuf_RegisterTypes(const ::std::string&) {
protobuf_AssignDescriptorsOnce();
::google::protobuf::internal::RegisterAllTypes(file_level_metadata, 2);
}
void AddDescriptorsImpl() {
InitDefaults();
static const char descriptor[] GOOGLE_PROTOBUF_ATTRIBUTE_SECTION_VARIABLE(protodesc_cold) = {
"\n\022tensor_shape.proto\022\021opencv_tensorflow\""
"\201\001\n\020TensorShapeProto\0224\n\003dim\030\002 \003(\0132\'.open"
"cv_tensorflow.TensorShapeProto.Dim\022\024\n\014un"
"known_rank\030\003 \001(\010\032!\n\003Dim\022\014\n\004size\030\001 \001(\003\022\014\n"
"\004name\030\002 \001(\tB2\n\030org.tensorflow.frameworkB"
"\021TensorShapeProtosP\001\370\001\001b\006proto3"
};
::google::protobuf::DescriptorPool::InternalAddGeneratedFile(
descriptor, 231);
::google::protobuf::MessageFactory::InternalRegisterGeneratedFile(
"tensor_shape.proto", &protobuf_RegisterTypes);
}
void AddDescriptors() {
static GOOGLE_PROTOBUF_DECLARE_ONCE(once);
::google::protobuf::GoogleOnceInit(&once, &AddDescriptorsImpl);
}
// Force AddDescriptors() to be called at dynamic initialization time.
struct StaticDescriptorInitializer {
StaticDescriptorInitializer() {
AddDescriptors();
}
} static_descriptor_initializer;
} // namespace protobuf_tensor_5fshape_2eproto
namespace opencv_tensorflow {
// ===================================================================
void TensorShapeProto_Dim::InitAsDefaultInstance() {
}
#if !defined(_MSC_VER) || _MSC_VER >= 1900
const int TensorShapeProto_Dim::kSizeFieldNumber;
const int TensorShapeProto_Dim::kNameFieldNumber;
#endif // !defined(_MSC_VER) || _MSC_VER >= 1900
TensorShapeProto_Dim::TensorShapeProto_Dim()
: ::google::protobuf::Message(), _internal_metadata_(NULL) {
if (GOOGLE_PREDICT_TRUE(this != internal_default_instance())) {
::protobuf_tensor_5fshape_2eproto::InitDefaultsTensorShapeProto_Dim();
}
SharedCtor();
// @@protoc_insertion_point(constructor:opencv_tensorflow.TensorShapeProto.Dim)
}
TensorShapeProto_Dim::TensorShapeProto_Dim(::google::protobuf::Arena* arena)
: ::google::protobuf::Message(),
_internal_metadata_(arena) {
::protobuf_tensor_5fshape_2eproto::InitDefaultsTensorShapeProto_Dim();
SharedCtor();
RegisterArenaDtor(arena);
// @@protoc_insertion_point(arena_constructor:opencv_tensorflow.TensorShapeProto.Dim)
}
TensorShapeProto_Dim::TensorShapeProto_Dim(const TensorShapeProto_Dim& from)
: ::google::protobuf::Message(),
_internal_metadata_(NULL),
_cached_size_(0) {
_internal_metadata_.MergeFrom(from._internal_metadata_);
name_.UnsafeSetDefault(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
if (from.name().size() > 0) {
name_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), from.name(),
GetArenaNoVirtual());
}
size_ = from.size_;
// @@protoc_insertion_point(copy_constructor:opencv_tensorflow.TensorShapeProto.Dim)
}
void TensorShapeProto_Dim::SharedCtor() {
name_.UnsafeSetDefault(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
size_ = GOOGLE_LONGLONG(0);
_cached_size_ = 0;
}
TensorShapeProto_Dim::~TensorShapeProto_Dim() {
// @@protoc_insertion_point(destructor:opencv_tensorflow.TensorShapeProto.Dim)
SharedDtor();
}
void TensorShapeProto_Dim::SharedDtor() {
GOOGLE_DCHECK(GetArenaNoVirtual() == NULL);
name_.DestroyNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
}
void TensorShapeProto_Dim::ArenaDtor(void* object) {
TensorShapeProto_Dim* _this = reinterpret_cast< TensorShapeProto_Dim* >(object);
(void)_this;
}
void TensorShapeProto_Dim::RegisterArenaDtor(::google::protobuf::Arena* arena) {
}
void TensorShapeProto_Dim::SetCachedSize(int size) const {
GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN();
_cached_size_ = size;
GOOGLE_SAFE_CONCURRENT_WRITES_END();
}
const ::google::protobuf::Descriptor* TensorShapeProto_Dim::descriptor() {
::protobuf_tensor_5fshape_2eproto::protobuf_AssignDescriptorsOnce();
return ::protobuf_tensor_5fshape_2eproto::file_level_metadata[kIndexInFileMessages].descriptor;
}
const TensorShapeProto_Dim& TensorShapeProto_Dim::default_instance() {
::protobuf_tensor_5fshape_2eproto::InitDefaultsTensorShapeProto_Dim();
return *internal_default_instance();
}
TensorShapeProto_Dim* TensorShapeProto_Dim::New(::google::protobuf::Arena* arena) const {
return ::google::protobuf::Arena::CreateMessage<TensorShapeProto_Dim>(arena);
}
void TensorShapeProto_Dim::Clear() {
// @@protoc_insertion_point(message_clear_start:opencv_tensorflow.TensorShapeProto.Dim)
::google::protobuf::uint32 cached_has_bits = 0;
// Prevent compiler warnings about cached_has_bits being unused
(void) cached_has_bits;
name_.ClearToEmpty(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
size_ = GOOGLE_LONGLONG(0);
_internal_metadata_.Clear();
}
bool TensorShapeProto_Dim::MergePartialFromCodedStream(
::google::protobuf::io::CodedInputStream* input) {
#define DO_(EXPRESSION) if (!GOOGLE_PREDICT_TRUE(EXPRESSION)) goto failure
::google::protobuf::uint32 tag;
// @@protoc_insertion_point(parse_start:opencv_tensorflow.TensorShapeProto.Dim)
for (;;) {
::std::pair< ::google::protobuf::uint32, bool> p = input->ReadTagWithCutoffNoLastTag(127u);
tag = p.first;
if (!p.second) goto handle_unusual;
switch (::google::protobuf::internal::WireFormatLite::GetTagFieldNumber(tag)) {
// int64 size = 1;
case 1: {
if (static_cast< ::google::protobuf::uint8>(tag) ==
static_cast< ::google::protobuf::uint8>(8u /* 8 & 0xFF */)) {
DO_((::google::protobuf::internal::WireFormatLite::ReadPrimitive<
::google::protobuf::int64, ::google::protobuf::internal::WireFormatLite::TYPE_INT64>(
input, &size_)));
} else {
goto handle_unusual;
}
break;
}
// string name = 2;
case 2: {
if (static_cast< ::google::protobuf::uint8>(tag) ==
static_cast< ::google::protobuf::uint8>(18u /* 18 & 0xFF */)) {
DO_(::google::protobuf::internal::WireFormatLite::ReadString(
input, this->mutable_name()));
DO_(::google::protobuf::internal::WireFormatLite::VerifyUtf8String(
this->name().data(), static_cast<int>(this->name().length()),
::google::protobuf::internal::WireFormatLite::PARSE,
"opencv_tensorflow.TensorShapeProto.Dim.name"));
} else {
goto handle_unusual;
}
break;
}
default: {
handle_unusual:
if (tag == 0) {
goto success;
}
DO_(::google::protobuf::internal::WireFormat::SkipField(
input, tag, _internal_metadata_.mutable_unknown_fields()));
break;
}
}
}
success:
// @@protoc_insertion_point(parse_success:opencv_tensorflow.TensorShapeProto.Dim)
return true;
failure:
// @@protoc_insertion_point(parse_failure:opencv_tensorflow.TensorShapeProto.Dim)
return false;
#undef DO_
}
void TensorShapeProto_Dim::SerializeWithCachedSizes(
::google::protobuf::io::CodedOutputStream* output) const {
// @@protoc_insertion_point(serialize_start:opencv_tensorflow.TensorShapeProto.Dim)
::google::protobuf::uint32 cached_has_bits = 0;
(void) cached_has_bits;
// int64 size = 1;
if (this->size() != 0) {
::google::protobuf::internal::WireFormatLite::WriteInt64(1, this->size(), output);
}
// string name = 2;
if (this->name().size() > 0) {
::google::protobuf::internal::WireFormatLite::VerifyUtf8String(
this->name().data(), static_cast<int>(this->name().length()),
::google::protobuf::internal::WireFormatLite::SERIALIZE,
"opencv_tensorflow.TensorShapeProto.Dim.name");
::google::protobuf::internal::WireFormatLite::WriteStringMaybeAliased(
2, this->name(), output);
}
if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) {
::google::protobuf::internal::WireFormat::SerializeUnknownFields(
(::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()), output);
}
// @@protoc_insertion_point(serialize_end:opencv_tensorflow.TensorShapeProto.Dim)
}
::google::protobuf::uint8* TensorShapeProto_Dim::InternalSerializeWithCachedSizesToArray(
bool deterministic, ::google::protobuf::uint8* target) const {
(void)deterministic; // Unused
// @@protoc_insertion_point(serialize_to_array_start:opencv_tensorflow.TensorShapeProto.Dim)
::google::protobuf::uint32 cached_has_bits = 0;
(void) cached_has_bits;
// int64 size = 1;
if (this->size() != 0) {
target = ::google::protobuf::internal::WireFormatLite::WriteInt64ToArray(1, this->size(), target);
}
// string name = 2;
if (this->name().size() > 0) {
::google::protobuf::internal::WireFormatLite::VerifyUtf8String(
this->name().data(), static_cast<int>(this->name().length()),
::google::protobuf::internal::WireFormatLite::SERIALIZE,
"opencv_tensorflow.TensorShapeProto.Dim.name");
target =
::google::protobuf::internal::WireFormatLite::WriteStringToArray(
2, this->name(), target);
}
if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) {
target = ::google::protobuf::internal::WireFormat::SerializeUnknownFieldsToArray(
(::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()), target);
}
// @@protoc_insertion_point(serialize_to_array_end:opencv_tensorflow.TensorShapeProto.Dim)
return target;
}
size_t TensorShapeProto_Dim::ByteSizeLong() const {
// @@protoc_insertion_point(message_byte_size_start:opencv_tensorflow.TensorShapeProto.Dim)
size_t total_size = 0;
if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) {
total_size +=
::google::protobuf::internal::WireFormat::ComputeUnknownFieldsSize(
(::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()));
}
// string name = 2;
if (this->name().size() > 0) {
total_size += 1 +
::google::protobuf::internal::WireFormatLite::StringSize(
this->name());
}
// int64 size = 1;
if (this->size() != 0) {
total_size += 1 +
::google::protobuf::internal::WireFormatLite::Int64Size(
this->size());
}
int cached_size = ::google::protobuf::internal::ToCachedSize(total_size);
GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN();
_cached_size_ = cached_size;
GOOGLE_SAFE_CONCURRENT_WRITES_END();
return total_size;
}
void TensorShapeProto_Dim::MergeFrom(const ::google::protobuf::Message& from) {
// @@protoc_insertion_point(generalized_merge_from_start:opencv_tensorflow.TensorShapeProto.Dim)
GOOGLE_DCHECK_NE(&from, this);
const TensorShapeProto_Dim* source =
::google::protobuf::internal::DynamicCastToGenerated<const TensorShapeProto_Dim>(
&from);
if (source == NULL) {
// @@protoc_insertion_point(generalized_merge_from_cast_fail:opencv_tensorflow.TensorShapeProto.Dim)
::google::protobuf::internal::ReflectionOps::Merge(from, this);
} else {
// @@protoc_insertion_point(generalized_merge_from_cast_success:opencv_tensorflow.TensorShapeProto.Dim)
MergeFrom(*source);
}
}
void TensorShapeProto_Dim::MergeFrom(const TensorShapeProto_Dim& from) {
// @@protoc_insertion_point(class_specific_merge_from_start:opencv_tensorflow.TensorShapeProto.Dim)
GOOGLE_DCHECK_NE(&from, this);
_internal_metadata_.MergeFrom(from._internal_metadata_);
::google::protobuf::uint32 cached_has_bits = 0;
(void) cached_has_bits;
if (from.name().size() > 0) {
set_name(from.name());
}
if (from.size() != 0) {
set_size(from.size());
}
}
void TensorShapeProto_Dim::CopyFrom(const ::google::protobuf::Message& from) {
// @@protoc_insertion_point(generalized_copy_from_start:opencv_tensorflow.TensorShapeProto.Dim)
if (&from == this) return;
Clear();
MergeFrom(from);
}
void TensorShapeProto_Dim::CopyFrom(const TensorShapeProto_Dim& from) {
// @@protoc_insertion_point(class_specific_copy_from_start:opencv_tensorflow.TensorShapeProto.Dim)
if (&from == this) return;
Clear();
MergeFrom(from);
}
bool TensorShapeProto_Dim::IsInitialized() const {
return true;
}
void TensorShapeProto_Dim::Swap(TensorShapeProto_Dim* other) {
if (other == this) return;
if (GetArenaNoVirtual() == other->GetArenaNoVirtual()) {
InternalSwap(other);
} else {
TensorShapeProto_Dim* temp = New(GetArenaNoVirtual());
temp->MergeFrom(*other);
other->CopyFrom(*this);
InternalSwap(temp);
if (GetArenaNoVirtual() == NULL) {
delete temp;
}
}
}
void TensorShapeProto_Dim::UnsafeArenaSwap(TensorShapeProto_Dim* other) {
if (other == this) return;
GOOGLE_DCHECK(GetArenaNoVirtual() == other->GetArenaNoVirtual());
InternalSwap(other);
}
void TensorShapeProto_Dim::InternalSwap(TensorShapeProto_Dim* other) {
using std::swap;
name_.Swap(&other->name_);
swap(size_, other->size_);
_internal_metadata_.Swap(&other->_internal_metadata_);
swap(_cached_size_, other->_cached_size_);
}
::google::protobuf::Metadata TensorShapeProto_Dim::GetMetadata() const {
protobuf_tensor_5fshape_2eproto::protobuf_AssignDescriptorsOnce();
return ::protobuf_tensor_5fshape_2eproto::file_level_metadata[kIndexInFileMessages];
}
// ===================================================================
void TensorShapeProto::InitAsDefaultInstance() {
}
#if !defined(_MSC_VER) || _MSC_VER >= 1900
const int TensorShapeProto::kDimFieldNumber;
const int TensorShapeProto::kUnknownRankFieldNumber;
#endif // !defined(_MSC_VER) || _MSC_VER >= 1900
TensorShapeProto::TensorShapeProto()
: ::google::protobuf::Message(), _internal_metadata_(NULL) {
if (GOOGLE_PREDICT_TRUE(this != internal_default_instance())) {
::protobuf_tensor_5fshape_2eproto::InitDefaultsTensorShapeProto();
}
SharedCtor();
// @@protoc_insertion_point(constructor:opencv_tensorflow.TensorShapeProto)
}
TensorShapeProto::TensorShapeProto(::google::protobuf::Arena* arena)
: ::google::protobuf::Message(),
_internal_metadata_(arena),
dim_(arena) {
::protobuf_tensor_5fshape_2eproto::InitDefaultsTensorShapeProto();
SharedCtor();
RegisterArenaDtor(arena);
// @@protoc_insertion_point(arena_constructor:opencv_tensorflow.TensorShapeProto)
}
TensorShapeProto::TensorShapeProto(const TensorShapeProto& from)
: ::google::protobuf::Message(),
_internal_metadata_(NULL),
dim_(from.dim_),
_cached_size_(0) {
_internal_metadata_.MergeFrom(from._internal_metadata_);
unknown_rank_ = from.unknown_rank_;
// @@protoc_insertion_point(copy_constructor:opencv_tensorflow.TensorShapeProto)
}
void TensorShapeProto::SharedCtor() {
unknown_rank_ = false;
_cached_size_ = 0;
}
TensorShapeProto::~TensorShapeProto() {
// @@protoc_insertion_point(destructor:opencv_tensorflow.TensorShapeProto)
SharedDtor();
}
void TensorShapeProto::SharedDtor() {
GOOGLE_DCHECK(GetArenaNoVirtual() == NULL);
}
void TensorShapeProto::ArenaDtor(void* object) {
TensorShapeProto* _this = reinterpret_cast< TensorShapeProto* >(object);
(void)_this;
}
void TensorShapeProto::RegisterArenaDtor(::google::protobuf::Arena* arena) {
}
void TensorShapeProto::SetCachedSize(int size) const {
GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN();
_cached_size_ = size;
GOOGLE_SAFE_CONCURRENT_WRITES_END();
}
const ::google::protobuf::Descriptor* TensorShapeProto::descriptor() {
::protobuf_tensor_5fshape_2eproto::protobuf_AssignDescriptorsOnce();
return ::protobuf_tensor_5fshape_2eproto::file_level_metadata[kIndexInFileMessages].descriptor;
}
const TensorShapeProto& TensorShapeProto::default_instance() {
::protobuf_tensor_5fshape_2eproto::InitDefaultsTensorShapeProto();
return *internal_default_instance();
}
TensorShapeProto* TensorShapeProto::New(::google::protobuf::Arena* arena) const {
return ::google::protobuf::Arena::CreateMessage<TensorShapeProto>(arena);
}
void TensorShapeProto::Clear() {
// @@protoc_insertion_point(message_clear_start:opencv_tensorflow.TensorShapeProto)
::google::protobuf::uint32 cached_has_bits = 0;
// Prevent compiler warnings about cached_has_bits being unused
(void) cached_has_bits;
dim_.Clear();
unknown_rank_ = false;
_internal_metadata_.Clear();
}
bool TensorShapeProto::MergePartialFromCodedStream(
::google::protobuf::io::CodedInputStream* input) {
#define DO_(EXPRESSION) if (!GOOGLE_PREDICT_TRUE(EXPRESSION)) goto failure
::google::protobuf::uint32 tag;
// @@protoc_insertion_point(parse_start:opencv_tensorflow.TensorShapeProto)
for (;;) {
::std::pair< ::google::protobuf::uint32, bool> p = input->ReadTagWithCutoffNoLastTag(127u);
tag = p.first;
if (!p.second) goto handle_unusual;
switch (::google::protobuf::internal::WireFormatLite::GetTagFieldNumber(tag)) {
// repeated .opencv_tensorflow.TensorShapeProto.Dim dim = 2;
case 2: {
if (static_cast< ::google::protobuf::uint8>(tag) ==
static_cast< ::google::protobuf::uint8>(18u /* 18 & 0xFF */)) {
DO_(::google::protobuf::internal::WireFormatLite::ReadMessage(input, add_dim()));
} else {
goto handle_unusual;
}
break;
}
// bool unknown_rank = 3;
case 3: {
if (static_cast< ::google::protobuf::uint8>(tag) ==
static_cast< ::google::protobuf::uint8>(24u /* 24 & 0xFF */)) {
DO_((::google::protobuf::internal::WireFormatLite::ReadPrimitive<
bool, ::google::protobuf::internal::WireFormatLite::TYPE_BOOL>(
input, &unknown_rank_)));
} else {
goto handle_unusual;
}
break;
}
default: {
handle_unusual:
if (tag == 0) {
goto success;
}
DO_(::google::protobuf::internal::WireFormat::SkipField(
input, tag, _internal_metadata_.mutable_unknown_fields()));
break;
}
}
}
success:
// @@protoc_insertion_point(parse_success:opencv_tensorflow.TensorShapeProto)
return true;
failure:
// @@protoc_insertion_point(parse_failure:opencv_tensorflow.TensorShapeProto)
return false;
#undef DO_
}
void TensorShapeProto::SerializeWithCachedSizes(
::google::protobuf::io::CodedOutputStream* output) const {
// @@protoc_insertion_point(serialize_start:opencv_tensorflow.TensorShapeProto)
::google::protobuf::uint32 cached_has_bits = 0;
(void) cached_has_bits;
// repeated .opencv_tensorflow.TensorShapeProto.Dim dim = 2;
for (unsigned int i = 0,
n = static_cast<unsigned int>(this->dim_size()); i < n; i++) {
::google::protobuf::internal::WireFormatLite::WriteMessageMaybeToArray(
2, this->dim(static_cast<int>(i)), output);
}
// bool unknown_rank = 3;
if (this->unknown_rank() != 0) {
::google::protobuf::internal::WireFormatLite::WriteBool(3, this->unknown_rank(), output);
}
if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) {
::google::protobuf::internal::WireFormat::SerializeUnknownFields(
(::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()), output);
}
// @@protoc_insertion_point(serialize_end:opencv_tensorflow.TensorShapeProto)
}
::google::protobuf::uint8* TensorShapeProto::InternalSerializeWithCachedSizesToArray(
bool deterministic, ::google::protobuf::uint8* target) const {
(void)deterministic; // Unused
// @@protoc_insertion_point(serialize_to_array_start:opencv_tensorflow.TensorShapeProto)
::google::protobuf::uint32 cached_has_bits = 0;
(void) cached_has_bits;
// repeated .opencv_tensorflow.TensorShapeProto.Dim dim = 2;
for (unsigned int i = 0,
n = static_cast<unsigned int>(this->dim_size()); i < n; i++) {
target = ::google::protobuf::internal::WireFormatLite::
InternalWriteMessageToArray(
2, this->dim(static_cast<int>(i)), deterministic, target);
}
// bool unknown_rank = 3;
if (this->unknown_rank() != 0) {
target = ::google::protobuf::internal::WireFormatLite::WriteBoolToArray(3, this->unknown_rank(), target);
}
if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) {
target = ::google::protobuf::internal::WireFormat::SerializeUnknownFieldsToArray(
(::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()), target);
}
// @@protoc_insertion_point(serialize_to_array_end:opencv_tensorflow.TensorShapeProto)
return target;
}
size_t TensorShapeProto::ByteSizeLong() const {
// @@protoc_insertion_point(message_byte_size_start:opencv_tensorflow.TensorShapeProto)
size_t total_size = 0;
if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) {
total_size +=
::google::protobuf::internal::WireFormat::ComputeUnknownFieldsSize(
(::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()));
}
// repeated .opencv_tensorflow.TensorShapeProto.Dim dim = 2;
{
unsigned int count = static_cast<unsigned int>(this->dim_size());
total_size += 1UL * count;
for (unsigned int i = 0; i < count; i++) {
total_size +=
::google::protobuf::internal::WireFormatLite::MessageSize(
this->dim(static_cast<int>(i)));
}
}
// bool unknown_rank = 3;
if (this->unknown_rank() != 0) {
total_size += 1 + 1;
}
int cached_size = ::google::protobuf::internal::ToCachedSize(total_size);
GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN();
_cached_size_ = cached_size;
GOOGLE_SAFE_CONCURRENT_WRITES_END();
return total_size;
}
void TensorShapeProto::MergeFrom(const ::google::protobuf::Message& from) {
// @@protoc_insertion_point(generalized_merge_from_start:opencv_tensorflow.TensorShapeProto)
GOOGLE_DCHECK_NE(&from, this);
const TensorShapeProto* source =
::google::protobuf::internal::DynamicCastToGenerated<const TensorShapeProto>(
&from);
if (source == NULL) {
// @@protoc_insertion_point(generalized_merge_from_cast_fail:opencv_tensorflow.TensorShapeProto)
::google::protobuf::internal::ReflectionOps::Merge(from, this);
} else {
// @@protoc_insertion_point(generalized_merge_from_cast_success:opencv_tensorflow.TensorShapeProto)
MergeFrom(*source);
}
}
void TensorShapeProto::MergeFrom(const TensorShapeProto& from) {
// @@protoc_insertion_point(class_specific_merge_from_start:opencv_tensorflow.TensorShapeProto)
GOOGLE_DCHECK_NE(&from, this);
_internal_metadata_.MergeFrom(from._internal_metadata_);
::google::protobuf::uint32 cached_has_bits = 0;
(void) cached_has_bits;
dim_.MergeFrom(from.dim_);
if (from.unknown_rank() != 0) {
set_unknown_rank(from.unknown_rank());
}
}
void TensorShapeProto::CopyFrom(const ::google::protobuf::Message& from) {
// @@protoc_insertion_point(generalized_copy_from_start:opencv_tensorflow.TensorShapeProto)
if (&from == this) return;
Clear();
MergeFrom(from);
}
void TensorShapeProto::CopyFrom(const TensorShapeProto& from) {
// @@protoc_insertion_point(class_specific_copy_from_start:opencv_tensorflow.TensorShapeProto)
if (&from == this) return;
Clear();
MergeFrom(from);
}
bool TensorShapeProto::IsInitialized() const {
return true;
}
void TensorShapeProto::Swap(TensorShapeProto* other) {
if (other == this) return;
if (GetArenaNoVirtual() == other->GetArenaNoVirtual()) {
InternalSwap(other);
} else {
TensorShapeProto* temp = New(GetArenaNoVirtual());
temp->MergeFrom(*other);
other->CopyFrom(*this);
InternalSwap(temp);
if (GetArenaNoVirtual() == NULL) {
delete temp;
}
}
}
void TensorShapeProto::UnsafeArenaSwap(TensorShapeProto* other) {
if (other == this) return;
GOOGLE_DCHECK(GetArenaNoVirtual() == other->GetArenaNoVirtual());
InternalSwap(other);
}
void TensorShapeProto::InternalSwap(TensorShapeProto* other) {
using std::swap;
dim_.InternalSwap(&other->dim_);
swap(unknown_rank_, other->unknown_rank_);
_internal_metadata_.Swap(&other->_internal_metadata_);
swap(_cached_size_, other->_cached_size_);
}
::google::protobuf::Metadata TensorShapeProto::GetMetadata() const {
protobuf_tensor_5fshape_2eproto::protobuf_AssignDescriptorsOnce();
return ::protobuf_tensor_5fshape_2eproto::file_level_metadata[kIndexInFileMessages];
}
// @@protoc_insertion_point(namespace_scope)
} // namespace opencv_tensorflow
// @@protoc_insertion_point(global_scope)


@@ -0,0 +1,491 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: tensor_shape.proto
#ifndef PROTOBUF_tensor_5fshape_2eproto__INCLUDED
#define PROTOBUF_tensor_5fshape_2eproto__INCLUDED
#include <string>
#include <google/protobuf/stubs/common.h>
#if GOOGLE_PROTOBUF_VERSION < 3005000
#error This file was generated by a newer version of protoc which is
#error incompatible with your Protocol Buffer headers. Please update
#error your headers.
#endif
#if 3005001 < GOOGLE_PROTOBUF_MIN_PROTOC_VERSION
#error This file was generated by an older version of protoc which is
#error incompatible with your Protocol Buffer headers. Please
#error regenerate this file with a newer version of protoc.
#endif
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/arena.h>
#include <google/protobuf/arenastring.h>
#include <google/protobuf/generated_message_table_driven.h>
#include <google/protobuf/generated_message_util.h>
#include <google/protobuf/metadata.h>
#include <google/protobuf/message.h>
#include <google/protobuf/repeated_field.h> // IWYU pragma: export
#include <google/protobuf/extension_set.h> // IWYU pragma: export
#include <google/protobuf/unknown_field_set.h>
// @@protoc_insertion_point(includes)
namespace protobuf_tensor_5fshape_2eproto {
// Internal implementation detail -- do not use these members.
struct TableStruct {
static const ::google::protobuf::internal::ParseTableField entries[];
static const ::google::protobuf::internal::AuxillaryParseTableField aux[];
static const ::google::protobuf::internal::ParseTable schema[2];
static const ::google::protobuf::internal::FieldMetadata field_metadata[];
static const ::google::protobuf::internal::SerializationTable serialization_table[];
static const ::google::protobuf::uint32 offsets[];
};
void AddDescriptors();
void InitDefaultsTensorShapeProto_DimImpl();
void InitDefaultsTensorShapeProto_Dim();
void InitDefaultsTensorShapeProtoImpl();
void InitDefaultsTensorShapeProto();
inline void InitDefaults() {
InitDefaultsTensorShapeProto_Dim();
InitDefaultsTensorShapeProto();
}
} // namespace protobuf_tensor_5fshape_2eproto
namespace opencv_tensorflow {
class TensorShapeProto;
class TensorShapeProtoDefaultTypeInternal;
extern TensorShapeProtoDefaultTypeInternal _TensorShapeProto_default_instance_;
class TensorShapeProto_Dim;
class TensorShapeProto_DimDefaultTypeInternal;
extern TensorShapeProto_DimDefaultTypeInternal _TensorShapeProto_Dim_default_instance_;
} // namespace opencv_tensorflow
namespace opencv_tensorflow {
// ===================================================================
class TensorShapeProto_Dim : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:opencv_tensorflow.TensorShapeProto.Dim) */ {
public:
TensorShapeProto_Dim();
virtual ~TensorShapeProto_Dim();
TensorShapeProto_Dim(const TensorShapeProto_Dim& from);
inline TensorShapeProto_Dim& operator=(const TensorShapeProto_Dim& from) {
CopyFrom(from);
return *this;
}
#if LANG_CXX11
TensorShapeProto_Dim(TensorShapeProto_Dim&& from) noexcept
: TensorShapeProto_Dim() {
*this = ::std::move(from);
}
inline TensorShapeProto_Dim& operator=(TensorShapeProto_Dim&& from) noexcept {
if (GetArenaNoVirtual() == from.GetArenaNoVirtual()) {
if (this != &from) InternalSwap(&from);
} else {
CopyFrom(from);
}
return *this;
}
#endif
inline ::google::protobuf::Arena* GetArena() const PROTOBUF_FINAL {
return GetArenaNoVirtual();
}
inline void* GetMaybeArenaPointer() const PROTOBUF_FINAL {
return MaybeArenaPtr();
}
static const ::google::protobuf::Descriptor* descriptor();
static const TensorShapeProto_Dim& default_instance();
static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY
static inline const TensorShapeProto_Dim* internal_default_instance() {
return reinterpret_cast<const TensorShapeProto_Dim*>(
&_TensorShapeProto_Dim_default_instance_);
}
static PROTOBUF_CONSTEXPR int const kIndexInFileMessages =
0;
void UnsafeArenaSwap(TensorShapeProto_Dim* other);
void Swap(TensorShapeProto_Dim* other);
friend void swap(TensorShapeProto_Dim& a, TensorShapeProto_Dim& b) {
a.Swap(&b);
}
// implements Message ----------------------------------------------
inline TensorShapeProto_Dim* New() const PROTOBUF_FINAL { return New(NULL); }
TensorShapeProto_Dim* New(::google::protobuf::Arena* arena) const PROTOBUF_FINAL;
void CopyFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void MergeFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void CopyFrom(const TensorShapeProto_Dim& from);
void MergeFrom(const TensorShapeProto_Dim& from);
void Clear() PROTOBUF_FINAL;
bool IsInitialized() const PROTOBUF_FINAL;
size_t ByteSizeLong() const PROTOBUF_FINAL;
bool MergePartialFromCodedStream(
::google::protobuf::io::CodedInputStream* input) PROTOBUF_FINAL;
void SerializeWithCachedSizes(
::google::protobuf::io::CodedOutputStream* output) const PROTOBUF_FINAL;
::google::protobuf::uint8* InternalSerializeWithCachedSizesToArray(
bool deterministic, ::google::protobuf::uint8* target) const PROTOBUF_FINAL;
int GetCachedSize() const PROTOBUF_FINAL { return _cached_size_; }
private:
void SharedCtor();
void SharedDtor();
void SetCachedSize(int size) const PROTOBUF_FINAL;
void InternalSwap(TensorShapeProto_Dim* other);
protected:
explicit TensorShapeProto_Dim(::google::protobuf::Arena* arena);
private:
static void ArenaDtor(void* object);
inline void RegisterArenaDtor(::google::protobuf::Arena* arena);
private:
inline ::google::protobuf::Arena* GetArenaNoVirtual() const {
return _internal_metadata_.arena();
}
inline void* MaybeArenaPtr() const {
return _internal_metadata_.raw_arena_ptr();
}
public:
::google::protobuf::Metadata GetMetadata() const PROTOBUF_FINAL;
// nested types ----------------------------------------------------
// accessors -------------------------------------------------------
// string name = 2;
void clear_name();
static const int kNameFieldNumber = 2;
const ::std::string& name() const;
void set_name(const ::std::string& value);
#if LANG_CXX11
void set_name(::std::string&& value);
#endif
void set_name(const char* value);
void set_name(const char* value, size_t size);
::std::string* mutable_name();
::std::string* release_name();
void set_allocated_name(::std::string* name);
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
::std::string* unsafe_arena_release_name();
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
void unsafe_arena_set_allocated_name(
::std::string* name);
// int64 size = 1;
void clear_size();
static const int kSizeFieldNumber = 1;
::google::protobuf::int64 size() const;
void set_size(::google::protobuf::int64 value);
// @@protoc_insertion_point(class_scope:opencv_tensorflow.TensorShapeProto.Dim)
private:
::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_;
template <typename T> friend class ::google::protobuf::Arena::InternalHelper;
typedef void InternalArenaConstructable_;
typedef void DestructorSkippable_;
::google::protobuf::internal::ArenaStringPtr name_;
::google::protobuf::int64 size_;
mutable int _cached_size_;
friend struct ::protobuf_tensor_5fshape_2eproto::TableStruct;
friend void ::protobuf_tensor_5fshape_2eproto::InitDefaultsTensorShapeProto_DimImpl();
};
// -------------------------------------------------------------------
class TensorShapeProto : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:opencv_tensorflow.TensorShapeProto) */ {
public:
TensorShapeProto();
virtual ~TensorShapeProto();
TensorShapeProto(const TensorShapeProto& from);
inline TensorShapeProto& operator=(const TensorShapeProto& from) {
CopyFrom(from);
return *this;
}
#if LANG_CXX11
TensorShapeProto(TensorShapeProto&& from) noexcept
: TensorShapeProto() {
*this = ::std::move(from);
}
inline TensorShapeProto& operator=(TensorShapeProto&& from) noexcept {
if (GetArenaNoVirtual() == from.GetArenaNoVirtual()) {
if (this != &from) InternalSwap(&from);
} else {
CopyFrom(from);
}
return *this;
}
#endif
inline ::google::protobuf::Arena* GetArena() const PROTOBUF_FINAL {
return GetArenaNoVirtual();
}
inline void* GetMaybeArenaPointer() const PROTOBUF_FINAL {
return MaybeArenaPtr();
}
static const ::google::protobuf::Descriptor* descriptor();
static const TensorShapeProto& default_instance();
static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY
static inline const TensorShapeProto* internal_default_instance() {
return reinterpret_cast<const TensorShapeProto*>(
&_TensorShapeProto_default_instance_);
}
static PROTOBUF_CONSTEXPR int const kIndexInFileMessages =
1;
void UnsafeArenaSwap(TensorShapeProto* other);
void Swap(TensorShapeProto* other);
friend void swap(TensorShapeProto& a, TensorShapeProto& b) {
a.Swap(&b);
}
// implements Message ----------------------------------------------
inline TensorShapeProto* New() const PROTOBUF_FINAL { return New(NULL); }
TensorShapeProto* New(::google::protobuf::Arena* arena) const PROTOBUF_FINAL;
void CopyFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void MergeFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void CopyFrom(const TensorShapeProto& from);
void MergeFrom(const TensorShapeProto& from);
void Clear() PROTOBUF_FINAL;
bool IsInitialized() const PROTOBUF_FINAL;
size_t ByteSizeLong() const PROTOBUF_FINAL;
bool MergePartialFromCodedStream(
::google::protobuf::io::CodedInputStream* input) PROTOBUF_FINAL;
void SerializeWithCachedSizes(
::google::protobuf::io::CodedOutputStream* output) const PROTOBUF_FINAL;
::google::protobuf::uint8* InternalSerializeWithCachedSizesToArray(
bool deterministic, ::google::protobuf::uint8* target) const PROTOBUF_FINAL;
int GetCachedSize() const PROTOBUF_FINAL { return _cached_size_; }
private:
void SharedCtor();
void SharedDtor();
void SetCachedSize(int size) const PROTOBUF_FINAL;
void InternalSwap(TensorShapeProto* other);
protected:
explicit TensorShapeProto(::google::protobuf::Arena* arena);
private:
static void ArenaDtor(void* object);
inline void RegisterArenaDtor(::google::protobuf::Arena* arena);
private:
inline ::google::protobuf::Arena* GetArenaNoVirtual() const {
return _internal_metadata_.arena();
}
inline void* MaybeArenaPtr() const {
return _internal_metadata_.raw_arena_ptr();
}
public:
::google::protobuf::Metadata GetMetadata() const PROTOBUF_FINAL;
// nested types ----------------------------------------------------
typedef TensorShapeProto_Dim Dim;
// accessors -------------------------------------------------------
// repeated .opencv_tensorflow.TensorShapeProto.Dim dim = 2;
int dim_size() const;
void clear_dim();
static const int kDimFieldNumber = 2;
const ::opencv_tensorflow::TensorShapeProto_Dim& dim(int index) const;
::opencv_tensorflow::TensorShapeProto_Dim* mutable_dim(int index);
::opencv_tensorflow::TensorShapeProto_Dim* add_dim();
::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::TensorShapeProto_Dim >*
mutable_dim();
const ::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::TensorShapeProto_Dim >&
dim() const;
// bool unknown_rank = 3;
void clear_unknown_rank();
static const int kUnknownRankFieldNumber = 3;
bool unknown_rank() const;
void set_unknown_rank(bool value);
// @@protoc_insertion_point(class_scope:opencv_tensorflow.TensorShapeProto)
private:
::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_;
template <typename T> friend class ::google::protobuf::Arena::InternalHelper;
typedef void InternalArenaConstructable_;
typedef void DestructorSkippable_;
::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::TensorShapeProto_Dim > dim_;
bool unknown_rank_;
mutable int _cached_size_;
friend struct ::protobuf_tensor_5fshape_2eproto::TableStruct;
friend void ::protobuf_tensor_5fshape_2eproto::InitDefaultsTensorShapeProtoImpl();
};
// ===================================================================
// ===================================================================
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif // __GNUC__
// TensorShapeProto_Dim
// int64 size = 1;
inline void TensorShapeProto_Dim::clear_size() {
size_ = GOOGLE_LONGLONG(0);
}
inline ::google::protobuf::int64 TensorShapeProto_Dim::size() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorShapeProto.Dim.size)
return size_;
}
inline void TensorShapeProto_Dim::set_size(::google::protobuf::int64 value) {
size_ = value;
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorShapeProto.Dim.size)
}
// string name = 2;
inline void TensorShapeProto_Dim::clear_name() {
name_.ClearToEmpty(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline const ::std::string& TensorShapeProto_Dim::name() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorShapeProto.Dim.name)
return name_.Get();
}
inline void TensorShapeProto_Dim::set_name(const ::std::string& value) {
name_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value, GetArenaNoVirtual());
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorShapeProto.Dim.name)
}
#if LANG_CXX11
inline void TensorShapeProto_Dim::set_name(::std::string&& value) {
name_.Set(
&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_rvalue:opencv_tensorflow.TensorShapeProto.Dim.name)
}
#endif
inline void TensorShapeProto_Dim::set_name(const char* value) {
GOOGLE_DCHECK(value != NULL);
name_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_char:opencv_tensorflow.TensorShapeProto.Dim.name)
}
inline void TensorShapeProto_Dim::set_name(const char* value,
size_t size) {
name_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(
reinterpret_cast<const char*>(value), size), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_pointer:opencv_tensorflow.TensorShapeProto.Dim.name)
}
inline ::std::string* TensorShapeProto_Dim::mutable_name() {
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.TensorShapeProto.Dim.name)
return name_.Mutable(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline ::std::string* TensorShapeProto_Dim::release_name() {
// @@protoc_insertion_point(field_release:opencv_tensorflow.TensorShapeProto.Dim.name)
return name_.Release(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline void TensorShapeProto_Dim::set_allocated_name(::std::string* name) {
if (name != NULL) {
} else {
}
name_.SetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), name,
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_allocated:opencv_tensorflow.TensorShapeProto.Dim.name)
}
inline ::std::string* TensorShapeProto_Dim::unsafe_arena_release_name() {
// @@protoc_insertion_point(field_unsafe_arena_release:opencv_tensorflow.TensorShapeProto.Dim.name)
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
return name_.UnsafeArenaRelease(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
GetArenaNoVirtual());
}
inline void TensorShapeProto_Dim::unsafe_arena_set_allocated_name(
::std::string* name) {
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
if (name != NULL) {
} else {
}
name_.UnsafeArenaSetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
name, GetArenaNoVirtual());
// @@protoc_insertion_point(field_unsafe_arena_set_allocated:opencv_tensorflow.TensorShapeProto.Dim.name)
}
// -------------------------------------------------------------------
// TensorShapeProto
// repeated .opencv_tensorflow.TensorShapeProto.Dim dim = 2;
inline int TensorShapeProto::dim_size() const {
return dim_.size();
}
inline void TensorShapeProto::clear_dim() {
dim_.Clear();
}
inline const ::opencv_tensorflow::TensorShapeProto_Dim& TensorShapeProto::dim(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorShapeProto.dim)
return dim_.Get(index);
}
inline ::opencv_tensorflow::TensorShapeProto_Dim* TensorShapeProto::mutable_dim(int index) {
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.TensorShapeProto.dim)
return dim_.Mutable(index);
}
inline ::opencv_tensorflow::TensorShapeProto_Dim* TensorShapeProto::add_dim() {
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorShapeProto.dim)
return dim_.Add();
}
inline ::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::TensorShapeProto_Dim >*
TensorShapeProto::mutable_dim() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorShapeProto.dim)
return &dim_;
}
inline const ::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::TensorShapeProto_Dim >&
TensorShapeProto::dim() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorShapeProto.dim)
return dim_;
}
// bool unknown_rank = 3;
inline void TensorShapeProto::clear_unknown_rank() {
unknown_rank_ = false;
}
inline bool TensorShapeProto::unknown_rank() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorShapeProto.unknown_rank)
return unknown_rank_;
}
inline void TensorShapeProto::set_unknown_rank(bool value) {
unknown_rank_ = value;
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorShapeProto.unknown_rank)
}
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif // __GNUC__
// -------------------------------------------------------------------
// @@protoc_insertion_point(namespace_scope)
} // namespace opencv_tensorflow
// @@protoc_insertion_point(global_scope)
#endif // PROTOBUF_tensor_5fshape_2eproto__INCLUDED
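A minimal usage sketch for the generated TensorShapeProto API declared above (a hypothetical standalone program, not part of this commit; it only assumes the header is included as "tensor_shape.pb.h" as in the matching .pb.cc file):

// Illustrative only: builds a 2-D shape with the accessors declared above.
#include "tensor_shape.pb.h"
#include <iostream>

int main() {
    opencv_tensorflow::TensorShapeProto shape;
    // Each Dim carries an int64 size and an optional string name.
    opencv_tensorflow::TensorShapeProto_Dim* rows = shape.add_dim();
    rows->set_size(224);
    rows->set_name("height");
    opencv_tensorflow::TensorShapeProto_Dim* cols = shape.add_dim();
    cols->set_size(224);
    cols->set_name("width");
    shape.set_unknown_rank(false);
    std::cout << "dims: " << shape.dim_size()
              << ", first size: " << shape.dim(0).size() << std::endl;
    return 0;
}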

View File

@@ -0,0 +1,144 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: types.proto
#include "types.pb.h"
#include <algorithm>
#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/port.h>
#include <google/protobuf/stubs/once.h>
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/wire_format_lite_inl.h>
#include <google/protobuf/descriptor.h>
#include <google/protobuf/generated_message_reflection.h>
#include <google/protobuf/reflection_ops.h>
#include <google/protobuf/wire_format.h>
// This is a temporary google only hack
#ifdef GOOGLE_PROTOBUF_ENFORCE_UNIQUENESS
#include "third_party/protobuf/version.h"
#endif
// @@protoc_insertion_point(includes)
namespace opencv_tensorflow {
} // namespace opencv_tensorflow
namespace protobuf_types_2eproto {
const ::google::protobuf::EnumDescriptor* file_level_enum_descriptors[1];
const ::google::protobuf::uint32 TableStruct::offsets[1] = {};
static const ::google::protobuf::internal::MigrationSchema* schemas = NULL;
static const ::google::protobuf::Message* const* file_default_instances = NULL;
void protobuf_AssignDescriptors() {
AddDescriptors();
::google::protobuf::MessageFactory* factory = NULL;
AssignDescriptors(
"types.proto", schemas, file_default_instances, TableStruct::offsets, factory,
NULL, file_level_enum_descriptors, NULL);
}
void protobuf_AssignDescriptorsOnce() {
static GOOGLE_PROTOBUF_DECLARE_ONCE(once);
::google::protobuf::GoogleOnceInit(&once, &protobuf_AssignDescriptors);
}
void protobuf_RegisterTypes(const ::std::string&) GOOGLE_PROTOBUF_ATTRIBUTE_COLD;
void protobuf_RegisterTypes(const ::std::string&) {
protobuf_AssignDescriptorsOnce();
}
void AddDescriptorsImpl() {
InitDefaults();
static const char descriptor[] GOOGLE_PROTOBUF_ATTRIBUTE_SECTION_VARIABLE(protodesc_cold) = {
"\n\013types.proto\022\021opencv_tensorflow*\234\005\n\010Dat"
"aType\022\016\n\nDT_INVALID\020\000\022\014\n\010DT_FLOAT\020\001\022\r\n\tD"
"T_DOUBLE\020\002\022\014\n\010DT_INT32\020\003\022\014\n\010DT_UINT8\020\004\022\014"
"\n\010DT_INT16\020\005\022\013\n\007DT_INT8\020\006\022\r\n\tDT_STRING\020\007"
"\022\020\n\014DT_COMPLEX64\020\010\022\014\n\010DT_INT64\020\t\022\013\n\007DT_B"
"OOL\020\n\022\014\n\010DT_QINT8\020\013\022\r\n\tDT_QUINT8\020\014\022\r\n\tDT"
"_QINT32\020\r\022\017\n\013DT_BFLOAT16\020\016\022\r\n\tDT_QINT16\020"
"\017\022\016\n\nDT_QUINT16\020\020\022\r\n\tDT_UINT16\020\021\022\021\n\rDT_C"
"OMPLEX128\020\022\022\013\n\007DT_HALF\020\023\022\020\n\014DT_FLOAT_REF"
"\020e\022\021\n\rDT_DOUBLE_REF\020f\022\020\n\014DT_INT32_REF\020g\022"
"\020\n\014DT_UINT8_REF\020h\022\020\n\014DT_INT16_REF\020i\022\017\n\013D"
"T_INT8_REF\020j\022\021\n\rDT_STRING_REF\020k\022\024\n\020DT_CO"
"MPLEX64_REF\020l\022\020\n\014DT_INT64_REF\020m\022\017\n\013DT_BO"
"OL_REF\020n\022\020\n\014DT_QINT8_REF\020o\022\021\n\rDT_QUINT8_"
"REF\020p\022\021\n\rDT_QINT32_REF\020q\022\023\n\017DT_BFLOAT16_"
"REF\020r\022\021\n\rDT_QINT16_REF\020s\022\022\n\016DT_QUINT16_R"
"EF\020t\022\021\n\rDT_UINT16_REF\020u\022\025\n\021DT_COMPLEX128"
"_REF\020v\022\017\n\013DT_HALF_REF\020wB,\n\030org.tensorflo"
"w.frameworkB\013TypesProtosP\001\370\001\001b\006proto3"
};
::google::protobuf::DescriptorPool::InternalAddGeneratedFile(
descriptor, 757);
::google::protobuf::MessageFactory::InternalRegisterGeneratedFile(
"types.proto", &protobuf_RegisterTypes);
}
void AddDescriptors() {
static GOOGLE_PROTOBUF_DECLARE_ONCE(once);
::google::protobuf::GoogleOnceInit(&once, &AddDescriptorsImpl);
}
// Force AddDescriptors() to be called at dynamic initialization time.
struct StaticDescriptorInitializer {
StaticDescriptorInitializer() {
AddDescriptors();
}
} static_descriptor_initializer;
} // namespace protobuf_types_2eproto
namespace opencv_tensorflow {
const ::google::protobuf::EnumDescriptor* DataType_descriptor() {
protobuf_types_2eproto::protobuf_AssignDescriptorsOnce();
return protobuf_types_2eproto::file_level_enum_descriptors[0];
}
bool DataType_IsValid(int value) {
switch (value) {
case 0:
case 1:
case 2:
case 3:
case 4:
case 5:
case 6:
case 7:
case 8:
case 9:
case 10:
case 11:
case 12:
case 13:
case 14:
case 15:
case 16:
case 17:
case 18:
case 19:
case 101:
case 102:
case 103:
case 104:
case 105:
case 106:
case 107:
case 108:
case 109:
case 110:
case 111:
case 112:
case 113:
case 114:
case 115:
case 116:
case 117:
case 118:
case 119:
return true;
default:
return false;
}
}
// @@protoc_insertion_point(namespace_scope)
} // namespace opencv_tensorflow
// @@protoc_insertion_point(global_scope)

View File

@@ -0,0 +1,143 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: types.proto
#ifndef PROTOBUF_types_2eproto__INCLUDED
#define PROTOBUF_types_2eproto__INCLUDED
#include <string>
#include <google/protobuf/stubs/common.h>
#if GOOGLE_PROTOBUF_VERSION < 3005000
#error This file was generated by a newer version of protoc which is
#error incompatible with your Protocol Buffer headers. Please update
#error your headers.
#endif
#if 3005001 < GOOGLE_PROTOBUF_MIN_PROTOC_VERSION
#error This file was generated by an older version of protoc which is
#error incompatible with your Protocol Buffer headers. Please
#error regenerate this file with a newer version of protoc.
#endif
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/arena.h>
#include <google/protobuf/arenastring.h>
#include <google/protobuf/generated_message_table_driven.h>
#include <google/protobuf/generated_message_util.h>
#include <google/protobuf/metadata.h>
#include <google/protobuf/repeated_field.h> // IWYU pragma: export
#include <google/protobuf/extension_set.h> // IWYU pragma: export
#include <google/protobuf/generated_enum_reflection.h>
// @@protoc_insertion_point(includes)
namespace protobuf_types_2eproto {
// Internal implementation detail -- do not use these members.
struct TableStruct {
static const ::google::protobuf::internal::ParseTableField entries[];
static const ::google::protobuf::internal::AuxillaryParseTableField aux[];
static const ::google::protobuf::internal::ParseTable schema[1];
static const ::google::protobuf::internal::FieldMetadata field_metadata[];
static const ::google::protobuf::internal::SerializationTable serialization_table[];
static const ::google::protobuf::uint32 offsets[];
};
void AddDescriptors();
inline void InitDefaults() {
}
} // namespace protobuf_types_2eproto
namespace opencv_tensorflow {
} // namespace opencv_tensorflow
namespace opencv_tensorflow {
enum DataType {
DT_INVALID = 0,
DT_FLOAT = 1,
DT_DOUBLE = 2,
DT_INT32 = 3,
DT_UINT8 = 4,
DT_INT16 = 5,
DT_INT8 = 6,
DT_STRING = 7,
DT_COMPLEX64 = 8,
DT_INT64 = 9,
DT_BOOL = 10,
DT_QINT8 = 11,
DT_QUINT8 = 12,
DT_QINT32 = 13,
DT_BFLOAT16 = 14,
DT_QINT16 = 15,
DT_QUINT16 = 16,
DT_UINT16 = 17,
DT_COMPLEX128 = 18,
DT_HALF = 19,
DT_FLOAT_REF = 101,
DT_DOUBLE_REF = 102,
DT_INT32_REF = 103,
DT_UINT8_REF = 104,
DT_INT16_REF = 105,
DT_INT8_REF = 106,
DT_STRING_REF = 107,
DT_COMPLEX64_REF = 108,
DT_INT64_REF = 109,
DT_BOOL_REF = 110,
DT_QINT8_REF = 111,
DT_QUINT8_REF = 112,
DT_QINT32_REF = 113,
DT_BFLOAT16_REF = 114,
DT_QINT16_REF = 115,
DT_QUINT16_REF = 116,
DT_UINT16_REF = 117,
DT_COMPLEX128_REF = 118,
DT_HALF_REF = 119,
DataType_INT_MIN_SENTINEL_DO_NOT_USE_ = ::google::protobuf::kint32min,
DataType_INT_MAX_SENTINEL_DO_NOT_USE_ = ::google::protobuf::kint32max
};
bool DataType_IsValid(int value);
const DataType DataType_MIN = DT_INVALID;
const DataType DataType_MAX = DT_HALF_REF;
const int DataType_ARRAYSIZE = DataType_MAX + 1;
const ::google::protobuf::EnumDescriptor* DataType_descriptor();
inline const ::std::string& DataType_Name(DataType value) {
return ::google::protobuf::internal::NameOfEnum(
DataType_descriptor(), value);
}
inline bool DataType_Parse(
const ::std::string& name, DataType* value) {
return ::google::protobuf::internal::ParseNamedEnum<DataType>(
DataType_descriptor(), name, value);
}
// ===================================================================
// ===================================================================
// ===================================================================
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif // __GNUC__
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif // __GNUC__
// @@protoc_insertion_point(namespace_scope)
} // namespace opencv_tensorflow
namespace google {
namespace protobuf {
template <> struct is_proto_enum< ::opencv_tensorflow::DataType> : ::google::protobuf::internal::true_type {};
template <>
inline const EnumDescriptor* GetEnumDescriptor< ::opencv_tensorflow::DataType>() {
return ::opencv_tensorflow::DataType_descriptor();
}
} // namespace protobuf
} // namespace google
// @@protoc_insertion_point(global_scope)
#endif // PROTOBUF_types_2eproto__INCLUDED
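A minimal sketch of the DataType helpers declared above (hypothetical example code, not part of this commit; it assumes the header is included as "types.pb.h" as the generated .pb.cc does):

// Illustrative only: enum name lookup, validation and parsing.
#include "types.pb.h"
#include <iostream>

int main() {
    opencv_tensorflow::DataType dt = opencv_tensorflow::DT_FLOAT;
    std::cout << opencv_tensorflow::DataType_Name(dt) << std::endl;      // prints "DT_FLOAT"
    std::cout << opencv_tensorflow::DataType_IsValid(42) << std::endl;   // prints 0: 42 is not a defined value
    opencv_tensorflow::DataType parsed;
    if (opencv_tensorflow::DataType_Parse("DT_INT32", &parsed)) {
        std::cout << static_cast<int>(parsed) << std::endl;              // prints 3
    }
    return 0;
}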

View File

@@ -0,0 +1,492 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: versions.proto
#include "versions.pb.h"
#include <algorithm>
#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/port.h>
#include <google/protobuf/stubs/once.h>
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/wire_format_lite_inl.h>
#include <google/protobuf/descriptor.h>
#include <google/protobuf/generated_message_reflection.h>
#include <google/protobuf/reflection_ops.h>
#include <google/protobuf/wire_format.h>
// This is a temporary google only hack
#ifdef GOOGLE_PROTOBUF_ENFORCE_UNIQUENESS
#include "third_party/protobuf/version.h"
#endif
// @@protoc_insertion_point(includes)
namespace opencv_tensorflow {
class VersionDefDefaultTypeInternal {
public:
::google::protobuf::internal::ExplicitlyConstructed<VersionDef>
_instance;
} _VersionDef_default_instance_;
} // namespace opencv_tensorflow
namespace protobuf_versions_2eproto {
void InitDefaultsVersionDefImpl() {
GOOGLE_PROTOBUF_VERIFY_VERSION;
#ifdef GOOGLE_PROTOBUF_ENFORCE_UNIQUENESS
::google::protobuf::internal::InitProtobufDefaultsForceUnique();
#else
::google::protobuf::internal::InitProtobufDefaults();
#endif // GOOGLE_PROTOBUF_ENFORCE_UNIQUENESS
{
void* ptr = &::opencv_tensorflow::_VersionDef_default_instance_;
new (ptr) ::opencv_tensorflow::VersionDef();
::google::protobuf::internal::OnShutdownDestroyMessage(ptr);
}
::opencv_tensorflow::VersionDef::InitAsDefaultInstance();
}
void InitDefaultsVersionDef() {
static GOOGLE_PROTOBUF_DECLARE_ONCE(once);
::google::protobuf::GoogleOnceInit(&once, &InitDefaultsVersionDefImpl);
}
::google::protobuf::Metadata file_level_metadata[1];
const ::google::protobuf::uint32 TableStruct::offsets[] GOOGLE_PROTOBUF_ATTRIBUTE_SECTION_VARIABLE(protodesc_cold) = {
~0u, // no _has_bits_
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::VersionDef, _internal_metadata_),
~0u, // no _extensions_
~0u, // no _oneof_case_
~0u, // no _weak_field_map_
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::VersionDef, producer_),
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::VersionDef, min_consumer_),
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::VersionDef, bad_consumers_),
};
static const ::google::protobuf::internal::MigrationSchema schemas[] GOOGLE_PROTOBUF_ATTRIBUTE_SECTION_VARIABLE(protodesc_cold) = {
{ 0, -1, sizeof(::opencv_tensorflow::VersionDef)},
};
static ::google::protobuf::Message const * const file_default_instances[] = {
reinterpret_cast<const ::google::protobuf::Message*>(&::opencv_tensorflow::_VersionDef_default_instance_),
};
void protobuf_AssignDescriptors() {
AddDescriptors();
::google::protobuf::MessageFactory* factory = NULL;
AssignDescriptors(
"versions.proto", schemas, file_default_instances, TableStruct::offsets, factory,
file_level_metadata, NULL, NULL);
}
void protobuf_AssignDescriptorsOnce() {
static GOOGLE_PROTOBUF_DECLARE_ONCE(once);
::google::protobuf::GoogleOnceInit(&once, &protobuf_AssignDescriptors);
}
void protobuf_RegisterTypes(const ::std::string&) GOOGLE_PROTOBUF_ATTRIBUTE_COLD;
void protobuf_RegisterTypes(const ::std::string&) {
protobuf_AssignDescriptorsOnce();
::google::protobuf::internal::RegisterAllTypes(file_level_metadata, 1);
}
void AddDescriptorsImpl() {
InitDefaults();
static const char descriptor[] GOOGLE_PROTOBUF_ATTRIBUTE_SECTION_VARIABLE(protodesc_cold) = {
"\n\016versions.proto\022\021opencv_tensorflow\"K\n\nV"
"ersionDef\022\020\n\010producer\030\001 \001(\005\022\024\n\014min_consu"
"mer\030\002 \001(\005\022\025\n\rbad_consumers\030\003 \003(\005B/\n\030org."
"tensorflow.frameworkB\016VersionsProtosP\001\370\001"
"\001b\006proto3"
};
::google::protobuf::DescriptorPool::InternalAddGeneratedFile(
descriptor, 169);
::google::protobuf::MessageFactory::InternalRegisterGeneratedFile(
"versions.proto", &protobuf_RegisterTypes);
}
void AddDescriptors() {
static GOOGLE_PROTOBUF_DECLARE_ONCE(once);
::google::protobuf::GoogleOnceInit(&once, &AddDescriptorsImpl);
}
// Force AddDescriptors() to be called at dynamic initialization time.
struct StaticDescriptorInitializer {
StaticDescriptorInitializer() {
AddDescriptors();
}
} static_descriptor_initializer;
} // namespace protobuf_versions_2eproto
namespace opencv_tensorflow {
// ===================================================================
void VersionDef::InitAsDefaultInstance() {
}
#if !defined(_MSC_VER) || _MSC_VER >= 1900
const int VersionDef::kProducerFieldNumber;
const int VersionDef::kMinConsumerFieldNumber;
const int VersionDef::kBadConsumersFieldNumber;
#endif // !defined(_MSC_VER) || _MSC_VER >= 1900
VersionDef::VersionDef()
: ::google::protobuf::Message(), _internal_metadata_(NULL) {
if (GOOGLE_PREDICT_TRUE(this != internal_default_instance())) {
::protobuf_versions_2eproto::InitDefaultsVersionDef();
}
SharedCtor();
// @@protoc_insertion_point(constructor:opencv_tensorflow.VersionDef)
}
VersionDef::VersionDef(::google::protobuf::Arena* arena)
: ::google::protobuf::Message(),
_internal_metadata_(arena),
bad_consumers_(arena) {
::protobuf_versions_2eproto::InitDefaultsVersionDef();
SharedCtor();
RegisterArenaDtor(arena);
// @@protoc_insertion_point(arena_constructor:opencv_tensorflow.VersionDef)
}
VersionDef::VersionDef(const VersionDef& from)
: ::google::protobuf::Message(),
_internal_metadata_(NULL),
bad_consumers_(from.bad_consumers_),
_cached_size_(0) {
_internal_metadata_.MergeFrom(from._internal_metadata_);
::memcpy(&producer_, &from.producer_,
static_cast<size_t>(reinterpret_cast<char*>(&min_consumer_) -
reinterpret_cast<char*>(&producer_)) + sizeof(min_consumer_));
// @@protoc_insertion_point(copy_constructor:opencv_tensorflow.VersionDef)
}
void VersionDef::SharedCtor() {
::memset(&producer_, 0, static_cast<size_t>(
reinterpret_cast<char*>(&min_consumer_) -
reinterpret_cast<char*>(&producer_)) + sizeof(min_consumer_));
_cached_size_ = 0;
}
VersionDef::~VersionDef() {
// @@protoc_insertion_point(destructor:opencv_tensorflow.VersionDef)
SharedDtor();
}
void VersionDef::SharedDtor() {
GOOGLE_DCHECK(GetArenaNoVirtual() == NULL);
}
void VersionDef::ArenaDtor(void* object) {
VersionDef* _this = reinterpret_cast< VersionDef* >(object);
(void)_this;
}
void VersionDef::RegisterArenaDtor(::google::protobuf::Arena* arena) {
}
void VersionDef::SetCachedSize(int size) const {
GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN();
_cached_size_ = size;
GOOGLE_SAFE_CONCURRENT_WRITES_END();
}
const ::google::protobuf::Descriptor* VersionDef::descriptor() {
::protobuf_versions_2eproto::protobuf_AssignDescriptorsOnce();
return ::protobuf_versions_2eproto::file_level_metadata[kIndexInFileMessages].descriptor;
}
const VersionDef& VersionDef::default_instance() {
::protobuf_versions_2eproto::InitDefaultsVersionDef();
return *internal_default_instance();
}
VersionDef* VersionDef::New(::google::protobuf::Arena* arena) const {
return ::google::protobuf::Arena::CreateMessage<VersionDef>(arena);
}
void VersionDef::Clear() {
// @@protoc_insertion_point(message_clear_start:opencv_tensorflow.VersionDef)
::google::protobuf::uint32 cached_has_bits = 0;
// Prevent compiler warnings about cached_has_bits being unused
(void) cached_has_bits;
bad_consumers_.Clear();
::memset(&producer_, 0, static_cast<size_t>(
reinterpret_cast<char*>(&min_consumer_) -
reinterpret_cast<char*>(&producer_)) + sizeof(min_consumer_));
_internal_metadata_.Clear();
}
bool VersionDef::MergePartialFromCodedStream(
::google::protobuf::io::CodedInputStream* input) {
#define DO_(EXPRESSION) if (!GOOGLE_PREDICT_TRUE(EXPRESSION)) goto failure
::google::protobuf::uint32 tag;
// @@protoc_insertion_point(parse_start:opencv_tensorflow.VersionDef)
for (;;) {
::std::pair< ::google::protobuf::uint32, bool> p = input->ReadTagWithCutoffNoLastTag(127u);
tag = p.first;
if (!p.second) goto handle_unusual;
switch (::google::protobuf::internal::WireFormatLite::GetTagFieldNumber(tag)) {
// int32 producer = 1;
case 1: {
if (static_cast< ::google::protobuf::uint8>(tag) ==
static_cast< ::google::protobuf::uint8>(8u /* 8 & 0xFF */)) {
DO_((::google::protobuf::internal::WireFormatLite::ReadPrimitive<
::google::protobuf::int32, ::google::protobuf::internal::WireFormatLite::TYPE_INT32>(
input, &producer_)));
} else {
goto handle_unusual;
}
break;
}
// int32 min_consumer = 2;
case 2: {
if (static_cast< ::google::protobuf::uint8>(tag) ==
static_cast< ::google::protobuf::uint8>(16u /* 16 & 0xFF */)) {
DO_((::google::protobuf::internal::WireFormatLite::ReadPrimitive<
::google::protobuf::int32, ::google::protobuf::internal::WireFormatLite::TYPE_INT32>(
input, &min_consumer_)));
} else {
goto handle_unusual;
}
break;
}
// repeated int32 bad_consumers = 3;
case 3: {
if (static_cast< ::google::protobuf::uint8>(tag) ==
static_cast< ::google::protobuf::uint8>(26u /* 26 & 0xFF */)) {
DO_((::google::protobuf::internal::WireFormatLite::ReadPackedPrimitive<
::google::protobuf::int32, ::google::protobuf::internal::WireFormatLite::TYPE_INT32>(
input, this->mutable_bad_consumers())));
} else if (
static_cast< ::google::protobuf::uint8>(tag) ==
static_cast< ::google::protobuf::uint8>(24u /* 24 & 0xFF */)) {
DO_((::google::protobuf::internal::WireFormatLite::ReadRepeatedPrimitiveNoInline<
::google::protobuf::int32, ::google::protobuf::internal::WireFormatLite::TYPE_INT32>(
1, 26u, input, this->mutable_bad_consumers())));
} else {
goto handle_unusual;
}
break;
}
default: {
handle_unusual:
if (tag == 0) {
goto success;
}
DO_(::google::protobuf::internal::WireFormat::SkipField(
input, tag, _internal_metadata_.mutable_unknown_fields()));
break;
}
}
}
success:
// @@protoc_insertion_point(parse_success:opencv_tensorflow.VersionDef)
return true;
failure:
// @@protoc_insertion_point(parse_failure:opencv_tensorflow.VersionDef)
return false;
#undef DO_
}
void VersionDef::SerializeWithCachedSizes(
::google::protobuf::io::CodedOutputStream* output) const {
// @@protoc_insertion_point(serialize_start:opencv_tensorflow.VersionDef)
::google::protobuf::uint32 cached_has_bits = 0;
(void) cached_has_bits;
// int32 producer = 1;
if (this->producer() != 0) {
::google::protobuf::internal::WireFormatLite::WriteInt32(1, this->producer(), output);
}
// int32 min_consumer = 2;
if (this->min_consumer() != 0) {
::google::protobuf::internal::WireFormatLite::WriteInt32(2, this->min_consumer(), output);
}
// repeated int32 bad_consumers = 3;
if (this->bad_consumers_size() > 0) {
::google::protobuf::internal::WireFormatLite::WriteTag(3, ::google::protobuf::internal::WireFormatLite::WIRETYPE_LENGTH_DELIMITED, output);
output->WriteVarint32(static_cast< ::google::protobuf::uint32>(
_bad_consumers_cached_byte_size_));
}
for (int i = 0, n = this->bad_consumers_size(); i < n; i++) {
::google::protobuf::internal::WireFormatLite::WriteInt32NoTag(
this->bad_consumers(i), output);
}
if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) {
::google::protobuf::internal::WireFormat::SerializeUnknownFields(
(::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()), output);
}
// @@protoc_insertion_point(serialize_end:opencv_tensorflow.VersionDef)
}
::google::protobuf::uint8* VersionDef::InternalSerializeWithCachedSizesToArray(
bool deterministic, ::google::protobuf::uint8* target) const {
(void)deterministic; // Unused
// @@protoc_insertion_point(serialize_to_array_start:opencv_tensorflow.VersionDef)
::google::protobuf::uint32 cached_has_bits = 0;
(void) cached_has_bits;
// int32 producer = 1;
if (this->producer() != 0) {
target = ::google::protobuf::internal::WireFormatLite::WriteInt32ToArray(1, this->producer(), target);
}
// int32 min_consumer = 2;
if (this->min_consumer() != 0) {
target = ::google::protobuf::internal::WireFormatLite::WriteInt32ToArray(2, this->min_consumer(), target);
}
// repeated int32 bad_consumers = 3;
if (this->bad_consumers_size() > 0) {
target = ::google::protobuf::internal::WireFormatLite::WriteTagToArray(
3,
::google::protobuf::internal::WireFormatLite::WIRETYPE_LENGTH_DELIMITED,
target);
target = ::google::protobuf::io::CodedOutputStream::WriteVarint32ToArray(
static_cast< ::google::protobuf::int32>(
_bad_consumers_cached_byte_size_), target);
target = ::google::protobuf::internal::WireFormatLite::
WriteInt32NoTagToArray(this->bad_consumers_, target);
}
if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) {
target = ::google::protobuf::internal::WireFormat::SerializeUnknownFieldsToArray(
(::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()), target);
}
// @@protoc_insertion_point(serialize_to_array_end:opencv_tensorflow.VersionDef)
return target;
}
size_t VersionDef::ByteSizeLong() const {
// @@protoc_insertion_point(message_byte_size_start:opencv_tensorflow.VersionDef)
size_t total_size = 0;
if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) {
total_size +=
::google::protobuf::internal::WireFormat::ComputeUnknownFieldsSize(
(::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()));
}
// repeated int32 bad_consumers = 3;
{
size_t data_size = ::google::protobuf::internal::WireFormatLite::
Int32Size(this->bad_consumers_);
if (data_size > 0) {
total_size += 1 +
::google::protobuf::internal::WireFormatLite::Int32Size(
static_cast< ::google::protobuf::int32>(data_size));
}
int cached_size = ::google::protobuf::internal::ToCachedSize(data_size);
GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN();
_bad_consumers_cached_byte_size_ = cached_size;
GOOGLE_SAFE_CONCURRENT_WRITES_END();
total_size += data_size;
}
// int32 producer = 1;
if (this->producer() != 0) {
total_size += 1 +
::google::protobuf::internal::WireFormatLite::Int32Size(
this->producer());
}
// int32 min_consumer = 2;
if (this->min_consumer() != 0) {
total_size += 1 +
::google::protobuf::internal::WireFormatLite::Int32Size(
this->min_consumer());
}
int cached_size = ::google::protobuf::internal::ToCachedSize(total_size);
GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN();
_cached_size_ = cached_size;
GOOGLE_SAFE_CONCURRENT_WRITES_END();
return total_size;
}
void VersionDef::MergeFrom(const ::google::protobuf::Message& from) {
// @@protoc_insertion_point(generalized_merge_from_start:opencv_tensorflow.VersionDef)
GOOGLE_DCHECK_NE(&from, this);
const VersionDef* source =
::google::protobuf::internal::DynamicCastToGenerated<const VersionDef>(
&from);
if (source == NULL) {
// @@protoc_insertion_point(generalized_merge_from_cast_fail:opencv_tensorflow.VersionDef)
::google::protobuf::internal::ReflectionOps::Merge(from, this);
} else {
// @@protoc_insertion_point(generalized_merge_from_cast_success:opencv_tensorflow.VersionDef)
MergeFrom(*source);
}
}
void VersionDef::MergeFrom(const VersionDef& from) {
// @@protoc_insertion_point(class_specific_merge_from_start:opencv_tensorflow.VersionDef)
GOOGLE_DCHECK_NE(&from, this);
_internal_metadata_.MergeFrom(from._internal_metadata_);
::google::protobuf::uint32 cached_has_bits = 0;
(void) cached_has_bits;
bad_consumers_.MergeFrom(from.bad_consumers_);
if (from.producer() != 0) {
set_producer(from.producer());
}
if (from.min_consumer() != 0) {
set_min_consumer(from.min_consumer());
}
}
void VersionDef::CopyFrom(const ::google::protobuf::Message& from) {
// @@protoc_insertion_point(generalized_copy_from_start:opencv_tensorflow.VersionDef)
if (&from == this) return;
Clear();
MergeFrom(from);
}
void VersionDef::CopyFrom(const VersionDef& from) {
// @@protoc_insertion_point(class_specific_copy_from_start:opencv_tensorflow.VersionDef)
if (&from == this) return;
Clear();
MergeFrom(from);
}
bool VersionDef::IsInitialized() const {
return true;
}
void VersionDef::Swap(VersionDef* other) {
if (other == this) return;
if (GetArenaNoVirtual() == other->GetArenaNoVirtual()) {
InternalSwap(other);
} else {
VersionDef* temp = New(GetArenaNoVirtual());
temp->MergeFrom(*other);
other->CopyFrom(*this);
InternalSwap(temp);
if (GetArenaNoVirtual() == NULL) {
delete temp;
}
}
}
void VersionDef::UnsafeArenaSwap(VersionDef* other) {
if (other == this) return;
GOOGLE_DCHECK(GetArenaNoVirtual() == other->GetArenaNoVirtual());
InternalSwap(other);
}
void VersionDef::InternalSwap(VersionDef* other) {
using std::swap;
bad_consumers_.InternalSwap(&other->bad_consumers_);
swap(producer_, other->producer_);
swap(min_consumer_, other->min_consumer_);
_internal_metadata_.Swap(&other->_internal_metadata_);
swap(_cached_size_, other->_cached_size_);
}
::google::protobuf::Metadata VersionDef::GetMetadata() const {
protobuf_versions_2eproto::protobuf_AssignDescriptorsOnce();
return ::protobuf_versions_2eproto::file_level_metadata[kIndexInFileMessages];
}
// @@protoc_insertion_point(namespace_scope)
} // namespace opencv_tensorflow
// @@protoc_insertion_point(global_scope)

View File

@@ -0,0 +1,272 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: versions.proto
#ifndef PROTOBUF_versions_2eproto__INCLUDED
#define PROTOBUF_versions_2eproto__INCLUDED
#include <string>
#include <google/protobuf/stubs/common.h>
#if GOOGLE_PROTOBUF_VERSION < 3005000
#error This file was generated by a newer version of protoc which is
#error incompatible with your Protocol Buffer headers. Please update
#error your headers.
#endif
#if 3005001 < GOOGLE_PROTOBUF_MIN_PROTOC_VERSION
#error This file was generated by an older version of protoc which is
#error incompatible with your Protocol Buffer headers. Please
#error regenerate this file with a newer version of protoc.
#endif
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/arena.h>
#include <google/protobuf/arenastring.h>
#include <google/protobuf/generated_message_table_driven.h>
#include <google/protobuf/generated_message_util.h>
#include <google/protobuf/metadata.h>
#include <google/protobuf/message.h>
#include <google/protobuf/repeated_field.h> // IWYU pragma: export
#include <google/protobuf/extension_set.h> // IWYU pragma: export
#include <google/protobuf/unknown_field_set.h>
// @@protoc_insertion_point(includes)
namespace protobuf_versions_2eproto {
// Internal implementation detail -- do not use these members.
struct TableStruct {
static const ::google::protobuf::internal::ParseTableField entries[];
static const ::google::protobuf::internal::AuxillaryParseTableField aux[];
static const ::google::protobuf::internal::ParseTable schema[1];
static const ::google::protobuf::internal::FieldMetadata field_metadata[];
static const ::google::protobuf::internal::SerializationTable serialization_table[];
static const ::google::protobuf::uint32 offsets[];
};
void AddDescriptors();
void InitDefaultsVersionDefImpl();
void InitDefaultsVersionDef();
inline void InitDefaults() {
InitDefaultsVersionDef();
}
} // namespace protobuf_versions_2eproto
namespace opencv_tensorflow {
class VersionDef;
class VersionDefDefaultTypeInternal;
extern VersionDefDefaultTypeInternal _VersionDef_default_instance_;
} // namespace opencv_tensorflow
namespace opencv_tensorflow {
// ===================================================================
class VersionDef : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:opencv_tensorflow.VersionDef) */ {
public:
VersionDef();
virtual ~VersionDef();
VersionDef(const VersionDef& from);
inline VersionDef& operator=(const VersionDef& from) {
CopyFrom(from);
return *this;
}
#if LANG_CXX11
VersionDef(VersionDef&& from) noexcept
: VersionDef() {
*this = ::std::move(from);
}
inline VersionDef& operator=(VersionDef&& from) noexcept {
if (GetArenaNoVirtual() == from.GetArenaNoVirtual()) {
if (this != &from) InternalSwap(&from);
} else {
CopyFrom(from);
}
return *this;
}
#endif
inline ::google::protobuf::Arena* GetArena() const PROTOBUF_FINAL {
return GetArenaNoVirtual();
}
inline void* GetMaybeArenaPointer() const PROTOBUF_FINAL {
return MaybeArenaPtr();
}
static const ::google::protobuf::Descriptor* descriptor();
static const VersionDef& default_instance();
static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY
static inline const VersionDef* internal_default_instance() {
return reinterpret_cast<const VersionDef*>(
&_VersionDef_default_instance_);
}
static PROTOBUF_CONSTEXPR int const kIndexInFileMessages =
0;
void UnsafeArenaSwap(VersionDef* other);
void Swap(VersionDef* other);
friend void swap(VersionDef& a, VersionDef& b) {
a.Swap(&b);
}
// implements Message ----------------------------------------------
inline VersionDef* New() const PROTOBUF_FINAL { return New(NULL); }
VersionDef* New(::google::protobuf::Arena* arena) const PROTOBUF_FINAL;
void CopyFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void MergeFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void CopyFrom(const VersionDef& from);
void MergeFrom(const VersionDef& from);
void Clear() PROTOBUF_FINAL;
bool IsInitialized() const PROTOBUF_FINAL;
size_t ByteSizeLong() const PROTOBUF_FINAL;
bool MergePartialFromCodedStream(
::google::protobuf::io::CodedInputStream* input) PROTOBUF_FINAL;
void SerializeWithCachedSizes(
::google::protobuf::io::CodedOutputStream* output) const PROTOBUF_FINAL;
::google::protobuf::uint8* InternalSerializeWithCachedSizesToArray(
bool deterministic, ::google::protobuf::uint8* target) const PROTOBUF_FINAL;
int GetCachedSize() const PROTOBUF_FINAL { return _cached_size_; }
private:
void SharedCtor();
void SharedDtor();
void SetCachedSize(int size) const PROTOBUF_FINAL;
void InternalSwap(VersionDef* other);
protected:
explicit VersionDef(::google::protobuf::Arena* arena);
private:
static void ArenaDtor(void* object);
inline void RegisterArenaDtor(::google::protobuf::Arena* arena);
private:
inline ::google::protobuf::Arena* GetArenaNoVirtual() const {
return _internal_metadata_.arena();
}
inline void* MaybeArenaPtr() const {
return _internal_metadata_.raw_arena_ptr();
}
public:
::google::protobuf::Metadata GetMetadata() const PROTOBUF_FINAL;
// nested types ----------------------------------------------------
// accessors -------------------------------------------------------
// repeated int32 bad_consumers = 3;
int bad_consumers_size() const;
void clear_bad_consumers();
static const int kBadConsumersFieldNumber = 3;
::google::protobuf::int32 bad_consumers(int index) const;
void set_bad_consumers(int index, ::google::protobuf::int32 value);
void add_bad_consumers(::google::protobuf::int32 value);
const ::google::protobuf::RepeatedField< ::google::protobuf::int32 >&
bad_consumers() const;
::google::protobuf::RepeatedField< ::google::protobuf::int32 >*
mutable_bad_consumers();
// int32 producer = 1;
void clear_producer();
static const int kProducerFieldNumber = 1;
::google::protobuf::int32 producer() const;
void set_producer(::google::protobuf::int32 value);
// int32 min_consumer = 2;
void clear_min_consumer();
static const int kMinConsumerFieldNumber = 2;
::google::protobuf::int32 min_consumer() const;
void set_min_consumer(::google::protobuf::int32 value);
// @@protoc_insertion_point(class_scope:opencv_tensorflow.VersionDef)
private:
::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_;
template <typename T> friend class ::google::protobuf::Arena::InternalHelper;
typedef void InternalArenaConstructable_;
typedef void DestructorSkippable_;
::google::protobuf::RepeatedField< ::google::protobuf::int32 > bad_consumers_;
mutable int _bad_consumers_cached_byte_size_;
::google::protobuf::int32 producer_;
::google::protobuf::int32 min_consumer_;
mutable int _cached_size_;
friend struct ::protobuf_versions_2eproto::TableStruct;
friend void ::protobuf_versions_2eproto::InitDefaultsVersionDefImpl();
};
// ===================================================================
// ===================================================================
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif // __GNUC__
// VersionDef
// int32 producer = 1;
inline void VersionDef::clear_producer() {
producer_ = 0;
}
inline ::google::protobuf::int32 VersionDef::producer() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.VersionDef.producer)
return producer_;
}
inline void VersionDef::set_producer(::google::protobuf::int32 value) {
producer_ = value;
// @@protoc_insertion_point(field_set:opencv_tensorflow.VersionDef.producer)
}
// int32 min_consumer = 2;
inline void VersionDef::clear_min_consumer() {
min_consumer_ = 0;
}
inline ::google::protobuf::int32 VersionDef::min_consumer() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.VersionDef.min_consumer)
return min_consumer_;
}
inline void VersionDef::set_min_consumer(::google::protobuf::int32 value) {
min_consumer_ = value;
// @@protoc_insertion_point(field_set:opencv_tensorflow.VersionDef.min_consumer)
}
// repeated int32 bad_consumers = 3;
inline int VersionDef::bad_consumers_size() const {
return bad_consumers_.size();
}
inline void VersionDef::clear_bad_consumers() {
bad_consumers_.Clear();
}
inline ::google::protobuf::int32 VersionDef::bad_consumers(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.VersionDef.bad_consumers)
return bad_consumers_.Get(index);
}
inline void VersionDef::set_bad_consumers(int index, ::google::protobuf::int32 value) {
bad_consumers_.Set(index, value);
// @@protoc_insertion_point(field_set:opencv_tensorflow.VersionDef.bad_consumers)
}
inline void VersionDef::add_bad_consumers(::google::protobuf::int32 value) {
bad_consumers_.Add(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.VersionDef.bad_consumers)
}
inline const ::google::protobuf::RepeatedField< ::google::protobuf::int32 >&
VersionDef::bad_consumers() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.VersionDef.bad_consumers)
return bad_consumers_;
}
inline ::google::protobuf::RepeatedField< ::google::protobuf::int32 >*
VersionDef::mutable_bad_consumers() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.VersionDef.bad_consumers)
return &bad_consumers_;
}
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif // __GNUC__
// @@protoc_insertion_point(namespace_scope)
} // namespace opencv_tensorflow
// @@protoc_insertion_point(global_scope)
#endif // PROTOBUF_versions_2eproto__INCLUDED
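A minimal sketch of the VersionDef message declared above (hypothetical example code, not part of this commit; it assumes the header is included as "versions.pb.h" and uses only the accessors shown plus the standard protobuf Message serialization calls):

// Illustrative only: fill, serialize and re-parse a VersionDef.
#include "versions.pb.h"
#include <iostream>
#include <string>

int main() {
    opencv_tensorflow::VersionDef v;
    v.set_producer(27);          // graph written by producer version 27
    v.set_min_consumer(0);       // any consumer may read it
    v.add_bad_consumers(12);     // except consumer version 12
    std::string bytes;
    v.SerializeToString(&bytes); // standard ::google::protobuf::Message API
    opencv_tensorflow::VersionDef copy;
    copy.ParseFromString(bytes);
    std::cout << copy.producer() << " " << copy.bad_consumers(0) << std::endl;  // "27 12"
    return 0;
}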

View File

@@ -0,0 +1,111 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
// It is recommended to run this performance test via
// ./bin/opencv_perf_dnn 2> /dev/null | grep "PERFSTAT" -A 3
// because the full output also includes Caffe's logs.
//
// Note: Be sure that the intended version of Caffe was linked.
// Note: These tests affect Halide performance. Comment them out if you
// want to run the Halide ones.
//
// How to build Intel-Caffe with MKLDNN backend
// ============================================
// mkdir build && cd build
// cmake -DCMAKE_BUILD_TYPE=Release \
// -DUSE_MKLDNN_AS_DEFAULT_ENGINE=ON \
// -DUSE_MKL2017_AS_DEFAULT_ENGINE=OFF \
// -DCPU_ONLY=ON \
// -DCMAKE_INSTALL_PREFIX=/usr/local .. && make -j8
// sudo make install
//
// In case of problems with cublas_v2.h at include/caffe/util/device_alternate.hpp: add line
// #define CPU_ONLY
// before the first line
// #ifdef CPU_ONLY // CPU-only Caffe.
#if defined(HAVE_CAFFE) || defined(HAVE_CLCAFFE)
#include "perf_precomp.hpp"
#include <iostream>
#include <caffe/caffe.hpp>
namespace opencv_test {
static caffe::Net<float>* initNet(std::string proto, std::string weights)
{
proto = findDataFile(proto);
weights = findDataFile(weights, false);
#ifdef HAVE_CLCAFFE
caffe::Caffe::set_mode(caffe::Caffe::GPU);
caffe::Caffe::SetDevice(0);
caffe::Net<float>* net =
new caffe::Net<float>(proto, caffe::TEST, caffe::Caffe::GetDefaultDevice());
#else
caffe::Caffe::set_mode(caffe::Caffe::CPU);
caffe::Net<float>* net = new caffe::Net<float>(proto, caffe::TEST);
#endif
net->CopyTrainedLayersFrom(weights);
caffe::Blob<float>* input = net->input_blobs()[0];
CV_Assert(input->num() == 1);
CV_Assert(input->channels() == 3);
Mat inputMat(input->height(), input->width(), CV_32FC3, (char*)input->cpu_data());
randu(inputMat, 0.0f, 1.0f);
net->Forward();
return net;
}
PERF_TEST(AlexNet_caffe, CaffePerfTest)
{
caffe::Net<float>* net = initNet("dnn/bvlc_alexnet.prototxt",
"dnn/bvlc_alexnet.caffemodel");
TEST_CYCLE() net->Forward();
SANITY_CHECK_NOTHING();
}
PERF_TEST(GoogLeNet_caffe, CaffePerfTest)
{
caffe::Net<float>* net = initNet("dnn/bvlc_googlenet.prototxt",
"dnn/bvlc_googlenet.caffemodel");
TEST_CYCLE() net->Forward();
SANITY_CHECK_NOTHING();
}
PERF_TEST(ResNet50_caffe, CaffePerfTest)
{
caffe::Net<float>* net = initNet("dnn/ResNet-50-deploy.prototxt",
"dnn/ResNet-50-model.caffemodel");
TEST_CYCLE() net->Forward();
SANITY_CHECK_NOTHING();
}
PERF_TEST(SqueezeNet_v1_1_caffe, CaffePerfTest)
{
caffe::Net<float>* net = initNet("dnn/squeezenet_v1.1.prototxt",
"dnn/squeezenet_v1.1.caffemodel");
TEST_CYCLE() net->Forward();
SANITY_CHECK_NOTHING();
}
PERF_TEST(MobileNet_SSD, CaffePerfTest)
{
caffe::Net<float>* net = initNet("dnn/MobileNetSSD_deploy.prototxt",
"dnn/MobileNetSSD_deploy.caffemodel");
TEST_CYCLE() net->Forward();
SANITY_CHECK_NOTHING();
}
} // namespace
#endif // HAVE_CAFFE

View File

@@ -0,0 +1,6 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "perf_precomp.hpp"
#include "../test/test_common.impl.hpp" // shared with accuracy tests

View File

@@ -0,0 +1,894 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "perf_precomp.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace opencv_test {
// Flops_Kernel_Input_OutCN_Group_Stride_Pad_Dilation_PadAdjust_PadMode_Bias
struct TestSize_ {
int width, height;
operator Size() const { return Size(width, height); }
};
struct ConvParam_t {
struct TestSize_ kernel;
struct BlobShape { int dims[4]; } shapeIn;
int outCN;
int groups;
struct TestSize_ stride;
struct TestSize_ dilation;
struct TestSize_ pad;
struct TestSize_ padAdjust;
const char* padMode;
bool hasBias;
double declared_flops;
};
// Details: #12142
// Last update: 2021-09
static const ConvParam_t testConvolutionConfigs[] = {
/* GFLOPS 3.398 x 20 = 67.956 */ {{7, 7}, {{1, 128, 46, 46}}, 128, 1, {1, 1}, {1, 1}, {3, 3}, {0, 0}, "", true, 3397788160.},
/* GFLOPS 16.987 x 3 = 50.962 */ {{5, 5}, {{1, 1152, 16, 16}}, 1152, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 16987226112.},
/* GFLOPS 23.122 x 2 = 46.244 */ {{5, 5}, {{1, 672, 32, 32}}, 672, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 23121788928.},
/* GFLOPS 9.987 x 3 = 29.960 */ {{3, 3}, {{1, 256, 92, 92}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9986707456.},
/* GFLOPS 1.595 x 16 = 25.524 */ {{3, 3}, {{1, 256, 26, 26}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595230208.},
/* GFLOPS 4.566 x 5 = 22.828 */ {{7, 7}, {{1, 172, 46, 46}}, 128, 1, {1, 1}, {1, 1}, {3, 3}, {0, 0}, "", true, 4565684736.},
/* GFLOPS 1.596 x 14 = 22.338 */ {{3, 3}, {{1, 128, 52, 52}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595576320.},
/* GFLOPS 1.595 x 12 = 19.141 */ {{3, 3}, {{1, 512, 13, 13}}, 1024, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595057152.},
/* GFLOPS 6.814 x 2 = 13.629 */ {{3, 3}, {{1, 512, 38, 38}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 6814386176.},
/* GFLOPS 6.637 x 2 = 13.274 */ {{3, 3}, {{1, 256, 75, 75}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 6636960000.},
/* GFLOPS 11.797 x 1 = 11.797 */ {{5, 5}, {{1, 240, 64, 64}}, 240, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 11797463040.},
/* GFLOPS 11.797 x 1 = 11.797 */ {{5, 5}, {{1, 480, 32, 32}}, 480, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 11796971520.},
/* GFLOPS 10.701 x 1 = 10.701 */ {{3, 3}, {{1, 512, 38, 38}}, 804, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 10700715792.},
/* GFLOPS 10.087 x 1 = 10.087 */ {{3, 3}, {{1, 576, 38, 50}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 10086963200.},
/* GFLOPS 9.993 x 1 = 9.993 */ {{3, 3}, {{1, 64, 368, 368}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9993207808.},
/* GFLOPS 9.989 x 1 = 9.989 */ {{3, 3}, {{1, 128, 184, 184}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9988874240.},
/* GFLOPS 9.986 x 1 = 9.986 */ {{3, 3}, {{1, 512, 46, 46}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9985624064.},
/* GFLOPS 1.704 x 5 = 8.518 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 512, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1703596544.},
/* GFLOPS 1.704 x 5 = 8.518 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1703596544.},
/* GFLOPS 4.247 x 2 = 8.494 */ {{3, 3}, {{1, 480, 32, 32}}, 480, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 4247224320.},
/* GFLOPS 8.025 x 1 = 8.025 */ {{3, 3}, {{1, 1024, 19, 19}}, 1206, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 8025101478.},
/* GFLOPS 0.798 x 9 = 7.180 */ {{3, 3}, {{1, 128, 52, 52}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 797788160.},
/* GFLOPS 0.798 x 9 = 7.179 */ {{3, 3}, {{1, 256, 26, 26}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 797615104.},
/* GFLOPS 6.641 x 1 = 6.641 */ {{3, 3}, {{1, 64, 300, 300}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 6641280000.},
/* GFLOPS 6.641 x 1 = 6.641 */ {{3, 3}, {{1, 64, 150, 200}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 6641280000.},
/* GFLOPS 6.638 x 1 = 6.638 */ {{3, 3}, {{1, 128, 150, 150}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 6638400000.},
/* GFLOPS 6.118 x 1 = 6.118 */ {{3, 3}, {{1, 144, 128, 128}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 6117654528.},
/* GFLOPS 6.116 x 1 = 6.116 */ {{3, 3}, {{1, 1152, 16, 16}}, 1152, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 6115590144.},
/* GFLOPS 5.780 x 1 = 5.780 */ {{5, 5}, {{1, 672, 32, 32}}, 672, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 5780447232.},
/* GFLOPS 1.704 x 3 = 5.111 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1703596544.},
/* GFLOPS 4.997 x 1 = 4.997 */ {{3, 3}, {{1, 64, 184, 184}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 4996603904.},
/* GFLOPS 4.994 x 1 = 4.994 */ {{3, 3}, {{1, 128, 92, 92}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 4994437120.},
/* GFLOPS 4.993 x 1 = 4.993 */ {{3, 3}, {{1, 256, 46, 46}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 4993353728.},
/* GFLOPS 4.993 x 1 = 4.993 */ {{3, 3}, {{1, 512, 46, 46}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 4992812032.},
/* GFLOPS 1.659 x 3 = 4.977 */ {{3, 3}, {{1, 960, 10, 10}}, 960, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1658976000.},
/* GFLOPS 2.156 x 2 = 4.312 */ {{3, 3}, {{1, 576, 19, 19}}, 576, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 2156088384.},
/* GFLOPS 4.247 x 1 = 4.247 */ {{5, 5}, {{1, 144, 128, 128}}, 144, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 4247322624.},
/* GFLOPS 0.798 x 5 = 3.988 */ {{3, 3}, {{1, 512, 13, 13}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 797528576.},
/* GFLOPS 0.958 x 4 = 3.833 */ {{3, 3}, {{1, 384, 19, 19}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 958307712.},
/* GFLOPS 0.624 x 6 = 3.746 */ {{3, 3}, {{1, 128, 46, 46}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 624304640.},
/* GFLOPS 3.408 x 1 = 3.408 */ {{3, 3}, {{1, 256, 38, 38}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 3407562752.},
/* GFLOPS 3.407 x 1 = 3.407 */ {{3, 3}, {{1, 512, 19, 19}}, 1024, 1, {1, 1}, {6, 6}, {6, 6}, {0, 0}, "", true, 3407193088.},
/* GFLOPS 0.177 x 19 = 3.370 */ {{1, 1}, {{1, 512, 26, 26}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 177382400.},
/* GFLOPS 0.302 x 11 = 3.325 */ {{3, 3}, {{1, 64, 64, 64}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 302252032.},
/* GFLOPS 3.321 x 1 = 3.321 */ {{3, 3}, {{1, 64, 150, 150}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 3320640000.},
/* GFLOPS 0.830 x 4 = 3.321 */ {{3, 3}, {{1, 64, 75, 100}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 830160000.},
/* GFLOPS 3.319 x 1 = 3.319 */ {{3, 3}, {{1, 128, 75, 75}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 3319200000.},
/* GFLOPS 1.598 x 2 = 3.195 */ {{3, 3}, {{1, 32, 416, 416}}, 64, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 1597652992.},
/* GFLOPS 1.598 x 2 = 3.195 */ {{3, 3}, {{1, 32, 208, 208}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1597652992.},
/* GFLOPS 1.596 x 2 = 3.193 */ {{3, 3}, {{1, 64, 208, 208}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 1596268544.},
/* GFLOPS 1.596 x 2 = 3.193 */ {{3, 3}, {{1, 64, 104, 104}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1596268544.},
/* GFLOPS 1.596 x 2 = 3.191 */ {{3, 3}, {{1, 128, 104, 104}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595576320.},
/* GFLOPS 1.595 x 2 = 3.190 */ {{3, 3}, {{1, 256, 52, 52}}, 512, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595230208.},
/* GFLOPS 1.595 x 2 = 3.190 */ {{3, 3}, {{1, 512, 26, 26}}, 1024, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595057152.},
/* GFLOPS 0.178 x 16 = 2.841 */ {{1, 1}, {{1, 256, 52, 52}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 177555456.},
/* GFLOPS 2.719 x 1 = 2.719 */ {{3, 3}, {{1, 96, 256, 256}}, 96, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 2719481856.},
/* GFLOPS 0.177 x 15 = 2.659 */ {{1, 1}, {{1, 1024, 13, 13}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 177295872.},
/* GFLOPS 1.245 x 2 = 2.490 */ {{3, 3}, {{1, 96, 75, 100}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1244880000.},
/* GFLOPS 0.798 x 3 = 2.394 */ {{3, 3}, {{1, 64, 104, 104}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 798134272.},
/* GFLOPS 0.472 x 5 = 2.360 */ {{3, 3}, {{1, 256, 20, 20}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 471961600.},
/* GFLOPS 2.255 x 1 = 2.255 */ {{3, 3}, {{1, 128, 80, 100}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2255285760.},
/* GFLOPS 2.153 x 1 = 2.153 */ {{3, 3}, {{1, 128, 78, 98}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2152611840.},
/* GFLOPS 2.100 x 1 = 2.100 */ {{3, 3}, {{1, 144, 75, 75}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 2100330000.},
/* GFLOPS 2.052 x 1 = 2.052 */ {{3, 3}, {{1, 128, 76, 96}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2052298240.},
/* GFLOPS 1.022 x 2 = 2.044 */ {{3, 3}, {{1, 576, 19, 19}}, 273, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1021896057.},
/* GFLOPS 1.995 x 1 = 1.995 */ {{9, 9}, {{1, 3, 320, 400}}, 32, 1, {1, 1}, {1, 1}, {4, 4}, {0, 0}, "", true, 1994752000.},
/* GFLOPS 1.954 x 1 = 1.954 */ {{3, 3}, {{1, 128, 74, 94}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1954344960.},
/* GFLOPS 0.958 x 2 = 1.917 */ {{3, 3}, {{1, 192, 38, 38}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 958446336.},
/* GFLOPS 1.888 x 1 = 1.888 */ {{3, 3}, {{1, 1024, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1887539200.},
/* GFLOPS 1.888 x 1 = 1.888 */ {{3, 3}, {{1, 1024, 10, 10}}, 1024, 1024, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1887539200.},
/* GFLOPS 1.859 x 1 = 1.859 */ {{3, 3}, {{1, 128, 72, 92}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1858752000.},
/* GFLOPS 1.766 x 1 = 1.766 */ {{3, 3}, {{1, 128, 70, 90}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1765519360.},
/* GFLOPS 1.704 x 1 = 1.704 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 256, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1703781376.},
/* GFLOPS 1.704 x 1 = 1.704 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1703781376.},
/* GFLOPS 1.675 x 1 = 1.675 */ {{3, 3}, {{1, 128, 68, 88}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1674647040.},
/* GFLOPS 1.660 x 1 = 1.660 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 128, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1659600000.},
/* GFLOPS 1.660 x 1 = 1.660 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1659600000.},
/* GFLOPS 1.586 x 1 = 1.586 */ {{3, 3}, {{1, 128, 66, 86}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1586135040.},
/* GFLOPS 1.500 x 1 = 1.500 */ {{3, 3}, {{1, 128, 64, 84}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1499983360.},
/* GFLOPS 1.416 x 1 = 1.416 */ {{3, 3}, {{1, 128, 62, 82}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1416192000.},
/* GFLOPS 0.472 x 3 = 1.416 */ {{3, 3}, {{1, 128, 40, 40}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 472064000.},
/* GFLOPS 0.472 x 3 = 1.416 */ {{3, 3}, {{1, 512, 10, 10}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 471910400.},
/* GFLOPS 0.280 x 5 = 1.402 */ {{1, 1}, {{1, 576, 38, 50}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 280409600.},
/* GFLOPS 0.701 x 2 = 1.401 */ {{3, 3}, {{1, 128, 38, 50}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 700720000.},
/* GFLOPS 0.231 x 6 = 1.388 */ {{3, 3}, {{1, 128, 56, 56}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231311360.},
/* GFLOPS 0.231 x 6 = 1.388 */ {{3, 3}, {{1, 256, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231261184.},
/* GFLOPS 0.210 x 6 = 1.262 */ {{1, 1}, {{1, 576, 38, 50}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 210307200.},
/* GFLOPS 0.420 x 3 = 1.261 */ {{3, 3}, {{1, 96, 38, 50}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 420492800.},
/* GFLOPS 1.261 x 1 = 1.261 */ {{3, 3}, {{1, 192, 38, 50}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1261113600.},
/* GFLOPS 1.258 x 1 = 1.258 */ {{3, 3}, {{1, 1280, 10, 10}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1258038600.},
/* GFLOPS 1.248 x 1 = 1.248 */ {{3, 3}, {{1, 256, 46, 46}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1248338432.},
/* GFLOPS 1.245 x 1 = 1.245 */ {{3, 3}, {{1, 64, 75, 75}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1245240000.},
/* GFLOPS 1.210 x 1 = 1.210 */ {{3, 3}, {{1, 32, 256, 256}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1210056704.},
/* GFLOPS 1.196 x 1 = 1.196 */ {{3, 3}, {{1, 384, 26, 26}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1196336128.},
/* GFLOPS 1.195 x 1 = 1.195 */ {{9, 9}, {{1, 32, 240, 320}}, 3, 1, {1, 1}, {1, 1}, {4, 4}, {0, 0}, "", true, 1194624000.},
/* GFLOPS 1.182 x 1 = 1.182 */ {{3, 3}, {{1, 32, 320, 400}}, 64, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 1181696000.},
/* GFLOPS 1.181 x 1 = 1.181 */ {{3, 3}, {{1, 64, 160, 200}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 1180672000.},
/* GFLOPS 0.561 x 2 = 1.121 */ {{3, 3}, {{1, 128, 38, 50}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 560576000.},
/* GFLOPS 1.112 x 1 = 1.112 */ {{3, 3}, {{1, 512, 10, 10}}, 1206, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1111570200.},
/* GFLOPS 0.357 x 3 = 1.072 */ {{1, 1}, {{1, 64, 208, 208}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 357187584.},
/* GFLOPS 1.062 x 1 = 1.062 */ {{3, 3}, {{1, 240, 64, 64}}, 240, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1061928960.},
/* GFLOPS 0.076 x 14 = 1.058 */ {{3, 3}, {{1, 64, 32, 32}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 75563008.},
/* GFLOPS 1.051 x 1 = 1.051 */ {{3, 3}, {{1, 160, 38, 50}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1050988800.},
/* GFLOPS 0.210 x 5 = 1.051 */ {{1, 1}, {{1, 256, 20, 20}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 210124800.},
/* GFLOPS 0.210 x 5 = 1.049 */ {{1, 1}, {{1, 1024, 20, 20}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 209817600.},
/* GFLOPS 1.006 x 1 = 1.006 */ {{3, 3}, {{1, 1024, 10, 10}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1006441800.},
/* GFLOPS 0.246 x 4 = 0.985 */ {{1, 1}, {{1, 256, 75, 100}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 246240000.},
/* GFLOPS 0.189 x 5 = 0.947 */ {{1, 1}, {{1, 512, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 189452800.},
/* GFLOPS 0.189 x 5 = 0.947 */ {{1, 1}, {{1, 512, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 189452800.},
/* GFLOPS 0.472 x 2 = 0.945 */ {{3, 3}, {{1, 64, 80, 80}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 472268800.},
/* GFLOPS 0.934 x 1 = 0.934 */ {{3, 3}, {{1, 96, 150, 150}}, 96, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 933660000.},
/* GFLOPS 0.231 x 4 = 0.925 */ {{3, 3}, {{1, 128, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231311360.},
/* GFLOPS 0.896 x 1 = 0.896 */ {{5, 5}, {{1, 96, 27, 27}}, 256, 2, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 895981824.},
/* GFLOPS 0.089 x 10 = 0.890 */ {{1, 1}, {{1, 128, 52, 52}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 88950784.},
/* GFLOPS 0.089 x 10 = 0.888 */ {{1, 1}, {{1, 256, 26, 26}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 88777728.},
/* GFLOPS 0.876 x 1 = 0.876 */ {{3, 3}, {{1, 160, 38, 50}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 875824000.},
/* GFLOPS 0.850 x 1 = 0.850 */ {{7, 7}, {{1, 3, 600, 800}}, 24, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 849600000.},
/* GFLOPS 0.841 x 1 = 0.841 */ {{3, 3}, {{1, 128, 38, 50}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 840864000.},
/* GFLOPS 0.415 x 2 = 0.831 */ {{3, 3}, {{1, 32, 150, 150}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 415440000.},
/* GFLOPS 0.757 x 1 = 0.757 */ {{1, 1}, {{1, 1024, 19, 19}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 757441536.},
/* GFLOPS 0.712 x 1 = 0.712 */ {{1, 1}, {{1, 128, 208, 208}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 711606272.},
/* GFLOPS 0.178 x 4 = 0.712 */ {{1, 1}, {{1, 128, 104, 104}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 177901568.},
/* GFLOPS 0.354 x 2 = 0.707 */ {{1, 1}, {{1, 256, 52, 52}}, 255, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 353723760.},
/* GFLOPS 0.351 x 2 = 0.701 */ {{1, 1}, {{1, 576, 38, 50}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 350512000.},
/* GFLOPS 0.701 x 1 = 0.701 */ {{3, 3}, {{1, 128, 75, 100}}, 160, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 700720000.},
/* GFLOPS 0.694 x 1 = 0.694 */ {{3, 3}, {{1, 64, 56, 56}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 694235136.},
/* GFLOPS 0.694 x 1 = 0.694 */ {{3, 3}, {{1, 64, 56, 56}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 694235136.},
/* GFLOPS 0.231 x 3 = 0.694 */ {{3, 3}, {{1, 64, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231411712.},
/* GFLOPS 0.058 x 12 = 0.694 */ {{3, 3}, {{1, 128, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 57827840.},
/* GFLOPS 0.231 x 3 = 0.694 */ {{3, 3}, {{1, 512, 7, 7}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231236096.},
/* GFLOPS 0.160 x 4 = 0.639 */ {{3, 3}, {{1, 64, 38, 38}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 159833472.},
/* GFLOPS 0.211 x 3 = 0.634 */ {{1, 1}, {{1, 64, 80, 80}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 211353600.},
/* GFLOPS 0.211 x 3 = 0.632 */ {{1, 1}, {{1, 128, 40, 40}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 210534400.},
/* GFLOPS 0.210 x 3 = 0.630 */ {{1, 1}, {{1, 512, 40, 40}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 209920000.},
/* GFLOPS 0.210 x 3 = 0.630 */ {{1, 1}, {{1, 512, 10, 10}}, 2048, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 209920000.},
/* GFLOPS 0.103 x 6 = 0.618 */ {{1, 1}, {{1, 256, 14, 14}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102961152.},
/* GFLOPS 0.615 x 1 = 0.615 */ {{1, 1}, {{1, 320, 75, 100}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 615360000.},
/* GFLOPS 0.305 x 2 = 0.609 */ {{3, 3}, {{1, 3, 416, 416}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 304578560.},
/* GFLOPS 0.597 x 1 = 0.597 */ {{3, 3}, {{1, 576, 19, 19}}, 576, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 597254400.},
/* GFLOPS 0.278 x 2 = 0.557 */ {{1, 1}, {{1, 128, 46, 46}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 278431744.},
/* GFLOPS 0.185 x 3 = 0.554 */ {{1, 1}, {{1, 192, 75, 100}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 184800000.},
/* GFLOPS 0.553 x 1 = 0.553 */ {{3, 3}, {{1, 64, 75, 100}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 553440000.},
/* GFLOPS 0.539 x 1 = 0.539 */ {{3, 3}, {{1, 144, 75, 75}}, 144, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 539178048.},
/* GFLOPS 0.103 x 5 = 0.514 */ {{1, 1}, {{1, 1024, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102810624.},
/* GFLOPS 0.491 x 1 = 0.491 */ {{1, 1}, {{1, 576, 38, 50}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 490716800.},
/* GFLOPS 0.483 x 1 = 0.483 */ {{7, 7}, {{1, 3, 320, 320}}, 64, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", false, 483328000.},
/* GFLOPS 0.240 x 2 = 0.479 */ {{3, 3}, {{1, 96, 38, 38}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 239680896.},
/* GFLOPS 0.477 x 1 = 0.477 */ {{3, 3}, {{1, 3, 368, 368}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 476692480.},
/* GFLOPS 0.237 x 2 = 0.474 */ {{7, 7}, {{1, 3, 224, 224}}, 64, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", true, 236830720.},
/* GFLOPS 0.472 x 1 = 0.472 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 512, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 471910400.},
/* GFLOPS 0.472 x 1 = 0.472 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 471910400.},
/* GFLOPS 0.155 x 3 = 0.464 */ {{1, 1}, {{1, 112, 32, 32}}, 672, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 154828800.},
/* GFLOPS 0.114 x 4 = 0.454 */ {{1, 1}, {{1, 192, 16, 16}}, 1152, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 113541120.},
/* GFLOPS 0.449 x 1 = 0.449 */ {{3, 3}, {{1, 384, 13, 13}}, 384, 2, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 448626048.},
/* GFLOPS 0.089 x 5 = 0.443 */ {{1, 1}, {{1, 512, 13, 13}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 88691200.},
/* GFLOPS 0.428 x 1 = 0.428 */ {{1, 1}, {{1, 64, 64, 64}}, 810, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 427991040.},
/* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 128, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 426037760.},
/* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 426037760.},
/* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 128, 38, 38}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 426037760.},
/* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 256, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 425945344.},
/* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 425945344.},
/* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 256, 19, 19}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 425945344.},
/* GFLOPS 0.421 x 1 = 0.421 */ {{1, 1}, {{1, 576, 38, 50}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 420614400.},
/* GFLOPS 0.420 x 1 = 0.420 */ {{1, 1}, {{1, 256, 40, 40}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 420249600.},
/* GFLOPS 0.210 x 2 = 0.420 */ {{1, 1}, {{1, 256, 80, 80}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 210124800.},
/* GFLOPS 0.420 x 1 = 0.420 */ {{1, 1}, {{1, 512, 20, 20}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 419840000.},
/* GFLOPS 0.420 x 1 = 0.420 */ {{1, 1}, {{1, 1024, 10, 10}}, 2048, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 419635200.},
/* GFLOPS 0.210 x 2 = 0.420 */ {{1, 1}, {{1, 2048, 10, 10}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 209766400.},
/* GFLOPS 0.415 x 1 = 0.415 */ {{3, 3}, {{1, 32, 150, 150}}, 32, 32, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 415440000.},
/* GFLOPS 0.415 x 1 = 0.415 */ {{3, 3}, {{1, 64, 150, 150}}, 64, 64, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 415080000.},
/* GFLOPS 0.415 x 1 = 0.415 */ {{3, 3}, {{1, 64, 150, 150}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 415080000.},
/* GFLOPS 0.104 x 4 = 0.414 */ {{1, 1}, {{1, 64, 56, 56}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 103563264.},
/* GFLOPS 0.103 x 4 = 0.413 */ {{1, 1}, {{1, 128, 28, 28}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 103161856.},
/* GFLOPS 0.399 x 1 = 0.399 */ {{3, 3}, {{1, 32, 208, 208}}, 64, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 399413248.},
/* GFLOPS 0.200 x 2 = 0.399 */ {{3, 3}, {{1, 32, 104, 104}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 199706624.},
/* GFLOPS 0.200 x 2 = 0.399 */ {{3, 3}, {{1, 64, 52, 52}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 199533568.},
/* GFLOPS 0.399 x 1 = 0.399 */ {{3, 3}, {{1, 128, 52, 52}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 398894080.},
/* GFLOPS 0.199 x 2 = 0.399 */ {{3, 3}, {{1, 128, 26, 26}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 199447040.},
/* GFLOPS 0.399 x 1 = 0.399 */ {{3, 3}, {{1, 256, 26, 26}}, 512, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 398807552.},
/* GFLOPS 0.399 x 1 = 0.399 */ {{3, 3}, {{1, 256, 13, 13}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 398807552.},
/* GFLOPS 0.376 x 1 = 0.376 */ {{1, 1}, {{1, 24, 300, 400}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 376320000.},
/* GFLOPS 0.179 x 2 = 0.357 */ {{1, 1}, {{1, 64, 208, 208}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 178593792.},
/* GFLOPS 0.089 x 4 = 0.357 */ {{1, 1}, {{1, 64, 104, 104}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 89296896.},
/* GFLOPS 0.356 x 1 = 0.356 */ {{1, 1}, {{1, 128, 104, 104}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 355803136.},
/* GFLOPS 0.355 x 1 = 0.355 */ {{1, 1}, {{1, 256, 52, 52}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 355110912.},
/* GFLOPS 0.355 x 1 = 0.355 */ {{1, 1}, {{1, 512, 26, 26}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 354764800.},
/* GFLOPS 0.355 x 1 = 0.355 */ {{1, 1}, {{1, 1024, 13, 13}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 354591744.},
/* GFLOPS 0.355 x 1 = 0.355 */ {{1, 1}, {{1, 2048, 13, 13}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 354505216.},
/* GFLOPS 0.177 x 2 = 0.353 */ {{1, 1}, {{1, 512, 26, 26}}, 255, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 176689500.},
/* GFLOPS 0.070 x 5 = 0.348 */ {{1, 1}, {{1, 128, 46, 46}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 69607936.},
/* GFLOPS 0.347 x 1 = 0.347 */ {{3, 3}, {{1, 128, 28, 28}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 346967040.},
/* GFLOPS 0.347 x 1 = 0.347 */ {{3, 3}, {{1, 128, 28, 28}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 346967040.},
/* GFLOPS 0.014 x 24 = 0.347 */ {{3, 3}, {{1, 128, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 14456960.},
/* GFLOPS 0.113 x 3 = 0.340 */ {{1, 1}, {{1, 1152, 16, 16}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 113295360.},
/* GFLOPS 0.053 x 6 = 0.320 */ {{1, 1}, {{1, 576, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 53277824.},
/* GFLOPS 0.319 x 1 = 0.319 */ {{3, 3}, {{1, 192, 19, 19}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 319482112.},
/* GFLOPS 0.317 x 1 = 0.317 */ {{3, 3}, {{1, 3, 300, 300}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 316800000.},
/* GFLOPS 0.315 x 1 = 0.315 */ {{3, 3}, {{1, 96, 75, 100}}, 96, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 315369600.},
/* GFLOPS 0.103 x 3 = 0.309 */ {{1, 1}, {{1, 512, 7, 7}}, 2048, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102860800.},
/* GFLOPS 0.103 x 3 = 0.309 */ {{1, 1}, {{1, 512, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102860800.},
/* GFLOPS 0.154 x 2 = 0.309 */ {{1, 1}, {{1, 672, 32, 32}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 154255360.},
/* GFLOPS 0.308 x 1 = 0.308 */ {{1, 1}, {{1, 320, 75, 100}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 307680000.},
/* GFLOPS 0.034 x 9 = 0.304 */ {{1, 1}, {{1, 64, 64, 64}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 33816576.},
/* GFLOPS 0.299 x 1 = 0.299 */ {{3, 3}, {{1, 256, 13, 13}}, 384, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 299105664.},
/* GFLOPS 0.299 x 1 = 0.299 */ {{3, 3}, {{1, 384, 13, 13}}, 256, 2, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 299084032.},
/* GFLOPS 0.017 x 17 = 0.290 */ {{1, 1}, {{1, 32, 32, 64}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 17039360.},
/* GFLOPS 0.017 x 16 = 0.269 */ {{1, 1}, {{1, 128, 32, 64}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 16842752.},
/* GFLOPS 0.133 x 2 = 0.266 */ {{3, 3}, {{1, 128, 19, 19}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 133136800.},
/* GFLOPS 0.266 x 1 = 0.266 */ {{1, 1}, {{1, 384, 52, 52}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 266160128.},
/* GFLOPS 0.266 x 1 = 0.266 */ {{1, 1}, {{1, 768, 26, 26}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 265987072.},
/* GFLOPS 0.038 x 7 = 0.265 */ {{3, 3}, {{1, 16, 64, 128}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 37879808.},
/* GFLOPS 0.019 x 14 = 0.264 */ {{3, 3}, {{1, 64, 16, 16}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 18890752.},
/* GFLOPS 0.262 x 1 = 0.262 */ {{1, 1}, {{1, 2560, 20, 20}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 262195200.},
/* GFLOPS 0.126 x 2 = 0.252 */ {{3, 3}, {{1, 512, 5, 5}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 125812050.},
/* GFLOPS 0.248 x 1 = 0.248 */ {{1, 1}, {{1, 64, 150, 200}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 247680000.},
/* GFLOPS 0.040 x 6 = 0.240 */ {{1, 1}, {{1, 576, 19, 19}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 39958368.},
/* GFLOPS 0.080 x 3 = 0.240 */ {{3, 3}, {{1, 96, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 79893632.},
/* GFLOPS 0.240 x 1 = 0.240 */ {{3, 3}, {{1, 192, 38, 38}}, 192, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 239611584.},
/* GFLOPS 0.240 x 1 = 0.240 */ {{3, 3}, {{1, 192, 19, 19}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 239611584.},
/* GFLOPS 0.079 x 3 = 0.237 */ {{1, 1}, {{1, 80, 32, 32}}, 480, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 79134720.},
/* GFLOPS 0.237 x 1 = 0.237 */ {{7, 7}, {{1, 3, 224, 224}}, 64, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", false, 236830720.},
/* GFLOPS 0.237 x 1 = 0.237 */ {{7, 7}, {{1, 3, 224, 224}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 236830720.},
/* GFLOPS 0.118 x 2 = 0.236 */ {{3, 3}, {{1, 32, 80, 80}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 118169600.},
/* GFLOPS 0.236 x 1 = 0.236 */ {{3, 3}, {{1, 256, 19, 19}}, 512, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 235980800.},
/* GFLOPS 0.116 x 2 = 0.231 */ {{1, 1}, {{1, 24, 128, 128}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 115605504.},
/* GFLOPS 0.111 x 2 = 0.221 */ {{3, 3}, {{1, 192, 10, 10}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 110624000.},
/* GFLOPS 0.213 x 1 = 0.213 */ {{3, 3}, {{1, 128, 38, 38}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 213018880.},
/* GFLOPS 0.213 x 1 = 0.213 */ {{3, 3}, {{1, 128, 19, 19}}, 256, 1, {1, 1}, {2, 2}, {2, 2}, {0, 0}, "", false, 213018880.},
/* GFLOPS 0.107 x 2 = 0.213 */ {{3, 3}, {{1, 128, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 106509440.},
/* GFLOPS 0.213 x 1 = 0.213 */ {{3, 3}, {{1, 256, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 212972672.},
/* GFLOPS 0.213 x 1 = 0.213 */ {{3, 3}, {{1, 512, 38, 38}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 212949568.},
/* GFLOPS 0.212 x 1 = 0.212 */ {{7, 7}, {{1, 3, 300, 300}}, 32, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", true, 212400000.},
/* GFLOPS 0.211 x 1 = 0.211 */ {{11, 11}, {{1, 3, 227, 227}}, 96, 1, {4, 4}, {1, 1}, {0, 0}, {0, 0}, "", true, 211120800.},
/* GFLOPS 0.210 x 1 = 0.210 */ {{3, 3}, {{1, 64, 38, 50}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 210307200.},
/* GFLOPS 0.210 x 1 = 0.210 */ {{1, 1}, {{1, 1024, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 209817600.},
/* GFLOPS 0.210 x 1 = 0.210 */ {{1, 1}, {{1, 1024, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 209817600.},
/* GFLOPS 0.104 x 2 = 0.208 */ {{3, 3}, {{1, 32, 75, 75}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 103860000.},
/* GFLOPS 0.208 x 1 = 0.208 */ {{1, 1}, {{1, 16, 256, 256}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 207618048.},
/* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 256, 56, 56}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 205922304.},
/* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 256, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 205922304.},
/* GFLOPS 0.103 x 2 = 0.206 */ {{1, 1}, {{1, 256, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102961152.},
/* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 512, 28, 28}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 205721600.},
/* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 512, 28, 28}}, 1024, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 205721600.},
/* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 1024, 14, 14}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 205621248.},
/* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 1024, 14, 14}}, 2048, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 205621248.},
/* GFLOPS 0.103 x 2 = 0.206 */ {{1, 1}, {{1, 2048, 7, 7}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102785536.},
/* GFLOPS 0.201 x 1 = 0.201 */ {{1, 1}, {{1, 512, 14, 14}}, 1000, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 200900000.},
/* GFLOPS 0.200 x 1 = 0.200 */ {{3, 3}, {{1, 160, 19, 19}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 199687872.},
/* GFLOPS 0.190 x 1 = 0.190 */ {{1, 1}, {{1, 256, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 189637632.},
/* GFLOPS 0.190 x 1 = 0.190 */ {{1, 1}, {{1, 256, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 189637632.},
/* GFLOPS 0.047 x 4 = 0.190 */ {{1, 1}, {{1, 256, 38, 38}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 47409408.},
/* GFLOPS 0.189 x 1 = 0.189 */ {{1, 1}, {{1, 1024, 19, 19}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 189360384.},
/* GFLOPS 0.038 x 5 = 0.189 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 37814272.},
/* GFLOPS 0.189 x 1 = 0.189 */ {{1, 1}, {{1, 1152, 16, 16}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 188825600.},
/* GFLOPS 0.185 x 1 = 0.185 */ {{1, 1}, {{1, 128, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 185040000.},
/* GFLOPS 0.185 x 1 = 0.185 */ {{1, 1}, {{1, 128, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 185040000.},
/* GFLOPS 0.181 x 1 = 0.181 */ {{3, 3}, {{1, 160, 14, 14}}, 320, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 180696320.},
/* GFLOPS 0.181 x 1 = 0.181 */ {{3, 3}, {{1, 160, 14, 14}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 180696320.},
/* GFLOPS 0.090 x 2 = 0.181 */ {{3, 3}, {{1, 224, 10, 10}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 90339200.},
/* GFLOPS 0.180 x 1 = 0.180 */ {{1, 1}, {{1, 224, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 180232192.},
/* GFLOPS 0.088 x 2 = 0.177 */ {{1, 1}, {{1, 1024, 13, 13}}, 255, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 88301655.},
/* GFLOPS 0.174 x 1 = 0.174 */ {{3, 3}, {{1, 96, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 173508608.},
/* GFLOPS 0.174 x 1 = 0.174 */ {{3, 3}, {{1, 96, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 173508608.},
/* GFLOPS 0.166 x 1 = 0.166 */ {{3, 3}, {{1, 160, 19, 19}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 166406560.},
/* GFLOPS 0.080 x 2 = 0.160 */ {{1, 1}, {{1, 576, 19, 19}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 79916736.},
/* GFLOPS 0.160 x 1 = 0.160 */ {{3, 3}, {{1, 128, 19, 19}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 159764160.},
/* GFLOPS 0.160 x 1 = 0.160 */ {{3, 3}, {{1, 1024, 19, 19}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 159703512.},
/* GFLOPS 0.159 x 1 = 0.159 */ {{7, 7}, {{1, 3, 300, 300}}, 24, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 159300000.},
/* GFLOPS 0.080 x 2 = 0.159 */ {{1, 1}, {{1, 40, 64, 64}}, 240, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 79626240.},
/* GFLOPS 0.079 x 2 = 0.157 */ {{1, 1}, {{1, 480, 32, 32}}, 80, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 78725120.},
/* GFLOPS 0.155 x 1 = 0.155 */ {{1, 1}, {{1, 192, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 154542080.},
/* GFLOPS 0.146 x 1 = 0.146 */ {{3, 3}, {{1, 144, 14, 14}}, 288, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 146369664.},
/* GFLOPS 0.146 x 1 = 0.146 */ {{3, 3}, {{1, 144, 14, 14}}, 288, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 146369664.},
/* GFLOPS 0.072 x 2 = 0.144 */ {{1, 1}, {{1, 1024, 10, 10}}, 352, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 72124800.},
/* GFLOPS 0.140 x 1 = 0.140 */ {{1, 1}, {{1, 576, 38, 50}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 140204800.},
/* GFLOPS 0.139 x 1 = 0.139 */ {{3, 3}, {{1, 256, 5, 5}}, 1206, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 138961350.},
/* GFLOPS 0.017 x 8 = 0.138 */ {{1, 1}, {{1, 16, 64, 128}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 17301504.},
/* GFLOPS 0.067 x 2 = 0.133 */ {{1, 1}, {{1, 576, 19, 19}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 66597280.},
/* GFLOPS 0.133 x 1 = 0.133 */ {{3, 3}, {{1, 128, 38, 38}}, 160, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 133136800.},
/* GFLOPS 0.044 x 3 = 0.133 */ {{1, 1}, {{1, 512, 13, 13}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 44345600.},
/* GFLOPS 0.129 x 1 = 0.129 */ {{1, 1}, {{1, 160, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 128851968.},
/* GFLOPS 0.128 x 1 = 0.128 */ {{3, 3}, {{1, 64, 24, 24}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 127512576.},
/* GFLOPS 0.120 x 1 = 0.120 */ {{5, 5}, {{1, 32, 28, 28}}, 96, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 120497664.},
/* GFLOPS 0.120 x 1 = 0.120 */ {{5, 5}, {{1, 32, 28, 28}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 120497664.},
/* GFLOPS 0.040 x 3 = 0.120 */ {{1, 1}, {{1, 96, 19, 19}}, 576, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 40131648.},
/* GFLOPS 0.118 x 1 = 0.118 */ {{1, 1}, {{1, 320, 38, 38}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 118477312.},
/* GFLOPS 0.017 x 7 = 0.118 */ {{1, 1}, {{1, 64, 64, 128}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 16908288.},
/* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 64, 80, 80}}, 64, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 118067200.},
/* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 64, 40, 40}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 118067200.},
/* GFLOPS 0.039 x 3 = 0.118 */ {{1, 1}, {{1, 1024, 10, 10}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 39340800.},
/* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 128, 40, 40}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 118016000.},
/* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 128, 20, 20}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 118016000.},
/* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 256, 20, 20}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 117990400.},
/* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 256, 19, 19}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 117990400.},
/* GFLOPS 0.058 x 2 = 0.116 */ {{3, 3}, {{1, 16, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 58003456.},
/* GFLOPS 0.058 x 2 = 0.116 */ {{3, 3}, {{1, 32, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 57903104.},
/* GFLOPS 0.058 x 2 = 0.116 */ {{3, 3}, {{1, 64, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 57852928.},
/* GFLOPS 0.116 x 1 = 0.116 */ {{3, 3}, {{1, 128, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 115655680.},
/* GFLOPS 0.116 x 1 = 0.116 */ {{3, 3}, {{1, 128, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 115655680.},
/* GFLOPS 0.115 x 1 = 0.115 */ {{3, 3}, {{1, 3, 512, 512}}, 32, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 115343360.},
/* GFLOPS 0.114 x 1 = 0.114 */ {{1, 1}, {{1, 144, 128, 128}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 113639424.},
/* GFLOPS 0.112 x 1 = 0.112 */ {{1, 1}, {{1, 1024, 10, 10}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 111875400.},
/* GFLOPS 0.110 x 1 = 0.110 */ {{1, 1}, {{1, 480, 32, 32}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 110215168.},
/* GFLOPS 0.107 x 1 = 0.107 */ {{1, 1}, {{1, 64, 32, 32}}, 810, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 106997760.},
/* GFLOPS 0.036 x 3 = 0.107 */ {{1, 1}, {{1, 192, 38, 38}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 35580160.},
/* GFLOPS 0.107 x 1 = 0.107 */ {{3, 3}, {{1, 32, 75, 75}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 106648064.},
/* GFLOPS 0.107 x 1 = 0.107 */ {{3, 3}, {{1, 64, 38, 38}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 106555648.},
/* GFLOPS 0.105 x 1 = 0.105 */ {{1, 1}, {{1, 256, 40, 40}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 105062400.},
/* GFLOPS 0.105 x 1 = 0.105 */ {{1, 1}, {{1, 512, 20, 20}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 104960000.},
/* GFLOPS 0.105 x 1 = 0.105 */ {{1, 1}, {{1, 512, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 104960000.},
/* GFLOPS 0.105 x 1 = 0.105 */ {{1, 1}, {{1, 512, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 104960000.},
/* GFLOPS 0.105 x 1 = 0.105 */ {{1, 1}, {{1, 1024, 10, 10}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 104908800.},
/* GFLOPS 0.103 x 1 = 0.103 */ {{1, 1}, {{1, 128, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 103161856.},
/* GFLOPS 0.051 x 2 = 0.103 */ {{1, 1}, {{1, 256, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 51480576.},
/* GFLOPS 0.051 x 2 = 0.103 */ {{1, 1}, {{1, 256, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 51480576.},
/* GFLOPS 0.008 x 12 = 0.101 */ {{1, 1}, {{1, 64, 32, 32}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 8454144.},
/* GFLOPS 0.101 x 1 = 0.101 */ {{1, 1}, {{1, 512, 19, 19}}, 273, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 101016825.},
/* GFLOPS 0.096 x 1 = 0.096 */ {{1, 1}, {{1, 480, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 96438272.},
/* GFLOPS 0.095 x 1 = 0.095 */ {{1, 1}, {{1, 128, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 95003648.},
/* GFLOPS 0.095 x 1 = 0.095 */ {{1, 1}, {{1, 128, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 95003648.},
/* GFLOPS 0.095 x 1 = 0.095 */ {{1, 1}, {{1, 256, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 94818816.},
/* GFLOPS 0.095 x 1 = 0.095 */ {{1, 1}, {{1, 256, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 94818816.},
/* GFLOPS 0.094 x 1 = 0.094 */ {{1, 1}, {{1, 32, 150, 150}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 93600000.},
/* GFLOPS 0.094 x 1 = 0.094 */ {{1, 1}, {{1, 32, 150, 150}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 93600000.},
/* GFLOPS 0.093 x 1 = 0.093 */ {{1, 1}, {{1, 512, 38, 50}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 93480000.},
/* GFLOPS 0.093 x 1 = 0.093 */ {{1, 1}, {{1, 576, 19, 19}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 93236192.},
/* GFLOPS 0.093 x 1 = 0.093 */ {{1, 1}, {{1, 64, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 92880000.},
/* GFLOPS 0.093 x 1 = 0.093 */ {{1, 1}, {{1, 64, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 92880000.},
/* GFLOPS 0.031 x 3 = 0.092 */ {{1, 1}, {{1, 160, 10, 10}}, 960, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 30816000.},
/* GFLOPS 0.092 x 1 = 0.092 */ {{1, 1}, {{1, 192, 75, 100}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 92400000.},
/* GFLOPS 0.090 x 1 = 0.090 */ {{1, 1}, {{1, 448, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 90015744.},
/* GFLOPS 0.045 x 2 = 0.090 */ {{3, 3}, {{1, 576, 19, 19}}, 12, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 44918508.},
/* GFLOPS 0.044 x 2 = 0.089 */ {{1, 1}, {{1, 256, 26, 26}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 44388864.},
/* GFLOPS 0.089 x 1 = 0.089 */ {{3, 3}, {{1, 112, 14, 14}}, 224, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 88554368.},
/* GFLOPS 0.089 x 1 = 0.089 */ {{3, 3}, {{1, 112, 14, 14}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 88554368.},
/* GFLOPS 0.088 x 1 = 0.088 */ {{1, 1}, {{1, 256, 26, 26}}, 255, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 88430940.},
/* GFLOPS 0.021 x 4 = 0.084 */ {{5, 1}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {1, 1}, {2, 0}, {0, 0}, "", false, 21037056.},
/* GFLOPS 0.021 x 4 = 0.084 */ {{1, 5}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {1, 1}, {0, 2}, {0, 0}, "", true, 21037056.},
/* GFLOPS 0.084 x 1 = 0.084 */ {{1, 1}, {{1, 416, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 83593216.},
/* GFLOPS 0.082 x 1 = 0.082 */ {{1, 1}, {{1, 320, 10, 10}}, 1280, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 82048000.},
/* GFLOPS 0.040 x 2 = 0.080 */ {{1, 1}, {{1, 576, 19, 19}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39958368.},
/* GFLOPS 0.040 x 2 = 0.079 */ {{1, 1}, {{1, 24, 75, 75}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39690000.},
/* GFLOPS 0.040 x 2 = 0.079 */ {{3, 3}, {{1, 3, 300, 300}}, 32, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39600000.},
/* GFLOPS 0.079 x 1 = 0.079 */ {{1, 1}, {{1, 240, 64, 64}}, 40, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 78807040.},
/* GFLOPS 0.079 x 1 = 0.079 */ {{1, 1}, {{1, 384, 40, 40}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 78745600.},
/* GFLOPS 0.077 x 1 = 0.077 */ {{1, 1}, {{1, 96, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 77471744.},
/* GFLOPS 0.077 x 1 = 0.077 */ {{3, 3}, {{1, 192, 10, 10}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 77436800.},
/* GFLOPS 0.077 x 1 = 0.077 */ {{1, 1}, {{1, 384, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 77170688.},
/* GFLOPS 0.076 x 1 = 0.076 */ {{3, 3}, {{1, 3, 416, 416}}, 32, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 76144640.},
/* GFLOPS 0.076 x 1 = 0.076 */ {{1, 1}, {{1, 96, 128, 128}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 75890688.},
/* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {8, 8}, {8, 8}, {0, 0}, "", true, 37814272.},
/* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {4, 4}, {4, 4}, {0, 0}, "", true, 37814272.},
/* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {2, 2}, {2, 2}, {0, 0}, "", true, 37814272.},
/* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {16, 16}, {16, 16}, {0, 0}, "", true, 37814272.},
/* GFLOPS 0.018 x 4 = 0.072 */ {{1, 1}, {{1, 64, 19, 19}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 17882496.},
/* GFLOPS 0.071 x 1 = 0.071 */ {{1, 1}, {{1, 16, 150, 150}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 71280000.},
/* GFLOPS 0.071 x 1 = 0.071 */ {{1, 1}, {{1, 352, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 70748160.},
/* GFLOPS 0.071 x 1 = 0.071 */ {{1, 1}, {{1, 24, 150, 150}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 70560000.},
/* GFLOPS 0.070 x 1 = 0.070 */ {{3, 3}, {{1, 96, 14, 14}}, 208, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 70487872.},
/* GFLOPS 0.069 x 1 = 0.069 */ {{3, 3}, {{1, 96, 14, 14}}, 204, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 69132336.},
/* GFLOPS 0.068 x 1 = 0.068 */ {{1, 1}, {{1, 32, 256, 256}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 68157440.},
/* GFLOPS 0.005 x 14 = 0.066 */ {{3, 3}, {{1, 64, 8, 8}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 4722688.},
/* GFLOPS 0.066 x 1 = 0.066 */ {{1, 1}, {{1, 672, 16, 16}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 66109440.},
/* GFLOPS 0.066 x 1 = 0.066 */ {{1, 1}, {{1, 1280, 10, 10}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 65561600.},
/* GFLOPS 0.033 x 2 = 0.065 */ {{3, 3}, {{1, 48, 14, 14}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 32551680.},
/* GFLOPS 0.065 x 1 = 0.065 */ {{3, 3}, {{1, 192, 7, 7}}, 384, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 65046912.},
/* GFLOPS 0.065 x 1 = 0.065 */ {{3, 3}, {{1, 192, 7, 7}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 65046912.},
/* GFLOPS 0.065 x 1 = 0.065 */ {{3, 3}, {{1, 160, 10, 10}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 64534400.},
/* GFLOPS 0.064 x 1 = 0.064 */ {{1, 1}, {{1, 320, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 64325632.},
/* GFLOPS 0.032 x 2 = 0.064 */ {{3, 3}, {{1, 96, 12, 12}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 31868928.},
/* GFLOPS 0.061 x 1 = 0.061 */ {{1, 1}, {{1, 960, 10, 10}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 61472000.},
/* GFLOPS 0.031 x 2 = 0.061 */ {{1, 1}, {{1, 960, 10, 10}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 30736000.},
/* GFLOPS 0.061 x 1 = 0.061 */ {{1, 1}, {{1, 512, 46, 46}}, 28, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 60729200.},
/* GFLOPS 0.060 x 1 = 0.060 */ {{3, 3}, {{1, 96, 38, 38}}, 96, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 59920224.},
/* GFLOPS 0.059 x 1 = 0.059 */ {{1, 1}, {{1, 320, 38, 38}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 59238656.},
/* GFLOPS 0.059 x 1 = 0.059 */ {{3, 3}, {{1, 128, 19, 19}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 59008000.},
/* GFLOPS 0.059 x 1 = 0.059 */ {{3, 3}, {{1, 256, 10, 10}}, 512, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 58995200.},
/* GFLOPS 0.059 x 1 = 0.059 */ {{3, 3}, {{1, 256, 10, 10}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 58995200.},
/* GFLOPS 0.059 x 1 = 0.059 */ {{3, 3}, {{1, 256, 10, 10}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 58995200.},
/* GFLOPS 0.058 x 1 = 0.058 */ {{1, 1}, {{1, 288, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 57903104.},
/* GFLOPS 0.004 x 16 = 0.058 */ {{3, 3}, {{1, 128, 7, 7}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 3614240.},
/* GFLOPS 0.055 x 1 = 0.055 */ {{3, 3}, {{1, 1280, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 55298400.},
/* GFLOPS 0.018 x 3 = 0.054 */ {{1, 1}, {{1, 32, 38, 38}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 18021120.},
/* GFLOPS 0.018 x 3 = 0.053 */ {{1, 1}, {{1, 384, 19, 19}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 17766976.},
/* GFLOPS 0.053 x 1 = 0.053 */ {{3, 3}, {{1, 128, 38, 38}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 53254720.},
/* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 528, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 53036032.},
/* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 528, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 53036032.},
/* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 64, 80, 80}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 52838400.},
/* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 64, 40, 40}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 52838400.},
/* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 128, 80, 80}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 52633600.},
/* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 128, 20, 20}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 52633600.},
/* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 256, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 52531200.},
/* GFLOPS 0.052 x 1 = 0.052 */ {{1, 1}, {{1, 1024, 10, 10}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 52454400.},
/* GFLOPS 0.052 x 1 = 0.052 */ {{1, 1}, {{1, 1024, 10, 10}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 52454400.},
/* GFLOPS 0.052 x 1 = 0.052 */ {{1, 1}, {{1, 1024, 10, 10}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 52454400.},
/* GFLOPS 0.026 x 2 = 0.052 */ {{1, 1}, {{1, 1024, 10, 10}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 26227200.},
/* GFLOPS 0.052 x 1 = 0.052 */ {{1, 1}, {{1, 64, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 51781632.},
/* GFLOPS 0.051 x 1 = 0.051 */ {{1, 1}, {{1, 256, 56, 56}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 51480576.},
/* GFLOPS 0.051 x 1 = 0.051 */ {{1, 1}, {{1, 256, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 51480576.},
/* GFLOPS 0.051 x 1 = 0.051 */ {{1, 1}, {{1, 512, 28, 28}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 51430400.},
/* GFLOPS 0.026 x 2 = 0.051 */ {{1, 1}, {{1, 512, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 25715200.},
/* GFLOPS 0.026 x 2 = 0.051 */ {{1, 1}, {{1, 512, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 25715200.},
/* GFLOPS 0.013 x 4 = 0.051 */ {{1, 1}, {{1, 512, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 12857600.},
/* GFLOPS 0.051 x 1 = 0.051 */ {{1, 1}, {{1, 1024, 14, 14}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 51405312.},
/* GFLOPS 0.050 x 1 = 0.050 */ {{1, 1}, {{1, 992, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 49799680.},
/* GFLOPS 0.048 x 1 = 0.048 */ {{1, 1}, {{1, 960, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 48194048.},
/* GFLOPS 0.047 x 1 = 0.047 */ {{1, 1}, {{1, 256, 19, 19}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 47409408.},
/* GFLOPS 0.047 x 1 = 0.047 */ {{1, 1}, {{1, 144, 64, 64}}, 40, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 47349760.},
/* GFLOPS 0.047 x 1 = 0.047 */ {{1, 1}, {{1, 512, 38, 50}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 46740000.},
/* GFLOPS 0.047 x 1 = 0.047 */ {{1, 1}, {{1, 928, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 46588416.},
/* GFLOPS 0.046 x 1 = 0.046 */ {{1, 1}, {{1, 64, 75, 75}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 46440000.},
/* GFLOPS 0.023 x 2 = 0.045 */ {{3, 3}, {{1, 256, 3, 3}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 22648626.},
/* GFLOPS 0.045 x 1 = 0.045 */ {{3, 3}, {{1, 160, 7, 7}}, 320, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 45174080.},
/* GFLOPS 0.045 x 1 = 0.045 */ {{3, 3}, {{1, 160, 7, 7}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 45174080.},
/* GFLOPS 0.045 x 1 = 0.045 */ {{1, 1}, {{1, 224, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 45058048.},
/* GFLOPS 0.023 x 2 = 0.045 */ {{1, 1}, {{1, 512, 14, 14}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 22500800.},
/* GFLOPS 0.045 x 1 = 0.045 */ {{1, 1}, {{1, 896, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 44982784.},
/* GFLOPS 0.045 x 1 = 0.045 */ {{3, 3}, {{1, 3, 227, 227}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", true, 44946880.},
/* GFLOPS 0.044 x 1 = 0.044 */ {{3, 3}, {{1, 128, 19, 19}}, 192, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 44256000.},
/* GFLOPS 0.044 x 1 = 0.044 */ {{3, 3}, {{1, 1024, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 44239200.},
/* GFLOPS 0.044 x 1 = 0.044 */ {{1, 1}, {{1, 512, 13, 13}}, 255, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 44172375.},
/* GFLOPS 0.043 x 1 = 0.043 */ {{7, 7}, {{1, 3, 96, 96}}, 64, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", true, 43499520.},
/* GFLOPS 0.043 x 1 = 0.043 */ {{1, 1}, {{1, 864, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 43377152.},
/* GFLOPS 0.042 x 1 = 0.042 */ {{1, 1}, {{1, 832, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 41771520.},
/* GFLOPS 0.040 x 1 = 0.040 */ {{5, 5}, {{1, 32, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 40165888.},
/* GFLOPS 0.040 x 1 = 0.040 */ {{5, 5}, {{1, 32, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 40165888.},
/* GFLOPS 0.040 x 1 = 0.040 */ {{1, 1}, {{1, 800, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 40165888.},
/* GFLOPS 0.040 x 1 = 0.040 */ {{3, 3}, {{1, 64, 19, 19}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 39958368.},
/* GFLOPS 0.040 x 1 = 0.040 */ {{3, 3}, {{1, 256, 19, 19}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 39932376.},
/* GFLOPS 0.040 x 1 = 0.040 */ {{3, 3}, {{1, 3, 300, 300}}, 32, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 39600000.},
/* GFLOPS 0.039 x 1 = 0.039 */ {{1, 1}, {{1, 240, 32, 32}}, 80, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39403520.},
/* GFLOPS 0.039 x 1 = 0.039 */ {{1, 1}, {{1, 144, 75, 75}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39015000.},
/* GFLOPS 0.039 x 1 = 0.039 */ {{1, 1}, {{1, 192, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 38635520.},
/* GFLOPS 0.039 x 1 = 0.039 */ {{1, 1}, {{1, 768, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 38560256.},
/* GFLOPS 0.037 x 1 = 0.037 */ {{1, 1}, {{1, 736, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 36954624.},
/* GFLOPS 0.036 x 1 = 0.036 */ {{1, 1}, {{1, 480, 14, 14}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 36164352.},
/* GFLOPS 0.036 x 1 = 0.036 */ {{1, 1}, {{1, 480, 14, 14}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 36164352.},
/* GFLOPS 0.018 x 2 = 0.036 */ {{1, 1}, {{1, 192, 38, 38}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 17790080.},
/* GFLOPS 0.035 x 1 = 0.035 */ {{1, 1}, {{1, 704, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 35348992.},
/* GFLOPS 0.035 x 1 = 0.035 */ {{1, 1}, {{1, 512, 46, 46}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 34702400.},
/* GFLOPS 0.034 x 1 = 0.034 */ {{1, 1}, {{1, 672, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 33743360.},
/* GFLOPS 0.034 x 1 = 0.034 */ {{1, 1}, {{1, 128, 32, 64}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 33685504.},
/* GFLOPS 0.034 x 1 = 0.034 */ {{2, 2}, {{1, 64, 64, 128}}, 32, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 33619968.},
/* GFLOPS 0.033 x 1 = 0.033 */ {{3, 3}, {{1, 256, 3, 3}}, 804, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 33350724.},
/* GFLOPS 0.033 x 1 = 0.033 */ {{1, 1}, {{1, 528, 14, 14}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 33147520.},
/* GFLOPS 0.033 x 1 = 0.033 */ {{1, 1}, {{1, 528, 14, 14}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 33147520.},
/* GFLOPS 0.033 x 1 = 0.033 */ {{1, 1}, {{1, 1024, 10, 10}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 32784000.},
/* GFLOPS 0.032 x 1 = 0.032 */ {{1, 1}, {{1, 160, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 32212992.},
/* GFLOPS 0.032 x 1 = 0.032 */ {{1, 1}, {{1, 512, 14, 14}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 32144000.},
/* GFLOPS 0.032 x 1 = 0.032 */ {{1, 1}, {{1, 640, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 32137728.},
/* GFLOPS 0.032 x 1 = 0.032 */ {{1, 1}, {{1, 508, 14, 14}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 31893120.},
/* GFLOPS 0.011 x 3 = 0.032 */ {{1, 1}, {{1, 320, 16, 16}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 10502144.},
/* GFLOPS 0.031 x 1 = 0.031 */ {{1, 1}, {{1, 832, 7, 7}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 31328640.},
/* GFLOPS 0.031 x 1 = 0.031 */ {{1, 1}, {{1, 832, 7, 7}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 31328640.},
/* GFLOPS 0.031 x 1 = 0.031 */ {{1, 1}, {{1, 608, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 30532096.},
/* GFLOPS 0.015 x 2 = 0.030 */ {{1, 1}, {{1, 128, 46, 46}}, 28, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 15226736.},
/* GFLOPS 0.015 x 2 = 0.030 */ {{5, 5}, {{1, 24, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 15065344.},
/* GFLOPS 0.015 x 2 = 0.030 */ {{5, 5}, {{1, 24, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 15065344.},
/* GFLOPS 0.015 x 2 = 0.030 */ {{5, 5}, {{1, 48, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 15059072.},
/* GFLOPS 0.029 x 1 = 0.029 */ {{3, 3}, {{1, 256, 10, 10}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 29497600.},
/* GFLOPS 0.015 x 2 = 0.029 */ {{1, 1}, {{1, 112, 32, 32}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 14745600.},
/* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 192, 28, 28}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 28976640.},
/* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 192, 28, 28}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 28976640.},
/* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 512, 14, 14}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 28929600.},
/* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 512, 14, 14}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 28929600.},
/* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 576, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 28926464.},
/* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 544, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 27320832.},
/* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 64, 16, 16}}, 810, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 26749440.},
/* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 384, 19, 19}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 26650464.},
/* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 576, 19, 19}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 26638912.},
/* GFLOPS 0.027 x 1 = 0.027 */ {{3, 3}, {{1, 128, 38, 38}}, 8, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 26627360.},
/* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 528, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 26518016.},
/* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 528, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 26518016.},
/* GFLOPS 0.009 x 3 = 0.026 */ {{1, 1}, {{1, 128, 46, 46}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 8700992.},
/* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 96, 75, 75}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 26055000.},
/* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 64, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 25890816.},
/* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 64, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 25890816.},
/* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 64, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 25890816.},
/* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 1024, 10, 10}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 25817400.},
/* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 128, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 25790464.},
/* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 256, 28, 28}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 25740288.},
/* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 256, 28, 28}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 25740288.},
/* GFLOPS 0.013 x 2 = 0.026 */ {{1, 1}, {{1, 256, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 12870144.},
/* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 512, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 25715200.},
/* GFLOPS 0.013 x 2 = 0.026 */ {{1, 1}, {{1, 512, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 12857600.},
/* GFLOPS 0.002 x 12 = 0.025 */ {{1, 1}, {{1, 64, 16, 16}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 2113536.},
/* GFLOPS 0.024 x 1 = 0.024 */ {{1, 1}, {{1, 480, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 24109568.},
/* GFLOPS 0.024 x 1 = 0.024 */ {{1, 1}, {{1, 128, 38, 38}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 23750912.},
/* GFLOPS 0.024 x 1 = 0.024 */ {{1, 1}, {{1, 256, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 23704704.},
/* GFLOPS 0.023 x 1 = 0.023 */ {{3, 3}, {{1, 3, 256, 512}}, 13, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 23429120.},
/* GFLOPS 0.023 x 1 = 0.023 */ {{1, 1}, {{1, 32, 150, 150}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 23400000.},
/* GFLOPS 0.023 x 1 = 0.023 */ {{1, 1}, {{1, 512, 19, 19}}, 63, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 23311575.},
/* GFLOPS 0.023 x 1 = 0.023 */ {{1, 1}, {{1, 448, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 22503936.},
/* GFLOPS 0.023 x 1 = 0.023 */ {{1, 1}, {{1, 512, 14, 14}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 22500800.},
/* GFLOPS 0.022 x 1 = 0.022 */ {{1, 1}, {{1, 508, 14, 14}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 22325184.},
/* GFLOPS 0.022 x 1 = 0.022 */ {{3, 3}, {{1, 512, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 22120800.},
/* GFLOPS 0.021 x 1 = 0.021 */ {{3, 3}, {{1, 128, 12, 12}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 21242880.},
/* GFLOPS 0.021 x 1 = 0.021 */ {{1, 1}, {{1, 40, 64, 64}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 21233664.},
/* GFLOPS 0.021 x 1 = 0.021 */ {{1, 1}, {{1, 416, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 20898304.},
/* GFLOPS 0.021 x 1 = 0.021 */ {{1, 1}, {{1, 832, 7, 7}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 20885760.},
/* GFLOPS 0.021 x 1 = 0.021 */ {{1, 1}, {{1, 832, 7, 7}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 20885760.},
/* GFLOPS 0.010 x 2 = 0.021 */ {{1, 1}, {{1, 832, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 10442880.},
/* GFLOPS 0.010 x 2 = 0.021 */ {{1, 1}, {{1, 832, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 10442880.},
/* GFLOPS 0.010 x 2 = 0.020 */ {{3, 3}, {{1, 256, 2, 2}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 10066056.},
/* GFLOPS 0.020 x 1 = 0.020 */ {{5, 5}, {{1, 16, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 20095488.},
/* GFLOPS 0.020 x 1 = 0.020 */ {{5, 5}, {{1, 16, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 20095488.},
/* GFLOPS 0.020 x 1 = 0.020 */ {{5, 5}, {{1, 32, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 20082944.},
/* GFLOPS 0.020 x 1 = 0.020 */ {{5, 5}, {{1, 32, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 20082944.},
/* GFLOPS 0.020 x 1 = 0.020 */ {{3, 3}, {{1, 256, 19, 19}}, 12, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 19966188.},
/* GFLOPS 0.019 x 1 = 0.019 */ {{1, 1}, {{1, 192, 28, 28}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 19317760.},
/* GFLOPS 0.019 x 1 = 0.019 */ {{1, 1}, {{1, 192, 28, 28}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 19317760.},
/* GFLOPS 0.019 x 1 = 0.019 */ {{1, 1}, {{1, 384, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 19292672.},
/* GFLOPS 0.019 x 1 = 0.019 */ {{1, 1}, {{1, 64, 64, 64}}, 36, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 19021824.},
/* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 576, 10, 10}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 18448000.},
/* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 480, 14, 14}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 18082176.},
/* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 480, 14, 14}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 18082176.},
/* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 192, 38, 38}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 17790080.},
/* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 352, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 17687040.},
/* GFLOPS 0.017 x 1 = 0.017 */ {{2, 2}, {{1, 16, 128, 256}}, 16, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 16908288.},
/* GFLOPS 0.016 x 1 = 0.016 */ {{1, 1}, {{1, 320, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 16081408.},
/* GFLOPS 0.016 x 1 = 0.016 */ {{1, 1}, {{1, 832, 7, 7}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 15664320.},
/* GFLOPS 0.016 x 1 = 0.016 */ {{1, 1}, {{1, 832, 7, 7}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 15664320.},
/* GFLOPS 0.015 x 1 = 0.015 */ {{5, 5}, {{1, 48, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 15059072.},
/* GFLOPS 0.015 x 1 = 0.015 */ {{5, 5}, {{1, 32, 12, 12}}, 64, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 14754816.},
/* GFLOPS 0.015 x 1 = 0.015 */ {{3, 3}, {{1, 128, 10, 10}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 14752000.},
/* GFLOPS 0.014 x 1 = 0.014 */ {{1, 1}, {{1, 288, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 14475776.},
/* GFLOPS 0.014 x 1 = 0.014 */ {{1, 1}, {{1, 512, 5, 5}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 13991250.},
/* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 144, 38, 38}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 13354112.},
/* GFLOPS 0.007 x 2 = 0.013 */ {{1, 1}, {{1, 16, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6623232.},
/* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 512, 10, 10}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 13120000.},
/* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 832, 7, 7}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 13053600.},
/* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 832, 7, 7}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 13053600.},
/* GFLOPS 0.007 x 2 = 0.013 */ {{1, 1}, {{1, 32, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6522880.},
/* GFLOPS 0.001 x 11 = 0.013 */ {{3, 3}, {{1, 64, 4, 4}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1180672.},
/* GFLOPS 0.006 x 2 = 0.013 */ {{1, 1}, {{1, 64, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6472704.},
/* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 128, 56, 56}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 12895232.},
/* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 256, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 12870144.},
/* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 256, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 12870144.},
/* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 508, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 12757248.},
/* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 992, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 12449920.},
/* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 480, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 12054784.},
/* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 480, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 12054784.},
/* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 960, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 12048512.},
/* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 32, 75, 75}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 12014080.},
/* GFLOPS 0.012 x 1 = 0.012 */ {{3, 3}, {{1, 96, 6, 6}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 11950848.},
/* GFLOPS 0.006 x 2 = 0.012 */ {{3, 3}, {{1, 96, 3, 3}}, 384, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 5975424.},
/* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 320, 12, 12}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 11814912.},
/* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 640, 6, 6}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 11805696.},
/* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 928, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 11647104.},
/* GFLOPS 0.011 x 1 = 0.011 */ {{1, 1}, {{1, 896, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 11245696.},
/* GFLOPS 0.011 x 1 = 0.011 */ {{1, 1}, {{1, 256, 13, 13}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 11097216.},
/* GFLOPS 0.011 x 1 = 0.011 */ {{3, 3}, {{1, 256, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 11061600.},
/* GFLOPS 0.006 x 2 = 0.011 */ {{3, 3}, {{1, 512, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 5530200.},
/* GFLOPS 0.011 x 1 = 0.011 */ {{1, 1}, {{1, 864, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 10844288.},
/* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 832, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 10442880.},
/* GFLOPS 0.010 x 1 = 0.010 */ {{5, 5}, {{1, 32, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 10041472.},
/* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 800, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 10041472.},
/* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 192, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 9658880.},
/* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 192, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 9658880.},
/* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 384, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 9646336.},
/* GFLOPS 0.005 x 2 = 0.010 */ {{1, 1}, {{1, 512, 14, 14}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4821600.},
/* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 768, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 9640064.},
/* GFLOPS 0.010 x 1 = 0.010 */ {{3, 3}, {{1, 4, 128, 256}}, 4, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9568256.},
/* GFLOPS 0.005 x 2 = 0.009 */ {{1, 1}, {{1, 4, 128, 256}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 4718592.},
/* GFLOPS 0.009 x 1 = 0.009 */ {{1, 1}, {{1, 736, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 9238656.},
/* GFLOPS 0.009 x 1 = 0.009 */ {{1, 1}, {{1, 192, 19, 19}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 8895040.},
/* GFLOPS 0.009 x 1 = 0.009 */ {{1, 1}, {{1, 704, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 8837248.},
/* GFLOPS 0.008 x 1 = 0.008 */ {{1, 1}, {{1, 672, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 8435840.},
/* GFLOPS 0.008 x 1 = 0.008 */ {{1, 1}, {{1, 128, 32, 64}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 8421376.},
/* GFLOPS 0.008 x 1 = 0.008 */ {{1, 1}, {{1, 640, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 8034432.},
/* GFLOPS 0.004 x 2 = 0.008 */ {{1, 1}, {{1, 832, 7, 7}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 3916080.},
/* GFLOPS 0.008 x 1 = 0.008 */ {{1, 1}, {{1, 608, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 7633024.},
/* GFLOPS 0.008 x 1 = 0.008 */ {{5, 5}, {{1, 16, 14, 14}}, 48, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 7535808.},
/* GFLOPS 0.008 x 1 = 0.008 */ {{5, 5}, {{1, 16, 14, 14}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 7535808.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 640, 6, 6}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7378560.},
/* GFLOPS 0.004 x 2 = 0.007 */ {{1, 1}, {{1, 48, 14, 14}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3650304.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 384, 14, 14}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7234752.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 576, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 7231616.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 256, 12, 12}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7091712.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 544, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 6830208.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 64, 8, 8}}, 810, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 6687360.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{3, 3}, {{1, 160, 6, 6}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 6637824.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 528, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6629504.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 528, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 6629504.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 256, 5, 5}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 6566400.},
/* GFLOPS 0.003 x 2 = 0.007 */ {{1, 1}, {{1, 512, 5, 5}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 3280000.},
/* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 64, 56, 56}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6472704.},
/* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 128, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6447616.},
/* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 512, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 6428800.},
/* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 512, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6428800.},
/* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 512, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 6428800.},
/* GFLOPS 0.001 x 12 = 0.006 */ {{1, 1}, {{1, 64, 8, 8}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 528384.},
/* GFLOPS 0.006 x 1 = 0.006 */ {{3, 3}, {{1, 256, 10, 10}}, 12, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 5530800.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 192, 12, 12}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 5322240.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{3, 3}, {{1, 128, 5, 5}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 5310720.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{3, 3}, {{1, 128, 5, 5}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 5310720.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{3, 3}, {{1, 128, 5, 5}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 5310720.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{3, 3}, {{1, 128, 5, 5}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 5310720.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 1024, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4917600.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 1024, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4917600.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 192, 28, 28}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4829440.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 192, 28, 28}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4829440.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 256, 14, 14}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4826304.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 512, 14, 14}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4821600.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 508, 14, 14}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4783968.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 64, 32, 32}}, 36, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 4755456.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 64, 24, 24}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4755456.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 256, 12, 12}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4727808.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 1024, 3, 3}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4720896.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 512, 19, 19}}, 12, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4440300.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 512, 19, 19}}, 12, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4440300.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 640, 6, 6}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4427136.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 16, 128, 256}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 4325376.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 64, 64, 128}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 4227072.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 832, 7, 7}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3916080.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{3, 3}, {{1, 256, 1, 1}}, 804, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 3705636.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 16, 12, 12}}, 32, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 3691008.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{3, 3}, {{1, 64, 10, 10}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 3689600.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 32, 6, 6}}, 64, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 3688704.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 32, 12, 12}}, 64, 1, {2, 2}, {1, 1}, {2, 2}, {0, 0}, "", true, 3688704.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 64, 6, 6}}, 128, 1, {2, 2}, {1, 1}, {2, 2}, {0, 0}, "", true, 3687552.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 192, 12, 12}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3548160.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 736, 3, 3}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3393792.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 256, 10, 10}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3283200.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 512, 5, 5}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3280000.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 512, 5, 5}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 3280000.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 512, 5, 5}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3228750.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 480, 14, 14}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3013696.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 480, 14, 14}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 3013696.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 320, 12, 12}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2953728.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 640, 6, 6}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2951424.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{3, 3}, {{1, 256, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 2765400.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{3, 3}, {{1, 128, 5, 5}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 2655360.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 832, 7, 7}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2610720.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 256, 3, 3}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2520882.},
/* GFLOPS 0.001 x 2 = 0.003 */ {{3, 3}, {{1, 128, 1, 1}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1258530.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 256, 12, 12}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2363904.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{3, 3}, {{1, 128, 3, 3}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 2360320.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{3, 3}, {{1, 128, 3, 3}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2360320.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{3, 3}, {{1, 128, 3, 3}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 2360320.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 528, 4, 4}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2164736.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 508, 4, 4}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2082816.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 1024, 1, 1}}, 1000, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2049000.},
/* GFLOPS 0.001 x 2 = 0.002 */ {{3, 3}, {{1, 256, 3, 3}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 995544.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 1024, 3, 3}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1770336.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 64, 4, 4}}, 810, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 1671840.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 32, 80, 80}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1664000.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 256, 5, 5}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1641600.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 640, 6, 6}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1475712.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 128, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1383000.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 64, 5, 5}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1328256.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 736, 3, 3}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1272672.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 64, 16, 16}}, 36, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 1188864.},
/* GFLOPS 0.000 x 9 = 0.001 */ {{1, 1}, {{1, 64, 4, 4}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 132096.},
/* GFLOPS 0.001 x 2 = 0.001 */ {{1, 1}, {{1, 256, 3, 3}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 590976.},
/* GFLOPS 0.001 x 2 = 0.001 */ {{1, 1}, {{1, 256, 3, 3}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 590976.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 128, 3, 3}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1180160.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 256, 2, 2}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1120392.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 192, 12, 12}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 887040.},
/* GFLOPS 0.000 x 2 = 0.001 */ {{3, 3}, {{1, 256, 2, 2}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 442464.},
/* GFLOPS 0.000 x 2 = 0.001 */ {{1, 1}, {{1, 32, 80, 80}}, 1, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 416000.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 128, 5, 5}}, 12, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 691500.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 256, 3, 3}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 663696.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 640, 2, 2}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 655872.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 512, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 615000.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 512, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 615000.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 128, 3, 3}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 592128.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 256, 3, 3}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 590976.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 128, 3, 3}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 590080.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 256, 3, 3}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 581742.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 256, 4, 4}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 525312.},
/* GFLOPS 0.000 x 4 = 0.000 */ {{1, 1}, {{1, 48, 1, 1}}, 1152, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 111744.},
/* GFLOPS 0.000 x 4 = 0.000 */ {{1, 1}, {{1, 1152, 1, 1}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 110640.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 5, 5}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 411200.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 128, 3, 3}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 331920.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 192, 5, 5}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 308000.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 64, 8, 8}}, 36, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 297216.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 2, 2}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 263168.},
/* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 131328.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 258552.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 1024, 1, 1}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 196704.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 128, 3, 3}}, 8, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 165960.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 3, 3}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 148032.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 3, 3}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 147584.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 2, 2}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 147584.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 2, 2}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 147584.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 2, 2}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 147584.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 736, 1, 1}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 141408.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 1, 1}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 140322.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 131328.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 131328.},
/* GFLOPS 0.000 x 3 = 0.000 */ {{1, 1}, {{1, 28, 1, 1}}, 672, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 38304.},
/* GFLOPS 0.000 x 3 = 0.000 */ {{1, 1}, {{1, 672, 1, 1}}, 28, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 37660.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 3, 3}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 110808.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 3, 3}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 110808.},
/* GFLOPS 0.000 x 2 = 0.000 */ {{3, 3}, {{1, 128, 1, 1}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 55320.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 64, 4, 4}}, 36, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 74304.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 2, 2}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 73792.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 256, 1, 1}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 73744.},
/* GFLOPS 0.000 x 3 = 0.000 */ {{1, 1}, {{1, 20, 1, 1}}, 480, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 19680.},
/* GFLOPS 0.000 x 3 = 0.000 */ {{1, 1}, {{1, 480, 1, 1}}, 20, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 19220.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 49248.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 49248.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 128, 1, 1}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 36880.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 1, 1}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 32382.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 128, 1, 1}}, 8, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 18440.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 64, 1, 1}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 16512.},
/* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 10, 1, 1}}, 240, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 5040.},
/* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 240, 1, 1}}, 10, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4810.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 1, 1}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6168.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 1, 1}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 6168.},
/* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 6, 1, 1}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1872.},
/* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 144, 1, 1}}, 6, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1734.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 4, 1, 1}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 864.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 96, 1, 1}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 772.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 8, 1, 1}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 544.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 32, 1, 1}}, 8, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 520.}
};
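// Maps a flat GTest parameter index onto testConvolutionConfigs above; CONV_LAST is derived from the table size, while all() only sweeps the first CONV_100 entries by default.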
struct ConvParamID
{
enum {
CONV_0 = 0,
CONV_100 = 100,
CONV_LAST = sizeof(testConvolutionConfigs) / sizeof(testConvolutionConfigs[0])
};
int val_;
ConvParamID(int val = 0) : val_(val) {}
operator int() const { return val_; }
static ::testing::internal::ParamGenerator<ConvParamID> all()
{
#if 0
enum { NUM = (int)CONV_LAST };
#else
enum { NUM = (int)CONV_100 };
#endif
ConvParamID v_[NUM]; for (int i = 0; i < NUM; ++i) { v_[i] = ConvParamID(i); } // reduce generated code size
return ::testing::ValuesIn(v_, v_ + NUM);
}
};
static inline void PrintTo(const ConvParamID& v, std::ostream* os)
{
CV_Assert((int)v >= 0); CV_Assert((int)v < ConvParamID::CONV_LAST);
const ConvParam_t& p = testConvolutionConfigs[(int)v];
*os << "GFLOPS=" << cv::format("%.3f", p.declared_flops * 1e-9)
<< ", K=" << (Size)p.kernel
<< ", IN={" << p.shapeIn.dims[0] << ", " << p.shapeIn.dims[1] << ", " << p.shapeIn.dims[2] << ", " << p.shapeIn.dims[3] << "}"
<< ", OCN=" << p.outCN;
if (p.groups > 1)
*os << ", G=" << p.groups;
if (((Size)p.stride).area() != 1)
*os << ", S=" << ((Size)p.stride);
if (((Size)p.dilation).area() != 1)
*os << ", D=" << ((Size)p.dilation);
if (!((Size)p.pad).empty())
*os << ", P=" << ((Size)p.pad);
if (!((Size)p.padAdjust).empty())
*os << ", PAdj=" << ((Size)p.padAdjust);
if (!((std::string)p.padMode).empty())
*os << ", PM=" << ((std::string)p.padMode);
if (p.hasBias)
*os << ", BIAS";
}
typedef tuple<ConvParamID, tuple<Backend, Target> > ConvTestParam_t;
typedef TestBaseWithParam<ConvTestParam_t> Conv;
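// Builds a single-layer Convolution network from the selected config, does one warmup forward pass, then times net.forward() inside TEST_CYCLE().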
PERF_TEST_P_(Conv, conv)
{
int test_id = (int)get<0>(GetParam());
ASSERT_GE(test_id, 0); ASSERT_LT(test_id, ConvParamID::CONV_LAST);
const ConvParam_t& params = testConvolutionConfigs[test_id];
double declared_flops = params.declared_flops;
Size kernel = params.kernel;
MatShape inputShape = MatShape(params.shapeIn.dims, params.shapeIn.dims + 4);
int outChannels = params.outCN;
int groups = params.groups;
Size stride = params.stride;
Size dilation = params.dilation;
Size pad = params.pad;
Size padAdjust = params.padAdjust;
std::string padMode(params.padMode);
bool hasBias = params.hasBias;
Backend backendId = get<0>(get<1>(GetParam()));
Target targetId = get<1>(get<1>(GetParam()));
int inChannels = inputShape[1];
Size inSize(inputShape[3], inputShape[2]);
int sz[] = {outChannels, inChannels / groups, kernel.height, kernel.width};
Mat weights(4, &sz[0], CV_32F);
randu(weights, -1.0f, 1.0f);
LayerParams lp;
lp.set("kernel_w", kernel.width);
lp.set("kernel_h", kernel.height);
lp.set("pad_w", pad.width);
lp.set("pad_h", pad.height);
if (padAdjust.width > 0 || padAdjust.height > 0)
{
lp.set("adj_w", padAdjust.width);
lp.set("adj_h", padAdjust.height);
}
if (!padMode.empty())
lp.set("pad_mode", padMode);
lp.set("stride_w", stride.width);
lp.set("stride_h", stride.height);
lp.set("dilation_w", dilation.width);
lp.set("dilation_h", dilation.height);
lp.set("num_output", outChannels);
lp.set("group", groups);
lp.set("bias_term", hasBias);
lp.type = "Convolution";
lp.name = "testLayer";
lp.blobs.push_back(weights);
if (hasBias)
{
Mat bias(1, outChannels, CV_32F);
randu(bias, -1.0f, 1.0f);
lp.blobs.push_back(bias);
}
int inpSz[] = {1, inChannels, inSize.height, inSize.width};
Mat input(4, &inpSz[0], CV_32F);
randu(input, -1.0f, 1.0f);
Net net;
net.addLayerToPrev(lp.name, lp.type, lp);
net.setInput(input);
net.setPreferableBackend(backendId);
net.setPreferableTarget(targetId);
// warmup
Mat output = net.forward();
MatShape netInputShape = shape(input);
size_t weightsMemory = 0, blobsMemory = 0;
net.getMemoryConsumption(netInputShape, weightsMemory, blobsMemory);
int64 flops = net.getFLOPS(netInputShape);
CV_Assert(flops > 0);
std::cout
<< "IN=" << divUp(input.total() * input.elemSize(), 1u<<10) << " Kb " << netInputShape
<< " OUT=" << divUp(output.total() * output.elemSize(), 1u<<10) << " Kb " << shape(output)
<< " Weights(parameters): " << divUp(weightsMemory, 1u<<10) << " Kb"
<< " MFLOPS=" << flops * 1e-6 << std::endl;
TEST_CYCLE()
{
Mat res = net.forward();
}
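// FLOPS reported by getFLOPS() must match the value declared in the config table (relative tolerance 1e-6).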
EXPECT_NEAR(flops, declared_flops, declared_flops * 1e-6);
SANITY_CHECK_NOTHING();
}
INSTANTIATE_TEST_CASE_P(/**/, Conv, Combine(
ConvParamID::all(),
dnnBackendsAndTargets(false, false) // defined in ../test/test_common.hpp
));
} // namespace

View File

@@ -0,0 +1,163 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "perf_precomp.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace opencv_test {
struct Conv1DParam_t {
int kernel;
struct BlobShape { int dims[3]; } shapeIn;
int outCN;
int groups;
int stride;
int dilation;
int pad[2];
const char* padMode;
bool hasBias;
double declared_flops;
};
// Details: #12142
static const Conv1DParam_t testConvolution1DConfigs[] = {
{3, {{1, 6, 10}}, 6, 1, 1, 1, {0, 0}, "VALID", true, 1776.},
{3, {{1, 2, 19}}, 2, 2, 2, 1, {1, 1}, "", true, 260.},
{3, {{1, 2, 25}}, 2, 2, 1, 1, {2, 2}, "SAME", false, 650.},
};
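// Flat index type over testConvolution1DConfigs; unlike the 2D table, every entry is instantiated.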
struct Conv1DParamID
{
enum {
CONV_0 = 0,
CONV_LAST = sizeof(testConvolution1DConfigs) / sizeof(testConvolution1DConfigs[0])
};
int val_;
Conv1DParamID(int val = 0) : val_(val) {}
operator int() const { return val_; }
static ::testing::internal::ParamGenerator<Conv1DParamID> all()
{
enum { NUM = (int)CONV_LAST };
Conv1DParamID v_[NUM]; for (int i = 0; i < NUM; ++i) { v_[i] = Conv1DParamID(i); } // reduce generated code size
return ::testing::ValuesIn(v_, v_ + NUM);
}
};
static inline void PrintTo(const Conv1DParamID& v, std::ostream* os)
{
CV_Assert((int)v >= 0); CV_Assert((int)v < Conv1DParamID::CONV_LAST);
const Conv1DParam_t& p = testConvolution1DConfigs[(int)v];
*os << "GFLOPS=" << cv::format("%.3f", p.declared_flops * 1e-9)
<< ", K=[" << p.kernel << "]"
<< ", IN={" << p.shapeIn.dims[0] << ", " << p.shapeIn.dims[1] << ", " << p.shapeIn.dims[2] << "}"
<< ", OCN=" << p.outCN;
if (p.groups > 1)
*os << ", G=" << p.groups;
if (p.stride != 1)
*os << ", S=" << p.stride;
if (p.dilation != 1)
*os << ", D=" << p.dilation;
if (p.pad[0] != 0 && p.pad[1] != 0)
*os << ", P=(" << p.pad[0] << ", " << p.pad[1] << ")";
if (!((std::string)p.padMode).empty())
*os << ", PM=" << ((std::string)p.padMode);
if (p.hasBias)
*os << ", BIAS";
}
typedef tuple<Conv1DParamID, tuple<Backend, Target> > Conv1DTestParam_t;
typedef TestBaseWithParam<Conv1DTestParam_t> Conv1D;
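// Same scheme as the 2D benchmark: a one-layer 1D convolution is built from LayerParams and timed; non-CPU targets are skipped.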
PERF_TEST_P_(Conv1D, conv1d)
{
int test_id = (int)get<0>(GetParam());
ASSERT_GE(test_id, 0); ASSERT_LT(test_id, Conv1DParamID::CONV_LAST);
const Conv1DParam_t& params = testConvolution1DConfigs[test_id];
double declared_flops = params.declared_flops;
DictValue kernel = DictValue::arrayInt(&params.kernel, 1);
DictValue stride = DictValue::arrayInt(&params.stride, 1);
DictValue pad = DictValue::arrayInt(&params.pad[0], 2);
DictValue dilation = DictValue::arrayInt(&params.dilation, 1);
MatShape inputShape = MatShape(params.shapeIn.dims, params.shapeIn.dims + 3);
int outChannels = params.outCN;
int groups = params.groups;
std::string padMode(params.padMode);
bool hasBias = params.hasBias;
Backend backendId = get<0>(get<1>(GetParam()));
Target targetId = get<1>(get<1>(GetParam()));
if (targetId != DNN_TARGET_CPU)
throw SkipTestException("Only CPU is supported");
int inChannels = inputShape[1];
int sz[] = {outChannels, inChannels / groups, params.kernel};
Mat weights(3, &sz[0], CV_32F);
randu(weights, -1.0f, 1.0f);
LayerParams lp;
lp.set("kernel_size", kernel);
lp.set("pad", pad);
if (!padMode.empty())
lp.set("pad_mode", padMode);
lp.set("stride", stride);
lp.set("dilation", dilation);
lp.set("num_output", outChannels);
lp.set("group", groups);
lp.set("bias_term", hasBias);
lp.type = "Convolution";
lp.name = "testLayer";
lp.blobs.push_back(weights);
if (hasBias)
{
Mat bias(1, outChannels, CV_32F);
randu(bias, -1.0f, 1.0f);
lp.blobs.push_back(bias);
}
int inpSz[] = {1, inChannels, inputShape[2]};
Mat input(3, &inpSz[0], CV_32F);
randu(input, -1.0f, 1.0f);
Net net;
net.addLayerToPrev(lp.name, lp.type, lp);
net.setInput(input);
net.setPreferableBackend(backendId);
net.setPreferableTarget(targetId);
// warmup
Mat output = net.forward();
MatShape netInputShape = shape(input);
size_t weightsMemory = 0, blobsMemory = 0;
net.getMemoryConsumption(netInputShape, weightsMemory, blobsMemory);
int64 flops = net.getFLOPS(netInputShape);
CV_Assert(flops > 0);
std::cout
<< "IN=" << divUp(input.total() * input.elemSize(), 1u<<10) << " Kb " << netInputShape
<< " OUT=" << divUp(output.total() * output.elemSize(), 1u<<10) << " Kb " << shape(output)
<< " Weights(parameters): " << divUp(weightsMemory, 1u<<10) << " Kb"
<< " MFLOPS=" << flops * 1e-6 << std::endl;
TEST_CYCLE()
{
Mat res = net.forward();
}
EXPECT_NEAR(flops, declared_flops, declared_flops * 1e-6);
SANITY_CHECK_NOTHING();
}
INSTANTIATE_TEST_CASE_P(/**/, Conv1D, Combine(
Conv1DParamID::all(),
dnnBackendsAndTargets(false, false) // defined in ../test/test_common.hpp
));
} // namespace

View File

@@ -0,0 +1,182 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "perf_precomp.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace opencv_test {
struct Conv3DParam_t {
int kernel[3];
struct BlobShape { int dims[5]; } shapeIn;
int outCN;
int groups;
int stride[3];
int dilation[3];
int pad[6];
const char* padMode;
bool hasBias;
double declared_flops;
};
// Details: #12142
static const Conv3DParam_t testConvolution3DConfigs[] = {
{{3, 3, 3}, {{1, 6, 10, 38, 50}}, 6, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}, "VALID", true, 26956800.},
{{3, 3, 3}, {{1, 2, 19, 19, 19}}, 2, 2, {2, 2, 2}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}, "", true, 218000.},
{{3, 3, 3}, {{1, 2, 25, 19, 19}}, 2, 2, {1, 2, 2}, {1, 1, 1}, {2, 2, 2, 2, 2, 2}, "SAME", false, 545000.},
{{3, 3, 3}, {{1, 11, 9, 150, 200}}, 11, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}, "VALID", true, 1342562760.},
{{3, 3, 3}, {{1, 10, 98, 10, 10}}, 10, 1, {1, 1, 1}, {1, 1, 1}, {1, 0, 1, 1, 0, 1}, "SAME", false, 53018000.},
{{5, 5, 5}, {{1, 6, 19, 19, 19}}, 6, 2, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}, "", false, 30395250.},
{{5, 5, 5}, {{1, 4, 50, 19, 19}}, 4, 1, {2, 2, 2}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}, "VALID", false, 5893888.},
{{5, 5, 5}, {{1, 3, 75, 75, 100}}, 3, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}, "SAME", true, 1267312500.},
{{5, 5, 5}, {{1, 2, 21, 75, 100}}, 2, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}, "", true, 116103744.},
{{5, 5, 5}, {{1, 4, 40, 75, 75}}, 4, 1, {2, 2, 2}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}, "", false, 93405312.},
{{7, 7, 7}, {{1, 6, 15, 19, 19}}, 6, 1, {2, 1, 1}, {1, 1, 1}, {3, 3, 3, 3, 3, 3}, "SAME", true, 71339376.},
{{7, 7, 7}, {{1, 2, 38, 38, 38}}, 2, 1, {1, 2, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}, "", false, 44990464.},
{{1, 1, 1}, {{1, 4, 9, 10, 10}}, 4, 1, {1, 1, 2}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}, "VALID", false, 16200.},
{{3, 1, 4}, {{1, 14, 5, 10, 10}}, 14, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}, "SAME", false, 2359000.},
{{1, 1, 1}, {{1, 8, 1, 10, 10}}, 8, 8, {1, 1, 1}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}, "", true, 58752.},
{{3, 4, 2}, {{1, 4, 8, 10, 10}}, 4, 4, {1, 2, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}, "", true, 166752.}
};
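// Flat index type over testConvolution3DConfigs; CONV_100 equals the table size here, so all 16 configurations are benchmarked.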
struct Conv3DParamID
{
enum {
CONV_0 = 0,
CONV_100 = 16,
CONV_LAST = sizeof(testConvolution3DConfigs) / sizeof(testConvolution3DConfigs[0])
};
int val_;
Conv3DParamID(int val = 0) : val_(val) {}
operator int() const { return val_; }
static ::testing::internal::ParamGenerator<Conv3DParamID> all()
{
#if 0
enum { NUM = (int)CONV_LAST };
#else
enum { NUM = (int)CONV_100 };
#endif
Conv3DParamID v_[NUM]; for (int i = 0; i < NUM; ++i) { v_[i] = Conv3DParamID(i); } // reduce generated code size
return ::testing::ValuesIn(v_, v_ + NUM);
}
};
static inline void PrintTo(const Conv3DParamID& v, std::ostream* os)
{
CV_Assert((int)v >= 0); CV_Assert((int)v < Conv3DParamID::CONV_LAST);
const Conv3DParam_t& p = testConvolution3DConfigs[(int)v];
*os << "GFLOPS=" << cv::format("%.3f", p.declared_flops * 1e-9)
<< ", K=[" << p.kernel[0] << " x " << p.kernel[1] << " x " << p.kernel[2] << "]"
<< ", IN={" << p.shapeIn.dims[0] << ", " << p.shapeIn.dims[1] << ", " << p.shapeIn.dims[2] << ", " << p.shapeIn.dims[3] << ", " << p.shapeIn.dims[4] << "}"
<< ", OCN=" << p.outCN;
if (p.groups > 1)
*os << ", G=" << p.groups;
if (p.stride[0] * p.stride[1] * p.stride[2] != 1)
*os << ", S=[" << p.stride[0] << " x " << p.stride[1] << " x " << p.stride[2] << "]";
if (p.dilation[0] * p.dilation[1] * p.dilation[2] != 1)
*os << ", D=[" << p.dilation[0] << " x " << p.dilation[1] << " x " << p.dilation[2] << "]";
if (p.pad[0] != 0 && p.pad[1] != 0 && p.pad[2] != 0 &&
p.pad[3] != 0 && p.pad[4] != 0 && p.pad[5] != 0)
*os << ", P=(" << p.pad[0] << ", " << p.pad[3] << ") x ("
<< p.pad[1] << ", " << p.pad[4] << ") x ("
<< p.pad[2] << ", " << p.pad[5] << ")";
if (!((std::string)p.padMode).empty())
*os << ", PM=" << ((std::string)p.padMode);
if (p.hasBias)
*os << ", BIAS";
}
typedef tuple<Conv3DParamID, tuple<Backend, Target> > Conv3DTestParam_t;
typedef TestBaseWithParam<Conv3DTestParam_t> Conv3D;
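// Builds a one-layer 3D convolution from the selected config; only the CPU target and the CUDA backend are exercised.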
PERF_TEST_P_(Conv3D, conv3d)
{
int test_id = (int)get<0>(GetParam());
ASSERT_GE(test_id, 0); ASSERT_LT(test_id, Conv3DParamID::CONV_LAST);
const Conv3DParam_t& params = testConvolution3DConfigs[test_id];
double declared_flops = params.declared_flops;
DictValue kernel = DictValue::arrayInt(&params.kernel[0], 3);
DictValue stride = DictValue::arrayInt(&params.stride[0], 3);
DictValue pad = DictValue::arrayInt(&params.pad[0], 6);
DictValue dilation = DictValue::arrayInt(&params.dilation[0], 3);
MatShape inputShape = MatShape(params.shapeIn.dims, params.shapeIn.dims + 5);
int outChannels = params.outCN;
int groups = params.groups;
std::string padMode(params.padMode);
bool hasBias = params.hasBias;
Backend backendId = get<0>(get<1>(GetParam()));
Target targetId = get<1>(get<1>(GetParam()));
if (targetId != DNN_TARGET_CPU && backendId != DNN_BACKEND_CUDA)
throw SkipTestException("Only CPU and CUDA is supported");
int inChannels = inputShape[1];
int sz[] = {outChannels, inChannels / groups, params.kernel[0], params.kernel[1], params.kernel[2]};
Mat weights(5, &sz[0], CV_32F);
randu(weights, -1.0f, 1.0f);
LayerParams lp;
lp.set("kernel_size", kernel);
lp.set("pad", pad);
if (!padMode.empty())
lp.set("pad_mode", padMode);
lp.set("stride", stride);
lp.set("dilation", dilation);
lp.set("num_output", outChannels);
lp.set("group", groups);
lp.set("bias_term", hasBias);
lp.type = "Convolution";
lp.name = "testLayer";
lp.blobs.push_back(weights);
if (hasBias)
{
Mat bias(1, outChannels, CV_32F);
randu(bias, -1.0f, 1.0f);
lp.blobs.push_back(bias);
}
int inpSz[] = {1, inChannels, inputShape[2], inputShape[3], inputShape[4]};
Mat input(5, &inpSz[0], CV_32F);
randu(input, -1.0f, 1.0f);
Net net;
net.addLayerToPrev(lp.name, lp.type, lp);
net.setInput(input);
net.setPreferableBackend(backendId);
net.setPreferableTarget(targetId);
Mat output = net.forward();
MatShape netInputShape = shape(input);
size_t weightsMemory = 0, blobsMemory = 0;
net.getMemoryConsumption(netInputShape, weightsMemory, blobsMemory);
int64 flops = net.getFLOPS(netInputShape);
CV_Assert(flops > 0);
std::cout
<< "IN=" << divUp(input.total() * input.elemSize(), 1u<<10) << " Kb " << netInputShape
<< " OUT=" << divUp(output.total() * output.elemSize(), 1u<<10) << " Kb " << shape(output)
<< " Weights(parameters): " << divUp(weightsMemory, 1u<<10) << " Kb"
<< " MFLOPS=" << flops * 1e-6 << std::endl;
TEST_CYCLE()
{
Mat res = net.forward();
}
EXPECT_NEAR(flops, declared_flops, declared_flops * 1e-6);
SANITY_CHECK_NOTHING();
}
INSTANTIATE_TEST_CASE_P(/**/, Conv3D, Combine(
Conv3DParamID::all(),
dnnBackendsAndTargets(false, false) // defined in ../test/test_common.hpp
));
} // namespace

View File

@@ -0,0 +1,95 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "perf_precomp.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace opencv_test {
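// Benchmarks the Slice layer: the input blob is filled with a deterministic pattern, cropped by the begin/end ranges, and net.forward() is timed.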
struct Layer_Slice : public TestBaseWithParam<tuple<Backend, Target> >
{
template<int DIMS>
void test_slice(const int* inputShape, const int* begin, const int* end)
{
int backendId = get<0>(GetParam());
int targetId = get<1>(GetParam());
Mat input(DIMS, inputShape, CV_32FC1, Scalar::all(0));
for (int i = 0; i < (int)input.total(); ++i)
input.ptr<float>()[i] = (float)(i & 4095);
std::vector<Range> range(DIMS);
for (int i = 0; i < DIMS; ++i)
range[i] = Range(begin[i], end[i]);
Net net;
LayerParams lp;
lp.type = "Slice";
lp.name = "testLayer";
lp.set("begin", DictValue::arrayInt<int*>((int*)&begin[0], DIMS));
lp.set("end", DictValue::arrayInt<int*>((int*)&end[0], DIMS));
net.addLayerToPrev(lp.name, lp.type, lp);
// warmup
{
net.setInput(input);
net.setPreferableBackend(backendId);
net.setPreferableTarget(targetId);
Mat out = net.forward();
EXPECT_GT(cv::norm(out, NORM_INF), 0);
#if 0
//normAssert(out, input(range));
cout << input(range).clone().reshape(1, 1) << endl;
cout << out.reshape(1, 1) << endl;
#endif
}
TEST_CYCLE()
{
Mat res = net.forward();
}
SANITY_CHECK_NOTHING();
}
};
PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_1)
{
const int inputShape[4] = {1, 64, 104, 104};
const int begin[] = {0, 32, 0, 0};
const int end[] = {1, 64, 104, 104};
test_slice<4>(inputShape, begin, end);
}
PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_2)
{
const int inputShape[4] = {1, 128, 52, 52};
const int begin[] = {0, 64, 0, 0};
const int end[] = {1, 128, 52, 52};
test_slice<4>(inputShape, begin, end);
}
PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_3)
{
const int inputShape[4] = {1, 256, 26, 26};
const int begin[] = {0, 128, 0, 0};
const int end[] = {1, 256, 26, 26};
test_slice<4>(inputShape, begin, end);
}
PERF_TEST_P_(Layer_Slice, FastNeuralStyle_eccv16)
{
const int inputShape[4] = {1, 128, 80, 100};
const int begin[] = {0, 0, 2, 2};
const int end[] = {1, 128, 76, 96};
test_slice<4>(inputShape, begin, end);
}
INSTANTIATE_TEST_CASE_P(/**/, Layer_Slice, dnnBackendsAndTargets(false, false));
} // namespace

View File

@@ -0,0 +1,16 @@
#include "perf_precomp.hpp"
static const char* extraTestDataPath =
#ifdef WINRT
NULL;
#else
getenv("OPENCV_DNN_TEST_DATA_PATH");
#endif
#if defined(HAVE_HPX)
#include <hpx/hpx_main.hpp>
#endif
CV_PERF_TEST_MAIN(dnn,
extraTestDataPath ? (void)cvtest::addDataSearchPath(extraTestDataPath) : (void)0
)

View File

@@ -0,0 +1,305 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "perf_precomp.hpp"
#include "opencv2/core/ocl.hpp"
#include "opencv2/dnn/shape_utils.hpp"
#include "../test/test_common.hpp"
namespace opencv_test {
class DNNTestNetwork : public ::perf::TestBaseWithParam< tuple<Backend, Target> >
{
public:
dnn::Backend backend;
dnn::Target target;
dnn::Net net;
DNNTestNetwork()
{
backend = (dnn::Backend)(int)get<0>(GetParam());
target = (dnn::Target)(int)get<1>(GetParam());
}
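// Loads the model, feeds a random blob of the given size, prints memory consumption and FLOPS statistics, then samples a full forward pass.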
void processNet(std::string weights, std::string proto, std::string halide_scheduler,
const Mat& input, const std::string& outputLayer = "")
{
randu(input, 0.0f, 1.0f);
weights = findDataFile(weights, false);
if (!proto.empty())
proto = findDataFile(proto);
if (backend == DNN_BACKEND_HALIDE)
{
if (halide_scheduler == "disabled")
throw cvtest::SkipTestException("Halide test is disabled");
if (!halide_scheduler.empty())
halide_scheduler = findDataFile(std::string("dnn/halide_scheduler_") + (target == DNN_TARGET_OPENCL ? "opencl_" : "") + halide_scheduler, true);
}
net = readNet(proto, weights);
net.setInput(blobFromImage(input, 1.0, Size(), Scalar(), false));
net.setPreferableBackend(backend);
net.setPreferableTarget(target);
if (backend == DNN_BACKEND_HALIDE)
{
net.setHalideScheduler(halide_scheduler);
}
MatShape netInputShape = shape(1, 3, input.rows, input.cols);
size_t weightsMemory = 0, blobsMemory = 0;
net.getMemoryConsumption(netInputShape, weightsMemory, blobsMemory);
int64 flops = net.getFLOPS(netInputShape);
CV_Assert(flops > 0);
net.forward(outputLayer); // warmup
std::cout << "Memory consumption:" << std::endl;
std::cout << " Weights(parameters): " << divUp(weightsMemory, 1u<<20) << " Mb" << std::endl;
std::cout << " Blobs: " << divUp(blobsMemory, 1u<<20) << " Mb" << std::endl;
std::cout << "Calculation complexity: " << flops * 1e-9 << " GFlops" << std::endl;
PERF_SAMPLE_BEGIN()
net.forward();
PERF_SAMPLE_END()
SANITY_CHECK_NOTHING();
}
};
PERF_TEST_P_(DNNTestNetwork, AlexNet)
{
processNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt",
"alexnet.yml", Mat(cv::Size(227, 227), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, GoogLeNet)
{
processNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt",
"", Mat(cv::Size(224, 224), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, ResNet_50)
{
processNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt",
"resnet_50.yml", Mat(cv::Size(224, 224), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, SqueezeNet_v1_1)
{
processNet("dnn/squeezenet_v1.1.caffemodel", "dnn/squeezenet_v1.1.prototxt",
"squeezenet_v1_1.yml", Mat(cv::Size(227, 227), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, Inception_5h)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) throw SkipTestException("");
processNet("dnn/tensorflow_inception_graph.pb", "",
"inception_5h.yml",
Mat(cv::Size(224, 224), CV_32FC3), "softmax2");
}
PERF_TEST_P_(DNNTestNetwork, ENet)
{
if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target != DNN_TARGET_CPU) ||
(backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException("");
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2021010000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
throw SkipTestException("");
#endif
processNet("dnn/Enet-model-best.net", "", "enet.yml",
Mat(cv::Size(512, 256), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, SSD)
{
processNet("dnn/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel", "dnn/ssd_vgg16.prototxt", "disabled",
Mat(cv::Size(300, 300), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, OpenFace)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018050000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_HDDL))
throw SkipTestException("");
#endif
processNet("dnn/openface_nn4.small2.v1.t7", "", "",
Mat(cv::Size(96, 96), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_Caffe)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", "ssd_mobilenet_v1_coco_2017_11_17.pbtxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/ssd_mobilenet_v2_coco_2018_03_29.pb", "ssd_mobilenet_v2_coco_2018_03_29.pbtxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, DenseNet_121)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", "",
Mat(cv::Size(224, 224), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, OpenPose_pose_mpi_faster_4_stages)
{
if (backend == DNN_BACKEND_HALIDE ||
(backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_HDDL)))
throw SkipTestException("");
// The same .caffemodel but modified .prototxt
// See https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/pose/poseParameters.cpp
processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi_faster_4_stages.prototxt", "",
Mat(cv::Size(368, 368), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, opencv_face_detector)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/opencv_face_detector.caffemodel", "dnn/opencv_face_detector.prototxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "ssd_inception_v2_coco_2017_11_17.pbtxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, YOLOv3)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000) // nGraph compilation failure
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
throw SkipTestException("Test is disabled in OpenVINO 2020.4");
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
throw SkipTestException("Test is disabled in OpenVINO 2020.4");
#endif
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2021010000) // nGraph compilation failure
if (target == DNN_TARGET_MYRIAD)
throw SkipTestException("");
#endif
Mat sample = imread(findDataFile("dnn/dog416.png"));
cvtColor(sample, sample, COLOR_BGR2RGB);
Mat inp;
sample.convertTo(inp, CV_32FC3, 1.0f / 255, 0);
processNet("dnn/yolov3.weights", "dnn/yolov3.cfg", "", inp);
}
PERF_TEST_P_(DNNTestNetwork, YOLOv4)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
if (target == DNN_TARGET_MYRIAD) // not enough resources
throw SkipTestException("");
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000) // nGraph compilation failure
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
throw SkipTestException("Test is disabled in OpenVINO 2020.4");
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
throw SkipTestException("Test is disabled in OpenVINO 2020.4");
#endif
Mat sample = imread(findDataFile("dnn/dog416.png"));
cvtColor(sample, sample, COLOR_BGR2RGB);
Mat inp;
sample.convertTo(inp, CV_32FC3, 1.0f / 255, 0);
processNet("dnn/yolov4.weights", "dnn/yolov4.cfg", "", inp);
}
PERF_TEST_P_(DNNTestNetwork, YOLOv4_tiny)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2021010000) // nGraph compilation failure
if (target == DNN_TARGET_MYRIAD)
throw SkipTestException("");
#endif
Mat sample = imread(findDataFile("dnn/dog416.png"));
cvtColor(sample, sample, COLOR_BGR2RGB);
Mat inp;
sample.convertTo(inp, CV_32FC3, 1.0f / 255, 0);
processNet("dnn/yolov4-tiny.weights", "dnn/yolov4-tiny.cfg", "", inp);
}
PERF_TEST_P_(DNNTestNetwork, EAST_text_detection)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/frozen_east_text_detection.pb", "", "", Mat(cv::Size(320, 320), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, FastNeuralStyle_eccv16)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", "", Mat(cv::Size(320, 240), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, Inception_v2_Faster_RCNN)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019010000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
throw SkipTestException("Test is disabled in OpenVINO 2019R1");
#endif
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019020000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
throw SkipTestException("Test is disabled in OpenVINO 2019R2");
#endif
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2021010000)
if (target == DNN_TARGET_MYRIAD)
throw SkipTestException("Test is disabled in OpenVINO 2021.1+ / MYRIAD");
#endif
if (backend == DNN_BACKEND_HALIDE ||
(backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target != DNN_TARGET_CPU) ||
(backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException("");
processNet("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pb",
"dnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt", "",
Mat(cv::Size(800, 600), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, EfficientDet)
{
if (backend == DNN_BACKEND_HALIDE || target != DNN_TARGET_CPU)
throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/dog416.png"));
resize(sample, sample, Size(512, 512));
Mat inp;
sample.convertTo(inp, CV_32FC3, 1.0/255);
processNet("dnn/efficientdet-d0.pb", "dnn/efficientdet-d0.pbtxt", "", inp);
}
INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets());
} // namespace

View File

@@ -0,0 +1,14 @@
#ifndef __OPENCV_PERF_PRECOMP_HPP__
#define __OPENCV_PERF_PRECOMP_HPP__
#include <opencv2/ts.hpp>
#include <opencv2/dnn.hpp>
#include "../test/test_common.hpp"
namespace opencv_test {
using namespace perf;
using namespace cv::dnn;
} // namespace
#endif

View File

@@ -0,0 +1,593 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#ifdef HAVE_PROTOBUF
#include <iostream>
#include <fstream>
#include <sstream>
#include <algorithm>
#include <google/protobuf/message.h>
#include <google/protobuf/text_format.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include "caffe_io.hpp"
#endif
namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN
#ifdef HAVE_PROTOBUF
using ::google::protobuf::RepeatedField;
using ::google::protobuf::RepeatedPtrField;
using ::google::protobuf::Message;
using ::google::protobuf::Descriptor;
using ::google::protobuf::FieldDescriptor;
using ::google::protobuf::Reflection;
namespace
{
template<typename T>
static cv::String toString(const T &v)
{
std::ostringstream ss;
ss << v;
return ss.str();
}
static inline
MatShape parseBlobShape(const caffe::BlobShape& _input_shape)
{
MatShape shape;
for (int i = 0; i < _input_shape.dim_size(); i++)
{
shape.push_back((int)_input_shape.dim(i));
}
return shape;
}
class CaffeImporter
{
caffe::NetParameter net;
caffe::NetParameter netBinary;
public:
CaffeImporter(const char *prototxt, const char *caffeModel)
{
CV_TRACE_FUNCTION();
ReadNetParamsFromTextFileOrDie(prototxt, &net);
if (caffeModel && caffeModel[0])
ReadNetParamsFromBinaryFileOrDie(caffeModel, &netBinary);
}
CaffeImporter(const char *dataProto, size_t lenProto,
const char *dataModel, size_t lenModel)
{
CV_TRACE_FUNCTION();
ReadNetParamsFromTextBufferOrDie(dataProto, lenProto, &net);
if (dataModel != NULL && lenModel > 0)
ReadNetParamsFromBinaryBufferOrDie(dataModel, lenModel, &netBinary);
}
void extractCustomParams(const google::protobuf::UnknownFieldSet& unknownFields, cv::dnn::LayerParams &params)
{
const int numFields = unknownFields.field_count();
for (int i = 0; i < numFields; ++i)
{
const google::protobuf::UnknownField& field = unknownFields.field(i);
CV_Assert(field.type() == google::protobuf::UnknownField::TYPE_GROUP);
std::string fieldName = field.group().field(0).length_delimited();
std::string fieldValue = field.group().field(1).length_delimited();
params.set(fieldName, fieldValue);
}
}
void addParam(const Message &msg, const FieldDescriptor *field, cv::dnn::LayerParams &params)
{
const Reflection *refl = msg.GetReflection();
int type = field->cpp_type();
bool isRepeated = field->is_repeated();
const std::string &name = field->name();
#define SET_UP_FILED(getter, arrayConstr, gtype) \
if (isRepeated) { \
const RepeatedField<gtype> &v = refl->GetRepeatedField<gtype>(msg, field); \
params.set(name, DictValue::arrayConstr(v.begin(), (int)v.size())); \
} \
else { \
params.set(name, refl->getter(msg, field)); \
}
switch (type)
{
case FieldDescriptor::CPPTYPE_INT32:
SET_UP_FILED(GetInt32, arrayInt, ::google::protobuf::int32);
break;
case FieldDescriptor::CPPTYPE_UINT32:
SET_UP_FILED(GetUInt32, arrayInt, ::google::protobuf::uint32);
break;
case FieldDescriptor::CPPTYPE_INT64:
SET_UP_FILED(GetInt64, arrayInt, ::google::protobuf::int64);
break;
case FieldDescriptor::CPPTYPE_UINT64:
SET_UP_FILED(GetUInt64, arrayInt, ::google::protobuf::uint64);
break;
case FieldDescriptor::CPPTYPE_BOOL:
SET_UP_FILED(GetBool, arrayInt, bool);
break;
case FieldDescriptor::CPPTYPE_DOUBLE:
SET_UP_FILED(GetDouble, arrayReal, double);
break;
case FieldDescriptor::CPPTYPE_FLOAT:
SET_UP_FILED(GetFloat, arrayReal, float);
break;
case FieldDescriptor::CPPTYPE_STRING:
if (isRepeated) {
const RepeatedPtrField<std::string> &v = refl->GetRepeatedPtrField<std::string>(msg, field);
params.set(name, DictValue::arrayString(v.begin(), (int)v.size()));
}
else {
params.set(name, refl->GetString(msg, field));
}
break;
case FieldDescriptor::CPPTYPE_ENUM:
if (isRepeated) {
int size = refl->FieldSize(msg, field);
std::vector<cv::String> buf(size);
for (int i = 0; i < size; i++)
buf[i] = refl->GetRepeatedEnum(msg, field, i)->name();
params.set(name, DictValue::arrayString(buf.begin(), size));
}
else {
params.set(name, refl->GetEnum(msg, field)->name());
}
break;
default:
CV_Error(Error::StsError, "Unknown type \"" + String(field->type_name()) + "\" in prototxt");
break;
}
}
inline static bool ends_with_param(const std::string &str)
{
static const std::string _param("_param");
return (str.size() >= _param.size()) && str.compare(str.size() - _param.size(), _param.size(), _param) == 0;
}
void extractLayerParams(const Message &msg, cv::dnn::LayerParams &params, bool isInternal = false)
{
const Descriptor *msgDesc = msg.GetDescriptor();
const Reflection *msgRefl = msg.GetReflection();
for (int fieldId = 0; fieldId < msgDesc->field_count(); fieldId++)
{
const FieldDescriptor *fd = msgDesc->field(fieldId);
if (!isInternal && !ends_with_param(fd->name()))
continue;
const google::protobuf::UnknownFieldSet& unknownFields = msgRefl->GetUnknownFields(msg);
bool hasData = fd->is_required() ||
(fd->is_optional() && msgRefl->HasField(msg, fd)) ||
(fd->is_repeated() && msgRefl->FieldSize(msg, fd) > 0) ||
!unknownFields.empty();
if (!hasData)
continue;
extractCustomParams(unknownFields, params);
if (fd->cpp_type() == FieldDescriptor::CPPTYPE_MESSAGE)
{
if (fd->is_repeated()) //Extract only first item!
extractLayerParams(msgRefl->GetRepeatedMessage(msg, fd, 0), params, true);
else
extractLayerParams(msgRefl->GetMessage(msg, fd), params, true);
}
else
{
addParam(msg, fd, params);
}
}
}
void blobShapeFromProto(const caffe::BlobProto &pbBlob, MatShape& shape)
{
shape.clear();
if (pbBlob.has_num() || pbBlob.has_channels() || pbBlob.has_height() || pbBlob.has_width())
{
shape.push_back(pbBlob.num());
shape.push_back(pbBlob.channels());
shape.push_back(pbBlob.height());
shape.push_back(pbBlob.width());
}
else if (pbBlob.has_shape())
{
shape = parseBlobShape(pbBlob.shape());
}
else
shape.resize(1, 1); // Is a scalar.
}
void blobFromProto(const caffe::BlobProto &pbBlob, cv::Mat &dstBlob)
{
MatShape shape;
blobShapeFromProto(pbBlob, shape);
dstBlob.create((int)shape.size(), &shape[0], CV_32F);
if (pbBlob.data_size())
{
// Single precision floats.
CV_Assert(pbBlob.data_size() == (int)dstBlob.total());
CV_DbgAssert(pbBlob.GetDescriptor()->FindFieldByLowercaseName("data")->cpp_type() == FieldDescriptor::CPPTYPE_FLOAT);
Mat(dstBlob.dims, &dstBlob.size[0], CV_32F, (void*)pbBlob.data().data()).copyTo(dstBlob);
}
else
{
CV_Assert(pbBlob.has_raw_data());
const std::string& raw_data = pbBlob.raw_data();
if (pbBlob.raw_data_type() == caffe::FLOAT16)
{
// Half precision floats.
CV_Assert(raw_data.size() / 2 == (int)dstBlob.total());
Mat halfs((int)shape.size(), &shape[0], CV_16SC1, (void*)raw_data.c_str());
convertFp16(halfs, dstBlob);
}
else if (pbBlob.raw_data_type() == caffe::FLOAT)
{
CV_Assert(raw_data.size() / 4 == (int)dstBlob.total());
Mat((int)shape.size(), &shape[0], CV_32FC1, (void*)raw_data.c_str()).copyTo(dstBlob);
}
else
CV_Error(Error::StsNotImplemented, "Unexpected blob data type");
}
}
void extractBinaryLayerParams(const caffe::LayerParameter& layer, LayerParams& layerParams)
{
const std::string &name = layer.name();
int li;
for (li = 0; li != netBinary.layer_size(); li++)
{
const caffe::LayerParameter& binLayer = netBinary.layer(li);
// Break if the layer name is the same and the blobs are not cleared
if (binLayer.name() == name && binLayer.blobs_size() != 0)
break;
}
if (li == netBinary.layer_size())
return;
caffe::LayerParameter* binLayer = netBinary.mutable_layer(li);
const int numBlobs = binLayer->blobs_size();
std::vector<caffe::BlobProto*> blobs(numBlobs);
binLayer->mutable_blobs()->ExtractSubrange(0, numBlobs, blobs.data());
layerParams.blobs.resize(numBlobs);
for (int bi = 0; bi < numBlobs; bi++)
{
blobFromProto(*blobs[bi], layerParams.blobs[bi]);
delete blobs[bi];
}
}
struct BlobNote
{
BlobNote(const std::string &_name, int _layerId, int _outNum) :
name(_name), layerId(_layerId), outNum(_outNum) {}
std::string name;
int layerId, outNum;
};
std::vector<BlobNote> addedBlobs;
std::map<String, int> layerCounter;
void populateNet(Net dstNet)
{
CV_TRACE_FUNCTION();
int layersSize = net.layer_size();
layerCounter.clear();
addedBlobs.clear();
addedBlobs.reserve(layersSize + 1);
//setup input layer names
std::vector<String> netInputs(net.input_size());
std::vector<MatShape> inp_shapes;
{
int net_input_size = net.input_size();
for (int inNum = 0; inNum < net_input_size; inNum++)
{
addedBlobs.push_back(BlobNote(net.input(inNum), 0, inNum));
netInputs[inNum] = net.input(inNum);
}
if (net.input_dim_size() > 0) // deprecated in Caffe proto
{
int net_input_dim_size = net.input_dim_size();
CV_Check(net_input_dim_size, net_input_dim_size % 4 == 0, "");
CV_CheckEQ(net_input_dim_size, net_input_size * 4, "");
for (int inp_id = 0; inp_id < net_input_size; inp_id++)
{
int dim = inp_id * 4;
MatShape shape(4);
shape[0] = net.input_dim(dim);
shape[1] = net.input_dim(dim+1);
shape[2] = net.input_dim(dim+2);
shape[3] = net.input_dim(dim+3);
inp_shapes.push_back(shape);
}
}
else if (net.input_shape_size() > 0) // deprecated in Caffe proto
{
int net_input_shape_size = net.input_shape_size();
CV_CheckEQ(net_input_shape_size, net_input_size, "");
for (int inp_id = 0; inp_id < net_input_shape_size; inp_id++)
{
MatShape shape = parseBlobShape(net.input_shape(inp_id));
inp_shapes.push_back(shape);
}
}
else
{
for (int inp_id = 0; inp_id < net_input_size; inp_id++)
{
MatShape shape; // empty
inp_shapes.push_back(shape);
}
}
}
for (int li = 0; li < layersSize; li++)
{
const caffe::LayerParameter &layer = net.layer(li);
String name = layer.name();
String type = layer.type();
LayerParams layerParams;
extractLayerParams(layer, layerParams);
extractBinaryLayerParams(layer, layerParams);
int repetitions = layerCounter[name]++;
if (repetitions)
name += String("_") + toString(repetitions);
if (type == "Input")
{
for (int outNum = 0; outNum < layer.top_size(); outNum++)
{
addOutput(layer, 0, outNum);
addedBlobs.back().outNum = netInputs.size();
netInputs.push_back(addedBlobs.back().name);
}
if (layer.has_input_param())
{
const caffe::InputParameter &inputParameter = layer.input_param();
int input_shape_size = inputParameter.shape_size();
CV_CheckEQ(input_shape_size, layer.top_size(), "");
for (int inp_id = 0; inp_id < input_shape_size; inp_id++)
{
MatShape shape = parseBlobShape(inputParameter.shape(inp_id));
inp_shapes.push_back(shape);
}
}
continue;
}
else if (type == "BatchNorm")
{
if (!layerParams.get<bool>("use_global_stats", true))
{
CV_Assert_N(layer.bottom_size() == 1, layer.top_size() == 1);
LayerParams mvnParams;
mvnParams.set("eps", layerParams.get<float>("eps", 1e-5));
std::string mvnName = name + "/mvn";
int repetitions = layerCounter[mvnName]++;
if (repetitions)
mvnName += String("_") + toString(repetitions);
int mvnId = dstNet.addLayer(mvnName, "MVN", mvnParams);
addInput(layer.bottom(0), mvnId, 0, dstNet);
addOutput(layer, mvnId, 0);
net.mutable_layer(li)->set_bottom(0, layer.top(0));
layerParams.blobs[0].setTo(0); // mean
layerParams.blobs[1].setTo(1); // std
}
}
else if (type == "Axpy")
{
CV_Assert_N(layer.bottom_size() == 3, layer.top_size() == 1);
std::string scaleName = name + "/scale";
int repetitions = layerCounter[scaleName]++;
if (repetitions) {
scaleName += String("_") + toString(repetitions);
}
LayerParams scaleParams;
scaleParams.set("axis", 1);
scaleParams.set("has_bias", false);
int scaleId = dstNet.addLayer(scaleName, "Scale", scaleParams);
addInput(layer.bottom(2), scaleId, 0, dstNet);
addInput(layer.bottom(0), scaleId, 1, dstNet);
addOutput(layer, scaleId, 0);
net.mutable_layer(li)->set_bottom(0, layer.top(0));
net.mutable_layer(li)->mutable_bottom()->RemoveLast();
type = "Eltwise";
}
else if (type == "Resample")
{
CV_Assert(layer.bottom_size() == 1 || layer.bottom_size() == 2);
type = "Resize";
String interp = toLowerCase(layerParams.get<String>("type"));
layerParams.set("interpolation", interp == "linear" ? "bilinear" : interp);
if (layerParams.has("factor"))
{
float factor = layerParams.get<float>("factor");
CV_Assert(layer.bottom_size() != 2 || factor == 1.0);
layerParams.set("zoom_factor", factor);
if ((interp == "linear" && factor != 1.0) ||
(interp == "nearest" && factor < 1.0))
CV_Error(Error::StsNotImplemented, "Unsupported Resample mode");
}
}
else if ("Convolution" == type)
{
CV_Assert(layer.bottom_size() == layer.top_size());
for (int i = 0; i < layer.bottom_size(); i++)
{
int conv_id = dstNet.addLayer(layer.top(i), type, layerParams);
addInput(layer.bottom(i), conv_id, 0, dstNet);
addedBlobs.push_back(BlobNote(layer.top(i), conv_id, 0));
}
continue;
}
else if ("ConvolutionDepthwise" == type)
{
type = "Convolution";
}
int id = dstNet.addLayer(name, type, layerParams);
for (int inNum = 0; inNum < layer.bottom_size(); inNum++)
addInput(layer.bottom(inNum), id, inNum, dstNet);
for (int outNum = 0; outNum < layer.top_size(); outNum++)
addOutput(layer, id, outNum);
}
dstNet.setInputsNames(netInputs);
if (inp_shapes.size() > 0)
{
CV_CheckEQ(inp_shapes.size(), netInputs.size(), "");
for (int inp_id = 0; inp_id < inp_shapes.size(); inp_id++)
dstNet.setInputShape(netInputs[inp_id], inp_shapes[inp_id]);
}
addedBlobs.clear();
}
void addOutput(const caffe::LayerParameter &layer, int layerId, int outNum)
{
const std::string &name = layer.top(outNum);
bool haveDups = false;
for (int idx = (int)addedBlobs.size() - 1; idx >= 0; idx--)
{
if (addedBlobs[idx].name == name)
{
haveDups = true;
break;
}
}
if (haveDups)
{
bool isInplace = layer.bottom_size() > outNum && layer.bottom(outNum) == name;
if (!isInplace)
CV_Error(Error::StsBadArg, "Duplicate blobs produced by multiple sources");
}
addedBlobs.push_back(BlobNote(name, layerId, outNum));
}
void addInput(const std::string &name, int layerId, int inNum, Net &dstNet)
{
int idx;
for (idx = (int)addedBlobs.size() - 1; idx >= 0; idx--)
{
if (addedBlobs[idx].name == name)
break;
}
if (idx < 0)
{
CV_Error(Error::StsObjectNotFound, "Can't find output blob \"" + name + "\"");
return;
}
dstNet.connect(addedBlobs[idx].layerId, addedBlobs[idx].outNum, layerId, inNum);
}
};
}
Net readNetFromCaffe(const String &prototxt, const String &caffeModel /*= String()*/)
{
CaffeImporter caffeImporter(prototxt.c_str(), caffeModel.c_str());
Net net;
caffeImporter.populateNet(net);
return net;
}
Net readNetFromCaffe(const char *bufferProto, size_t lenProto,
const char *bufferModel, size_t lenModel)
{
CaffeImporter caffeImporter(bufferProto, lenProto, bufferModel, lenModel);
Net net;
caffeImporter.populateNet(net);
return net;
}
Net readNetFromCaffe(const std::vector<uchar>& bufferProto, const std::vector<uchar>& bufferModel)
{
const char* bufferProtoPtr = reinterpret_cast<const char*>(&bufferProto[0]);
const char* bufferModelPtr = bufferModel.empty() ? NULL :
reinterpret_cast<const char*>(&bufferModel[0]);
return readNetFromCaffe(bufferProtoPtr, bufferProto.size(),
bufferModelPtr, bufferModel.size());
}
#endif //HAVE_PROTOBUF
CV__DNN_INLINE_NS_END
}} // namespace
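For context, this is roughly how a client drives the importer defined above. A minimal sketch, assuming placeholder file names and an image that exists on disk:

#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>

int main()
{
    // Any Caffe deploy prototxt + weights pair works; these paths are placeholders.
    cv::dnn::Net net = cv::dnn::readNetFromCaffe("deploy.prototxt", "weights.caffemodel");
    cv::Mat img = cv::imread("input.jpg");
    // NCHW float32 blob, 1x3x224x224; the importer's Input handling expects this layout.
    cv::Mat blob = cv::dnn::blobFromImage(img, 1.0, cv::Size(224, 224), cv::Scalar(), /*swapRB=*/false);
    net.setInput(blob);
    cv::Mat out = net.forward();
    return 0;
}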

File diff suppressed because it is too large

View File

@@ -0,0 +1,129 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/*M///////////////////////////////////////////////////////////////////////////////////////
//COPYRIGHT
//
//All contributions by the University of California:
//Copyright (c) 2014, The Regents of the University of California (Regents)
//All rights reserved.
//
//All other contributions:
//Copyright (c) 2014, the respective contributors
//All rights reserved.
//
//Caffe uses a shared copyright model: each contributor holds copyright over
//their contributions to Caffe. The project versioning records all such
//contribution and copyright details. If a contributor wants to further mark
//their specific copyright on a particular contribution, they should indicate
//their copyright solely in the commit message of the change when it is
//committed.
//
//LICENSE
//
//Redistribution and use in source and binary forms, with or without
//modification, are permitted provided that the following conditions are met:
//
//1. Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
//THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
//ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
//WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
//DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
//ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
//(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
//LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
//ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
//(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
//SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//CONTRIBUTION AGREEMENT
//
//By contributing to the BVLC/caffe repository through pull-request, comment,
//or otherwise, the contributor releases their content to the
//license and copyright terms herein.
//
//M*/
#ifndef __OPENCV_DNN_CAFFE_IO_HPP__
#define __OPENCV_DNN_CAFFE_IO_HPP__
#ifdef HAVE_PROTOBUF
#if defined(__GNUC__) && __GNUC__ >= 5
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsuggest-override"
#endif
#include "opencv-caffe.pb.h"
#if defined(__GNUC__) && __GNUC__ >= 5
#pragma GCC diagnostic pop
#endif
namespace caffe { using namespace opencv_caffe; } // avoid massive renames from caffe proto package
namespace cv {
namespace dnn {
// Read parameters from a file into a NetParameter proto message.
void ReadNetParamsFromTextFileOrDie(const char* param_file,
caffe::NetParameter* param);
void ReadNetParamsFromBinaryFileOrDie(const char* param_file,
caffe::NetParameter* param);
// Read parameters from a memory buffer into a NetParameter proto message.
void ReadNetParamsFromBinaryBufferOrDie(const char* data, size_t len,
caffe::NetParameter* param);
void ReadNetParamsFromTextBufferOrDie(const char* data, size_t len,
caffe::NetParameter* param);
// Utility functions used internally by Caffe and TensorFlow loaders
bool ReadProtoFromTextFile(const char* filename, ::google::protobuf::Message* proto);
bool ReadProtoFromBinaryFile(const char* filename, ::google::protobuf::Message* proto);
bool ReadProtoFromTextBuffer(const char* data, size_t len, ::google::protobuf::Message* proto);
bool ReadProtoFromBinaryBuffer(const char* data, size_t len, ::google::protobuf::Message* proto);
}
}
#endif
#endif
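A minimal sketch of how these helpers are driven. They are internal to the dnn module, so the snippet below only compiles inside the module sources, and the prototxt path is a placeholder; parse failures are reported through cv::error via the glog emulation shown later in this commit:

#include "caffe_io.hpp"
#include <iostream>

// Dump layer names and types from a deploy prototxt.
static void dumpLayerNames(const char* prototxtPath)
{
    caffe::NetParameter param;
    cv::dnn::ReadNetParamsFromTextFileOrDie(prototxtPath, &param);
    for (int i = 0; i < param.layer_size(); ++i)
        std::cout << param.layer(i).name() << " : " << param.layer(i).type() << std::endl;
}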

View File

@@ -0,0 +1,80 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
#ifdef HAVE_PROTOBUF
#include <fstream>
#include "caffe_io.hpp"
#endif
namespace cv { namespace dnn {
CV__DNN_INLINE_NS_BEGIN
#ifdef HAVE_PROTOBUF
void shrinkCaffeModel(const String& src, const String& dst, const std::vector<String>& layersTypes)
{
CV_TRACE_FUNCTION();
std::vector<String> types(layersTypes);
if (types.empty())
{
types.push_back("Convolution");
types.push_back("InnerProduct");
}
caffe::NetParameter net;
ReadNetParamsFromBinaryFileOrDie(src.c_str(), &net);
for (int i = 0; i < net.layer_size(); ++i)
{
caffe::LayerParameter* lp = net.mutable_layer(i);
if (std::find(types.begin(), types.end(), lp->type()) == types.end())
{
continue;
}
for (int j = 0; j < lp->blobs_size(); ++j)
{
caffe::BlobProto* blob = lp->mutable_blobs(j);
CV_Assert(blob->data_size() != 0); // float32 array.
Mat floats(1, blob->data_size(), CV_32FC1, (void*)blob->data().data());
Mat halfs(1, blob->data_size(), CV_16SC1);
convertFp16(floats, halfs); // Convert to float16.
blob->clear_data(); // Clear float32 data.
// Set float16 data.
blob->set_raw_data(halfs.data, halfs.total() * halfs.elemSize());
blob->set_raw_data_type(caffe::FLOAT16);
}
}
#if GOOGLE_PROTOBUF_VERSION < 3005000
size_t msgSize = saturate_cast<size_t>(net.ByteSize());
#else
size_t msgSize = net.ByteSizeLong();
#endif
std::vector<uint8_t> output(msgSize);
net.SerializeWithCachedSizesToArray(&output[0]);
std::ofstream ofs(dst.c_str(), std::ios::binary);
ofs.write((const char*)&output[0], msgSize);
ofs.close();
}
#else
void shrinkCaffeModel(const String& src, const String& dst, const std::vector<String>& types)
{
CV_Error(cv::Error::StsNotImplemented, "libprotobuf required to import data from Caffe models");
}
#endif // HAVE_PROTOBUF
CV__DNN_INLINE_NS_END
}} // namespace
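Typical usage of the public entry point defined above; a minimal sketch with placeholder paths. With an empty layer-type list, only Convolution and InnerProduct blobs are converted to FP16, as the defaults in the function show:

#include <opencv2/dnn.hpp>

int main()
{
    cv::dnn::shrinkCaffeModel("model.caffemodel", "model_fp16.caffemodel");
    return 0;
}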

View File

@@ -0,0 +1,106 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_DNN_CAFFE_GLOG_EMULATOR_HPP__
#define __OPENCV_DNN_CAFFE_GLOG_EMULATOR_HPP__
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <sstream>
#include <opencv2/core.hpp>
#define CHECK(cond) for(cv::dnn::GLogWrapper _logger(__FILE__, CV_Func, __LINE__, "CHECK", #cond, cond); _logger.exit(); _logger.check()) _logger.stream()
#define CHECK_EQ(a, b) for(cv::dnn::GLogWrapper _logger(__FILE__, CV_Func, __LINE__, "CHECK", #a"="#b, ((a) == (b))); _logger.exit(); _logger.check()) _logger.stream()
#define LOG(TYPE) for(cv::dnn::GLogWrapper _logger(__FILE__, CV_Func, __LINE__, #TYPE); _logger.exit(); _logger.check()) _logger.stream()
namespace cv
{
namespace dnn
{
class GLogWrapper
{
const char *file, *func, *type, *cond_str;
int line;
bool cond_status, exit_loop;
std::stringstream sstream;
public:
GLogWrapper(const char *_file, const char *_func, int _line,
const char *_type,
const char *_cond_str = NULL, bool _cond_status = true
) :
file(_file), func(_func), type(_type), cond_str(_cond_str),
line(_line), cond_status(_cond_status), exit_loop(true) {}
std::iostream &stream()
{
return sstream;
}
bool exit()
{
return exit_loop;
}
void check()
{
exit_loop = false;
if (cond_str && !cond_status)
{
cv::error(cv::Error::StsError, "FAILED: " + String(cond_str) + ". " + sstream.str(), func, file, line);
}
else if (!cond_str && strcmp(type, "CHECK"))
{
#ifndef NDEBUG
if (!std::strcmp(type, "INFO"))
std::cout << sstream.str() << std::endl;
else
std::cerr << sstream.str() << std::endl;
#endif
}
}
};
}
}
#endif
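These macros emulate just enough of glog for the Caffe I/O code: a failed CHECK turns into a cv::error call (hence a cv::Exception) instead of an abort, and LOG output is printed only in non-NDEBUG builds. A small usage sketch with hypothetical values:

// Inside dnn sources that include glog_emulator.hpp:
int expected = 4, got = 4;
CHECK(got > 0) << "count must be positive";   // on failure: cv::error(StsError, "FAILED: got > 0. ...")
CHECK_EQ(expected, got) << "size mismatch";   // same, with the stringified condition in the message
LOG(INFO) << "count = " << got;               // written to std::cout, debug builds only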

File diff suppressed because it is too large

View File

@@ -0,0 +1,121 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "functors.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, class ActivationOp, class EltwiseOp, std::size_t N>
__global__ void generic_op_eltwise_op_inplace_vec(Span<T> inplace_output, View<T> eltwise, const typename ActivationOp::Params act_params, const typename EltwiseOp::Params eltwise_params) {
using vector_type = get_vector_type_t<T, N>;
auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());
auto eltwise_vPtr = vector_type::get_pointer(eltwise.data());
ActivationOp activation_op(act_params);
EltwiseOp eltwise_op(eltwise_params);
for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
vector_type output_vec, eltwise_vec;
v_load(output_vec, inplace_output_vPtr[i]);
v_load(eltwise_vec, eltwise_vPtr[i]);
for(int j = 0; j < output_vec.size(); j++)
output_vec.data[j] = eltwise_op(activation_op(output_vec.data[j]), eltwise_vec.data[j]);
v_store(inplace_output_vPtr[i], output_vec);
}
}
}
template <class T, class ActivationOp, class EltwiseOp, std::size_t N> static
void launch_vectorized_generic_op_eltwise_op_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, const typename ActivationOp::Params& act_params, const typename EltwiseOp::Params& eltwise_params) {
CV_Assert(is_fully_aligned<T>(inplace_output, N));
CV_Assert(is_fully_aligned<T>(eltwise, N));
auto kernel = raw::generic_op_eltwise_op_inplace_vec<T, ActivationOp, EltwiseOp, N>;
auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
launch_kernel(kernel, policy, inplace_output, eltwise, act_params, eltwise_params);
}
template <class T, class ActivationOp, class EltwiseOp> static
void generic_op_eltwise_op_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, const typename ActivationOp::Params& act_params = {}, const typename EltwiseOp::Params& eltwise_params = {}) {
CV_Assert(inplace_output.size() == eltwise.size());
if (is_fully_aligned<T>(inplace_output, 4) && is_fully_aligned<T>(eltwise, 4)) {
launch_vectorized_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 4>(stream, inplace_output, eltwise, act_params, eltwise_params);
} else if (is_fully_aligned<T>(inplace_output, 2) && is_fully_aligned<T>(eltwise, 2)) {
launch_vectorized_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 2>(stream, inplace_output, eltwise, act_params, eltwise_params);
} else {
launch_vectorized_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 1>(stream, inplace_output, eltwise, act_params, eltwise_params);
}
}
template <class T>
void relu_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, T slope) {
generic_op_eltwise_op_inplace<T, ReLUFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise, {slope});
}
template <class T>
void clipped_relu_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, T floor, T ceiling) {
CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
generic_op_eltwise_op_inplace<T, ClippedReLUFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise, {floor, ceiling});
}
template <class T>
void tanh_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
generic_op_eltwise_op_inplace<T, TanHFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}
template <class T>
void swish_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
generic_op_eltwise_op_inplace<T, SwishFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}
template <class T>
void mish_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
generic_op_eltwise_op_inplace<T, MishFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}
template <class T>
void sigmoid_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
generic_op_eltwise_op_inplace<T, SigmoidFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}
template <class T>
void power_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, T exp, T scale, T shift) {
generic_op_eltwise_op_inplace<T, PowerFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise, {exp, scale, shift});
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>, __half);
template void clipped_relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
template void tanh_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void swish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void mish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void sigmoid_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void power_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half);
#endif
template void relu_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>, float);
template void clipped_relu_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>, float, float);
template void tanh_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void swish_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void mish_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void sigmoid_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void power_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>, float, float, float);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
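Ignoring vectorized loads and the launch machinery, the fused kernel above applies the activation to the in-place output and then combines it with the eltwise operand. A scalar C++ reference for the ReLU-plus-sum specialization (a sketch; both buffers are assumed to have equal size, as the kernel asserts):

#include <vector>
#include <cstddef>

// Scalar reference of relu_eltwise_sum_2_inplace:
// output[i] = leaky_relu(output[i], slope) + eltwise[i].
static void relu_sum_inplace_ref(std::vector<float>& output, const std::vector<float>& eltwise, float slope)
{
    for (std::size_t i = 0; i < output.size(); ++i)
    {
        float activated = output[i] > 0.f ? output[i] : slope * output[i];
        output[i] = activated + eltwise[i];
    }
}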

View File

@@ -0,0 +1,209 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "functors.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/kernels/scale_shift.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, class ActivationOp, std::size_t N>
__global__ void generic_op_vec(Span<T> output, View<T> input, const typename ActivationOp::Params params) {
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
ActivationOp activation_op(params);
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
vector_type vec;
v_load(vec, input_vPtr[i]);
for (int j = 0; j < vector_type::size(); j++)
vec.data[j] = activation_op(vec.data[j]);
v_store(output_vPtr[i], vec);
}
}
template <class T, std::size_t N>
__global__ void axiswise_relu_vec(Span<T> output, View<T> input, size_type inner_size, View<T> slope) {
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
const index_type c = (i / inner_size) % slope.size();
vector_type vec;
v_load(vec, input_vPtr[i]);
for (int j = 0; j < vector_type::size(); j++)
vec.data[j] = vec.data[j] > T(0) ? vec.data[j] : vec.data[j] * slope[c];
v_store(output_vPtr[i], vec);
}
}
} /* namespace raw */
template <class T, class ActivationOp, std::size_t N> static
void launch_vectorized_generic_op(const Stream& stream, Span<T> output, View<T> input, const typename ActivationOp::Params& params) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
auto kernel = raw::generic_op_vec<T, ActivationOp, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input, params);
}
template <class T, class ActivationOp> static
void generic_op(const Stream& stream, Span<T> output, View<T> input, const typename ActivationOp::Params& params = {}) {
CV_Assert(input.size() == output.size());
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
launch_vectorized_generic_op<T, ActivationOp, 4>(stream, output, input, params);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
launch_vectorized_generic_op<T, ActivationOp, 2>(stream, output, input, params);
} else {
launch_vectorized_generic_op<T, ActivationOp, 1>(stream, output, input, params);
}
}
template <class T>
void relu(const Stream& stream, Span<T> output, View<T> input, T slope) {
generic_op<T, ReLUFunctor<T>>(stream, output, input, {slope});
}
template <class T>
void clipped_relu(const Stream& stream, Span<T> output, View<T> input, T floor, T ceiling) {
CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
generic_op<T, ClippedReLUFunctor<T>>(stream, output, input, {floor, ceiling});
}
template <class T>
void tanh(const Stream& stream, Span<T> output, View<T> input) {
generic_op<T, TanHFunctor<T>>(stream, output, input);
}
template <class T>
void swish(const Stream& stream, Span<T> output, View<T> input) {
generic_op<T, SwishFunctor<T>>(stream, output, input);
}
template <class T>
void mish(const Stream& stream, Span<T> output, View<T> input) {
generic_op<T, MishFunctor<T>>(stream, output, input);
}
template <class T>
void sigmoid(const Stream& stream, Span<T> output, View<T> input) {
generic_op<T, SigmoidFunctor<T>>(stream, output, input);
}
template <class T>
void elu(const Stream& stream, Span<T> output, View<T> input) {
generic_op<T, ELUFunctor<T>>(stream, output, input);
}
template <class T>
void bnll(const Stream& stream, Span<T> output, View<T> input) {
generic_op<T, BNLLFunctor<T>>(stream, output, input);
}
template <class T>
void abs(const Stream& stream, Span<T> output, View<T> input) {
generic_op<T, AbsFunctor<T>>(stream, output, input);
}
template <class T>
void power(const Stream& stream, Span<T> output, View<T> input, T exp, T scale, T shift) {
CV_Assert(input.size() == output.size());
if (static_cast<float>(exp) == 1.0f) {
scale1_with_bias1(stream, output, input, scale, shift);
return;
}
generic_op<T, PowerFunctor<T>>(stream, output, input, {exp, scale, shift});
}
template <class T>
void exp(const Stream& stream, Span<T> output, View<T> input, T normScale, T normShift) {
generic_op<T, ExpFunctor<T>>(stream, output, input, {normScale, normShift});
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void relu<__half>(const Stream&, Span<__half>, View<__half>, __half);
template void clipped_relu<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
template void tanh<__half>(const Stream&, Span<__half>, View<__half>);
template void swish<__half>(const Stream&, Span<__half>, View<__half>);
template void mish<__half>(const Stream&, Span<__half>, View<__half>);
template void sigmoid<__half>(const Stream&, Span<__half>, View<__half>);
template void elu<__half>(const Stream&, Span<__half>, View<__half>);
template void abs<__half>(const Stream& stream, Span<__half> output, View<__half> input);
template void bnll<__half>(const Stream&, Span<__half>, View<__half>);
template void power<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half);
template void exp<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
#endif
template void relu<float>(const Stream&, Span<float>, View<float>, float);
template void clipped_relu<float>(const Stream&, Span<float>, View<float>, float, float);
template void tanh<float>(const Stream&, Span<float>, View<float>);
template void swish<float>(const Stream&, Span<float>, View<float>);
template void mish<float>(const Stream&, Span<float>, View<float>);
template void sigmoid<float>(const Stream&, Span<float>, View<float>);
template void elu<float>(const Stream&, Span<float>, View<float>);
template void abs<float>(const Stream& stream, Span<float> output, View<float> input);
template void bnll<float>(const Stream&, Span<float>, View<float>);
template void power<float>(const Stream&, Span<float>, View<float>, float, float, float);
template void exp<float>(const Stream&, Span<float>, View<float>, float, float);
template <class T, std::size_t N> static
void launch_vectorized_axiswise_relu(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> slope) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
CV_Assert(inner_size % N == 0);
auto kernel = raw::axiswise_relu_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input, inner_size / N, slope);
}
template <class T>
void axiswise_relu(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> slope) {
CV_Assert(input.size() == output.size());
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
launch_vectorized_axiswise_relu<T, 4>(stream, output, input, inner_size, slope);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
launch_vectorized_axiswise_relu<T, 2>(stream, output, input, inner_size, slope);
} else {
launch_vectorized_axiswise_relu<T, 1>(stream, output, input, inner_size, slope);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void axiswise_relu<__half>(const Stream&, Span<__half>, View<__half>, std::size_t, View<__half>);
#endif
template void axiswise_relu<float>(const Stream&, Span<float>, View<float>, std::size_t, View<float>);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
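The axis-wise ReLU picks its slope per channel via c = (i / inner_size) % slope.size(). A scalar C++ reference, assuming the tensor is flattened so that inner_size consecutive elements share one channel:

#include <vector>
#include <cstddef>

// Scalar reference of axiswise_relu: each channel c uses its own negative slope.
static void axiswise_relu_ref(std::vector<float>& out, const std::vector<float>& in,
                              std::size_t inner_size, const std::vector<float>& slope)
{
    for (std::size_t i = 0; i < in.size(); ++i)
    {
        std::size_t c = (i / inner_size) % slope.size();
        out[i] = in[i] > 0.f ? in[i] : in[i] * slope[c];
    }
}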

View File

@@ -0,0 +1,73 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_ARRAY_HPP
#define OPENCV_DNN_SRC_CUDA_ARRAY_HPP
#include <cuda_runtime.h>
#include "types.hpp"
#include <cstddef>
#include <type_traits>
#include <iterator>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
template <class T, std::size_t N>
struct array {
using value_type = T;
using size_type = device::size_type;
using difference_type = std::ptrdiff_t;
using reference = typename std::add_lvalue_reference<value_type>::type;
using const_reference = typename std::add_lvalue_reference<typename std::add_const<value_type>::type>::type;
using pointer = typename std::add_pointer<value_type>::type;
using const_pointer = typename std::add_pointer<typename std::add_const<value_type>::type>::type;
using iterator = pointer;
using const_iterator = const_pointer;
using reverse_iterator = std::reverse_iterator<iterator>;
using const_reverse_iterator = std::reverse_iterator<const_iterator>;
__host__ __device__ bool empty() const noexcept { return N == 0; }
__host__ __device__ size_type size() const noexcept { return N; }
__host__ __device__ iterator begin() noexcept { return ptr; }
__host__ __device__ iterator end() noexcept { return ptr + N; }
__host__ __device__ const_iterator begin() const noexcept { return ptr; }
__host__ __device__ const_iterator end() const noexcept { return ptr + N; }
__host__ __device__ const_iterator cbegin() const noexcept { return ptr; }
__host__ __device__ const_iterator cend() const noexcept { return ptr + N; }
__host__ __device__ reverse_iterator rbegin() noexcept { return ptr + N; }
__host__ __device__ reverse_iterator rend() noexcept { return ptr; }
__host__ __device__ const_reverse_iterator rbegin() const noexcept { return ptr + N; }
__host__ __device__ const_reverse_iterator rend() const noexcept { return ptr; }
__host__ __device__ const_reverse_iterator crbegin() const noexcept { return ptr + N; }
__host__ __device__ const_reverse_iterator crend() const noexcept { return ptr; }
template <class InputItr>
__host__ void assign(InputItr first, InputItr last) {
std::copy(first, last, std::begin(ptr));
}
__host__ __device__ reference operator[](int idx) { return ptr[idx]; }
__host__ __device__ const_reference operator[](int idx) const { return ptr[idx]; }
__host__ __device__ reference front() { return ptr[0]; }
__host__ __device__ const_reference front() const { return ptr[0]; }
__host__ __device__ reference back() { return ptr[N - 1]; }
__host__ __device__ const_reference back() const { return ptr[N - 1]; }
__host__ __device__ pointer data() noexcept { return ptr; }
__host__ __device__ const_pointer data() const noexcept { return ptr; }
T ptr[N];
};
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
#endif /* OPENCV_DNN_SRC_CUDA_ARRAY_HPP */
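This is a trivially copyable fixed-size array usable from both host and device code, meant to be handed to kernels by value. A rough orientation sketch of the intended usage; the kernel name and shape values are hypothetical and this is not a standalone program:

using cv::dnn::cuda4dnn::csl::device::array;

array<int, 4> shape;                            // e.g. an NCHW shape
int dims[4] = {1, 3, 224, 224};
shape.assign(std::begin(dims), std::end(dims)); // host-side fill
// some_kernel<<<grid, block>>>(shape, ...);    // hypothetical kernel taking the array by value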

View File

@@ -0,0 +1,38 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_ATOMICS_HPP
#define OPENCV_DNN_SRC_CUDA_ATOMICS_HPP
#include <cuda_runtime.h>
#include <cuda_fp16.h>
// The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
// This function was introduced in CUDA 10.
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicadd
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700 && CUDART_VERSION >= 10000)
// And half-precision floating-point operations are not supported by devices of compute capability strictly lower than 5.3
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
#elif __CUDA_ARCH__ < 530
#else
inline __device__ void atomicAdd(__half* address, __half val) {
unsigned int* address_as_ui = (unsigned int *)((char *)address - ((size_t)address & 2));
unsigned int old = *address_as_ui;
unsigned int assumed;
do {
assumed = old;
__half_raw hsum;
hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
__half tmpres = hsum + val;
hsum = __half_raw(tmpres);
old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
old = atomicCAS(address_as_ui, assumed, old);
} while (assumed != old);
}
#endif
#endif /* OPENCV_DNN_SRC_CUDA_ATOMICS_HPP */
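The __half overload above emulates a 16-bit atomic add with a 32-bit compare-and-swap: read the containing word, modify the relevant half, and retry until the CAS succeeds. A host-side C++ analog of the same retry pattern, using a 16-bit integer counter packed into a 32-bit word (purely illustrative):

#include <atomic>
#include <cstdint>

// Add to a 16-bit counter stored inside a 32-bit word, using only a 32-bit CAS.
static void add_u16_via_cas32(std::atomic<std::uint32_t>& word, bool high_half, std::uint16_t val)
{
    std::uint32_t old_word = word.load();
    std::uint32_t new_word;
    do {
        std::uint16_t half = high_half ? std::uint16_t(old_word >> 16) : std::uint16_t(old_word & 0xffff);
        half = std::uint16_t(half + val);   // the read-modify step (an fp16 add in the CUDA version)
        new_word = high_half ? ((old_word & 0x0000ffffu) | (std::uint32_t(half) << 16))
                             : ((old_word & 0xffff0000u) | half);
    } while (!word.compare_exchange_weak(old_word, new_word));  // retry if another thread raced us
}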

View File

@@ -0,0 +1,39 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_BBOX_UTILS_HPP
#define OPENCV_DNN_SRC_CUDA_BBOX_UTILS_HPP
#include "math.hpp"
#include <cuda_runtime.h>
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
struct BoundingBox
{
float xmin, ymin, xmax, ymax;
};
template <bool NORMALIZED_BBOX>
__device__ __forceinline__ float compute_bbox_size(BoundingBox bbox)
{
float width = bbox.xmax - bbox.xmin;
float height = bbox.ymax - bbox.ymin;
if (width < 0 || height < 0)
return 0.0;
if (!NORMALIZED_BBOX)
{
width += 1;
height += 1;
}
using csl::device::mul_ftz;
return mul_ftz(width, height);
}
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
#endif /* OPENCV_DNN_SRC_CUDA_BBOX_UTILS_HPP */
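A host-side reference of compute_bbox_size, keeping the two conventions visible: degenerate boxes have zero area, and non-normalized (pixel) coordinates treat the extent as inclusive, hence the +1 (a sketch, not part of the module):

struct BoxRef { float xmin, ymin, xmax, ymax; };

static float bbox_size_ref(const BoxRef& b, bool normalized)
{
    float width = b.xmax - b.xmin;
    float height = b.ymax - b.ymin;
    if (width < 0.f || height < 0.f)
        return 0.f;          // degenerate box
    if (!normalized)
    {
        width += 1.f;        // pixel coordinates: extent is inclusive
        height += 1.f;
    }
    return width * height;
}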

View File

@@ -0,0 +1,120 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "functors.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, class ActivationOp, std::size_t N>
__global__ void biasN_generic_op_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias, const typename ActivationOp::Params params) {
using vector_type = get_vector_type_t<T, N>;
auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());
ActivationOp activation_op(params);
for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
const index_type bias_idx = (i / inner_size) % bias.size();
vector_type vec;
v_load(vec, inplace_output_vPtr[i]);
for(int j = 0; j < vec.size(); j++)
vec.data[j] = activation_op(vec.data[j] + bias[bias_idx]);
v_store(inplace_output_vPtr[i], vec);
}
}
} /* namespace raw */
template <class T, class ActivationOp, std::size_t N> static
void launch_vectorized_biasN_generic_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, const typename ActivationOp::Params& params) {
CV_Assert(inplace_output.size() % inner_size == 0);
CV_Assert(is_fully_aligned<T>(inplace_output, N));
CV_Assert(inner_size % N == 0);
auto kernel = raw::biasN_generic_op_inplace_vec<T, ActivationOp, N>;
auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
launch_kernel(kernel, policy, inplace_output, inner_size / N, bias, params);
}
template <class T, class ActivationOp> static
void biasN_generic_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, const typename ActivationOp::Params& params = {}) {
if (is_fully_aligned<T>(inplace_output, 4) && inner_size % 4 == 0) {
launch_vectorized_biasN_generic_op_inplace<T, ActivationOp, 4>(stream, inplace_output, inner_size, bias, params);
} else if (is_fully_aligned<T>(inplace_output, 2) && inner_size % 2 == 0) {
launch_vectorized_biasN_generic_op_inplace<T, ActivationOp, 2>(stream, inplace_output, inner_size, bias, params);
} else {
launch_vectorized_biasN_generic_op_inplace<T, ActivationOp, 1>(stream, inplace_output, inner_size, bias, params);
}
}
template <class T>
void biasN_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T slope) {
biasN_generic_op_inplace<T, ReLUFunctor<T>>(stream, inplace_output, inner_size, bias, {slope});
}
template <class T>
void biasN_clipped_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T floor, T ceil) {
CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceil));
biasN_generic_op_inplace<T, ClippedReLUFunctor<T>>(stream, inplace_output, inner_size, bias, {floor, ceil});
}
template <class T>
void biasN_tanh_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
biasN_generic_op_inplace<T, TanHFunctor<T>>(stream, inplace_output, inner_size, bias);
}
template <class T>
void biasN_swish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
biasN_generic_op_inplace<T, SwishFunctor<T>>(stream, inplace_output, inner_size, bias);
}
template <class T>
void biasN_mish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
biasN_generic_op_inplace<T, MishFunctor<T>>(stream, inplace_output, inner_size, bias);
}
template <class T>
void biasN_sigmoid_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
biasN_generic_op_inplace<T, SigmoidFunctor<T>>(stream, inplace_output, inner_size, bias);
}
template <class T>
void biasN_power_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T power, T scale, T shift) {
biasN_generic_op_inplace<T, PowerFunctor<T>>(stream, inplace_output, inner_size, bias, {power, scale, shift});
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void biasN_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half);
template void biasN_clipped_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half, __half);
template void biasN_tanh_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
template void biasN_swish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
template void biasN_mish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
template void biasN_sigmoid_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
template void biasN_power_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half, __half, __half);
#endif
template void biasN_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, float);
template void biasN_clipped_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, float, float);
template void biasN_tanh_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
template void biasN_swish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
template void biasN_mish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
template void biasN_sigmoid_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
template void biasN_power_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, float, float, float);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
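Per element, this kernel adds the per-channel bias and then applies the activation in place. A scalar C++ reference for the ReLU case, where inner_size is the number of elements that share one bias value (e.g. H*W for an NCHW tensor):

#include <vector>
#include <cstddef>

// Scalar reference of biasN_relu_inplace: bias is indexed per channel.
static void biasN_relu_inplace_ref(std::vector<float>& output, std::size_t inner_size,
                                   const std::vector<float>& bias, float slope)
{
    for (std::size_t i = 0; i < output.size(); ++i)
    {
        float x = output[i] + bias[(i / inner_size) % bias.size()];
        output[i] = x > 0.f ? x : slope * x;
    }
}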

View File

@@ -0,0 +1,125 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "functors.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, class ActivationOp, class EltwiseOp, std::size_t N>
__global__ void biasN_generic_op_eltwise_op_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias, View<T> eltwise, const typename ActivationOp::Params act_params, const typename EltwiseOp::Params eltwise_params) {
using vector_type = get_vector_type_t<T, N>;
auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());
auto eltwise_vPtr = vector_type::get_pointer(eltwise.data());
ActivationOp activation_op(act_params);
EltwiseOp eltwise_op(eltwise_params);
for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
const index_type bias_idx = (i / inner_size) % bias.size();
vector_type output_vec, eltwise_vec;
v_load(output_vec, inplace_output_vPtr[i]);
v_load(eltwise_vec, eltwise_vPtr[i]);
for(int j = 0; j < output_vec.size(); j++)
output_vec.data[j] = eltwise_op(activation_op(output_vec.data[j] + bias[bias_idx]), eltwise_vec.data[j]);
v_store(inplace_output_vPtr[i], output_vec);
}
}
}
template <class T, class ActivationOp, class EltwiseOp, std::size_t N> static
void launch_vectorized_biasN_generic_op_eltwise_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, const typename ActivationOp::Params& act_params, const typename EltwiseOp::Params& eltwise_params) {
CV_Assert(is_fully_aligned<T>(inplace_output, N));
CV_Assert(is_fully_aligned<T>(eltwise, N));
CV_Assert(inner_size % N == 0);
auto kernel = raw::biasN_generic_op_eltwise_op_inplace_vec<T, ActivationOp, EltwiseOp, N>;
auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
launch_kernel(kernel, policy, inplace_output, inner_size / N, bias, eltwise, act_params, eltwise_params);
}
template <class T, class ActivationOp, class EltwiseOp> static
void biasN_generic_op_eltwise_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, const typename ActivationOp::Params& act_params = {}, const typename EltwiseOp::Params& eltwise_params = {}) {
CV_Assert(inplace_output.size() == eltwise.size());
if (is_fully_aligned<T>(inplace_output, 4) && is_fully_aligned<T>(eltwise, 4) && inner_size % 4 == 0) {
launch_vectorized_biasN_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 4>(stream, inplace_output, inner_size, bias, eltwise, act_params, eltwise_params);
} else if (is_fully_aligned<T>(inplace_output, 2) && is_fully_aligned<T>(eltwise, 2) && inner_size % 2 == 0) {
launch_vectorized_biasN_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 2>(stream, inplace_output, inner_size, bias, eltwise, act_params, eltwise_params);
} else {
launch_vectorized_biasN_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 1>(stream, inplace_output, inner_size, bias, eltwise, act_params, eltwise_params);
}
}
template <class T>
void biasN_relu_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T slope) {
biasN_generic_op_eltwise_op_inplace<T, ReLUFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {slope});
}
template <class T>
void biasN_clipped_relu_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T floor, T ceiling) {
CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
biasN_generic_op_eltwise_op_inplace<T, ClippedReLUFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {floor, ceiling});
}
template <class T>
void biasN_tanh_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
biasN_generic_op_eltwise_op_inplace<T, TanHFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}
template <class T>
void biasN_swish_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
biasN_generic_op_eltwise_op_inplace<T, SwishFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}
template <class T>
void biasN_mish_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
biasN_generic_op_eltwise_op_inplace<T, MishFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}
template <class T>
void biasN_sigmoid_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
biasN_generic_op_eltwise_op_inplace<T, SigmoidFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}
template <class T>
void biasN_power_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T exp, T scale, T shift) {
biasN_generic_op_eltwise_op_inplace<T, PowerFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {exp, scale, shift});
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void biasN_relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half);
template void biasN_clipped_relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half);
template void biasN_tanh_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_swish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_mish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_sigmoid_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_power_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half, __half);
#endif
template void biasN_relu_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float);
template void biasN_clipped_relu_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float, float);
template void biasN_tanh_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_swish_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_mish_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_sigmoid_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_power_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float, float, float);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,132 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "functors.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, class EltwiseOp, class ActivationOp, std::size_t N>
__global__ void biasN_eltwise_op_generic_op_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias, View<T> eltwise, const typename EltwiseOp::Params eltwise_params, const typename ActivationOp::Params act_params) {
using vector_type = get_vector_type_t<T, N>;
auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());
auto eltwise_vPtr = vector_type::get_pointer(eltwise.data());
EltwiseOp eltwise_op(eltwise_params);
ActivationOp activation_op(act_params);
for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
const index_type bias_idx = (i / inner_size) % bias.size();
vector_type output_vec, eltwise_vec;
v_load(output_vec, inplace_output_vPtr[i]);
v_load(eltwise_vec, eltwise_vPtr[i]);
for(int j = 0; j < output_vec.size(); j++)
output_vec.data[j] = activation_op(eltwise_op(output_vec.data[j] + bias[bias_idx], eltwise_vec.data[j]));
v_store(inplace_output_vPtr[i], output_vec);
}
}
}
template <class T, class EltwiseOp, class ActivationOp, std::size_t N> static
void launch_vectorized_biasN_eltwise_op_generic_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, const typename EltwiseOp::Params& eltwise_params, const typename ActivationOp::Params& act_params) {
CV_Assert(is_fully_aligned<T>(inplace_output, N));
CV_Assert(inplace_output.size() % bias.size() == 0);
CV_Assert(is_fully_aligned<T>(eltwise, N));
CV_Assert(inner_size % N == 0);
auto kernel = raw::biasN_eltwise_op_generic_op_inplace_vec<T, EltwiseOp, ActivationOp, N>;
auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
launch_kernel(kernel, policy, inplace_output, inner_size / N, bias, eltwise, eltwise_params, act_params);
}
template <class T, class EltwiseOp, class ActivationOp> static
void biasN_eltwise_op_generic_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, const typename EltwiseOp::Params& eltwise_params = {}, const typename ActivationOp::Params& act_params = {}) {
CV_Assert(inplace_output.size() == eltwise.size());
if (is_fully_aligned<T>(inplace_output, 4) && is_fully_aligned<T>(eltwise, 4) && inner_size % 4 == 0) {
launch_vectorized_biasN_eltwise_op_generic_op_inplace<T, EltwiseOp, ActivationOp, 4>(stream, inplace_output, inner_size, bias, eltwise, eltwise_params, act_params);
} else if (is_fully_aligned<T>(inplace_output, 2) && is_fully_aligned<T>(eltwise, 2) && inner_size % 2 == 0) {
launch_vectorized_biasN_eltwise_op_generic_op_inplace<T, EltwiseOp, ActivationOp, 2>(stream, inplace_output, inner_size, bias, eltwise, eltwise_params, act_params);
} else {
launch_vectorized_biasN_eltwise_op_generic_op_inplace<T, EltwiseOp, ActivationOp, 1>(stream, inplace_output, inner_size, bias, eltwise, eltwise_params, act_params);
}
}
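/* Note: this file mirrors the previous one but fuses the operations in the opposite order;
 * here every element becomes act(elt(x + bias, e)) instead of elt(act(x + bias), e).
 * A one-line host-side sketch of the per-element computation (illustrative only):
 *
 *   inout[i] = act(elt(inout[i] + bias[(i / inner_size) % bias.size()], eltwise[i]));
 */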
template <class T>
void biasN_eltwise_sum_2_identity_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, IdentityFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}
template <class T>
void biasN_eltwise_sum_2_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T slope) {
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, ReLUFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {}, {slope});
}
template <class T>
void biasN_eltwise_sum_2_clipped_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T floor, T ceiling) {
CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, ClippedReLUFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {}, {floor, ceiling});
}
template <class T>
void biasN_eltwise_sum_2_tanh_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, TanHFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}
template <class T>
void biasN_eltwise_sum_2_swish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, SwishFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}
template <class T>
void biasN_eltwise_sum_2_mish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, MishFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}
template <class T>
void biasN_eltwise_sum_2_sigmoid_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, SigmoidFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}
template <class T>
void biasN_eltwise_sum_2_power_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T exp, T scale, T shift) {
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, PowerFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {}, {exp, scale, shift});
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void biasN_eltwise_sum_2_identity_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_eltwise_sum_2_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half);
template void biasN_eltwise_sum_2_clipped_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half);
template void biasN_eltwise_sum_2_tanh_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_eltwise_sum_2_swish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_eltwise_sum_2_mish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_eltwise_sum_2_sigmoid_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_eltwise_sum_2_power_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half, __half);
#endif
template void biasN_eltwise_sum_2_identity_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_eltwise_sum_2_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float);
template void biasN_eltwise_sum_2_clipped_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float, float);
template void biasN_eltwise_sum_2_tanh_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_eltwise_sum_2_swish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_eltwise_sum_2_mish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_eltwise_sum_2_sigmoid_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_eltwise_sum_2_power_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float, float, float);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,71 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_BLOCK_STRIDE_RANGE_HPP
#define OPENCV_DNN_SRC_CUDA_BLOCK_STRIDE_RANGE_HPP
#include "types.hpp"
#include "index_helpers.hpp"
#include <cuda_runtime.h>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
template <int dim, int BLOCK_SIZE = 0, class index_type = device::index_type, class size_type = device::size_type>
class block_stride_range_generic {
public:
__device__ block_stride_range_generic(index_type to_) : from(0), to(to_) { }
__device__ block_stride_range_generic(index_type from_, index_type to_) : from(from_), to(to_) { }
class iterator
{
public:
__device__ iterator(index_type pos_) : pos(pos_) {}
/* these iterators return the index when dereferenced; this allows us to loop
* through the indices using a range-based for loop
*/
__device__ index_type operator*() const { return pos; }
__device__ iterator& operator++() {
const index_type block_size = BLOCK_SIZE == 0 ? getBlockDim<dim>() : BLOCK_SIZE;
pos += block_size;
return *this;
}
__device__ bool operator!=(const iterator& other) const {
/* NOTE HACK
* 'pos' can move in large steps (see operator++)
* expansion of the range-based for loop uses != as the loop condition
* => operator!= must return false if 'pos' crosses the end
*/
return pos < other.pos;
}
private:
index_type pos;
};
__device__ iterator begin() const {
return iterator(from + getThreadIdx<dim>());
}
__device__ iterator end() const {
return iterator(to);
}
private:
index_type from, to;
};
using block_stride_range_x = block_stride_range_generic<0>;
using block_stride_range_y = block_stride_range_generic<1>;
using block_stride_range_z = block_stride_range_generic<2>;
template <size_type BLOCK_SIZE = 0>
using block_stride_range = block_stride_range_generic<0, BLOCK_SIZE>;
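/* Usage sketch (illustrative only; `data` and `out` are hypothetical device buffers and
 * `out` is assumed to be zero-initialized): every thread of a block visits the indices
 * threadIdx.x, threadIdx.x + blockDim.x, ... up to but excluding `n`.
 *
 *   __global__ void block_row_sum(const float* data, float* out, size_type n)
 *   {
 *       float partial = 0.f;
 *       for (auto i : block_stride_range<>(n))    // strides by blockDim.x
 *           partial += data[blockIdx.x * n + i];
 *       atomicAdd(&out[blockIdx.x], partial);     // combine the per-thread partial sums
 *   }
 */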
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
#endif /* OPENCV_DNN_SRC_CUDA_BLOCK_STRIDE_RANGE_HPP */

View File

@@ -0,0 +1,277 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "array.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "kernel_dispatcher.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/kernels/fill_copy.hpp"
#include <cstddef>
#include <vector>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t N>
__global__ void concat_vec(
Span<T> output, size_type output_axis_size, index_type output_axis_offset,
View<T> input, size_type input_axis_size, size_type concat_size)
{
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
/* we need to copy all the elements of input to some location in the output
* we copy blocks of size `total_concat_size` to some location in the output
*/
const auto total_concat_size = concat_size * input_axis_size;
for (auto in_idx : grid_stride_range(input.size() / vector_type::size())) {
const index_type idx = in_idx * vector_type::size();
const index_type concat_num = idx / total_concat_size;
const index_type concat_index = idx % total_concat_size;
const index_type top_index = concat_index +
(concat_num * output_axis_size + output_axis_offset) * concat_size;
const auto out_idx = top_index / vector_type::size();
vector_type vec;
v_load(vec, input_vPtr[in_idx]);
v_store(output_vPtr[out_idx], vec);
}
}
template <class T, std::size_t Rank>
__global__ void concat_with_offsets(
Span<T> output, array<size_type, Rank> out_strides, array<index_type, Rank> out_offset,
View<T> input, array<size_type, Rank> in_strides)
{
for (auto i : grid_stride_range(input.size())) {
index_type in_index = i / in_strides[0];
index_type out_index = out_offset[0] + in_index;
index_type oidx = out_index * out_strides[0];
for (int j = 1; j < Rank; j++) {
in_index = (i % in_strides[j - 1]) / in_strides[j];
out_index = out_offset[j] + in_index;
oidx += out_index * out_strides[j];
}
output[oidx] = input[i];
}
}
}
template <class T, std::size_t N> static
void launch_vectorized_concat(const Stream& stream,
Span<T> output, size_type output_axis_size, index_type output_axis_offset,
View<T> input, size_type input_axis_size, size_type concat_size)
{
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
/* more assertions are required to fully check for vectorization possibility; check concat() */
auto kernel = raw::concat_vec<T, N>;
auto policy = make_policy(kernel, input.size() / N, 0, stream);
launch_kernel(kernel, policy, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
}
template <class T>
void concat(
const Stream& stream,
TensorSpan<T> output, std::size_t output_axis_offset,
TensorView<T> input, std::size_t axis)
{
CV_Assert(output.rank() == input.rank());
CV_Assert(output_axis_offset < output.get_axis_size(axis));
/* if axes preceding the concat axis are all singleton, the concat blocks are contiguous
* in the output and we can copy each block directly
*/
if (output.size_range(0, axis) == 1)
{
auto stride = output.size_range(axis + 1, output.rank());
auto sliced_output = Span<T>(output.get() + output_axis_offset * stride, input.size());
kernels::copy<T>(stream, sliced_output, input);
return;
}
/* let's call the axis of interest the channel axis for the purposes of the following discussion
* even though it can be any axis
*
* for each batch item:
* we move all the channels of a batch item from the input (where, for that item, they are
* contiguous) to their corresponding contiguous place in the output
*
* for a valid vector operation:
* - the size of each copy block must be aligned
* - input must be aligned
* - all the destination locations in the output must be aligned
*/
std::size_t concat_size = output.size_range(axis + 1, output.rank());
std::size_t input_axis_size = input.get_axis_size(axis);
std::size_t output_axis_size = output.get_axis_size(axis);
std::size_t copy_block_size = concat_size * input_axis_size;
std::size_t copy_block_stride = concat_size * output_axis_size;
std::size_t starting_offset = output_axis_offset * concat_size;
/* in a nutshell, all this concat operation does is copy several blocks of size `copy_block_size`
* to the output starting from `starting_offset` with blocks in the output strided by `copy_block_stride`
*/
bool is_aligned_4 = copy_block_size % 4 == 0 && copy_block_stride % 4 == 0 && starting_offset % 4 == 0;
bool is_aligned_2 = copy_block_size % 2 == 0 && copy_block_stride % 2 == 0 && starting_offset % 2 == 0;
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && is_aligned_4) {
launch_vectorized_concat<T, 4>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && is_aligned_2) {
launch_vectorized_concat<T, 2>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
} else {
launch_vectorized_concat<T, 1>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
}
}
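/* Host-side reference of the strided block copy described above (a minimal sketch, not part
 * of the kernel; `num_blocks` corresponds to input.size() / copy_block_size):
 *
 *   #include <algorithm>
 *   #include <cstddef>
 *   void concat_copy_ref(float* out, const float* in, std::size_t num_blocks,
 *                        std::size_t copy_block_size, std::size_t copy_block_stride,
 *                        std::size_t starting_offset)
 *   {
 *       for (std::size_t b = 0; b < num_blocks; b++)
 *           std::copy_n(in + b * copy_block_size, copy_block_size,
 *                       out + starting_offset + b * copy_block_stride);
 *   }
 */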
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void concat<__half>(const Stream&, TensorSpan<__half>, std::size_t, TensorView<__half>, std::size_t);
#endif
template void concat<float>(const Stream&, TensorSpan<float>, std::size_t, TensorView<float>, std::size_t);
template <class T, std::size_t Rank> static
void launch_concat_with_offsets(
const Stream& stream,
Span<T> output, const std::vector<std::size_t>& outStride, const std::vector<std::size_t>& outOffset,
View<T> input, const std::vector<std::size_t>& inStride)
{
CV_Assert(outStride.size() == Rank);
CV_Assert(outOffset.size() == Rank);
CV_Assert(inStride.size() == Rank);
array<size_type, Rank> outStride_k, inStride_k;
outStride_k.assign(std::begin(outStride), std::end(outStride));
inStride_k.assign(std::begin(inStride), std::end(inStride));
array<index_type, Rank> outOffset_k;
outOffset_k.assign(std::begin(outOffset), std::end(outOffset));
auto kernel = raw::concat_with_offsets<T, Rank>;
auto policy = make_policy(kernel, input.size(), 0, stream);
launch_kernel(kernel, policy, output, outStride_k, outOffset_k, input, inStride_k);
}
GENERATE_KERNEL_DISPATCHER(concat_with_offsets_dispatcher, launch_concat_with_offsets);
template <class T>
void concat_with_offsets(
const Stream& stream,
TensorSpan<T> output, TensorView<T> input,
std::vector<std::size_t> offsets)
{
CV_Assert(output.rank() == input.rank());
CV_Assert(output.rank() == offsets.size());
/* squeezable axes at the beginning of both tensors can be eliminated
*
* Reasoning:
* ----------
* Suppose an item's indices in the input tensor is [i1, i2, ...]. The indices in the output
* tensor will be [i1 + off1, i2 + off2, ...]. The concat operation essentially copies items
* from the input tensor to new locations in the output tensor.
*
* If the size of the first axis of the input and output tensor is unity, the input and output
* indices for all the elements will be of the form [0, i2, ...] and [0, i2 + off2, ...]
* respectively. The first index does not contribute to the element's address calculation and
* hence does nothing apart from eating up a few cycles.
*/
while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
CV_Assert(offsets[0] == 0);
input.squeeze(0);
output.squeeze(0);
offsets.erase(std::begin(offsets));
CV_Assert(output.rank() == input.rank());
CV_Assert(output.rank() == offsets.size());
}
auto inShape = input.shape_as_vector();
auto outShape = output.shape_as_vector();
/* contiguous axes that undergo full copy can be combined into one axis
*
* Reasoning:
* ----------
* Suppose an item's indices in the input tensor is [i1, i2, i3, ...]. Let the first two axes not undergo any
* concatenation. The indices in the output tensor will be [i1, i2, i3 + off3, ...].
*
* Each axis in the contiguous axes sequence will add an offset of iN * strideN. In the above example,
* the two axes add a total offset of `i1 * stride1 + i2 * stride2`. We can merge the two axes into one axis with
* a size of `size1 * size2`. The new offset added will be `i12 * stride2` as the kernel iterates through `i12`.
* Note that `i12` is actually `(i1 * size2 + i2)` in the original tensor.
*/
for (int i = 0; i < inShape.size(); i++) {
/* check if axis `i` requires any slicing */
if (offsets[i] == 0 && inShape[i] == outShape[i]) {
/* loop invariant: `i` is the first axis in the contiguous unsliced axis sequence */
int j = i + 1; /* `j` is the axis which we will attempt to merge */
while (j < inShape.size() && offsets[j] == 0 && inShape[j] == outShape[j]) {
/* `j` axis is also copied fully; merge `i` and `j` */
auto new_size = inShape[i] * inShape[j];
inShape[i] = new_size;
outShape[i] = new_size;
offsets[i] = 0; /* redundant */
/* delete axis `j` */
inShape.erase(std::begin(inShape) + j);
outShape.erase(std::begin(outShape) + j);
offsets.erase(std::begin(offsets) + j);
/* optimizations should not break the invariants */
CV_Assert(inShape.size() == outShape.size());
CV_Assert(inShape.size() == offsets.size());
CV_Assert(inShape[i] == outShape[i]);
CV_Assert(offsets[i] == 0);
}
}
}
auto rank = inShape.size();
std::vector<std::size_t> inStride(rank), outStride(rank);
inStride.back() = 1;
outStride.back() = 1;
/* garbage, ..., garbage, 1 */
std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
/* dim[0], dim[1], ..., dim[-1], 1 */
std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<int>());
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<int>());
/* stride[0], stride[1], ..., stride[-2], 1 */
CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
concat_with_offsets_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, offsets, input, inStride);
}
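/* Worked example of the stride computation above: for an input shape of [2, 3, 4], the copy
 * yields [3, 4, 1] and the reverse partial_sum with multiplication yields [12, 4, 1], i.e.
 * stride[i] is the product of the dimensions following axis i.
 */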
template void concat_with_offsets(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
template void concat_with_offsets(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,171 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "math.hpp"
#include "types.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "memory.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include <opencv2/core.hpp>
#include <cuda_runtime.h>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t CHANNELS_PER_ITER>
__global__ void crop_and_resize(
Span<T> output, size_type out_height, size_type out_width,
View<T> input, size_type in_height, size_type in_width,
View<T> boxes,
size_type num_channels)
{
// input [1, num_channels, in_height, in_width]
// output [boxes, num_channels, out_height, out_width]
const auto in_image_size = in_height * in_width;
const auto out_image_size = out_height * out_width;
const auto out_box_size = num_channels * out_image_size;
/* we have to compute the output value for every combination of (box, c, y, x) in the output
*
* the computation involving (y, x) are identical for all non-spatial dimensions
* the computation and memory requests involving the box are identical for remaining three axes
*
* we process multiple channels every iteration to reuse the identical computation
* and memory requests involved with the box and spatial dimensions
*/
/*
* if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need
* (num_channels / CHANNELS_PER_ITER) iterations per (box, x, y)
*/
auto num_channel_iters_per_box_xy = num_channels / CHANNELS_PER_ITER;
/* we need `num_channel_iters_per_box_xy` iterations per (box, x, y) and there are
* `num_boxes` boxes and `out_image_size` combinations of (x, y)
*/
auto num_boxes = boxes.size() / 7; /* 7 values per box */
auto iters_per_box = num_channel_iters_per_box_xy * out_image_size;
auto iters_required = num_boxes * iters_per_box;
for (auto iter : grid_stride_range(iters_required)) {
const index_type box_no = iter / iters_per_box;
const index_type c_start = ((iter % iters_per_box) / out_image_size) * CHANNELS_PER_ITER;
/* note here that consecutive `iter` values will often have consecutive `x` values
* => stores into output will be coalesced across threads
*/
const index_type y = (iter % out_image_size) / out_width;
const index_type x = iter % out_width;
const index_type box_offset = box_no * 7;
const auto left = boxes[box_offset + 3],
top = boxes[box_offset + 4],
right = boxes[box_offset + 5],
bottom = boxes[box_offset + 6];
const auto box_width = right - left;
const auto box_height = bottom - top;
const auto o2i_fy = static_cast<T>(in_height - 1) / static_cast<T>(out_height - 1);
const auto o2i_fx = static_cast<T>(in_width - 1) / static_cast<T>(out_width - 1);
const auto height_scale = box_height * o2i_fy;
const auto width_scale = box_width * o2i_fx;
const auto in_y = top * static_cast<T>(in_height - 1) + static_cast<T>(y) * height_scale;
const auto in_x = left * static_cast<T>(in_width - 1) + static_cast<T>(x) * width_scale;
const auto in_y0 = static_cast<index_type>(in_y);
const auto in_x0 = static_cast<index_type>(in_x);
using device::min;
const auto in_x1 = min<index_type>(in_x0 + 1, in_width - 1);
const auto in_y1 = min<index_type>(in_y0 + 1, in_height - 1);
index_type in_offset_r0 = c_start * in_image_size + in_y0 * in_width;
index_type in_offset_r1 = c_start * in_image_size + in_y1 * in_width;
index_type out_idx = box_no * out_box_size + c_start * out_image_size + y * out_width + x;
#pragma unroll 1 /* disable unrolling */
for (int i = 0; i < CHANNELS_PER_ITER; i++) {
auto v_00 = load_ldg(input[in_offset_r0 + in_x0]),
v_01 = load_ldg(input[in_offset_r0 + in_x1]),
v_10 = load_ldg(input[in_offset_r1 + in_x0]),
v_11 = load_ldg(input[in_offset_r1 + in_x1]);
output[out_idx] =
v_00 +
T(in_y - T(in_y0)) * T(v_10 - v_00) +
T(in_x - T(in_x0)) * T(v_01 - v_00) +
T(in_y - T(in_y0)) * T(in_x - T(in_x0)) * T(v_11 - v_01 - v_10 + v_00);
in_offset_r0 += in_image_size;
in_offset_r1 += in_image_size;
out_idx += out_image_size;
}
}
}
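/* For reference, the expression above is ordinary bilinear interpolation: with
 * dy = in_y - in_y0 and dx = in_x - in_x0, it is algebraically equal to
 * (1 - dy) * (1 - dx) * v_00 + (1 - dy) * dx * v_01 + dy * (1 - dx) * v_10 + dy * dx * v_11.
 */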
}
template <class T, std::size_t CHANNELS_PER_ITER> static
void launch_multichannel_crop_and_resize(const Stream& stream,
Span<T> output, size_type out_height, size_type out_width,
View<T> input, size_type in_height, size_type in_width,
View<T> boxes, size_type num_channels)
{
auto kernel = raw::crop_and_resize<T, CHANNELS_PER_ITER>;
auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
}
template <class T>
void crop_and_resize(const Stream& stream, TensorSpan<T> output, TensorView<T> input, View<T> boxes) {
CV_Assert(input.get_axis_size(0) == 1); /* batch not supported */
CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
auto out_height = output.get_axis_size(-2);
auto out_width = output.get_axis_size(-1);
auto in_height = input.get_axis_size(-2);
auto in_width = input.get_axis_size(-1);
auto num_channels = input.get_axis_size(1);
if (num_channels % 64 == 0) {
launch_multichannel_crop_and_resize<T, 64>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
} else if (num_channels % 32 == 0) {
launch_multichannel_crop_and_resize<T, 32>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
} else if (num_channels % 16 == 0) {
launch_multichannel_crop_and_resize<T, 16>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
} else if (num_channels % 8 == 0) {
launch_multichannel_crop_and_resize<T, 8>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
} else if (num_channels % 4 == 0) {
launch_multichannel_crop_and_resize<T, 4>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
} else if (num_channels % 2 == 0) {
launch_multichannel_crop_and_resize<T, 2>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
} else {
launch_multichannel_crop_and_resize<T, 1>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void crop_and_resize<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, View<__half> boxes);
#endif
template void crop_and_resize<float>(const Stream&, TensorSpan<float>, TensorView<float>, View<float> boxes);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,897 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "math.hpp"
#include "bbox_utils.hpp"
#include "grid_stride_range.hpp"
#include "block_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"
#include "memory.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, bool SHARE_LOCATION, bool VARIANCE_ENCODED_IN_TARGET, bool CORNER_TRUE_CENTER_FALSE, bool CLIP_BBOX>
__global__ void decode_bbox(Span<T> decoded_bboxes, View<T> locations, View<T> priors,
bool transpose_location, bool normalized_bbox,
size_type num_loc_classes, index_type background_class_id,
float clip_width, float clip_height)
{
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
// locations: [batch_size, num_priors, num_loc_classes, 4]
// priors: [1, C, num_priors, 4]
// C = 2 if !VARIANCE_ENCODED_IN_TARGET; otherwise, 1
/* 4 bbox values + 4 variance values per prior */
constexpr int PRIOR_BOX_SIZE = VARIANCE_ENCODED_IN_TARGET ? 4 : 8;
const size_type num_priors = priors.size() / PRIOR_BOX_SIZE;
using vector_type = get_vector_type_t<T, 4>;
auto locations_vPtr = vector_type::get_pointer(locations.data());
auto priors_vPtr = vector_type::get_pointer(priors.data());
auto decoded_bboxes_vPtr = vector_type::get_pointer(decoded_bboxes.data());
const auto boxes_per_batch = num_priors * num_loc_classes;
for (auto idx : grid_stride_range(decoded_bboxes.size() / 4))
{
index_type p;
index_type c;
if (SHARE_LOCATION)
{
// locations are shared across all classes => num_loc_classes = 1
p = idx % boxes_per_batch;
c = 0;
}
else
{
p = (idx % boxes_per_batch) / num_loc_classes;
c = idx % num_loc_classes;
}
if (!SHARE_LOCATION && c == background_class_id)
continue;
BoundingBox bbox;
{
vector_type location;
v_load(location, locations_vPtr[idx]);
if (transpose_location)
{
bbox.ymin = location.data[0];
bbox.xmin = location.data[1];
bbox.ymax = location.data[2];
bbox.xmax = location.data[3];
}
else
{
bbox.xmin = location.data[0];
bbox.ymin = location.data[1];
bbox.xmax = location.data[2];
bbox.ymax = location.data[3];
}
}
if (!VARIANCE_ENCODED_IN_TARGET)
{
vector_type prior_variance;
v_load_ldg(prior_variance, priors_vPtr[num_priors + p]);
bbox.xmin *= static_cast<float>(prior_variance.data[0]);
bbox.ymin *= static_cast<float>(prior_variance.data[1]);
bbox.xmax *= static_cast<float>(prior_variance.data[2]);
bbox.ymax *= static_cast<float>(prior_variance.data[3]);
}
BoundingBox prior;
{
vector_type prior_box;
v_load_ldg(prior_box, priors_vPtr[p]);
prior.xmin = prior_box.data[0];
prior.ymin = prior_box.data[1];
prior.xmax = prior_box.data[2];
prior.ymax = prior_box.data[3];
}
BoundingBox decoded_bbox;
if (CORNER_TRUE_CENTER_FALSE)
{
decoded_bbox.xmin = prior.xmin + bbox.xmin;
decoded_bbox.ymin = prior.ymin + bbox.ymin;
decoded_bbox.xmax = prior.xmax + bbox.xmax;
decoded_bbox.ymax = prior.ymax + bbox.ymax;
}
else
{
auto prior_width = prior.xmax - prior.xmin;
auto prior_height = prior.ymax - prior.ymin;
if (!normalized_bbox)
{
prior_width += 1;
prior_height += 1;
}
auto prior_center_x = prior.xmin + prior_width * 0.5f;
auto prior_center_y = prior.ymin + prior_height * 0.5f;
auto decode_bbox_center_x = bbox.xmin * prior_width + prior_center_x;
auto decode_bbox_center_y = bbox.ymin * prior_height + prior_center_y;
using device::exp;
float decode_bbox_width = exp(bbox.xmax) * prior_width;
float decode_bbox_height = exp(bbox.ymax) * prior_height;
decoded_bbox.xmin = decode_bbox_center_x - decode_bbox_width * 0.5f;
decoded_bbox.ymin = decode_bbox_center_y - decode_bbox_height * 0.5f;
decoded_bbox.xmax = decode_bbox_center_x + decode_bbox_width * 0.5f;
decoded_bbox.ymax = decode_bbox_center_y + decode_bbox_height * 0.5f;
}
vector_type decoded_bbox_vec;
if (CLIP_BBOX)
{
decoded_bbox_vec.data[0] = clamp(decoded_bbox.xmin, 0.0f, clip_width);
decoded_bbox_vec.data[1] = clamp(decoded_bbox.ymin, 0.0f, clip_height);
decoded_bbox_vec.data[2] = clamp(decoded_bbox.xmax, 0.0f, clip_width);
decoded_bbox_vec.data[3] = clamp(decoded_bbox.ymax, 0.0f, clip_height);
}
else
{
decoded_bbox_vec.data[0] = decoded_bbox.xmin;
decoded_bbox_vec.data[1] = decoded_bbox.ymin;
decoded_bbox_vec.data[2] = decoded_bbox.xmax;
decoded_bbox_vec.data[3] = decoded_bbox.ymax;
}
v_store(decoded_bboxes_vPtr[idx], decoded_bbox_vec);
}
}
template <class T, int BINS, int BLOCK_SIZE>
__launch_bounds__(BLOCK_SIZE)
__global__ void findTopK(Span<int> indices_, Span<int> count_, View<T> scores_, float threshold, size_type classwise_topK, size_type num_classes, size_type num_priors, index_type background_class_id)
{
/* We need to sort boxes based on their confidence scores. The confidence scores fall in
* the range [0.0, 1.0]. We break the range into bins and perform count sort. This is an
* approximate algorithm.
*
* Each block handles a particular class of a particular batch item.
*/
const auto c = blockIdx.x;
const auto b = blockIdx.y;
if (c == background_class_id)
return;
// indices: [batch_size, num_classes, classwise_topK]
// count: [batch_size, num_classes]
// scores: [batch_size, num_classes, num_priors]
auto count = count_.data() + b * num_classes + c;
auto scores = scores_.data() + (b * num_classes + c) * num_priors;
auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
/* We do not require a large number of bins to find the top K confidence scores. We will use
* a reasonable number of bins which will fit in the shared memory.
*
* Note that smaller scores will have a smaller index, i.e. the `bins` are ordered in
* ascending order.
*/
__shared__ int bins[BINS];
#pragma unroll
for (int unroll = 0; unroll < BINS / BLOCK_SIZE; unroll++)
bins[unroll * BLOCK_SIZE + threadIdx.x] = 0;
__syncthreads();
for (auto i : block_stride_range<BLOCK_SIZE>(num_priors))
{
const float confidence = load_ldg(scores[i]);
if (confidence > threshold)
{
using device::fast_divide_ftz;
auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);
using device::clamp;
int bin_index = conf_scaled * BINS;
/* We store counts of confidence scores in the bins. Our ultimate goal is to store the indices
* of the `classwise_topK` confidence values in the `indices` array.
*
* We use a little trick to parallelize the process of filling up the `indices` array.
* We want every thread in the block to participate in the process. To do so, we want the
* bins array to be shifted by one place to the left. We will be computing the suffix sum
* of the bins array later. Details and reasons for doing so will be explained later.
*/
bin_index = clamp<int>(bin_index, 0, BINS - 1) - 1; // shift left by one
if (bin_index >= 0)
atomicAdd(&bins[bin_index], 1);
}
}
__syncthreads();
constexpr int WARP_SIZE = 32; /* must be equal to warpSize */
// FORWARD_COMPATIBILITY_TAG: WARP_SIZE_DEPENDENT_CODE
if (threadIdx.x < WARP_SIZE)
{
/* We can compute suffix sum of an array in groups of N numbers.
* Let N be 4 for this example.
*
* 1) Last 4 numbers
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
* group suffix sum: 42 33 23 12
*
* 2) Middle 4 numbers
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
* group suffix sum: | 26 21 15 8 |
*
* We add `42` (first element in the previous group) to each element to get:
*
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
* | 68 63 57 50 | 42 33 23 12
* 3) First 4 numbers
*
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
* group suffix sum: 10 9 7 4 |
*
* We add `68` (first element in the previous group) to each element to get:
*
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
* group suffix sum: 78 77 75 72 | 68 63 57 50 | 42 33 23 12
*
* What we are left with now is the suffix sum of the entire array.
*
* We use the aforementioned logic in the code below but work in groups of `warpSize`.
*/
/* We calculate suffix sums WARP_SIZE elements at a time starting from the right end.
* Hence, we will need BINS / WARP_SIZE number of iterations.
*
* Each iteration uses shuffle instructions to exchange data between threads. Shuffle
* instructions cannot be used in warp-divergent code. If the bins are a multiple of
* the warpSize, all the threads in the warp will participate.
*/
static_assert(BINS % WARP_SIZE == 0, "number of bins must be a multiple of warp size");
const int thread_id = threadIdx.x;
const int inverse_lane_id = WARP_SIZE - thread_id - 1;
int previous_group_first_element = 0;
for (int iter = BINS / WARP_SIZE - 1; iter >= 0; iter--)
{
const index_type idx = iter * WARP_SIZE + thread_id;
auto value = bins[idx];
for (int i = 1; i < WARP_SIZE; i *= 2)
{
auto n = __shfl_down_sync(0xFFFFFFFF, value, i);
if (inverse_lane_id >= i)
value += n;
}
value += previous_group_first_element;
bins[idx] = value;
previous_group_first_element = __shfl_sync(0xFFFFFFFF, value, 0);
}
}
if (threadIdx.x == 0)
*count = 0;
__syncthreads();
for (auto i : block_stride_range<BLOCK_SIZE>(num_priors))
{
const float confidence = load_ldg(scores[i]);
if (confidence > threshold)
{
using device::fast_divide_ftz;
auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);
int bin_index = conf_scaled * BINS;
bin_index = clamp<int>(bin_index, 0, BINS - 1);
/* This bounding box is eligible for selection unless its position falls outside the
* `classwise_topK`. To determine that, we compute the location where it would be stored.
*
* Suppose we had just 4 bins and say the following were the counts:
* BIN0 2
* BIN1 1
* BIN2 3
* BIN3 0 (last bin is always zero as we shift left by one while populating the bins)
*
* We will try our best to store the boxes in a sorted order in the `indices` array.
* This requires that the boxes in later bins (higher confidence scores) must be
* stored earlier.
*
* We compute the suffix sum of the array. This gives us:
* BIN0 6
* BIN1 4
* BIN2 3
* BIN3 0
*
* The bins now give us the location in the `indices` array from which the indices of the
* scores corresponding to that bin would be stored. We atomically increment the bin count
* every time we store a box corresponding to that bin. Therefore, the value in the bins
* gives the index in the `indices` array where the next box corresponding to that bin must
* be put.
*/
const index_type idx = atomicAdd(&bins[bin_index], 1);
if (idx < classwise_topK)
{
indices[idx] = i;
atomicAdd(&count[0], 1);
}
}
}
}
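/* Host-side sketch of the bin-based approximate top-K selection used above (illustrative
 * only; single-threaded and for a single class, with scores in [0, 1]):
 *
 *   #include <algorithm>
 *   #include <vector>
 *   std::vector<int> approx_top_k(const std::vector<float>& scores, float threshold, int K)
 *   {
 *       constexpr int BINS = 2048;
 *       auto bin_of = [&](float s) {
 *           int b = static_cast<int>((s - threshold) / (1.f - threshold) * BINS);
 *           return std::min(std::max(b, 0), BINS - 1);
 *       };
 *       std::vector<int> bins(BINS, 0);
 *       for (float s : scores)                   // count, shifted left by one bin
 *           if (s > threshold && bin_of(s) >= 1) bins[bin_of(s) - 1]++;
 *       for (int i = BINS - 2; i >= 0; i--)      // suffix sum
 *           bins[i] += bins[i + 1];
 *       std::vector<int> indices(std::max(K, 0), -1);
 *       int count = 0;
 *       for (int i = 0; i < static_cast<int>(scores.size()); i++)
 *           if (scores[i] > threshold) {
 *               const int pos = bins[bin_of(scores[i])]++;  // next free slot for this bin
 *               if (pos < K) { indices[pos] = i; count++; }
 *           }
 *       indices.resize(count);                   // approximately sorted by descending score
 *       return indices;
 *   }
 */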
template <class T>
__global__ void box_collect(Span<T> collected_bboxes_, View<T> decoded_bboxes_, View<int> indices_, View<int> count_, bool share_location, size_type num_priors, size_type num_classes, size_type classwise_topK, index_type background_class_id)
{
const index_type c = blockIdx.x;
if (c == background_class_id)
return;
const index_type b = blockIdx.y;
// collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
// indices: [batch_size, num_classes, classwise_topK]
// count: [batch_size, num_classes]
const auto num_loc_classes = share_location ? 1 : num_classes;
auto collected_bboxes = collected_bboxes_.data() + (b * num_classes + c) * classwise_topK * 4;
auto decoded_bboxes = decoded_bboxes_.data() + b * num_priors * num_loc_classes * 4;
auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
auto count = count_.data() + b * num_classes + c;
const auto boxes = load_ldg(&count[0]);
if (boxes == 0)
return;
using vector_type = get_vector_type_t<T, 4>;
auto decoded_bboxes_vPtr = vector_type::get_pointer(decoded_bboxes);
auto collected_bboxes_vPtr = vector_type::get_pointer(collected_bboxes);
for (auto i : block_stride_range<>(boxes))
{
const auto prior_id = indices[i];
const index_type idx = share_location ? prior_id : (prior_id * num_classes + c);
vector_type box;
v_load(box, decoded_bboxes_vPtr[idx]);
v_store(collected_bboxes_vPtr[i], box);
}
}
template <class T, bool NORMALIZED_BBOX>
__global__ void blockwise_class_nms(Span<int> indices_, Span<int> count_, View<T> collected_bboxes_, size_type num_classes, size_type classwise_topK, index_type background_class_id, float nms_threshold)
{
const index_type b = blockIdx.x / num_classes;
const index_type c = blockIdx.x % num_classes;
if (c == background_class_id)
return;
// indices: [batch_size, num_classes, classwise_topK]
// count: [batch_size, num_classes]
// collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
auto count = count_.data() + b * num_classes + c;
auto collected_bboxes = collected_bboxes_.data() + (b * num_classes + c) * classwise_topK * 4;
const auto boxes = count[0];
if (boxes == 0)
return;
using vector_type = get_vector_type_t<T, 4>;
auto collected_bboxes_vPtr = vector_type::get_pointer(collected_bboxes);
for (int i = 0; i < boxes; i++)
{
auto prior_id = indices[i];
if (prior_id != -1)
{
BoundingBox bbox1;
{
vector_type box;
v_load(box, collected_bboxes_vPtr[i]);
bbox1.xmin = box.data[0];
bbox1.ymin = box.data[1];
bbox1.xmax = box.data[2];
bbox1.ymax = box.data[3];
}
for (auto j : block_stride_range<>(i + 1, boxes))
{
prior_id = indices[j];
if (prior_id == -1)
continue;
BoundingBox bbox2;
{
vector_type box;
v_load_ldg(box, collected_bboxes_vPtr[j]);
bbox2.xmin = box.data[0];
bbox2.ymin = box.data[1];
bbox2.xmax = box.data[2];
bbox2.ymax = box.data[3];
}
using device::min;
using device::max;
BoundingBox intersect_bbox;
intersect_bbox.xmin = max(bbox1.xmin, bbox2.xmin);
intersect_bbox.ymin = max(bbox1.ymin, bbox2.ymin);
intersect_bbox.xmax = min(bbox1.xmax, bbox2.xmax);
intersect_bbox.ymax = min(bbox1.ymax, bbox2.ymax);
float intersect_size = compute_bbox_size<NORMALIZED_BBOX>(intersect_bbox);
float bbox1_size = compute_bbox_size<NORMALIZED_BBOX>(bbox1);
float bbox2_size = compute_bbox_size<NORMALIZED_BBOX>(bbox2);
using device::fast_divide_ftz;
float iou = fast_divide_ftz(intersect_size, bbox1_size + bbox2_size - intersect_size);
if (iou > nms_threshold)
indices[j] = -1;
}
}
__syncthreads();
}
if (threadIdx.x == 0)
count[0] = 0;
__syncthreads();
for (auto i : block_stride_range<>(boxes))
{
auto prior_id = indices[i];
if(prior_id != -1)
{
const index_type idx = atomicAdd(&count[0], 1);
indices[idx] = prior_id;
}
}
}
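/* Host-side reference of the greedy NMS performed above (a minimal sketch; assumes the boxes
 * are already sorted by descending score and that BoundingBox is a plain struct with float
 * xmin/ymin/xmax/ymax members, as used in this file):
 *
 *   #include <algorithm>
 *   #include <cstddef>
 *   #include <vector>
 *   static float iou_ref(const BoundingBox& a, const BoundingBox& b)   // normalized boxes
 *   {
 *       const float w = std::max(0.f, std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin));
 *       const float h = std::max(0.f, std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin));
 *       const float inter = w * h;
 *       const float area_a = (a.xmax - a.xmin) * (a.ymax - a.ymin);
 *       const float area_b = (b.xmax - b.xmin) * (b.ymax - b.ymin);
 *       return inter / (area_a + area_b - inter);
 *   }
 *   static std::vector<int> nms_ref(const std::vector<BoundingBox>& boxes, float nms_threshold)
 *   {
 *       std::vector<int> keep;
 *       std::vector<bool> suppressed(boxes.size(), false);
 *       for (std::size_t i = 0; i < boxes.size(); i++) {
 *           if (suppressed[i]) continue;
 *           keep.push_back(static_cast<int>(i));
 *           for (std::size_t j = i + 1; j < boxes.size(); j++)
 *               if (iou_ref(boxes[i], boxes[j]) > nms_threshold) suppressed[j] = true;
 *       }
 *       return keep;
 *   }
 */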
template <class T, std::size_t BINS, int BLOCK_SIZE>
__launch_bounds__(BLOCK_SIZE)
__global__ void nms_collect(
Span<int> kept_indices, Span<int> kept_count, View<int> indices_, View<int> count, View<T> scores_, float threshold,
size_type num_classes, size_type num_priors, size_type classwise_topK, size_type keepTopK, index_type background_class_id)
{
// sorting algorithm is documented in detail in findTopK kernel comments
// no explanations are provided here
// kept_indices: [batch_size, keepTopK]
// kept_count: [batch_size]
const auto b = blockIdx.x;
__shared__ int bins[BINS];
#pragma unroll
for (int unroll = 0; unroll < BINS / BLOCK_SIZE; unroll++)
bins[unroll * BLOCK_SIZE + threadIdx.x] = 0;
__syncthreads();
for (int c = 0; c < num_classes; c++)
{
if (c == background_class_id)
continue;
// indices: [batch_size, num_classes, classwise_topK]
// count: [batch_size, num_classes]
// scores: [batch_size, num_classes, num_priors]
const auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
const auto scores = scores_.data() + (b * num_classes + c) * num_priors;
auto boxes = count[b * num_classes + c];
for (auto i : block_stride_range<BLOCK_SIZE>(boxes))
{
auto prior_id = indices[i];
const float confidence = load_ldg(scores[prior_id]);
if (confidence > threshold)
{
using device::fast_divide_ftz;
auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);
using device::clamp;
int bin_index = conf_scaled * BINS;
bin_index = clamp<int>(bin_index, 0, BINS - 1) - 1; // shift left by one
if (bin_index >= 0)
atomicAdd(&bins[bin_index], 1);
}
}
}
__syncthreads();
constexpr int WARP_SIZE = 32; /* must be equal to warpSize */
// FORWARD_COMPATIBILITY_TAG: WARP_SIZE_DEPENDENT_CODE
if (threadIdx.x < WARP_SIZE)
{
static_assert(BINS % WARP_SIZE == 0, "number of bins must be a multiple of warp size");
const int thread_id = threadIdx.x;
const int inverse_lane_id = WARP_SIZE - thread_id - 1;
int previous_group_first_element = 0;
for (int iter = BINS / WARP_SIZE - 1; iter >= 0; iter--)
{
const index_type idx = iter * WARP_SIZE + thread_id;
auto value = bins[idx];
for (int i = 1; i < WARP_SIZE; i *= 2)
{
auto n = __shfl_down_sync(0xFFFFFFFF, value, i);
if (inverse_lane_id >= i)
value += n;
}
value += previous_group_first_element;
bins[idx] = value;
previous_group_first_element = __shfl_sync(0xFFFFFFFF, value, 0);
}
}
if (threadIdx.x == 0)
kept_count[b] = 0;
__syncthreads();
for (int c = 0; c < num_classes; c++)
{
if (c == background_class_id)
continue;
const auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
const auto scores = scores_.data() + (b * num_classes + c) * num_priors;
auto boxes = count[b * num_classes + c];
for (auto i : block_stride_range<BLOCK_SIZE>(boxes))
{
auto prior_id = indices[i];
const float confidence = load_ldg(scores[prior_id]);
if (confidence > threshold)
{
using device::fast_divide_ftz;
auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);
using device::clamp;
int bin_index = conf_scaled * BINS;
bin_index = clamp<int>(bin_index, 0, BINS - 1);
const index_type idx = atomicAdd(&bins[bin_index], 1);
if (idx < keepTopK)
{
kept_indices[b * keepTopK + idx] = c * num_priors + prior_id;
atomicAdd(&kept_count[b], 1);
}
}
}
}
}
template <class T>
__global__ void consolidate_detections(Span<T> output,
View<int> kept_indices, View<int> kept_count, View<T> decoded_bboxes, View<T> scores, bool share_location,
size_type batch_size, size_type num_classes, size_type num_priors, size_type keepTopK, DevicePtr<int> num_detections)
{
using vector_type = get_vector_type_t<T, 4>;
auto decoded_bboxes_vPtr = vector_type::get_pointer(decoded_bboxes.data());
// output: [1, 1, batch_size * keepTopK, 7]
// kept_indices: [batch_size, keepTopK]
// kept_count: [batch_size]
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
// scores: [batch_size, num_classes, num_priors]
for (int b = 0; b < batch_size; b++)
{
for (auto i : grid_stride_range(kept_count[b]))
{
auto score_id = kept_indices[b * keepTopK + i];
auto c = score_id / num_priors;
auto prior_id = score_id % num_priors;
const auto confidence = scores[b * num_classes * num_priors + score_id];
index_type bbox_id;
if (share_location)
{
// decoded_bboxes: [batch_size, num_priors, 1, 4]
bbox_id = b * num_priors + prior_id;
}
else
{
// decoded_bboxes: [batch_size, num_priors, num_classes, 4]
bbox_id = (b * num_priors + prior_id) * num_classes + c;
}
vector_type bbox;
v_load(bbox, decoded_bboxes_vPtr[bbox_id]);
auto output_id = atomicAdd(num_detections.get(), 1);
output[output_id * 7 + 0] = b;
output[output_id * 7 + 1] = c;
output[output_id * 7 + 2] = confidence;
output[output_id * 7 + 3] = bbox.data[0];
output[output_id * 7 + 4] = bbox.data[1];
output[output_id * 7 + 5] = bbox.data[2];
output[output_id * 7 + 6] = bbox.data[3];
}
}
}
}
template <class T, bool SHARE_LOCATION, bool VARIANCE_ENCODED_IN_TARGET, bool CORNER_TRUE_CENTER_FALSE, bool CLIP_BBOX> static
void launch_decode_boxes_kernel(const Stream& stream, Span<T> decoded_bboxes, View<T> locations, View<T> priors,
bool transpose_location, bool normalized_bbox,
size_type num_loc_classes, index_type background_class_id,
float clip_width, float clip_height)
{
auto kernel = raw::decode_bbox<T, SHARE_LOCATION, VARIANCE_ENCODED_IN_TARGET, CORNER_TRUE_CENTER_FALSE, CLIP_BBOX>;
auto policy = make_policy(kernel, decoded_bboxes.size() / 4, 0, stream);
launch_kernel(kernel, policy, decoded_bboxes, locations, priors, transpose_location, normalized_bbox, num_loc_classes, background_class_id, clip_width, clip_height);
}
template <class T, unsigned int current, class ...Args> static
typename std::enable_if<current == 0, void>
::type dispatch_decode_bboxes(int selector, Args&& ...args) {
if(selector == 0)
launch_decode_boxes_kernel<T, 0, 0, 0, 0>(std::forward<Args>(args)...);
}
template <class T, unsigned int current, class ...Args> static
typename std::enable_if<current != 0, void>
::type dispatch_decode_bboxes(int selector, Args&& ...args) {
if(selector == current)
launch_decode_boxes_kernel<T,
static_cast<bool>(current & 8),
static_cast<bool>(current & 4),
static_cast<bool>(current & 2),
static_cast<bool>(current & 1)>(std::forward<Args>(args)...);
else
dispatch_decode_bboxes<T, current - 1, Args...>(selector, std::forward<Args>(args)...);
}
template <class T>
void decode_bboxes(const Stream& stream, Span<T> output, View<T> locations, View<T> priors,
std::size_t num_loc_classes,
bool share_location, std::size_t background_class_id,
bool transpose_location, bool variance_encoded_in_target,
bool corner_true_or_center_false, bool normalized_bbox,
bool clip_box, float clip_width, float clip_height)
{
/* `config` packs the four boolean kernel template options into one number so that a bit of TMP code can
* run through all possible combinations and instantiate the correct template
*/
unsigned int config = (share_location << 3 | variance_encoded_in_target << 2 | corner_true_or_center_false << 1 | clip_box);
dispatch_decode_bboxes<T, 15>(config, stream, output, locations, priors, transpose_location, normalized_bbox, num_loc_classes, background_class_id, clip_width, clip_height);
}
template void decode_bboxes(const Stream&, Span<__half>, View<__half>, View<__half>, std::size_t, bool, std::size_t, bool, bool, bool, bool, bool, float, float);
template void decode_bboxes(const Stream&, Span<float>, View<float>, View<float>, std::size_t, bool, std::size_t, bool, bool, bool, bool, bool, float, float);
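/* Example of the bit packing above (illustrative): share_location = true with the other three
 * flags false gives config = (1 << 3) = 8, so dispatch_decode_bboxes instantiates
 * launch_decode_boxes_kernel<T, true, false, false, false>.
 */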
template <class T>
void findTopK(const Stream& stream, TensorSpan<int> indices, TensorSpan<int> count, TensorView<T> scores, std::size_t background_class_id, float threshold)
{
// indices: [batch_size, num_classes, classwise_topK]
// count: [batch_size, num_classes]
// scores: [batch_size, num_classes, num_priors]
const auto batch_size = indices.get_axis_size(0);
CV_Assert(count.get_axis_size(0) == batch_size);
CV_Assert(scores.get_axis_size(0) == batch_size);
const auto num_classes = indices.get_axis_size(1);
CV_Assert(count.get_axis_size(1) == num_classes);
CV_Assert(scores.get_axis_size(1) == num_classes);
const auto classwise_topK = indices.get_axis_size(2);
const auto num_priors = scores.get_axis_size(2);
/* each block processes one class from each batch */
constexpr auto BLOCK_SIZE = 256;
dim3 grid_size(num_classes, batch_size);
dim3 block_size(BLOCK_SIZE);
auto policy = execution_policy(grid_size, block_size, stream);
auto kernel = raw::findTopK<T, 2048, BLOCK_SIZE>;
launch_kernel(kernel, policy, indices, count, scores, threshold, classwise_topK, num_classes, num_priors, background_class_id);
}
template void findTopK(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<__half>, std::size_t, float);
template void findTopK(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<float>, std::size_t, float);
template <class T>
void box_collect(const Stream& stream, TensorSpan<T> collected_bboxes, TensorView<T> decoded_bboxes, TensorView<int> indices, TensorView<int> count, bool share_location, std::size_t background_class_id)
{
// collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
// indices: [batch_size, num_classes, classwise_topK]
// count: [batch_size, num_classes]
const auto batch_size = collected_bboxes.get_axis_size(0);
CV_Assert(decoded_bboxes.get_axis_size(0) == batch_size);
CV_Assert(indices.get_axis_size(0) == batch_size);
CV_Assert(count.get_axis_size(0) == batch_size);
const auto num_classes = collected_bboxes.get_axis_size(1);
CV_Assert(indices.get_axis_size(1) == num_classes);
CV_Assert(count.get_axis_size(1) == num_classes);
const auto classwise_topK = collected_bboxes.get_axis_size(2);
CV_Assert(indices.get_axis_size(2) == classwise_topK);
const auto num_priors = decoded_bboxes.get_axis_size(1);
CV_Assert(!share_location || decoded_bboxes.get_axis_size(2) == 1);
constexpr int BLOCK_SIZE = 256;
/* each block processes one class from each batch */
dim3 grid_size(num_classes, batch_size);
dim3 block_size(BLOCK_SIZE);
auto policy = execution_policy(grid_size, block_size, stream);
auto kernel = raw::box_collect<T>;
launch_kernel(kernel, policy, collected_bboxes, decoded_bboxes, indices, count, share_location, num_priors, num_classes, classwise_topK, background_class_id);
}
template void box_collect(const Stream&, TensorSpan<float>, TensorView<float>, TensorView<int>, TensorView<int>, bool, std::size_t);
template void box_collect(const Stream&, TensorSpan<__half>, TensorView<__half>, TensorView<int>, TensorView<int>, bool, std::size_t);
template <class T>
void blockwise_class_nms(const Stream& stream, TensorSpan<int> indices, TensorSpan<int> count, TensorView<T> collected_bboxes,
bool normalized_bbox, std::size_t background_class_id, float nms_threshold)
{
// indices: [batch_size, num_classes, classwise_topK]
// count: [batch_size, num_classes]
// collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
const auto batch_size = indices.get_axis_size(0);
CV_Assert(count.get_axis_size(0) == batch_size);
CV_Assert(collected_bboxes.get_axis_size(0) == batch_size);
const auto num_classes = indices.get_axis_size(1);
CV_Assert(count.get_axis_size(1) == num_classes);
CV_Assert(collected_bboxes.get_axis_size(1) == num_classes);
const auto classwise_topK = indices.get_axis_size(2);
CV_Assert(collected_bboxes.get_axis_size(2) == classwise_topK);
/* each block processes one class from each batch */
auto num_blocks = batch_size * num_classes;
auto num_threads = std::max<std::size_t>(std::min<std::size_t>(1024, classwise_topK), 32);
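/* editor's note: this clamps the per-class thread count to at least one warp (32) and at most one full block (1024) */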
dim3 grid_size(num_blocks);
dim3 block_size(num_threads);
auto policy = execution_policy(grid_size, block_size, stream);
if (normalized_bbox)
{
auto kernel = raw::blockwise_class_nms<T, true>;
launch_kernel(kernel, policy, indices, count, collected_bboxes, num_classes, classwise_topK, background_class_id, nms_threshold);
}
else
{
auto kernel = raw::blockwise_class_nms<T, false>;
launch_kernel(kernel, policy, indices, count, collected_bboxes, num_classes, classwise_topK, background_class_id, nms_threshold);
}
}
template void blockwise_class_nms(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<__half>, bool, std::size_t, float);
template void blockwise_class_nms(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<float>, bool, std::size_t, float);
template <class T>
void nms_collect(const Stream& stream, TensorSpan<int> kept_indices, TensorSpan<int> kept_count,
TensorView<int> indices, TensorView<int> count, TensorView<T> scores, float threshold, std::size_t background_class_id)
{
// kept_indices: [batch_size, keepTopK]
// kept_count: [batch_size]
// indices: [batch_size, num_classes, classwise_topK]
// count: [batch_size, num_classes]
// scores: [batch_size, num_classes, num_priors]
auto batch_size = kept_indices.get_axis_size(0);
CV_Assert(kept_count.get_axis_size(0) == batch_size);
CV_Assert(indices.get_axis_size(0) == batch_size);
CV_Assert(count.get_axis_size(0) == batch_size);
CV_Assert(scores.get_axis_size(0) == batch_size);
auto keepTopK = kept_indices.get_axis_size(1);
auto num_classes = indices.get_axis_size(1);
CV_Assert(count.get_axis_size(1) == num_classes);
CV_Assert(scores.get_axis_size(1) == num_classes);
auto classwise_topK = indices.get_axis_size(2);
auto num_priors = scores.get_axis_size(2);
auto num_blocks = batch_size;
constexpr int BLOCK_SIZE = 1024;
dim3 grid_size(num_blocks);
dim3 block_size(BLOCK_SIZE);
auto policy = execution_policy(grid_size, block_size, stream);
auto kernel = raw::nms_collect<T, 1024, BLOCK_SIZE>;
launch_kernel(kernel, policy, kept_indices, kept_count, indices, count, scores, threshold, num_classes, num_priors, classwise_topK, keepTopK, background_class_id);
}
template void nms_collect(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<int>, TensorView<int>, TensorView<__half>, float, std::size_t);
template void nms_collect(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<int>, TensorView<int>, TensorView<float>, float, std::size_t);
template <class T>
void consolidate_detections(const Stream& stream, TensorSpan<T> output,
TensorView<int> kept_indices, TensorView<int> kept_count,
TensorView<T> decoded_bboxes, TensorView<T> scores, bool share_location, DevicePtr<int> num_detections)
{
// output: [1, 1, batch_size * keepTopK, 7]
// kept_indices: [batch_size, keepTopK]
// kept_count: [batch_size]
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
// scores: [batch_size, num_classes, num_priors]
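// editor's note: the 7 values per detection typically follow the DetectionOutput convention [image_id, label, confidence, xmin, ymin, xmax, ymax]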
auto batch_size = kept_indices.get_axis_size(0);
CV_Assert(kept_count.get_axis_size(0) == batch_size);
CV_Assert(decoded_bboxes.get_axis_size(0) == batch_size);
CV_Assert(scores.get_axis_size(0) == batch_size);
auto keepTopK = kept_indices.get_axis_size(1);
auto num_classes = scores.get_axis_size(1);
auto num_priors = scores.get_axis_size(2);
CV_Assert(batch_size * keepTopK * 7 == output.size());
auto kernel = raw::consolidate_detections<T>;
auto policy = make_policy(kernel, keepTopK, 0, stream);
launch_kernel(kernel, policy, output, kept_indices, kept_count, decoded_bboxes, scores, share_location, batch_size, num_classes, num_priors, keepTopK, num_detections);
}
template void consolidate_detections(const Stream&, TensorSpan<__half>, TensorView<int>, TensorView<int>, TensorView<__half>, TensorView<__half>, bool, DevicePtr<int>);
template void consolidate_detections(const Stream&, TensorSpan<float>, TensorView<int>, TensorView<int>, TensorView<float>, TensorView<float>, bool, DevicePtr<int>);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,125 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "functors.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, class EltwiseOp, class ActivationOp, std::size_t N>
__global__ void eltwise_op_generic_op_vec(Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params eltwise_params, const typename ActivationOp::Params act_params) {
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto x_vPtr = vector_type::get_pointer(x.data());
auto y_vPtr = vector_type::get_pointer(y.data());
EltwiseOp eltwise_op(eltwise_params);
ActivationOp activation_op(act_params);
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
vector_type vec_x, vec_y;
v_load(vec_x, x_vPtr[i]);
v_load(vec_y, y_vPtr[i]);
for (int j = 0; j < vector_type::size(); j++)
vec_x.data[j] = activation_op(eltwise_op(vec_x.data[j], vec_y.data[j]));
v_store(output_vPtr[i], vec_x);
}
}
}
template <class T, class EltwiseOp, class ActivationOp, std::size_t N> static
void launch_vectorized_eltwise_op_generic_op(const Stream& stream, Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params& eltwise_params, const typename ActivationOp::Params& act_params) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(x, N));
CV_Assert(is_fully_aligned<T>(y, N));
auto kernel = raw::eltwise_op_generic_op_vec<T, EltwiseOp, ActivationOp, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, x, y, eltwise_params, act_params);
}
template <class T, class EltwiseOp, class ActivationOp> static
void eltwise_op_generic_op(const Stream& stream, Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params& eltwise_params = {}, const typename ActivationOp::Params& act_params = {}) {
CV_Assert(output.size() == x.size());
CV_Assert(output.size() == y.size());
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
launch_vectorized_eltwise_op_generic_op<T, EltwiseOp, ActivationOp, 4>(stream, output, x, y, eltwise_params, act_params);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
launch_vectorized_eltwise_op_generic_op<T, EltwiseOp, ActivationOp, 2>(stream, output, x, y, eltwise_params, act_params);
} else {
launch_vectorized_eltwise_op_generic_op<T, EltwiseOp, ActivationOp, 1>(stream, output, x, y, eltwise_params, act_params);
}
}
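/* editor's note: the wrappers below fuse a two-operand eltwise op with an activation in a single pass,
 * e.g. eltwise_sum_2_relu computes output[i] = ReLU(x[i] + y[i]) (leaky if slope != 0).
 */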
template <class T>
void eltwise_sum_2_relu(const Stream& stream, Span<T> output, View<T> x, View<T> y, T slope) {
eltwise_op_generic_op<T, SumFunctor<T>, ReLUFunctor<T>>(stream, output, x, y, {}, {slope});
}
template <class T>
void eltwise_sum_2_clipped_relu(const Stream& stream, Span<T> output, View<T> x, View<T> y, T floor, T ceiling) {
CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
eltwise_op_generic_op<T, SumFunctor<T>, ClippedReLUFunctor<T>>(stream, output, x, y, {}, {floor, ceiling});
}
template <class T>
void eltwise_sum_2_tanh(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
eltwise_op_generic_op<T, SumFunctor<T>, TanHFunctor<T>>(stream, output, x, y);
}
template <class T>
void eltwise_sum_2_swish(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
eltwise_op_generic_op<T, SumFunctor<T>, SwishFunctor<T>>(stream, output, x, y);
}
template <class T>
void eltwise_sum_2_mish(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
eltwise_op_generic_op<T, SumFunctor<T>, MishFunctor<T>>(stream, output, x, y);
}
template <class T>
void eltwise_sum_2_sigmoid(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
eltwise_op_generic_op<T, SumFunctor<T>, SigmoidFunctor<T>>(stream, output, x, y);
}
template <class T>
void eltwise_sum_2_power(const Stream& stream, Span<T> output, View<T> x, View<T> y, T exp, T scale, T shift) {
eltwise_op_generic_op<T, SumFunctor<T>, PowerFunctor<T>>(stream, output, x, y, {}, {exp, scale, shift});
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void eltwise_sum_2_relu<__half>(const Stream&, Span<__half>, View<__half>, View<__half>, __half);
template void eltwise_sum_2_clipped_relu<__half>(const Stream&, Span<__half>, View<__half>, View<__half>, __half, __half);
template void eltwise_sum_2_tanh<__half>(const Stream&, Span<__half>, View<__half>, View<__half>);
template void eltwise_sum_2_swish<__half>(const Stream&, Span<__half>, View<__half>, View<__half>);
template void eltwise_sum_2_mish<__half>(const Stream&, Span<__half>, View<__half>, View<__half>);
template void eltwise_sum_2_sigmoid<__half>(const Stream&, Span<__half>, View<__half>, View<__half>);
template void eltwise_sum_2_power<__half>(const Stream&, Span<__half>, View<__half>, View<__half>, __half, __half, __half);
#endif
template void eltwise_sum_2_relu<float>(const Stream&, Span<float>, View<float>, View<float>, float);
template void eltwise_sum_2_clipped_relu<float>(const Stream&, Span<float>, View<float>, View<float>, float, float);
template void eltwise_sum_2_tanh<float>(const Stream&, Span<float>, View<float>, View<float>);
template void eltwise_sum_2_swish<float>(const Stream&, Span<float>, View<float>, View<float>);
template void eltwise_sum_2_mish<float>(const Stream&, Span<float>, View<float>, View<float>);
template void eltwise_sum_2_sigmoid<float>(const Stream&, Span<float>, View<float>, View<float>);
template void eltwise_sum_2_power<float>(const Stream&, Span<float>, View<float>, View<float>, float, float, float);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,334 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "array.hpp"
#include "functors.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"
#include "kernel_dispatcher.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include <opencv2/core.hpp>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, class EltwiseOp, std::size_t N>
__global__ void eltwise_op_vec(Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params params) {
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto x_vPtr = vector_type::get_pointer(x.data());
auto y_vPtr = vector_type::get_pointer(y.data());
EltwiseOp eltwise_op(params);
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
vector_type vec_x, vec_y;
v_load(vec_x, x_vPtr[i]);
v_load(vec_y, y_vPtr[i]);
for (int j = 0; j < vector_type::size(); j++)
vec_x.data[j] = eltwise_op(vec_x.data[j], vec_y.data[j]);
v_store(output_vPtr[i], vec_x);
}
}
template <class T, class EltwiseOp, std::size_t Rank>
__global__ void eltwise_op_bcast(
Span<T> output, array<size_type, Rank> out_strides,
View<T> x, array<size_type, Rank> x_strides, array<bool, Rank> x_bcast,
View<T> y, array<size_type, Rank> y_strides, array<bool, Rank> y_bcast,
const typename EltwiseOp::Params params) {
EltwiseOp eltwise_op(params);
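/* editor's note: out_strides[j] holds the number of elements spanned by one step along axis j,
 * so (i % out_strides[j - 1]) / out_strides[j] recovers the index along axis j from the flat
 * output index i; axes marked as broadcast contribute nothing to the input offsets.
 */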
for (auto i : grid_stride_range(output.size())) {
index_type out_index = i / out_strides[0];
index_type x_index = x_bcast[0] ? 0 : out_index * x_strides[0];
index_type y_index = y_bcast[0] ? 0 : out_index * y_strides[0];
for (int j = 1; j < Rank; j++)
{
out_index = (i % out_strides[j - 1]) / out_strides[j];
if (!x_bcast[j])
x_index += out_index * x_strides[j];
if (!y_bcast[j])
y_index += out_index * y_strides[j];
}
output[i] = eltwise_op(x[x_index], y[y_index]);
}
}
}
template <class T, class EltwiseOp, std::size_t N> static
void launch_vectorized_eltwise_op(const Stream& stream, Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params& params) {
CV_Assert(x.size() == y.size());
CV_Assert(x.size() == output.size());
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(x, N));
CV_Assert(is_fully_aligned<T>(y, N));
auto kernel = raw::eltwise_op_vec<T, EltwiseOp, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, x, y, params);
}
template <class T, class EltwiseOp, std::size_t Rank> static
void launch_eltwise_op_bcast(
const Stream& stream,
Span<T> output, const std::vector<std::size_t>& outStride,
View<T> x, const std::vector<std::size_t>& inStride1, const std::vector<int>& inBcast1,
View<T> y, const std::vector<std::size_t>& inStride2, const std::vector<int>& inBcast2,
const typename EltwiseOp::Params& params)
{
CV_Assert(outStride.size() == Rank);
CV_Assert(inStride1.size() == Rank);
CV_Assert(inStride2.size() == Rank);
CV_Assert(inBcast1.size() == Rank);
CV_Assert(inBcast2.size() == Rank);
array<size_type, Rank> outStride_k, inStride1_k, inStride2_k;
outStride_k.assign(std::begin(outStride), std::end(outStride));
inStride1_k.assign(std::begin(inStride1), std::end(inStride1));
inStride2_k.assign(std::begin(inStride2), std::end(inStride2));
array<bool, Rank> inBcast1_k, inBcast2_k;
inBcast1_k.assign(std::begin(inBcast1), std::end(inBcast1));
inBcast2_k.assign(std::begin(inBcast2), std::end(inBcast2));
auto kernel = raw::eltwise_op_bcast<T, EltwiseOp, Rank>;
auto policy = make_policy(kernel, output.size(), 0, stream);
launch_kernel(kernel, policy, output, outStride_k, x, inStride1_k, inBcast1_k, y, inStride2_k, inBcast2_k, params);
}
GENERATE_KERNEL_DISPATCHER_2TP(eltwise_op_bcast_dispatcher, launch_eltwise_op_bcast);
template <class T, class EltwiseOp> static
void eltwise_op(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y, const typename EltwiseOp::Params& params = {}) {
if (is_shape_same(output, x) && is_shape_same(output, y))
{
/* no broadcasting; use fast path */
CV_Assert(x.size() == y.size());
CV_Assert(x.size() == output.size());
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
launch_vectorized_eltwise_op<T, EltwiseOp, 4>(stream, output, x, y, params);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
launch_vectorized_eltwise_op<T, EltwiseOp, 2>(stream, output, x, y, params);
} else {
launch_vectorized_eltwise_op<T, EltwiseOp, 1>(stream, output, x, y, params);
}
}
else
{
CV_Assert(is_shape_compatible(output, x));
CV_Assert(is_shape_compatible(output, y));
/* matching singleton axes in both input tensors can be eliminated
*
* Reasoning:
* ----------
* Singleton axes do not contribute towards address calculation. They are redundant
* unless there is broadcasting. If both input tensors have singleton axis at a
* specified position, there is no broadcasting on that axis.
*
* Example:
* ---------
* x: [1, 256, 32, 32] -> [256, 32, 32]
* y: [1, 256, 1, 1] -> [256, 1, 1]
*/
for (int r = 0; r < output.rank(); r++)
{
while (x.get_axis_size(r) == 1 && y.get_axis_size(r) == 1) {
CV_Assert(output.get_axis_size(r) == 1);
x.squeeze(r);
y.squeeze(r);
output.squeeze(r);
}
}
auto inShape1 = x.shape_as_vector();
auto inShape2 = y.shape_as_vector();
auto outShape = output.shape_as_vector();
/* contiguous axes that do not broadcast can be merged into one axis
*
* Example:
* ---------
* x: [32, 8, 8] -> [32, 64]
* y: [1, 8, 8] -> [1, 64]
*/
for (int i = 0; i < inShape1.size(); i++) {
/* check if axis `i` requires any broadcasting */
if (inShape1[i] == inShape2[i]) {
/* loop invariant: `i` is the first axis in the contiguous axis sequence */
int j = i + 1; /* `j` is the axis which we will attempt to merge */
while (j < inShape1.size() && inShape1[j] == inShape2[j]) {
CV_Assert(outShape[j] == inShape1[j]);
/* `j` axis is also used fully; merge `i` and `j` */
auto new_size = inShape1[i] * inShape1[j];
inShape1[i] = new_size;
inShape2[i] = new_size;
/* delete axis `j` */
inShape1.erase(std::begin(inShape1) + j);
inShape2.erase(std::begin(inShape2) + j);
outShape.erase(std::begin(outShape) + j);
/* optimizations should not break the invariants */
CV_Assert(inShape1.size() == outShape.size());
CV_Assert(inShape2.size() == outShape.size());
CV_Assert(inShape1[i] == outShape[i]);
CV_Assert(inShape2[i] == outShape[i]);
}
}
}
/* contiguous broadcasting axes on the same tensor can be merged into one axis
*
* Example:
* ---------
* x: [256, 8, 8] -> [256, 64]
* y: [256, 1, 1] -> [256, 1]
*/
for (int i = 0; i < inShape1.size(); i++) {
/* check if axis `i` requires any broadcasting in tensor 1 */
if (inShape1[i] == 1 && inShape2[i] != 1) {
/* loop invariant: `i` is the first axis in the contiguous axis sequence */
int j = i + 1; /* `j` is the axis which we will attempt to merge */
while (j < inShape1.size() && inShape1[j] == 1 && inShape2[j] != 1) {
CV_Assert(outShape[j] == inShape2[j]);
/* `j` axis is also used fully; merge `i` and `j` */
inShape1[i] = 1;
inShape2[i] = inShape2[i] * inShape2[j];
outShape[i] = inShape2[i];
/* delete axis `j` */
inShape1.erase(std::begin(inShape1) + j);
inShape2.erase(std::begin(inShape2) + j);
outShape.erase(std::begin(outShape) + j);
/* optimizations should not break the invariants */
CV_Assert(inShape1.size() == outShape.size());
CV_Assert(inShape2.size() == outShape.size());
CV_Assert(inShape1[i] == 1);
CV_Assert(inShape2[i] == outShape[i]);
}
}
/* check if axis `i` requires any broadcasting in tensor 2 */
if (inShape1[i] != 1 && inShape2[i] == 1) {
/* loop invariant: `i` is the first axis in the contiguous axis sequence */
int j = i + 1; /* `j` is the axis which we will attempt to merge */
while (j < inShape1.size() && inShape1[j] != 1 && inShape2[j] == 1) {
CV_Assert(outShape[j] == inShape1[j]);
/* `j` axis is also used fully; merge `i` and `j` */
inShape1[i] = inShape1[i] * inShape1[j];
inShape2[i] = 1;
outShape[i] = inShape1[i];
/* delete axis `j` */
inShape1.erase(std::begin(inShape1) + j);
inShape2.erase(std::begin(inShape2) + j);
outShape.erase(std::begin(outShape) + j);
/* optimizations should not break the invariants */
CV_Assert(inShape1.size() == outShape.size());
CV_Assert(inShape2.size() == outShape.size());
CV_Assert(inShape1[i] == outShape[i]);
CV_Assert(inShape2[i] == 1);
}
}
}
auto rank = outShape.size();
std::vector<std::size_t> inStride1(rank), inStride2(rank), outStride(rank);
inStride1.back() = 1;
inStride2.back() = 1;
outStride.back() = 1;
/* garbage, ..., garbage, 1 */
std::copy(std::begin(inShape1) + 1, std::end(inShape1), std::begin(inStride1));
std::copy(std::begin(inShape2) + 1, std::end(inShape2), std::begin(inStride2));
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
/* dim[1], dim[2], ..., dim[-1], 1 */
std::partial_sum(inStride1.rbegin(), inStride1.rend(), inStride1.rbegin(), std::multiplies<std::size_t>());
std::partial_sum(inStride2.rbegin(), inStride2.rend(), inStride2.rbegin(), std::multiplies<std::size_t>());
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
/* stride[0], stride[1], ..., stride[-2], 1 */
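/* worked example (editor's illustration): outShape = [2, 3, 4]
 * after the copy:        outStride = [3, 4, 1]
 * after the partial_sum: outStride = [12, 4, 1]
 * i.e. outStride[j] is the number of elements spanned by one step along axis j
 */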
std::vector<int> inBcast1(rank), inBcast2(rank);
std::transform(std::begin(inShape1), std::end(inShape1), std::begin(inBcast1), [](std::size_t sz) { return sz == 1; });
std::transform(std::begin(inShape2), std::end(inShape2), std::begin(inBcast2), [](std::size_t sz) { return sz == 1; });
CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
eltwise_op_bcast_dispatcher<T, EltwiseOp, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, x, inStride1, inBcast1, y, inStride2, inBcast2, params);
}
}
template <class T>
void eltwise_max_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
eltwise_op<T, MaxFunctor<T>>(stream, output, x, y);
}
template <class T>
void eltwise_min_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
eltwise_op<T, MinFunctor<T>>(stream, output, x, y);
}
template <class T>
void eltwise_sum_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
eltwise_op<T, SumFunctor<T>>(stream, output, x, y);
}
template <class T>
void eltwise_sum_coeff_2(const Stream& stream, TensorSpan<T> output, T coeff_x, TensorView<T> x, T coeff_y, TensorView<T> y) {
eltwise_op<T, ScaledSumFunctor<T>>(stream, output, x, y, {coeff_x, coeff_y});
}
template <class T>
void eltwise_prod_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
eltwise_op<T, ProductFunctor<T>>(stream, output, x, y);
}
template <class T>
void eltwise_div_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
eltwise_op<T, DivFunctor<T>>(stream, output, x, y);
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void eltwise_div_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
template void eltwise_prod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
template void eltwise_sum_coeff_2(const Stream&, TensorSpan<__half>, __half, TensorView<__half>, __half, TensorView<__half>);
template void eltwise_sum_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
template void eltwise_max_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
template void eltwise_min_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
#endif
template void eltwise_div_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
template void eltwise_prod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
template void eltwise_sum_coeff_2(const Stream&, TensorSpan<float>, float, TensorView<float>, float, TensorView<float>);
template void eltwise_sum_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
template void eltwise_max_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
template void eltwise_min_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,81 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_EXECUTION_HPP
#define OPENCV_DNN_SRC_CUDA_EXECUTION_HPP
#include "../cuda4dnn/csl/error.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include <opencv2/core.hpp>
#include <cuda_runtime_api.h>
#include <cstddef>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
struct execution_policy {
execution_policy(dim3 grid_size, dim3 block_size)
: grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ 0 } { }
execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem)
: grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ nullptr } { }
execution_policy(dim3 grid_size, dim3 block_size, const Stream& strm)
: grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ strm.get() } { }
execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem, const Stream& strm)
: grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ strm.get() } { }
dim3 grid;
dim3 block;
std::size_t sharedMem;
cudaStream_t stream;
};
/* this overload shouldn't be necessary; we should always provide a bound on the number of threads */
/*
template <class Kernel> inline
execution_policy make_policy(Kernel kernel, std::size_t sharedMem = 0, const Stream& stream = 0) {
int grid_size, block_size;
CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem));
return execution_policy(grid_size, block_size, sharedMem, stream);
}*/
template <class Kernel> inline
execution_policy make_policy(Kernel kernel, std::size_t max_threads, std::size_t sharedMem = 0, const Stream& stream = 0) {
CV_Assert(max_threads > 0);
int grid_size = 0, block_size = 0;
CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem));
if (grid_size * block_size > max_threads) {
grid_size = (max_threads + block_size - 1) / block_size;
if (block_size > max_threads)
block_size = max_threads;
}
CV_Assert(grid_size >= 1 && block_size >= 1);
return execution_policy(grid_size, block_size, sharedMem, stream);
}
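/* typical usage (editor's sketch; `my_kernel` and `data` are hypothetical):
 *
 *   auto kernel = raw::my_kernel<float>;
 *   auto policy = make_policy(kernel, data.size(), 0, stream); // bound the thread count by the element count
 *   launch_kernel(kernel, policy, data, other_args...);
 */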
template <class Kernel, typename ...Args> inline
void launch_kernel(Kernel kernel, Args ...args) {
auto policy = make_policy(kernel);
kernel <<<policy.grid, policy.block>>> (args...);
}
template <class Kernel, typename ...Args> inline
void launch_kernel(Kernel kernel, dim3 grid, dim3 block, Args ...args) {
kernel <<<grid, block>>> (args...);
}
template <class Kernel, typename ...Args> inline
void launch_kernel(Kernel kernel, execution_policy policy, Args ...args) {
kernel <<<policy.grid, policy.block, policy.sharedMem, policy.stream>>> (args...);
}
}}}} /* namespace cv::dnn::cuda4dnn::csl */
#endif /* OPENCV_DNN_SRC_CUDA_EXECUTION_HPP */

View File

@@ -0,0 +1,98 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t N>
__global__ void fill_vec(Span<T> output, T value) {
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
vector_type vec;
for (int j = 0; j < vector_type::size(); j++)
vec.data[j] = value;
v_store(output_vPtr[i], vec);
}
}
template <class T, std::size_t N>
__global__ void copy_vec(Span<T> output, View<T> input) {
using vector_type = get_vector_type_t<T, N>;
auto input_vPtr = vector_type::get_pointer(input.data());
auto output_vPtr = vector_type::get_pointer(output.data());
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
vector_type vec;
v_load(vec, input_vPtr[i]);
v_store(output_vPtr[i], vec);
}
}
}
template <class T, std::size_t N> static
void launch_vectorized_fill(const Stream& stream, Span<T> output, T value) {
CV_Assert(is_fully_aligned<T>(output, N));
auto kernel = raw::fill_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, value);
}
template <class T>
void fill(const Stream& stream, Span<T> output, T value) {
if (is_fully_aligned<T>(output, 4)) {
launch_vectorized_fill<T, 4>(stream, output, value);
} else if (is_fully_aligned<T>(output, 2)) {
launch_vectorized_fill<T, 2>(stream, output, value);
} else {
launch_vectorized_fill<T, 1>(stream, output, value);
}
}
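/* editor's note: the 4/2/1 dispatch above selects the widest vectorized kernel whose alignment
 * requirements the buffers satisfy; the scalar (N = 1) path is always valid. The same pattern
 * is used by copy() below and by most other kernels in this backend.
 */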
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void fill(const Stream&, Span<__half>, __half);
#endif
template void fill(const Stream&, Span<float>, float);
template void fill(const Stream&, Span<int>, int);
template <class T, std::size_t N> static
void launch_vectorized_copy(const Stream& stream, Span<T> output, View<T> input) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
auto kernel = raw::copy_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input);
}
template <class T>
void copy(const Stream& stream, Span<T> output, View<T> input) {
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
launch_vectorized_copy<T, 4>(stream, output, input);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
launch_vectorized_copy<T, 2>(stream, output, input);
} else {
launch_vectorized_copy<T, 1>(stream, output, input);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void copy(const Stream&, Span<__half>, View<__half>);
#endif
template void copy(const Stream&, Span<float>, View<float>);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,102 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <std::size_t N>
__global__ void fp32_to_fp16(Span<__half> output, View<float> input) {
using output_vector_type = get_vector_type_t<__half, N>;
using input_vector_type = get_vector_type_t<float, N>;
auto output_vPtr = output_vector_type::get_pointer(output.data());
auto input_vPtr = input_vector_type::get_pointer(input.data());
for (auto i : grid_stride_range(output.size() / output_vector_type::size())) {
input_vector_type in_vec;
v_load(in_vec, input_vPtr[i]);
output_vector_type out_vec;
for (int j = 0; j < output_vector_type::size(); j++)
out_vec.data[j] = __float2half(in_vec.data[j]);
v_store(output_vPtr[i], out_vec);
}
}
template <std::size_t N>
__global__ void fp16_to_fp32(Span<float> output, View<__half> input) {
using output_vector_type = get_vector_type_t<float, N>;
using input_vector_type = get_vector_type_t<__half, N>;
auto output_vPtr = output_vector_type::get_pointer(output.data());
auto input_vPtr = input_vector_type::get_pointer(input.data());
for (auto i : grid_stride_range(output.size() / output_vector_type::size())) {
input_vector_type in_vec;
v_load(in_vec, input_vPtr[i]);
output_vector_type out_vec;
for (int j = 0; j < output_vector_type::size(); j++)
out_vec.data[j] = __half2float(in_vec.data[j]);
v_store(output_vPtr[i], out_vec);
}
}
}
template <std::size_t N> static
void launch_vectorized_fp32_to_fp16(const Stream& stream, Span<__half> output, View<float> input) {
CV_Assert(is_fully_aligned<__half>(output, N));
CV_Assert(is_fully_aligned<float>(input, N));
auto kernel = raw::fp32_to_fp16<N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input);
}
void fp32_to_fp16(const Stream& stream, Span<__half> output, View<float> input) {
if (is_fully_aligned<__half>(output, 4) && is_fully_aligned<float>(input, 4)) {
launch_vectorized_fp32_to_fp16<4>(stream, output, input);
} else if (is_fully_aligned<__half>(output, 2) && is_fully_aligned<float>(input, 2)) {
launch_vectorized_fp32_to_fp16<2>(stream, output, input);
} else {
launch_vectorized_fp32_to_fp16<1>(stream, output, input);
}
}
template <std::size_t N> static
void launch_vectorized_fp16_to_fp32(const Stream& stream, Span<float> output, View<__half> input) {
CV_Assert(is_fully_aligned<float>(output, N));
CV_Assert(is_fully_aligned<__half>(input, N));
auto kernel = raw::fp16_to_fp32<N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input);
}
void fp16_to_fp32(const Stream& stream, Span<float> output, View<__half> input) {
if (is_fully_aligned<float>(output, 4) && is_fully_aligned<__half>(input, 4)) {
launch_vectorized_fp16_to_fp32<4>(stream, output, input);
} else if (is_fully_aligned<float>(output, 2) && is_fully_aligned<__half>(input, 2)) {
launch_vectorized_fp16_to_fp32<2>(stream, output, input);
} else {
launch_vectorized_fp16_to_fp32<1>(stream, output, input);
}
}
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,334 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP
#define OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP
#include <cuda_runtime.h>
#include "math.hpp"
#include "../cuda4dnn/csl/nvcc_defs.hpp"
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template <class T>
struct IdentityFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE IdentityFunctor() { }
CUDA4DNN_DEVICE IdentityFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T value) {
return value;
};
};
template <class T>
struct ReLUFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() : slope(0) { }
CUDA4DNN_HOST_DEVICE Params(T slope_) : slope(slope_) { }
T slope;
};
CUDA4DNN_DEVICE ReLUFunctor() : ReLUFunctor(Params{}) { }
CUDA4DNN_DEVICE ReLUFunctor(const Params& params) : slope(params.slope) { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::log1pexp;
return value >= T(0) ? value : slope * value;
}
T slope;
};
template <class T>
struct ClippedReLUFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() : floor(0), ceiling(6) { }
CUDA4DNN_HOST_DEVICE Params(T floor_, T ceiling_) : floor(floor_), ceiling(ceiling_) { }
T floor, ceiling;
};
CUDA4DNN_DEVICE ClippedReLUFunctor() : ClippedReLUFunctor(Params{}) { }
CUDA4DNN_DEVICE ClippedReLUFunctor(const Params& params) : floor{params.floor}, ceiling{params.ceiling} { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::clamp;
return clamp(value, floor, ceiling);
}
T floor, ceiling;
};
template <class T>
struct TanHFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE TanHFunctor() { }
CUDA4DNN_DEVICE TanHFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::tanh;
return tanh(value);
}
};
template <class T>
struct SwishFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE SwishFunctor() { }
CUDA4DNN_DEVICE SwishFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T value) {
// f(x) = x * sigmoid(x)
using csl::device::fast_divide;
using csl::device::fast_exp;
return fast_divide(value, static_cast<T>(1) + fast_exp(-value));
}
};
template <class T>
struct MishFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE MishFunctor() { }
CUDA4DNN_DEVICE MishFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::tanh;
using csl::device::log1pexp;
return value * tanh(log1pexp(value));
}
};
template <>
struct MishFunctor<float> {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE MishFunctor() { }
CUDA4DNN_DEVICE MishFunctor(const Params& params) { }
CUDA4DNN_DEVICE float operator()(float value) {
// f(x) = x * tanh(log1pexp(x));
using csl::device::fast_divide;
using csl::device::fast_exp;
auto e = fast_exp(value);
auto n = e * e + 2 * e;
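/* derivation (editor's note): with e = exp(x) and n = e*e + 2*e, (1 + e)^2 = n + 1, so
 * tanh(log1pexp(x)) = ((1 + e)^2 - 1) / ((1 + e)^2 + 1) = n / (n + 2); hence
 * x * tanh(log1pexp(x)) = x * n / (n + 2) = x - 2x / (n + 2). The two returns below are
 * algebraically equivalent; the branch picks the numerically safer form for the given x.
 */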
if (value <= -0.6f)
return value * fast_divide(n, n + 2);
return value - 2 * fast_divide(value, n + 2);
}
};
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <>
struct MishFunctor<__half> {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE MishFunctor() { }
CUDA4DNN_DEVICE MishFunctor(const Params& params) { }
CUDA4DNN_DEVICE __half operator()(__half value) {
return MishFunctor<float>()(value);
}
};
#endif
template <class T>
struct SigmoidFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE SigmoidFunctor() { }
CUDA4DNN_DEVICE SigmoidFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::fast_sigmoid;
return fast_sigmoid(value);
}
};
template <class T>
struct ELUFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE ELUFunctor() { }
CUDA4DNN_DEVICE ELUFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::expm1;
return value >= T(0) ? value : expm1(value);
}
};
template <class T>
struct AbsFunctor {
struct Params { };
CUDA4DNN_DEVICE AbsFunctor() { }
CUDA4DNN_DEVICE AbsFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::abs;
return abs(value);
}
};
template <class T>
struct BNLLFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE BNLLFunctor() { }
CUDA4DNN_DEVICE BNLLFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::log1pexp;
return value > T(0) ? value + log1pexp(-value) : log1pexp(value);
}
};
template <class T>
struct PowerFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() : exp(1), scale(1), shift(0) { }
CUDA4DNN_HOST_DEVICE Params(T exp_, T scale_, T shift_) : exp(exp_), scale(scale_), shift(shift_) { }
T exp, scale, shift;
};
CUDA4DNN_DEVICE PowerFunctor() : PowerFunctor(Params{}) { }
CUDA4DNN_DEVICE PowerFunctor(const Params& params) : exp{params.exp}, scale{params.scale}, shift{params.shift} { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::pow;
return pow(shift + scale * value, exp);
}
T exp, scale, shift;
};
template <class T>
struct ExpFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() : normScale(1), normShift(0) { }
CUDA4DNN_HOST_DEVICE Params(T nScale_, T nShift_) : normScale(nScale_), normShift(nShift_) { }
T normScale, normShift;
};
CUDA4DNN_DEVICE ExpFunctor() : ExpFunctor(Params{}) { }
CUDA4DNN_DEVICE ExpFunctor(const Params& params) : normScale{params.normScale}, normShift{params.normShift} { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::fast_exp;
return fast_exp(normShift + normScale * value);
}
T normScale, normShift;
};
template <class T>
struct MaxFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE MaxFunctor() { }
CUDA4DNN_DEVICE MaxFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T x, T y) {
using csl::device::max;
return max(x, y);
}
};
template <class T>
struct MinFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE MinFunctor() { }
CUDA4DNN_DEVICE MinFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T x, T y) {
using csl::device::min;
return min(x, y);
}
};
template <class T>
struct SumFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE SumFunctor() { }
CUDA4DNN_DEVICE SumFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T x, T y) { return x + y; }
};
template <class T>
struct ScaledSumFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() : scale_x(1), scale_y(1) { }
CUDA4DNN_HOST_DEVICE Params(T scale_x_, T scale_y_) : scale_x(scale_x_), scale_y(scale_y_) { }
T scale_x, scale_y;
};
CUDA4DNN_DEVICE ScaledSumFunctor() : scale_x(1), scale_y(1) { }
CUDA4DNN_DEVICE ScaledSumFunctor(const Params& params) : scale_x{params.scale_x}, scale_y{params.scale_y} { }
CUDA4DNN_DEVICE T operator()(T x, T y) { return scale_x * x + scale_y * y; }
T scale_x, scale_y;
};
template <class T>
struct ProductFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE ProductFunctor() { }
CUDA4DNN_DEVICE ProductFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T x, T y) { return x * y; }
};
template <class T>
struct DivFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE DivFunctor() { }
CUDA4DNN_DEVICE DivFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T x, T y) { return x / y; }
};
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
#endif /* OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP */

View File

@@ -0,0 +1,467 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "math.hpp"
#include "bbox_utils.hpp"
#include "grid_stride_range.hpp"
#include "block_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"
#include "memory.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, bool NORMALIZED_BBOX, int BLOCK_SIZE>
__launch_bounds__(BLOCK_SIZE)
__global__ void grid_nms(Span<unsigned int> mask_, Span<int> count_, View<T> bboxes_, size_type num_classes, index_type background_class_id, size_type topK, size_type topK_gs, float nms_threshold)
{
// topK_gs is topK rounded upwards to a multiple of GROUP_SIZE
// mask: [batch_size, num_classes, topK_gs, topK_gs / 32]
// bboxes: [batch_size, num_classes, topK, 4]
// count: [batch_size, num_classes]
const index_type c = blockIdx.y;
const index_type b = blockIdx.z;
if (c == background_class_id)
return;
auto mask = mask_.data() + (b * num_classes + c) * topK_gs * topK_gs / 32;
auto bboxes = bboxes_.data() + (b * num_classes + c) * topK * 4;
auto count = count_.data() + b * num_classes + c;
const auto boxes = *count;
if (boxes == 0)
return;
/* We divide the set of boxes into groups containing BLOCK_SIZE boxes */
const auto num_groups = (boxes + BLOCK_SIZE - 1) / BLOCK_SIZE;
/* We need to calculate IOUs for every pair of boxes. We can generalize and say that
* we need to compute IOUs of every group with every other group including itself.
*/
// Each block processes a pair of groups.
const index_type group_i = blockIdx.x % num_groups;
const index_type group_j = blockIdx.x / num_groups;
/* we use __syncthreads() later but note that the following condition will cause all threads
* in the block to exit; hence, no thread will execute a divergent __syncthreads()
*/
if (group_i >= num_groups || group_j >= num_groups)
return;
/* Note that IOU(A, B) = IOU(B, A). Hence, if we compute IOU(GROUP_A, GROUP_B), we do not need
* to compute IOU(GROUP_B, GROUP_A). We still have to compute IOU(GROUP_A, GROUP_A) though since
* each group has many boxes and we need IOUs amongst boxes within a group.
*
* We arbitrarily choose a scheme to exit: exit if group_i is greater than group_j. This way we only
* compute IOUs between groups once. While nearly half the blocks are wasted, it's ok since they exit
* early on and the working blocks are compute heavy.
*/
if (group_i > group_j)
return;
/* the following variables contain the absolute box number of the first box of their respective groups */
const auto group_i_offset = group_i * BLOCK_SIZE;
const auto group_j_offset = group_j * BLOCK_SIZE;
/* MAIN LOOP LOGIC:
* We compare a box `i` from group_i with all boxes in group_j in each iteration. The box `j` is fixed
* for each thread. The `j` exactly maps to the thread index. Hence, the `j` is a loop invariant. Each
* thread of the block computes the overlap between box `i` and its box `j`.
*
* for (int i = 0; i < BLOCK_SIZE; i++)
* {
* // i = box 1
* // j = threadIdx.x = box 2
* }
*/
/* The `j` box is fixed for each thread. All `i` boxes will be required for every thread.
* We store the `i` boxes in shared memory to allow global memory coalescing.
*/
using vector_type = get_vector_type_t<T, 4>;
__shared__ vector_type group_i_boxes[BLOCK_SIZE];
/* We will precompute the sizes of `i` boxes in the code where we load them. The size computation
* is distributed across the block. Otherwise, all threads will have to compute the size of the same
* box simultaneously in the main loop. The size is computed while the memory subsystem is busy
* servicing requests for box coordinates; the compute resources would otherwise be idle in this phase.
*/
/* we store the size as a float since the size can exceed fp16 limits for unnormalized boxes */
__shared__ float group_i_size[BLOCK_SIZE];
const auto bboxes_vPtr = vector_type::get_pointer(bboxes);
// load `i` boxes and precompute their sizes
{
int i = threadIdx.x;
if (group_i_offset + i < boxes)
{
vector_type box;
v_load(box, bboxes_vPtr[group_i_offset + i]);
v_store(group_i_boxes[i], box);
BoundingBox bbox;
bbox.xmin = box.data[0];
bbox.ymin = box.data[1];
bbox.xmax = box.data[2];
bbox.ymax = box.data[3];
group_i_size[i] = compute_bbox_size<NORMALIZED_BBOX>(bbox);
}
}
__syncthreads();
/* We compute overlap between boxes and check if the IOU exceeds the nms threshold.
* We store the result (exceeds or is below nms_threshold) in a two-dimensional matrix.
* (i, j) is set to one if the overlap between i and j is within the nms threshold.
* We pack 32 results into one 32-bit integer. The effective memory layout of the
* matrix hence is (BLOCK_SIZE, BLOCK_SIZE / 32).
*/
__shared__ unsigned int mask_shared[BLOCK_SIZE * BLOCK_SIZE / 32];
// load box `j` and precompute its size (fixed per thread)
BoundingBox bbox_j;
float bbox_j_size = 0;
if (group_j_offset + threadIdx.x < boxes)
{
vector_type box;
v_load(box, bboxes_vPtr[group_j_offset + threadIdx.x]);
bbox_j.xmin = box.data[0];
bbox_j.ymin = box.data[1];
bbox_j.xmax = box.data[2];
bbox_j.ymax = box.data[3];
bbox_j_size = compute_bbox_size<NORMALIZED_BBOX>(bbox_j);
}
/* Each thread computes a predicate which is broadcasted across the warp to obtain a 32-bit mask.
* The lane zero thread of each warp saves the mask. We store the offset to the mask array beforehand
* to save cycles in the compute-intensive main loop.
*/
auto mask_offset = threadIdx.x / 32;
/* The main loop is compute intensive and causes the kernel to be overall compute-bound. Hence,
* this loop has been highly tuned. Please profile and verify carefully before making changes.
*/
/* UNROLL_SIZE is the number of boxes that must be processed per iteration. We manually unroll
* the loop since the compiler cannot effectively unroll on its own, presumably due to the presence
* of instructions forcing warp synchronization.
*/
constexpr int UNROLL_SIZE = 4;
#pragma unroll 8
for (int s = 0; s < BLOCK_SIZE; s += UNROLL_SIZE)
{
bool do_not_reject_j[UNROLL_SIZE];
#pragma unroll
for (int k = 0; k < UNROLL_SIZE; k++)
{
int i = s + k;
/* The number of boxes need not be a multiple of BLOCK_SIZE.
* However, the shared memory allocated can hold BLOCK_SIZE boxes from
* each group. Accessing the undefined regions of shared memory is
* a valid memory operation as long as the memory has been allocated.
*
* The condition below is only required when one of the groups is not
* fully filled with valid boxes. Such situations are relatively rare; it's
* more common to see both groups completely filled.
*
* We comment out this condition to improve the performance of the common case.
* This leads to a net improvement.
*/
// if (group_i_offset + i < boxes && group_j_offset + threadIdx.x < boxes)
{
BoundingBox bbox_i;
float bbox_i_size;
{
vector_type box;
v_load(box, group_i_boxes[i]);
bbox_i.xmin = box.data[0];
bbox_i.ymin = box.data[1];
bbox_i.xmax = box.data[2];
bbox_i.ymax = box.data[3];
bbox_i_size = group_i_size[i];
}
using device::min;
using device::max;
BoundingBox intersect_bbox;
intersect_bbox.xmin = max(bbox_i.xmin, bbox_j.xmin);
intersect_bbox.ymin = max(bbox_i.ymin, bbox_j.ymin);
intersect_bbox.xmax = min(bbox_i.xmax, bbox_j.xmax);
intersect_bbox.ymax = min(bbox_i.ymax, bbox_j.ymax);
float intersect_size = compute_bbox_size<NORMALIZED_BBOX>(intersect_bbox);
using device::fast_divide_ftz;
float iou = fast_divide_ftz(intersect_size, bbox_i_size + bbox_j_size - intersect_size);
do_not_reject_j[k] = iou <= nms_threshold;
}
}
#pragma unroll
for (int k = 0; k < UNROLL_SIZE; k++)
{
// FORWARD_COMPATIBILITY_TAG: WARP_SIZE_DEPENDENT_CODE
auto predicate = __ballot_sync(0xFFFFFFFF, do_not_reject_j[k]);
if (threadIdx.x % 32 == 0)
mask_shared[mask_offset] = predicate;
/* The following operation should logically be inside the previous if branch. Note that `mask_offset`
* is only used by lane zero threads. Hence, there is no harm in executing it in other threads since
* the value is unused there.
*
* Keeping it inside prevents the compiler from treating it as a constexpr addition to the address in
* successive unrolled iterations. A register is used and instructions are emitted to multiply the
* addend by four to obtain the byte offset. Pulling it out of the branch makes the compiler do constexpr
* addition on the address in successive unrolled iterations.
*/
mask_offset += BLOCK_SIZE / 32;
}
}
__syncthreads();
/* The mask data is organized as a two-dimensional bit matrix of size topK_gs * topK_gs.
* (i, j) is set to true if the overlap between `i` and `j` is within the nms threshold.
* We pack 32 results into one 32-bit integer. So the effective memory layout is topK_gs * topK_gs / 32.
*/
/* Each box `i` was compared with BLOCK_SIZE `j` boxes. This amounts to BLOCK_SIZE / 32
* 32-bit integers per box `i`.
*/
using mask_vector_type = get_vector_type_t<unsigned int, BLOCK_SIZE / 32>;
const int i = threadIdx.x;
auto mask_shared_vPtr = mask_vector_type::get_pointer(DevicePtr<unsigned>(mask_shared));
mask_vector_type temp;
v_load(temp, mask_shared_vPtr[i]);
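/* editor's note: __ballot_sync packs lane k's predicate into bit k (LSB first), while the
 * collect kernel below walks the mask MSB first via __clz and (0x80000000 >> bit); the
 * bit-reversal makes box (group_j_offset + 0) map to the most significant bit.
 */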
for (int i = 0; i < mask_vector_type::size(); i++)
temp.data[i] = __brev(temp.data[i]);
auto mask_vPtr = mask_vector_type::get_pointer(mask);
v_store(mask_vPtr[((group_i_offset + i) * topK_gs + group_j_offset) / 32 / mask_vector_type::size()], temp);
}
template <int ITEMS_PER_THREAD, int BLOCK_SIZE>
__launch_bounds__(BLOCK_SIZE)
__global__ void grid_nms_collect(Span<int> indices_, Span<int> count_, View<unsigned int> mask_, size_type num_classes, index_type background_class_id, size_type topK, size_type topK_gs_by32)
{
const index_type c = blockIdx.x;
if (c == background_class_id)
return;
const index_type b = blockIdx.y;
// topK_gs is topK rounded upwards to a multiple of GROUP_SIZE
// indices: [batch_size, num_classes, topK]
// count: [batch_size, num_classes]
// mask: [batch_size, num_classes, topK_gs, topK_gs / 32]
auto indices = indices_.data() + (b * num_classes + c) * topK;
auto count = count_.data() + (b * num_classes + c);
auto mask = mask_.data() + (b * num_classes + c) * topK_gs_by32 * 32 * topK_gs_by32;
const auto boxes = *count;
if (boxes == 0)
return;
/* We have a fixed number of threads and an arbitrary number of boxes. We use an array of
* bits to store which boxes haven't been eliminated and which are still active. We organize
* the array of bits into a matrix of bits of the shape (num_rows, BLOCK_SIZE, 32) which
* is equivalent to (num_rows, BLOCK_SIZE) where the type is a 32-bit unsigned integer.
* `num_rows` is the minimum number of rows required to cover all the boxes.
*
* Each thread handles a specific column in the matrix. To improve performance, we process
* `ITEMS_PER_THREAD` elements per thread. This changes the shape to (num_rows,
* ROW_WIDTH) where ROW_WIDTH is BLOCK_SIZE * ITEMS_PER_THREAD.
*/
constexpr int ROW_WIDTH = BLOCK_SIZE * ITEMS_PER_THREAD;
const index_type num_32b_masks = static_cast<unsigned>(boxes + 31) / 32;
const index_type num_rows = static_cast<unsigned>(num_32b_masks + ROW_WIDTH - 1) / ROW_WIDTH;
extern __shared__ unsigned int active_boxes[]; // the matrix described earlier
#pragma unroll 1
for (auto idx : block_stride_range<BLOCK_SIZE>(num_32b_masks))
active_boxes[idx] = (idx == num_32b_masks - 1) ? __brev((1u << (boxes % 32)) - 1) : 0xFFFFFFFF;
__syncthreads();
using vector_type = get_vector_type_t<unsigned int, ITEMS_PER_THREAD>;
auto mask_vPtr = vector_type::get_pointer(mask);
auto shared_vPtr = vector_type::get_pointer(DevicePtr<unsigned>(active_boxes));
int index_temp;
int thread0_count = 0;
int thread_id = threadIdx.x;
for (int step = 0; step < num_32b_masks; step++)
{
auto current_active = active_boxes[step];
while (current_active)
{
const index_type bit = __clz(current_active);
const index_type i = step * 32 + bit;
const int mask_offset = static_cast<unsigned>(i * topK_gs_by32) / ITEMS_PER_THREAD;
/* We fetch the index from the memory and store it in a register. We will not use it until
* much later. This helps avoid a long scoreboard stall.
*/
if (thread_id == 0)
index_temp = indices[i];
__syncthreads();
if (threadIdx.x == 0)
active_boxes[step] = current_active ^ (0x80000000 >> bit);
__syncthreads();
#pragma unroll 1
for (int r = 0; r < num_rows; r++)
{
const int idx = r * BLOCK_SIZE + thread_id;
if ((step & ~(ITEMS_PER_THREAD - 1)) <= idx * ITEMS_PER_THREAD && idx * ITEMS_PER_THREAD < num_32b_masks)
{
auto active_boxes_vec = shared_vPtr[idx];
auto mask_vec = mask_vPtr[mask_offset + idx];
for (int i = 0; i < vector_type::size(); i++)
active_boxes_vec.data[i] &= mask_vec.data[i];
shared_vPtr[idx] = active_boxes_vec;
}
}
__syncthreads();
if (thread_id == 0)
{
indices[thread0_count] = index_temp;
thread0_count++;
}
current_active = active_boxes[step];
}
}
if (threadIdx.x == 0)
*count = thread0_count;
}
}
constexpr int GROUP_SIZE = 128;
static std::size_t getAlignedTopK(std::size_t topK)
{
auto remainder = topK % GROUP_SIZE;
if (remainder == 0)
return topK;
return topK + (GROUP_SIZE - remainder);
}
std::size_t getGridNMSWorkspaceSizePerBatchItem(std::size_t num_classes, std::size_t classwise_topK)
{
auto topK_gs = getAlignedTopK(classwise_topK);
return num_classes * topK_gs * topK_gs / 32 * sizeof(unsigned int);
}
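/* editor's illustration: with GROUP_SIZE = 128, num_classes = 21 and classwise_topK = 400,
 * topK_gs = 512 and the per-batch-item workspace is 21 * 512 * 512 / 32 * 4 bytes = 688128 bytes (~672 KiB).
 */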
template <class T>
void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<T> bboxes, int background_class_id, bool normalized_bbox, float nms_threshold)
{
// workspace: [batch_size, num_classes, topK_gs, topK_gs / 32]
// indices: [batch_size, num_classes, topK]
// count: [batch_size, num_classes]
// bboxes: [batch_size, num_classes, topK, 4] (only first count[b][c] boxes are read)
const auto batch_size = indices.get_axis_size(0);
CV_Assert(count.get_axis_size(0) == batch_size);
CV_Assert(bboxes.get_axis_size(0) == batch_size);
const auto num_classes = indices.get_axis_size(1);
CV_Assert(count.get_axis_size(1) == num_classes);
CV_Assert(bboxes.get_axis_size(1) == num_classes);
const auto topK = indices.get_axis_size(2);
CV_Assert(bboxes.get_axis_size(2) == topK);
CV_Assert(bboxes.get_axis_size(3) == 4);
const auto topK_gs = getAlignedTopK(topK);
CV_Assert(workspace.size() >= topK_gs * topK_gs / 32);
const auto boxes = topK;
const auto num_groups = (boxes + GROUP_SIZE - 1) / GROUP_SIZE;
{
// grid = (num_groups * num_groups, num_classes, batch_size)
// if the background class is the last class, we can reduce grid y dim by one
auto grid_num_classes = num_classes; //(background_class_id == num_classes - 1) ? num_classes - 1 : num_classes;
constexpr int BLOCK_SIZE = GROUP_SIZE;
dim3 grid_size(num_groups * num_groups, grid_num_classes, batch_size);
dim3 block_size(BLOCK_SIZE);
auto policy = execution_policy(grid_size, block_size, stream);
if (normalized_bbox)
{
auto kernel = raw::grid_nms<T, true, BLOCK_SIZE>;
launch_kernel(kernel, policy, workspace, count, bboxes, num_classes, background_class_id, topK, topK_gs, nms_threshold);
}
else
{
auto kernel = raw::grid_nms<T, false, BLOCK_SIZE>;
launch_kernel(kernel, policy, workspace, count, bboxes, num_classes, background_class_id, topK, topK_gs, nms_threshold);
}
}
{
// grid = (num_classes, batch_size)
// if the background class is the last class, we can reduce grid x dim by one
auto grid_num_classes = num_classes; //(background_class_id == num_classes - 1) ? num_classes - 1 : num_classes;
constexpr int BLOCK_SIZE = 64;
constexpr int ITEMS_PER_THREAD = 4;
auto kernel = raw::grid_nms_collect<ITEMS_PER_THREAD, BLOCK_SIZE>;
dim3 grid_size(grid_num_classes, batch_size);
auto sharedMem = topK_gs / 32 * 4;
auto policy = execution_policy(grid_size, BLOCK_SIZE, sharedMem, stream);
launch_kernel(kernel, policy, indices, count, workspace, num_classes, background_class_id, topK, topK_gs / 32);
}
}
std::size_t getGridNMSWorkspaceSizePerBatchItem(std::size_t num_classes, std::size_t classwise_topK);
template void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<__half> bboxes, int, bool normalized_bbox, float nms_threshold);
template void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<float> bboxes, int, bool normalized_bbox, float nms_threshold);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,68 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP
#define OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP
#include "types.hpp"
#include "index_helpers.hpp"
#include <cuda_runtime.h>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
template <int dim, class index_type = device::index_type, class size_type = device::size_type>
class grid_stride_range_generic {
public:
__device__ grid_stride_range_generic(index_type to_) : from(0), to(to_) { }
__device__ grid_stride_range_generic(index_type from_, index_type to_) : from(from_), to(to_) { }
class iterator
{
public:
__device__ iterator(index_type pos_) : pos(pos_) {}
/* these iterators return the index when dereferenced; this allows us to loop
* through the indices using a range-based for loop
*/
__device__ index_type operator*() const { return pos; }
__device__ iterator& operator++() {
pos += getGridDim<dim>() * static_cast<index_type>(getBlockDim<dim>());
return *this;
}
__device__ bool operator!=(const iterator& other) const {
/* NOTE HACK
* 'pos' can move in large steps (see operator++)
* expansion of the range-based for loop uses != as the loop condition
* => operator!= must return false if 'pos' crosses the end
*/
return pos < other.pos;
}
private:
index_type pos;
};
__device__ iterator begin() const {
return iterator(from + getBlockDim<dim>() * getBlockIdx<dim>() + getThreadIdx<dim>());
}
__device__ iterator end() const {
return iterator(to);
}
private:
index_type from, to;
};
using grid_stride_range_x = grid_stride_range_generic<0>;
using grid_stride_range_y = grid_stride_range_generic<1>;
using grid_stride_range_z = grid_stride_range_generic<2>;
using grid_stride_range = grid_stride_range_x;
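/* Illustrative usage (not part of the original header): a grid-stride loop over `n` elements
 * typically looks like
 *
 *     __global__ void scale(float* data, float alpha, device::size_type n) {
 *         for (auto i : grid_stride_range(n))   // starts at the global thread index,
 *             data[i] *= alpha;                 // advances by gridDim.x * blockDim.x
 *     }
 *
 * so a launch with fewer threads than `n` still covers every index exactly once.
 */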
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
#endif /* OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP */

View File

@@ -0,0 +1,41 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_INDEX_HELPERS_HPP
#define OPENCV_DNN_SRC_CUDA_INDEX_HELPERS_HPP
#include "types.hpp"
#include <cuda_runtime.h>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
namespace detail {
using dim3_member_type = decltype(dim3::x);
using uint3_member_type = decltype(uint3::x);
}
template <int> __device__ detail::dim3_member_type getGridDim();
template <> inline __device__ detail::dim3_member_type getGridDim<0>() { return gridDim.x; }
template <> inline __device__ detail::dim3_member_type getGridDim<1>() { return gridDim.y; }
template <> inline __device__ detail::dim3_member_type getGridDim<2>() { return gridDim.z; }
template <int> __device__ detail::dim3_member_type getBlockDim();
template <> inline __device__ detail::dim3_member_type getBlockDim<0>() { return blockDim.x; }
template <> inline __device__ detail::dim3_member_type getBlockDim<1>() { return blockDim.y; }
template <> inline __device__ detail::dim3_member_type getBlockDim<2>() { return blockDim.z; }
template <int> __device__ detail::uint3_member_type getBlockIdx();
template <> inline __device__ detail::uint3_member_type getBlockIdx<0>() { return blockIdx.x; }
template <> inline __device__ detail::uint3_member_type getBlockIdx<1>() { return blockIdx.y; }
template <> inline __device__ detail::uint3_member_type getBlockIdx<2>() { return blockIdx.z; }
template <int> __device__ detail::uint3_member_type getThreadIdx();
template <> inline __device__ detail::uint3_member_type getThreadIdx<0>() { return threadIdx.x; }
template <> inline __device__ detail::uint3_member_type getThreadIdx<1>() { return threadIdx.y; }
template <> inline __device__ detail::uint3_member_type getThreadIdx<2>() { return threadIdx.z; }
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
#endif /* OPENCV_DNN_SRC_CUDA_INDEX_HELPERS_HPP */

View File

@@ -0,0 +1,94 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP
#define OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP
#include <cstddef>
#include <type_traits>
/* The performance of many kernels is highly dependent on the tensor rank. Instead of having
* one kernel which can work with the maximally ranked tensors, we make one kernel for each supported
* tensor rank. This is to ensure that the requirements of the maximally ranked tensors do not take a
* toll on the performance of the operation for low ranked tensors. Hence, many kernels take the tensor
* rank as a template parameter.
*
* The kernel is a template and we have different instantiations for each rank. This causes the following pattern
* to arise frequently:
*
* if(rank == 3)
* kernel<T, 3>();
* else if(rank == 2)
* kernel<T, 2>();
* else
* kernel<T, 1>();
*
* The rank is a runtime variable. To facilitate creation of such structures, we use GENERATE_KERNEL_DISPATCHER.
* This macro creates a function which selects the correct kernel instantiation at runtime.
*
* Example:
*
* // function which setups the kernel and launches it
* template <class T, std::size_t Rank>
* void launch_some_kernel(...);
*
* // creates the dispatcher named "some_dispatcher" which invokes the correct instantiation of "launch_some_kernel"
* GENERATE_KERNEL_DISPATCHER(some_dispatcher, launch_some_kernel);
*
* // internal API function
* template <class T>
* void some(...) {
* // ...
* auto rank = input.rank();
* some_dispatcher<T, MIN_RANK, MAX_RANK>(rank, ...);
* }
*/
/*
* name name of the dispatcher function that is generated
* func template function that requires runtime selection
*
* T first template parameter to `func`
* start starting rank
* end ending rank (inclusive)
*
* Executes func<T, selector> based on the runtime `selector` argument, provided that `selector` lies
* within the range [start, end]. If it falls outside the range, no instantiation of `func` is executed.
*/
#define GENERATE_KERNEL_DISPATCHER(name,func); \
template <class T, std::size_t start, std::size_t end, class... Args> static \
typename std::enable_if<start == end, void> \
::type name(int selector, Args&& ...args) { \
if(selector == start) \
func<T, start>(std::forward<Args>(args)...); \
} \
\
template <class T, std::size_t start, std::size_t end, class... Args> static \
typename std::enable_if<start != end, void> \
::type name(int selector, Args&& ...args) { \
if(selector == start) \
func<T, start>(std::forward<Args>(args)...); \
else \
name<T, start + 1, end, Args...>(selector, std::forward<Args>(args)...); \
}
// Same as GENERATE_KERNEL_DISPATCHER but takes two class template parameters TP1 and TP2 instead of just T
#define GENERATE_KERNEL_DISPATCHER_2TP(name,func); \
template <class TP1, class TP2, std::size_t start, std::size_t end, class... Args> static \
typename std::enable_if<start == end, void> \
::type name(int selector, Args&& ...args) { \
if(selector == start) \
func<TP1, TP2, start>(std::forward<Args>(args)...); \
} \
\
template <class TP1, class TP2, std::size_t start, std::size_t end, class... Args> static \
typename std::enable_if<start != end, void> \
::type name(int selector, Args&& ...args) { \
if(selector == start) \
func<TP1, TP2, start>(std::forward<Args>(args)...); \
else \
name<TP1, TP2, start + 1, end, Args...>(selector, std::forward<Args>(args)...); \
}
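/* Illustrative usage (not in the original header), mirroring the example in the comment above:
 * given
 *
 *     template <class T, std::size_t Rank> void launch_some_kernel(int arg);
 *     GENERATE_KERNEL_DISPATCHER(some_dispatcher, launch_some_kernel);
 *
 * a call such as `some_dispatcher<float, 1, 4>(rank, arg)` recursively peels off one rank per
 * instantiation and finally invokes `launch_some_kernel<float, rank>(arg)` when `rank` lies in
 * [1, 4]; an out-of-range selector falls through without launching anything.
 */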
#endif /* OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP */

View File

@@ -0,0 +1,36 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_LIMITS_HPP
#define OPENCV_DNN_SRC_CUDA_LIMITS_HPP
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cfloat>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
template <class T>
struct numeric_limits;
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <>
struct numeric_limits<__half> {
__device__ static __half min() { return 0.0000610; }
__device__ static __half max() { return 65504.0; }
__device__ static __half lowest() { return -65504.0; }
};
#endif
template <>
struct numeric_limits<float> {
__device__ static float min() { return FLT_MIN; }
__device__ static float max() { return FLT_MAX; }
__device__ static float lowest() { return -FLT_MAX; }
};
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
#endif /* OPENCV_DNN_SRC_CUDA_LIMITS_HPP */

View File

@@ -0,0 +1,154 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_MATH_HPP
#define OPENCV_DNN_SRC_CUDA_MATH_HPP
#include <cuda_runtime.h>
#include <cuda_fp16.h>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
template <class T> __device__ T abs(T val) { return (val < T(0) ? -val : val); }
template <> inline __device__ float abs(float val) { return fabsf(val); }
template <> inline __device__ double abs(double val) { return fabs(val); }
template <class T> __device__ T exp(T val);
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half exp(__half val) { return hexp(val); }
#endif
template <> inline __device__ float exp(float val) { return expf(val); }
template <> inline __device__ double exp(double val) { return ::exp(val); }
template <class T> __device__ T expm1(T val);
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half expm1(__half val) { return hexp(val) - __half(1); }
#endif
template <> inline __device__ float expm1(float val) { return expm1f(val); }
template <> inline __device__ double expm1(double val) { return ::expm1(val); }
template <class T> __device__ T max(T x, T y) { return (x > y ? x : y); }
template <> inline __device__ float max(float x, float y) { return fmaxf(x, y); }
template <> inline __device__ double max(double x, double y) { return fmax(x, y); }
template <class T> __device__ T min(T x, T y) { return (x > y ? y : x); }
template <> inline __device__ float min(float x, float y) { return fminf(x, y); }
template <> inline __device__ double min(double x, double y) { return fmin(x, y); }
template <class T> __device__ T log1p(T val);
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half log1p(__half val) { return hlog(__half(1) + val); }
#endif
template <> inline __device__ float log1p(float val) { return log1pf(val); }
template <class T> __device__ T log1pexp(T val);
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half log1pexp(__half val) {
if (val <= __half(-4.0))
return exp(val);
else if (val <= __half(8.0))
return log1p(exp(val));
else if (val <= __half(8.7))
return val + exp(-val);
else
return val;
}
#endif
template <> inline __device__ float log1pexp(float val) {
if (val <= -20)
return expf(val);
else if (val <= 9.0)
return log1pf(expf(val));
else if (val <= 14.6)
return val + exp(-val);
else
return val;
}
template <> inline __device__ double log1pexp(double val) {
if (val <= -37)
return exp(val);
else if (val <= 18)
return log1p(exp(val));
else if (val <= 33.3)
return val + exp(-val);
else
return val;
}
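/* Explanatory note (not in the original): the piecewise thresholds above implement a numerically
 * stable softplus. For very negative v, log1p(exp(v)) ~= exp(v); for moderate v the direct
 * log1p(exp(v)) is accurate; for large v, log(1 + e^v) = v + log(1 + e^-v) ~= v + e^-v; and
 * beyond that e^-v underflows, so the result is simply v.
 */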
template <class T> __device__ T tanh(T val);
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half tanh(__half val) { return tanhf(val); }
#endif
template <> inline __device__ float tanh(float val) { return tanhf(val); }
template <> inline __device__ double tanh(double val) { return ::tanh(val); }
template <class T> __device__ T pow(T val, T exp);
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half pow(__half val, __half exp) { return powf(val, exp); }
#endif
template <> inline __device__ float pow(float val, float exp) { return powf(val, exp); }
template <> inline __device__ double pow(double val, double exp) { return ::pow(val, exp); }
template <class T> __device__ T sqrt(T val);
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half sqrt(__half val) { return hsqrt(val); }
#endif
template <> inline __device__ float sqrt(float val) { return sqrtf(val); }
template <> inline __device__ double sqrt(double val) { return ::sqrt(val); }
template <class T> __device__ T rsqrt(T val);
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half rsqrt(__half val) { return hrsqrt(val); }
#endif
template <> inline __device__ float rsqrt(float val) { return rsqrtf(val); }
template <> inline __device__ double rsqrt(double val) { return ::rsqrt(val); }
template <class T> __device__ T sigmoid(T val) { return T(1) / (T(1) + exp(-val)); }
template <class T> __device__ T clamp(T value, T lower, T upper) { return min(max(value, lower), upper); }
template <class T> __device__ long lround(T value);
template <> inline __device__ long lround(double value) { return ::lround(value); }
template <> inline __device__ long lround(float value) { return lroundf(value); }
template <class T> __device__ T round(T value);
template <> inline __device__ double round(double value) { return ::round(value); }
template <> inline __device__ float round(float value) { return roundf(value); }
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half round(__half value) { return hrint(value); }
#endif
template <class T> __device__ T ceil(T value);
template <> inline __device__ double ceil(double value) { return ::ceil(value); }
template <> inline __device__ float ceil(float value) { return ceilf(value); }
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half ceil(__half value) { return hceil(value); }
#endif
template <class T> __device__ T mul_ftz(T x, T y) { return x * y; }
template <> inline __device__ float mul_ftz(float x, float y) {
float result;
asm("mul.ftz.f32 %0, %1, %2;" : "=f"(result) : "f"(x), "f"(y));
return result;
}
template <class T> __device__ T fast_divide(T x, T y) { return x / y; }
template <> inline __device__ float fast_divide(float x, float y) { return __fdividef(x, y); }
template <class T> __device__ T fast_divide_ftz(T x, T y) { return fast_divide(x, y); }
template <> inline __device__ float fast_divide_ftz(float x, float y) {
float result;
asm("div.approx.ftz.f32 %0, %1, %2;" : "=f"(result) : "f"(x), "f"(y));
return result;
}
template <class T> __device__ T fast_exp(T value) { return exp(value); }
template <> inline __device__ float fast_exp(float value) { return __expf(value); }
template <class T> __device__ T fast_sigmoid(T value) { return sigmoid(value); }
template <> inline __device__ float fast_sigmoid(float value) { return __fdividef(1, 1 + __expf(-value)); }
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
#endif /* OPENCV_DNN_SRC_CUDA_MATH_HPP */

View File

@@ -0,0 +1,328 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "math.hpp"
#include "array.hpp"
#include "limits.hpp"
#include "types.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/kernels/fill_copy.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
#include <vector>
#include <type_traits>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t Order,
typename std::enable_if<Order == 1 || Order == 2 || Order == 3, bool>::type = true> /* Order has been hardcoded; see code */
__global__ void max_pooling_with_indices(
Span<T> output, Span<T> indices, View<T> input, size_type channels,
array<size_type, Order> out_spatial_dims, array<size_type, Order> in_spatial_dims,
array<size_type, Order> window_size, array<size_type, Order> strides, array<size_type, Order> padding_left)
{
/* every element in the output is mapped to a window in the input and each thread processes several windows */
for (auto idx : grid_stride_range(output.size())) {
size_type out_spatial_size = 1;
array<index_type, Order> window_idx;
for (int i = Order - 1; i >= 0; i--) {
window_idx[i] = (idx / out_spatial_size) % out_spatial_dims[i];
out_spatial_size *= out_spatial_dims[i];
}
const index_type n = idx / (out_spatial_size * channels);
const index_type c = (idx / out_spatial_size) % channels;
array<index_type, Order> start;
for(int i = 0; i < Order; i++)
start[i] = window_idx[i] * strides[i] - padding_left[i];
array<index_type, Order> end;
for (int i = 0; i < Order; i++) {
using device::min;
end[i] = min<index_type>(start[i] + window_size[i], in_spatial_dims[i]);
}
for (int i = 0; i < Order; i++) {
using device::max;
start[i] = max(start[i], 0);
}
T max_value = numeric_limits<T>::lowest();
index_type max_idx = -1;
size_type in_spatial_size = 1;
for (int i = 0; i < Order; i++)
in_spatial_size *= in_spatial_dims[i];
const auto outer_offset = (n * channels + c) * in_spatial_size;
if (Order == 1) {
array<index_type, Order> idx;
for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
index_type offset = 0;
index_type stride = 1;
for (int i = Order - 1; i >= 0; i--) {
offset += stride * idx[i];
stride *= in_spatial_dims[i];
}
if (input[outer_offset + offset] > max_value) {
max_idx = offset;
max_value = input[outer_offset + offset];
}
}
} else if (Order == 2) {
array<index_type, Order> idx;
for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
for (idx[1] = start[1]; idx[1] != end[1]; idx[1]++) {
index_type offset = 0;
index_type stride = 1;
for (int i = Order - 1; i >= 0; i--) {
offset += stride * idx[i];
stride *= in_spatial_dims[i];
}
if (input[outer_offset + offset] > max_value) {
max_idx = offset;
max_value = input[outer_offset + offset];
}
}
}
} else if(Order == 3) {
array<index_type, Order> idx;
for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
for (idx[1] = start[1]; idx[1] != end[1]; idx[1]++) {
for (idx[2] = start[2]; idx[2] != end[2]; idx[2]++) {
index_type offset = 0;
index_type stride = 1;
for (int i = Order - 1; i >= 0; i--) {
offset += stride * idx[i];
stride *= in_spatial_dims[i];
}
if (input[outer_offset + offset] > max_value) {
max_idx = offset;
max_value = input[outer_offset + offset];
}
}
}
}
}
output[idx] = max_value;
indices[idx] = max_idx;
}
}
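/* Note (not in the original): `max_idx` stored above is the flattened spatial offset within the
 * (n, c) slice, not a global element index; max_unpooling below adds the same
 * (n * channels + c) * spatial_size base offset before scattering, so the pooling and unpooling
 * kernels agree on the index convention.
 */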
template <class T, std::size_t Order>
__global__ void max_unpooling(
Span<T> output, View<T> input, View<T> indices, size_type channels,
array<size_type, Order> out_spatial_dims, array<size_type, Order> in_spatial_dims,
array<size_type, Order> window_size, array<size_type, Order> strides, array<size_type, Order> padding_left)
{
/* the output has already been zero filled */
/* Every input value represents a window in the output. The max unpooling operation
* copies the input value to exactly one location in the output window which is given
* by the indices tensor.
*/
for (auto idx : grid_stride_range(input.size())) {
size_type in_spatial_size = 1;
array<index_type, Order> window_idx;
for (int i = Order - 1; i >= 0; i--) {
window_idx[i] = (idx / in_spatial_size) % in_spatial_dims[i];
in_spatial_size *= in_spatial_dims[i];
}
const index_type n = idx / (in_spatial_size * channels);
const index_type c = (idx / in_spatial_size) % channels;
array<index_type, Order> start;
for (int i = 0; i < Order; i++) {
using device::min;
using device::max;
start[i] = max(0, min(window_idx[i] * strides[i] - padding_left[i], out_spatial_dims[i] - 1));
}
size_type out_spatial_size = 1;
for (int i = 0; i < Order; i++)
out_spatial_size *= out_spatial_dims[i];
index_type outer_offset = (n * channels + c) * out_spatial_size;
output[outer_offset + static_cast<index_type>(indices[idx])] = input[idx];
}
}
}
template <class T, std::size_t Order> static
void launch_max_pooling_kernel(
const Stream& stream,
Span<T> output, Span<T> indices, View<T> input, std::size_t channels,
const std::vector<std::size_t>& out_spatial_dims, const std::vector<std::size_t>& in_spatial_dims,
const std::vector<std::size_t>& window_size,
const std::vector<std::size_t>& strides, const std::vector<std::size_t>& padding_left)
{
CV_Assert(indices.size() == output.size());
CV_Assert(out_spatial_dims.size() == Order);
CV_Assert(in_spatial_dims.size() == Order);
CV_Assert(window_size.size() == Order);
CV_Assert(strides.size() == Order);
CV_Assert(padding_left.size() == Order);
array<size_type, Order> out_spatial_dims_k, in_spatial_dims_k;
out_spatial_dims_k.assign(std::begin(out_spatial_dims), std::end(out_spatial_dims));
in_spatial_dims_k.assign(std::begin(in_spatial_dims), std::end(in_spatial_dims));
array<size_type, Order> window_size_k, strides_k, padding_left_k;
window_size_k.assign(std::begin(window_size), std::end(window_size));
strides_k.assign(std::begin(strides), std::end(strides));
padding_left_k.assign(std::begin(padding_left), std::end(padding_left));
auto kernel = raw::max_pooling_with_indices<T, Order>;
auto policy = make_policy(kernel, output.size(), 0, stream);
launch_kernel(kernel, policy, output, indices, input, channels,
out_spatial_dims_k, in_spatial_dims_k, window_size_k, strides_k, padding_left_k);
}
template <class T>
void max_pooling_with_indices(
const Stream& stream,
TensorSpan<T> output, TensorSpan<T> indices, TensorView<T> input,
const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
const std::vector<std::size_t>& padding_left)
{
CV_Assert(is_shape_same(output, indices));
CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
auto order = window_size.size();
CV_Assert(strides.size() == order);
CV_Assert(padding_left.size() == order);
CV_Assert(output.rank() == order + 2);
CV_Assert(input.rank() == order + 2);
std::vector<std::size_t> out_spatial_dims(order), in_spatial_dims(order);
for (int i = 0; i < order; i++) {
in_spatial_dims[i] = input.get_axis_size(2 + i);
out_spatial_dims[i] = output.get_axis_size(2 + i);
}
CV_Assert(1 <= order && order <= 3);
std::size_t channels = input.get_axis_size(1);
if (order == 3) {
launch_max_pooling_kernel<T, 3>(stream, output, indices, input, channels,
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
} else if (order == 2) {
launch_max_pooling_kernel<T, 2>(stream, output, indices, input, channels,
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
} else if (order == 1) {
launch_max_pooling_kernel<T, 1>(stream, output, indices, input, channels,
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void max_pooling_with_indices(const Stream&,
TensorSpan<__half>, TensorSpan<__half>, TensorView<__half>,
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
const std::vector<std::size_t>&);
#endif
template void max_pooling_with_indices(const Stream&,
TensorSpan<float>, TensorSpan<float>, TensorView<float>,
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
const std::vector<std::size_t>&);
template <class T, std::size_t Order> static
void launch_max_unpooling_kernel(
const Stream& stream,
Span<T> output, View<T> input, View<T> indices, std::size_t channels,
const std::vector<std::size_t>& out_spatial_dims, const std::vector<std::size_t>& in_spatial_dims,
const std::vector<std::size_t>& window_size,
const std::vector<std::size_t>& strides, const std::vector<std::size_t>& padding_left)
{
CV_Assert(out_spatial_dims.size() == Order);
CV_Assert(in_spatial_dims.size() == Order);
CV_Assert(window_size.size() == Order);
CV_Assert(strides.size() == Order);
CV_Assert(padding_left.size() == Order);
CV_Assert(indices.size() == input.size());
array<size_type, Order> out_spatial_dims_k, in_spatial_dims_k;
out_spatial_dims_k.assign(std::begin(out_spatial_dims), std::end(out_spatial_dims));
in_spatial_dims_k.assign(std::begin(in_spatial_dims), std::end(in_spatial_dims));
array<size_type, Order> window_size_k, strides_k, padding_left_k;
window_size_k.assign(std::begin(window_size), std::end(window_size));
strides_k.assign(std::begin(strides), std::end(strides));
padding_left_k.assign(std::begin(padding_left), std::end(padding_left));
auto kernel = raw::max_unpooling<T, Order>;
auto policy = make_policy(kernel, input.size(), 0, stream);
launch_kernel(kernel, policy, output, input, indices, channels,
out_spatial_dims_k, in_spatial_dims_k, window_size_k, strides_k, padding_left_k);
}
template <class T>
void max_unpooling(
const Stream& stream,
TensorSpan<T> output, TensorView<T> input, TensorView<T> indices,
const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
const std::vector<std::size_t>& padding_left)
{
CV_Assert(is_shape_same(input, indices));
CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
auto order = window_size.size();
CV_Assert(strides.size() == order);
CV_Assert(padding_left.size() == order);
CV_Assert(output.rank() == order + 2);
CV_Assert(input.rank() == order + 2);
std::vector<std::size_t> out_spatial_dims(order), in_spatial_dims(order);
for (int i = 0; i < order; i++) {
in_spatial_dims[i] = input.get_axis_size(2 + i);
out_spatial_dims[i] = output.get_axis_size(2 + i);
}
kernels::fill<T>(stream, output, 0.0);
/* only max_unpooling2d and max_unpooling3d are supported */
CV_Assert(2 <= order && order <= 3);
std::size_t channels = input.get_axis_size(1);
if (order == 3) {
launch_max_unpooling_kernel<T, 3>(stream, output, input, indices, channels,
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
} else if (order == 2) {
launch_max_unpooling_kernel<T, 2>(stream, output, input, indices, channels,
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void max_unpooling(const Stream&,
TensorSpan<__half>, TensorView<__half>, TensorView<__half>,
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
const std::vector<std::size_t>&);
#endif
template void max_unpooling(const Stream&,
TensorSpan<float>, TensorView<float>, TensorView<float>,
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
const std::vector<std::size_t>&);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,32 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_MEMORY_HPP
#define OPENCV_DNN_SRC_CUDA_MEMORY_HPP
#include <cuda_runtime.h>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
template <class T>
__device__ T load_ldg(const T& src) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
return __ldg(&src);
#else
return src;
#endif
}
template <class T>
__device__ T load_ldg(const T* src) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
return __ldg(src);
#else
return *src;
#endif
}
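/* Illustrative usage (not in the original header; `bias_ptr` is a hypothetical name): wrapping a
 * read-only global load, e.g. `T bias = load_ldg(bias_ptr[c]);`, routes the read through the
 * read-only data cache (__ldg) on sm_35+ devices and degrades to a plain load elsewhere; the
 * caller must guarantee the memory is not written for the lifetime of the kernel.
 */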
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
#endif /* OPENCV_DNN_SRC_CUDA_MEMORY_HPP */

View File

@@ -0,0 +1,145 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "math.hpp"
#include "types.hpp"
#include "atomics.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T>
__global__ void reduce_mean(Span<float> means, View<T> input, size_type inner_size) {
for (auto idx : grid_stride_range(input.size())) {
const index_type outer_idx = idx / inner_size;
atomicAdd(&means[outer_idx], static_cast<float>(input[idx]) / inner_size);
}
}
template <class T>
__global__ void reduce_mean_sqr_sum(Span<float> means, Span<float> sum_sqrs, View<T> input, size_type inner_size) {
for (auto idx : grid_stride_range(input.size())) {
const index_type outer_idx = idx / inner_size;
auto x = static_cast<float>(input[idx]);
atomicAdd(&means[outer_idx], x / inner_size);
atomicAdd(&sum_sqrs[outer_idx], x * x);
}
}
__global__ void compute_normalization_scale(Span<float> scale, View<float> means, View<float> sums_sqr, size_type inner_size, float eps) {
for (auto idx : grid_stride_range(scale.size())) {
auto mean = means[idx];
auto var = sums_sqr[idx] / inner_size - mean * mean;
using device::rsqrt;
scale[idx] = rsqrt(eps + var);
}
}
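/* Explanatory note (not in the original): the kernel above uses the identity
 * Var[x] = E[x^2] - (E[x])^2, so `sums_sqr[idx] / inner_size - mean * mean` is the biased
 * variance of the slice and `scale` becomes 1 / sqrt(var + eps), the factor applied later in
 * normalize_mean_variance.
 */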
template <class T>
__global__ void normalize_mean(Span<T> output, View<T> input, View<float> means, size_type inner_size) {
for (auto idx : grid_stride_range(output.size())) {
const index_type outer_idx = idx / inner_size;
output[idx] = static_cast<float>(input[idx]) - means[outer_idx];
}
}
template <class T>
__global__ void normalize_mean_variance(Span<T> output, View<T> input, View<float> means, View<float> scale, size_type inner_size) {
for (auto idx : grid_stride_range(output.size())) {
const index_type outer_idx = idx / inner_size;
output[idx] = (static_cast<float>(input[idx]) - means[outer_idx]) * scale[outer_idx];
}
}
}
template <class T>
void reduce_mean(const Stream& stream, Span<float> means, View<T> input, std::size_t inner_size)
{
CV_Assert(input.size() / inner_size == means.size());
auto kernel = raw::reduce_mean<T>;
auto policy = make_policy(kernel, input.size(), 0, stream);
launch_kernel(kernel, policy, means, input, inner_size);
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void reduce_mean(const Stream&, Span<float>, View<__half>, std::size_t);
#endif
template void reduce_mean(const Stream&, Span<float>, View<float>, std::size_t);
template <class T>
void reduce_mean_sqr_sum(const Stream& stream, Span<float> means, Span<float> sum_sqrs, View<T> input, std::size_t inner_size)
{
CV_Assert(input.size() / inner_size == means.size());
CV_Assert(input.size() / inner_size == sum_sqrs.size());
auto kernel = raw::reduce_mean_sqr_sum<T>;
auto policy = make_policy(kernel, input.size(), 0, stream);
launch_kernel(kernel, policy, means, sum_sqrs, input, inner_size);
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void reduce_mean_sqr_sum(const Stream&, Span<float>, Span<float>, View<__half>, std::size_t);
#endif
template void reduce_mean_sqr_sum(const Stream&, Span<float>, Span<float>, View<float>, std::size_t);
void compute_normalization_scale(const Stream& stream, Span<float> scale, View<float> means, View<float> sum_sqrs, std::size_t inner_size, float eps)
{
CV_Assert(scale.size() == means.size());
CV_Assert(scale.size() == sum_sqrs.size());
auto kernel = raw::compute_normalization_scale;
auto policy = make_policy(kernel, scale.size(), 0, stream);
launch_kernel(kernel, policy, scale, means, sum_sqrs, inner_size, eps);
}
template <class T>
void normalize_mean(const Stream& stream, Span<T> output, View<T> input, View<float> means, std::size_t inner_size)
{
CV_Assert(output.size() == input.size());
CV_Assert(input.size() / inner_size == means.size());
auto kernel = raw::normalize_mean<T>;
auto policy = make_policy(kernel, output.size(), 0, stream);
launch_kernel(kernel, policy, output, input, means, inner_size);
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void normalize_mean(const Stream&, Span<__half>, View<__half>, View<float>, std::size_t);
#endif
template void normalize_mean(const Stream&, Span<float>, View<float>, View<float>, std::size_t);
template <class T>
void normalize_mean_variance(const Stream& stream, Span<T> output, View<T> input, View<float> means, View<float> scale, std::size_t inner_size)
{
CV_Assert(input.size() == output.size());
CV_Assert(input.size() / inner_size == means.size());
CV_Assert(input.size() / inner_size == scale.size());
auto kernel = raw::normalize_mean_variance<T>;
auto policy = make_policy(kernel, output.size(), 0, stream);
launch_kernel(kernel, policy, output, input, means, scale, inner_size);
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void normalize_mean_variance(const Stream&, Span<__half>, View<__half>, View<float>, View<float>, std::size_t);
#endif
template void normalize_mean_variance(const Stream&, Span<float>, View<float>, View<float>, View<float>, std::size_t);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,123 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "array.hpp"
#include "math.hpp"
#include "types.hpp"
#include "atomics.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/kernels/fill_copy.hpp"
#include "../cuda4dnn/kernels/scale_shift.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T>
__global__ void reduce_sum_abs(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride) {
for (auto idx : grid_stride_range(input.size())) {
const index_type outer_idx = idx / outer_stride;
const index_type inner_idx = idx % mid_stride;
const index_type sum_idx = outer_idx * mid_stride + inner_idx;
atomicAdd(&output[sum_idx], device::abs(input[idx]));
}
}
template <class T>
__global__ void reciprocal(Span<T> output, T epsilon) {
for (auto idx : grid_stride_range(output.size()))
output[idx] = T(1) / (output[idx] + epsilon);
}
template <class T>
__global__ void reduce_sum_squared(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride) {
for (auto idx : grid_stride_range(input.size())) {
const index_type outer_idx = idx / outer_stride;
const index_type inner_idx = idx % mid_stride;
const index_type sum_idx = outer_idx * mid_stride + inner_idx;
atomicAdd(&output[sum_idx], input[idx] * input[idx]);
}
}
template <class T>
__global__ void rsqrt(Span<T> output, T epsilon) {
for (auto idx : grid_stride_range(output.size())) {
using device::sqrt;
output[idx] = T(1) / sqrt(output[idx] + epsilon);
}
}
template <class T>
__global__ void apply_norm(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride, View<T> sums) {
for (auto idx : grid_stride_range(output.size())) {
const index_type outer_idx = idx / outer_stride;
const index_type inner_idx = idx % mid_stride;
const index_type sum_idx = outer_idx * mid_stride + inner_idx;
output[idx] = input[idx] * sums[sum_idx];
}
}
}
template <class T>
void normalize(
const Stream& stream,
Span<T> output,
View<T> input, std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, std::size_t norm, T epsilon,
Span<T> workspace)
{
CV_Assert(output.size() == input.size());
CV_Assert(output.size() == outer_size * mid_size * inner_size);
CV_Assert(norm == 1 || norm == 2);
CV_Assert(workspace.size() >= outer_size * inner_size);
auto sums = Span<T>(workspace.data(), outer_size * inner_size);
fill<T>(stream, sums, 0.0);
if (norm == 1) {
auto reduce_kernel = raw::reduce_sum_abs<T>;
auto policy = make_policy(reduce_kernel, input.size(), 0, stream);
launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size);
auto reciprocal_kernel = raw::reciprocal<T>;
policy = make_policy(reciprocal_kernel, sums.size(), 0, stream);
launch_kernel(reciprocal_kernel, policy, sums, epsilon);
} else {
auto reduce_kernel = raw::reduce_sum_squared<T>;
auto policy = make_policy(reduce_kernel, input.size(), 0, stream);
launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size);
auto rsqrt_kernel = raw::rsqrt<T>;
policy = make_policy(rsqrt_kernel, sums.size(), 0, stream);
launch_kernel(rsqrt_kernel, policy, sums, epsilon);
}
auto scale_kernel = raw::apply_norm<T>;
auto policy = make_policy(scale_kernel, output.size(), 0, stream);
launch_kernel(scale_kernel, policy, output, input, mid_size * inner_size, inner_size, sums);
}
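/* Explanatory sketch (not part of the original file): for norm == 2 the three launches above
 * compute sums[o, i] = sum over m of input[o, m, i]^2, then sums = 1 / sqrt(sums + eps), and
 * finally output[o, m, i] = input[o, m, i] * sums[o, i], i.e. an L2 normalization over the
 * middle axis; norm == 1 does the same with absolute values and a plain reciprocal.
 */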
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void normalize(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t, std::size_t, std::size_t, __half, Span<__half>);
#endif
template void normalize(const Stream&, Span<float>, View<float>, std::size_t, std::size_t, std::size_t, std::size_t, float, Span<float>);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,201 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "array.hpp"
#include "math.hpp"
#include "types.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "kernel_dispatcher.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
#include <vector>
#include <utility>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t Rank>
__global__ void copy_with_reflection101(
Span<T> output, array<size_type, Rank> out_strides, array<index_type, Rank> start, array<index_type, Rank> end,
View<T> input, array<size_type, Rank> in_strides)
{
for (auto i : grid_stride_range(output.size())) {
/* compute output axis indices corresponding to element 'i' */
array<index_type, Rank> out_index;
out_index[0] = i / out_strides[0];
for (int j = 1; j < Rank; j++)
out_index[j] = (i % out_strides[j - 1]) / out_strides[j];
/* compute input axis indices corresponding to output axis indices */
array<index_type, Rank> in_index;
for (int j = 0; j < Rank; j++) {
/* if out_index < start, the point is in the left reflection region
* the reflected value's index is the absolute value of the difference
*
* otherwise, if the value is in the copy region, out_index - start gives the input index
*/
using device::abs;
in_index[j] = abs(out_index[j] - start[j]);
/* if out_index >= end, it's in the right reflection region */
if (out_index[j] >= end[j])
in_index[j] = (end[j] - start[j]) - (out_index[j] - end[j]) - 2;
}
/* compute input element number from input axis indices */
index_type iidx = 0;
for (int j = 0; j < Rank; j++)
iidx += in_index[j] * in_strides[j];
output[i] = input[iidx];
}
}
}
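/* Worked example (not in the original): with start = 2 and end = 7 along one axis, out_index
 * 0..1 reflects to in_index 2..1 via |out - start|, out_index 2..6 maps to 0..4, and out_index
 * 7..8 reflects to 3..2 via (end - start) - (out - end) - 2, which is exactly the
 * BORDER_REFLECT_101 pattern ("gfedcb|abcdefgh|gfedcba").
 */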
template <class T, std::size_t Rank> static
void launch_copy_with_reflection101(
const Stream& stream,
Span<T> output, const std::vector<std::size_t>& outStride,
View<T> input, const std::vector<std::size_t>& inStride,
const std::vector<std::pair<std::size_t, std::size_t>>& ranges)
{
CV_Assert(outStride.size() == Rank);
CV_Assert(inStride.size() == Rank);
CV_Assert(ranges.size() == Rank);
array<size_type, Rank> outStride_k, inStride_k;
outStride_k.assign(std::begin(outStride), std::end(outStride));
inStride_k.assign(std::begin(inStride), std::end(inStride));
array<index_type, Rank> start_k, end_k;
for (int i = 0; i < Rank; i++) {
start_k[i] = ranges[i].first;
end_k[i] = ranges[i].second;
}
auto kernel = raw::copy_with_reflection101<T, Rank>;
auto policy = make_policy(kernel, output.size(), 0, stream);
launch_kernel(kernel, policy, output, outStride_k, start_k, end_k, input, inStride_k);
}
GENERATE_KERNEL_DISPATCHER(copy_with_reflection101_dispatcher, launch_copy_with_reflection101);
template <class T>
void copy_with_reflection101(
const Stream& stream,
TensorSpan<T> output, TensorView<T> input,
std::vector<std::pair<std::size_t, std::size_t>> ranges)
{
CV_Assert(output.rank() == input.rank());
CV_Assert(output.rank() == ranges.size());
/* squeezable axes at the beginning of both tensors can be eliminated
*
* Reasoning:
* ----------
* Suppose an item's indices in the input tensor is [i1, i2, ...]. The indices in the
* output tensor will be [i1 + off1, i2 + off2, ...]. The rest of the elements in the output are padding.
* The padding operation essentially copies items from the input tensor to new locations in the output tensor
* and pads the remaining.
*
* If the size of the first axis of the input and output tensor is unity, the input and output indices
* for all the elements will be of the form [0, i2, ...] and [0, i2 + off2, ...] respectively. Note that
* there cannot be extra padding since the axes have unit size. The first index does not contribute to the
* element's address calculation and hence does nothing apart from eating up a few cycles.
*/
while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
CV_Assert(ranges[0].first == 0 && ranges[0].second == 1);
input.squeeze(0);
output.squeeze(0);
ranges.erase(std::begin(ranges));
CV_Assert(output.rank() == input.rank());
CV_Assert(output.rank() == ranges.size());
}
auto inShape = input.shape_as_vector();
auto outShape = output.shape_as_vector();
/* contiguous axes which do not have any padding can be combined into one axis
*
* Reasoning:
* ----------
* Suppose an item's indices in the input tensor is [i1, i2, i3, ...]. Let the first two axes not have any
* padding. The indices in the output tensor will be [i1, i2, i3 + off3, ...].
*
* Each axis in the contiguous unpadded axes sequence will add an offset of iN * strideN. In the above example,
* the two axes add a total offset of `i1 * stride1 + i2 * stride2`. We can merge the two axes into one axis with
* a size of `size1 * size2`. The new offset added will be `i12 * stride2` as the kernel iterates through `i12`.
* Note that `i12` is actually `(i1 * size2 + i2)` in the original tensor.
*/
for (int i = 0; i < inShape.size(); i++) {
/* check if axis `i` requires any padding */
if (ranges[i].first == 0 && ranges[i].second == inShape[i]) {
/* loop invariant: `i` is the first axis in the contiguous unpadded axis sequence */
CV_Assert(inShape[i] == outShape[i]);
/* we now iterate through the axes which follow and try to merge */
int j = i + 1; /* `j` is the axis which we will attempt to merge */
while (j < inShape.size() && ranges[j].first == 0 && ranges[j].second == inShape[j]) {
CV_Assert(inShape[j] == outShape[j]);
/* `j` is also unpadded; merge `i` and `j` */
auto new_size = inShape[i] * inShape[j];
inShape[i] = new_size;
outShape[i] = new_size;
ranges[i].second = new_size;
/* delete axis `j` */
inShape.erase(std::begin(inShape) + j);
outShape.erase(std::begin(outShape) + j);
ranges.erase(std::begin(ranges) + j);
/* optimizations should not break the invariants */
CV_Assert(inShape.size() == outShape.size());
CV_Assert(inShape.size() == ranges.size());
CV_Assert(inShape[i] == outShape[i]);
CV_Assert(ranges[i].first == 0 && ranges[i].second == inShape[i]);
}
}
}
auto rank = inShape.size();
std::vector<std::size_t> inStride(rank), outStride(rank);
inStride.back() = 1;
outStride.back() = 1;
/* garbage, ..., garbage, 1 */
std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
/* dim[0], dim[1], ..., dim[-1], 1 */
std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<int>());
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<int>());
/* stride[0], stride[1], ..., stride[-2], 1 */
CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
copy_with_reflection101_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, input, inStride, ranges);
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void copy_with_reflection101(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
#endif
template void copy_with_reflection101(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,288 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "array.hpp"
#include "types.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "kernel_dispatcher.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/kernels/fill_copy.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
#include <vector>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t Rank>
__global__ void permute(
array<index_type, Rank> axis_order,
Span<T> output, array<size_type, Rank> outStrides,
View<T> input, array<size_type, Rank> inStrides)
{
for (auto i : grid_stride_range(input.size())) {
index_type oldPosition = 0;
index_type newPosition = i;
for (int j = 0; j < Rank; j++)
{
auto order = axis_order[j];
oldPosition += (newPosition / outStrides[j]) * inStrides[order];
newPosition %= outStrides[j];
}
output[i] = input[oldPosition];
}
}
template <class T, int TILE_SIZE, int ROWS_PER_THREAD>
__global__ void transpose(Span<T> output, View<T> input, size_type in_width, size_type out_width)
{
__shared__ T tile[TILE_SIZE][TILE_SIZE + 1];
/* blockDim.y = TILE_SIZE / ROWS_PER_THREAD, blockDim.x = TILE_SIZE */
const index_type in_x = blockIdx.x * TILE_SIZE + threadIdx.x;
const index_type in_y_begin = blockIdx.y * TILE_SIZE + threadIdx.y;
/* Every valid input location has a corresponding output location and vice versa.
* Hence, if we do not load values into the shared memory for a given location, we
* also won't read them for storing in the output.
*/
for (int j = 0; j < TILE_SIZE; j += TILE_SIZE / ROWS_PER_THREAD)
{
const auto in_y_current = in_y_begin + j;
if (in_x < in_width && in_y_current < out_width)
tile[threadIdx.y + j][threadIdx.x] = input[in_y_current * in_width + in_x];
}
__syncthreads();
/* We interchange `threadIdx.x` and `threadIdx.y` so that consecutive output indices map to
* consecutive threads. This would allow writes across threads in a warp to be coalesced.
*/
const index_type out_x = blockIdx.y * TILE_SIZE + threadIdx.x;
const index_type out_y_begin = blockIdx.x * TILE_SIZE + threadIdx.y;
for (int j = 0; j < TILE_SIZE; j += TILE_SIZE / ROWS_PER_THREAD)
{
const auto out_y_current = out_y_begin + j;
if (out_x < out_width && out_y_current < in_width)
output[out_y_current * out_width + out_x] = tile[threadIdx.x][threadIdx.y + j];
}
}
}
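/* Explanatory note (not in the original): the extra "+ 1" column in the shared-memory tile
 * offsets consecutive rows by one bank, so the column-wise reads in the second loop do not all
 * hit the same shared-memory bank; each TILE_SIZE x TILE_SIZE tile is read coalesced from the
 * input and written coalesced to the output after the in-tile transpose.
 */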
template <class T>
void transpose(const Stream& stream, Span<T> output, View<T> input, std::size_t in_width, std::size_t out_width)
{
/* Each block processes a TILE_SIZE x TILE_SIZE piece */
constexpr int TILE_SIZE = 32;
/* Each thread processes ROWS_PER_THREAD rows. We do this to decrease the number of threads required
* in a block so that the cost of the block-wide synchronization is minimized.
*/
constexpr int ROWS_PER_THREAD = 4;
dim3 grid_size((in_width + TILE_SIZE - 1) / TILE_SIZE, (out_width + TILE_SIZE - 1) / TILE_SIZE);
dim3 block_size(TILE_SIZE, TILE_SIZE / ROWS_PER_THREAD);
auto policy = execution_policy(grid_size, block_size, stream);
auto kernel = raw::transpose<T, TILE_SIZE, ROWS_PER_THREAD>;
launch_kernel(kernel, policy, output, input, in_width, out_width);
}
template void transpose(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t);
template void transpose(const Stream&, Span<float>, View<float>, std::size_t, std::size_t);
template <class T, std::size_t Rank> static
void launch_permute_kernel(
const Stream& stream,
const std::vector<std::size_t>& order,
Span<T> output, const std::vector<std::size_t>& outStride,
View<T> input, const std::vector<std::size_t>& inStride)
{
CV_Assert(order.size() == Rank);
CV_Assert(outStride.size() == Rank);
CV_Assert(inStride.size() == Rank);
array<index_type, Rank> order_k;
order_k.assign(std::begin(order), std::end(order));
array<size_type, Rank> outStride_k, inStride_k;
outStride_k.assign(std::begin(outStride), std::end(outStride));
inStride_k.assign(std::begin(inStride), std::end(inStride));
auto kernel = raw::permute<T, Rank>;
auto policy = make_policy(kernel, input.size(), 0, stream);
launch_kernel(kernel, policy, order_k, output, outStride_k, input, inStride_k);
}
GENERATE_KERNEL_DISPATCHER(permute_dispatcher, launch_permute_kernel);
template <class T>
void permute(
const Stream& stream,
TensorSpan<T> output, TensorView<T> input,
std::vector<std::size_t> order)
{
CV_Assert(output.rank() == input.rank());
CV_Assert(input.rank() == order.size());
CV_Assert(input.size() == output.size());
auto rank = output.rank();
auto inShape = input.shape_as_vector();
auto outShape = output.shape_as_vector();
/* singleton axes do not contribute towards address calculation
*
* Reasoning:
* ----------
* Suppose an item's indices in the input tensor is [i1, i2, ...]. The indices in the
* output tensor will be some permutation of the input tensor indices. Let the output
* tensor indices be [o1, o2, ...]. The permutation operation essentially copies items
* from the input tensor to new locations in the output tensor as dictated by the indices.
*
* If the size of the nth axis (say i2) of the input is one, the input and output indices for
* all the elements will be of the form [i1, 0, ...] and [..., 0, ...] respectively.
* The index does not contribute to the element's address calculation and hence would give an
* identical result if it weren't there.
*/
for (int i = 0; i < rank; i++)
{
/* index `i` corresponds to the axis index in the output; order[i] has the corresponding axis index in the input */
while (i < rank && outShape[i] == 1)
{
int in_i = order[i];
CV_Assert(inShape[in_i] == 1);
/* delete axis `i` */
inShape.erase(std::begin(inShape) + in_i);
outShape.erase(std::begin(outShape) + i);
/* deletion of an axis reduces an axis in the input tensor which would cause the indices
* of the axes that come after the deleted axis to reduce by one
*/
order.erase(order.begin() + i);
for (auto& axis : order)
if (axis > in_i)
axis--;
rank--;
/* optimizations should not break the invariants */
CV_Assert(rank == order.size());
CV_Assert(inShape.size() == order.size());
CV_Assert(outShape.size() == order.size());
CV_Assert(input.size() == output.size());
}
}
/* contiguous axes whose relative ordering stays same before and after permutation can be merged into one axis
* example: in permute order 0 2 3 1, axes 2 and 3 can be grouped into a single axis
*
* Reasoning:
* ----------
* Suppose an item's indices in the input tensor is [i0, i1, i2, i3, ...]. Let the permutation order be [0, 3, 1, 2, ...].
* Note that i1 and i2 are adjacent axes in the same order in input as well as output. The indices in the output tensor
* will be [i0, i3, i1, i2, ...].
*
* Each axis in the contiguous axes sequence will add an offset of iN * strideN. In the above example,
* the two axes add a total offset of `i1 * (size2 * stride2) + i2 * stride2` which is `(i1 * size2 + i2) * stride2`,
* in both input and output. Note stride2 can be different in the input and output. We can merge the two axes into one axis
* with a size of `size1 * size2`. The new offset added will be `i12 * stride12` as the kernel iterates through `i12`. Note
* that `i12` is actually `(i1 * size2 + i2)` and `stride12` is `stride2`.
*/
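/* Concrete example (not in the original): for order = [0, 2, 3, 1] on an NCHW tensor, axes 2 and
 * 3 keep their relative order, so H and W collapse into a single axis of size H * W and the
 * kernel permutes a rank-3 tensor [N, C, HW] -> [N, HW, C]; if the batch axis has size 1 it is
 * squeezed away first, leaving a plain 2-D transpose handled by the fast path below.
 */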
for (int i = 0; i < rank; i++) {
/* the indices used in the loops such as `i` and `j` are axis indices in the output tensor */
/* the corresponding input axis indices are `order[i]` and `order[j]`*/
/* loop invariant: `i` is the first axis in the contiguous unpermuted axis sequence */
int j = i + 1; /* `j` is the axis which we will attempt to merge */
while (j < rank && (order[i] + 1) == order[j]) {
/* axis `i` and axis `j` do not change relative order */
auto in_i = order[i], in_j = order[j];
auto new_size = inShape[in_i] * inShape[in_j];
inShape[in_i] = new_size;
outShape[i] = new_size;
/* delete axis `j` */
inShape.erase(std::begin(inShape) + in_j);
outShape.erase(std::begin(outShape) + j);
/* deletion of an axis reduces an axis in the input tensor which would cause the indices
* of the axes that come after the deleted axis to reduce by one
*/
order.erase(order.begin() + j);
for (auto& axis : order)
if (axis > order[i])
axis--;
rank--;
/* optimizations should not break the invariants */
CV_Assert(rank == order.size());
CV_Assert(inShape.size() == order.size());
CV_Assert(outShape.size() == order.size());
CV_Assert(input.size() == output.size());
}
}
std::vector<std::size_t> inStride(rank), outStride(rank);
inStride.back() = 1;
outStride.back() = 1;
/* garbage, ..., garbage, 1 */
std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
/* dim[0], dim[1], ..., dim[-1], 1 */
std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>());
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
/* stride[0], stride[1], ..., stride[-2], 1 */
const bool is_in_order = [&order] {
for (int i = 0; i < order.size(); i++)
if (order[i] != i)
return false;
return true;
}();
if (is_in_order)
{
kernels::copy<T>(stream, output, input);
}
else if(rank == 2)
{
/* use the more efficient transpose kernel */
transpose<T>(stream, output, input, inShape[1], outShape[1]);
}
else
{
CV_Assert(3 <= rank && rank <= CSL_MAX_TENSOR_RANK);
permute_dispatcher<T, 3, CSL_MAX_TENSOR_RANK>(rank, stream, order, output, outStride, input, inStride);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void permute(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
#endif
template void permute(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,176 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "array.hpp"
#include "math.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include <cstddef>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, bool Normalize>
__global__ void prior_box(
Span<T> output,
View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
size_type layerWidth, size_type layerHeight,
size_type imageWidth, size_type imageHeight)
{
/* each box consists of two pairs of coordinates and hence 4 values in total */
/* since the entire output (the first channel at least) consists of these boxes,
* we are guaranteed that the output is aligned to a boundary of 4 values
*/
using vector_type = get_vector_type_t<T, 4>;
auto output_vPtr = vector_type::get_pointer(output.data());
/* num_points contains the number of points in the feature map of interest
* each iteration of the stride loop selects a point and generates prior boxes for it
*/
size_type num_points = layerWidth * layerHeight;
for (auto idx : grid_stride_range(num_points)) {
const index_type x = idx % layerWidth,
y = idx / layerWidth;
index_type output_offset_v4 = idx * offsetX.size() * boxWidth.size();
for (int i = 0; i < boxWidth.size(); i++) {
for (int j = 0; j < offsetX.size(); j++) {
float center_x = (x + offsetX[j]) * stepX;
float center_y = (y + offsetY[j]) * stepY;
vector_type vec;
if(Normalize) {
vec.data[0] = (center_x - boxWidth[i] * 0.5f) / imageWidth;
vec.data[1] = (center_y - boxHeight[i] * 0.5f) / imageHeight;
vec.data[2] = (center_x + boxWidth[i] * 0.5f) / imageWidth;
vec.data[3] = (center_y + boxHeight[i] * 0.5f) / imageHeight;
} else {
vec.data[0] = center_x - boxWidth[i] * 0.5f;
vec.data[1] = center_y - boxHeight[i] * 0.5f;
vec.data[2] = center_x + boxWidth[i] * 0.5f - 1.0f;
vec.data[3] = center_y + boxHeight[i] * 0.5f - 1.0f;
}
v_store(output_vPtr[output_offset_v4], vec);
output_offset_v4++;
}
}
}
}
template <class T>
__global__ void prior_box_clip(Span<T> output) {
for (auto i : grid_stride_range(output.size())) {
using device::clamp;
output[i] = clamp<T>(output[i], 0.0, 1.0);
}
}
template <class T>
__global__ void prior_box_set_variance1(Span<T> output, float variance) {
using vector_type = get_vector_type_t<T, 4>;
auto output_vPtr = vector_type::get_pointer(output.data());
for (auto i : grid_stride_range(output.size() / 4)) {
vector_type vec;
for (int j = 0; j < 4; j++)
vec.data[j] = variance;
v_store(output_vPtr[i], vec);
}
}
template <class T>
__global__ void prior_box_set_variance4(Span<T> output, array<float, 4> variance) {
using vector_type = get_vector_type_t<T, 4>;
auto output_vPtr = vector_type::get_pointer(output.data());
for (auto i : grid_stride_range(output.size() / 4)) {
vector_type vec;
for(int j = 0; j < 4; j++)
vec.data[j] = variance[j];
v_store(output_vPtr[i], vec);
}
}
}
template <class T, bool Normalize> static
void launch_prior_box_kernel(
const Stream& stream,
Span<T> output, View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
std::size_t layerWidth, std::size_t layerHeight, std::size_t imageWidth, std::size_t imageHeight)
{
auto num_points = layerWidth * layerHeight;
auto kernel = raw::prior_box<T, Normalize>;
auto policy = make_policy(kernel, num_points, 0, stream);
launch_kernel(kernel, policy,
output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
layerWidth, layerHeight, imageWidth, imageHeight);
}
template <class T>
void generate_prior_boxes(
const Stream& stream,
Span<T> output,
View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
std::vector<float> variance,
std::size_t numPriors,
std::size_t layerWidth, std::size_t layerHeight,
std::size_t imageWidth, std::size_t imageHeight,
bool normalize, bool clip)
{
if (normalize) {
launch_prior_box_kernel<T, true>(
stream, output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
layerWidth, layerHeight, imageWidth, imageHeight
);
} else {
launch_prior_box_kernel<T, false>(
stream, output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
layerWidth, layerHeight, imageWidth, imageHeight
);
}
std::size_t channel_size = layerHeight * layerWidth * numPriors * 4;
CV_Assert(channel_size * 2 == output.size());
if (clip) {
auto output_span_c1 = Span<T>(output.data(), channel_size);
auto kernel = raw::prior_box_clip<T>;
auto policy = make_policy(kernel, output_span_c1.size(), 0, stream);
launch_kernel(kernel, policy, output_span_c1);
}
auto output_span_c2 = Span<T>(output.data() + channel_size, channel_size);
if (variance.size() == 1) {
auto kernel = raw::prior_box_set_variance1<T>;
auto policy = make_policy(kernel, output_span_c2.size() / 4, 0, stream);
launch_kernel(kernel, policy, output_span_c2, variance[0]);
} else {
array<float, 4> variance_k;
variance_k.assign(std::begin(variance), std::end(variance));
auto kernel = raw::prior_box_set_variance4<T>;
auto policy = make_policy(kernel, output_span_c2.size() / 4, 0, stream);
launch_kernel(kernel, policy, output_span_c2, variance_k);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void generate_prior_boxes(const Stream&, Span<__half>, View<float>, View<float>, View<float>, View<float>, float, float,
std::vector<float>, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool, bool);
#endif
template void generate_prior_boxes(const Stream&, Span<float>, View<float>, View<float>, View<float>, View<float>, float, float,
std::vector<float>, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool, bool);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
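As a quick cross-check of the box math in raw::prior_box, this host-side sketch computes one normalized prior for a single cell; all values are assumed example inputs, not data produced by the layer.

#include <array>
#include <iostream>

int main() {
    const float stepX = 16.f, stepY = 16.f;               // feature-map stride in image pixels (assumed)
    const float boxW = 64.f, boxH = 64.f;                 // one prior size (assumed)
    const float offX = 0.5f, offY = 0.5f;                 // cell-center offset (assumed)
    const int x = 3, y = 2;                               // feature-map location
    const float imgW = 512.f, imgH = 512.f;

    const float cx = (x + offX) * stepX;
    const float cy = (y + offY) * stepY;
    const std::array<float, 4> box {
        (cx - boxW * 0.5f) / imgW, (cy - boxH * 0.5f) / imgH,   // top-left, normalized
        (cx + boxW * 0.5f) / imgW, (cy + boxH * 0.5f) / imgH    // bottom-right, normalized
    };
    for (float v : box) std::cout << v << ' ';
    std::cout << '\n';
    return 0;
}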

View File

@@ -0,0 +1,216 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "math.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "limits.hpp"
#include "vector_traits.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T>
__global__ void region_box(
Span<T> output, View<T> input, View<T> bias,
size_type boxes_per_cell, size_type box_size,
size_type rows, size_type cols, T scale_x_y,
size_type height_norm, size_type width_norm,
T object_prob_cutoff, bool new_coords)
{
using vector2_type = get_vector_type_t<T, 2>;
auto bias_vPtr = vector2_type::get_pointer(bias.data());
for (auto box_index : grid_stride_range(output.size() / box_size)) {
const auto box_of_the_cell = box_index % boxes_per_cell; /* box number within a cell */
const auto box_offset = box_index * box_size;
const auto batch_inner_size = rows * cols * boxes_per_cell;
const auto row_inner_size = cols * boxes_per_cell;
const auto col_inner_size = boxes_per_cell;
const auto y = (box_index % batch_inner_size) / row_inner_size;
const auto x = (box_index % row_inner_size) / col_inner_size;
/* When new_coords is true, we shouldn't use logistic activation again */
T objectness_prob;
if (new_coords)
{
const auto tmp_x = (input[box_offset + 0] - static_cast<T>(0.5)) * scale_x_y + static_cast<T>(0.5);
const auto tmp_y = (input[box_offset + 1] - static_cast<T>(0.5)) * scale_x_y + static_cast<T>(0.5);
output[box_offset + 0] = fast_divide_ftz(static_cast<T>(x) + tmp_x, static_cast<T>(cols));
output[box_offset + 1] = fast_divide_ftz(static_cast<T>(y) + tmp_y, static_cast<T>(rows));
vector2_type bias_xy;
v_load(bias_xy, bias_vPtr[box_of_the_cell]);
output[box_offset + 2] = input[box_offset + 2] * input[box_offset + 2] *
static_cast<T>(4) * bias_xy.data[0] / static_cast<T>(width_norm);
output[box_offset + 3] = input[box_offset + 3] * input[box_offset + 3] *
static_cast<T>(4) * bias_xy.data[1] / static_cast<T>(height_norm);
objectness_prob = input[box_offset + 4];
}
else
{
const auto tmp_x = (fast_sigmoid(input[box_offset + 0]) - static_cast<T>(0.5)) * scale_x_y + static_cast<T>(0.5);
const auto tmp_y = (fast_sigmoid(input[box_offset + 1]) - static_cast<T>(0.5)) * scale_x_y + static_cast<T>(0.5);
output[box_offset + 0] = fast_divide_ftz(static_cast<T>(x) + tmp_x, static_cast<T>(cols));
output[box_offset + 1] = fast_divide_ftz(static_cast<T>(y) + tmp_y, static_cast<T>(rows));
vector2_type bias_xy;
v_load(bias_xy, bias_vPtr[box_of_the_cell]);
output[box_offset + 2] = fast_exp(input[box_offset + 2]) * bias_xy.data[0] / static_cast<T>(width_norm);
output[box_offset + 3] = fast_exp(input[box_offset + 3]) * bias_xy.data[1] / static_cast<T>(height_norm);
/* squash objectness score into a probability */
objectness_prob = fast_sigmoid(input[box_offset + 4]);
}
/* ignore prediction if the objectness probability is less than the cutoff */
if (objectness_prob < object_prob_cutoff)
objectness_prob = 0;
output[box_offset + 4] = objectness_prob;
}
}
template <class T>
__global__ void region_sigmoid_class_score(Span<T> output, View<T> input, T class_prob_cutoff,
size_type box_size, bool new_coords)
{
for (auto idx : grid_stride_range(output.size())) {
const index_type box_no = idx / box_size;
const index_type start_of_box = box_no * box_size;
const index_type box_offset = idx % box_size;
if (box_offset < 5) {
/* continue as we have already processed these in region_box */
continue;
}
auto objectness_prob = output[start_of_box + 4];
/* the class probabilities we currently have are conditional class probabilities
* given the object
*
* to obtain the actual class probability, we multiply the conditional probability
* with the object probability
*
* when new_coords is true, we shouldn't use logistic activation again.
*/
T actual_class_prob;
if (new_coords)
{
actual_class_prob = objectness_prob * input[idx];
}
else
{
actual_class_prob = objectness_prob * fast_sigmoid(input[idx]);
}
if (actual_class_prob <= class_prob_cutoff)
actual_class_prob = T(0);
output[idx] = actual_class_prob;
}
}
template <class T>
__global__ void region_softmax_class_score(Span<T> output, View<T> input, T class_prob_cutoff, size_type box_size) {
for (auto box_no : grid_stride_range(output.size() / box_size)) {
const index_type start_of_box = box_no * box_size;
const index_type start_idx = start_of_box + 5;
const index_type end_idx = start_of_box + box_size;
auto largest = numeric_limits<T>::lowest();
for (int idx = start_idx; idx < end_idx; idx++) {
using device::max;
largest = max(largest, input[idx]);
}
auto sum = T(0);
for (int idx = start_idx; idx < end_idx; idx++) {
using device::exp;
auto temp = exp(input[idx] - largest);
sum += temp;
output[idx] = temp;
}
for (int idx = start_idx; idx < end_idx; idx++) {
auto softmax_score = output[idx] / sum;
/* the class probabilities we currently have are conditional class probabilities
* given the object
*
* to obtain the actual class probability, we multiply the conditional probability
* with the object probability
*/
auto objectness_prob = output[start_of_box + 4];
auto actual_class_prob = objectness_prob * softmax_score;
if (actual_class_prob <= class_prob_cutoff)
actual_class_prob = T(0);
output[idx] = actual_class_prob;
}
}
}
}
template <class T>
void region(const Stream& stream, Span<T> output, View<T> input, View<T> bias,
T object_prob_cutoff, T class_prob_cutoff,
std::size_t boxes_per_cell, std::size_t box_size,
std::size_t rows, std::size_t cols, T scale_x_y,
std::size_t height_norm, std::size_t width_norm,
bool if_true_sigmoid_else_softmax, /* true = sigmoid, false = softmax */
bool new_coords)
{
CV_Assert(output.size() == input.size());
CV_Assert(output.size() % box_size == 0);
CV_Assert(is_fully_aligned(bias, 2));
auto box_kernel = raw::region_box<T>;
auto box_policy = make_policy(box_kernel, output.size() / box_size, 0, stream);
launch_kernel(box_kernel, box_policy,
output, input, bias, boxes_per_cell, box_size,
rows, cols, scale_x_y, height_norm, width_norm,
object_prob_cutoff, new_coords);
if (if_true_sigmoid_else_softmax) {
auto kernel_score = raw::region_sigmoid_class_score<T>;
auto policy_score = make_policy(kernel_score, output.size(), 0, stream);
launch_kernel(kernel_score, policy_score, output, input, class_prob_cutoff, box_size, new_coords);
} else {
auto kernel_score = raw::region_softmax_class_score<T>;
auto policy_score = make_policy(kernel_score, output.size(), 0, stream);
launch_kernel(kernel_score, policy_score, output, input, class_prob_cutoff, box_size);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void region(const Stream&, Span<__half>, View<__half>, View<__half>,
__half, __half, std::size_t, std::size_t, std::size_t, std::size_t, __half, std::size_t, std::size_t, bool, bool);
#endif
template void region(const Stream&, Span<float>, View<float>, View<float>,
float, float, std::size_t, std::size_t, std::size_t, std::size_t, float, std::size_t, std::size_t, bool, bool);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
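The decode performed by raw::region_box for the classic path (new_coords == false) can be reproduced on the host; the sketch below uses made-up logits and anchor sizes purely for illustration.

#include <cmath>
#include <iostream>

int main() {
    const float raw_box[5] = { 0.2f, -0.1f, 0.3f, 0.5f, 1.2f };  // tx, ty, tw, th, objectness logit (assumed)
    const float anchor_w = 3.f, anchor_h = 4.f;                  // bias pair for this box (assumed)
    const int col = 5, row = 7, cols = 13, rows = 13;            // grid location / grid size
    const float width_norm = 13.f, height_norm = 13.f;
    const float scale_x_y = 1.f, object_cutoff = 0.5f;

    auto sigmoid = [](float v) { return 1.f / (1.f + std::exp(-v)); };
    const float tx = (sigmoid(raw_box[0]) - 0.5f) * scale_x_y + 0.5f;
    const float ty = (sigmoid(raw_box[1]) - 0.5f) * scale_x_y + 0.5f;
    const float bx = (col + tx) / cols;                          // box center, normalized to the grid
    const float by = (row + ty) / rows;
    const float bw = std::exp(raw_box[2]) * anchor_w / width_norm;
    const float bh = std::exp(raw_box[3]) * anchor_h / height_norm;
    float objectness = sigmoid(raw_box[4]);
    if (objectness < object_cutoff) objectness = 0.f;            // same cutoff rule as the kernel
    std::cout << bx << ' ' << by << ' ' << bw << ' ' << bh << ' ' << objectness << '\n';
    return 0;
}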

View File

@@ -0,0 +1,245 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "math.hpp"
#include "types.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "memory.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include <cuda_runtime.h>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t CHANNELS_PER_ITER>
__global__ void resize_nn(
Span<T> output, size_type out_height, size_type out_width,
View<T> input, size_type in_height, size_type in_width,
float o2i_fy, float o2i_fx, bool round, bool half_pixel_centers)
{
auto in_image_size = in_height * in_width;
auto out_image_size = out_height * out_width;
/* think of the output and input as a collection of 2d images with the last axis
* representing the width and the last but one axis representing the height
*
* the remaining axis together form a collection of these images/channels
*/
auto num_effective_channels = output.size() / out_image_size;
/* we process multiple channels every iteration to reuse the identical computation
* involved with the spatial dimensions
*
* if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need
* (num_effective_channels / CHANNELS_PER_ITER) iterations per (x, y) location
*/
auto num_channel_iters_per_xy = (num_effective_channels / CHANNELS_PER_ITER);
/* we need `num_channel_iters_per_xy` iterations per (x, y) and there are `out_image_size`
* combinations of (x, y); hence, we'll need `num_channel_iters_per_xy * out_image_size`
* iterations in total to finish the resize operation
*/
auto iters_required = num_channel_iters_per_xy * out_image_size;
for (auto iter : grid_stride_range(iters_required)) {
const index_type c_start = (iter / out_image_size) * CHANNELS_PER_ITER;
/* note here that consecutive `iter` values will often have consecutive `x` values
* => stores into output will be coalesced across threads
*/
const index_type y = (iter % out_image_size) / out_width;
const index_type x = iter % out_width;
auto in_yf = half_pixel_centers ? (y + 0.5f) * o2i_fy : y * o2i_fy;
auto in_xf = half_pixel_centers ? (x + 0.5f) * o2i_fx : x * o2i_fx;
using device::lround;
index_type in_y = round ? lround(in_yf) : static_cast<index_type>(in_yf);
index_type in_x = round ? lround(in_xf) : static_cast<index_type>(in_xf);
using device::min;
in_y = min(in_y, in_height - 1);
in_x = min(in_x, in_width - 1);
index_type in_idx = c_start * in_image_size + in_y * in_width + in_x;
index_type out_idx = c_start * out_image_size + y * out_width + x;
for (int i = 0; i < CHANNELS_PER_ITER; i++) {
output[out_idx] = load_ldg(input[in_idx]);
in_idx += in_image_size;
out_idx += out_image_size;
}
}
}
template <class T, std::size_t CHANNELS_PER_ITER>
__global__ void resize_bilinear(
Span<T> output, size_type out_height, size_type out_width,
View<T> input, size_type in_height, size_type in_width,
float o2i_fy, float o2i_fx, bool half_pixel_centers)
{
auto in_image_size = in_height * in_width;
auto out_image_size = out_height * out_width;
/* think of the output and input as a collection of 2d images with the last axis
* representing the width and the last but one axis representing the height
*
* the remaining axis together form a collection of these images/channels
*/
auto num_effective_channels = output.size() / out_image_size;
/* we process multiple channels every iteration to reuse the identical computation
* involved with the spatial dimensions
*
* if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need
* (num_effective_channels / CHANNELS_PER_ITER) iterations per (x, y) location
*/
auto num_channel_iters_per_xy = (num_effective_channels / CHANNELS_PER_ITER);
/* we need `num_channel_iters_per_xy` iterations per (x, y) and there are `out_image_size`
* combinations of (x, y); hence, we'll need `num_channel_iters_per_xy * out_image_size`
* iterations in total to finish the resize operation
*/
auto iters_required = num_channel_iters_per_xy * out_image_size;
for (auto iter : grid_stride_range(iters_required)) {
const index_type c_start = (iter / out_image_size) * CHANNELS_PER_ITER;
const index_type c_end = c_start + CHANNELS_PER_ITER;
/* note here that consecutive `iter` values will often have consecutive `x` values
* => stores into output will be coalesced across threads
*/
const index_type y = (iter % out_image_size) / out_width;
const index_type x = iter % out_width;
using device::max;
auto in_x = half_pixel_centers ? max<float>((x + 0.5f) * o2i_fx - 0.5f, 0.0f) : x * o2i_fx;
auto in_y = half_pixel_centers ? max<float>((y + 0.5f) * o2i_fy - 0.5f, 0.0f) : y * o2i_fy;
auto in_x0 = static_cast<index_type>(in_x);
auto in_y0 = static_cast<index_type>(in_y);
using device::min;
auto in_x1 = min<index_type>(in_x0 + 1, in_width - 1);
auto in_y1 = min<index_type>(in_y0 + 1, in_height - 1);
index_type in_offset_r0 = c_start * in_image_size + in_y0 * in_width;
index_type in_offset_r1 = c_start * in_image_size + in_y1 * in_width;
index_type out_idx = c_start * out_image_size + y * out_width + x;
#pragma unroll 1 /* disable unrolling to reduce register pressure; not sure how but it works */
for (auto c = c_start; c < c_end; c++) {
auto v_00 = load_ldg(input[in_offset_r0 + in_x0]),
v_01 = load_ldg(input[in_offset_r0 + in_x1]),
v_10 = load_ldg(input[in_offset_r1 + in_x0]),
v_11 = load_ldg(input[in_offset_r1 + in_x1]);
output[out_idx] =
v_00 +
T(in_y - in_y0) * T(v_10 - v_00) +
T(in_x - in_x0) * T(v_01 - v_00) +
T(in_y - in_y0) * T(in_x - in_x0) * T(v_11 - v_01 - v_10 + v_00);
in_offset_r0 += in_image_size;
in_offset_r1 += in_image_size;
out_idx += out_image_size;
}
}
}
}
template <class T, std::size_t CHANNELS_PER_ITER> static
void launch_multichannel_resize_nn(const Stream& stream,
Span<T> output, size_type out_height, size_type out_width,
View<T> input, size_type in_height, size_type in_width,
float scale_y, float scale_x, bool round, bool half_pixel_centers)
{
auto kernel = raw::resize_nn<T, CHANNELS_PER_ITER>;
auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
}
template <class T>
void resize_nn(const Stream& stream, TensorSpan<T> output, TensorView<T> input, float scale_y, float scale_x, bool round, bool half_pixel_centers) {
auto out_height = output.get_axis_size(-2);
auto out_width = output.get_axis_size(-1);
auto in_height = input.get_axis_size(-2);
auto in_width = input.get_axis_size(-1);
auto num_effective_channels = input.size_range(0, 2);
auto num_iters = num_effective_channels * out_height * out_width;
if (num_effective_channels % 32 == 0 && num_iters > 655360) {
launch_multichannel_resize_nn<T, 32>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
} else if (num_effective_channels % 16 == 0 && num_iters > 327680) {
launch_multichannel_resize_nn<T, 16>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
} else if (num_effective_channels % 8 == 0 && num_iters > 163840) {
launch_multichannel_resize_nn<T, 8>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
} else if (num_effective_channels % 4 == 0 && num_iters > 81920) {
launch_multichannel_resize_nn<T, 4>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
} else if (num_effective_channels % 2 == 0) {
launch_multichannel_resize_nn<T, 2>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
} else {
launch_multichannel_resize_nn<T, 1>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void resize_nn<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, float, float, bool, bool);
#endif
    template void resize_nn<float>(const Stream&, TensorSpan<float>, TensorView<float>, float, float, bool, bool);
template <class T, std::size_t CHANNELS_PER_ITER> static
void launch_multichannel_resize_bilinear(const Stream& stream,
Span<T> output, size_type out_height, size_type out_width,
View<T> input, size_type in_height, size_type in_width,
float scale_y, float scale_x, bool half_pixel_centers)
{
auto kernel = raw::resize_bilinear<T, CHANNELS_PER_ITER>;
auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
}
template <class T>
void resize_bilinear(const Stream& stream, TensorSpan<T> output, TensorView<T> input, float scale_y, float scale_x, bool half_pixel_centers) {
auto out_height = output.get_axis_size(-2);
auto out_width = output.get_axis_size(-1);
auto in_height = input.get_axis_size(-2);
auto in_width = input.get_axis_size(-1);
auto num_effective_channels = input.size_range(0, 2);
auto num_iters = num_effective_channels * out_height * out_width;
if (num_effective_channels % 16 == 0 && num_iters > 163840) {
launch_multichannel_resize_bilinear<T, 16>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
} else if (num_effective_channels % 8 == 0 && num_iters > 81920) {
launch_multichannel_resize_bilinear<T, 8>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
} else if (num_effective_channels % 4 == 0 && num_iters > 40960) {
launch_multichannel_resize_bilinear<T, 4>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
} else if (num_effective_channels % 2 == 0) {
launch_multichannel_resize_bilinear<T, 2>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
} else {
launch_multichannel_resize_bilinear<T, 1>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void resize_bilinear<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, float, float, bool);
#endif
template void resize_bilinear<float>(const Stream&, TensorSpan<float>, TensorView<float>, float, float, bool);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
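For reference, the bilinear sample computed per output pixel in raw::resize_bilinear looks like this on the host; the 2x2 input and the scale are assumed example values.

#include <algorithm>
#include <iostream>

int main() {
    const float input[2][2] = { { 0.f, 1.f }, { 2.f, 3.f } };    // in_height = in_width = 2 (assumed)
    const int in_h = 2, in_w = 2;
    const int x = 1, y = 1;                                      // output pixel in a 4x4 output grid
    const float o2i = 2.f / 4.f;                                 // output-to-input scale
    const bool half_pixel_centers = true;

    const float in_x = half_pixel_centers ? std::max((x + 0.5f) * o2i - 0.5f, 0.f) : x * o2i;
    const float in_y = half_pixel_centers ? std::max((y + 0.5f) * o2i - 0.5f, 0.f) : y * o2i;
    const int x0 = static_cast<int>(in_x), y0 = static_cast<int>(in_y);
    const int x1 = std::min(x0 + 1, in_w - 1), y1 = std::min(y0 + 1, in_h - 1);
    const float v00 = input[y0][x0], v01 = input[y0][x1], v10 = input[y1][x0], v11 = input[y1][x1];
    const float out = v00
        + (in_y - y0) * (v10 - v00)
        + (in_x - x0) * (v01 - v00)
        + (in_y - y0) * (in_x - x0) * (v11 - v01 - v10 + v00);   // same expression as the kernel
    std::cout << out << '\n';
    return 0;
}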

View File

@@ -0,0 +1,181 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "math.hpp"
#include "limits.hpp"
#include "types.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "memory.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include <opencv2/core.hpp>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t CHANNELS_PER_ITER>
__global__ void roi_pooling(
Span<T> output, size_type pooled_height, size_type pooled_width,
View<T> input, size_type in_height, size_type in_width,
View<T> rois, size_type num_channels, float spatial_scale)
{
// input: [1, num_channels, in_height, in_width]
const auto in_image_size = in_height * in_width;
// rois: [num_rois, 5]
auto num_rois = rois.size() / 5;
// output: [num_rois, num_channels, pooled_height, pooled_width]
const auto out_spatial_size = pooled_height * pooled_width;
const auto out_roi_size = num_channels * out_spatial_size;
/* we have to compute the output value for every combination of (roi, c, y, x) in the output
*
* the computation involving (y, x) are identical for all non-spatial dimensions
* the computation and memory requests involving the roi are identical for remaining three axes
*
* we process multiple channels every iteration to reuse the identical computation
* and memory requests involved with the roi and spatial dimensions
*/
/*
* if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need
* (num_channels / CHANNELS_PER_ITER) iterations per (roi, x, y)
*/
auto num_channel_iters_per_roi_xy = num_channels / CHANNELS_PER_ITER;
/* we need `num_channel_iters_per_roi_xy` iterations per (roi, x, y) and there are
* `num_rois` rois and `out_spatial_size` combinations of (x, y)
*/
auto iters_per_roi = num_channel_iters_per_roi_xy * out_spatial_size;
auto iters_required = num_rois * iters_per_roi;
for (auto iter : grid_stride_range(iters_required))
{
const index_type roi_no = iter / iters_per_roi;
const index_type c_start = ((iter % iters_per_roi) / out_spatial_size) * CHANNELS_PER_ITER;
/* note here that consecutive `iter` values will often have consecutive `x` values
* => stores into output will be coalesced across threads
*/
const index_type y = (iter % out_spatial_size) / pooled_width;
const index_type x = iter % pooled_width;
const index_type roi_offset = roi_no * 5;
using device::round;
const index_type batch_id = rois[roi_offset + 0];
const index_type x_start_roi = round(static_cast<float>(rois[roi_offset + 1]) * spatial_scale);
const index_type y_start_roi = round(static_cast<float>(rois[roi_offset + 2]) * spatial_scale);
const index_type x_end_roi = round(static_cast<float>(rois[roi_offset + 3]) * spatial_scale);
const index_type y_end_roi = round(static_cast<float>(rois[roi_offset + 4]) * spatial_scale);
using device::max;
const auto roi_width = max<index_type>(x_end_roi - x_start_roi + 1, 1);
const auto roi_height = max<index_type>(y_end_roi - y_start_roi + 1, 1);
const auto roi_width_ratio = static_cast<float>(roi_width) / pooled_width;
const auto roi_height_ratio = static_cast<float>(roi_height) / pooled_height;
auto x_start = x_start_roi + static_cast<index_type>(x * roi_width_ratio);
auto y_start = y_start_roi + static_cast<index_type>(y * roi_height_ratio);
using device::ceil;
auto x_end = x_start_roi + static_cast<index_type>(ceil((x + 1) * roi_width_ratio));
auto y_end = y_start_roi + static_cast<index_type>(ceil((y + 1) * roi_height_ratio));
using device::max;
x_start = max<index_type>(x_start, 0);
y_start = max<index_type>(y_start, 0);
using device::min;
x_end = min<index_type>(x_end, in_width);
y_end = min<index_type>(y_end, in_height);
index_type in_offset = (batch_id * num_channels + c_start) * in_height * in_width;
index_type out_idx = roi_no * out_roi_size + c_start * out_spatial_size + y * pooled_width + x;
for (int i = 0; i < CHANNELS_PER_ITER; i++)
{
/* We have to set the output to zero if (x_start >= x_end) or (y_start >= y_end). If either
* condition is true, the loops below won't execute even a single iteration. Hence, by setting
* `max_val` to zero in this case, we can combine it with the `else` code.
*/
T max_val = (x_start >= x_end || y_start >= y_end) ? T(0) : device::numeric_limits<T>::lowest();
for (auto iy = y_start; iy < y_end; iy++)
{
const auto in_idx = in_offset + iy * in_width;
for (auto ix = x_start; ix < x_end; ix++)
{
max_val = max(max_val, load_ldg(input[in_idx + ix]));
}
}
output[out_idx] = max_val;
in_offset += in_image_size;
out_idx += out_spatial_size;
}
}
}
}
template <class T, std::size_t CHANNELS_PER_ITER> static
void launch_multichannel_roi_pooling(const Stream& stream,
Span<T> output, size_type pooled_height, size_type pooled_width,
View<T> input, size_type in_height, size_type in_width,
View<T> rois, size_type num_channels, float spatial_scale)
{
auto kernel = raw::roi_pooling<T, CHANNELS_PER_ITER>;
auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
launch_kernel(kernel, policy, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
}
template <class T>
void roi_pooling(const Stream& stream, TensorSpan<T> output, TensorView<T> input, View<T> rois, float spatial_scale)
{
CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
size_type num_channels = output.get_axis_size(1);
size_type pooled_height = output.get_axis_size(2);
size_type pooled_width = output.get_axis_size(3);
size_type in_height = input.get_axis_size(2);
size_type in_width = input.get_axis_size(3);
if (num_channels % 64 == 0) {
launch_multichannel_roi_pooling<T, 64>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
} else if (num_channels % 32 == 0) {
launch_multichannel_roi_pooling<T, 32>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
} else if (num_channels % 16 == 0) {
launch_multichannel_roi_pooling<T, 16>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
} else if (num_channels % 8 == 0) {
launch_multichannel_roi_pooling<T, 8>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
} else if (num_channels % 4 == 0) {
launch_multichannel_roi_pooling<T, 4>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
} else if (num_channels % 2 == 0) {
launch_multichannel_roi_pooling<T, 2>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
} else {
launch_multichannel_roi_pooling<T, 1>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void roi_pooling(const Stream& stream, TensorSpan<__half> output, TensorView<__half> input, View<__half> rois, float spatial_scale);
#endif
template void roi_pooling(const Stream& stream, TensorSpan<float> output, TensorView<float> input, View<float> rois, float spatial_scale);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
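The pooled value of a single bin in raw::roi_pooling can be reproduced on the host as follows; the feature map and ROI are assumed example values.

#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>

int main() {
    const int in_h = 4, in_w = 4, pooled_h = 2, pooled_w = 2;
    const float spatial_scale = 1.f;
    const float feature[4][4] = { {1,2,3,4}, {5,6,7,8}, {9,10,11,12}, {13,14,15,16} };
    const float roi[5] = { 0, 0, 0, 3, 3 };                      // batch id, x1, y1, x2, y2 (assumed)

    const int x_start_roi = static_cast<int>(std::round(roi[1] * spatial_scale));
    const int y_start_roi = static_cast<int>(std::round(roi[2] * spatial_scale));
    const int roi_w = std::max(static_cast<int>(std::round(roi[3] * spatial_scale)) - x_start_roi + 1, 1);
    const int roi_h = std::max(static_cast<int>(std::round(roi[4] * spatial_scale)) - y_start_roi + 1, 1);
    const int bin_x = 1, bin_y = 1;                              // which pooled bin to compute
    const int x_start = std::max(x_start_roi + static_cast<int>(bin_x * float(roi_w) / pooled_w), 0);
    const int y_start = std::max(y_start_roi + static_cast<int>(bin_y * float(roi_h) / pooled_h), 0);
    const int x_end = std::min(x_start_roi + static_cast<int>(std::ceil((bin_x + 1) * float(roi_w) / pooled_w)), in_w);
    const int y_end = std::min(y_start_roi + static_cast<int>(std::ceil((bin_y + 1) * float(roi_h) / pooled_h)), in_h);

    float max_val = (x_start >= x_end || y_start >= y_end) ? 0.f : std::numeric_limits<float>::lowest();
    for (int iy = y_start; iy < y_end; iy++)
        for (int ix = x_start; ix < x_end; ix++)
            max_val = std::max(max_val, feature[iy][ix]);        // max over the ROI sub-window
    std::cout << max_val << '\n';                                // 16 for this example
    return 0;
}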

View File

@@ -0,0 +1,235 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t N>
__global__ void biasN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> bias) {
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
inner_size /= vector_type::size();
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
const index_type bias_idx = (i / inner_size) % bias.size();
vector_type vec;
v_load(vec, input_vPtr[i]);
for(int j = 0; j < vec.size(); j++)
vec.data[j] = vec.data[j] + bias[bias_idx];
v_store(output_vPtr[i], vec);
}
}
template <class T, std::size_t N>
__global__ void scaleN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> weights)
{
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
inner_size /= vector_type::size();
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
const index_type scale_idx = (i / inner_size) % weights.size();
vector_type vec;
v_load(vec, input_vPtr[i]);
for (int j = 0; j < vec.size(); j++)
vec.data[j] = vec.data[j] * weights[scale_idx];
v_store(output_vPtr[i], vec);
}
}
template <class T, std::size_t N>
__global__ void scale1_with_bias1_vec(Span<T> output, View<T> input, T alpha, T beta)
{
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
vector_type vec;
v_load(vec, input_vPtr[i]);
for (int j = 0; j < vec.size(); j++)
vec.data[j] = alpha * vec.data[j] + beta;
v_store(output_vPtr[i], vec);
}
}
template <class T, std::size_t N>
__global__ void scaleN_with_biasN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> weights, View<T> bias)
{
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
inner_size /= vector_type::size();
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
const index_type scale_idx = (i / inner_size) % weights.size();
vector_type vec;
v_load(vec, input_vPtr[i]);
for (int j = 0; j < vec.size(); j++)
vec.data[j] = vec.data[j] * weights[scale_idx] + bias[scale_idx];
v_store(output_vPtr[i], vec);
}
}
}
template <class T, std::size_t N> static
void launch_biasN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> bias){
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
CV_Assert(inner_size % N == 0);
auto kernel = raw::biasN_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input, inner_size, bias);
}
template <class T>
void biasN(
const Stream& stream,
TensorSpan<T> output,
TensorView<T> input, std::size_t inner_size,
TensorView<T> bias)
{
CV_Assert(is_shape_same(input, output));
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
launch_biasN_vec_kernel<T, 4>(stream, output, input, inner_size, bias);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
launch_biasN_vec_kernel<T, 2>(stream, output, input, inner_size, bias);
} else {
launch_biasN_vec_kernel<T, 1>(stream, output, input, inner_size, bias);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void biasN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>);
#endif
template void biasN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>);
template <class T, std::size_t N> static
void launch_scaleN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> weights) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
CV_Assert(inner_size % N == 0);
auto kernel = raw::scaleN_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input, inner_size, weights);
}
template <class T>
void scaleN(
const Stream& stream,
TensorSpan<T> output,
TensorView<T> input, std::size_t inner_size,
TensorView<T> weights)
{
CV_Assert(is_shape_same(input, output));
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
launch_scaleN_vec_kernel<T, 4>(stream, output, input, inner_size, weights);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
launch_scaleN_vec_kernel<T, 2>(stream, output, input, inner_size, weights);
} else {
launch_scaleN_vec_kernel<T, 1>(stream, output, input, inner_size, weights);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void scaleN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>);
#endif
template void scaleN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>);
template <class T, std::size_t N> static
void launch_scale1_with_bias1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T alpha, T beta) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
auto kernel = raw::scale1_with_bias1_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input, alpha, beta);
}
template <class T>
void scale1_with_bias1(const Stream& stream, Span<T> output, View<T> input, T alpha, T beta) {
CV_Assert(output.size() == input.size());
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
launch_scale1_with_bias1_vec_kernel<T, 4>(stream, output, input, alpha, beta);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
launch_scale1_with_bias1_vec_kernel<T, 2>(stream, output, input, alpha, beta);
} else {
launch_scale1_with_bias1_vec_kernel<T, 1>(stream, output, input, alpha, beta);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void scale1_with_bias1<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
#endif
template void scale1_with_bias1<float>(const Stream&, Span<float>, View<float>, float, float);
template <class T, std::size_t N> static
void launch_scaleN_with_biasN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> weights, View<T> bias) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
CV_Assert(inner_size % N == 0);
auto kernel = raw::scaleN_with_biasN_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input, inner_size, weights, bias);
}
template <class T>
void scaleN_with_biasN(
const Stream& stream,
TensorSpan<T> output,
TensorView<T> input, std::size_t inner_size,
TensorView<T> weights, TensorView<T> bias)
{
CV_Assert(is_shape_same(input, output));
CV_Assert(weights.size() == bias.size());
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
launch_scaleN_with_biasN_vec_kernel<T, 4>(stream, output, input, inner_size, weights, bias);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
launch_scaleN_with_biasN_vec_kernel<T, 2>(stream, output, input, inner_size, weights, bias);
} else {
launch_scaleN_with_biasN_vec_kernel<T, 1>(stream, output, input, inner_size, weights, bias);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void scaleN_with_biasN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>, TensorView<__half>);
#endif
template void scaleN_with_biasN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>, TensorView<float>);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
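The broadcast rule shared by biasN, scaleN and scaleN_with_biasN is simply channel = (i / inner_size) % weights.size(); the host-side sketch below applies it to an assumed [2 x 2 x 3] tensor.

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    const std::size_t channels = 2, inner_size = 3;
    const std::vector<float> input { 0, 1, 2, 3, 4, 5,             // batch 0: two channels of three values
                                     6, 7, 8, 9, 10, 11 };         // batch 1
    const std::vector<float> weights { 2.f, 10.f };                // per-channel scale (assumed)
    const std::vector<float> bias    { 1.f, -1.f };                // per-channel bias (assumed)

    std::vector<float> output(input.size());
    for (std::size_t i = 0; i < input.size(); i++) {
        const std::size_t c = (i / inner_size) % channels;         // same indexing as scaleN_with_biasN_vec
        output[i] = input[i] * weights[c] + bias[c];
    }
    for (float v : output) std::cout << v << ' ';
    std::cout << '\n';
    return 0;
}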

View File

@@ -0,0 +1,111 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include <opencv2/core.hpp>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t N>
__global__ void input_shortcut_vec(
Span<T> output,
View<T> input, index_type c_input, /* `c_input` = number of channels in `input` */
View<T> from, index_type c_from, /* `c_from` = number of channels in `from` */
size_type channel_stride /* common for both `input` and `from` */)
{
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
auto from_vPtr = vector_type::get_pointer(from.data());
auto batch_stride_input = c_input * channel_stride;
auto batch_stride_from = c_from * channel_stride;
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
const auto actual_idx = i * vector_type::size();
const auto b = actual_idx / batch_stride_input; /* `input` and `output` have the same shape */
const auto c = (actual_idx % batch_stride_input) / channel_stride;
const auto c_offset = actual_idx % channel_stride;
vector_type vec_input;
v_load(vec_input, input_vPtr[i]);
/* We can break down the shortcut operation into two steps:
* - copy `input` to `output`
* - add `from` to corresponding channels in `output`
*
* In this scheme, only some channels in the `output` differ from `input`. They differ in the channels
* which have a corresponding channel in `from`.
*/
if (c < c_from) {
const auto from_actual_idx = b * batch_stride_from + c * channel_stride + c_offset;
const auto from_vec_idx = from_actual_idx / vector_type::size();
vector_type vec_from;
v_load(vec_from, from_vPtr[from_vec_idx]);
for (int j = 0; j < vector_type::size(); j++)
vec_input.data[j] += vec_from.data[j];
}
v_store(output_vPtr[i], vec_input);
}
}
}
template <class T, std::size_t N>
void launch_vectorized_input_shortcut(const Stream& stream, Span<T> output, View<T> input, std::size_t c_input, View<T> from, std::size_t c_from, std::size_t channel_stride) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
CV_Assert(is_fully_aligned<T>(from, N));
CV_Assert(channel_stride % N == 0);
auto kernel = raw::input_shortcut_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input, c_input, from, c_from, channel_stride);
}
template <class T>
void input_shortcut(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, csl::TensorView<T> from) {
CV_Assert(is_shape_same(output, input));
CV_Assert(output.rank() == from.rank());
for (int i = 0; i < output.rank(); i++) {
if (i != 1) {
CV_Assert(from.get_axis_size(i) == output.get_axis_size(i));
}
}
auto channel_stride = output.size_range(2, output.rank()); /* same for `output`, `input` and `from` */
auto c_input = input.get_axis_size(1);
auto c_from = from.get_axis_size(1);
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && is_fully_aligned<T>(from, 4) && channel_stride % 4 == 0) {
launch_vectorized_input_shortcut<T, 4>(stream, output, input, c_input, from, c_from, channel_stride);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && is_fully_aligned<T>(from, 2) && channel_stride % 2 == 0) {
launch_vectorized_input_shortcut<T, 2>(stream, output, input, c_input, from, c_from, channel_stride);
} else {
launch_vectorized_input_shortcut<T, 1>(stream, output, input, c_input, from, c_from, channel_stride);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void input_shortcut(const Stream&, TensorSpan<__half>, TensorView<__half>, TensorView<__half>);
#endif
template void input_shortcut(const Stream&, TensorSpan<float>, TensorView<float>, TensorView<float>);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
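The shortcut semantics described in the kernel comment (copy input, then add from only where a matching channel exists) can be stated in a few host-side lines; shapes and values below are assumed examples.

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    const std::size_t c_input = 3, c_from = 2, channel_stride = 2;  // spatial size per channel (assumed)
    const std::vector<float> input { 1, 1, 2, 2, 3, 3 };            // one batch, three channels
    const std::vector<float> from  { 10, 10, 20, 20 };              // one batch, two channels

    std::vector<float> output = input;                              // step 1: copy input to output
    for (std::size_t c = 0; c < c_from && c < c_input; c++)         // step 2: add only matching channels
        for (std::size_t k = 0; k < channel_stride; k++)
            output[c * channel_stride + k] += from[c * channel_stride + k];
    for (float v : output) std::cout << v << ' ';                   // 11 11 22 22 3 3
    std::cout << '\n';
    return 0;
}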

View File

@@ -0,0 +1,203 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "array.hpp"
#include "types.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "kernel_dispatcher.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/kernels/fill_copy.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
#include <vector>
#include <iostream>
#include <algorithm>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t Rank>
__global__ void slice(
Span<T> output, array<size_type, Rank> out_strides,
View<T> input, array<size_type, Rank> in_strides, array<index_type, Rank> in_offset)
{
for (auto i : grid_stride_range(output.size())) {
index_type out_index = i / out_strides[0];
index_type in_index = in_offset[0] + out_index;
index_type iidx = in_index * in_strides[0];
for (int j = 1; j < Rank; j++) {
out_index = (i % out_strides[j - 1]) / out_strides[j];
in_index = in_offset[j] + out_index;
iidx += in_index * in_strides[j];
}
output[i] = input[iidx];
}
}
}
template <class T, std::size_t Rank> static
void launch_slice(
const Stream& stream,
Span<T> output, const std::vector<std::size_t>& outStride,
View<T> input, const std::vector<std::size_t>& inStride, const std::vector<std::size_t>& inOffset)
{
CV_Assert(outStride.size() == Rank);
CV_Assert(inStride.size() == Rank);
CV_Assert(inOffset.size() == Rank);
array<size_type, Rank> outStride_k, inStride_k;
outStride_k.assign(std::begin(outStride), std::end(outStride));
inStride_k.assign(std::begin(inStride), std::end(inStride));
array<index_type, Rank> inOffset_k;
inOffset_k.assign(std::begin(inOffset), std::end(inOffset));
auto kernel = raw::slice<T, Rank>;
auto policy = make_policy(kernel, output.size(), 0, stream);
launch_kernel(kernel, policy, output, outStride_k, input, inStride_k, inOffset_k);
}
GENERATE_KERNEL_DISPATCHER(slice_dispatcher, launch_slice);
template <class T>
void slice(const Stream& stream,
TensorSpan<T> output, TensorView<T> input,
std::vector<std::size_t> offsets)
{
CV_Assert(output.rank() == input.rank());
CV_Assert(output.rank() == offsets.size());
/* copy directly if no slicing is required */
if (is_shape_same(output, input))
{
CV_Assert(std::all_of(std::begin(offsets), std::end(offsets), [] (std::size_t x) { return x == 0; }));
kernels::copy<T>(stream, output, input);
return;
}
/* squeezable axes at the beginning of both tensors can be eliminated
*
* Reasoning:
* ----------
* Suppose an item's indices in the output tensor is [o1, o2, ...]. The indices in the input
* tensor will be [o1 + off1, o2 + off2, ...]. The rest of the elements in the input are ignored.
*
* If the size of the first axis of the input and output tensor is unity, the input and output indices
     * for all the elements will be of the form [0, o2 + off2, ...] and [0, o2, ...] respectively. Note that
* there cannot be any ignored items since the axes have unit size. The first index does not contribute to the
     * element's address calculation and hence does nothing apart from eating up a few cycles.
*/
while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
CV_Assert(offsets[0] == 0);
input.squeeze(0);
output.squeeze(0);
offsets.erase(std::begin(offsets));
CV_Assert(output.rank() == input.rank());
CV_Assert(output.rank() == offsets.size());
}
auto inShape = input.shape_as_vector();
auto outShape = output.shape_as_vector();
/* contiguous axes which do not undergo slicing can be combined into one axis
*
* Reasoning:
* ----------
* Suppose an item's indices in the output tensor is [o1, o2, o3, ...]. Let the first two axes not undergo any
* slicing. The indices in the input tensor will be [o1, o2, o3 + off3, ...].
*
* Each axis in the contiguous unsliced axes sequence will add an offset of iN * strideN. In the above example,
* the two axes add a total offset of `o1 * stride1 + o2 * stride2`. We can merge the two axes into one axis with
     * a size of `size1 * size2`. The new offset added will be `o12 * stride2` as the kernel iterates through `o12`.
* Note that `o12` is actually `(o1 * size2 + o2)` in the original tensor.
*/
for (int i = 0; i < inShape.size(); i++) {
/* check if axis `i` requires any slicing */
if (offsets[i] == 0 && inShape[i] == outShape[i]) {
/* loop invariant: `i` is the first axis in the contiguous unsliced axis sequence */
int j = i + 1; /* `j` is the axis which we will attempt to merge */
while (j < inShape.size() && offsets[j] == 0 && inShape[j] == outShape[j]) {
/* `j` axis is also unsliced; merge `i` and `j` */
auto new_size = inShape[i] * inShape[j];
inShape[i] = new_size;
outShape[i] = new_size;
offsets[i] = 0; /* redundant */
/* delete axis `j` */
inShape.erase(std::begin(inShape) + j);
outShape.erase(std::begin(outShape) + j);
offsets.erase(std::begin(offsets) + j);
/* optimizations should not break the invariants */
CV_Assert(inShape.size() == outShape.size());
CV_Assert(inShape.size() == offsets.size());
CV_Assert(inShape[i] == outShape[i]);
CV_Assert(offsets[i] == 0);
}
}
}
auto rank = inShape.size();
/* We can do a copy if the reduced rank is two and only the first axis is sliced.
* The general requirement is that only one axis is sliced and all the axes that
     * precede the sliced axis are singleton. However, the reductions above will remove
* all the leading singleton axes and merge the trailing unsliced axes into one, or
* zero if there are no trailing unsliced axes. The latter is handled separately.
*/
if (rank == 2 && offsets[0] != 0 && offsets[1] == 0)
{
auto stride = inShape[1];
auto sliced_input = View<T>(input.get() + offsets[0] * stride, output.size());
kernels::copy<T>(stream, output, sliced_input);
return;
}
if (rank == 1)
{
auto sliced_input = View<T>(input.get() + offsets[0], output.size());
kernels::copy<T>(stream, output, sliced_input);
return;
}
std::vector<std::size_t> inStride(rank), outStride(rank);
inStride.back() = 1;
outStride.back() = 1;
/* garbage, ..., garbage, 1 */
std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
/* dim[0], dim[1], ..., dim[-1], 1 */
std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>());
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
/* stride[0], stride[1], ..., stride[-2], 1 */
CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
slice_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, input, inStride, offsets);
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void slice(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
#endif
template void slice(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
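The index arithmetic in raw::slice (decompose the flat output index with the output strides, add the per-axis offsets, recompose with the input strides) is shown host-side below with assumed strides and offsets.

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    const std::vector<std::size_t> out_stride { 4, 2, 1 };    // output shape 2x2x2 (assumed)
    const std::vector<std::size_t> in_stride  { 12, 4, 1 };   // input shape 2x3x4 (assumed)
    const std::vector<std::size_t> in_offset  { 0, 1, 2 };    // slice starts at [0, 1, 2] (assumed)

    const std::size_t i = 5;                                   // flat index into the output
    std::size_t out_index = i / out_stride[0];
    std::size_t iidx = (in_offset[0] + out_index) * in_stride[0];
    for (std::size_t j = 1; j < out_stride.size(); j++) {
        out_index = (i % out_stride[j - 1]) / out_stride[j];   // coordinate along axis j
        iidx += (in_offset[j] + out_index) * in_stride[j];
    }
    std::cout << iidx << '\n';                                 // 19: input flat index for output element 5
    return 0;
}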

View File

@@ -0,0 +1,27 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_TYPES_HPP
#define OPENCV_DNN_SRC_CUDA_TYPES_HPP
#include <cstdint>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
    /* For indices, we can use 32-bit or 64-bit variables. The GPU registers are 32 bits in size.
     * Hence, a 64-bit variable requires two registers and is significantly slower than the 32-bit version.
*
* If we do not need to handle huge tensors, we can use 32-bit indices and get better performance.
*/
#ifdef __CUDACC__
using size_type = int;
using index_type = int;
#else
using size_type = std::int32_t;
using index_type = std::int32_t;
#endif
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
#endif /* OPENCV_DNN_SRC_CUDA_TYPES_HPP */

View File

@@ -0,0 +1,120 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP
#define OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP
#include <cuda_runtime.h>
#include "types.hpp"
#include "memory.hpp"
#include "../cuda4dnn/csl/pointer.hpp"
#include <type_traits>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
/** \file vector_traits.hpp
* \brief utility classes and functions for vectorized memory loads/stores
*
* Example:
* using vector_type = get_vector_type_t<float, 4>;
*
* auto input_vPtr = type::get_pointer(iptr); // iptr is of type DevicePtr<const float>
* auto output_vPtr = type::get_pointer(optr); // optr is of type DevicePtr<float>
*
* vector_type vec;
* v_load(vec, input_vPtr);
*
* for(int i = 0; i < vector_type::size(); i++)
* vec[i] = do_something(vec[i]);
*
* v_store(output_vPtr, vec);
*/
namespace detail {
template <size_type N> struct raw_type_ { };
template <> struct raw_type_<256> { typedef ulonglong4 type; };
template <> struct raw_type_<128> { typedef uint4 type; };
template <> struct raw_type_<64> { typedef uint2 type; };
template <> struct raw_type_<32> { typedef uint1 type; };
template <> struct raw_type_<16> { typedef uchar2 type; };
template <> struct raw_type_<8> { typedef uchar1 type; };
template <size_type N> struct raw_type {
using type = typename raw_type_<N>::type;
static_assert(sizeof(type) * 8 == N, "");
};
}
/* \tparam T type of element in the vector
* \tparam N "number of elements" of type T in the vector
*/
template <class T, size_type N>
union vector_type {
using value_type = T;
using raw_type = typename detail::raw_type<N * sizeof(T) * 8>::type;
__device__ vector_type() { }
__device__ static constexpr size_type size() { return N; }
raw_type raw;
T data[N];
template <class U> static __device__
typename std::enable_if<std::is_const<U>::value, const vector_type*>
::type get_pointer(csl::DevicePtr<U> ptr) {
return reinterpret_cast<const vector_type*>(ptr.get());
}
template <class U> static __device__
typename std::enable_if<!std::is_const<U>::value, vector_type*>
::type get_pointer(csl::DevicePtr<U> ptr) {
return reinterpret_cast<vector_type*>(ptr.get());
}
};
template <class V>
__device__ void v_load(V& dest, const V& src) {
dest.raw = src.raw;
}
template <class V>
__device__ void v_load(V& dest, const V* src) {
dest.raw = src->raw;
}
template <class V>
__device__ void v_load_ldg(V& dest, const V& src) {
dest.raw = load_ldg(src.raw);
}
template <class V>
__device__ void v_load_ldg(V& dest, const V* src) {
dest.raw = load_ldg(src->raw);
}
template <class V>
__device__ void v_store(V* dest, const V& src) {
dest->raw = src.raw;
}
template <class V>
__device__ void v_store(V& dest, const V& src) {
dest.raw = src.raw;
}
template <class T, size_type N>
struct get_vector_type {
typedef vector_type<T, N> type;
};
template <class T, size_type N>
using get_vector_type_t = typename get_vector_type<T, N>::type;
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
#endif /* OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP */
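Putting the documented pattern together, a complete (hypothetical) kernel that doubles every element could look like the sketch below; it assumes the same csl Span/View types, grid_stride_range helper and using-directives as the .cu files above, and the launch side would still need the usual is_fully_aligned checks before picking N.

// hypothetical kernel, shown only to illustrate the v_load / v_store pattern from this header
template <class T, std::size_t N>
__global__ void double_elements(Span<T> output, View<T> input)
{
    using vector_type = get_vector_type_t<T, N>;               // e.g. four floats moved as one 128-bit word
    auto output_vPtr = vector_type::get_pointer(output.data());
    auto input_vPtr = vector_type::get_pointer(input.data());
    for (auto i : grid_stride_range(output.size() / vector_type::size())) {
        vector_type vec;
        v_load(vec, input_vPtr[i]);                             /* one vectorized read */
        for (int j = 0; j < vector_type::size(); j++)
            vec.data[j] = vec.data[j] + vec.data[j];            /* element-wise work in registers */
        v_store(output_vPtr[i], vec);                           /* one vectorized write */
    }
}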

View File

@@ -0,0 +1,368 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP
#include "error.hpp"
#include "stream.hpp"
#include "pointer.hpp"
#include <opencv2/core.hpp>
#include <cublas_v2.h>
#include <cstddef>
#include <memory>
#include <utility>
#define CUDA4DNN_CHECK_CUBLAS(call) \
::cv::dnn::cuda4dnn::csl::cublas::detail::check((call), CV_Func, __FILE__, __LINE__)
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cublas {
/** @brief exception class for errors thrown by the cuBLAS API */
class cuBLASException : public CUDAException {
public:
using CUDAException::CUDAException;
};
namespace detail {
static void check(cublasStatus_t status, const char* func, const char* file, int line) {
auto cublasGetErrorString = [](cublasStatus_t err) {
switch (err) {
case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";
}
return "UNKNOWN_CUBLAS_ERROR";
};
if (status != CUBLAS_STATUS_SUCCESS)
throw cuBLASException(Error::GpuApiCallError, cublasGetErrorString(status), func, file, line);
}
}
/** non-copyable cuBLAS smart handle
*
 * UniqueHandle is a smart, non-sharable wrapper for a cuBLAS handle which ensures that the handle
* is destroyed after use. The handle must always be associated with a non-default stream. The stream
* must be specified during construction.
*
 * Refer to the stream API for more information on the choice of forcing non-default streams.
*/
class UniqueHandle {
public:
UniqueHandle() noexcept : handle{ nullptr } { }
UniqueHandle(UniqueHandle&) = delete;
UniqueHandle(UniqueHandle&& other) noexcept {
stream = std::move(other.stream);
handle = other.handle;
other.handle = nullptr;
}
/** creates a cuBLAS handle and associates it with the stream specified
*
* Exception Guarantee: Basic
*/
UniqueHandle(Stream strm) : stream(std::move(strm)) {
CV_Assert(stream);
CUDA4DNN_CHECK_CUBLAS(cublasCreate(&handle));
try {
CUDA4DNN_CHECK_CUBLAS(cublasSetStream(handle, stream.get()));
} catch (...) {
/* cublasDestroy won't throw if a valid handle is passed */
CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle));
throw;
}
}
~UniqueHandle() noexcept {
if (handle) {
/* cublasDestroy won't throw if a valid handle is passed */
CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle));
}
}
UniqueHandle& operator=(const UniqueHandle&) = delete;
UniqueHandle& operator=(UniqueHandle&& other) noexcept {
CV_Assert(other);
if (&other != this) {
UniqueHandle(std::move(*this)); /* destroy current handle */
stream = std::move(other.stream);
handle = other.handle;
other.handle = nullptr;
}
return *this;
}
/** returns the raw cuBLAS handle */
cublasHandle_t get() const noexcept {
CV_Assert(handle);
return handle;
}
/** returns true if the handle is valid */
explicit operator bool() const noexcept { return static_cast<bool>(handle); }
private:
Stream stream;
cublasHandle_t handle;
};
/** @brief sharable cuBLAS smart handle
*
 * Handle is a smart, sharable wrapper for a cuBLAS handle which ensures that the handle
* is destroyed after all references to the handle are destroyed. The handle must always
* be associated with a non-default stream. The stream must be specified during construction.
*
* @note Moving a Handle object to another invalidates the former
*/
class Handle {
public:
Handle() = default;
Handle(const Handle&) = default;
Handle(Handle&&) = default;
/** creates a cuBLAS handle and associates it with the stream specified
*
* Exception Guarantee: Basic
*/
Handle(Stream strm) : handle(std::make_shared<UniqueHandle>(std::move(strm))) { }
Handle& operator=(const Handle&) = default;
Handle& operator=(Handle&&) = default;
/** returns true if the handle is valid */
explicit operator bool() const noexcept { return static_cast<bool>(handle); }
/** returns the raw cuBLAS handle */
cublasHandle_t get() const noexcept {
CV_Assert(handle);
return handle->get();
}
private:
std::shared_ptr<UniqueHandle> handle;
};
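As a rough usage sketch (not part of this file; the Stream(bool) construction and the variable names are assumptions based on the surrounding CSL code), a handle is created once against a non-default stream and afterwards shared cheaply by copy:
/* sketch: bind a cuBLAS handle to a freshly created non-default stream */
csl::Stream stream(true);             /* assumed to create a new CUDA stream */
csl::cublas::Handle handle(stream);   /* cublasCreate + cublasSetStream under the hood */
CV_Assert(handle);                    /* handle.get() can now be passed to cuBLAS calls */
csl::cublas::Handle shared = handle;  /* shared ownership; destroyed with the last reference */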
/** @brief GEMM for column-major matrices
*
* \f$ C = \alpha AB + \beta C \f$
*
* @tparam T matrix element type (must be `half` or `float`)
*
* @param handle valid cuBLAS Handle
* @param transa use transposed matrix of A for computation
* @param transb use transposed matrix of B for computation
* @param rows_c number of rows in C
* @param cols_c number of columns in C
* @param common_dim common dimension of A (or trans A) and B (or trans B)
* @param alpha scale factor for AB
* @param[in] A pointer to column-major matrix A in device memory
* @param lda leading dimension of matrix A
* @param[in] B pointer to column-major matrix B in device memory
* @param ldb leading dimension of matrix B
* @param beta scale factor for C
* @param[in,out] C pointer to column-major matrix C in device memory
* @param ldc leading dimension of matrix C
*
* Exception Guarantee: Basic
*/
template <class T>
void gemm(const Handle& handle,
bool transa, bool transb,
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
T alpha, const DevicePtr<const T> A, std::size_t lda,
const DevicePtr<const T> B, std::size_t ldb,
T beta, const DevicePtr<T> C, std::size_t ldc);
template <> inline
void gemm<half>(const Handle& handle,
bool transa, bool transb,
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
half alpha, const DevicePtr<const half> A, std::size_t lda,
const DevicePtr<const half> B, std::size_t ldb,
half beta, const DevicePtr<half> C, std::size_t ldc)
{
CV_Assert(handle);
auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
int irows_c = static_cast<int>(rows_c),
icols_c = static_cast<int>(cols_c),
icommon_dim = static_cast<int>(common_dim),
ilda = static_cast<int>(lda),
ildb = static_cast<int>(ldb),
ildc = static_cast<int>(ldc);
CUDA4DNN_CHECK_CUBLAS(
cublasHgemm(
handle.get(),
opa, opb,
irows_c, icols_c, icommon_dim,
&alpha, A.get(), ilda,
B.get(), ildb,
&beta, C.get(), ildc
)
);
}
template <> inline
void gemm<float>(const Handle& handle,
bool transa, bool transb,
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
float alpha, const DevicePtr<const float> A, std::size_t lda,
const DevicePtr<const float> B, std::size_t ldb,
float beta, const DevicePtr<float> C, std::size_t ldc)
{
CV_Assert(handle);
auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
int irows_c = static_cast<int>(rows_c),
icols_c = static_cast<int>(cols_c),
icommon_dim = static_cast<int>(common_dim),
ilda = static_cast<int>(lda),
ildb = static_cast<int>(ldb),
ildc = static_cast<int>(ldc);
CUDA4DNN_CHECK_CUBLAS(
cublasSgemm(
handle.get(),
opa, opb,
irows_c, icols_c, icommon_dim,
&alpha, A.get(), ilda,
B.get(), ildb,
&beta, C.get(), ildc
)
);
}
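A hypothetical call site (matrix sizes and the DevicePtr variables A_ptr, B_ptr as DevicePtr&lt;const float&gt; and C_ptr as DevicePtr&lt;float&gt; are invented for illustration) computing C = AB for column-major matrices with no transposition, reusing the handle from the sketch above:
/* sketch: C (M x N) = A (M x K) * B (K x N), all column-major */
std::size_t M = 64, N = 32, K = 128;
csl::cublas::gemm<float>(handle,
    false, false,        /* transa, transb */
    M, N, K,             /* rows_c, cols_c, common_dim */
    1.0f, A_ptr, M,      /* alpha, A, lda = rows of A */
          B_ptr, K,      /* B, ldb = rows of B */
    0.0f, C_ptr, M);     /* beta, C, ldc = rows of C */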
/** @brief Strided batched GEMM for column-major matrices
*
* \f$ C_i = \alpha A_i B_i + \beta C_i \f$ for a stack of matrices A, B and C indexed by i
*
* @tparam T matrix element type (must be `half` or `float`)
*
* @param handle valid cuBLAS Handle
* @param transa use transposed matrix of A_i for computation
* @param transb use transposed matrix of B_i for computation
* @param rows_c number of rows in C_i
* @param cols_c number of columns in C_i
* @param common_dim common dimension of A_i (or trans A_i) and B_i (or trans B_i)
* @param alpha scale factor for A_i B_i
* @param[in] A pointer to stack of column-major matrices A in device memory
* @param lda leading dimension of matrix A_i
* @param strideA stride between matrices in A
* @param[in] B pointer to stack of column-major matrices B in device memory
* @param ldb leading dimension of matrix B_i
* @param strideB stride between matrices in B
* @param beta scale factor for C_i
* @param[in,out] C pointer to stack of column-major matrices C in device memory
* @param ldc leading dimension of matrix C_i
* @param strideC stride between matrices in C
* @param batchCount number of matrices in the batch
*
* Exception Guarantee: Basic
*/
template <class T>
void gemmStridedBatched(const Handle& handle,
bool transa, bool transb,
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
T alpha, const DevicePtr<const T> A, std::size_t lda, std::size_t strideA,
const DevicePtr<const T> B, std::size_t ldb, std::size_t strideB,
T beta, const DevicePtr<T> C, std::size_t ldc, std::size_t strideC,
std::size_t batchCount);
template <> inline
void gemmStridedBatched<half>(const Handle& handle,
bool transa, bool transb,
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
half alpha, const DevicePtr<const half> A, std::size_t lda, std::size_t strideA,
const DevicePtr<const half> B, std::size_t ldb, std::size_t strideB,
half beta, const DevicePtr<half> C, std::size_t ldc, std::size_t strideC,
std::size_t batchCount)
{
CV_Assert(handle);
const auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
const auto irows_c = static_cast<int>(rows_c),
icols_c = static_cast<int>(cols_c),
icommon_dim = static_cast<int>(common_dim),
ilda = static_cast<int>(lda),
ildb = static_cast<int>(ldb),
ildc = static_cast<int>(ldc);
const auto batch_count = static_cast<int>(batchCount);
const auto stride_a = static_cast<long long int>(strideA),
stride_b = static_cast<long long int>(strideB),
stride_c = static_cast<long long int>(strideC);
CV_Assert(stride_c >= irows_c * icols_c); // output matrices must not overlap
CUDA4DNN_CHECK_CUBLAS(
cublasHgemmStridedBatched(
handle.get(),
opa, opb,
irows_c, icols_c, icommon_dim,
&alpha, A.get(), ilda, stride_a,
B.get(), ildb, stride_b,
&beta, C.get(), ildc, stride_c,
batch_count
)
);
}
template <> inline
void gemmStridedBatched<float>(const Handle& handle,
bool transa, bool transb,
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
float alpha, const DevicePtr<const float> A, std::size_t lda, std::size_t strideA,
const DevicePtr<const float> B, std::size_t ldb, std::size_t strideB,
float beta, const DevicePtr<float> C, std::size_t ldc, std::size_t strideC,
std::size_t batchCount)
{
CV_Assert(handle);
const auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
const auto irows_c = static_cast<int>(rows_c),
icols_c = static_cast<int>(cols_c),
icommon_dim = static_cast<int>(common_dim),
ilda = static_cast<int>(lda),
ildb = static_cast<int>(ldb),
ildc = static_cast<int>(ldc);
const auto batch_count = static_cast<int>(batchCount);
const auto stride_a = static_cast<long long int>(strideA),
stride_b = static_cast<long long int>(strideB),
stride_c = static_cast<long long int>(strideC);
CV_Assert(stride_c >= irows_c * icols_c); // output matrices must not overlap
CUDA4DNN_CHECK_CUBLAS(
cublasSgemmStridedBatched(
handle.get(),
opa, opb,
irows_c, icols_c, icommon_dim,
&alpha, A.get(), ilda, stride_a,
B.get(), ildb, stride_b,
&beta, C.get(), ildc, stride_c,
batch_count
)
);
}
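And the corresponding sketch for the batched variant, with the matrices of each operand packed back-to-back in device memory (pointer names again assumed for illustration):
/* sketch: `batch` independent products C_i (M x N) = A_i (M x K) * B_i (K x N) */
std::size_t M = 16, N = 16, K = 16, batch = 8;
csl::cublas::gemmStridedBatched<float>(handle,
    false, false,
    M, N, K,
    1.0f, A_ptr, M, M * K,   /* strideA: elements between consecutive A_i */
          B_ptr, K, K * N,   /* strideB */
    0.0f, C_ptr, M, M * N,   /* strideC >= rows_c * cols_c, per the assert above */
    batch);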
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cublas */
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP */

Some files were not shown because too many files have changed in this diff.