feat: switch backend to PaddleOCR-NCNN, switch the project to CMake
1. The project backend has been fully migrated to the PaddleOCR-NCNN pipeline and has passed basic compatibility tests.
2. The project is now organized with CMake; to better accommodate third-party libraries, a QMake project will no longer be provided.
3. The license/attribution files and the code tree have been reorganized to minimize the risk of infringement.

Log: switch backend to PaddleOCR-NCNN, switch the project to CMake
Change-Id: I4d5d2c5d37505a4a24b389b1a4c5d12f17bfa38c
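For orientation, here is a minimal sketch of what driving a PaddleOCR detection model through ncnn typically looks like. The model file names, blob names, and normalization constants below are hypothetical placeholders, not taken from this repository; treat it as an illustration of the ncnn API rather than the project's actual inference code.

#include <opencv2/core.hpp>
#include <net.h>   // ncnn; header location depends on how ncnn is installed

// Hypothetical sketch: file names and blob names are placeholders.
ncnn::Mat run_det_model(const cv::Mat& bgr)
{
    ncnn::Net det;
    det.load_param("ppocr_det.param");   // assumed file names
    det.load_model("ppocr_det.bin");

    ncnn::Mat in = ncnn::Mat::from_pixels(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows);
    const float mean[3] = {123.675f, 116.28f, 103.53f};
    const float norm[3] = {1 / 58.395f, 1 / 57.12f, 1 / 57.375f};
    in.substract_mean_normalize(mean, norm);   // ncnn's spelling of this API

    ncnn::Extractor ex = det.create_extractor();
    ex.input("input", in);                     // blob names are placeholders
    ncnn::Mat prob_map;
    ex.extract("output", prob_map);            // DB-style probability map, post-processed elsewhere
    return prob_map;
}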
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/activation_eltwise.cu (vendored, new file, 121 lines added)
@@ -0,0 +1,121 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "functors.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

namespace raw {

    template <class T, class ActivationOp, class EltwiseOp, std::size_t N>
    __global__ void generic_op_eltwise_op_inplace_vec(Span<T> inplace_output, View<T> eltwise, const typename ActivationOp::Params act_params, const typename EltwiseOp::Params eltwise_params) {
        using vector_type = get_vector_type_t<T, N>;

        auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());
        auto eltwise_vPtr = vector_type::get_pointer(eltwise.data());

        ActivationOp activation_op(act_params);
        EltwiseOp eltwise_op(eltwise_params);

        for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
            vector_type output_vec, eltwise_vec;
            v_load(output_vec, inplace_output_vPtr[i]);
            v_load(eltwise_vec, eltwise_vPtr[i]);
            for(int j = 0; j < output_vec.size(); j++)
                output_vec.data[j] = eltwise_op(activation_op(output_vec.data[j]), eltwise_vec.data[j]);
            v_store(inplace_output_vPtr[i], output_vec);
        }
    }
}

template <class T, class ActivationOp, class EltwiseOp, std::size_t N> static
void launch_vectorized_generic_op_eltwise_op_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, const typename ActivationOp::Params& act_params, const typename EltwiseOp::Params& eltwise_params) {
    CV_Assert(is_fully_aligned<T>(inplace_output, N));
    CV_Assert(is_fully_aligned<T>(eltwise, N));

    auto kernel = raw::generic_op_eltwise_op_inplace_vec<T, ActivationOp, EltwiseOp, N>;
    auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
    launch_kernel(kernel, policy, inplace_output, eltwise, act_params, eltwise_params);
}

template <class T, class ActivationOp, class EltwiseOp> static
void generic_op_eltwise_op_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, const typename ActivationOp::Params& act_params = {}, const typename EltwiseOp::Params& eltwise_params = {}) {
    CV_Assert(inplace_output.size() == eltwise.size());

    if (is_fully_aligned<T>(inplace_output, 4) && is_fully_aligned<T>(eltwise, 4)) {
        launch_vectorized_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 4>(stream, inplace_output, eltwise, act_params, eltwise_params);
    } else if (is_fully_aligned<T>(inplace_output, 2) && is_fully_aligned<T>(eltwise, 2)) {
        launch_vectorized_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 2>(stream, inplace_output, eltwise, act_params, eltwise_params);
    } else {
        launch_vectorized_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 1>(stream, inplace_output, eltwise, act_params, eltwise_params);
    }
}

template <class T>
void relu_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, T slope) {
    generic_op_eltwise_op_inplace<T, ReLUFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise, {slope});
}

template <class T>
void clipped_relu_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, T floor, T ceiling) {
    CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
    generic_op_eltwise_op_inplace<T, ClippedReLUFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise, {floor, ceiling});
}

template <class T>
void tanh_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
    generic_op_eltwise_op_inplace<T, TanHFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}

template <class T>
void swish_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
    generic_op_eltwise_op_inplace<T, SwishFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}

template <class T>
void mish_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
    generic_op_eltwise_op_inplace<T, MishFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}

template <class T>
void sigmoid_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
    generic_op_eltwise_op_inplace<T, SigmoidFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}

template <class T>
void power_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, T exp, T scale, T shift) {
    generic_op_eltwise_op_inplace<T, PowerFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise, {exp, scale, shift});
}

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>, __half);
template void clipped_relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
template void tanh_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void swish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void mish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void sigmoid_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void power_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half);
#endif

template void relu_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>, float);
template void clipped_relu_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>, float, float);
template void tanh_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void swish_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void mish_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void sigmoid_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void power_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>, float, float, float);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
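The vendored file above fuses an element-wise activation with an element-wise binary op in a single in-place pass, using vectorized loads and stores plus a grid-stride loop. As a plain-CUDA illustration of the same idea (a sketch with no OpenCV CSL types, fixed to float4 and leaky-ReLU + sum; not the code above):

#include <cuda_runtime.h>

// Fused "activation then eltwise sum" in place, one float4 per loop iteration.
__global__ void relu_add_inplace_vec4(float4* inout, const float4* eltwise, size_t n_vec, float slope) {
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n_vec; i += gridDim.x * blockDim.x) {
        float4 a = inout[i];
        float4 b = eltwise[i];
        a.x = (a.x > 0 ? a.x : slope * a.x) + b.x;
        a.y = (a.y > 0 ? a.y : slope * a.y) + b.y;
        a.z = (a.z > 0 ? a.z : slope * a.z) + b.z;
        a.w = (a.w > 0 ? a.w : slope * a.w) + b.w;
        inout[i] = a;   // one 128-bit store per iteration
    }
}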
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/activations.cu (vendored, new file, 209 lines added)
@@ -0,0 +1,209 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "functors.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"

#include "../cuda4dnn/kernels/scale_shift.hpp"

#include <opencv2/core.hpp>

#include <cstddef>

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

namespace raw {
    template <class T, class ActivationOp, std::size_t N>
    __global__ void generic_op_vec(Span<T> output, View<T> input, const typename ActivationOp::Params params) {
        using vector_type = get_vector_type_t<T, N>;

        auto output_vPtr = vector_type::get_pointer(output.data());
        auto input_vPtr = vector_type::get_pointer(input.data());

        ActivationOp activation_op(params);

        for (auto i : grid_stride_range(output.size() / vector_type::size())) {
            vector_type vec;
            v_load(vec, input_vPtr[i]);
            for (int j = 0; j < vector_type::size(); j++)
                vec.data[j] = activation_op(vec.data[j]);
            v_store(output_vPtr[i], vec);
        }
    }

    template <class T, std::size_t N>
    __global__ void axiswise_relu_vec(Span<T> output, View<T> input, size_type inner_size, View<T> slope) {
        using vector_type = get_vector_type_t<T, N>;

        auto output_vPtr = vector_type::get_pointer(output.data());
        auto input_vPtr = vector_type::get_pointer(input.data());

        for (auto i : grid_stride_range(output.size() / vector_type::size())) {
            const index_type c = (i / inner_size) % slope.size();

            vector_type vec;
            v_load(vec, input_vPtr[i]);
            for (int j = 0; j < vector_type::size(); j++)
                vec.data[j] = vec.data[j] > T(0) ? vec.data[j] : vec.data[j] * slope[c];
            v_store(output_vPtr[i], vec);
        }
    }

} /* namespace raw */

template <class T, class ActivationOp, std::size_t N> static
void launch_vectorized_generic_op(const Stream& stream, Span<T> output, View<T> input, const typename ActivationOp::Params& params) {
    CV_Assert(is_fully_aligned<T>(output, N));
    CV_Assert(is_fully_aligned<T>(input, N));

    auto kernel = raw::generic_op_vec<T, ActivationOp, N>;
    auto policy = make_policy(kernel, output.size() / N, 0, stream);
    launch_kernel(kernel, policy, output, input, params);
}

template <class T, class ActivationOp> static
void generic_op(const Stream& stream, Span<T> output, View<T> input, const typename ActivationOp::Params& params = {}) {
    CV_Assert(input.size() == output.size());

    if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
        launch_vectorized_generic_op<T, ActivationOp, 4>(stream, output, input, params);
    } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
        launch_vectorized_generic_op<T, ActivationOp, 2>(stream, output, input, params);
    } else {
        launch_vectorized_generic_op<T, ActivationOp, 1>(stream, output, input, params);
    }
}

template <class T>
void relu(const Stream& stream, Span<T> output, View<T> input, T slope) {
    generic_op<T, ReLUFunctor<T>>(stream, output, input, {slope});
}

template <class T>
void clipped_relu(const Stream& stream, Span<T> output, View<T> input, T floor, T ceiling) {
    CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
    generic_op<T, ClippedReLUFunctor<T>>(stream, output, input, {floor, ceiling});
}

template <class T>
void tanh(const Stream& stream, Span<T> output, View<T> input) {
    generic_op<T, TanHFunctor<T>>(stream, output, input);
}

template <class T>
void swish(const Stream& stream, Span<T> output, View<T> input) {
    generic_op<T, SwishFunctor<T>>(stream, output, input);
}

template <class T>
void mish(const Stream& stream, Span<T> output, View<T> input) {
    generic_op<T, MishFunctor<T>>(stream, output, input);
}

template <class T>
void sigmoid(const Stream& stream, Span<T> output, View<T> input) {
    generic_op<T, SigmoidFunctor<T>>(stream, output, input);
}

template <class T>
void elu(const Stream& stream, Span<T> output, View<T> input) {
    generic_op<T, ELUFunctor<T>>(stream, output, input);
}

template <class T>
void bnll(const Stream& stream, Span<T> output, View<T> input) {
    generic_op<T, BNLLFunctor<T>>(stream, output, input);
}

template <class T>
void abs(const Stream& stream, Span<T> output, View<T> input) {
    generic_op<T, AbsFunctor<T>>(stream, output, input);
}

template <class T>
void power(const Stream& stream, Span<T> output, View<T> input, T exp, T scale, T shift) {
    CV_Assert(input.size() == output.size());

    if (static_cast<float>(exp) == 1.0f) {
        scale1_with_bias1(stream, output, input, scale, shift);
        return;
    }

    generic_op<T, PowerFunctor<T>>(stream, output, input, {exp, scale, shift});
}

template <class T>
void exp(const Stream& stream, Span<T> output, View<T> input, T normScale, T normShift) {
    generic_op<T, ExpFunctor<T>>(stream, output, input, {normScale, normShift});
}

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void relu<__half>(const Stream&, Span<__half>, View<__half>, __half);
template void clipped_relu<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
template void tanh<__half>(const Stream&, Span<__half>, View<__half>);
template void swish<__half>(const Stream&, Span<__half>, View<__half>);
template void mish<__half>(const Stream&, Span<__half>, View<__half>);
template void sigmoid<__half>(const Stream&, Span<__half>, View<__half>);
template void elu<__half>(const Stream&, Span<__half>, View<__half>);
template void abs<__half>(const Stream& stream, Span<__half> output, View<__half> input);
template void bnll<__half>(const Stream&, Span<__half>, View<__half>);
template void power<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half);
template void exp<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
#endif

template void relu<float>(const Stream&, Span<float>, View<float>, float);
template void clipped_relu<float>(const Stream&, Span<float>, View<float>, float, float);
template void tanh<float>(const Stream&, Span<float>, View<float>);
template void swish<float>(const Stream&, Span<float>, View<float>);
template void mish<float>(const Stream&, Span<float>, View<float>);
template void sigmoid<float>(const Stream&, Span<float>, View<float>);
template void elu<float>(const Stream&, Span<float>, View<float>);
template void abs<float>(const Stream& stream, Span<float> output, View<float> input);
template void bnll<float>(const Stream&, Span<float>, View<float>);
template void power<float>(const Stream&, Span<float>, View<float>, float, float, float);
template void exp<float>(const Stream&, Span<float>, View<float>, float, float);

template <class T, std::size_t N> static
void launch_vectorized_axiswise_relu(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> slope) {
    CV_Assert(is_fully_aligned<T>(output, N));
    CV_Assert(is_fully_aligned<T>(input, N));
    CV_Assert(inner_size % N == 0);

    auto kernel = raw::axiswise_relu_vec<T, N>;
    auto policy = make_policy(kernel, output.size() / N, 0, stream);
    launch_kernel(kernel, policy, output, input, inner_size / N, slope);
}

template <class T>
void axiswise_relu(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> slope) {
    CV_Assert(input.size() == output.size());

    if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
        launch_vectorized_axiswise_relu<T, 4>(stream, output, input, inner_size, slope);
    } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
        launch_vectorized_axiswise_relu<T, 2>(stream, output, input, inner_size, slope);
    } else {
        launch_vectorized_axiswise_relu<T, 1>(stream, output, input, inner_size, slope);
    }
}

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void axiswise_relu<__half>(const Stream&, Span<__half>, View<__half>, std::size_t, View<__half>);
#endif
template void axiswise_relu<float>(const Stream&, Span<float>, View<float>, std::size_t, View<float>);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
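A recurring pattern in these files is the alignment-based dispatch in generic_op(): pick the widest vector width (4, 2, or 1 elements) that both buffers can support, then instantiate the kernel for that width. A minimal host-side sketch of the same decision, using a stand-in alignment check for raw pointers (is_fully_aligned in the vendored code checks pointer and size alignment of the CSL spans; this helper is an assumption, not that API):

#include <cstddef>
#include <cstdint>

template <class T>
static bool aligned_for(const T* p, std::size_t size, std::size_t n) {
    // both the address and the element count must be multiples of n elements
    return reinterpret_cast<std::uintptr_t>(p) % (n * sizeof(T)) == 0 && size % n == 0;
}

template <class T>
int pick_vector_width(const T* out, const T* in, std::size_t size) {
    if (aligned_for(out, size, 4) && aligned_for(in, size, 4)) return 4;
    if (aligned_for(out, size, 2) && aligned_for(in, size, 2)) return 2;
    return 1;   // scalar fallback always works
}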
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/array.hpp (vendored, new file, 73 lines added)
@@ -0,0 +1,73 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA_ARRAY_HPP
#define OPENCV_DNN_SRC_CUDA_ARRAY_HPP

#include <cuda_runtime.h>

#include "types.hpp"

#include <cstddef>
#include <type_traits>
#include <iterator>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {

    template <class T, std::size_t N>
    struct array {
        using value_type = T;
        using size_type = device::size_type;
        using difference_type = std::ptrdiff_t;
        using reference = typename std::add_lvalue_reference<value_type>::type;
        using const_reference = typename std::add_lvalue_reference<typename std::add_const<value_type>::type>::type;
        using pointer = typename std::add_pointer<value_type>::type;
        using const_pointer = typename std::add_pointer<typename std::add_const<value_type>::type>::type;
        using iterator = pointer;
        using const_iterator = const_pointer;
        using reverse_iterator = std::reverse_iterator<iterator>;
        using const_reverse_iterator = std::reverse_iterator<const_iterator>;

        __host__ __device__ bool empty() const noexcept { return N == 0; }
        __host__ __device__ size_type size() const noexcept { return N; }

        __host__ __device__ iterator begin() noexcept { return ptr; }
        __host__ __device__ iterator end() noexcept { return ptr + N; }
        __host__ __device__ const_iterator begin() const noexcept { return ptr; }
        __host__ __device__ const_iterator end() const noexcept { return ptr + N; }

        __host__ __device__ const_iterator cbegin() const noexcept { return ptr; }
        __host__ __device__ const_iterator cend() const noexcept { return ptr + N; }

        __host__ __device__ reverse_iterator rbegin() noexcept { return ptr + N; }
        __host__ __device__ reverse_iterator rend() noexcept { return ptr; }
        __host__ __device__ const_reverse_iterator rbegin() const noexcept { return ptr + N; }
        __host__ __device__ const_reverse_iterator rend() const noexcept { return ptr; }

        __host__ __device__ const_reverse_iterator crbegin() const noexcept { return ptr + N; }
        __host__ __device__ const_reverse_iterator crend() const noexcept { return ptr; }

        template <class InputItr>
        __host__ void assign(InputItr first, InputItr last) {
            std::copy(first, last, std::begin(ptr));
        }

        __host__ __device__ reference operator[](int idx) { return ptr[idx]; }
        __host__ __device__ const_reference operator[](int idx) const { return ptr[idx]; }

        __host__ __device__ reference front() { return ptr[0]; }
        __host__ __device__ const_reference front() const { return ptr[0]; }

        __host__ __device__ reference back() { return ptr[N - 1]; }
        __host__ __device__ const_reference back() const { return ptr[N - 1]; }

        __host__ __device__ pointer data() noexcept { return ptr; }
        __host__ __device__ const_pointer data() const noexcept { return ptr; }

        T ptr[N];
    };

}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */

#endif /* OPENCV_DNN_SRC_CUDA_ARRAY_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/atomics.hpp (vendored, new file, 38 lines added)
@@ -0,0 +1,38 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA_ATOMICS_HPP
#define OPENCV_DNN_SRC_CUDA_ATOMICS_HPP

#include <cuda_runtime.h>
#include <cuda_fp16.h>

// The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
// This function was introduced in CUDA 10.
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicadd
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700 && CUDART_VERSION >= 10000)
// And half-precision floating-point operations are not supported by devices of compute capability strictly lower than 5.3
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
#elif __CUDA_ARCH__ < 530
#else
inline __device__ void atomicAdd(__half* address, __half val) {
    unsigned int* address_as_ui = (unsigned int *)((char *)address - ((size_t)address & 2));
    unsigned int old = *address_as_ui;
    unsigned int assumed;

    do {
        assumed = old;

        __half_raw hsum;
        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
        __half tmpres = hsum + val;
        hsum = __half_raw(tmpres);

        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
        old = atomicCAS(address_as_ui, assumed, old);
    } while (assumed != old);
}
#endif

#endif /* OPENCV_DNN_SRC_CUDA_ATOMICS_HPP */
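atomics.hpp emulates a missing hardware atomic by retrying atomicCAS on the 32-bit word that contains the __half until no other thread has raced the update. The same retry pattern is easier to see on a simpler case, a float atomicMax (which CUDA also lacks natively); this is an illustrative sketch, not part of the vendored header:

__device__ float atomicMaxFloat(float* address, float val) {
    int* address_as_int = reinterpret_cast<int*>(address);
    int old = *address_as_int, assumed;
    do {
        assumed = old;
        float current = __int_as_float(assumed);
        if (current >= val) break;                        // nothing to update
        old = atomicCAS(address_as_int, assumed, __float_as_int(val));
    } while (assumed != old);                             // retry if another thread won the race
    return __int_as_float(old);
}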
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/bbox_utils.hpp (vendored, new file, 39 lines added)
@@ -0,0 +1,39 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA_BBOX_UTILS_HPP
#define OPENCV_DNN_SRC_CUDA_BBOX_UTILS_HPP

#include "math.hpp"

#include <cuda_runtime.h>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    struct BoundingBox
    {
        float xmin, ymin, xmax, ymax;
    };

    template <bool NORMALIZED_BBOX>
    __device__ __forceinline__ float compute_bbox_size(BoundingBox bbox)
    {
        float width = bbox.xmax - bbox.xmin;
        float height = bbox.ymax - bbox.ymin;
        if (width < 0 || height < 0)
            return 0.0;

        if (!NORMALIZED_BBOX)
        {
            width += 1;
            height += 1;
        }

        using csl::device::mul_ftz;
        return mul_ftz(width, height);
    }

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA_BBOX_UTILS_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/bias_activation.cu (vendored, new file, 120 lines added)
@@ -0,0 +1,120 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "functors.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

namespace raw {
    template <class T, class ActivationOp, std::size_t N>
    __global__ void biasN_generic_op_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias, const typename ActivationOp::Params params) {
        using vector_type = get_vector_type_t<T, N>;

        auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());

        ActivationOp activation_op(params);

        for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
            const index_type bias_idx = (i / inner_size) % bias.size();

            vector_type vec;
            v_load(vec, inplace_output_vPtr[i]);
            for(int j = 0; j < vec.size(); j++)
                vec.data[j] = activation_op(vec.data[j] + bias[bias_idx]);
            v_store(inplace_output_vPtr[i], vec);
        }
    }

} /* namespace raw */

template <class T, class ActivationOp, std::size_t N> static
void launch_vectorized_biasN_generic_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, const typename ActivationOp::Params& params) {
    CV_Assert(inplace_output.size() % inner_size == 0);
    CV_Assert(is_fully_aligned<T>(inplace_output, N));
    CV_Assert(inner_size % N == 0);

    auto kernel = raw::biasN_generic_op_inplace_vec<T, ActivationOp, N>;
    auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
    launch_kernel(kernel, policy, inplace_output, inner_size / N, bias, params);
}

template <class T, class ActivationOp> static
void biasN_generic_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, const typename ActivationOp::Params& params = {}) {
    if (is_fully_aligned<T>(inplace_output, 4) && inner_size % 4 == 0) {
        launch_vectorized_biasN_generic_op_inplace<T, ActivationOp, 4>(stream, inplace_output, inner_size, bias, params);
    } else if (is_fully_aligned<T>(inplace_output, 2) && inner_size % 2 == 0) {
        launch_vectorized_biasN_generic_op_inplace<T, ActivationOp, 2>(stream, inplace_output, inner_size, bias, params);
    } else {
        launch_vectorized_biasN_generic_op_inplace<T, ActivationOp, 1>(stream, inplace_output, inner_size, bias, params);
    }
}

template <class T>
void biasN_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T slope) {
    biasN_generic_op_inplace<T, ReLUFunctor<T>>(stream, inplace_output, inner_size, bias, {slope});
}

template <class T>
void biasN_clipped_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T floor, T ceil) {
    CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceil));
    biasN_generic_op_inplace<T, ClippedReLUFunctor<T>>(stream, inplace_output, inner_size, bias, {floor, ceil});
}

template <class T>
void biasN_tanh_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
    biasN_generic_op_inplace<T, TanHFunctor<T>>(stream, inplace_output, inner_size, bias);
}

template <class T>
void biasN_swish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
    biasN_generic_op_inplace<T, SwishFunctor<T>>(stream, inplace_output, inner_size, bias);
}

template <class T>
void biasN_mish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
    biasN_generic_op_inplace<T, MishFunctor<T>>(stream, inplace_output, inner_size, bias);
}

template <class T>
void biasN_sigmoid_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
    biasN_generic_op_inplace<T, SigmoidFunctor<T>>(stream, inplace_output, inner_size, bias);
}

template <class T>
void biasN_power_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T power, T scale, T shift) {
    biasN_generic_op_inplace<T, PowerFunctor<T>>(stream, inplace_output, inner_size, bias, {power, scale, shift});
}

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void biasN_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half);
template void biasN_clipped_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half, __half);
template void biasN_tanh_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
template void biasN_swish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
template void biasN_mish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
template void biasN_sigmoid_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
template void biasN_power_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half, __half, __half);
#endif

template void biasN_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, float);
template void biasN_clipped_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, float, float);
template void biasN_tanh_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
template void biasN_swish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
template void biasN_mish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
template void biasN_sigmoid_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
template void biasN_power_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, float, float, float);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/bias_activation_eltwise.cu (vendored, new file, 125 lines added)
@@ -0,0 +1,125 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "functors.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

namespace raw {

    template <class T, class ActivationOp, class EltwiseOp, std::size_t N>
    __global__ void biasN_generic_op_eltwise_op_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias, View<T> eltwise, const typename ActivationOp::Params act_params, const typename EltwiseOp::Params eltwise_params) {
        using vector_type = get_vector_type_t<T, N>;

        auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());
        auto eltwise_vPtr = vector_type::get_pointer(eltwise.data());

        ActivationOp activation_op(act_params);
        EltwiseOp eltwise_op(eltwise_params);

        for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
            const index_type bias_idx = (i / inner_size) % bias.size();

            vector_type output_vec, eltwise_vec;
            v_load(output_vec, inplace_output_vPtr[i]);
            v_load(eltwise_vec, eltwise_vPtr[i]);
            for(int j = 0; j < output_vec.size(); j++)
                output_vec.data[j] = eltwise_op(activation_op(output_vec.data[j] + bias[bias_idx]), eltwise_vec.data[j]);
            v_store(inplace_output_vPtr[i], output_vec);
        }
    }
}

template <class T, class ActivationOp, class EltwiseOp, std::size_t N> static
void launch_vectorized_biasN_generic_op_eltwise_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, const typename ActivationOp::Params& act_params, const typename EltwiseOp::Params& eltwise_params) {
    CV_Assert(is_fully_aligned<T>(inplace_output, N));
    CV_Assert(is_fully_aligned<T>(eltwise, N));
    CV_Assert(inner_size % N == 0);

    auto kernel = raw::biasN_generic_op_eltwise_op_inplace_vec<T, ActivationOp, EltwiseOp, N>;
    auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
    launch_kernel(kernel, policy, inplace_output, inner_size / N, bias, eltwise, act_params, eltwise_params);
}

template <class T, class ActivationOp, class EltwiseOp> static
void biasN_generic_op_eltwise_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, const typename ActivationOp::Params& act_params = {}, const typename EltwiseOp::Params& eltwise_params = {}) {
    CV_Assert(inplace_output.size() == eltwise.size());

    if (is_fully_aligned<T>(inplace_output, 4) && is_fully_aligned<T>(eltwise, 4) && inner_size % 4 == 0) {
        launch_vectorized_biasN_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 4>(stream, inplace_output, inner_size, bias, eltwise, act_params, eltwise_params);
    } else if (is_fully_aligned<T>(inplace_output, 2) && is_fully_aligned<T>(eltwise, 2) && inner_size % 2 == 0) {
        launch_vectorized_biasN_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 2>(stream, inplace_output, inner_size, bias, eltwise, act_params, eltwise_params);
    } else {
        launch_vectorized_biasN_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 1>(stream, inplace_output, inner_size, bias, eltwise, act_params, eltwise_params);
    }
}

template <class T>
void biasN_relu_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T slope) {
    biasN_generic_op_eltwise_op_inplace<T, ReLUFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {slope});
}

template <class T>
void biasN_clipped_relu_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T floor, T ceiling) {
    CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
    biasN_generic_op_eltwise_op_inplace<T, ClippedReLUFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {floor, ceiling});
}

template <class T>
void biasN_tanh_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
    biasN_generic_op_eltwise_op_inplace<T, TanHFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}

template <class T>
void biasN_swish_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
    biasN_generic_op_eltwise_op_inplace<T, SwishFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}

template <class T>
void biasN_mish_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
    biasN_generic_op_eltwise_op_inplace<T, MishFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}

template <class T>
void biasN_sigmoid_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
    biasN_generic_op_eltwise_op_inplace<T, SigmoidFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}

template <class T>
void biasN_power_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T exp, T scale, T shift) {
    biasN_generic_op_eltwise_op_inplace<T, PowerFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {exp, scale, shift});
}

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void biasN_relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half);
template void biasN_clipped_relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half);
template void biasN_tanh_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_swish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_mish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_sigmoid_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_power_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half, __half);
#endif

template void biasN_relu_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float);
template void biasN_clipped_relu_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float, float);
template void biasN_tanh_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_swish_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_mish_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_sigmoid_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_power_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float, float, float);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/bias_eltwise_activation.cu (vendored, new file, 132 lines added)
@@ -0,0 +1,132 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "functors.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

namespace raw {
    template <class T, class EltwiseOp, class ActivationOp, std::size_t N>
    __global__ void biasN_eltwise_op_generic_op_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias, View<T> eltwise, const typename EltwiseOp::Params eltwise_params, const typename ActivationOp::Params act_params) {
        using vector_type = get_vector_type_t<T, N>;

        auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());
        auto eltwise_vPtr = vector_type::get_pointer(eltwise.data());

        EltwiseOp eltwise_op(eltwise_params);
        ActivationOp activation_op(act_params);

        for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
            const index_type bias_idx = (i / inner_size) % bias.size();

            vector_type output_vec, eltwise_vec;
            v_load(output_vec, inplace_output_vPtr[i]);
            v_load(eltwise_vec, eltwise_vPtr[i]);
            for(int j = 0; j < output_vec.size(); j++)
                output_vec.data[j] = activation_op(eltwise_op(output_vec.data[j] + bias[bias_idx], eltwise_vec.data[j]));
            v_store(inplace_output_vPtr[i], output_vec);
        }
    }
}

template <class T, class EltwiseOp, class ActivationOp, std::size_t N> static
void launch_vectorized_biasN_eltwise_op_generic_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, const typename EltwiseOp::Params& eltwise_params, const typename ActivationOp::Params& act_params) {
    CV_Assert(is_fully_aligned<T>(inplace_output, N));
    CV_Assert(inplace_output.size() % bias.size() == 0);
    CV_Assert(is_fully_aligned<T>(eltwise, N));
    CV_Assert(inner_size % N == 0);

    auto kernel = raw::biasN_eltwise_op_generic_op_inplace_vec<T, EltwiseOp, ActivationOp, N>;
    auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
    launch_kernel(kernel, policy, inplace_output, inner_size / N, bias, eltwise, eltwise_params, act_params);
}

template <class T, class EltwiseOp, class ActivationOp> static
void biasN_eltwise_op_generic_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, const typename EltwiseOp::Params& eltwise_params = {}, const typename ActivationOp::Params& act_params = {}) {
    CV_Assert(inplace_output.size() == eltwise.size());

    if (is_fully_aligned<T>(inplace_output, 4) && is_fully_aligned<T>(eltwise, 4) && inner_size % 4 == 0) {
        launch_vectorized_biasN_eltwise_op_generic_op_inplace<T, EltwiseOp, ActivationOp, 4>(stream, inplace_output, inner_size, bias, eltwise, eltwise_params, act_params);
    } else if (is_fully_aligned<T>(inplace_output, 2) && is_fully_aligned<T>(eltwise, 2) && inner_size % 2 == 0) {
        launch_vectorized_biasN_eltwise_op_generic_op_inplace<T, EltwiseOp, ActivationOp, 2>(stream, inplace_output, inner_size, bias, eltwise, eltwise_params, act_params);
    } else {
        launch_vectorized_biasN_eltwise_op_generic_op_inplace<T, EltwiseOp, ActivationOp, 1>(stream, inplace_output, inner_size, bias, eltwise, eltwise_params, act_params);
    }
}

template <class T>
void biasN_eltwise_sum_2_identity_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
    biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, IdentityFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}

template <class T>
void biasN_eltwise_sum_2_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T slope) {
    biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, ReLUFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {}, {slope});
}

template <class T>
void biasN_eltwise_sum_2_clipped_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T floor, T ceiling) {
    CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
    biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, ClippedReLUFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {}, {floor, ceiling});
}

template <class T>
void biasN_eltwise_sum_2_tanh_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
    biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, TanHFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}

template <class T>
void biasN_eltwise_sum_2_swish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
    biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, SwishFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}

template <class T>
void biasN_eltwise_sum_2_mish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
    biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, MishFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}

template <class T>
void biasN_eltwise_sum_2_sigmoid_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
    biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, SigmoidFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}

template <class T>
void biasN_eltwise_sum_2_power_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T exp, T scale, T shift) {
    biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, PowerFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {}, {exp, scale, shift});
}

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void biasN_eltwise_sum_2_identity_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_eltwise_sum_2_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half);
template void biasN_eltwise_sum_2_clipped_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half);
template void biasN_eltwise_sum_2_tanh_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_eltwise_sum_2_swish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_eltwise_sum_2_mish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_eltwise_sum_2_sigmoid_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_eltwise_sum_2_power_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half, __half);
#endif

template void biasN_eltwise_sum_2_identity_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_eltwise_sum_2_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float);
template void biasN_eltwise_sum_2_clipped_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float, float);
template void biasN_eltwise_sum_2_tanh_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_eltwise_sum_2_swish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_eltwise_sum_2_mish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_eltwise_sum_2_sigmoid_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_eltwise_sum_2_power_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float, float, float);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/block_stride_range.hpp (vendored, new file, 71 lines added)
@@ -0,0 +1,71 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA_BLOCK_STRIDE_RANGE_HPP
#define OPENCV_DNN_SRC_CUDA_BLOCK_STRIDE_RANGE_HPP

#include "types.hpp"
#include "index_helpers.hpp"

#include <cuda_runtime.h>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {

    template <int dim, int BLOCK_SIZE = 0, class index_type = device::index_type, class size_type = device::size_type>
    class block_stride_range_generic {
    public:
        __device__ block_stride_range_generic(index_type to_) : from(0), to(to_) { }
        __device__ block_stride_range_generic(index_type from_, index_type to_) : from(from_), to(to_) { }

        class iterator
        {
        public:
            __device__ iterator(index_type pos_) : pos(pos_) {}

            /* these iterators return the index when dereferenced; this allows us to loop
             * through the indices using a range based for loop
             */
            __device__ index_type operator*() const { return pos; }

            __device__ iterator& operator++() {
                const index_type block_size = BLOCK_SIZE == 0 ? getBlockDim<dim>() : BLOCK_SIZE;
                pos += block_size;
                return *this;
            }

            __device__ bool operator!=(const iterator& other) const {
                /* NOTE HACK
                 * 'pos' can move in large steps (see operator++)
                 * expansion of range for loop uses != as the loop conditioion
                 * => operator!= must return false if 'pos' crosses the end
                 */
                return pos < other.pos;
            }

        private:
            index_type pos;
        };

        __device__ iterator begin() const {
            return iterator(from + getThreadIdx<dim>());
        }

        __device__ iterator end() const {
            return iterator(to);
        }

    private:
        index_type from, to;
    };

    using block_stride_range_x = block_stride_range_generic<0>;
    using block_stride_range_y = block_stride_range_generic<1>;
    using block_stride_range_z = block_stride_range_generic<2>;

    template <size_type BLOCK_SIZE = 0>
    using block_stride_range = block_stride_range_generic<0, BLOCK_SIZE>;

}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */

#endif /* OPENCV_DNN_SRC_CUDA_BLOCK_STRIDE_RANGE_HPP */
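block_stride_range is the block-level analogue of grid_stride_range: each thread starts at its threadIdx along the chosen dimension and strides by the block size until it passes the end. A hand-written equivalent of `for (auto i : block_stride_range<>(n))`, as a sketch (simplified to dimension 0, not the vendored wrapper itself):

__device__ void block_strided_visit(const float* data, float* acc, int n) {
    // every element of [0, n) is visited exactly once by the block,
    // thread t handling indices t, t + blockDim.x, t + 2*blockDim.x, ...
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        acc[threadIdx.x] += data[i];
    }
}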
277
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/concat.cu
vendored
Normal file
277
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/concat.cu
vendored
Normal file
@ -0,0 +1,277 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "array.hpp"
|
||||
#include "types.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "kernel_dispatcher.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include "../cuda4dnn/kernels/fill_copy.hpp"
|
||||
|
||||
#include <cstddef>
|
||||
#include <vector>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, std::size_t N>
|
||||
__global__ void concat_vec(
|
||||
Span<T> output, size_type output_axis_size, index_type output_axis_offset,
|
||||
View<T> input, size_type input_axis_size, size_type concat_size)
|
||||
{
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
auto input_vPtr = vector_type::get_pointer(input.data());
|
||||
|
||||
/* we need to copy all the elements of input to some location in the output
|
||||
* we copy blocks of size `total_concat_size` to some location in the output
|
||||
*/
|
||||
const auto total_concat_size = concat_size * input_axis_size;
|
||||
|
||||
for (auto in_idx : grid_stride_range(input.size() / vector_type::size())) {
|
||||
const index_type idx = in_idx * vector_type::size();
|
||||
const index_type concat_num = idx / total_concat_size;
|
||||
const index_type concat_index = idx % total_concat_size;
|
||||
const index_type top_index = concat_index +
|
||||
(concat_num * output_axis_size + output_axis_offset) * concat_size;
|
||||
|
||||
const auto out_idx = top_index / vector_type::size();
|
||||
|
||||
vector_type vec;
|
||||
v_load(vec, input_vPtr[in_idx]);
|
||||
v_store(output_vPtr[out_idx], vec);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t Rank>
|
||||
__global__ void concat_with_offsets(
|
||||
Span<T> output, array<size_type, Rank> out_strides, array<index_type, Rank> out_offset,
|
||||
View<T> input, array<size_type, Rank> in_strides)
|
||||
{
|
||||
for (auto i : grid_stride_range(input.size())) {
|
||||
index_type in_index = i / in_strides[0];
|
||||
index_type out_index = out_offset[0] + in_index;
|
||||
index_type oidx = out_index * out_strides[0];
|
||||
for (int j = 1; j < Rank; j++) {
|
||||
in_index = (i % in_strides[j - 1]) / in_strides[j];
|
||||
out_index = out_offset[j] + in_index;
|
||||
oidx += out_index * out_strides[j];
|
||||
}
|
||||
|
||||
output[oidx] = input[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t N> static
|
||||
void launch_vectorized_concat(const Stream& stream,
|
||||
Span<T> output, size_type output_axis_size, index_type output_axis_offset,
|
||||
View<T> input, size_type input_axis_size, size_type concat_size)
|
||||
{
|
||||
CV_Assert(is_fully_aligned<T>(output, N));
|
||||
CV_Assert(is_fully_aligned<T>(input, N));
|
||||
/* more assertions are required to fully check for vectorization possibility; check concat() */
|
||||
|
||||
auto kernel = raw::concat_vec<T, N>;
|
||||
auto policy = make_policy(kernel, input.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void concat(
|
||||
const Stream& stream,
|
||||
TensorSpan<T> output, std::size_t output_axis_offset,
|
||||
TensorView<T> input, std::size_t axis)
|
||||
{
|
||||
CV_Assert(output.rank() == input.rank());
|
||||
CV_Assert(output_axis_offset < output.get_axis_size(axis));
|
||||
|
||||
/* if axes preceeding the concat axis are all singleton, the concat blocks are contiguous
|
||||
* in the output and we can copy each block directly
|
||||
*/
|
||||
if (output.size_range(0, axis) == 1)
|
||||
{
|
||||
auto stride = output.size_range(axis + 1, output.rank());
|
||||
auto sliced_output = Span<T>(output.get() + output_axis_offset * stride, input.size());
|
||||
kernels::copy<T>(stream, sliced_output, input);
|
||||
return;
|
||||
}
|
||||
|
||||
/* let's call the axis of interest as the channel axis for the purpose of the following discussion
|
||||
* even though it can be any axis
|
||||
*
|
||||
* for each batch item:
|
||||
* we move all the channels from the input (which together, for a single batch item, are contiguous)
|
||||
* of a batch item to its corresponding contiguous place in the output
|
||||
*
|
||||
* for a valid vector operation:
|
||||
* - the size of each copy block must be aligned
|
||||
* - input must be aligned
|
||||
* - all the destination locations in the output must be aligned
|
||||
*/
|
||||
std::size_t concat_size = output.size_range(axis + 1, output.rank());
|
||||
|
||||
std::size_t input_axis_size = input.get_axis_size(axis);
|
||||
std::size_t output_axis_size = output.get_axis_size(axis);
|
||||
|
||||
std::size_t copy_block_size = concat_size * input_axis_size;
|
||||
std::size_t copy_block_stride = concat_size * output_axis_size;
|
||||
std::size_t starting_offset = output_axis_offset * concat_size;
|
||||
|
||||
/* in a nutshell, all this concat operation does is copy several blocks of size `copy_block_size`
|
||||
* to the output starting from `starting_offset` with blocks in the output strided by `copy_block_stride`
|
||||
*/
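/* worked example (hypothetical shapes): concatenating a [2, 3, 4, 4] input into a
* [2, 8, 4, 4] output along axis 1 at output_axis_offset = 5 gives
*   concat_size       = 4 * 4 = 16
*   copy_block_size   = 16 * 3 = 48
*   copy_block_stride = 16 * 8 = 128
*   starting_offset   = 5 * 16 = 80
* i.e. two blocks of 48 elements are copied, strided 128 elements apart, starting at element 80
*/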
|
||||
|
||||
bool is_aligned_4 = copy_block_size % 4 == 0 && copy_block_stride % 4 == 0 && starting_offset % 4 == 0;
|
||||
bool is_aligned_2 = copy_block_size % 2 == 0 && copy_block_stride % 2 == 0 && starting_offset % 2 == 0;
|
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && is_aligned_4) {
|
||||
launch_vectorized_concat<T, 4>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
|
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && is_aligned_2) {
|
||||
launch_vectorized_concat<T, 2>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
|
||||
} else {
|
||||
launch_vectorized_concat<T, 1>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void concat<__half>(const Stream&, TensorSpan<__half>, std::size_t, TensorView<__half>, std::size_t);
|
||||
#endif
|
||||
template void concat<float>(const Stream&, TensorSpan<float>, std::size_t, TensorView<float>, std::size_t);
|
||||
|
||||
template <class T, std::size_t Rank> static
|
||||
void launch_concat_with_offsets(
|
||||
const Stream& stream,
|
||||
Span<T> output, const std::vector<std::size_t>& outStride, const std::vector<std::size_t>& outOffset,
|
||||
View<T> input, const std::vector<std::size_t>& inStride)
|
||||
{
|
||||
CV_Assert(outStride.size() == Rank);
|
||||
CV_Assert(outOffset.size() == Rank);
|
||||
CV_Assert(inStride.size() == Rank);
|
||||
|
||||
array<size_type, Rank> outStride_k, inStride_k;
|
||||
outStride_k.assign(std::begin(outStride), std::end(outStride));
|
||||
inStride_k.assign(std::begin(inStride), std::end(inStride));
|
||||
|
||||
array<index_type, Rank> outOffset_k;
|
||||
outOffset_k.assign(std::begin(outOffset), std::end(outOffset));
|
||||
|
||||
auto kernel = raw::concat_with_offsets<T, Rank>;
|
||||
auto policy = make_policy(kernel, input.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, output, outStride_k, outOffset_k, input, inStride_k);
|
||||
}
|
||||
|
||||
GENERATE_KERNEL_DISPATCHER(concat_with_offsets_dispatcher, launch_concat_with_offsets);
|
||||
|
||||
template <class T>
|
||||
void concat_with_offsets(
|
||||
const Stream& stream,
|
||||
TensorSpan<T> output, TensorView<T> input,
|
||||
std::vector<std::size_t> offsets)
|
||||
{
|
||||
CV_Assert(output.rank() == input.rank());
|
||||
CV_Assert(output.rank() == offsets.size());
|
||||
|
||||
/* squeezable axes at the beginning of both tensors can be eliminated
|
||||
*
|
||||
* Reasoning:
|
||||
* ----------
|
||||
* Suppose an item's indices in the input tensor is [i1, i2, ...]. The indices in the output
|
||||
* tensor will be [i1 + off1, i2 + off2, ...]. The concat operation essentially copies items
|
||||
* from the input tensor to new locations in the output tensor.
|
||||
*
|
||||
* If the size of the first axis of the input and output tensor is unity, the input and output
|
||||
* indices for all the elements will be of the form [0, i2, ...] and [0, i2 + off2, ...]
|
||||
* respectively. The first index does not contribute to the element's address calculation and
|
||||
* hence does nothing apart from eating up a few cycles.
|
||||
*/
|
||||
while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
|
||||
CV_Assert(offsets[0] == 0);
|
||||
|
||||
input.squeeze(0);
|
||||
output.squeeze(0);
|
||||
offsets.erase(std::begin(offsets));
|
||||
|
||||
CV_Assert(output.rank() == input.rank());
|
||||
CV_Assert(output.rank() == offsets.size());
|
||||
}
|
||||
|
||||
auto inShape = input.shape_as_vector();
|
||||
auto outShape = output.shape_as_vector();
|
||||
|
||||
/* contiguous axes that undergo full copy can be combined into one axis
|
||||
*
|
||||
* Reasoning:
|
||||
* ----------
|
||||
* Suppose an item's indices in the input tensor is [i1, i2, i3, ...]. Let the first two axes not undergo any
|
||||
* concatenation. The indices in the output tensor will be [i1, i2, i3 + off3, ...].
|
||||
*
|
||||
* Each axis in the contiguous axes sequence will add an offset of iN * strideN. In the above example,
|
||||
* the two axes add a total offset of `i1 * stride1 + i2 * stride2`. We can merge the two axes into one axis with
|
||||
* a size of `size1 * size2`. The new offset added will be `i12 * stride2` as the kernel iterates through `i12`.
|
||||
* Note that `i12` is actually `(i1 * size2 + i2)` in the original tensor.
|
||||
*/
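/* worked example (hypothetical shapes): input [1, 2, 3, 4, 5], output [1, 2, 3, 4, 8],
* offsets [0, 0, 0, 0, 2]
* - the leading singleton axis is squeezed away, leaving [2, 3, 4, 5] vs [2, 3, 4, 8]
*   with offsets [0, 0, 0, 2]
* - the first three axes are copied in full and merge into a single axis of size 2 * 3 * 4 = 24,
*   leaving [24, 5] vs [24, 8] with offsets [0, 2]
*/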
|
||||
for (int i = 0; i < inShape.size(); i++) {
|
||||
/* check if axis `i` requires any slicing */
|
||||
if (offsets[i] == 0 && inShape[i] == outShape[i]) {
|
||||
/* loop invariant: `i` is the first axis in the contiguous unsliced axis sequence */
|
||||
|
||||
int j = i + 1; /* `j` is the axis which we will attempt to merge */
|
||||
while (j < inShape.size() && offsets[j] == 0 && inShape[j] == outShape[j]) {
|
||||
/* `j` axis is also copied fully; merge `i` and `j` */
|
||||
auto new_size = inShape[i] * inShape[j];
|
||||
inShape[i] = new_size;
|
||||
outShape[i] = new_size;
|
||||
offsets[i] = 0; /* redundant */
|
||||
|
||||
/* delete axis `j` */
|
||||
inShape.erase(std::begin(inShape) + j);
|
||||
outShape.erase(std::begin(outShape) + j);
|
||||
offsets.erase(std::begin(offsets) + j);
|
||||
|
||||
/* optimizations should not break the invariants */
|
||||
CV_Assert(inShape.size() == outShape.size());
|
||||
CV_Assert(inShape.size() == offsets.size());
|
||||
CV_Assert(inShape[i] == outShape[i]);
|
||||
CV_Assert(offsets[i] == 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto rank = inShape.size();
|
||||
|
||||
std::vector<std::size_t> inStride(rank), outStride(rank);
|
||||
inStride.back() = 1;
|
||||
outStride.back() = 1;
|
||||
/* garbage, ..., garbage, 1 */
|
||||
|
||||
std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
|
||||
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
|
||||
/* dim[1], dim[2], ..., dim[-1], 1 */
|
||||
|
||||
std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<int>());
|
||||
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<int>());
|
||||
/* stride[0], stride[1], ..., stride[-2], 1 */
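/* continuing the example above: outShape [24, 8] gives outStride [8, 1] and inShape [24, 5]
* gives inStride [5, 1]; concat_with_offsets then maps input element i to output index
* (i / 5 + 0) * 8 + (i % 5 + 2) * 1
*/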
|
||||
|
||||
CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
|
||||
concat_with_offsets_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, offsets, input, inStride);
|
||||
}
|
||||
|
||||
template void concat_with_offsets(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
|
||||
template void concat_with_offsets(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
171
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/crop_and_resize.cu
vendored
Normal file
@ -0,0 +1,171 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "math.hpp"
|
||||
#include "types.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "memory.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
|
||||
template <class T, std::size_t CHANNELS_PER_ITER>
|
||||
__global__ void crop_and_resize(
|
||||
Span<T> output, size_type out_height, size_type out_width,
|
||||
View<T> input, size_type in_height, size_type in_width,
|
||||
View<T> boxes,
|
||||
size_type num_channels)
|
||||
{
|
||||
// input [1, num_channels, in_height, in_width]
|
||||
// output [boxes, num_channels, out_height, out_width]
|
||||
|
||||
const auto in_image_size = in_height * in_width;
|
||||
const auto out_image_size = out_height * out_width;
|
||||
const auto out_box_size = num_channels * out_image_size;
|
||||
|
||||
/* we have to compute the output value for every combination of (box, c, y, x) in the output
|
||||
*
|
||||
* the computation involving (y, x) is identical for all non-spatial dimensions
|
||||
* the computation and memory requests involving the box are identical for the remaining three axes
|
||||
*
|
||||
* we process multiple channels every iteration to reuse the identical computation
|
||||
* and memory requests involved with the box and spatial dimensions
|
||||
*/
|
||||
|
||||
/*
|
||||
* if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need
|
||||
* (num_channels / CHANNELS_PER_ITER) iterations per (box, x, y)
|
||||
*/
|
||||
auto num_channel_iters_per_box_xy = num_channels / CHANNELS_PER_ITER;
|
||||
|
||||
/* we need `num_channel_iters_per_box_xy` iterations per (box, x, y) and there are
|
||||
* `num_boxes` boxes and `out_image_size` combinations of (x, y)
|
||||
*/
|
||||
auto num_boxes = boxes.size() / 7; /* 7 values per box */
|
||||
auto iters_per_box = num_channel_iters_per_box_xy * out_image_size;
|
||||
auto iters_required = num_boxes * iters_per_box;
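/* e.g. (hypothetical sizes) two boxes, a 7x7 output and 64 channels with CHANNELS_PER_ITER = 16
* give 64 / 16 = 4 channel iterations per (box, y, x) and 2 * 4 * 49 = 392 iterations in total
*/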
|
||||
|
||||
for (auto iter : grid_stride_range(iters_required)) {
|
||||
const index_type box_no = iter / iters_per_box;
|
||||
const index_type c_start = ((iter % iters_per_box) / out_image_size) * CHANNELS_PER_ITER;
|
||||
|
||||
/* note here that consecutive `iter` values will often have consecutive `x` values
|
||||
* => stores into output will be coalesced across threads
|
||||
*/
|
||||
const index_type y = (iter % out_image_size) / out_width;
|
||||
const index_type x = iter % out_width;
|
||||
|
||||
const index_type box_offset = box_no * 7;
|
||||
const auto left = boxes[box_offset + 3],
|
||||
top = boxes[box_offset + 4],
|
||||
right = boxes[box_offset + 5],
|
||||
bottom = boxes[box_offset + 6];
|
||||
|
||||
const auto box_width = right - left;
|
||||
const auto box_height = bottom - top;
|
||||
|
||||
const auto o2i_fy = static_cast<T>(in_height - 1) / static_cast<T>(out_height - 1);
|
||||
const auto o2i_fx = static_cast<T>(in_width - 1) / static_cast<T>(out_width - 1);
|
||||
|
||||
const auto height_scale = box_height * o2i_fy;
|
||||
const auto width_scale = box_width * o2i_fx;
|
||||
|
||||
const auto in_y = top * static_cast<T>(in_height - 1) + static_cast<T>(y) * height_scale;
|
||||
const auto in_x = left * static_cast<T>(in_width - 1) + static_cast<T>(x) * width_scale;
|
||||
|
||||
const auto in_y0 = static_cast<index_type>(in_y);
|
||||
const auto in_x0 = static_cast<index_type>(in_x);
|
||||
|
||||
using device::min;
|
||||
const auto in_x1 = min<index_type>(in_x0 + 1, in_width - 1);
|
||||
const auto in_y1 = min<index_type>(in_y0 + 1, in_height - 1);
|
||||
|
||||
index_type in_offset_r0 = c_start * in_image_size + in_y0 * in_width;
|
||||
index_type in_offset_r1 = c_start * in_image_size + in_y1 * in_width;
|
||||
index_type out_idx = box_no * out_box_size + c_start * out_image_size + y * out_width + x;
|
||||
|
||||
#pragma unroll 1 /* disable unrolling */
|
||||
for (int i = 0; i < CHANNELS_PER_ITER; i++) {
|
||||
auto v_00 = load_ldg(input[in_offset_r0 + in_x0]),
|
||||
v_01 = load_ldg(input[in_offset_r0 + in_x1]),
|
||||
v_10 = load_ldg(input[in_offset_r1 + in_x0]),
|
||||
v_11 = load_ldg(input[in_offset_r1 + in_x1]);
|
||||
|
||||
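/* standard bilinear interpolation written in incremental form:
* v00 + dy * (v10 - v00) + dx * (v01 - v00) + dy * dx * (v11 - v01 - v10 + v00)
* where dy = in_y - in_y0 and dx = in_x - in_x0
*/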
output[out_idx] =
|
||||
v_00 +
|
||||
T(in_y - T(in_y0)) * T(v_10 - v_00) +
|
||||
T(in_x - T(in_x0)) * T(v_01 - v_00) +
|
||||
T(in_y - T(in_y0)) * T(in_x - T(in_x0)) * T(v_11 - v_01 - v_10 + v_00);
|
||||
|
||||
in_offset_r0 += in_image_size;
|
||||
in_offset_r1 += in_image_size;
|
||||
out_idx += out_image_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t CHANNELS_PER_ITER> static
|
||||
void launch_multichannel_crop_and_resize(const Stream& stream,
|
||||
Span<T> output, size_type out_height, size_type out_width,
|
||||
View<T> input, size_type in_height, size_type in_width,
|
||||
View<T> boxes, size_type num_channels)
|
||||
{
|
||||
auto kernel = raw::crop_and_resize<T, CHANNELS_PER_ITER>;
|
||||
auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
|
||||
launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void crop_and_resize(const Stream& stream, TensorSpan<T> output, TensorView<T> input, View<T> boxes) {
|
||||
CV_Assert(input.get_axis_size(0) == 1); /* batch not supported */
|
||||
CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
|
||||
|
||||
auto out_height = output.get_axis_size(-2);
|
||||
auto out_width = output.get_axis_size(-1);
|
||||
|
||||
auto in_height = input.get_axis_size(-2);
|
||||
auto in_width = input.get_axis_size(-1);
|
||||
|
||||
auto num_channels = input.get_axis_size(1);
|
||||
|
||||
if (num_channels % 64 == 0) {
|
||||
launch_multichannel_crop_and_resize<T, 64>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
|
||||
} else if (num_channels % 32 == 0) {
|
||||
launch_multichannel_crop_and_resize<T, 32>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
|
||||
} else if (num_channels % 16 == 0) {
|
||||
launch_multichannel_crop_and_resize<T, 16>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
|
||||
} else if (num_channels % 8 == 0) {
|
||||
launch_multichannel_crop_and_resize<T, 8>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
|
||||
} else if (num_channels % 4 == 0) {
|
||||
launch_multichannel_crop_and_resize<T, 4>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
|
||||
} else if (num_channels % 2 == 0) {
|
||||
launch_multichannel_crop_and_resize<T, 2>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
|
||||
} else {
|
||||
launch_multichannel_crop_and_resize<T, 1>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void crop_and_resize<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, View<__half> boxes);
|
||||
#endif
|
||||
template void crop_and_resize<float>(const Stream&, TensorSpan<float>, TensorView<float>, View<float> boxes);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
897
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/detection_output.cu
vendored
Normal file
@ -0,0 +1,897 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "math.hpp"
|
||||
#include "bbox_utils.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "block_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
#include "memory.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
|
||||
template <class T, bool SHARE_LOCATION, bool VARIANCE_ENCODED_IN_TARGET, bool CORNER_TRUE_CENTER_FALSE, bool CLIP_BBOX>
|
||||
__global__ void decode_bbox(Span<T> decoded_bboxes, View<T> locations, View<T> priors,
|
||||
bool transpose_location, bool normalized_bbox,
|
||||
size_type num_loc_classes, index_type background_class_id,
|
||||
float clip_width, float clip_height)
|
||||
{
|
||||
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
|
||||
// locations: [batch_size, num_priors, num_loc_classes, 4]
|
||||
// priors: [1, C, num_priors, 4]
|
||||
// C = 2 if !VARIANCE_ENCODED_IN_TARGET; otherwise, 1
|
||||
|
||||
/* 4 bbox values + 4 variance values per prior */
|
||||
constexpr int PRIOR_BOX_SIZE = VARIANCE_ENCODED_IN_TARGET ? 4 : 8;
|
||||
const size_type num_priors = priors.size() / PRIOR_BOX_SIZE;
|
||||
|
||||
using vector_type = get_vector_type_t<T, 4>;
|
||||
auto locations_vPtr = vector_type::get_pointer(locations.data());
|
||||
auto priors_vPtr = vector_type::get_pointer(priors.data());
|
||||
auto decoded_bboxes_vPtr = vector_type::get_pointer(decoded_bboxes.data());
|
||||
|
||||
const auto boxes_per_batch = num_priors * num_loc_classes;
|
||||
for (auto idx : grid_stride_range(decoded_bboxes.size() / 4))
|
||||
{
|
||||
index_type p;
|
||||
index_type c;
|
||||
|
||||
if (SHARE_LOCATION)
|
||||
{
|
||||
// locations are shared across all classes => num_loc_classes = 1
|
||||
p = idx % boxes_per_batch;
|
||||
c = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
p = (idx % boxes_per_batch) / num_loc_classes;
|
||||
c = idx % num_loc_classes;
|
||||
}
|
||||
|
||||
if (!SHARE_LOCATION && c == background_class_id)
|
||||
continue;
|
||||
|
||||
BoundingBox bbox;
|
||||
{
|
||||
vector_type location;
|
||||
v_load(location, locations_vPtr[idx]);
|
||||
|
||||
if (transpose_location)
|
||||
{
|
||||
bbox.ymin = location.data[0];
|
||||
bbox.xmin = location.data[1];
|
||||
bbox.ymax = location.data[2];
|
||||
bbox.xmax = location.data[3];
|
||||
}
|
||||
else
|
||||
{
|
||||
bbox.xmin = location.data[0];
|
||||
bbox.ymin = location.data[1];
|
||||
bbox.xmax = location.data[2];
|
||||
bbox.ymax = location.data[3];
|
||||
}
|
||||
}
|
||||
|
||||
if (!VARIANCE_ENCODED_IN_TARGET)
|
||||
{
|
||||
vector_type prior_variance;
|
||||
v_load_ldg(prior_variance, priors_vPtr[num_priors + p]);
|
||||
|
||||
bbox.xmin *= static_cast<float>(prior_variance.data[0]);
|
||||
bbox.ymin *= static_cast<float>(prior_variance.data[1]);
|
||||
bbox.xmax *= static_cast<float>(prior_variance.data[2]);
|
||||
bbox.ymax *= static_cast<float>(prior_variance.data[3]);
|
||||
}
|
||||
|
||||
BoundingBox prior;
|
||||
{
|
||||
vector_type prior_box;
|
||||
v_load_ldg(prior_box, priors_vPtr[p]);
|
||||
|
||||
prior.xmin = prior_box.data[0];
|
||||
prior.ymin = prior_box.data[1];
|
||||
prior.xmax = prior_box.data[2];
|
||||
prior.ymax = prior_box.data[3];
|
||||
}
|
||||
|
||||
BoundingBox decoded_bbox;
|
||||
if (CORNER_TRUE_CENTER_FALSE)
|
||||
{
|
||||
decoded_bbox.xmin = prior.xmin + bbox.xmin;
|
||||
decoded_bbox.ymin = prior.ymin + bbox.ymin;
|
||||
decoded_bbox.xmax = prior.xmax + bbox.xmax;
|
||||
decoded_bbox.ymax = prior.ymax + bbox.ymax;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto prior_width = prior.xmax - prior.xmin;
|
||||
auto prior_height = prior.ymax - prior.ymin;
|
||||
if (!normalized_bbox)
|
||||
{
|
||||
prior_width += 1;
|
||||
prior_height += 1;
|
||||
}
|
||||
|
||||
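/* center-size decoding: the location holds offsets (dx, dy, dw, dh) relative to the prior
* (bbox.xmin/ymin carry dx/dy, bbox.xmax/ymax carry dw/dh); the decoded center is
* d * prior_size + prior_center and the decoded size is exp(d) * prior_size
*/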
auto prior_center_x = prior.xmin + prior_width * 0.5f;
|
||||
auto prior_center_y = prior.ymin + prior_height * 0.5f;
|
||||
|
||||
auto decode_bbox_center_x = bbox.xmin * prior_width + prior_center_x;
|
||||
auto decode_bbox_center_y = bbox.ymin * prior_height + prior_center_y;
|
||||
|
||||
using device::exp;
|
||||
float decode_bbox_width = exp(bbox.xmax) * prior_width;
|
||||
float decode_bbox_height = exp(bbox.ymax) * prior_height;
|
||||
|
||||
decoded_bbox.xmin = decode_bbox_center_x - decode_bbox_width * 0.5f;
|
||||
decoded_bbox.ymin = decode_bbox_center_y - decode_bbox_height * 0.5f;
|
||||
decoded_bbox.xmax = decode_bbox_center_x + decode_bbox_width * 0.5f;
|
||||
decoded_bbox.ymax = decode_bbox_center_y + decode_bbox_height * 0.5f;
|
||||
}
|
||||
|
||||
vector_type decoded_bbox_vec;
|
||||
if (CLIP_BBOX)
|
||||
{
|
||||
decoded_bbox_vec.data[0] = clamp(decoded_bbox.xmin, 0.0f, clip_width);
|
||||
decoded_bbox_vec.data[1] = clamp(decoded_bbox.ymin, 0.0f, clip_height);
|
||||
decoded_bbox_vec.data[2] = clamp(decoded_bbox.xmax, 0.0f, clip_width);
|
||||
decoded_bbox_vec.data[3] = clamp(decoded_bbox.ymax, 0.0f, clip_height);
|
||||
}
|
||||
else
|
||||
{
|
||||
decoded_bbox_vec.data[0] = decoded_bbox.xmin;
|
||||
decoded_bbox_vec.data[1] = decoded_bbox.ymin;
|
||||
decoded_bbox_vec.data[2] = decoded_bbox.xmax;
|
||||
decoded_bbox_vec.data[3] = decoded_bbox.ymax;
|
||||
}
|
||||
|
||||
v_store(decoded_bboxes_vPtr[idx], decoded_bbox_vec);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, int BINS, int BLOCK_SIZE>
|
||||
__launch_bounds__(BLOCK_SIZE)
|
||||
__global__ void findTopK(Span<int> indices_, Span<int> count_, View<T> scores_, float threshold, size_type classwise_topK, size_type num_classes, size_type num_priors, index_type background_class_id)
|
||||
{
|
||||
/* We need to sort boxes based on their confidence scores. The confidence scores fall in
|
||||
* the range [0.0, 1.0]. We break the range into bins and perform count sort. This is an
|
||||
* approximate algorithm.
|
||||
*
|
||||
* Each block handles a particular class of a particular batch item.
|
||||
*/
|
||||
const auto c = blockIdx.x;
|
||||
const auto b = blockIdx.y;
|
||||
|
||||
if (c == background_class_id)
|
||||
return;
|
||||
|
||||
// indices: [batch_size, num_classes, classwise_topK]
|
||||
// count: [batch_size, num_classes]
|
||||
// scores: [batch_size, num_classes, num_priors]
|
||||
|
||||
auto count = count_.data() + b * num_classes + c;
|
||||
auto scores = scores_.data() + (b * num_classes + c) * num_priors;
|
||||
auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
|
||||
|
||||
/* We do not require a large number of bins to find the top K confidence scores. We will use
|
||||
* a reasonable number of bins which will fit in the shared memory.
|
||||
*
|
||||
* Note that smaller scores will have a smaller index, i.e. the `bins` are ordered in
|
||||
* ascending order.
|
||||
*/
|
||||
|
||||
__shared__ int bins[BINS];
|
||||
|
||||
#pragma unroll
|
||||
for (int unroll = 0; unroll < BINS / BLOCK_SIZE; unroll++)
|
||||
bins[unroll * BLOCK_SIZE + threadIdx.x] = 0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (auto i : block_stride_range<BLOCK_SIZE>(num_priors))
|
||||
{
|
||||
const float confidence = load_ldg(scores[i]);
|
||||
if (confidence > threshold)
|
||||
{
|
||||
using device::fast_divide_ftz;
|
||||
auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);
|
||||
|
||||
using device::clamp;
|
||||
int bin_index = conf_scaled * BINS;
|
||||
|
||||
/* We store counts of confidence scores in the bins. Our ultimate goal is to store the indices
|
||||
* of the `classwise_topK` confidence values in the `indices` array.
|
||||
*
|
||||
* We use a little trick to parallelize the process of filling up the `indices` array.
|
||||
* We want every thread in the block to participate in the process. To do so, we want the
|
||||
* bins array to be shifted by one place to the left. We will be computing the suffix sum
|
||||
* of the bins array later. Details and reasons for doing so will be explained later.
|
||||
*/
|
||||
bin_index = clamp<int>(bin_index, 0, BINS - 1) - 1; // shift left by one
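/* e.g. (hypothetical values) with BINS = 2048, threshold = 0.25 and confidence = 0.85:
* conf_scaled = (0.85 - 0.25) / 0.75 = 0.8, bin_index = int(0.8 * 2048) = 1638, shifted to 1637
*/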
|
||||
|
||||
if (bin_index >= 0)
|
||||
atomicAdd(&bins[bin_index], 1);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
constexpr int WARP_SIZE = 32; /* must be equal to warpSize */
|
||||
// FORWARD_COMPATIBILITY_TAG: WARP_SIZE_DEPENDENT_CODE
|
||||
|
||||
if (threadIdx.x < WARP_SIZE)
|
||||
{
|
||||
/* We can compute suffix sum of an array in groups of N numbers.
|
||||
* Let N be 4 for this example.
|
||||
*
|
||||
* 1) Last 4 numbers
|
||||
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
|
||||
* group suffix sum: 42 33 23 12
|
||||
*
|
||||
* 2) Middle 4 numbers
|
||||
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
|
||||
* group suffix sum: | 26 21 15 8 |
|
||||
*
|
||||
* We add `42` (first element in the previous group) to each element to get:
|
||||
*
|
||||
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
|
||||
* | 68 63 57 50 | 42 33 23 12
|
||||
* 3) First 4 numbers
|
||||
*
|
||||
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
|
||||
* group suffix sum: 10 9 7 4 |
|
||||
*
|
||||
* We add `68` (first element in the previous group) to each element to get:
|
||||
*
|
||||
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
|
||||
* group suffix sum: 78 77 75 72 | 68 63 57 50 | 42 33 23 12
|
||||
*
|
||||
* What we are left with now is the suffix sum of the entire array.
|
||||
*
|
||||
* We use the aforementioned logic in the code below but work in groups of `warpSize`.
|
||||
*/
|
||||
|
||||
/* We calculate suffix sums WARP_SIZE elements at a time starting from the right end.
|
||||
* Hence, we will need BINS / WARP_SIZE number of iterations.
|
||||
*
|
||||
* Each iteration uses shuffle instructions to exchange data between threads. Shuffle
|
||||
* instructions cannot be used in warp-divergent code. If the bins are a multiple of
|
||||
* the warpSize, all the threads in the warp will participate.
|
||||
*/
|
||||
static_assert(BINS % WARP_SIZE == 0, "number of bins must be a multiple of warp size");
|
||||
|
||||
const int thread_id = threadIdx.x;
|
||||
const int inverse_lane_id = WARP_SIZE - thread_id - 1;
|
||||
|
||||
int previous_group_first_element = 0;
|
||||
for (int iter = BINS / WARP_SIZE - 1; iter >= 0; iter--)
|
||||
{
|
||||
const index_type idx = iter * WARP_SIZE + thread_id;
|
||||
auto value = bins[idx];
|
||||
|
||||
for (int i = 1; i < WARP_SIZE; i *= 2)
|
||||
{
|
||||
auto n = __shfl_down_sync(0xFFFFFFFF, value, i);
|
||||
if (inverse_lane_id >= i)
|
||||
value += n;
|
||||
}
|
||||
|
||||
value += previous_group_first_element;
|
||||
bins[idx] = value;
|
||||
|
||||
previous_group_first_element = __shfl_sync(0xFFFFFFFF, value, 0);
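/* lane 0 now holds the sum of this group plus everything to its right; __shfl_sync
* broadcasts it to all lanes so that the next (left) group can add it in
*/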
|
||||
}
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
*count = 0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (auto i : block_stride_range<BLOCK_SIZE>(num_priors))
|
||||
{
|
||||
const float confidence = load_ldg(scores[i]);
|
||||
if (confidence > threshold)
|
||||
{
|
||||
using device::fast_divide_ftz;
|
||||
auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);
|
||||
|
||||
int bin_index = conf_scaled * BINS;
|
||||
bin_index = clamp<int>(bin_index, 0, BINS - 1);
|
||||
|
||||
/* This bounding box is eligible to be selected as long as it falls within the
|
||||
* `classwise_topK`. If it does, we have to compute the location where it needs
|
||||
* to be stored.
|
||||
*
|
||||
* Suppose we had just 4 bins and say the following were the counts:
|
||||
* BIN0 2
|
||||
* BIN1 1
|
||||
* BIN2 3
|
||||
* BIN3 0 (last bin is always zero as we shift left by one while populating the bins)
|
||||
*
|
||||
* We will try our best to store the boxes in a sorted order in the `indices` array.
|
||||
* This requires that the boxes in later bins (higher confidence scores) must be
|
||||
* stored earlier.
|
||||
*
|
||||
* We compute the suffix sum of the array. This gives us:
|
||||
* BIN0 6
|
||||
* BIN1 4
|
||||
* BIN2 3
|
||||
* BIN3 0
|
||||
*
|
||||
* The bins now give us the location in the `indices` array from which the indices of the
|
||||
* scores corresponding to that bin would be stored. We atomically increment the bin count
|
||||
* every time we store a box corresponding to that bin. Therefore, the value in the bins
|
||||
* gives the index in the `indices` array where the next box corresponding to that bin must
|
||||
* be put.
|
||||
*/
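/* continuing the example: the bins are read here without the left shift, so the three
* highest-scoring boxes (counted in BIN2 above) read bins[3] = 0, 1, 2 and land in
* indices[0..2], the BIN1 box reads bins[2] = 3 and lands in indices[3], and the two
* BIN0 boxes land in indices[4..5], i.e. higher-confidence boxes are stored earlier
*/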
|
||||
|
||||
const index_type idx = atomicAdd(&bins[bin_index], 1);
|
||||
if (idx < classwise_topK)
|
||||
{
|
||||
indices[idx] = i;
|
||||
atomicAdd(&count[0], 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void box_collect(Span<T> collected_bboxes_, View<T> decoded_bboxes_, View<int> indices_, View<int> count_, bool share_location, size_type num_priors, size_type num_classes, size_type classwise_topK, index_type background_class_id)
|
||||
{
|
||||
const index_type c = blockIdx.x;
|
||||
if (c == background_class_id)
|
||||
return;
|
||||
|
||||
const index_type b = blockIdx.y;
|
||||
|
||||
// collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
|
||||
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
|
||||
// indices: [batch_size, num_classes, classwise_topK]
|
||||
// count: [batch_size, num_classes]
|
||||
|
||||
const auto num_loc_classes = share_location ? 1 : num_classes;
|
||||
|
||||
auto collected_bboxes = collected_bboxes_.data() + (b * num_classes + c) * classwise_topK * 4;
|
||||
auto decoded_bboxes = decoded_bboxes_.data() + b * num_priors * num_loc_classes * 4;
|
||||
auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
|
||||
auto count = count_.data() + b * num_classes + c;
|
||||
|
||||
const auto boxes = load_ldg(&count[0]);
|
||||
if (boxes == 0)
|
||||
return;
|
||||
|
||||
using vector_type = get_vector_type_t<T, 4>;
|
||||
auto decoded_bboxes_vPtr = vector_type::get_pointer(decoded_bboxes);
|
||||
auto collected_bboxes_vPtr = vector_type::get_pointer(collected_bboxes);
|
||||
|
||||
for (auto i : block_stride_range<>(boxes))
|
||||
{
|
||||
const auto prior_id = indices[i];
|
||||
const index_type idx = share_location ? prior_id : (prior_id * num_classes + c);
|
||||
|
||||
vector_type box;
|
||||
v_load(box, decoded_bboxes_vPtr[idx]);
|
||||
v_store(collected_bboxes_vPtr[i], box);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, bool NORMALIZED_BBOX>
|
||||
__global__ void blockwise_class_nms(Span<int> indices_, Span<int> count_, View<T> collected_bboxes_, size_type num_classes, size_type classwise_topK, index_type background_class_id, float nms_threshold)
|
||||
{
|
||||
const index_type b = blockIdx.x / num_classes;
|
||||
const index_type c = blockIdx.x % num_classes;
|
||||
if (c == background_class_id)
|
||||
return;
|
||||
|
||||
// indices: [batch_size, num_classes, classwise_topK]
|
||||
// count: [batch_size, num_classes]
|
||||
// collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
|
||||
|
||||
auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
|
||||
auto count = count_.data() + b * num_classes + c;
|
||||
auto collected_bboxes = collected_bboxes_.data() + (b * num_classes + c) * classwise_topK * 4;
|
||||
|
||||
const auto boxes = count[0];
|
||||
if (boxes == 0)
|
||||
return;
|
||||
|
||||
using vector_type = get_vector_type_t<T, 4>;
|
||||
auto collected_bboxes_vPtr = vector_type::get_pointer(collected_bboxes);
|
||||
|
||||
for (int i = 0; i < boxes; i++)
|
||||
{
|
||||
auto prior_id = indices[i];
|
||||
if (prior_id != -1)
|
||||
{
|
||||
BoundingBox bbox1;
|
||||
{
|
||||
vector_type box;
|
||||
v_load(box, collected_bboxes_vPtr[i]);
|
||||
|
||||
bbox1.xmin = box.data[0];
|
||||
bbox1.ymin = box.data[1];
|
||||
bbox1.xmax = box.data[2];
|
||||
bbox1.ymax = box.data[3];
|
||||
}
|
||||
|
||||
for (auto j : block_stride_range<>(i + 1, boxes))
|
||||
{
|
||||
prior_id = indices[j];
|
||||
if (prior_id == -1)
|
||||
continue;
|
||||
|
||||
BoundingBox bbox2;
|
||||
{
|
||||
vector_type box;
|
||||
v_load_ldg(box, collected_bboxes_vPtr[j]);
|
||||
|
||||
bbox2.xmin = box.data[0];
|
||||
bbox2.ymin = box.data[1];
|
||||
bbox2.xmax = box.data[2];
|
||||
bbox2.ymax = box.data[3];
|
||||
}
|
||||
|
||||
using device::min;
|
||||
using device::max;
|
||||
|
||||
BoundingBox intersect_bbox;
|
||||
intersect_bbox.xmin = max(bbox1.xmin, bbox2.xmin);
|
||||
intersect_bbox.ymin = max(bbox1.ymin, bbox2.ymin);
|
||||
intersect_bbox.xmax = min(bbox1.xmax, bbox2.xmax);
|
||||
intersect_bbox.ymax = min(bbox1.ymax, bbox2.ymax);
|
||||
|
||||
float intersect_size = compute_bbox_size<NORMALIZED_BBOX>(intersect_bbox);
|
||||
float bbox1_size = compute_bbox_size<NORMALIZED_BBOX>(bbox1);
|
||||
float bbox2_size = compute_bbox_size<NORMALIZED_BBOX>(bbox2);
|
||||
|
||||
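/* intersection-over-union: intersect_size / (bbox1_size + bbox2_size - intersect_size);
* boxes that overlap a higher-scoring box by more than nms_threshold are suppressed by
* marking their entry in `indices` as -1
*/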
using device::fast_divide_ftz;
|
||||
float iou = fast_divide_ftz(intersect_size, bbox1_size + bbox2_size - intersect_size);
|
||||
if (iou > nms_threshold)
|
||||
indices[j] = -1;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
count[0] = 0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (auto i : block_stride_range<>(boxes))
|
||||
{
|
||||
auto prior_id = indices[i];
|
||||
if(prior_id != -1)
|
||||
{
|
||||
const index_type idx = atomicAdd(&count[0], 1);
|
||||
indices[idx] = prior_id;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t BINS, int BLOCK_SIZE>
|
||||
__launch_bounds__(BLOCK_SIZE)
|
||||
__global__ void nms_collect(
|
||||
Span<int> kept_indices, Span<int> kept_count, View<int> indices_, View<int> count, View<T> scores_, float threshold,
|
||||
size_type num_classes, size_type num_priors, size_type classwise_topK, size_type keepTopK, index_type background_class_id)
|
||||
{
|
||||
// sorting algorithm is documented in detail in findTopK kernel comments
|
||||
// no explanations are provided here
|
||||
|
||||
// kept_indices: [batch_size, keepTopK]
|
||||
// kept_count: [batch_size]
|
||||
|
||||
const auto b = blockIdx.x;
|
||||
|
||||
__shared__ int bins[BINS];
|
||||
|
||||
#pragma unroll
|
||||
for (int unroll = 0; unroll < BINS / BLOCK_SIZE; unroll++)
|
||||
bins[unroll * BLOCK_SIZE + threadIdx.x] = 0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (int c = 0; c < num_classes; c++)
|
||||
{
|
||||
if (c == background_class_id)
|
||||
continue;
|
||||
|
||||
// indices: [batch_size, num_classes, classwise_topK]
|
||||
// count: [batch_size, num_classes]
|
||||
// scores: [batch_size, num_classes, num_priors]
|
||||
|
||||
const auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
|
||||
const auto scores = scores_.data() + (b * num_classes + c) * num_priors;
|
||||
|
||||
auto boxes = count[b * num_classes + c];
|
||||
|
||||
for (auto i : block_stride_range<BLOCK_SIZE>(boxes))
|
||||
{
|
||||
auto prior_id = indices[i];
|
||||
const float confidence = load_ldg(scores[prior_id]);
|
||||
if (confidence > threshold)
|
||||
{
|
||||
using device::fast_divide_ftz;
|
||||
auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);
|
||||
|
||||
using device::clamp;
|
||||
int bin_index = conf_scaled * BINS;
|
||||
bin_index = clamp<int>(bin_index, 0, BINS - 1) - 1; // shift left by one
|
||||
|
||||
if (bin_index >= 0)
|
||||
atomicAdd(&bins[bin_index], 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
constexpr int WARP_SIZE = 32; /* must be equal to warpSize */
|
||||
// FORWARD_COMPATIBILITY_TAG: WARP_SIZE_DEPENDENT_CODE
|
||||
|
||||
if (threadIdx.x < WARP_SIZE)
|
||||
{
|
||||
static_assert(BINS % WARP_SIZE == 0, "number of bins must be a multiple of warp size");
|
||||
|
||||
const int thread_id = threadIdx.x;
|
||||
const int inverse_lane_id = WARP_SIZE - thread_id - 1;
|
||||
|
||||
int previous_group_first_element = 0;
|
||||
for (int iter = BINS / WARP_SIZE - 1; iter >= 0; iter--)
|
||||
{
|
||||
const index_type idx = iter * WARP_SIZE + thread_id;
|
||||
auto value = bins[idx];
|
||||
|
||||
for (int i = 1; i < WARP_SIZE; i *= 2)
|
||||
{
|
||||
auto n = __shfl_down_sync(0xFFFFFFFF, value, i);
|
||||
if (inverse_lane_id >= i)
|
||||
value += n;
|
||||
}
|
||||
|
||||
value += previous_group_first_element;
|
||||
bins[idx] = value;
|
||||
|
||||
previous_group_first_element = __shfl_sync(0xFFFFFFFF, value, 0);
|
||||
}
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
kept_count[b] = 0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (int c = 0; c < num_classes; c++)
|
||||
{
|
||||
if (c == background_class_id)
|
||||
continue;
|
||||
|
||||
const auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
|
||||
const auto scores = scores_.data() + (b * num_classes + c) * num_priors;
|
||||
|
||||
auto boxes = count[b * num_classes + c];
|
||||
|
||||
for (auto i : block_stride_range<BLOCK_SIZE>(boxes))
|
||||
{
|
||||
auto prior_id = indices[i];
|
||||
const float confidence = load_ldg(scores[prior_id]);
|
||||
if (confidence > threshold)
|
||||
{
|
||||
using device::fast_divide_ftz;
|
||||
auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);
|
||||
|
||||
using device::clamp;
|
||||
int bin_index = conf_scaled * BINS;
|
||||
bin_index = clamp<int>(bin_index, 0, BINS - 1);
|
||||
|
||||
const index_type idx = atomicAdd(&bins[bin_index], 1);
|
||||
if (idx < keepTopK)
|
||||
{
|
||||
kept_indices[b * keepTopK + idx] = c * num_priors + prior_id;
|
||||
atomicAdd(&kept_count[b], 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void consolidate_detections(Span<T> output,
|
||||
View<int> kept_indices, View<int> kept_count, View<T> decoded_bboxes, View<T> scores, bool share_location,
|
||||
size_type batch_size, size_type num_classes, size_type num_priors, size_type keepTopK, DevicePtr<int> num_detections)
|
||||
{
|
||||
using vector_type = get_vector_type_t<T, 4>;
|
||||
auto decoded_bboxes_vPtr = vector_type::get_pointer(decoded_bboxes.data());
|
||||
|
||||
// output: [1, 1, batch_size * keepTopK, 7]
|
||||
// kept_indices: [batch_size, keepTopK]
|
||||
// kept_count: [batch_size]
|
||||
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
|
||||
// scores: [batch_size, num_classes, num_priors]
|
||||
|
||||
for (int b = 0; b < batch_size; b++)
|
||||
{
|
||||
for (auto i : grid_stride_range(kept_count[b]))
|
||||
{
|
||||
auto score_id = kept_indices[b * keepTopK + i];
|
||||
auto c = score_id / num_priors;
|
||||
auto prior_id = score_id % num_priors;
|
||||
|
||||
const auto confidence = scores[b * num_classes * num_priors + score_id];
|
||||
|
||||
index_type bbox_id;
|
||||
if (share_location)
|
||||
{
|
||||
// decoded_bboxes: [batch_size, num_priors, 1, 4]
|
||||
bbox_id = b * num_priors + prior_id;
|
||||
}
|
||||
else
|
||||
{
|
||||
// decoded_bboxes: [batch_size, num_priors, num_classes, 4]
|
||||
bbox_id = (b * num_priors + prior_id) * num_classes + c;
|
||||
}
|
||||
|
||||
vector_type bbox;
|
||||
v_load(bbox, decoded_bboxes_vPtr[bbox_id]);
|
||||
|
||||
auto output_id = atomicAdd(num_detections.get(), 1);
|
||||
output[output_id * 7 + 0] = b;
|
||||
output[output_id * 7 + 1] = c;
|
||||
output[output_id * 7 + 2] = confidence;
|
||||
output[output_id * 7 + 3] = bbox.data[0];
|
||||
output[output_id * 7 + 4] = bbox.data[1];
|
||||
output[output_id * 7 + 5] = bbox.data[2];
|
||||
output[output_id * 7 + 6] = bbox.data[3];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, bool SHARE_LOCATION, bool VARIANCE_ENCODED_IN_TARGET, bool CORNER_TRUE_CENTER_FALSE, bool CLIP_BBOX> static
|
||||
void launch_decode_boxes_kernel(const Stream& stream, Span<T> decoded_bboxes, View<T> locations, View<T> priors,
|
||||
bool transpose_location, bool normalized_bbox,
|
||||
size_type num_loc_classes, index_type background_class_id,
|
||||
float clip_width, float clip_height)
|
||||
{
|
||||
auto kernel = raw::decode_bbox<T, SHARE_LOCATION, VARIANCE_ENCODED_IN_TARGET, CORNER_TRUE_CENTER_FALSE, CLIP_BBOX>;
|
||||
auto policy = make_policy(kernel, decoded_bboxes.size() / 4, 0, stream);
|
||||
launch_kernel(kernel, policy, decoded_bboxes, locations, priors, transpose_location, normalized_bbox, num_loc_classes, background_class_id, clip_width, clip_height);
|
||||
}
|
||||
|
||||
template <class T, unsigned int current, class ...Args> static
|
||||
typename std::enable_if<current == 0, void>
|
||||
::type dispatch_decode_bboxes(int selector, Args&& ...args) {
|
||||
if(selector == 0)
|
||||
launch_decode_boxes_kernel<T, 0, 0, 0, 0>(std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
template <class T, unsigned int current, class ...Args> static
|
||||
typename std::enable_if<current != 0, void>
|
||||
::type dispatch_decode_bboxes(int selector, Args&& ...args) {
|
||||
if(selector == current)
|
||||
launch_decode_boxes_kernel<T,
|
||||
static_cast<bool>(current & 8),
|
||||
static_cast<bool>(current & 4),
|
||||
static_cast<bool>(current & 2),
|
||||
static_cast<bool>(current & 1)>(std::forward<Args>(args)...);
|
||||
else
|
||||
dispatch_decode_bboxes<T, current - 1, Args...>(selector, std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void decode_bboxes(const Stream& stream, Span<T> output, View<T> locations, View<T> priors,
|
||||
std::size_t num_loc_classes,
|
||||
bool share_location, std::size_t background_class_id,
|
||||
bool transpose_location, bool variance_encoded_in_target,
|
||||
bool corner_true_or_center_false, bool normalized_bbox,
|
||||
bool clip_box, float clip_width, float clip_height)
|
||||
{
|
||||
/* `config` combines four kernel template options into one number so that a bit of TMP code can
|
||||
* run through all possible combinations and instantiate the correct template
|
||||
*/
|
||||
unsigned int config = (share_location << 3 | variance_encoded_in_target << 2 | corner_true_or_center_false << 1 | clip_box);
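/* e.g. share_location = true, variance_encoded_in_target = false,
* corner_true_or_center_false = true and clip_box = false pack to config = 0b1010 = 10,
* which selects launch_decode_boxes_kernel<T, true, false, true, false>
*/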
|
||||
dispatch_decode_bboxes<T, 15>(config, stream, output, locations, priors, transpose_location, normalized_bbox, num_loc_classes, background_class_id, clip_width, clip_height);
|
||||
}
|
||||
|
||||
template void decode_bboxes(const Stream&, Span<__half>, View<__half>, View<__half>, std::size_t, bool, std::size_t, bool, bool, bool, bool, bool, float, float);
|
||||
template void decode_bboxes(const Stream&, Span<float>, View<float>, View<float>, std::size_t, bool, std::size_t, bool, bool, bool, bool, bool, float, float);
|
||||
|
||||
template <class T>
|
||||
void findTopK(const Stream& stream, TensorSpan<int> indices, TensorSpan<int> count, TensorView<T> scores, std::size_t background_class_id, float threshold)
|
||||
{
|
||||
// indices: [batch_size, num_classes, classwise_topK]
|
||||
// count: [batch_size, num_classes]
|
||||
// scores: [batch_size, num_classes, num_priors]
|
||||
|
||||
const auto batch_size = indices.get_axis_size(0);
|
||||
CV_Assert(count.get_axis_size(0) == batch_size);
|
||||
CV_Assert(scores.get_axis_size(0) == batch_size);
|
||||
|
||||
const auto num_classes = indices.get_axis_size(1);
|
||||
CV_Assert(count.get_axis_size(1) == num_classes);
|
||||
CV_Assert(scores.get_axis_size(1) == num_classes);
|
||||
|
||||
const auto classwise_topK = indices.get_axis_size(2);
|
||||
const auto num_priors = scores.get_axis_size(2);
|
||||
|
||||
/* each block processes one class from each batch */
|
||||
constexpr auto BLOCK_SIZE = 256;
|
||||
|
||||
dim3 grid_size(num_classes, batch_size);
|
||||
dim3 block_size(BLOCK_SIZE);
|
||||
auto policy = execution_policy(grid_size, block_size, stream);
|
||||
|
||||
auto kernel = raw::findTopK<T, 2048, BLOCK_SIZE>;
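/* the 2048 int bins occupy 2048 * 4 = 8 KB of static shared memory per block, well within
* the 48 KB available per block on all supported devices
*/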
|
||||
launch_kernel(kernel, policy, indices, count, scores, threshold, classwise_topK, num_classes, num_priors, background_class_id);
|
||||
}
|
||||
|
||||
template void findTopK(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<__half>, std::size_t, float);
|
||||
template void findTopK(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<float>, std::size_t, float);
|
||||
|
||||
template <class T>
|
||||
void box_collect(const Stream& stream, TensorSpan<T> collected_bboxes, TensorView<T> decoded_bboxes, TensorView<int> indices, TensorView<int> count, bool share_location, std::size_t background_class_id)
|
||||
{
|
||||
// collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
|
||||
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
|
||||
// indices: [batch_size, num_classes, classwise_topK]
|
||||
// count: [batch_size, num_classes]
|
||||
|
||||
const auto batch_size = collected_bboxes.get_axis_size(0);
|
||||
CV_Assert(decoded_bboxes.get_axis_size(0) == batch_size);
|
||||
CV_Assert(indices.get_axis_size(0) == batch_size);
|
||||
CV_Assert(count.get_axis_size(0) == batch_size);
|
||||
|
||||
const auto num_classes = collected_bboxes.get_axis_size(1);
|
||||
CV_Assert(indices.get_axis_size(1) == num_classes);
|
||||
CV_Assert(count.get_axis_size(1) == num_classes);
|
||||
|
||||
const auto classwise_topK = collected_bboxes.get_axis_size(2);
|
||||
CV_Assert(indices.get_axis_size(2) == classwise_topK);
|
||||
|
||||
const auto num_priors = decoded_bboxes.get_axis_size(1);
|
||||
|
||||
CV_Assert(!share_location || decoded_bboxes.get_axis_size(2) == 1);
|
||||
|
||||
constexpr int BLOCK_SIZE = 256;
|
||||
|
||||
/* each block processes one class from each batch */
|
||||
dim3 grid_size(num_classes, batch_size);
|
||||
dim3 block_size(BLOCK_SIZE);
|
||||
auto policy = execution_policy(grid_size, block_size, stream);
|
||||
|
||||
auto kernel = raw::box_collect<T>;
|
||||
launch_kernel(kernel, policy, collected_bboxes, decoded_bboxes, indices, count, share_location, num_priors, num_classes, classwise_topK, background_class_id);
|
||||
}
|
||||
|
||||
template void box_collect(const Stream&, TensorSpan<float>, TensorView<float>, TensorView<int>, TensorView<int>, bool, std::size_t);
|
||||
template void box_collect(const Stream&, TensorSpan<__half>, TensorView<__half>, TensorView<int>, TensorView<int>, bool, std::size_t);
|
||||
|
||||
template <class T>
|
||||
void blockwise_class_nms(const Stream& stream, TensorSpan<int> indices, TensorSpan<int> count, TensorView<T> collected_bboxes,
|
||||
bool normalized_bbox, std::size_t background_class_id, float nms_threshold)
|
||||
{
|
||||
// indices: [batch_size, num_classes, classwise_topK]
|
||||
// count: [batch_size, num_classes]
|
||||
// collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
|
||||
|
||||
const auto batch_size = indices.get_axis_size(0);
|
||||
CV_Assert(count.get_axis_size(0) == batch_size);
|
||||
CV_Assert(collected_bboxes.get_axis_size(0) == batch_size);
|
||||
|
||||
const auto num_classes = indices.get_axis_size(1);
|
||||
CV_Assert(count.get_axis_size(1) == num_classes);
|
||||
CV_Assert(collected_bboxes.get_axis_size(1) == num_classes);
|
||||
|
||||
const auto classwise_topK = indices.get_axis_size(2);
|
||||
CV_Assert(collected_bboxes.get_axis_size(2) == classwise_topK);
|
||||
|
||||
/* each block processes one class from each batch */
|
||||
auto num_blocks = batch_size * num_classes;
|
||||
auto num_threads = std::max<std::size_t>(std::min<std::size_t>(1024, classwise_topK), 32);
|
||||
|
||||
dim3 grid_size(num_blocks);
|
||||
dim3 block_size(num_threads);
|
||||
auto policy = execution_policy(grid_size, block_size, stream);
|
||||
|
||||
if (normalized_bbox)
|
||||
{
|
||||
auto kernel = raw::blockwise_class_nms<T, true>;
|
||||
launch_kernel(kernel, policy, indices, count, collected_bboxes, num_classes, classwise_topK, background_class_id, nms_threshold);
|
||||
}
|
||||
else
|
||||
{
|
||||
auto kernel = raw::blockwise_class_nms<T, false>;
|
||||
launch_kernel(kernel, policy, indices, count, collected_bboxes, num_classes, classwise_topK, background_class_id, nms_threshold);
|
||||
}
|
||||
}
|
||||
|
||||
template void blockwise_class_nms(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<__half>, bool, std::size_t, float);
|
||||
template void blockwise_class_nms(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<float>, bool, std::size_t, float);
|
||||
|
||||
template <class T>
|
||||
void nms_collect(const Stream& stream, TensorSpan<int> kept_indices, TensorSpan<int> kept_count,
|
||||
TensorView<int> indices, TensorView<int> count, TensorView<T> scores, float threshold, std::size_t background_class_id)
|
||||
{
|
||||
// kept_indices: [batch_size, keepTopK]
|
||||
// kept_count: [batch_size]
|
||||
|
||||
// indices: [batch_size, num_classes, classwise_topK]
|
||||
// count: [batch_size, num_classes]
|
||||
// scores: [batch_size, num_classes, num_priors]
|
||||
|
||||
auto batch_size = kept_indices.get_axis_size(0);
|
||||
CV_Assert(kept_count.get_axis_size(0) == batch_size);
|
||||
CV_Assert(indices.get_axis_size(0) == batch_size);
|
||||
CV_Assert(count.get_axis_size(0) == batch_size);
|
||||
CV_Assert(scores.get_axis_size(0) == batch_size);
|
||||
|
||||
auto keepTopK = kept_indices.get_axis_size(1);
|
||||
|
||||
auto num_classes = indices.get_axis_size(1);
|
||||
CV_Assert(count.get_axis_size(1) == num_classes);
|
||||
CV_Assert(scores.get_axis_size(1) == num_classes);
|
||||
|
||||
auto classwise_topK = indices.get_axis_size(2);
|
||||
auto num_priors = scores.get_axis_size(2);
|
||||
|
||||
auto num_blocks = batch_size;
|
||||
constexpr int BLOCK_SIZE = 1024;
|
||||
|
||||
dim3 grid_size(num_blocks);
|
||||
dim3 block_size(BLOCK_SIZE);
|
||||
auto policy = execution_policy(grid_size, block_size, stream);
|
||||
|
||||
auto kernel = raw::nms_collect<T, 1024, BLOCK_SIZE>;
|
||||
launch_kernel(kernel, policy, kept_indices, kept_count, indices, count, scores, threshold, num_classes, num_priors, classwise_topK, keepTopK, background_class_id);
|
||||
}
|
||||
|
||||
template void nms_collect(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<int>, TensorView<int>, TensorView<__half>, float, std::size_t);
|
||||
template void nms_collect(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<int>, TensorView<int>, TensorView<float>, float, std::size_t);
|
||||
|
||||
template <class T>
|
||||
void consolidate_detections(const Stream& stream, TensorSpan<T> output,
|
||||
TensorView<int> kept_indices, TensorView<int> kept_count,
|
||||
TensorView<T> decoded_bboxes, TensorView<T> scores, bool share_location, DevicePtr<int> num_detections)
|
||||
{
|
||||
// output: [1, 1, batch_size * keepTopK, 7]
|
||||
// kept_indices: [batch_size, keepTopK]
|
||||
// kept_count: [batch_size]
|
||||
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
|
||||
// scores: [batch_size, num_classes, num_priors]
|
||||
|
||||
auto batch_size = kept_indices.get_axis_size(0);
|
||||
CV_Assert(kept_count.get_axis_size(0) == batch_size);
|
||||
CV_Assert(decoded_bboxes.get_axis_size(0) == batch_size);
|
||||
CV_Assert(scores.get_axis_size(0) == batch_size);
|
||||
|
||||
auto keepTopK = kept_indices.get_axis_size(1);
|
||||
|
||||
auto num_classes = scores.get_axis_size(1);
|
||||
auto num_priors = scores.get_axis_size(2);
|
||||
|
||||
CV_Assert(batch_size * keepTopK * 7 == output.size());
|
||||
|
||||
auto kernel = raw::consolidate_detections<T>;
|
||||
auto policy = make_policy(kernel, keepTopK, 0, stream);
|
||||
launch_kernel(kernel, policy, output, kept_indices, kept_count, decoded_bboxes, scores, share_location, batch_size, num_classes, num_priors, keepTopK, num_detections);
|
||||
}
|
||||
|
||||
template void consolidate_detections(const Stream&, TensorSpan<__half>, TensorView<int>, TensorView<int>, TensorView<__half>, TensorView<__half>, bool, DevicePtr<int>);
|
||||
template void consolidate_detections(const Stream&, TensorSpan<float>, TensorView<int>, TensorView<int>, TensorView<float>, TensorView<float>, bool, DevicePtr<int>);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
125
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/eltwise_activation.cu
vendored
Normal file
@ -0,0 +1,125 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "functors.hpp"
|
||||
#include "types.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
|
||||
template <class T, class EltwiseOp, class ActivationOp, std::size_t N>
|
||||
__global__ void eltwise_op_generic_op_vec(Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params eltwise_params, const typename ActivationOp::Params act_params) {
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
auto x_vPtr = vector_type::get_pointer(x.data());
|
||||
auto y_vPtr = vector_type::get_pointer(y.data());
|
||||
|
||||
EltwiseOp eltwise_op(eltwise_params);
|
||||
ActivationOp activation_op(act_params);
|
||||
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
|
||||
vector_type vec_x, vec_y;
|
||||
v_load(vec_x, x_vPtr[i]);
|
||||
v_load(vec_y, y_vPtr[i]);
|
||||
for(int j = 0; j < vec_x.size(); j++)
|
||||
vec_x.data[j] = activation_op(eltwise_op(vec_x.data[j], vec_y.data[j]));
|
||||
v_store(output_vPtr[i], vec_x);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, class EltwiseOp, class ActivationOp, std::size_t N> static
|
||||
void launch_vectorized_eltwise_op_generic_op(const Stream& stream, Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params& eltwise_params, const typename ActivationOp::Params& act_params) {
|
||||
CV_Assert(is_fully_aligned<T>(output, N));
|
||||
CV_Assert(is_fully_aligned<T>(x, N));
|
||||
CV_Assert(is_fully_aligned<T>(y, N));
|
||||
|
||||
auto kernel = raw::eltwise_op_generic_op_vec<T, EltwiseOp, ActivationOp, N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, x, y, eltwise_params, act_params);
|
||||
}
|
||||
|
||||
template <class T, class EltwiseOp, class ActivationOp> static
|
||||
void eltwise_op_generic_op(const Stream& stream, Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params& eltwise_params = {}, const typename ActivationOp::Params& act_params = {}) {
|
||||
CV_Assert(output.size() == x.size());
|
||||
CV_Assert(output.size() == y.size());
|
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
|
||||
launch_vectorized_eltwise_op_generic_op<T, EltwiseOp, ActivationOp, 4>(stream, output, x, y, eltwise_params, act_params);
|
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
|
||||
launch_vectorized_eltwise_op_generic_op<T, EltwiseOp, ActivationOp, 2>(stream, output, x, y, eltwise_params, act_params);
|
||||
} else {
|
||||
launch_vectorized_eltwise_op_generic_op<T, EltwiseOp, ActivationOp, 1>(stream, output, x, y, eltwise_params, act_params);
|
||||
}
|
||||
}
|
||||
|
||||
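/* the wrappers below fuse an elementwise sum with an activation into a single kernel launch
* by pairing SumFunctor with the corresponding activation functor, e.g. eltwise_sum_2_relu
* computes relu(x + y) with negative slope `slope` for every element
*/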
template <class T>
|
||||
void eltwise_sum_2_relu(const Stream& stream, Span<T> output, View<T> x, View<T> y, T slope) {
|
||||
eltwise_op_generic_op<T, SumFunctor<T>, ReLUFunctor<T>>(stream, output, x, y, {}, {slope});
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_sum_2_clipped_relu(const Stream& stream, Span<T> output, View<T> x, View<T> y, T floor, T ceiling) {
|
||||
CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
|
||||
eltwise_op_generic_op<T, SumFunctor<T>, ClippedReLUFunctor<T>>(stream, output, x, y, {}, {floor, ceiling});
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_sum_2_tanh(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
|
||||
eltwise_op_generic_op<T, SumFunctor<T>, TanHFunctor<T>>(stream, output, x, y);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_sum_2_swish(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
|
||||
eltwise_op_generic_op<T, SumFunctor<T>, SwishFunctor<T>>(stream, output, x, y);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_sum_2_mish(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
|
||||
eltwise_op_generic_op<T, SumFunctor<T>, MishFunctor<T>>(stream, output, x, y);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_sum_2_sigmoid(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
|
||||
eltwise_op_generic_op<T, SumFunctor<T>, SigmoidFunctor<T>>(stream, output, x, y);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_sum_2_power(const Stream& stream, Span<T> output, View<T> x, View<T> y, T exp, T scale, T shift) {
|
||||
eltwise_op_generic_op<T, SumFunctor<T>, PowerFunctor<T>>(stream, output, x, y, {}, {exp, scale, shift});
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void eltwise_sum_2_relu<__half>(const Stream&, Span<__half>, View<__half>, View<__half>, __half);
|
||||
template void eltwise_sum_2_clipped_relu<__half>(const Stream&, Span<__half>, View<__half>, View<__half>, __half, __half);
|
||||
template void eltwise_sum_2_tanh<__half>(const Stream&, Span<__half>, View<__half>, View<__half>);
|
||||
template void eltwise_sum_2_swish<__half>(const Stream&, Span<__half>, View<__half>, View<__half>);
|
||||
template void eltwise_sum_2_mish<__half>(const Stream&, Span<__half>, View<__half>, View<__half>);
|
||||
template void eltwise_sum_2_sigmoid<__half>(const Stream&, Span<__half>, View<__half>, View<__half>);
|
||||
template void eltwise_sum_2_power<__half>(const Stream&, Span<__half>, View<__half>, View<__half>, __half, __half, __half);
|
||||
#endif
|
||||
|
||||
template void eltwise_sum_2_relu<float>(const Stream&, Span<float>, View<float>, View<float>, float);
|
||||
template void eltwise_sum_2_clipped_relu<float>(const Stream&, Span<float>, View<float>, View<float>, float, float);
|
||||
template void eltwise_sum_2_tanh<float>(const Stream&, Span<float>, View<float>, View<float>);
|
||||
template void eltwise_sum_2_swish<float>(const Stream&, Span<float>, View<float>, View<float>);
|
||||
template void eltwise_sum_2_mish<float>(const Stream&, Span<float>, View<float>, View<float>);
|
||||
template void eltwise_sum_2_sigmoid<float>(const Stream&, Span<float>, View<float>, View<float>);
|
||||
template void eltwise_sum_2_power<float>(const Stream&, Span<float>, View<float>, View<float>, float, float, float);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
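Taken together, the kernels in this file compute activation(eltwise(x[i], y[i])) per element, so eltwise_sum_2_relu, for example, produces a leaky ReLU of the element-wise sum. A scalar host-side sketch of that contract (hypothetical reference function, not part of the OpenCV sources):

#include <cstddef>

// Scalar reference for the fused sum + ReLU kernel above: out[i] = relu(x[i] + y[i]).
// Illustration only; the real kernel is vectorized and launched on a CUDA stream.
static void eltwise_sum_2_relu_reference(float* out, const float* x, const float* y,
                                         std::size_t n, float slope)
{
    for (std::size_t i = 0; i < n; i++) {
        float s = x[i] + y[i];               // SumFunctor
        out[i] = s >= 0.f ? s : slope * s;   // ReLUFunctor with negative-side slope
    }
}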
334
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/eltwise_ops.cu
vendored
Normal file
@ -0,0 +1,334 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "array.hpp"
|
||||
#include "functors.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
#include "kernel_dispatcher.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, class EltwiseOp, std::size_t N>
|
||||
__global__ void eltwise_op_vec(Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params params) {
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
auto x_vPtr = vector_type::get_pointer(x.data());
|
||||
auto y_vPtr = vector_type::get_pointer(y.data());
|
||||
|
||||
EltwiseOp eltwise_op(params);
|
||||
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
|
||||
vector_type vec_x, vec_y;
|
||||
v_load(vec_x, x_vPtr[i]);
|
||||
v_load(vec_y, y_vPtr[i]);
|
||||
for (int j = 0; j < vector_type::size(); j++)
|
||||
vec_x.data[j] = eltwise_op(vec_x.data[j], vec_y.data[j]);
|
||||
v_store(output_vPtr[i], vec_x);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, class EltwiseOp, std::size_t Rank>
|
||||
__global__ void eltwise_op_bcast(
|
||||
Span<T> output, array<size_type, Rank> out_strides,
|
||||
View<T> x, array<size_type, Rank> x_strides, array<bool, Rank> x_bcast,
|
||||
View<T> y, array<size_type, Rank> y_strides, array<bool, Rank> y_bcast,
|
||||
const typename EltwiseOp::Params params) {
|
||||
EltwiseOp eltwise_op(params);
|
||||
|
||||
for (auto i : grid_stride_range(output.size())) {
|
||||
index_type out_index = i / out_strides[0];
|
||||
index_type x_index = x_bcast[0] ? 0 : out_index * x_strides[0];
|
||||
index_type y_index = y_bcast[0] ? 0 : out_index * y_strides[0];
|
||||
|
||||
for (int j = 1; j < Rank; j++)
|
||||
{
|
||||
out_index = (i % out_strides[j - 1]) / out_strides[j];
|
||||
if (!x_bcast[j])
|
||||
x_index += out_index * x_strides[j];
|
||||
if (!y_bcast[j])
|
||||
y_index += out_index * y_strides[j];
|
||||
}
|
||||
|
||||
output[i] = eltwise_op(x[x_index], y[y_index]);
|
||||
}
|
||||
}
|
||||
}
|
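The broadcasting kernel above peels the flat output index apart with the row-major output strides and rebuilds flat offsets into x and y, skipping any axis on which an input is broadcast. A host-side sketch of that index arithmetic (hypothetical helper, assuming the row-major strides computed later in this file):

#include <cstddef>
#include <vector>

// Flat input offset for a possibly-broadcast tensor, mirroring the loop in
// raw::eltwise_op_bcast: bcast[j] is true when the input has size 1 along axis j.
static std::size_t broadcast_input_offset(std::size_t flat,
                                          const std::vector<std::size_t>& out_strides,
                                          const std::vector<std::size_t>& in_strides,
                                          const std::vector<bool>& bcast)
{
    std::size_t out_index = flat / out_strides[0];
    std::size_t in_index = bcast[0] ? 0 : out_index * in_strides[0];
    for (std::size_t j = 1; j < out_strides.size(); j++) {
        out_index = (flat % out_strides[j - 1]) / out_strides[j];
        if (!bcast[j])
            in_index += out_index * in_strides[j];
    }
    return in_index;
}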
||||
|
||||
template <class T, class EltwiseOp, std::size_t N> static
|
||||
void launch_vectorized_eltwise_op(const Stream& stream, Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params& params) {
|
||||
CV_Assert(x.size() == y.size());
|
||||
CV_Assert(x.size() == output.size());
|
||||
CV_Assert(is_fully_aligned<T>(output, N));
|
||||
CV_Assert(is_fully_aligned<T>(x, N));
|
||||
CV_Assert(is_fully_aligned<T>(y, N));
|
||||
|
||||
auto kernel = raw::eltwise_op_vec<T, EltwiseOp, N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, x, y, params);
|
||||
}
|
||||
|
||||
template <class T, class EltwiseOp, std::size_t Rank> static
|
||||
void launch_eltwise_op_bcast(
|
||||
const Stream& stream,
|
||||
Span<T> output, const std::vector<std::size_t>& outStride,
|
||||
View<T> x, const std::vector<std::size_t>& inStride1, const std::vector<int>& inBcast1,
|
||||
View<T> y, const std::vector<std::size_t>& inStride2, const std::vector<int>& inBcast2,
|
||||
const typename EltwiseOp::Params& params)
|
||||
{
|
||||
CV_Assert(outStride.size() == Rank);
|
||||
CV_Assert(inStride1.size() == Rank);
|
||||
CV_Assert(inStride2.size() == Rank);
|
||||
CV_Assert(inBcast1.size() == Rank);
|
||||
CV_Assert(inBcast2.size() == Rank);
|
||||
|
||||
array<size_type, Rank> outStride_k, inStride1_k, inStride2_k;
|
||||
outStride_k.assign(std::begin(outStride), std::end(outStride));
|
||||
inStride1_k.assign(std::begin(inStride1), std::end(inStride1));
|
||||
inStride2_k.assign(std::begin(inStride2), std::end(inStride2));
|
||||
|
||||
array<bool, Rank> inBcast1_k, inBcast2_k;
|
||||
inBcast1_k.assign(std::begin(inBcast1), std::end(inBcast1));
|
||||
inBcast2_k.assign(std::begin(inBcast2), std::end(inBcast2));
|
||||
|
||||
auto kernel = raw::eltwise_op_bcast<T, EltwiseOp, Rank>;
|
||||
auto policy = make_policy(kernel, output.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, output, outStride_k, x, inStride1_k, inBcast1_k, y, inStride2_k, inBcast2_k, params);
|
||||
}
|
||||
|
||||
GENERATE_KERNEL_DISPATCHER_2TP(eltwise_op_bcast_dispatcher, launch_eltwise_op_bcast);
|
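GENERATE_KERNEL_DISPATCHER_2TP (defined in kernel_dispatcher.hpp) presumably generates a dispatcher that maps the runtime tensor rank onto the compile-time Rank parameter of launch_eltwise_op_bcast. A hand-rolled sketch of that idea (C++17, illustration only; the real macro expansion may differ):

#include <cstddef>
#include <utility>

// Try each rank from SelectedRank up to MaxRank and call the matching
// compile-time specialization; mirrors what the generated dispatcher has to do.
template <class T, class EltwiseOp, std::size_t SelectedRank, std::size_t MaxRank, class ...Args>
void eltwise_op_bcast_dispatcher_sketch(std::size_t rank, Args&& ...args)
{
    if (rank == SelectedRank) {
        launch_eltwise_op_bcast<T, EltwiseOp, SelectedRank>(std::forward<Args>(args)...);
        return;
    }
    if constexpr (SelectedRank < MaxRank)
        eltwise_op_bcast_dispatcher_sketch<T, EltwiseOp, SelectedRank + 1, MaxRank>(rank, std::forward<Args>(args)...);
}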
||||
|
||||
template <class T, class EltwiseOp> static
|
||||
void eltwise_op(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y, const typename EltwiseOp::Params& params = {}) {
|
||||
if (is_shape_same(output, x) && is_shape_same(output, y))
|
||||
{
|
||||
/* no broadcasting; use fast path */
|
||||
CV_Assert(x.size() == y.size());
|
||||
CV_Assert(x.size() == output.size());
|
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
|
||||
launch_vectorized_eltwise_op<T, EltwiseOp, 4>(stream, output, x, y, params);
|
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
|
||||
launch_vectorized_eltwise_op<T, EltwiseOp, 2>(stream, output, x, y, params);
|
||||
} else {
|
||||
launch_vectorized_eltwise_op<T, EltwiseOp, 1>(stream, output, x, y, params);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
CV_Assert(is_shape_compatible(output, x));
|
||||
CV_Assert(is_shape_compatible(output, y));
|
||||
|
||||
/* matching singleton axes in both input tensors can be eliminated
|
||||
*
|
||||
* Reasoning:
|
||||
* ----------
|
||||
* Singleton axes do not contribute towards address calculation. They are redundant
|
||||
* unless there is broadcasting. If both input tensors have singleton axis at a
|
||||
* specified position, there is no broadcasting on that axis.
|
||||
*
|
||||
* Example:
|
||||
* ---------
|
||||
* x: [1, 256, 32, 32] -> [256, 32, 32]
|
||||
* y: [1, 256, 1, 1] -> [256, 1, 1]
|
||||
*/
|
||||
for (int r = 0; r < output.rank(); r++)
|
||||
{
|
||||
while (x.get_axis_size(r) == 1 && y.get_axis_size(r) == 1) {
|
||||
CV_Assert(output.get_axis_size(r) == 1);
|
||||
|
||||
x.squeeze(r);
|
||||
y.squeeze(r);
|
||||
output.squeeze(r);
|
||||
}
|
||||
}
|
||||
|
||||
auto inShape1 = x.shape_as_vector();
|
||||
auto inShape2 = y.shape_as_vector();
|
||||
auto outShape = output.shape_as_vector();
|
||||
|
||||
/* contiguous axes that do not broadcast can be merged into one axis
|
||||
*
|
||||
* Example:
|
||||
* ---------
|
||||
* x: [32, 8, 8] -> [32, 64]
|
||||
* y: [1, 8, 8] -> [1, 64]
|
||||
*/
|
||||
for (int i = 0; i < inShape1.size(); i++) {
|
||||
/* check if axis `i` requires any broadcasting */
|
||||
if (inShape1[i] == inShape2[i]) {
|
||||
/* loop invariant: `i` is the first axis in the contiguous axis sequence */
|
||||
|
||||
int j = i + 1; /* `j` is the axis which we will attempt to merge */
|
||||
while (j < inShape1.size() && inShape1[j] == inShape2[j]) {
|
||||
CV_Assert(outShape[j] == inShape1[j]);
|
||||
|
||||
/* `j` axis is also used fully; merge `i` and `j` */
|
||||
auto new_size = inShape1[i] * inShape1[j];
|
||||
inShape1[i] = new_size;
|
||||
inShape2[i] = new_size;
|
||||
|
||||
/* delete axis `j` */
|
||||
inShape1.erase(std::begin(inShape1) + j);
|
||||
inShape2.erase(std::begin(inShape2) + j);
|
||||
outShape.erase(std::begin(outShape) + j);
|
||||
|
||||
/* optimizations should not break the invariants */
|
||||
CV_Assert(inShape1.size() == outShape.size());
|
||||
CV_Assert(inShape2.size() == outShape.size());
|
||||
CV_Assert(inShape1[i] == outShape[i]);
|
||||
CV_Assert(inShape2[i] == outShape[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* contiguous broadcasting axes on the same tensor can be merged into one axis
|
||||
*
|
||||
* Example:
|
||||
* ---------
|
||||
* x: [256, 8, 8] -> [256, 64]
|
||||
* y: [256, 1, 1] -> [256, 1]
|
||||
*/
|
||||
for (int i = 0; i < inShape1.size(); i++) {
|
||||
/* check if axis `i` requires any broadcasting in tensor 1 */
|
||||
if (inShape1[i] == 1 && inShape2[i] != 1) {
|
||||
/* loop invariant: `i` is the first axis in the contiguous axis sequence */
|
||||
|
||||
int j = i + 1; /* `j` is the axis which we will attempt to merge */
|
||||
while (j < inShape1.size() && inShape1[j] == 1 && inShape2[j] != 1) {
|
||||
CV_Assert(outShape[j] == inShape2[j]);
|
||||
|
||||
/* `j` axis is also used fully; merge `i` and `j` */
|
||||
inShape1[i] = 1;
|
||||
inShape2[i] = inShape2[i] * inShape2[j];
|
||||
outShape[i] = inShape2[i];
|
||||
|
||||
/* delete axis `j` */
|
||||
inShape1.erase(std::begin(inShape1) + j);
|
||||
inShape2.erase(std::begin(inShape2) + j);
|
||||
outShape.erase(std::begin(outShape) + j);
|
||||
|
||||
/* optimizations should not break the invariants */
|
||||
CV_Assert(inShape1.size() == outShape.size());
|
||||
CV_Assert(inShape2.size() == outShape.size());
|
||||
CV_Assert(inShape1[i] == 1);
|
||||
CV_Assert(inShape2[i] == outShape[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/* check if axis `i` requires any broadcasting in tensor 2 */
|
||||
if (inShape1[i] != 1 && inShape2[i] == 1) {
|
||||
/* loop invariant: `i` is the first axis in the contiguous axis sequence */
|
||||
|
||||
int j = i + 1; /* `j` is the axis which we will attempt to merge */
|
||||
while (j < inShape1.size() && inShape1[j] != 1 && inShape2[j] == 1) {
|
||||
CV_Assert(outShape[j] == inShape1[j]);
|
||||
|
||||
/* `j` axis is also used fully; merge `i` and `j` */
|
||||
inShape1[i] = inShape1[i] * inShape1[j];
|
||||
inShape2[i] = 1;
|
||||
outShape[i] = inShape1[i];
|
||||
|
||||
/* delete axis `j` */
|
||||
inShape1.erase(std::begin(inShape1) + j);
|
||||
inShape2.erase(std::begin(inShape2) + j);
|
||||
outShape.erase(std::begin(outShape) + j);
|
||||
|
||||
/* optimizations should not break the invariants */
|
||||
CV_Assert(inShape1.size() == outShape.size());
|
||||
CV_Assert(inShape2.size() == outShape.size());
|
||||
CV_Assert(inShape1[i] == outShape[i]);
|
||||
CV_Assert(inShape2[i] == 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto rank = outShape.size();
|
||||
|
||||
std::vector<std::size_t> inStride1(rank), inStride2(rank), outStride(rank);
|
||||
inStride1.back() = 1;
|
||||
inStride2.back() = 1;
|
||||
outStride.back() = 1;
|
||||
/* garbage, ..., garbage, 1 */
|
||||
|
||||
std::copy(std::begin(inShape1) + 1, std::end(inShape1), std::begin(inStride1));
|
||||
std::copy(std::begin(inShape2) + 1, std::end(inShape2), std::begin(inStride2));
|
||||
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
|
||||
/* dim[0], dim[1], ..., dim[-1], 1 */
|
||||
|
||||
std::partial_sum(inStride1.rbegin(), inStride1.rend(), inStride1.rbegin(), std::multiplies<std::size_t>());
|
||||
std::partial_sum(inStride2.rbegin(), inStride2.rend(), inStride2.rbegin(), std::multiplies<std::size_t>());
|
||||
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
|
||||
/* stride[0], stride[1], ..., stride[-2], 1 */
|
||||
|
||||
std::vector<int> inBcast1(rank), inBcast2(rank);
|
||||
std::transform(std::begin(inShape1), std::end(inShape1), std::begin(inBcast1), [](std::size_t sz) { return sz == 1; });
|
||||
std::transform(std::begin(inShape2), std::end(inShape2), std::begin(inBcast2), [](std::size_t sz) { return sz == 1; });
|
||||
|
||||
CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
|
||||
eltwise_op_bcast_dispatcher<T, EltwiseOp, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, x, inStride1, inBcast1, y, inStride2, inBcast2, params);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_max_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
|
||||
eltwise_op<T, MaxFunctor<T>>(stream, output, x, y);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_min_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
|
||||
eltwise_op<T, MinFunctor<T>>(stream, output, x, y);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_sum_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
|
||||
eltwise_op<T, SumFunctor<T>>(stream, output, x, y);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_sum_coeff_2(const Stream& stream, TensorSpan<T> output, T coeff_x, TensorView<T> x, T coeff_y, TensorView<T> y) {
|
||||
eltwise_op<T, ScaledSumFunctor<T>>(stream, output, x, y, {coeff_x, coeff_y});
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_prod_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
|
||||
eltwise_op<T, ProductFunctor<T>>(stream, output, x, y);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_div_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
|
||||
eltwise_op<T, DivFunctor<T>>(stream, output, x, y);
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void eltwise_div_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
|
||||
template void eltwise_prod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
|
||||
template void eltwise_sum_coeff_2(const Stream&, TensorSpan<__half>, __half, TensorView<__half>, __half, TensorView<__half>);
|
||||
template void eltwise_sum_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
|
||||
template void eltwise_max_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
|
||||
template void eltwise_min_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
|
||||
#endif
|
||||
template void eltwise_div_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
|
||||
template void eltwise_prod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
|
||||
template void eltwise_sum_coeff_2(const Stream&, TensorSpan<float>, float, TensorView<float>, float, TensorView<float>);
|
||||
template void eltwise_sum_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
|
||||
template void eltwise_max_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
|
||||
template void eltwise_min_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
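The stride setup inside eltwise_op is worth spelling out: seed the last stride with 1, copy shape[1..] over the front of the stride array, then take a reverse running product, which yields ordinary row-major strides. A host-side check of that recipe (plain C++, illustration only):

#include <algorithm>
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

// Row-major strides computed exactly the way eltwise_op does it above.
// For shape {2, 256, 8, 8} this returns {16384, 64, 8, 1}.
static std::vector<std::size_t> row_major_strides(const std::vector<std::size_t>& shape)
{
    std::vector<std::size_t> stride(shape.size());
    stride.back() = 1;   // the copy below leaves the last element untouched
    std::copy(std::begin(shape) + 1, std::end(shape), std::begin(stride));
    std::partial_sum(stride.rbegin(), stride.rend(), stride.rbegin(), std::multiplies<std::size_t>());
    return stride;
}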
81
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/execution.hpp
vendored
Normal file
@ -0,0 +1,81 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_EXECUTION_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA_EXECUTION_HPP
|
||||
|
||||
#include "../cuda4dnn/csl/error.hpp"
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
|
||||
|
||||
struct execution_policy {
|
||||
execution_policy(dim3 grid_size, dim3 block_size)
|
||||
: grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ 0 } { }
|
||||
|
||||
execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem)
|
||||
: grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ nullptr } { }
|
||||
|
||||
execution_policy(dim3 grid_size, dim3 block_size, const Stream& strm)
|
||||
: grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ strm.get() } { }
|
||||
|
||||
execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem, const Stream& strm)
|
||||
: grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ strm.get() } { }
|
||||
|
||||
dim3 grid;
|
||||
dim3 block;
|
||||
std::size_t sharedMem;
|
||||
cudaStream_t stream;
|
||||
};
|
||||
|
||||
/* this overload shouldn't be necessary; we should always provide a bound on the number of threads */
|
||||
/*
|
||||
template <class Kernel> inline
|
||||
execution_policy make_policy(Kernel kernel, std::size_t sharedMem = 0, const Stream& stream = 0) {
|
||||
int grid_size, block_size;
|
||||
CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem));
|
||||
return execution_policy(grid_size, block_size, sharedMem, stream);
|
||||
}*/
|
||||
|
||||
template <class Kernel> inline
|
||||
execution_policy make_policy(Kernel kernel, std::size_t max_threads, std::size_t sharedMem = 0, const Stream& stream = 0) {
|
||||
CV_Assert(max_threads > 0);
|
||||
|
||||
int grid_size = 0, block_size = 0;
|
||||
CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem));
|
||||
if (grid_size * block_size > max_threads) {
|
||||
grid_size = (max_threads + block_size - 1) / block_size;
|
||||
if (block_size > max_threads)
|
||||
block_size = max_threads;
|
||||
}
|
||||
|
||||
CV_Assert(grid_size >= 1 && block_size >= 1);
|
||||
return execution_policy(grid_size, block_size, sharedMem, stream);
|
||||
}
|
||||
|
||||
template <class Kernel, typename ...Args> inline
|
||||
void launch_kernel(Kernel kernel, Args ...args) {
|
||||
auto policy = make_policy(kernel);
|
||||
kernel <<<policy.grid, policy.block>>> (args...);
|
||||
}
|
||||
|
||||
template <class Kernel, typename ...Args> inline
|
||||
void launch_kernel(Kernel kernel, dim3 grid, dim3 block, Args ...args) {
|
||||
kernel <<<grid, block>>> (args...);
|
||||
}
|
||||
|
||||
template <class Kernel, typename ...Args> inline
|
||||
void launch_kernel(Kernel kernel, execution_policy policy, Args ...args) {
|
||||
kernel <<<policy.grid, policy.block, policy.sharedMem, policy.stream>>> (args...);
|
||||
}
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::csl */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_EXECUTION_HPP */
|
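In short, every kernel in this directory is launched the same way: ask cudaOccupancyMaxPotentialBlockSize for a block size, cap grid_size * block_size at the number of work items (for 4096 items and a suggested block of 256, the grid shrinks to (4096 + 255) / 256 = 16 blocks), and hand the resulting execution_policy to launch_kernel. A condensed sketch of that pattern (hypothetical kernel name; the helpers themselves are the real ones declared above):

// Hypothetical wrapper showing the launch idiom used by the .cu files above:
// one thread per N-wide work item, stream supplied by the caller.
template <class T, std::size_t N>
void launch_example(const Stream& stream, Span<T> data, T factor)
{
    auto kernel = raw::example_grid_stride_kernel<T, N>;           // hypothetical __global__ function
    auto policy = make_policy(kernel, data.size() / N, 0, stream); // max_threads, sharedMem, stream
    launch_kernel(kernel, policy, data, factor);
}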
98
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/fill_copy.cu
vendored
Normal file
@ -0,0 +1,98 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, std::size_t N>
|
||||
__global__ void fill_vec(Span<T> output, T value) {
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
|
||||
vector_type vec;
|
||||
for (int j = 0; j < vector_type::size(); j++)
|
||||
vec.data[j] = value;
|
||||
v_store(output_vPtr[i], vec);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t N>
|
||||
__global__ void copy_vec(Span<T> output, View<T> input) {
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
auto input_vPtr = vector_type::get_pointer(input.data());
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
|
||||
vector_type vec;
|
||||
v_load(vec, input_vPtr[i]);
|
||||
v_store(output_vPtr[i], vec);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t N> static
|
||||
void launch_vectorized_fill(const Stream& stream, Span<T> output, T value) {
|
||||
CV_Assert(is_fully_aligned<T>(output, N));
|
||||
|
||||
auto kernel = raw::fill_vec<T, N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, value);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void fill(const Stream& stream, Span<T> output, T value) {
|
||||
if (is_fully_aligned<T>(output, 4)) {
|
||||
launch_vectorized_fill<T, 4>(stream, output, value);
|
||||
} else if (is_fully_aligned<T>(output, 2)) {
|
||||
launch_vectorized_fill<T, 2>(stream, output, value);
|
||||
} else {
|
||||
launch_vectorized_fill<T, 1>(stream, output, value);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void fill(const Stream&, Span<__half>, __half);
|
||||
#endif
|
||||
template void fill(const Stream&, Span<float>, float);
|
||||
template void fill(const Stream&, Span<int>, int);
|
||||
|
||||
template <class T, std::size_t N> static
|
||||
void launch_vectorized_copy(const Stream& stream, Span<T> output, View<T> input) {
|
||||
CV_Assert(is_fully_aligned<T>(output, N));
|
||||
CV_Assert(is_fully_aligned<T>(input, N));
|
||||
|
||||
auto kernel = raw::copy_vec<T, N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, input);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void copy(const Stream& stream, Span<T> output, View<T> input) {
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
|
||||
launch_vectorized_copy<T, 4>(stream, output, input);
|
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
|
||||
launch_vectorized_copy<T, 2>(stream, output, input);
|
||||
} else {
|
||||
launch_vectorized_copy<T, 1>(stream, output, input);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void copy(const Stream&, Span<__half>, View<__half>);
|
||||
#endif
|
||||
template void copy(const Stream&, Span<float>, View<float>);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
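The fill/copy dispatch above picks the widest vector width the buffers allow. Presumably, is_fully_aligned<T>(span, N) requires both a pointer aligned to N elements and a length divisible by N; a plain sketch of that criterion (illustration only, not the actual implementation in vector_traits.hpp):

#include <cstddef>
#include <cstdint>

// Presumed meaning of the alignment check that gates the N = 4 / 2 / 1 paths above.
template <class T>
static bool fully_aligned_sketch(const T* ptr, std::size_t count, std::size_t N)
{
    const bool address_ok = reinterpret_cast<std::uintptr_t>(ptr) % (N * sizeof(T)) == 0;
    const bool count_ok = count % N == 0;
    return address_ok && count_ok;   // otherwise fall back to a narrower vector or the scalar path
}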
102
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/fp_conversion.cu
vendored
Normal file
@ -0,0 +1,102 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <std::size_t N>
|
||||
__global__ void fp32_to_fp16(Span<__half> output, View<float> input) {
|
||||
using output_vector_type = get_vector_type_t<__half, N>;
|
||||
using input_vector_type = get_vector_type_t<float, N>;
|
||||
|
||||
auto output_vPtr = output_vector_type::get_pointer(output.data());
|
||||
auto input_vPtr = input_vector_type::get_pointer(input.data());
|
||||
|
||||
for (auto i : grid_stride_range(output.size() / output_vector_type::size())) {
|
||||
input_vector_type in_vec;
|
||||
v_load(in_vec, input_vPtr[i]);
|
||||
|
||||
output_vector_type out_vec;
|
||||
for (int j = 0; j < output_vector_type::size(); j++)
|
||||
out_vec.data[j] = __float2half(in_vec.data[j]);
|
||||
|
||||
v_store(output_vPtr[i], out_vec);
|
||||
}
|
||||
}
|
||||
|
||||
template <std::size_t N>
|
||||
__global__ void fp16_to_fp32(Span<float> output, View<__half> input) {
|
||||
using output_vector_type = get_vector_type_t<float, N>;
|
||||
using input_vector_type = get_vector_type_t<__half, N>;
|
||||
|
||||
auto output_vPtr = output_vector_type::get_pointer(output.data());
|
||||
auto input_vPtr = input_vector_type::get_pointer(input.data());
|
||||
|
||||
for (auto i : grid_stride_range(output.size() / output_vector_type::size())) {
|
||||
input_vector_type in_vec;
|
||||
v_load(in_vec, input_vPtr[i]);
|
||||
|
||||
output_vector_type out_vec;
|
||||
for (int j = 0; j < output_vector_type::size(); j++)
|
||||
out_vec.data[j] = __half2float(in_vec.data[j]);
|
||||
|
||||
v_store(output_vPtr[i], out_vec);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <std::size_t N> static
|
||||
void launch_vectorized_fp32_to_fp16(const Stream& stream, Span<__half> output, View<float> input) {
|
||||
CV_Assert(is_fully_aligned<__half>(output, N));
|
||||
CV_Assert(is_fully_aligned<float>(input, N));
|
||||
|
||||
auto kernel = raw::fp32_to_fp16<N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, input);
|
||||
}
|
||||
|
||||
void fp32_to_fp16(const Stream& stream, Span<__half> output, View<float> input) {
|
||||
if (is_fully_aligned<__half>(output, 4) && is_fully_aligned<float>(input, 4)) {
|
||||
launch_vectorized_fp32_to_fp16<4>(stream, output, input);
|
||||
} else if (is_fully_aligned<__half>(output, 2) && is_fully_aligned<float>(input, 2)) {
|
||||
launch_vectorized_fp32_to_fp16<2>(stream, output, input);
|
||||
} else {
|
||||
launch_vectorized_fp32_to_fp16<1>(stream, output, input);
|
||||
}
|
||||
}
|
||||
|
||||
template <std::size_t N> static
|
||||
void launch_vectorized_fp16_to_fp32(const Stream& stream, Span<float> output, View<__half> input) {
|
||||
CV_Assert(is_fully_aligned<float>(output, N));
|
||||
CV_Assert(is_fully_aligned<__half>(input, N));
|
||||
|
||||
auto kernel = raw::fp16_to_fp32<N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, input);
|
||||
}
|
||||
|
||||
void fp16_to_fp32(const Stream& stream, Span<float> output, View<__half> input) {
|
||||
if (is_fully_aligned<float>(output, 4) && is_fully_aligned<__half>(input, 4)) {
|
||||
launch_vectorized_fp16_to_fp32<4>(stream, output, input);
|
||||
} else if (is_fully_aligned<float>(output, 2) && is_fully_aligned<__half>(input, 2)) {
|
||||
launch_vectorized_fp16_to_fp32<2>(stream, output, input);
|
||||
} else {
|
||||
launch_vectorized_fp16_to_fp32<1>(stream, output, input);
|
||||
}
|
||||
}
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
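Stripped of the vector loads and stores, the conversion kernels reduce to one CUDA intrinsic per element. An unvectorized sketch of raw::fp32_to_fp16 (illustration only; the real kernel uses grid_stride_range and N-wide vectors):

#include <cstddef>
#include <cuda_runtime.h>
#include <cuda_fp16.h>

// Plain grid-stride version of the fp32 -> fp16 conversion: __float2half per element.
__global__ void fp32_to_fp16_scalar(__half* out, const float* in, std::size_t n)
{
    for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += std::size_t(gridDim.x) * blockDim.x)
        out[i] = __float2half(in[i]);
}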
334
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/functors.hpp
vendored
Normal file
@ -0,0 +1,334 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include "math.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/nvcc_defs.hpp"
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
template <class T>
|
||||
struct IdentityFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE IdentityFunctor() { }
|
||||
CUDA4DNN_DEVICE IdentityFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
return value;
|
||||
};
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct ReLUFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() : slope(0) { }
|
||||
CUDA4DNN_HOST_DEVICE Params(T slope_) : slope(slope_) { }
|
||||
T slope;
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE ReLUFunctor() : ReLUFunctor(Params{}) { }
|
||||
CUDA4DNN_DEVICE ReLUFunctor(const Params& params) : slope(params.slope) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::log1pexp;
|
||||
return value >= T(0) ? value : slope * value;
|
||||
}
|
||||
|
||||
T slope;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct ClippedReLUFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() : floor(0), ceiling(6) { }
|
||||
CUDA4DNN_HOST_DEVICE Params(T floor_, T ceiling_) : floor(floor_), ceiling(ceiling_) { }
|
||||
T floor, ceiling;
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE ClippedReLUFunctor() : ClippedReLUFunctor(Params{}) { }
|
||||
CUDA4DNN_DEVICE ClippedReLUFunctor(const Params& params) : floor{params.floor}, ceiling{params.ceiling} { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::clamp;
|
||||
return clamp(value, floor, ceiling);
|
||||
}
|
||||
|
||||
T floor, ceiling;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct TanHFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE TanHFunctor() { }
|
||||
CUDA4DNN_DEVICE TanHFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::tanh;
|
||||
return tanh(value);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct SwishFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE SwishFunctor() { }
|
||||
CUDA4DNN_DEVICE SwishFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
// f(x) = x * sigmoid(x)
|
||||
using csl::device::fast_divide;
|
||||
using csl::device::fast_exp;
|
||||
return fast_divide(value, static_cast<T>(1) + fast_exp(-value));
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct MishFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE MishFunctor() { }
|
||||
CUDA4DNN_DEVICE MishFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::tanh;
|
||||
using csl::device::log1pexp;
|
||||
return value * tanh(log1pexp(value));
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct MishFunctor<float> {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE MishFunctor() { }
|
||||
CUDA4DNN_DEVICE MishFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE float operator()(float value) {
|
||||
// f(x) = x * tanh(log1pexp(x));
|
||||
using csl::device::fast_divide;
|
||||
using csl::device::fast_exp;
|
||||
|
||||
auto e = fast_exp(value);
|
||||
auto n = e * e + 2 * e;
|
||||
if (value <= -0.6f)
|
||||
return value * fast_divide(n, n + 2);
|
||||
return value - 2 * fast_divide(value, n + 2);
|
||||
}
|
||||
};
|
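The float specialization above relies on the identity tanh(log1p(e^x)) = ((1 + e^x)^2 - 1) / ((1 + e^x)^2 + 1) = n / (n + 2) with n = e^(2x) + 2e^x, so mish(x) = x * n / (n + 2); for larger inputs this is rewritten as x - 2x / (n + 2), and the direct form is kept for x <= -0.6, presumably to avoid cancellation. A quick host-side check of the identity (plain C++, illustration only):

#include <cmath>
#include <cstdio>

// Compare reference mish(x) = x * tanh(log1p(exp(x))) with the rational form
// used by MishFunctor<float>.
int main()
{
    for (float x = -4.f; x <= 4.f; x += 0.5f) {
        float reference = x * std::tanh(std::log1p(std::exp(x)));
        float e = std::exp(x);
        float n = e * e + 2.f * e;
        float fast = (x <= -0.6f) ? x * (n / (n + 2.f)) : x - 2.f * (x / (n + 2.f));
        std::printf("x=% .1f  ref=% .6f  fast=% .6f\n", x, reference, fast);
    }
    return 0;
}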
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <>
|
||||
struct MishFunctor<__half> {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE MishFunctor() { }
|
||||
CUDA4DNN_DEVICE MishFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE __half operator()(__half value) {
|
||||
return MishFunctor<float>()(value);
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
template <class T>
|
||||
struct SigmoidFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE SigmoidFunctor() { }
|
||||
CUDA4DNN_DEVICE SigmoidFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::fast_sigmoid;
|
||||
return fast_sigmoid(value);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct ELUFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE ELUFunctor() { }
|
||||
CUDA4DNN_DEVICE ELUFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::expm1;
|
||||
return value >= T(0) ? value : expm1(value);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct AbsFunctor {
|
||||
struct Params { };
|
||||
|
||||
CUDA4DNN_DEVICE AbsFunctor() { }
|
||||
CUDA4DNN_DEVICE AbsFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::abs;
|
||||
return abs(value);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct BNLLFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE BNLLFunctor() { }
|
||||
CUDA4DNN_DEVICE BNLLFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::log1pexp;
|
||||
return value > T(0) ? value + log1pexp(-value) : log1pexp(value);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct PowerFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() : exp(1), scale(1), shift(0) { }
|
||||
CUDA4DNN_HOST_DEVICE Params(T exp_, T scale_, T shift_) : exp(exp_), scale(scale_), shift(shift_) { }
|
||||
T exp, scale, shift;
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE PowerFunctor() : PowerFunctor(Params{}) { }
|
||||
CUDA4DNN_DEVICE PowerFunctor(const Params& params) : exp{params.exp}, scale{params.scale}, shift{params.shift} { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::pow;
|
||||
return pow(shift + scale * value, exp);
|
||||
}
|
||||
|
||||
T exp, scale, shift;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct ExpFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() : normScale(1), normShift(0) { }
|
||||
CUDA4DNN_HOST_DEVICE Params(T nScale_, T nShift_) : normScale(nScale_), normShift(nShift_) { }
|
||||
T normScale, normShift;
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE ExpFunctor() : ExpFunctor(Params{}) { }
|
||||
CUDA4DNN_DEVICE ExpFunctor(const Params& params) : normScale{params.normScale}, normShift{params.normShift} { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::fast_exp;
|
||||
return fast_exp(normShift + normScale * value);
|
||||
}
|
||||
|
||||
T normScale, normShift;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct MaxFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE MaxFunctor() { }
|
||||
CUDA4DNN_DEVICE MaxFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T x, T y) {
|
||||
using csl::device::max;
|
||||
return max(x, y);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct MinFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE MinFunctor() { }
|
||||
CUDA4DNN_DEVICE MinFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T x, T y) {
|
||||
using csl::device::min;
|
||||
return min(x, y);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct SumFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE SumFunctor() { }
|
||||
CUDA4DNN_DEVICE SumFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T x, T y) { return x + y; }
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct ScaledSumFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() : scale_x(1), scale_y(1) { }
|
||||
CUDA4DNN_HOST_DEVICE Params(T scale_x_, T scale_y_) : scale_x(scale_x_), scale_y(scale_y_) { }
|
||||
T scale_x, scale_y;
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE ScaledSumFunctor() : scale_x(1), scale_y(1) { }
|
||||
CUDA4DNN_DEVICE ScaledSumFunctor(const Params& params) : scale_x{params.scale_x}, scale_y{params.scale_y} { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T x, T y) { return scale_x * x + scale_y * y; }
|
||||
|
||||
T scale_x, scale_y;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct ProductFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE ProductFunctor() { }
|
||||
CUDA4DNN_DEVICE ProductFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T x, T y) { return x * y; }
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct DivFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE DivFunctor() { }
|
||||
CUDA4DNN_DEVICE DivFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T x, T y) { return x / y; }
|
||||
};
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP */
|
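Every functor in this header follows the same contract: a trivially copyable Params aggregate that travels to the kernel by value, a device constructor that unpacks it, and an operator() applied once per element, which is what lets the generic kernels fuse arbitrary eltwise/activation combinations. A host-only mimic of the contract (hypothetical struct, not one of the CUDA functors above):

// Host-side mimic of the functor contract: Params -> constructor -> operator().
struct LeakyReluRef {
    struct Params {
        Params() : slope(0.f) { }
        Params(float slope_) : slope(slope_) { }
        float slope;
    };
    LeakyReluRef(const Params& params = Params{}) : slope(params.slope) { }
    float operator()(float value) const { return value >= 0.f ? value : slope * value; }
    float slope;
};

// usage, mirroring eltwise_sum_2_relu: activation(eltwise(x, y))
// float out = LeakyReluRef{LeakyReluRef::Params{0.1f}}(x + y);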
467
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/grid_nms.cu
vendored
Normal file
@ -0,0 +1,467 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "math.hpp"
|
||||
#include "bbox_utils.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "block_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
#include "memory.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
|
||||
template <class T, bool NORMALIZED_BBOX, int BLOCK_SIZE>
|
||||
__launch_bounds__(BLOCK_SIZE)
|
||||
__global__ void grid_nms(Span<unsigned int> mask_, Span<int> count_, View<T> bboxes_, size_type num_classes, index_type background_class_id, size_type topK, size_type topK_gs, float nms_threshold)
|
||||
{
|
||||
// topK_gs is topK rounded upwards to some size
|
||||
|
||||
// mask: [batch_size, num_classes, topK_gs, topK_gs / 32]
|
||||
// bboxes: [batch_size, num_classes, topK, 4]
|
||||
// count: [batch_size, num_classes]
|
||||
|
||||
const index_type c = blockIdx.y;
|
||||
const index_type b = blockIdx.z;
|
||||
|
||||
if (c == background_class_id)
|
||||
return;
|
||||
|
||||
auto mask = mask_.data() + (b * num_classes + c) * topK_gs * topK_gs / 32;
|
||||
auto bboxes = bboxes_.data() + (b * num_classes + c) * topK * 4;
|
||||
auto count = count_.data() + b * num_classes + c;
|
||||
|
||||
const auto boxes = *count;
|
||||
if (boxes == 0)
|
||||
return;
|
||||
|
||||
/* We divide the set of boxes into groups containing BLOCK_SIZE boxes */
|
||||
const auto num_groups = (boxes + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||
|
||||
/* We need to calculate IOUs for every pair of boxes. We can generalize and say that
|
||||
* we need to compute IOUs of every group with every other group including itself.
|
||||
*/
|
||||
// Each block processes a pair of groups.
|
||||
const index_type group_i = blockIdx.x % num_groups;
|
||||
const index_type group_j = blockIdx.x / num_groups;
|
||||
|
||||
/* we use __syncthreads() later but note that the following condition will cause all threads
|
||||
* in the block to exit; hence, no thread will execute a divergent __syncthreads()
|
||||
*/
|
||||
if (group_i >= num_groups || group_j >= num_groups)
|
||||
return;
|
||||
|
||||
/* Note that IOU(A, B) = IOU(B, A). Hence, if we compute IOU(GROUP_A, GROUP_B), we do not need
|
||||
* to compute IOU(GROUP_B, GROUP_A). We still have to compute IOU(GROUP_A, GROUP_A) though since
|
||||
* each group has many boxes and we need IOUs amongst boxes within a group.
|
||||
*
|
||||
* We arbitrarily choose a scheme to exit: exit if group_i is greater than group_j. This way we only
|
||||
* compute IOUs between groups once. While nearly half the blocks are wasted, it's ok since they exit
|
||||
* early on and the working blocks are compute heavy.
|
||||
*/
|
||||
if (group_i > group_j)
|
||||
return;
|
||||
|
||||
/* the following variables contain the absolute box number of the first box of their respective groups */
|
||||
const auto group_i_offset = group_i * BLOCK_SIZE;
|
||||
const auto group_j_offset = group_j * BLOCK_SIZE;
|
||||
|
||||
/* MAIN LOOP LOGIC:
|
||||
* We compare a box `i` from group_i with all boxes in group_j in each iteration. The box `j` is fixed
|
||||
* for each thread. The `j` exactly maps to the thread index. Hence, the `j` is a loop invariant. Each
|
||||
* thread of the block computes the overlap between box `i` and its box `j`.
|
||||
*
|
||||
* for (int i = 0; i < BLOCK_SIZE; i++)
|
||||
* {
|
||||
* // i = box 1
|
||||
* // j = threadIdx.x = box 2
|
||||
* }
|
||||
*/
|
||||
|
||||
/* The `j` box is fixed for each thread. All `i` boxes will be required for every thread.
|
||||
* We store the `i` boxes in shared memory to allow global memory coalescing.
|
||||
*/
|
||||
using vector_type = get_vector_type_t<T, 4>;
|
||||
__shared__ vector_type group_i_boxes[BLOCK_SIZE];
|
||||
|
||||
/* We will precompute the sizes of `i` boxes in the code where we load them. The size computation
|
||||
* is distributed across the block. Otherwise, all threads will have to compute the size of the same
|
||||
* box simultaneously in the main loop. The size is computed while the memory subsystem is busy
|
||||
* servicing requests for box coordinates; the compute resources would otherwise be idle in this phase.
|
||||
*/
|
||||
/* we store the size as a float since the size can exceed fp16 limits for unnormalized boxes */
|
||||
__shared__ float group_i_size[BLOCK_SIZE];
|
||||
|
||||
const auto bboxes_vPtr = vector_type::get_pointer(bboxes);
|
||||
|
||||
// load `i` boxes and precompute their sizes
|
||||
{
|
||||
int i = threadIdx.x;
|
||||
if (group_i_offset + i < boxes)
|
||||
{
|
||||
vector_type box;
|
||||
v_load(box, bboxes_vPtr[group_i_offset + i]);
|
||||
v_store(group_i_boxes[i], box);
|
||||
|
||||
BoundingBox bbox;
|
||||
bbox.xmin = box.data[0];
|
||||
bbox.ymin = box.data[1];
|
||||
bbox.xmax = box.data[2];
|
||||
bbox.ymax = box.data[3];
|
||||
|
||||
group_i_size[i] = compute_bbox_size<NORMALIZED_BBOX>(bbox);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
/* We compute overlap between boxes and check if the IOU exceeds the nms threshold.
|
||||
* We store the result (exceeds or is below nms_threshold) in a two-dimensional matrix.
|
||||
* (i, j) is set to one if the overlap between i and j is within the nms threshold.
|
||||
* We pack 32 results into one 32-bit integer. The effective memory layout of the
|
||||
* matrix hence is (BLOCK_SIZE, BLOCK_SIZE / 32).
|
||||
*/
|
||||
__shared__ unsigned int mask_shared[BLOCK_SIZE * BLOCK_SIZE / 32];
|
||||
|
||||
// load box `j` and precompute its size (fixed per thread)
|
||||
BoundingBox bbox_j;
|
||||
float bbox_j_size = 0;
|
||||
if (group_j_offset + threadIdx.x < boxes)
|
||||
{
|
||||
vector_type box;
|
||||
v_load(box, bboxes_vPtr[group_j_offset + threadIdx.x]);
|
||||
|
||||
bbox_j.xmin = box.data[0];
|
||||
bbox_j.ymin = box.data[1];
|
||||
bbox_j.xmax = box.data[2];
|
||||
bbox_j.ymax = box.data[3];
|
||||
|
||||
bbox_j_size = compute_bbox_size<NORMALIZED_BBOX>(bbox_j);
|
||||
}
|
||||
|
||||
/* Each thread computes a predicate which is broadcasted across the warp to obtain a 32-bit mask.
|
||||
* The lane zero thread of each warp saves the mask. We store the offset to the mask array beforehand
|
||||
* to save cycles in the compute-intensive main loop.
|
||||
*/
|
||||
auto mask_offset = threadIdx.x / 32;
|
||||
|
||||
/* The main loop is compute intensive and causes the kernel to be overall compute-bound. Hence,
|
||||
* this loop has been highly tuned. Please profile and verify carefully before making changes.
|
||||
*/
|
||||
/* UNROLL_SIZE is the number of boxes that must be processed per iteration. We manually unroll
|
||||
* the loop since the compiler cannot effectively unroll on its own, presumably due to the presence
|
||||
* of instructions forcing warp synchronization.
|
||||
*/
|
||||
constexpr int UNROLL_SIZE = 4;
|
||||
|
||||
#pragma unroll 8
|
||||
for (int s = 0; s < BLOCK_SIZE; s += UNROLL_SIZE)
|
||||
{
|
||||
bool do_not_reject_j[UNROLL_SIZE];
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < UNROLL_SIZE; k++)
|
||||
{
|
||||
int i = s + k;
|
||||
|
||||
/* The number of boxes need not necessarily be a multiple of BLOCK_SIZE.
|
||||
* However, the shared memory allocated can hold BLOCK_SIZE boxes from
|
||||
* each group. Accessing the undefined regions of shared memory is
|
||||
* a valid memory operation as long as the memory has been allocated.
|
||||
*
|
||||
* The condition below is only required when one of the groups is not
|
||||
* fully filled with valid boxes. These situations are relatively rare. It's
|
||||
* more common to see both groups completely filled.
|
||||
*
|
||||
* We comment out this condition to improve the performance of the common case.
|
||||
* This leads to a net improvement.
|
||||
*/
|
||||
// if (group_i_offset + i < boxes && group_j_offset + threadIdx.x < boxes)
|
||||
{
|
||||
BoundingBox bbox_i;
|
||||
float bbox_i_size;
|
||||
{
|
||||
vector_type box;
|
||||
v_load(box, group_i_boxes[i]);
|
||||
bbox_i.xmin = box.data[0];
|
||||
bbox_i.ymin = box.data[1];
|
||||
bbox_i.xmax = box.data[2];
|
||||
bbox_i.ymax = box.data[3];
|
||||
|
||||
bbox_i_size = group_i_size[i];
|
||||
}
|
||||
|
||||
using device::min;
|
||||
using device::max;
|
||||
|
||||
BoundingBox intersect_bbox;
|
||||
intersect_bbox.xmin = max(bbox_i.xmin, bbox_j.xmin);
|
||||
intersect_bbox.ymin = max(bbox_i.ymin, bbox_j.ymin);
|
||||
intersect_bbox.xmax = min(bbox_i.xmax, bbox_j.xmax);
|
||||
intersect_bbox.ymax = min(bbox_i.ymax, bbox_j.ymax);
|
||||
|
||||
float intersect_size = compute_bbox_size<NORMALIZED_BBOX>(intersect_bbox);
|
||||
|
||||
using device::fast_divide_ftz;
|
||||
float iou = fast_divide_ftz(intersect_size, bbox_i_size + bbox_j_size - intersect_size);
|
||||
do_not_reject_j[k] = iou <= nms_threshold;
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < UNROLL_SIZE; k++)
|
||||
{
|
||||
// FORWARD_COMPATIBILITY_TAG: WARP_SIZE_DEPENDENT_CODE
|
||||
auto predicate = __ballot_sync(0xFFFFFFFF, do_not_reject_j[k]);
|
||||
if (threadIdx.x % 32 == 0)
|
||||
mask_shared[mask_offset] = predicate;
|
||||
|
||||
/* The following operation should logically be inside the previous if branch. Note that `mask_offset`
|
||||
* is only used by lane zero threads. Hence, there is no harm in executing it other threads as it is
|
||||
* unused there.
|
||||
*
|
||||
* Keeping it inside prevents the compiler from treating it as a constexpr addition to the address in
|
||||
* successive unrolled iterations. A register is used and instructions are emitted to multiply the
|
||||
* addend by four to obtain the byte offset. Pulling it out of the branch makes the compiler do constexpr
|
||||
* addition on the address in successive unrolled iterations.
|
||||
*/
|
||||
mask_offset += BLOCK_SIZE / 32;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
/* The mask data is organized as a two-dimensional bit matrix of size topK_gs * topK_gs.
|
||||
* (i, j) is set to true if the overlap between `i` and `j` is within the nms threshold.
|
||||
* We pack 32 results into one 32-bit integer. So the effective memory layout is topK_gs * topK_gs / 32.
|
||||
*/
|
||||
|
||||
/* Each box `i` was compared with BLOCK_SIZE `j` boxes. This amounts to BLOCK_SIZE / 32
|
||||
* 32-bit integers per box `i`.
|
||||
*/
|
||||
using mask_vector_type = get_vector_type_t<unsigned int, BLOCK_SIZE / 32>;
|
||||
|
||||
const int i = threadIdx.x;
|
||||
|
||||
auto mask_shared_vPtr = mask_vector_type::get_pointer(DevicePtr<unsigned>(mask_shared));
|
||||
mask_vector_type temp;
|
||||
v_load(temp, mask_shared_vPtr[i]);
|
||||
for (int i = 0; i < mask_vector_type::size(); i++)
|
||||
temp.data[i] = __brev(temp.data[i]);
|
||||
|
||||
auto mask_vPtr = mask_vector_type::get_pointer(mask);
|
||||
v_store(mask_vPtr[((group_i_offset + i) * topK_gs + group_j_offset) / 32 / mask_vector_type::size()], temp);
|
||||
}
|
||||
|
||||
template <int ITEMS_PER_THREAD, int BLOCK_SIZE>
|
||||
__launch_bounds__(BLOCK_SIZE)
|
||||
__global__ void grid_nms_collect(Span<int> indices_, Span<int> count_, View<unsigned int> mask_, size_type num_classes, index_type background_class_id, size_type topK, size_type topK_gs_by32)
|
||||
{
|
||||
const index_type c = blockIdx.x;
|
||||
if (c == background_class_id)
|
||||
return;
|
||||
|
||||
const index_type b = blockIdx.y;
|
||||
|
||||
// topK_gs is topK rounded upwards to some size
|
||||
|
||||
// indices: [batch_size, num_classes, topK]
|
||||
// count: [batch_size, num_classes]
|
||||
// mask: [batch_size, num_classes, topK_gs, topK_gs / 32]
|
||||
|
||||
auto indices = indices_.data() + (b * num_classes + c) * topK;
|
||||
auto count = count_.data() + (b * num_classes + c);
|
||||
auto mask = mask_.data() + (b * num_classes + c) * topK_gs_by32 * 32 * topK_gs_by32;
|
||||
|
||||
const auto boxes = *count;
|
||||
if (boxes == 0)
|
||||
return;
|
||||
|
||||
/* We have a fixed number of threads and an arbitrary number of boxes. We use an array of
|
||||
* bits to store which boxes haven't been eliminated and which are still active. We organize
|
||||
* the array of bits into a matrix of bits of the shape (num_rows, BLOCK_SIZE, 32) which
|
||||
* is equivalent to (num_rows, BLOCK_SIZE) where the type is a 32-bit unsigned integer.
|
||||
* `num_rows` is the minimum number of rows required to cover all the boxes.
|
||||
*
|
||||
* Each thread handles a specific column in the matrix. To improve performance, we process
|
||||
* `ITEMS_PER_THREAD` number of elements per thread. This changes the shape to (num_rows,
|
||||
* ROW_WIDTH) where ROW_WIDTH is BLOCK_SIZE * ITEMS_PER_THREAD.
|
||||
*/
|
||||
constexpr int ROW_WIDTH = BLOCK_SIZE * ITEMS_PER_THREAD;
|
||||
|
||||
const index_type num_32b_masks = static_cast<unsigned>(boxes + 31) / 32;
|
||||
const index_type num_rows = static_cast<unsigned>(num_32b_masks + ROW_WIDTH - 1) / ROW_WIDTH;
|
||||
|
||||
extern __shared__ unsigned int active_boxes[]; // the matrix described earlier
|
||||
|
||||
#pragma unroll 1
|
||||
for (auto idx : block_stride_range<BLOCK_SIZE>(num_32b_masks))
|
||||
active_boxes[idx] = (idx == num_32b_masks - 1) ? __brev((1u << (boxes % 32)) - 1) : 0xFFFFFFFF;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
using vector_type = get_vector_type_t<unsigned int, ITEMS_PER_THREAD>;
|
||||
auto mask_vPtr = vector_type::get_pointer(mask);
|
||||
auto shared_vPtr = vector_type::get_pointer(DevicePtr<unsigned>(active_boxes));
|
||||
|
||||
int index_temp;
|
||||
int thread0_count = 0;
|
||||
int thread_id = threadIdx.x;
|
||||
|
||||
for (int step = 0; step < num_32b_masks; step++)
|
||||
{
|
||||
auto current_active = active_boxes[step];
|
||||
while (current_active)
|
||||
{
|
||||
const index_type bit = __clz(current_active);
|
||||
const index_type i = step * 32 + bit;
|
||||
|
||||
const int mask_offset = static_cast<unsigned>(i * topK_gs_by32) / ITEMS_PER_THREAD;
|
||||
|
||||
/* We fetch the index from the memory and store it in a register. We will not use it until
|
||||
* much later. This helps avoid a long scoreboard stall.
|
||||
*/
|
||||
if (thread_id == 0)
|
||||
index_temp = indices[i];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
active_boxes[step] = current_active ^ (0x80000000 >> bit);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll 1
|
||||
for (int r = 0; r < num_rows; r++)
|
||||
{
|
||||
const int idx = r * BLOCK_SIZE + thread_id;
|
||||
if ((step & ~(ITEMS_PER_THREAD - 1)) <= idx * ITEMS_PER_THREAD && idx * ITEMS_PER_THREAD < num_32b_masks)
|
||||
{
|
||||
auto active_boxes_vec = shared_vPtr[idx];
|
||||
auto mask_vec = mask_vPtr[mask_offset + idx];
|
||||
for (int i = 0; i < vector_type::size(); i++)
|
||||
active_boxes_vec.data[i] &= mask_vec.data[i];
|
||||
shared_vPtr[idx] = active_boxes_vec;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (thread_id == 0)
|
||||
{
|
||||
indices[thread0_count] = index_temp;
|
||||
thread0_count++;
|
||||
}
|
||||
|
||||
current_active = active_boxes[step];
|
||||
}
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
*count = thread0_count;
|
||||
}
|
||||
}
|
||||
|
||||
constexpr int GROUP_SIZE = 128;
|
||||
|
||||
static std::size_t getAlignedTopK(std::size_t topK)
|
||||
{
|
||||
auto remainder = topK % GROUP_SIZE;
|
||||
if (remainder == 0)
|
||||
return topK;
|
||||
return topK + (GROUP_SIZE - remainder);
|
||||
}
|
||||
|
||||
std::size_t getGridNMSWorkspaceSizePerBatchItem(std::size_t num_classes, std::size_t classwise_topK)
|
||||
{
|
||||
auto topK_gs = getAlignedTopK(classwise_topK);
|
||||
return num_classes * topK_gs * topK_gs / 32 * sizeof(unsigned int);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<T> bboxes, int background_class_id, bool normalized_bbox, float nms_threshold)
|
||||
{
|
||||
// workspace: [batch_size, num_classes, topK_gs, topK_gs / 32]
|
||||
// indices: [batch_size, num_classes, topK]
|
||||
// count: [batch_size, num_classes]
|
||||
// bboxes: [batch_size, num_classes, topK, 4] (only first count[b][c] boxes are read)
|
||||
|
||||
const auto batch_size = indices.get_axis_size(0);
|
||||
CV_Assert(count.get_axis_size(0) == batch_size);
|
||||
CV_Assert(bboxes.get_axis_size(0) == batch_size);
|
||||
|
||||
const auto num_classes = indices.get_axis_size(1);
|
||||
CV_Assert(count.get_axis_size(1) == num_classes);
|
||||
CV_Assert(bboxes.get_axis_size(1) == num_classes);
|
||||
|
||||
const auto topK = indices.get_axis_size(2);
|
||||
CV_Assert(bboxes.get_axis_size(2) == topK);
|
||||
|
||||
CV_Assert(bboxes.get_axis_size(3) == 4);
|
||||
|
||||
const auto topK_gs = getAlignedTopK(topK);
|
||||
CV_Assert(workspace.size() >= topK_gs * topK_gs / 32);
|
||||
|
||||
const auto boxes = topK;
|
||||
const auto num_groups = (boxes + GROUP_SIZE - 1) / GROUP_SIZE;
|
||||
|
||||
{
|
||||
// grid = (num_groups * num_groups, num_classes, batch_size)
|
||||
// if the background class is the last class, we can reduce grid y dim by one
|
||||
auto grid_num_classes = num_classes; //(background_class_id == num_classes - 1) ? num_classes - 1 : num_classes;
|
||||
|
||||
constexpr int BLOCK_SIZE = GROUP_SIZE;
|
||||
|
||||
dim3 grid_size(num_groups * num_groups, grid_num_classes, batch_size);
|
||||
dim3 block_size(BLOCK_SIZE);
|
||||
auto policy = execution_policy(grid_size, block_size, stream);
|
||||
|
||||
if (normalized_bbox)
|
||||
{
|
||||
auto kernel = raw::grid_nms<T, true, BLOCK_SIZE>;
|
||||
launch_kernel(kernel, policy, workspace, count, bboxes, num_classes, background_class_id, topK, topK_gs, nms_threshold);
|
||||
}
|
||||
else
|
||||
{
|
||||
auto kernel = raw::grid_nms<T, false, BLOCK_SIZE>;
|
||||
launch_kernel(kernel, policy, workspace, count, bboxes, num_classes, background_class_id, topK, topK_gs, nms_threshold);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
// grid = (num_classes, batch_size)
|
||||
// if the background class is the last class, we can reduce grid x dim by one
|
||||
auto grid_num_classes = num_classes; //(background_class_id == num_classes - 1) ? num_classes - 1 : num_classes;
|
||||
|
||||
constexpr int BLOCK_SIZE = 64;
|
||||
|
||||
constexpr int ITEMS_PER_THREAD = 4;
|
||||
auto kernel = raw::grid_nms_collect<ITEMS_PER_THREAD, BLOCK_SIZE>;
|
||||
|
||||
dim3 grid_size(grid_num_classes, batch_size);
|
||||
|
||||
auto sharedMem = topK_gs / 32 * 4;
|
||||
auto policy = execution_policy(grid_size, BLOCK_SIZE, sharedMem, stream);
|
||||
launch_kernel(kernel, policy, indices, count, workspace, num_classes, background_class_id, topK, topK_gs / 32);
|
||||
}
|
||||
}
|
||||
|
||||
std::size_t getGridNMSWorkspaceSizePerBatchItem(std::size_t num_classes, std::size_t classwise_topK);
|
||||
|
||||
template void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<__half> bboxes, int, bool normalized_bbox, float nms_threshold);
|
||||
template void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<float> bboxes, int, bool normalized_bbox, float nms_threshold);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
68
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/grid_stride_range.hpp
vendored
Normal file
68
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/grid_stride_range.hpp
vendored
Normal file
@ -0,0 +1,68 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP
|
||||
|
||||
#include "types.hpp"
|
||||
#include "index_helpers.hpp"
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
|
||||
|
||||
template <int dim, class index_type = device::index_type, class size_type = device::size_type>
|
||||
class grid_stride_range_generic {
|
||||
public:
|
||||
__device__ grid_stride_range_generic(index_type to_) : from(0), to(to_) { }
|
||||
__device__ grid_stride_range_generic(index_type from_, index_type to_) : from(from_), to(to_) { }
|
||||
|
||||
class iterator
|
||||
{
|
||||
public:
|
||||
__device__ iterator(index_type pos_) : pos(pos_) {}
|
||||
|
||||
/* these iterators return the index when dereferenced; this allows us to loop
|
||||
* through the indices using a range based for loop
|
||||
*/
|
||||
__device__ index_type operator*() const { return pos; }
|
||||
|
||||
__device__ iterator& operator++() {
|
||||
pos += getGridDim<dim>() * static_cast<index_type>(getBlockDim<dim>());
|
||||
return *this;
|
||||
}
|
||||
|
||||
__device__ bool operator!=(const iterator& other) const {
|
||||
/* NOTE HACK
|
||||
* 'pos' can move in large steps (see operator++)
|
||||
* expansion of the range-based for loop uses != as the loop condition
|
||||
* => operator!= must return false if 'pos' crosses the end
|
||||
*/
|
||||
return pos < other.pos;
|
||||
}
|
||||
|
||||
private:
|
||||
index_type pos;
|
||||
};
|
||||
|
||||
__device__ iterator begin() const {
|
||||
return iterator(from + getBlockDim<dim>() * getBlockIdx<dim>() + getThreadIdx<dim>());
|
||||
}
|
||||
|
||||
__device__ iterator end() const {
|
||||
return iterator(to);
|
||||
}
|
||||
|
||||
private:
|
||||
index_type from, to;
|
||||
};
|
||||
|
||||
using grid_stride_range_x = grid_stride_range_generic<0>;
|
||||
using grid_stride_range_y = grid_stride_range_generic<1>;
|
||||
using grid_stride_range_z = grid_stride_range_generic<2>;
|
||||
using grid_stride_range = grid_stride_range_x;
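
/* A minimal usage sketch (not part of this header; Span/View come from ../cuda4dnn/csl/span.hpp
 * and the kernel name is hypothetical):
 *
 *   template <class T>
 *   __global__ void scale_kernel(Span<T> output, View<T> input, T alpha) {
 *       // each thread starts at its global index and strides by the total number of threads
 *       for (auto i : grid_stride_range(output.size()))
 *           output[i] = alpha * input[i];
 *   }
 */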
|
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP */
|
41
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/index_helpers.hpp
vendored
Normal file
41
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/index_helpers.hpp
vendored
Normal file
@ -0,0 +1,41 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_INDEX_HELPERS_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA_INDEX_HELPERS_HPP
|
||||
|
||||
#include "types.hpp"
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
|
||||
|
||||
namespace detail {
|
||||
using dim3_member_type = decltype(dim3::x);
|
||||
using uint3_member_type = decltype(uint3::x);
|
||||
}
|
||||
|
||||
template <int> __device__ detail::dim3_member_type getGridDim();
|
||||
template <> inline __device__ detail::dim3_member_type getGridDim<0>() { return gridDim.x; }
|
||||
template <> inline __device__ detail::dim3_member_type getGridDim<1>() { return gridDim.y; }
|
||||
template <> inline __device__ detail::dim3_member_type getGridDim<2>() { return gridDim.z; }
|
||||
|
||||
template <int> __device__ detail::dim3_member_type getBlockDim();
|
||||
template <> inline __device__ detail::dim3_member_type getBlockDim<0>() { return blockDim.x; }
|
||||
template <> inline __device__ detail::dim3_member_type getBlockDim<1>() { return blockDim.y; }
|
||||
template <> inline __device__ detail::dim3_member_type getBlockDim<2>() { return blockDim.z; }
|
||||
|
||||
template <int> __device__ detail::uint3_member_type getBlockIdx();
|
||||
template <> inline __device__ detail::uint3_member_type getBlockIdx<0>() { return blockIdx.x; }
|
||||
template <> inline __device__ detail::uint3_member_type getBlockIdx<1>() { return blockIdx.y; }
|
||||
template <> inline __device__ detail::uint3_member_type getBlockIdx<2>() { return blockIdx.z; }
|
||||
|
||||
template <int> __device__ detail::uint3_member_type getThreadIdx();
|
||||
template <> inline __device__ detail::uint3_member_type getThreadIdx<0>() { return threadIdx.x; }
|
||||
template <> inline __device__ detail::uint3_member_type getThreadIdx<1>() { return threadIdx.y; }
|
||||
template <> inline __device__ detail::uint3_member_type getThreadIdx<2>() { return threadIdx.z; }
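
/* These wrappers make the built-in CUDA index variables selectable through a compile-time
 * dimension index. For example, grid_stride_range.hpp (in this directory) advances its
 * iterator with:
 *
 *   pos += getGridDim<dim>() * static_cast<index_type>(getBlockDim<dim>());
 *
 * so the same loop template works for the x, y and z grid dimensions.
 */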
|
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_INDEX_HELPERS_HPP */
|
94
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/kernel_dispatcher.hpp
vendored
Normal file
94
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/kernel_dispatcher.hpp
vendored
Normal file
@ -0,0 +1,94 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP
|
||||
|
||||
#include <cstddef>
|
||||
#include <type_traits>
|
||||
|
||||
/* The performance of many kernels is highly dependent on the tensor rank. Instead of having
|
||||
* one kernel which can work with the maximally ranked tensors, we make one kernel for each supported
|
||||
* tensor rank. This is to ensure that the requirements of the maximally ranked tensors do not take a
|
||||
* toll on the performance of the operation for low ranked tensors. Hence, many kernels take the tensor
|
||||
* rank as a template parameter.
|
||||
*
|
||||
* The kernel is a template and we have different instantiations for each rank. This causes the following pattern
|
||||
* to arise frequently:
|
||||
*
|
||||
* if(rank == 3)
|
||||
* kernel<T, 3>();
|
||||
* else if(rank == 2)
|
||||
* kernel<T, 2>();
|
||||
* else
|
||||
* kernel<T, 1>();
|
||||
*
|
||||
* The rank is a runtime variable. To facilitate creation of such structures, we use GENERATE_KERNEL_DISPATCHER.
|
||||
* This macro creates a function which selects the correct kernel instantiation at runtime.
|
||||
*
|
||||
* Example:
|
||||
*
|
||||
* // function which sets up the kernel and launches it
|
||||
* template <class T, std::size_t Rank>
|
||||
* void launch_some_kernel(...);
|
||||
*
|
||||
* // creates the dispatcher named "some_dispatcher" which invokes the correct instantiation of "launch_some_kernel"
|
||||
* GENERATE_KERNEL_DISPATCHER(some_dispatcher, launch_some_kernel);
|
||||
*
|
||||
* // internal API function
|
||||
* template <class T>
|
||||
* void some(...) {
|
||||
* // ...
|
||||
* auto rank = input.rank();
|
||||
* some_dispatcher<T, MIN_RANK, MAX_RANK>(rank, ...);
|
||||
* }
|
||||
*/
|
||||
|
||||
/*
|
||||
* name name of the dispatcher function that is generated
|
||||
* func template function that requires runtime selection
|
||||
*
|
||||
* T first template parameter to `func`
|
||||
* start starting rank
|
||||
* end ending rank (inclusive)
|
||||
*
|
||||
* Executes func<T, selector> based on runtime `selector` argument given `selector` lies
|
||||
* within the range [start, end]. If outside the range, no instantiation of `func` is executed.
|
||||
*/
|
||||
#define GENERATE_KERNEL_DISPATCHER(name,func); \
|
||||
template <class T, std::size_t start, std::size_t end, class... Args> static \
|
||||
typename std::enable_if<start == end, void> \
|
||||
::type name(int selector, Args&& ...args) { \
|
||||
if(selector == start) \
|
||||
func<T, start>(std::forward<Args>(args)...); \
|
||||
} \
|
||||
\
|
||||
template <class T, std::size_t start, std::size_t end, class... Args> static \
|
||||
typename std::enable_if<start != end, void> \
|
||||
::type name(int selector, Args&& ...args) { \
|
||||
if(selector == start) \
|
||||
func<T, start>(std::forward<Args>(args)...); \
|
||||
else \
|
||||
name<T, start + 1, end, Args...>(selector, std::forward<Args>(args)...); \
|
||||
}
|
||||
|
||||
// Same as GENERATE_KERNEL_DISPATCHER but takes two class template parameters TP1 and TP2 instead of just T
|
||||
#define GENERATE_KERNEL_DISPATCHER_2TP(name,func); \
|
||||
template <class TP1, class TP2, std::size_t start, std::size_t end, class... Args> static \
|
||||
typename std::enable_if<start == end, void> \
|
||||
::type name(int selector, Args&& ...args) { \
|
||||
if(selector == start) \
|
||||
func<TP1, TP2, start>(std::forward<Args>(args)...); \
|
||||
} \
|
||||
\
|
||||
template <class TP1, class TP2, std::size_t start, std::size_t end, class... Args> static \
|
||||
typename std::enable_if<start != end, void> \
|
||||
::type name(int selector, Args&& ...args) { \
|
||||
if(selector == start) \
|
||||
func<TP1, TP2, start>(std::forward<Args>(args)...); \
|
||||
else \
|
||||
name<TP1, TP2, start + 1, end, Args...>(selector, std::forward<Args>(args)...); \
|
||||
}
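
/* A minimal sketch for the two-parameter variant (the names below are hypothetical):
 *
 *   template <class FeatureT, class IndexT, std::size_t Rank>
 *   void launch_gather_kernel(...);
 *
 *   GENERATE_KERNEL_DISPATCHER_2TP(gather_dispatcher, launch_gather_kernel);
 *
 *   // `rank` is a runtime value; the dispatcher picks the matching instantiation in [MIN_RANK, MAX_RANK]
 *   gather_dispatcher<FeatureT, IndexT, MIN_RANK, MAX_RANK>(rank, ...);
 */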
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP */
|
36
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/limits.hpp
vendored
Normal file
36
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/limits.hpp
vendored
Normal file
@ -0,0 +1,36 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_LIMITS_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA_LIMITS_HPP
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include <cfloat>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
|
||||
|
||||
template <class T>
|
||||
struct numeric_limits;
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <>
|
||||
struct numeric_limits<__half> {
|
||||
__device__ static __half min() { return 0.0000610; }
|
||||
__device__ static __half max() { return 65504.0; }
|
||||
__device__ static __half lowest() { return -65504.0; }
|
||||
};
|
||||
#endif
|
||||
|
||||
template <>
|
||||
struct numeric_limits<float> {
|
||||
__device__ static float min() { return FLT_MIN; }
|
||||
__device__ static float max() { return FLT_MAX; }
|
||||
__device__ static float lowest() { return -FLT_MAX; }
|
||||
};
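
/* Typical use (see max_unpooling.cu in this directory): seed a running maximum with the
 * smallest representable value for the element type, e.g.
 *
 *   T max_value = numeric_limits<T>::lowest();
 */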
|
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_LIMITS_HPP */
|
154
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/math.hpp
vendored
Normal file
154
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/math.hpp
vendored
Normal file
@ -0,0 +1,154 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_MATH_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA_MATH_HPP
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
|
||||
|
||||
template <class T> __device__ T abs(T val) { return (val < T(0) ? -val : val); }
|
||||
template <> inline __device__ float abs(float val) { return fabsf(val); }
|
||||
template <> inline __device__ double abs(double val) { return fabs(val); }
|
||||
|
||||
template <class T> __device__ T exp(T val);
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half exp(__half val) { return hexp(val); }
|
||||
#endif
|
||||
template <> inline __device__ float exp(float val) { return expf(val); }
|
||||
template <> inline __device__ double exp(double val) { return ::exp(val); }
|
||||
|
||||
template <class T> __device__ T expm1(T val);
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half expm1(__half val) { return hexp(val) - __half(1); }
|
||||
#endif
|
||||
template <> inline __device__ float expm1(float val) { return expm1f(val); }
|
||||
template <> inline __device__ double expm1(double val) { return ::expm1(val); }
|
||||
|
||||
template <class T> __device__ T max(T x, T y) { return (x > y ? x : y); }
|
||||
template <> inline __device__ float max(float x, float y) { return fmaxf(x, y); }
|
||||
template <> inline __device__ double max(double x, double y) { return fmax(x, y); }
|
||||
|
||||
template <class T> __device__ T min(T x, T y) { return (x > y ? y : x); }
|
||||
template <> inline __device__ float min(float x, float y) { return fminf(x, y); }
|
||||
template <> inline __device__ double min(double x, double y) { return fmin(x, y); }
|
||||
|
||||
template <class T> __device__ T log1p(T val);
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half log1p(__half val) { return hlog(__half(1) + val); }
|
||||
#endif
|
||||
template <> inline __device__ float log1p(float val) { return log1pf(val); }
|
||||
|
||||
template <class T> __device__ T log1pexp(T val);
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half log1pexp(__half val) {
|
||||
if (val <= __half(-4.0))
|
||||
return exp(val);
|
||||
else if (val <= __half(8.0))
|
||||
return log1p(exp(val));
|
||||
else if (val <= __half(8.7))
|
||||
return val + exp(-val);
|
||||
else
|
||||
return val;
|
||||
}
|
||||
#endif
|
||||
template <> inline __device__ float log1pexp(float val) {
|
||||
if (val <= -20)
|
||||
return expf(val);
|
||||
else if (val <= 9.0)
|
||||
return log1pf(expf(val));
|
||||
else if (val <= 14.6)
|
||||
return val + exp(-val);
|
||||
else
|
||||
return val;
|
||||
}
|
||||
template <> inline __device__ double log1pexp(double val) {
|
||||
if (val <= -37)
|
||||
return exp(val);
|
||||
else if (val <= 18)
|
||||
return log1p(exp(val));
|
||||
else if (val <= 33.3)
|
||||
return val + exp(-val);
|
||||
else
|
||||
return val;
|
||||
}
|
||||
|
||||
template <class T> __device__ T tanh(T val);
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half tanh(__half val) { return tanhf(val); }
|
||||
#endif
|
||||
template <> inline __device__ float tanh(float val) { return tanhf(val); }
|
||||
template <> inline __device__ double tanh(double val) { return ::tanh(val); }
|
||||
|
||||
template <class T> __device__ T pow(T val, T exp);
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half pow(__half val, __half exp) { return powf(val, exp); }
|
||||
#endif
|
||||
template <> inline __device__ float pow(float val, float exp) { return powf(val, exp); }
|
||||
template <> inline __device__ double pow(double val, double exp) { return ::pow(val, exp); }
|
||||
|
||||
template <class T> __device__ T sqrt(T val);
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half sqrt(__half val) { return hsqrt(val); }
|
||||
#endif
|
||||
template <> inline __device__ float sqrt(float val) { return sqrtf(val); }
|
||||
template <> inline __device__ double sqrt(double val) { return ::sqrt(val); }
|
||||
|
||||
template <class T> __device__ T rsqrt(T val);
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half rsqrt(__half val) { return hrsqrt(val); }
|
||||
#endif
|
||||
template <> inline __device__ float rsqrt(float val) { return rsqrtf(val); }
|
||||
template <> inline __device__ double rsqrt(double val) { return ::rsqrt(val); }
|
||||
|
||||
template <class T> __device__ T sigmoid(T val) { return T(1) / (T(1) + exp(-val)); }
|
||||
|
||||
template <class T> __device__ T clamp(T value, T lower, T upper) { return min(max(value, lower), upper); }
|
||||
|
||||
template <class T> __device__ long lround(T value);
|
||||
template <> inline __device__ long lround(double value) { return ::lround(value); }
|
||||
template <> inline __device__ long lround(float value) { return lroundf(value); }
|
||||
|
||||
template <class T> __device__ T round(T value);
|
||||
template <> inline __device__ double round(double value) { return ::round(value); }
|
||||
template <> inline __device__ float round(float value) { return roundf(value); }
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half round(__half value) { return hrint(value); }
|
||||
#endif
|
||||
|
||||
template <class T> __device__ T ceil(T value);
|
||||
template <> inline __device__ double ceil(double value) { return ::ceil(value); }
|
||||
template <> inline __device__ float ceil(float value) { return ceilf(value); }
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half ceil(__half value) { return hceil(value); }
|
||||
#endif
|
||||
|
||||
template <class T> __device__ T mul_ftz(T x, T y) { return x * y; }
|
||||
template <> inline __device__ float mul_ftz(float x, float y) {
|
||||
float result;
|
||||
asm("mul.ftz.f32 %0, %1, %2;" : "=f"(result) : "f"(x), "f"(y));
|
||||
return result;
|
||||
}
|
||||
|
||||
template <class T> __device__ T fast_divide(T x, T y) { return x / y; }
|
||||
template <> inline __device__ float fast_divide(float x, float y) { return __fdividef(x, y); }
|
||||
|
||||
template <class T> __device__ T fast_divide_ftz(T x, T y) { return fast_divide(x, y); }
|
||||
template <> inline __device__ float fast_divide_ftz(float x, float y) {
|
||||
float result;
|
||||
asm("div.approx.ftz.f32 %0, %1, %2;" : "=f"(result) : "f"(x), "f"(y));
|
||||
return result;
|
||||
}
|
||||
|
||||
template <class T> __device__ T fast_exp(T value) { return exp(value); }
|
||||
template <> inline __device__ float fast_exp(float value) { return __expf(value); }
|
||||
|
||||
template <class T> __device__ T fast_sigmoid(T value) { return sigmoid(value); }
|
||||
template <> inline __device__ float fast_sigmoid(float value) { return __fdividef(1, 1 + __expf(-value)); }
|
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_MATH_HPP */
|
328
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/max_unpooling.cu
vendored
Normal file
328
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/max_unpooling.cu
vendored
Normal file
@ -0,0 +1,328 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "math.hpp"
|
||||
#include "array.hpp"
|
||||
#include "limits.hpp"
|
||||
#include "types.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include "../cuda4dnn/kernels/fill_copy.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
#include <vector>
|
||||
#include <type_traits>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, std::size_t Order,
|
||||
typename std::enable_if<Order == 1 || Order == 2 || Order == 3, bool>::type = true> /* Order has been hardcoded; see code */
|
||||
__global__ void max_pooling_with_indices(
|
||||
Span<T> output, Span<T> indices, View<T> input, size_type channels,
|
||||
array<size_type, Order> out_spatial_dims, array<size_type, Order> in_spatial_dims,
|
||||
array<size_type, Order> window_size, array<size_type, Order> strides, array<size_type, Order> padding_left)
|
||||
{
|
||||
/* every element in the output is mapped to a window in the input and each thread processes several windows */
|
||||
for (auto idx : grid_stride_range(output.size())) {
|
||||
size_type out_spatial_size = 1;
|
||||
array<index_type, Order> window_idx;
|
||||
for (int i = Order - 1; i >= 0; i--) {
|
||||
window_idx[i] = (idx / out_spatial_size) % out_spatial_dims[i];
|
||||
out_spatial_size *= out_spatial_dims[i];
|
||||
}
|
||||
|
||||
const index_type n = idx / (out_spatial_size * channels);
|
||||
const index_type c = (idx / out_spatial_size) % channels;
|
||||
|
||||
array<index_type, Order> start;
|
||||
for(int i = 0; i < Order; i++)
|
||||
start[i] = window_idx[i] * strides[i] - padding_left[i];
|
||||
|
||||
array<index_type, Order> end;
|
||||
for (int i = 0; i < Order; i++) {
|
||||
using device::min;
|
||||
end[i] = min<index_type>(start[i] + window_size[i], in_spatial_dims[i]);
|
||||
}
|
||||
|
||||
for (int i = 0; i < Order; i++) {
|
||||
using device::max;
|
||||
start[i] = max(start[i], 0);
|
||||
}
|
||||
|
||||
T max_value = numeric_limits<T>::lowest();
|
||||
index_type max_idx = -1;
|
||||
|
||||
size_type in_spatial_size = 1;
|
||||
for (int i = 0; i < Order; i++)
|
||||
in_spatial_size *= in_spatial_dims[i];
|
||||
|
||||
const auto outer_offset = (n * channels + c) * in_spatial_size;
|
||||
if (Order == 1) {
|
||||
array<index_type, Order> idx;
|
||||
for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
|
||||
index_type offset = 0;
|
||||
index_type stride = 1;
|
||||
for (int i = Order - 1; i >= 0; i--) {
|
||||
offset += stride * idx[i];
|
||||
stride *= in_spatial_dims[i];
|
||||
}
|
||||
|
||||
if (input[outer_offset + offset] > max_value) {
|
||||
max_idx = offset;
|
||||
max_value = input[outer_offset + offset];
|
||||
}
|
||||
}
|
||||
} else if (Order == 2) {
|
||||
array<index_type, Order> idx;
|
||||
for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
|
||||
for (idx[1] = start[1]; idx[1] != end[1]; idx[1]++) {
|
||||
index_type offset = 0;
|
||||
index_type stride = 1;
|
||||
for (int i = Order - 1; i >= 0; i--) {
|
||||
offset += stride * idx[i];
|
||||
stride *= in_spatial_dims[i];
|
||||
}
|
||||
|
||||
if (input[outer_offset + offset] > max_value) {
|
||||
max_idx = offset;
|
||||
max_value = input[outer_offset + offset];
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if(Order == 3) {
|
||||
array<index_type, Order> idx;
|
||||
for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
|
||||
for (idx[1] = start[1]; idx[1] != end[1]; idx[1]++) {
|
||||
for (idx[2] = start[2]; idx[2] != end[2]; idx[2]++) {
|
||||
index_type offset = 0;
|
||||
index_type stride = 1;
|
||||
for (int i = Order - 1; i >= 0; i--) {
|
||||
offset += stride * idx[i];
|
||||
stride *= in_spatial_dims[i];
|
||||
}
|
||||
|
||||
if (input[outer_offset + offset] > max_value) {
|
||||
max_idx = offset;
|
||||
max_value = input[outer_offset + offset];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
output[idx] = max_value;
|
||||
indices[idx] = max_idx;
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t Order>
|
||||
__global__ void max_unpooling(
|
||||
Span<T> output, View<T> input, View<T> indices, size_type channels,
|
||||
array<size_type, Order> out_spatial_dims, array<size_type, Order> in_spatial_dims,
|
||||
array<size_type, Order> window_size, array<size_type, Order> strides, array<size_type, Order> padding_left)
|
||||
{
|
||||
/* the output has already been zero filled */
|
||||
/* Every input value represents a window in the output. The max unpooling operation
|
||||
* copies the input value to exactly one location in the output window which is given
|
||||
* by the indices tensor.
|
||||
*/
|
||||
for (auto idx : grid_stride_range(input.size())) {
|
||||
size_type in_spatial_size = 1;
|
||||
array<index_type, Order> window_idx;
|
||||
for (int i = Order - 1; i >= 0; i--) {
|
||||
window_idx[i] = (idx / in_spatial_size) % in_spatial_dims[i];
|
||||
in_spatial_size *= in_spatial_dims[i];
|
||||
}
|
||||
|
||||
const index_type n = idx / (in_spatial_size * channels);
|
||||
const index_type c = (idx / in_spatial_size) % channels;
|
||||
|
||||
array<index_type, Order> start;
|
||||
for (int i = 0; i < Order; i++) {
|
||||
using device::min;
|
||||
using device::max;
|
||||
start[i] = max(0, min(window_idx[i] * strides[i] - padding_left[i], out_spatial_dims[i] - 1));
|
||||
}
|
||||
|
||||
size_type out_spatial_size = 1;
|
||||
for (int i = 0; i < Order; i++)
|
||||
out_spatial_size *= out_spatial_dims[i];
|
||||
|
||||
index_type outer_offset = (n * channels + c) * out_spatial_size;
|
||||
output[outer_offset + static_cast<index_type>(indices[idx])] = input[idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t Order> static
|
||||
void launch_max_pooling_kernel(
|
||||
const Stream& stream,
|
||||
Span<T> output, Span<T> indices, View<T> input, std::size_t channels,
|
||||
const std::vector<std::size_t>& out_spatial_dims, const std::vector<std::size_t>& in_spatial_dims,
|
||||
const std::vector<std::size_t>& window_size,
|
||||
const std::vector<std::size_t>& strides, const std::vector<std::size_t>& padding_left)
|
||||
{
|
||||
CV_Assert(indices.size() == output.size());
|
||||
CV_Assert(out_spatial_dims.size() == Order);
|
||||
CV_Assert(in_spatial_dims.size() == Order);
|
||||
CV_Assert(window_size.size() == Order);
|
||||
CV_Assert(strides.size() == Order);
|
||||
CV_Assert(padding_left.size() == Order);
|
||||
|
||||
array<size_type, Order> out_spatial_dims_k, in_spatial_dims_k;
|
||||
out_spatial_dims_k.assign(std::begin(out_spatial_dims), std::end(out_spatial_dims));
|
||||
in_spatial_dims_k.assign(std::begin(in_spatial_dims), std::end(in_spatial_dims));
|
||||
|
||||
array<size_type, Order> window_size_k, strides_k, padding_left_k;
|
||||
window_size_k.assign(std::begin(window_size), std::end(window_size));
|
||||
strides_k.assign(std::begin(strides), std::end(strides));
|
||||
padding_left_k.assign(std::begin(padding_left), std::end(padding_left));
|
||||
|
||||
auto kernel = raw::max_pooling_with_indices<T, Order>;
|
||||
auto policy = make_policy(kernel, output.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, output, indices, input, channels,
|
||||
out_spatial_dims_k, in_spatial_dims_k, window_size_k, strides_k, padding_left_k);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void max_pooling_with_indices(
|
||||
const Stream& stream,
|
||||
TensorSpan<T> output, TensorSpan<T> indices, TensorView<T> input,
|
||||
const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
|
||||
const std::vector<std::size_t>& padding_left)
|
||||
{
|
||||
CV_Assert(is_shape_same(output, indices));
|
||||
CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
|
||||
|
||||
auto order = window_size.size();
|
||||
CV_Assert(strides.size() == order);
|
||||
CV_Assert(padding_left.size() == order);
|
||||
CV_Assert(output.rank() == order + 2);
|
||||
CV_Assert(input.rank() == order + 2);
|
||||
|
||||
std::vector<std::size_t> out_spatial_dims(order), in_spatial_dims(order);
|
||||
for (int i = 0; i < order; i++) {
|
||||
in_spatial_dims[i] = input.get_axis_size(2 + i);
|
||||
out_spatial_dims[i] = output.get_axis_size(2 + i);
|
||||
}
|
||||
|
||||
CV_Assert(1 <= order && order <= 3);
|
||||
std::size_t channels = input.get_axis_size(1);
|
||||
if (order == 3) {
|
||||
launch_max_pooling_kernel<T, 3>(stream, output, indices, input, channels,
|
||||
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
|
||||
} else if (order == 2) {
|
||||
launch_max_pooling_kernel<T, 2>(stream, output, indices, input, channels,
|
||||
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
|
||||
} else if (order == 1) {
|
||||
launch_max_pooling_kernel<T, 1>(stream, output, indices, input, channels,
|
||||
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void max_pooling_with_indices(const Stream&,
|
||||
TensorSpan<__half>, TensorSpan<__half>, TensorView<__half>,
|
||||
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
|
||||
const std::vector<std::size_t>&);
|
||||
#endif
|
||||
|
||||
template void max_pooling_with_indices(const Stream&,
|
||||
TensorSpan<float>, TensorSpan<float>, TensorView<float>,
|
||||
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
|
||||
const std::vector<std::size_t>&);
|
||||
|
||||
template <class T, std::size_t Order> static
|
||||
void launch_max_unpooling_kernel(
|
||||
const Stream& stream,
|
||||
Span<T> output, View<T> input, View<T> indices, std::size_t channels,
|
||||
const std::vector<std::size_t>& out_spatial_dims, const std::vector<std::size_t>& in_spatial_dims,
|
||||
const std::vector<std::size_t>& window_size,
|
||||
const std::vector<std::size_t>& strides, const std::vector<std::size_t>& padding_left)
|
||||
{
|
||||
CV_Assert(out_spatial_dims.size() == Order);
|
||||
CV_Assert(in_spatial_dims.size() == Order);
|
||||
CV_Assert(window_size.size() == Order);
|
||||
CV_Assert(strides.size() == Order);
|
||||
CV_Assert(padding_left.size() == Order);
|
||||
CV_Assert(indices.size() == input.size());
|
||||
|
||||
array<size_type, Order> out_spatial_dims_k, in_spatial_dims_k;
|
||||
out_spatial_dims_k.assign(std::begin(out_spatial_dims), std::end(out_spatial_dims));
|
||||
in_spatial_dims_k.assign(std::begin(in_spatial_dims), std::end(in_spatial_dims));
|
||||
|
||||
array<size_type, Order> window_size_k, strides_k, padding_left_k;
|
||||
window_size_k.assign(std::begin(window_size), std::end(window_size));
|
||||
strides_k.assign(std::begin(strides), std::end(strides));
|
||||
padding_left_k.assign(std::begin(padding_left), std::end(padding_left));
|
||||
|
||||
auto kernel = raw::max_unpooling<T, Order>;
|
||||
auto policy = make_policy(kernel, input.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, output, input, indices, channels,
|
||||
out_spatial_dims_k, in_spatial_dims_k, window_size_k, strides_k, padding_left_k);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void max_unpooling(
|
||||
const Stream& stream,
|
||||
TensorSpan<T> output, TensorView<T> input, TensorView<T> indices,
|
||||
const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
|
||||
const std::vector<std::size_t>& padding_left)
|
||||
{
|
||||
CV_Assert(is_shape_same(input, indices));
|
||||
CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
|
||||
|
||||
auto order = window_size.size();
|
||||
CV_Assert(strides.size() == order);
|
||||
CV_Assert(padding_left.size() == order);
|
||||
CV_Assert(output.rank() == order + 2);
|
||||
CV_Assert(input.rank() == order + 2);
|
||||
|
||||
std::vector<std::size_t> out_spatial_dims(order), in_spatial_dims(order);
|
||||
for (int i = 0; i < order; i++) {
|
||||
in_spatial_dims[i] = input.get_axis_size(2 + i);
|
||||
out_spatial_dims[i] = output.get_axis_size(2 + i);
|
||||
}
|
||||
|
||||
kernels::fill<T>(stream, output, 0.0);
|
||||
|
||||
/* only max_unpooling2d and max_unpooling3d are supported */
|
||||
CV_Assert(2 <= order && order <= 3);
|
||||
std::size_t channels = input.get_axis_size(1);
|
||||
if (order == 3) {
|
||||
launch_max_unpooling_kernel<T, 3>(stream, output, input, indices, channels,
|
||||
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
|
||||
} else if (order == 2) {
|
||||
launch_max_unpooling_kernel<T, 2>(stream, output, input, indices, channels,
|
||||
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void max_unpooling(const Stream&,
|
||||
TensorSpan<__half>, TensorView<__half>, TensorView<__half>,
|
||||
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
|
||||
const std::vector<std::size_t>&);
|
||||
#endif
|
||||
|
||||
template void max_unpooling(const Stream&,
|
||||
TensorSpan<float>, TensorView<float>, TensorView<float>,
|
||||
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
|
||||
const std::vector<std::size_t>&);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
32
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/memory.hpp
vendored
Normal file
32
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/memory.hpp
vendored
Normal file
@ -0,0 +1,32 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_MEMORY_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA_MEMORY_HPP
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
|
||||
|
||||
template <class T>
|
||||
__device__ T load_ldg(const T& src) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
|
||||
return __ldg(&src);
|
||||
#else
|
||||
return src;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__device__ T load_ldg(const T* src) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
|
||||
return __ldg(src);
|
||||
#else
|
||||
return *src;
|
||||
#endif
|
||||
}
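
/* Usage sketch (illustrative only; Span/View and grid_stride_range come from the neighbouring
 * headers in this directory): route read-only global loads through __ldg on sm_35+ while
 * falling back to plain loads elsewhere.
 *
 *   template <class T>
 *   __global__ void copy_kernel(Span<T> output, View<T> input) {
 *       for (auto i : grid_stride_range(output.size()))
 *           output[i] = load_ldg(input[i]);
 *   }
 */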
|
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_MEMORY_HPP */
|
145
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/mvn.cu
vendored
Normal file
145
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/mvn.cu
vendored
Normal file
@ -0,0 +1,145 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "math.hpp"
|
||||
#include "types.hpp"
|
||||
#include "atomics.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T>
|
||||
__global__ void reduce_mean(Span<float> means, View<T> input, size_type inner_size) {
|
||||
for (auto idx : grid_stride_range(input.size())) {
|
||||
const index_type outer_idx = idx / inner_size;
|
||||
atomicAdd(&means[outer_idx], static_cast<float>(input[idx]) / inner_size);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void reduce_mean_sqr_sum(Span<float> means, Span<float> sum_sqrs, View<T> input, size_type inner_size) {
|
||||
for (auto idx : grid_stride_range(input.size())) {
|
||||
const index_type outer_idx = idx / inner_size;
|
||||
auto x = static_cast<float>(input[idx]);
|
||||
atomicAdd(&means[outer_idx], x / inner_size);
|
||||
atomicAdd(&sum_sqrs[outer_idx], x * x);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void compute_normalization_scale(Span<float> scale, View<float> means, View<float> sums_sqr, size_type inner_size, float eps) {
|
||||
for (auto idx : grid_stride_range(scale.size())) {
|
||||
auto mean = means[idx];
|
||||
auto var = sums_sqr[idx] / inner_size - mean * mean;
|
||||
using device::rsqrt;
|
||||
scale[idx] = rsqrt(eps + var);
|
||||
}
|
||||
}
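
/* The scale computed above is the inverse standard deviation used by MVN:
 *
 *   var   = E[x^2] - (E[x])^2 = sums_sqr[idx] / inner_size - mean * mean
 *   scale = 1 / sqrt(var + eps)
 *
 * normalize_mean_variance() below then emits (x - mean) * scale for every element.
 */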
|
||||
|
||||
template <class T>
|
||||
__global__ void normalize_mean(Span<T> output, View<T> input, View<float> means, size_type inner_size) {
|
||||
for (auto idx : grid_stride_range(output.size())) {
|
||||
const index_type outer_idx = idx / inner_size;
|
||||
output[idx] = static_cast<float>(input[idx]) - means[outer_idx];
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void normalize_mean_variance(Span<T> output, View<T> input, View<float> means, View<float> scale, size_type inner_size) {
|
||||
for (auto idx : grid_stride_range(output.size())) {
|
||||
const index_type outer_idx = idx / inner_size;
|
||||
output[idx] = (static_cast<float>(input[idx]) - means[outer_idx]) * scale[outer_idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void reduce_mean(const Stream& stream, Span<float> means, View<T> input, std::size_t inner_size)
|
||||
{
|
||||
CV_Assert(input.size() / inner_size == means.size());
|
||||
|
||||
auto kernel = raw::reduce_mean<T>;
|
||||
auto policy = make_policy(kernel, input.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, means, input, inner_size);
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void reduce_mean(const Stream&, Span<float>, View<__half>, std::size_t);
|
||||
#endif
|
||||
template void reduce_mean(const Stream&, Span<float>, View<float>, std::size_t);
|
||||
|
||||
template <class T>
|
||||
void reduce_mean_sqr_sum(const Stream& stream, Span<float> means, Span<float> sum_sqrs, View<T> input, std::size_t inner_size)
|
||||
{
|
||||
CV_Assert(input.size() / inner_size == means.size());
|
||||
CV_Assert(input.size() / inner_size == sum_sqrs.size());
|
||||
|
||||
auto kernel = raw::reduce_mean_sqr_sum<T>;
|
||||
auto policy = make_policy(kernel, input.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, means, sum_sqrs, input, inner_size);
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void reduce_mean_sqr_sum(const Stream&, Span<float>, Span<float>, View<__half>, std::size_t);
|
||||
#endif
|
||||
template void reduce_mean_sqr_sum(const Stream&, Span<float>, Span<float>, View<float>, std::size_t);
|
||||
|
||||
void compute_normalization_scale(const Stream& stream, Span<float> scale, View<float> means, View<float> sum_sqrs, std::size_t inner_size, float eps)
|
||||
{
|
||||
CV_Assert(scale.size() == means.size());
|
||||
CV_Assert(scale.size() == sum_sqrs.size());
|
||||
|
||||
auto kernel = raw::compute_normalization_scale;
|
||||
auto policy = make_policy(kernel, scale.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, scale, means, sum_sqrs, inner_size, eps);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void normalize_mean(const Stream& stream, Span<T> output, View<T> input, View<float> means, std::size_t inner_size)
|
||||
{
|
||||
CV_Assert(output.size() == input.size());
|
||||
CV_Assert(input.size() / inner_size == means.size());
|
||||
|
||||
auto kernel = raw::normalize_mean<T>;
|
||||
auto policy = make_policy(kernel, output.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, output, input, means, inner_size);
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void normalize_mean(const Stream&, Span<__half>, View<__half>, View<float>, std::size_t);
|
||||
#endif
|
||||
template void normalize_mean(const Stream&, Span<float>, View<float>, View<float>, std::size_t);
|
||||
|
||||
template <class T>
|
||||
void normalize_mean_variance(const Stream& stream, Span<T> output, View<T> input, View<float> means, View<float> scale, std::size_t inner_size)
|
||||
{
|
||||
CV_Assert(input.size() == output.size());
|
||||
CV_Assert(input.size() / inner_size == means.size());
|
||||
CV_Assert(input.size() / inner_size == scale.size());
|
||||
|
||||
auto kernel = raw::normalize_mean_variance<T>;
|
||||
auto policy = make_policy(kernel, output.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, output, input, means, scale, inner_size);
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void normalize_mean_variance(const Stream&, Span<__half>, View<__half>, View<float>, View<float>, std::size_t);
|
||||
#endif
|
||||
template void normalize_mean_variance(const Stream&, Span<float>, View<float>, View<float>, View<float>, std::size_t);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
123
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/normalize.cu
vendored
Normal file
123
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/normalize.cu
vendored
Normal file
@ -0,0 +1,123 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "array.hpp"
|
||||
#include "math.hpp"
|
||||
#include "types.hpp"
|
||||
#include "atomics.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include "../cuda4dnn/kernels/fill_copy.hpp"
|
||||
#include "../cuda4dnn/kernels/scale_shift.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T>
|
||||
__global__ void reduce_sum_abs(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride) {
|
||||
for (auto idx : grid_stride_range(input.size())) {
|
||||
const index_type outer_idx = idx / outer_stride;
|
||||
const index_type inner_idx = idx % mid_stride;
|
||||
|
||||
const index_type sum_idx = outer_idx * mid_stride + inner_idx;
|
||||
atomicAdd(&output[sum_idx], device::abs(input[idx]));
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void reciprocal(Span<T> output, T epsilon) {
|
||||
for (auto idx : grid_stride_range(output.size()))
|
||||
output[idx] = T(1) / (output[idx] + epsilon);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void reduce_sum_squared(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride) {
|
||||
for (auto idx : grid_stride_range(input.size())) {
|
||||
const index_type outer_idx = idx / outer_stride;
|
||||
const index_type inner_idx = idx % mid_stride;
|
||||
|
||||
const index_type sum_idx = outer_idx * mid_stride + inner_idx;
|
||||
atomicAdd(&output[sum_idx], input[idx] * input[idx]);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void rsqrt(Span<T> output, T epsilon) {
|
||||
for (auto idx : grid_stride_range(output.size())) {
|
||||
using device::sqrt;
|
||||
output[idx] = T(1) / sqrt(output[idx] + epsilon);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void apply_norm(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride, View<T> sums) {
|
||||
for (auto idx : grid_stride_range(output.size())) {
|
||||
const index_type outer_idx = idx / outer_stride;
|
||||
const index_type inner_idx = idx % mid_stride;
|
||||
|
||||
const index_type sum_idx = outer_idx * mid_stride + inner_idx;
|
||||
output[idx] = input[idx] * sums[sum_idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void normalize(
|
||||
const Stream& stream,
|
||||
Span<T> output,
|
||||
View<T> input, std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, std::size_t norm, T epsilon,
|
||||
Span<T> workspace)
|
||||
{
|
||||
CV_Assert(output.size() == input.size());
|
||||
CV_Assert(output.size() == outer_size * mid_size * inner_size);
|
||||
CV_Assert(norm == 1 || norm == 2);
|
||||
CV_Assert(workspace.size() >= outer_size * inner_size);
|
||||
|
||||
auto sums = Span<T>(workspace.data(), outer_size * inner_size);
|
||||
|
||||
fill<T>(stream, sums, 0.0);
|
||||
|
||||
if (norm == 1) {
|
||||
auto reduce_kernel = raw::reduce_sum_abs<T>;
|
||||
auto policy = make_policy(reduce_kernel, input.size(), 0, stream);
|
||||
launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size);
|
||||
|
||||
auto reciprocal_kernel = raw::reciprocal<T>;
|
||||
policy = make_policy(reciprocal_kernel, sums.size(), 0, stream);
|
||||
launch_kernel(reciprocal_kernel, policy, sums, epsilon);
|
||||
} else {
|
||||
auto reduce_kernel = raw::reduce_sum_squared<T>;
|
||||
auto policy = make_policy(reduce_kernel, input.size(), 0, stream);
|
||||
launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size);
|
||||
|
||||
auto rsqrt_kernel = raw::rsqrt<T>;
|
||||
policy = make_policy(rsqrt_kernel, sums.size(), 0, stream);
|
||||
launch_kernel(rsqrt_kernel, policy, sums, epsilon);
|
||||
}
|
||||
|
||||
auto scale_kernel = raw::apply_norm<T>;
|
||||
auto policy = make_policy(scale_kernel, output.size(), 0, stream);
|
||||
launch_kernel(scale_kernel, policy, output, input, mid_size * inner_size, inner_size, sums);
|
||||
}
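
/* Net effect of the two branches above, per (outer, inner) position reduced over the mid axis:
 *
 *   norm == 1:  output = input / (sum |input|   + epsilon)
 *   norm == 2:  output = input / sqrt(sum input^2 + epsilon)
 *
 * The reciprocal/rsqrt kernels fold the division into `sums`, so apply_norm is a plain multiply.
 */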
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void normalize(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t, std::size_t, std::size_t, __half, Span<__half>);
|
||||
#endif
|
||||
template void normalize(const Stream&, Span<float>, View<float>, std::size_t, std::size_t, std::size_t, std::size_t, float, Span<float>);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
201
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/padding.cu
vendored
Normal file
201
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/padding.cu
vendored
Normal file
@ -0,0 +1,201 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "array.hpp"
|
||||
#include "math.hpp"
|
||||
#include "types.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "kernel_dispatcher.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, std::size_t Rank>
|
||||
__global__ void copy_with_reflection101(
|
||||
Span<T> output, array<size_type, Rank> out_strides, array<index_type, Rank> start, array<index_type, Rank> end,
|
||||
View<T> input, array<size_type, Rank> in_strides)
|
||||
{
|
||||
for (auto i : grid_stride_range(output.size())) {
|
||||
/* compute output axis indices corresponding to element 'i' */
|
||||
array<index_type, Rank> out_index;
|
||||
out_index[0] = i / out_strides[0];
|
||||
for (int j = 1; j < Rank; j++)
|
||||
out_index[j] = (i % out_strides[j - 1]) / out_strides[j];
|
||||
|
||||
/* compute input axis indices corresponding to output axis indices */
|
||||
array<index_type, Rank> in_index;
|
||||
for (int j = 0; j < Rank; j++) {
|
||||
/* if out_index < start, the point is in the left reflection region
|
||||
* the reflected value's index is the absolute value of the difference
|
||||
*
|
||||
* otherwise, if the value is in the copy region, out_index - start gives the input index
|
||||
*/
|
||||
using device::abs;
|
||||
in_index[j] = abs(out_index[j] - start[j]);
|
||||
|
||||
/* if out_index >= end, it's in the right reflection region */
|
||||
if (out_index[j] >= end[j])
|
||||
in_index[j] = (end[j] - start[j]) - (out_index[j] - end[j]) - 2;
|
||||
}
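
/* Worked example (hypothetical sizes): for an axis with start = 2 and end = 7, i.e. an input
 * extent of 5 padded by two on the left: out_index 1 -> |1 - 2| = 1, out_index 0 -> 2, and on
 * the right out_index 7 -> (7 - 2) - (7 - 7) - 2 = 3, out_index 8 -> 2, which is exactly
 * BORDER_REFLECT_101 behaviour (the border element itself is not repeated).
 */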
|
||||
|
||||
/* compute input element number from input axis indices */
|
||||
index_type iidx = 0;
|
||||
for (int j = 0; j < Rank; j++)
|
||||
iidx += in_index[j] * in_strides[j];
|
||||
|
||||
output[i] = input[iidx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t Rank> static
|
||||
void launch_copy_with_reflection101(
|
||||
const Stream& stream,
|
||||
Span<T> output, const std::vector<std::size_t>& outStride,
|
||||
View<T> input, const std::vector<std::size_t>& inStride,
|
||||
const std::vector<std::pair<std::size_t, std::size_t>>& ranges)
|
||||
{
|
||||
CV_Assert(outStride.size() == Rank);
|
||||
CV_Assert(inStride.size() == Rank);
|
||||
CV_Assert(ranges.size() == Rank);
|
||||
|
||||
array<size_type, Rank> outStride_k, inStride_k;
|
||||
outStride_k.assign(std::begin(outStride), std::end(outStride));
|
||||
inStride_k.assign(std::begin(inStride), std::end(inStride));
|
||||
|
||||
array<index_type, Rank> start_k, end_k;
|
||||
for (int i = 0; i < Rank; i++) {
|
||||
start_k[i] = ranges[i].first;
|
||||
end_k[i] = ranges[i].second;
|
||||
}
|
||||
|
||||
auto kernel = raw::copy_with_reflection101<T, Rank>;
|
||||
auto policy = make_policy(kernel, output.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, output, outStride_k, start_k, end_k, input, inStride_k);
|
||||
}
|
||||
|
||||
GENERATE_KERNEL_DISPATCHER(copy_with_reflection101_dispatcher, launch_copy_with_reflection101);
|
||||
|
||||
template <class T>
|
||||
void copy_with_reflection101(
|
||||
const Stream& stream,
|
||||
TensorSpan<T> output, TensorView<T> input,
|
||||
std::vector<std::pair<std::size_t, std::size_t>> ranges)
|
||||
{
|
||||
CV_Assert(output.rank() == input.rank());
|
||||
CV_Assert(output.rank() == ranges.size());
|
||||
|
||||
/* squeezable axes at the beginning of both tensors can be eliminated
|
||||
*
|
||||
* Reasoning:
|
||||
* ----------
|
||||
* Suppose an item's indices in the input tensor is [i1, i2, ...]. The indices in the
|
||||
* output tensor will be [i1 + off1, i2 + off2, ...]. The rest of the elements in the output are padding.
|
||||
* The padding operation essentially copies items from the input tensor to new locations in the output tensor
|
||||
* and pads the remaining.
|
||||
*
|
||||
* If the size of the first axis of the input and output tensor is unity, the input and output indices
|
||||
* for all the elements will be of the form [0, i2, ...] and [0, i2 + off2, ...] respectively. Note that
|
||||
* there cannot be extra padding since the axes have unit size. The first index does not contribute to the
|
||||
* element's address calculation and hence does nothing apart from eating up a few cycles.
|
||||
*/
|
||||
while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
|
||||
CV_Assert(ranges[0].first == 0 && ranges[0].second == 1);
|
||||
|
||||
input.squeeze(0);
|
||||
output.squeeze(0);
|
||||
ranges.erase(std::begin(ranges));
|
||||
|
||||
CV_Assert(output.rank() == input.rank());
|
||||
CV_Assert(output.rank() == ranges.size());
|
||||
}
|
||||
|
||||
auto inShape = input.shape_as_vector();
|
||||
auto outShape = output.shape_as_vector();
|
||||
|
||||
/* contiguous axes which do not have any padding can be combined into one axis
|
||||
*
|
||||
* Reasoning:
|
||||
* ----------
|
||||
* Suppose an item's indices in the input tensor is [i1, i2, i3, ...]. Let the first two axes not have any
|
||||
* padding. The indices in the output tensor will be [i1, i2, i3 + off3, ...].
|
||||
*
|
||||
* Each axis in the contiguous unpadded axes sequence will add an offset of iN * strideN. In the above example,
|
||||
* the two axes add a total offset of `i1 * stride1 + i2 * stride2`. We can merge the two axes into one axis with
|
||||
* a size of `size1 * size2`. The new offset added will be `i12 * stride2` as the kernel iterates through `i12`.
|
||||
* Note that `i12` is actually `(i1 * size2 + i2)` in the original tensor.
|
||||
*/
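
/* Worked example (shapes are hypothetical): for an input of shape [3, 8, 32] where only the
 * last axis receives padding, axes 0 and 1 are contiguous unpadded axes and are merged into a
 * single axis of size 24, so the copy kernel runs on an effective rank-2 problem of shape
 * [24, 32 + padding] instead of rank 3.
 */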
|
||||
for (int i = 0; i < inShape.size(); i++) {
|
||||
/* check if axis `i` requires any padding */
|
||||
if (ranges[i].first == 0 && ranges[i].second == inShape[i]) {
|
||||
/* loop invariant: `i` is the first axis in the contiguous unpadded axis sequence */
|
||||
CV_Assert(inShape[i] == outShape[i]);
|
||||
|
||||
/* we now iterate through the axes which follow and try to merge */
|
||||
int j = i + 1; /* `j` is the axis which we will attempt to merge */
|
||||
while (j < inShape.size() && ranges[j].first == 0 && ranges[j].second == inShape[j]) {
|
||||
CV_Assert(inShape[j] == outShape[j]);
|
||||
|
||||
/* `j` is also unpadded; merge `i` and `j` */
|
||||
auto new_size = inShape[i] * inShape[j];
|
||||
inShape[i] = new_size;
|
||||
outShape[i] = new_size;
|
||||
ranges[i].second = new_size;
|
||||
|
||||
/* delete axis `j` */
|
||||
inShape.erase(std::begin(inShape) + j);
|
||||
outShape.erase(std::begin(outShape) + j);
|
||||
ranges.erase(std::begin(ranges) + j);
|
||||
|
||||
/* optimizations should not break the invariants */
|
||||
CV_Assert(inShape.size() == outShape.size());
|
||||
CV_Assert(inShape.size() == ranges.size());
|
||||
CV_Assert(inShape[i] == outShape[i]);
|
||||
CV_Assert(ranges[i].first == 0 && ranges[i].second == inShape[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto rank = inShape.size();
|
||||
|
||||
std::vector<std::size_t> inStride(rank), outStride(rank);
|
||||
inStride.back() = 1;
|
||||
outStride.back() = 1;
|
||||
/* garbage, ..., garbage, 1 */
|
||||
|
||||
std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
|
||||
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
|
||||
/* dim[1], dim[2], ..., dim[-1], 1 */
|
||||
|
||||
std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<int>());
|
||||
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<int>());
|
||||
/* stride[0], stride[1], ..., stride[-2], 1 */
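/* illustrative sketch (not part of the upstream code): for a merged shape of {2, 3, 4}, the copy
 * above produces {3, 4, 1} and the reversed partial_sum turns it into the row-major strides
 * {12, 4, 1}, i.e. stride[k] = product of dim[k+1..rank-1] with stride[rank-1] == 1.
 */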
|
||||
|
||||
CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
|
||||
copy_with_reflection101_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, input, inStride, ranges);
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void copy_with_reflection101(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
|
||||
#endif
|
||||
template void copy_with_reflection101(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
288
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/permute.cu
vendored
Normal file
@ -0,0 +1,288 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "array.hpp"
|
||||
#include "types.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "kernel_dispatcher.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include "../cuda4dnn/kernels/fill_copy.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
#include <vector>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, std::size_t Rank>
|
||||
__global__ void permute(
|
||||
array<index_type, Rank> axis_order,
|
||||
Span<T> output, array<size_type, Rank> outStrides,
|
||||
View<T> input, array<size_type, Rank> inStrides)
|
||||
{
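/* index decomposition sketch (added for clarity): with output strides {12, 4, 1} and
 * axis_order {0, 2, 1}, the flat output index i = 17 decomposes as 17/12 = 1, (17%12)/4 = 1,
 * ((17%12)%4)/1 = 1, i.e. output coordinates (1, 1, 1); the input offset is then the sum of
 * each coordinate times the input stride of the axis it came from, inStrides[axis_order[j]].
 */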
|
||||
for (auto i : grid_stride_range(input.size())) {
|
||||
index_type oldPosition = 0;
|
||||
index_type newPosition = i;
|
||||
|
||||
for (int j = 0; j < Rank; j++)
|
||||
{
|
||||
auto order = axis_order[j];
|
||||
oldPosition += (newPosition / outStrides[j]) * inStrides[order];
|
||||
newPosition %= outStrides[j];
|
||||
}
|
||||
|
||||
output[i] = input[oldPosition];
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, int TILE_SIZE, int ROWS_PER_THREAD>
|
||||
__global__ void transpose(Span<T> output, View<T> input, size_type in_width, size_type out_width)
|
||||
{
|
||||
__shared__ T tile[TILE_SIZE][TILE_SIZE + 1];
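/* the extra column (TILE_SIZE + 1) staggers the rows in shared memory so that the
 * transposed, column-wise reads below do not all land in the same shared-memory bank,
 * avoiding bank conflicts
 */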
|
||||
|
||||
/* blockDim.y = TILE_SIZE / ROWS_PER_THREAD, blockDim.x = TILE_SIZE */
|
||||
const index_type in_x = blockIdx.x * TILE_SIZE + threadIdx.x;
|
||||
const index_type in_y_begin = blockIdx.y * TILE_SIZE + threadIdx.y;
|
||||
|
||||
/* Every valid input location has a corresponding output location and vice versa.
|
||||
* Hence, if we do not load values into the shared memory for a given location, we
|
||||
* also won't read them for storing in the output.
|
||||
*/
|
||||
for (int j = 0; j < TILE_SIZE; j += TILE_SIZE / ROWS_PER_THREAD)
|
||||
{
|
||||
const auto in_y_current = in_y_begin + j;
|
||||
if (in_x < in_width && in_y_current < out_width)
|
||||
tile[threadIdx.y + j][threadIdx.x] = input[in_y_current * in_width + in_x];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
/* We interchange `threadIdx.x` and `threadIdx.y` so that consecutive output indices map to
|
||||
* consecutive threads. This would allow writes across threds in a warp to be coalesced.
|
||||
*/
|
||||
const index_type out_x = blockIdx.y * TILE_SIZE + threadIdx.x;
|
||||
const index_type out_y_begin = blockIdx.x * TILE_SIZE + threadIdx.y;
|
||||
|
||||
for (int j = 0; j < TILE_SIZE; j += TILE_SIZE / ROWS_PER_THREAD)
|
||||
{
|
||||
const auto out_y_current = out_y_begin + j;
|
||||
if (out_x < out_width && out_y_current < in_width)
|
||||
output[out_y_current * out_width + out_x] = tile[threadIdx.x][threadIdx.y + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void transpose(const Stream& stream, Span<T> output, View<T> input, std::size_t in_width, std::size_t out_width)
|
||||
{
|
||||
/* Each block processes a TILE_SIZE x TILE_SIZE piece */
|
||||
constexpr int TILE_SIZE = 32;
|
||||
|
||||
/* Each thread processes ROWS_PER_THREAD rows. We do this to decrease the number of threads required
|
||||
* in a block so that the cost of the block-wide synchronization is minimized.
|
||||
*/
|
||||
constexpr int ROWS_PER_THREAD = 4;
|
||||
|
||||
dim3 grid_size((in_width + TILE_SIZE - 1) / TILE_SIZE, (out_width + TILE_SIZE - 1) / TILE_SIZE);
|
||||
dim3 block_size(TILE_SIZE, TILE_SIZE / ROWS_PER_THREAD);
|
||||
auto policy = execution_policy(grid_size, block_size, stream);
|
||||
|
||||
auto kernel = raw::transpose<T, TILE_SIZE, ROWS_PER_THREAD>;
|
||||
launch_kernel(kernel, policy, output, input, in_width, out_width);
|
||||
}
|
||||
|
||||
template void transpose(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t);
|
||||
template void transpose(const Stream&, Span<float>, View<float>, std::size_t, std::size_t);
|
||||
|
||||
template <class T, std::size_t Rank> static
|
||||
void launch_permute_kernel(
|
||||
const Stream& stream,
|
||||
const std::vector<std::size_t>& order,
|
||||
Span<T> output, const std::vector<std::size_t>& outStride,
|
||||
View<T> input, const std::vector<std::size_t>& inStride)
|
||||
{
|
||||
CV_Assert(order.size() == Rank);
|
||||
CV_Assert(outStride.size() == Rank);
|
||||
CV_Assert(inStride.size() == Rank);
|
||||
|
||||
array<index_type, Rank> order_k;
|
||||
order_k.assign(std::begin(order), std::end(order));
|
||||
|
||||
array<size_type, Rank> outStride_k, inStride_k;
|
||||
outStride_k.assign(std::begin(outStride), std::end(outStride));
|
||||
inStride_k.assign(std::begin(inStride), std::end(inStride));
|
||||
|
||||
auto kernel = raw::permute<T, Rank>;
|
||||
auto policy = make_policy(kernel, input.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, order_k, output, outStride_k, input, inStride_k);
|
||||
}
|
||||
|
||||
GENERATE_KERNEL_DISPATCHER(permute_dispatcher, launch_permute_kernel);
|
||||
|
||||
template <class T>
|
||||
void permute(
|
||||
const Stream& stream,
|
||||
TensorSpan<T> output, TensorView<T> input,
|
||||
std::vector<std::size_t> order)
|
||||
{
|
||||
CV_Assert(output.rank() == input.rank());
|
||||
CV_Assert(input.rank() == order.size());
|
||||
CV_Assert(input.size() == output.size());
|
||||
|
||||
auto rank = output.rank();
|
||||
auto inShape = input.shape_as_vector();
|
||||
auto outShape = output.shape_as_vector();
|
||||
|
||||
/* singleton axes do not contribute towards address calculation
|
||||
*
|
||||
* Reasoning:
|
||||
* ----------
|
||||
* Suppose an item's indices in the input tensor is [i1, i2, ...]. The indices in the
|
||||
* output tensor will be some permutation of the input tensor indices. Let the output
|
||||
* tensor indices be [o1, o2, ...]. The permutation operation essentially copies items
|
||||
* from the input tensor to new locations in the output tensor as dictated by the indices.
|
||||
*
|
||||
* If the size of the nth axis (say i2) of the input is one, the input and output indices for
* all the elements will be of the form [i1, 0, ...] and [..., 0, ...] respectively.
* The index does not contribute to the element's address calculation and hence would give
* an identical result if it weren't there.
|
||||
*/
|
||||
for (int i = 0; i < rank; i++)
|
||||
{
|
||||
/* index `i` corresponds to the axis index in the output; order[i] has the corresponding axis index in the input */
|
||||
while (i < rank && outShape[i] == 1)
|
||||
{
|
||||
int in_i = order[i];
|
||||
CV_Assert(inShape[in_i] == 1);
|
||||
|
||||
/* delete axis `i` */
|
||||
inShape.erase(std::begin(inShape) + in_i);
|
||||
outShape.erase(std::begin(outShape) + i);
|
||||
|
||||
/* deletion of an axis reduces an axis in the input tensor which would cause the indices
|
||||
* of the axes that come after the deleted axis to reduce by one
|
||||
*/
|
||||
order.erase(order.begin() + i);
|
||||
for (auto& axis : order)
|
||||
if (axis > in_i)
|
||||
axis--;
|
||||
|
||||
rank--;
|
||||
|
||||
/* optimizations should not break the invariants */
|
||||
CV_Assert(rank == order.size());
|
||||
CV_Assert(inShape.size() == order.size());
|
||||
CV_Assert(outShape.size() == order.size());
|
||||
CV_Assert(input.size() == output.size());
|
||||
}
|
||||
}
|
||||
|
||||
/* contiguous axes whose relative ordering stays same before and after permutation can be merged into one axis
|
||||
* example: in permute order 0 2 3 1, axes 2 and 3 can be grouped into a single axis
|
||||
*
|
||||
* Reasoning:
|
||||
* ----------
|
||||
* Suppose an item's indices in the input tensor is [i0, i1, i2, i3, ...]. Let the permutation order be [0, 3, 1, 2, ...].
|
||||
* Note that i1 and i2 are adjacent axes in the same order in input as well as output. The indices in the output tensor
|
||||
* will be [i0, i3, i1, i2, ...].
|
||||
*
|
||||
* Each axis in the contiguous axes sequence will add an offset of iN * strideN. In the above example,
|
||||
* the two axes add a total offset of `i1 * (size2 * stride2) + i2 * stride2` which is `(i1 * size2 + i2) * stride2`,
|
||||
* in both input and output. Note stride2 can be different in the input and output. We can merge the two axes into one axis
|
||||
* with a size of `size1 * size2`. The new offset added will be `i12 * stride12` as the kernel iterates through `i12`. Note
|
||||
* that `i12` is actually `(i1 * size2 + i2)` and `stride12` is `stride2`.
|
||||
*/
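/* worked example (illustrative): for order {0, 2, 3, 1} on a shape [N, C, H, W], output axes 1
 * and 2 map to the adjacent input axes 2 and 3, so they are merged below into a single axis of
 * size H*W; the problem reduces to order {0, 2, 1} over input [N, C, H*W] and output [N, H*W, C].
 */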
|
||||
for (int i = 0; i < rank; i++) {
|
||||
/* the indices used in the loops such as `i` and `j` are axis indices in the output tensor */
|
||||
/* the corresponding input axis indices are `order[i]` and `order[j]`*/
|
||||
|
||||
/* loop invariant: `i` is the first axis in the contiguous unpermuted axis sequence */
|
||||
|
||||
int j = i + 1; /* `j` is the axis which we will attempt to merge */
|
||||
while (j < rank && (order[i] + 1) == order[j]) {
|
||||
/* axis `i` and axis `j` do not change relative order */
|
||||
|
||||
auto in_i = order[i], in_j = order[j];
|
||||
|
||||
auto new_size = inShape[in_i] * inShape[in_j];
|
||||
inShape[in_i] = new_size;
|
||||
outShape[i] = new_size;
|
||||
|
||||
/* delete axis `j` */
|
||||
inShape.erase(std::begin(inShape) + in_j);
|
||||
outShape.erase(std::begin(outShape) + j);
|
||||
|
||||
/* deletion of an axis reduces an axis in the input tensor which would cause the indices
|
||||
* of the axes that come after the deleted axis to reduce by one
|
||||
*/
|
||||
order.erase(order.begin() + j);
|
||||
for (auto& axis : order)
|
||||
if (axis > order[i])
|
||||
axis--;
|
||||
|
||||
rank--;
|
||||
|
||||
/* optimizations should not break the invariants */
|
||||
CV_Assert(rank == order.size());
|
||||
CV_Assert(inShape.size() == order.size());
|
||||
CV_Assert(outShape.size() == order.size());
|
||||
CV_Assert(input.size() == output.size());
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::size_t> inStride(rank), outStride(rank);
|
||||
inStride.back() = 1;
|
||||
outStride.back() = 1;
|
||||
/* garbage, ..., garbage, 1 */
|
||||
|
||||
std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
|
||||
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
|
||||
/* dim[1], dim[2], ..., dim[-1], 1 */
|
||||
|
||||
std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>());
|
||||
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
|
||||
/* stride[0], stride[1], ..., stride[-2], 1 */
|
||||
|
||||
const bool is_in_order = [&order] {
|
||||
for (int i = 0; i < order.size(); i++)
|
||||
if (order[i] != i)
|
||||
return false;
|
||||
return true;
|
||||
}();
|
||||
|
||||
if (is_in_order)
|
||||
{
|
||||
kernels::copy<T>(stream, output, input);
|
||||
}
|
||||
else if(rank == 2)
|
||||
{
|
||||
/* use the more efficient transpose kernel */
|
||||
transpose<T>(stream, output, input, inShape[1], outShape[1]);
|
||||
}
|
||||
else
|
||||
{
|
||||
CV_Assert(3 <= rank && rank <= CSL_MAX_TENSOR_RANK);
|
||||
permute_dispatcher<T, 3, CSL_MAX_TENSOR_RANK>(rank, stream, order, output, outStride, input, inStride);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void permute(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
|
||||
#endif
|
||||
template void permute(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
176
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/prior_box.cu
vendored
Normal file
@ -0,0 +1,176 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "array.hpp"
|
||||
#include "math.hpp"
|
||||
#include "types.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, bool Normalize>
|
||||
__global__ void prior_box(
|
||||
Span<T> output,
|
||||
View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
|
||||
size_type layerWidth, size_type layerHeight,
|
||||
size_type imageWidth, size_type imageHeight)
|
||||
{
|
||||
/* each box consists of two pairs of coordinates and hence 4 values in total */
/* since the entire output (the first channel at least) consists of these boxes,
* we are guaranteed that the output is aligned to a boundary of 4 values
|
||||
*/
|
||||
using vector_type = get_vector_type_t<T, 4>;
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
|
||||
/* num_points contains the number of points in the feature map of interest
|
||||
* each iteration of the stride loop selects a point and generates prior boxes for it
|
||||
*/
|
||||
size_type num_points = layerWidth * layerHeight;
|
||||
for (auto idx : grid_stride_range(num_points)) {
|
||||
const index_type x = idx % layerWidth,
|
||||
y = idx / layerWidth;
|
||||
|
||||
index_type output_offset_v4 = idx * offsetX.size() * boxWidth.size();
|
||||
for (int i = 0; i < boxWidth.size(); i++) {
|
||||
for (int j = 0; j < offsetX.size(); j++) {
|
||||
float center_x = (x + offsetX[j]) * stepX;
|
||||
float center_y = (y + offsetY[j]) * stepY;
|
||||
|
||||
vector_type vec;
|
||||
if(Normalize) {
|
||||
vec.data[0] = (center_x - boxWidth[i] * 0.5f) / imageWidth;
|
||||
vec.data[1] = (center_y - boxHeight[i] * 0.5f) / imageHeight;
|
||||
vec.data[2] = (center_x + boxWidth[i] * 0.5f) / imageWidth;
|
||||
vec.data[3] = (center_y + boxHeight[i] * 0.5f) / imageHeight;
|
||||
} else {
|
||||
vec.data[0] = center_x - boxWidth[i] * 0.5f;
|
||||
vec.data[1] = center_y - boxHeight[i] * 0.5f;
|
||||
vec.data[2] = center_x + boxWidth[i] * 0.5f - 1.0f;
|
||||
vec.data[3] = center_y + boxHeight[i] * 0.5f - 1.0f;
|
||||
}
|
||||
|
||||
v_store(output_vPtr[output_offset_v4], vec);
|
||||
output_offset_v4++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void prior_box_clip(Span<T> output) {
|
||||
for (auto i : grid_stride_range(output.size())) {
|
||||
using device::clamp;
|
||||
output[i] = clamp<T>(output[i], 0.0, 1.0);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void prior_box_set_variance1(Span<T> output, float variance) {
|
||||
using vector_type = get_vector_type_t<T, 4>;
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
for (auto i : grid_stride_range(output.size() / 4)) {
|
||||
vector_type vec;
|
||||
for (int j = 0; j < 4; j++)
|
||||
vec.data[j] = variance;
|
||||
v_store(output_vPtr[i], vec);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void prior_box_set_variance4(Span<T> output, array<float, 4> variance) {
|
||||
using vector_type = get_vector_type_t<T, 4>;
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
for (auto i : grid_stride_range(output.size() / 4)) {
|
||||
vector_type vec;
|
||||
for(int j = 0; j < 4; j++)
|
||||
vec.data[j] = variance[j];
|
||||
v_store(output_vPtr[i], vec);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, bool Normalize> static
|
||||
void launch_prior_box_kernel(
|
||||
const Stream& stream,
|
||||
Span<T> output, View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
|
||||
std::size_t layerWidth, std::size_t layerHeight, std::size_t imageWidth, std::size_t imageHeight)
|
||||
{
|
||||
auto num_points = layerWidth * layerHeight;
|
||||
auto kernel = raw::prior_box<T, Normalize>;
|
||||
auto policy = make_policy(kernel, num_points, 0, stream);
|
||||
launch_kernel(kernel, policy,
|
||||
output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
|
||||
layerWidth, layerHeight, imageWidth, imageHeight);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void generate_prior_boxes(
|
||||
const Stream& stream,
|
||||
Span<T> output,
|
||||
View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
|
||||
std::vector<float> variance,
|
||||
std::size_t numPriors,
|
||||
std::size_t layerWidth, std::size_t layerHeight,
|
||||
std::size_t imageWidth, std::size_t imageHeight,
|
||||
bool normalize, bool clip)
|
||||
{
|
||||
if (normalize) {
|
||||
launch_prior_box_kernel<T, true>(
|
||||
stream, output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
|
||||
layerWidth, layerHeight, imageWidth, imageHeight
|
||||
);
|
||||
} else {
|
||||
launch_prior_box_kernel<T, false>(
|
||||
stream, output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
|
||||
layerWidth, layerHeight, imageWidth, imageHeight
|
||||
);
|
||||
}
|
||||
|
||||
std::size_t channel_size = layerHeight * layerWidth * numPriors * 4;
|
||||
CV_Assert(channel_size * 2 == output.size());
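/* layout implied by the assertion above (descriptive note): the output is [1, 2, channel_size];
 * the first channel holds the box coordinates written by the kernel above (and optionally
 * clipped below), the second channel is filled with the variances below
 */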
|
||||
|
||||
if (clip) {
|
||||
auto output_span_c1 = Span<T>(output.data(), channel_size);
|
||||
auto kernel = raw::prior_box_clip<T>;
|
||||
auto policy = make_policy(kernel, output_span_c1.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, output_span_c1);
|
||||
}
|
||||
|
||||
auto output_span_c2 = Span<T>(output.data() + channel_size, channel_size);
|
||||
if (variance.size() == 1) {
|
||||
auto kernel = raw::prior_box_set_variance1<T>;
|
||||
auto policy = make_policy(kernel, output_span_c2.size() / 4, 0, stream);
|
||||
launch_kernel(kernel, policy, output_span_c2, variance[0]);
|
||||
} else {
|
||||
array<float, 4> variance_k;
|
||||
variance_k.assign(std::begin(variance), std::end(variance));
|
||||
auto kernel = raw::prior_box_set_variance4<T>;
|
||||
auto policy = make_policy(kernel, output_span_c2.size() / 4, 0, stream);
|
||||
launch_kernel(kernel, policy, output_span_c2, variance_k);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void generate_prior_boxes(const Stream&, Span<__half>, View<float>, View<float>, View<float>, View<float>, float, float,
|
||||
std::vector<float>, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool, bool);
|
||||
#endif
|
||||
|
||||
template void generate_prior_boxes(const Stream&, Span<float>, View<float>, View<float>, View<float>, View<float>, float, float,
|
||||
std::vector<float>, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool, bool);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
216
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/region.cu
vendored
Normal file
@ -0,0 +1,216 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "math.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "limits.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
|
||||
template <class T>
|
||||
__global__ void region_box(
|
||||
Span<T> output, View<T> input, View<T> bias,
|
||||
size_type boxes_per_cell, size_type box_size,
|
||||
size_type rows, size_type cols, T scale_x_y,
|
||||
size_type height_norm, size_type width_norm,
|
||||
T object_prob_cutoff, bool new_coords)
|
||||
{
|
||||
using vector2_type = get_vector_type_t<T, 2>;
|
||||
auto bias_vPtr = vector2_type::get_pointer(bias.data());
|
||||
|
||||
for (auto box_index : grid_stride_range(output.size() / box_size)) {
|
||||
const auto box_of_the_cell = box_index % boxes_per_cell; /* box number within a cell */
|
||||
const auto box_offset = box_index * box_size;
|
||||
|
||||
const auto batch_inner_size = rows * cols * boxes_per_cell;
|
||||
const auto row_inner_size = cols * boxes_per_cell;
|
||||
const auto col_inner_size = boxes_per_cell;
|
||||
|
||||
const auto y = (box_index % batch_inner_size) / row_inner_size;
|
||||
const auto x = (box_index % row_inner_size) / col_inner_size;
|
||||
|
||||
/* When new_coords is true, we shouldn't use logistic activation again */
|
||||
T objectness_prob;
|
||||
if (new_coords)
|
||||
{
|
||||
const auto tmp_x = (input[box_offset + 0] - static_cast<T>(0.5)) * scale_x_y + static_cast<T>(0.5);
|
||||
const auto tmp_y = (input[box_offset + 1] - static_cast<T>(0.5)) * scale_x_y + static_cast<T>(0.5);
|
||||
|
||||
output[box_offset + 0] = fast_divide_ftz(static_cast<T>(x) + tmp_x, static_cast<T>(cols));
|
||||
output[box_offset + 1] = fast_divide_ftz(static_cast<T>(y) + tmp_y, static_cast<T>(rows));
|
||||
|
||||
vector2_type bias_xy;
|
||||
v_load(bias_xy, bias_vPtr[box_of_the_cell]);
|
||||
|
||||
output[box_offset + 2] = input[box_offset + 2] * input[box_offset + 2] *
|
||||
static_cast<T>(4) * bias_xy.data[0] / static_cast<T>(width_norm);
|
||||
output[box_offset + 3] = input[box_offset + 3] * input[box_offset + 3] *
|
||||
static_cast<T>(4) * bias_xy.data[1] / static_cast<T>(height_norm);
|
||||
|
||||
objectness_prob = input[box_offset + 4];
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto tmp_x = (fast_sigmoid(input[box_offset + 0]) - static_cast<T>(0.5)) * scale_x_y + static_cast<T>(0.5);
|
||||
const auto tmp_y = (fast_sigmoid(input[box_offset + 1]) - static_cast<T>(0.5)) * scale_x_y + static_cast<T>(0.5);
|
||||
|
||||
output[box_offset + 0] = fast_divide_ftz(static_cast<T>(x) + tmp_x, static_cast<T>(cols));
|
||||
output[box_offset + 1] = fast_divide_ftz(static_cast<T>(y) + tmp_y, static_cast<T>(rows));
|
||||
|
||||
vector2_type bias_xy;
|
||||
v_load(bias_xy, bias_vPtr[box_of_the_cell]);
|
||||
|
||||
output[box_offset + 2] = fast_exp(input[box_offset + 2]) * bias_xy.data[0] / static_cast<T>(width_norm);
|
||||
output[box_offset + 3] = fast_exp(input[box_offset + 3]) * bias_xy.data[1] / static_cast<T>(height_norm);
|
||||
|
||||
/* squash objectness score into a probability */
|
||||
objectness_prob = fast_sigmoid(input[box_offset + 4]);
|
||||
}
|
||||
|
||||
/* ignore prediction if the objectness probability is less than the cutoff */
|
||||
if (objectness_prob < object_prob_cutoff)
|
||||
objectness_prob = 0;
|
||||
|
||||
output[box_offset + 4] = objectness_prob;
|
||||
}
|
||||
}
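/* summary of the decode performed above (descriptive note, sigmoid path): with anchor
 * (bias) pair (b_w, b_h) and grid cell (x, y),
 *   box_x = (x + scale_x_y * (sigmoid(t_x) - 0.5) + 0.5) / cols
 *   box_y = (y + scale_x_y * (sigmoid(t_y) - 0.5) + 0.5) / rows
 *   box_w = exp(t_w) * b_w / width_norm
 *   box_h = exp(t_h) * b_h / height_norm
 * and the objectness score is sigmoid(t_o), zeroed when below object_prob_cutoff.
 */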
|
||||
|
||||
template <class T>
|
||||
__global__ void region_sigmoid_class_score(Span<T> output, View<T> input, T class_prob_cutoff,
|
||||
size_type box_size, bool new_coords)
|
||||
{
|
||||
for (auto idx : grid_stride_range(output.size())) {
|
||||
const index_type box_no = idx / box_size;
|
||||
const index_type start_of_box = box_no * box_size;
|
||||
const index_type box_offset = idx % box_size;
|
||||
|
||||
if (box_offset < 5) {
|
||||
/* continue as we have already processed these in region_box */
|
||||
continue;
|
||||
}
|
||||
|
||||
auto objectness_prob = output[start_of_box + 4];
|
||||
|
||||
/* the class probabilities we currently have are conditional class probabilities
|
||||
* given the object
|
||||
*
|
||||
* to obtain the actual class probability, we multiply the conditional probability
|
||||
* with the object probability
|
||||
*
|
||||
* when new_coords is true, we shouldn't use logistic activation again.
|
||||
*/
|
||||
|
||||
T actual_class_prob;
|
||||
if (new_coords)
|
||||
{
|
||||
actual_class_prob = objectness_prob * input[idx];
|
||||
}
|
||||
else
|
||||
{
|
||||
actual_class_prob = objectness_prob * fast_sigmoid(input[idx]);
|
||||
}
|
||||
|
||||
if (actual_class_prob <= class_prob_cutoff)
|
||||
actual_class_prob = T(0);
|
||||
output[idx] = actual_class_prob;
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void region_softmax_class_score(Span<T> output, View<T> input, T class_prob_cutoff, size_type box_size) {
|
||||
for (auto box_no : grid_stride_range(output.size() / box_size)) {
|
||||
const index_type start_of_box = box_no * box_size;
|
||||
const index_type start_idx = start_of_box + 5;
|
||||
const index_type end_idx = start_of_box + box_size;
|
||||
|
||||
auto largest = numeric_limits<T>::lowest();
|
||||
for (int idx = start_idx; idx < end_idx; idx++) {
|
||||
using device::max;
|
||||
largest = max(largest, input[idx]);
|
||||
}
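/* subtracting the running maximum before exponentiation keeps exp() within range;
 * the softmax values are unchanged since the common factor exp(-largest) cancels
 * when dividing by the sum below
 */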
|
||||
|
||||
auto sum = T(0);
|
||||
for (int idx = start_idx; idx < end_idx; idx++) {
|
||||
using device::exp;
|
||||
auto temp = exp(input[idx] - largest);
|
||||
sum += temp;
|
||||
output[idx] = temp;
|
||||
}
|
||||
|
||||
for (int idx = start_idx; idx < end_idx; idx++) {
|
||||
auto softmax_score = output[idx] / sum;
|
||||
|
||||
/* the class probabilities we currently have are conditional class probabilities
|
||||
* given the object
|
||||
*
|
||||
* to obtain the actual class probability, we multiply the conditional probability
|
||||
* with the object probability
|
||||
*/
|
||||
auto objectness_prob = output[start_of_box + 4];
|
||||
auto actual_class_prob = objectness_prob * softmax_score;
|
||||
if (actual_class_prob <= class_prob_cutoff)
|
||||
actual_class_prob = T(0);
|
||||
output[idx] = actual_class_prob;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void region(const Stream& stream, Span<T> output, View<T> input, View<T> bias,
|
||||
T object_prob_cutoff, T class_prob_cutoff,
|
||||
std::size_t boxes_per_cell, std::size_t box_size,
|
||||
std::size_t rows, std::size_t cols, T scale_x_y,
|
||||
std::size_t height_norm, std::size_t width_norm,
|
||||
bool if_true_sigmoid_else_softmax, /* true = sigmoid, false = softmax */
|
||||
bool new_coords)
|
||||
{
|
||||
CV_Assert(output.size() == input.size());
|
||||
CV_Assert(output.size() % box_size == 0);
|
||||
CV_Assert(is_fully_aligned(bias, 2));
|
||||
|
||||
auto box_kernel = raw::region_box<T>;
|
||||
auto box_policy = make_policy(box_kernel, output.size() / box_size, 0, stream);
|
||||
launch_kernel(box_kernel, box_policy,
|
||||
output, input, bias, boxes_per_cell, box_size,
|
||||
rows, cols, scale_x_y, height_norm, width_norm,
|
||||
object_prob_cutoff, new_coords);
|
||||
|
||||
if (if_true_sigmoid_else_softmax) {
|
||||
auto kernel_score = raw::region_sigmoid_class_score<T>;
|
||||
auto policy_score = make_policy(kernel_score, output.size(), 0, stream);
|
||||
launch_kernel(kernel_score, policy_score, output, input, class_prob_cutoff, box_size, new_coords);
|
||||
} else {
|
||||
auto kernel_score = raw::region_softmax_class_score<T>;
|
||||
auto policy_score = make_policy(kernel_score, output.size(), 0, stream);
|
||||
launch_kernel(kernel_score, policy_score, output, input, class_prob_cutoff, box_size);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void region(const Stream&, Span<__half>, View<__half>, View<__half>,
|
||||
__half, __half, std::size_t, std::size_t, std::size_t, std::size_t, __half, std::size_t, std::size_t, bool, bool);
|
||||
#endif
|
||||
|
||||
template void region(const Stream&, Span<float>, View<float>, View<float>,
|
||||
float, float, std::size_t, std::size_t, std::size_t, std::size_t, float, std::size_t, std::size_t, bool, bool);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
245
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/resize.cu
vendored
Normal file
@ -0,0 +1,245 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "math.hpp"
|
||||
#include "types.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "memory.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, std::size_t CHANNELS_PER_ITER>
|
||||
__global__ void resize_nn(
|
||||
Span<T> output, size_type out_height, size_type out_width,
|
||||
View<T> input, size_type in_height, size_type in_width,
|
||||
float o2i_fy, float o2i_fx, bool round, bool half_pixel_centers)
|
||||
{
|
||||
auto in_image_size = in_height * in_width;
|
||||
auto out_image_size = out_height * out_width;
|
||||
|
||||
/* think of the output and input as a collection of 2d images with the last axis
|
||||
* representing the width and the last but one axis representing the height
|
||||
*
|
||||
* the remaining axes together form a collection of these images/channels
|
||||
*/
|
||||
auto num_effective_channels = output.size() / out_image_size;
|
||||
|
||||
/* we process multiple channels every iteration to reuse the identical computation
|
||||
* involved with the spatial dimensions
|
||||
*
|
||||
* if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need
|
||||
* (num_effective_channels / CHANNELS_PER_ITER) iterations per (x, y) location
|
||||
*/
|
||||
auto num_channel_iters_per_xy = (num_effective_channels / CHANNELS_PER_ITER);
|
||||
|
||||
/* we need `num_channel_iters_per_xy` iterations per (x, y) and there are `out_image_size`
|
||||
* combinations of (x, y); hence, we'll need `num_channel_iters_per_xy * out_image_size`
|
||||
* iterations in total to finish the resize operation
|
||||
*/
|
||||
auto iters_required = num_channel_iters_per_xy * out_image_size;
|
||||
for (auto iter : grid_stride_range(iters_required)) {
|
||||
const index_type c_start = (iter / out_image_size) * CHANNELS_PER_ITER;
|
||||
|
||||
/* note here that consecutive `iter` values will often have consecutive `x` values
|
||||
* => stores into output will be coalesced across threads
|
||||
*/
|
||||
const index_type y = (iter % out_image_size) / out_width;
|
||||
const index_type x = iter % out_width;
|
||||
|
||||
auto in_yf = half_pixel_centers ? (y + 0.5f) * o2i_fy : y * o2i_fy;
|
||||
auto in_xf = half_pixel_centers ? (x + 0.5f) * o2i_fx : x * o2i_fx;
|
||||
|
||||
using device::lround;
|
||||
index_type in_y = round ? lround(in_yf) : static_cast<index_type>(in_yf);
|
||||
index_type in_x = round ? lround(in_xf) : static_cast<index_type>(in_xf);
|
||||
|
||||
using device::min;
|
||||
in_y = min(in_y, in_height - 1);
|
||||
in_x = min(in_x, in_width - 1);
|
||||
|
||||
index_type in_idx = c_start * in_image_size + in_y * in_width + in_x;
|
||||
index_type out_idx = c_start * out_image_size + y * out_width + x;
|
||||
|
||||
for (int i = 0; i < CHANNELS_PER_ITER; i++) {
|
||||
output[out_idx] = load_ldg(input[in_idx]);
|
||||
|
||||
in_idx += in_image_size;
|
||||
out_idx += out_image_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t CHANNELS_PER_ITER>
|
||||
__global__ void resize_bilinear(
|
||||
Span<T> output, size_type out_height, size_type out_width,
|
||||
View<T> input, size_type in_height, size_type in_width,
|
||||
float o2i_fy, float o2i_fx, bool half_pixel_centers)
|
||||
{
|
||||
auto in_image_size = in_height * in_width;
|
||||
auto out_image_size = out_height * out_width;
|
||||
|
||||
/* think of the output and input as a collection of 2d images with the last axis
|
||||
* representing the width and the last but one axis representing the height
|
||||
*
|
||||
* the remaining axes together form a collection of these images/channels
|
||||
*/
|
||||
auto num_effective_channels = output.size() / out_image_size;
|
||||
|
||||
/* we process multiple channels every iteration to reuse the identical computation
|
||||
* involved with the spatial dimensions
|
||||
*
|
||||
* if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need
|
||||
* (num_effective_channels / CHANNELS_PER_ITER) iterations per (x, y) location
|
||||
*/
|
||||
auto num_channel_iters_per_xy = (num_effective_channels / CHANNELS_PER_ITER);
|
||||
|
||||
/* we need `num_channel_iters_per_xy` iterations per (x, y) and there are `out_image_size`
|
||||
* combinations of (x, y); hence, we'll need `num_channel_iters_per_xy * out_image_size`
|
||||
* iterations in total to finish the resize operation
|
||||
*/
|
||||
auto iters_required = num_channel_iters_per_xy * out_image_size;
|
||||
|
||||
for (auto iter : grid_stride_range(iters_required)) {
|
||||
const index_type c_start = (iter / out_image_size) * CHANNELS_PER_ITER;
|
||||
const index_type c_end = c_start + CHANNELS_PER_ITER;
|
||||
|
||||
/* note here that consecutive `iter` values will often have consecutive `x` values
|
||||
* => stores into output will be coalesced across threads
|
||||
*/
|
||||
const index_type y = (iter % out_image_size) / out_width;
|
||||
const index_type x = iter % out_width;
|
||||
|
||||
using device::max;
|
||||
auto in_x = half_pixel_centers ? max<float>((x + 0.5f) * o2i_fx - 0.5f, 0.0f) : x * o2i_fx;
|
||||
auto in_y = half_pixel_centers ? max<float>((y + 0.5f) * o2i_fy - 0.5f, 0.0f) : y * o2i_fy;
|
||||
|
||||
auto in_x0 = static_cast<index_type>(in_x);
|
||||
auto in_y0 = static_cast<index_type>(in_y);
|
||||
|
||||
using device::min;
|
||||
auto in_x1 = min<index_type>(in_x0 + 1, in_width - 1);
|
||||
auto in_y1 = min<index_type>(in_y0 + 1, in_height - 1);
|
||||
|
||||
index_type in_offset_r0 = c_start * in_image_size + in_y0 * in_width;
|
||||
index_type in_offset_r1 = c_start * in_image_size + in_y1 * in_width;
|
||||
index_type out_idx = c_start * out_image_size + y * out_width + x;
|
||||
|
||||
#pragma unroll 1 /* disable unrolling to reduce register pressure; not sure how but it works */
|
||||
for (auto c = c_start; c < c_end; c++) {
|
||||
auto v_00 = load_ldg(input[in_offset_r0 + in_x0]),
|
||||
v_01 = load_ldg(input[in_offset_r0 + in_x1]),
|
||||
v_10 = load_ldg(input[in_offset_r1 + in_x0]),
|
||||
v_11 = load_ldg(input[in_offset_r1 + in_x1]);
|
||||
|
||||
output[out_idx] =
|
||||
v_00 +
|
||||
T(in_y - in_y0) * T(v_10 - v_00) +
|
||||
T(in_x - in_x0) * T(v_01 - v_00) +
|
||||
T(in_y - in_y0) * T(in_x - in_x0) * T(v_11 - v_01 - v_10 + v_00);
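/* equivalently, with dx = in_x - in_x0 and dy = in_y - in_y0, this is the separable form
 * lerp(lerp(v_00, v_01, dx), lerp(v_10, v_11, dx), dy) expanded into a single expression
 */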
|
||||
|
||||
in_offset_r0 += in_image_size;
|
||||
in_offset_r1 += in_image_size;
|
||||
out_idx += out_image_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t CHANNELS_PER_ITER> static
|
||||
void launch_multichannel_resize_nn(const Stream& stream,
|
||||
Span<T> output, size_type out_height, size_type out_width,
|
||||
View<T> input, size_type in_height, size_type in_width,
|
||||
float scale_y, float scale_x, bool round, bool half_pixel_centers)
|
||||
{
|
||||
auto kernel = raw::resize_nn<T, CHANNELS_PER_ITER>;
|
||||
auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
|
||||
launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void resize_nn(const Stream& stream, TensorSpan<T> output, TensorView<T> input, float scale_y, float scale_x, bool round, bool half_pixel_centers) {
|
||||
auto out_height = output.get_axis_size(-2);
|
||||
auto out_width = output.get_axis_size(-1);
|
||||
|
||||
auto in_height = input.get_axis_size(-2);
|
||||
auto in_width = input.get_axis_size(-1);
|
||||
|
||||
auto num_effective_channels = input.size_range(0, 2);
|
||||
auto num_iters = num_effective_channels * out_height * out_width;
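/* descriptive note (interpretation of the dispatch below): larger CHANNELS_PER_ITER values
 * amortize the per-pixel index computation over more channels but shrink the grid; the
 * thresholds only pick a wide batch when enough iterations remain to keep the device occupied
 */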
|
||||
|
||||
if (num_effective_channels % 32 == 0 && num_iters > 655360) {
|
||||
launch_multichannel_resize_nn<T, 32>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
|
||||
} else if (num_effective_channels % 16 == 0 && num_iters > 327680) {
|
||||
launch_multichannel_resize_nn<T, 16>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
|
||||
} else if (num_effective_channels % 8 == 0 && num_iters > 163840) {
|
||||
launch_multichannel_resize_nn<T, 8>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
|
||||
} else if (num_effective_channels % 4 == 0 && num_iters > 81920) {
|
||||
launch_multichannel_resize_nn<T, 4>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
|
||||
} else if (num_effective_channels % 2 == 0) {
|
||||
launch_multichannel_resize_nn<T, 2>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
|
||||
} else {
|
||||
launch_multichannel_resize_nn<T, 1>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void resize_nn<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, float, float, bool, bool);
|
||||
#endif
|
||||
template void resize_nn<float>(const Stream&, TensorSpan<float>, TensorView<float>, float, float, bool, bool);
|
||||
|
||||
template <class T, std::size_t CHANNELS_PER_ITER> static
|
||||
void launch_multichannel_resize_bilinear(const Stream& stream,
|
||||
Span<T> output, size_type out_height, size_type out_width,
|
||||
View<T> input, size_type in_height, size_type in_width,
|
||||
float scale_y, float scale_x, bool half_pixel_centers)
|
||||
{
|
||||
auto kernel = raw::resize_bilinear<T, CHANNELS_PER_ITER>;
|
||||
auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
|
||||
launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void resize_bilinear(const Stream& stream, TensorSpan<T> output, TensorView<T> input, float scale_y, float scale_x, bool half_pixel_centers) {
|
||||
auto out_height = output.get_axis_size(-2);
|
||||
auto out_width = output.get_axis_size(-1);
|
||||
|
||||
auto in_height = input.get_axis_size(-2);
|
||||
auto in_width = input.get_axis_size(-1);
|
||||
|
||||
auto num_effective_channels = input.size_range(0, 2);
|
||||
auto num_iters = num_effective_channels * out_height * out_width;
|
||||
|
||||
if (num_effective_channels % 16 == 0 && num_iters > 163840) {
|
||||
launch_multichannel_resize_bilinear<T, 16>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
|
||||
} else if (num_effective_channels % 8 == 0 && num_iters > 81920) {
|
||||
launch_multichannel_resize_bilinear<T, 8>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
|
||||
} else if (num_effective_channels % 4 == 0 && num_iters > 40960) {
|
||||
launch_multichannel_resize_bilinear<T, 4>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
|
||||
} else if (num_effective_channels % 2 == 0) {
|
||||
launch_multichannel_resize_bilinear<T, 2>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
|
||||
} else {
|
||||
launch_multichannel_resize_bilinear<T, 1>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void resize_bilinear<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, float, float, bool);
|
||||
#endif
|
||||
template void resize_bilinear<float>(const Stream&, TensorSpan<float>, TensorView<float>, float, float, bool);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
181
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/roi_pooling.cu
vendored
Normal file
@ -0,0 +1,181 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "math.hpp"
|
||||
#include "limits.hpp"
|
||||
#include "types.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "memory.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
|
||||
template <class T, std::size_t CHANNELS_PER_ITER>
|
||||
__global__ void roi_pooling(
|
||||
Span<T> output, size_type pooled_height, size_type pooled_width,
|
||||
View<T> input, size_type in_height, size_type in_width,
|
||||
View<T> rois, size_type num_channels, float spatial_scale)
|
||||
{
|
||||
// input: [1, num_channels, in_height, in_width]
|
||||
const auto in_image_size = in_height * in_width;
|
||||
|
||||
// rois: [num_rois, 5]
|
||||
auto num_rois = rois.size() / 5;
|
||||
|
||||
// output: [num_rois, num_channels, pooled_height, pooled_width]
|
||||
const auto out_spatial_size = pooled_height * pooled_width;
|
||||
const auto out_roi_size = num_channels * out_spatial_size;
|
||||
|
||||
/* we have to compute the output value for every combination of (roi, c, y, x) in the output
|
||||
*
|
||||
* the computation involving (y, x) is identical for all non-spatial dimensions
* the computation and memory requests involving the roi are identical for the remaining three axes
|
||||
*
|
||||
* we process multiple channels every iteration to reuse the identical computation
|
||||
* and memory requests involved with the roi and spatial dimensions
|
||||
*/
|
||||
/*
|
||||
* if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need
|
||||
* (num_channels / CHANNELS_PER_ITER) iterations per (roi, x, y)
|
||||
*/
|
||||
auto num_channel_iters_per_roi_xy = num_channels / CHANNELS_PER_ITER;
|
||||
|
||||
/* we need `num_channel_iters_per_roi_xy` iterations per (roi, x, y) and there are
|
||||
* `num_rois` rois and `out_spatial_size` combinations of (x, y)
|
||||
*/
|
||||
auto iters_per_roi = num_channel_iters_per_roi_xy * out_spatial_size;
|
||||
auto iters_required = num_rois * iters_per_roi;
|
||||
|
||||
for (auto iter : grid_stride_range(iters_required))
|
||||
{
|
||||
const index_type roi_no = iter / iters_per_roi;
|
||||
const index_type c_start = ((iter % iters_per_roi) / out_spatial_size) * CHANNELS_PER_ITER;
|
||||
|
||||
/* note here that consecutive `iter` values will often have consecutive `x` values
|
||||
* => stores into output will be coalesced across threads
|
||||
*/
|
||||
const index_type y = (iter % out_spatial_size) / pooled_width;
|
||||
const index_type x = iter % pooled_width;
|
||||
|
||||
const index_type roi_offset = roi_no * 5;
|
||||
|
||||
using device::round;
|
||||
const index_type batch_id = rois[roi_offset + 0];
|
||||
const index_type x_start_roi = round(static_cast<float>(rois[roi_offset + 1]) * spatial_scale);
|
||||
const index_type y_start_roi = round(static_cast<float>(rois[roi_offset + 2]) * spatial_scale);
|
||||
const index_type x_end_roi = round(static_cast<float>(rois[roi_offset + 3]) * spatial_scale);
|
||||
const index_type y_end_roi = round(static_cast<float>(rois[roi_offset + 4]) * spatial_scale);
|
||||
|
||||
using device::max;
|
||||
const auto roi_width = max<index_type>(x_end_roi - x_start_roi + 1, 1);
|
||||
const auto roi_height = max<index_type>(y_end_roi - y_start_roi + 1, 1);
|
||||
|
||||
const auto roi_width_ratio = static_cast<float>(roi_width) / pooled_width;
|
||||
const auto roi_height_ratio = static_cast<float>(roi_height) / pooled_height;
|
||||
|
||||
auto x_start = x_start_roi + static_cast<index_type>(x * roi_width_ratio);
|
||||
auto y_start = y_start_roi + static_cast<index_type>(y * roi_height_ratio);
|
||||
|
||||
using device::ceil;
|
||||
auto x_end = x_start_roi + static_cast<index_type>(ceil((x + 1) * roi_width_ratio));
|
||||
auto y_end = y_start_roi + static_cast<index_type>(ceil((y + 1) * roi_height_ratio));
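/* worked example (illustrative): with pooled_width = 7 and roi_width = 21, roi_width_ratio = 3,
 * so pooled column x = 2 pools over input columns [x_start_roi + 6, x_start_roi + 9) before the
 * clamping below; each pooled cell covers roughly roi_width / pooled_width input columns
 */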
|
||||
|
||||
using device::max;
|
||||
x_start = max<index_type>(x_start, 0);
|
||||
y_start = max<index_type>(y_start, 0);
|
||||
|
||||
using device::min;
|
||||
x_end = min<index_type>(x_end, in_width);
|
||||
y_end = min<index_type>(y_end, in_height);
|
||||
|
||||
index_type in_offset = (batch_id * num_channels + c_start) * in_height * in_width;
|
||||
index_type out_idx = roi_no * out_roi_size + c_start * out_spatial_size + y * pooled_width + x;
|
||||
|
||||
for (int i = 0; i < CHANNELS_PER_ITER; i++)
|
||||
{
|
||||
/* We have to set the output to zero if (x_start >= x_end) or (y_start >= y_end). If either
|
||||
* condition is true, the loops below won't execute even a single iteration. Hence, by setting
|
||||
* `max_val` to zero in this case, we can combine it with the `else` code.
|
||||
*/
|
||||
T max_val = (x_start >= x_end || y_start >= y_end) ? T(0) : device::numeric_limits<T>::lowest();
|
||||
|
||||
for (auto iy = y_start; iy < y_end; iy++)
|
||||
{
|
||||
const auto in_idx = in_offset + iy * in_width;
|
||||
for (auto ix = x_start; ix < x_end; ix++)
|
||||
{
|
||||
max_val = max(max_val, load_ldg(input[in_idx + ix]));
|
||||
}
|
||||
}
|
||||
|
||||
output[out_idx] = max_val;
|
||||
|
||||
in_offset += in_image_size;
|
||||
out_idx += out_spatial_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t CHANNELS_PER_ITER> static
|
||||
void launch_multichannel_roi_pooling(const Stream& stream,
|
||||
Span<T> output, size_type pooled_height, size_type pooled_width,
|
||||
View<T> input, size_type in_height, size_type in_width,
|
||||
View<T> rois, size_type num_channels, float spatial_scale)
|
||||
{
|
||||
auto kernel = raw::roi_pooling<T, CHANNELS_PER_ITER>;
|
||||
auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
|
||||
launch_kernel(kernel, policy, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void roi_pooling(const Stream& stream, TensorSpan<T> output, TensorView<T> input, View<T> rois, float spatial_scale)
|
||||
{
|
||||
CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
|
||||
|
||||
size_type num_channels = output.get_axis_size(1);
|
||||
|
||||
size_type pooled_height = output.get_axis_size(2);
|
||||
size_type pooled_width = output.get_axis_size(3);
|
||||
|
||||
size_type in_height = input.get_axis_size(2);
|
||||
size_type in_width = input.get_axis_size(3);
|
||||
|
||||
if (num_channels % 64 == 0) {
|
||||
launch_multichannel_roi_pooling<T, 64>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
|
||||
} else if (num_channels % 32 == 0) {
|
||||
launch_multichannel_roi_pooling<T, 32>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
|
||||
} else if (num_channels % 16 == 0) {
|
||||
launch_multichannel_roi_pooling<T, 16>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
|
||||
} else if (num_channels % 8 == 0) {
|
||||
launch_multichannel_roi_pooling<T, 8>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
|
||||
} else if (num_channels % 4 == 0) {
|
||||
launch_multichannel_roi_pooling<T, 4>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
|
||||
} else if (num_channels % 2 == 0) {
|
||||
launch_multichannel_roi_pooling<T, 2>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
|
||||
} else {
|
||||
launch_multichannel_roi_pooling<T, 1>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void roi_pooling(const Stream& stream, TensorSpan<__half> output, TensorView<__half> input, View<__half> rois, float spatial_scale);
|
||||
#endif
|
||||
template void roi_pooling(const Stream& stream, TensorSpan<float> output, TensorView<float> input, View<float> rois, float spatial_scale);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
235
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/scale_shift.cu
vendored
Normal file
@ -0,0 +1,235 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "types.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, std::size_t N>
|
||||
__global__ void biasN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> bias) {
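/* indexing sketch (descriptive note): treating the tensor as [outer, C, inner] with
 * C == bias.size() and `inner_size` elements per channel, the channel of element e is
 * (e / inner_size) % C; the loop below applies the same formula in units of vectors
 * after inner_size is rescaled by the vector width
 */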
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
auto input_vPtr = vector_type::get_pointer(input.data());
|
||||
|
||||
inner_size /= vector_type::size();
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
|
||||
const index_type bias_idx = (i / inner_size) % bias.size();
|
||||
|
||||
vector_type vec;
|
||||
v_load(vec, input_vPtr[i]);
|
||||
for(int j = 0; j < vec.size(); j++)
|
||||
vec.data[j] = vec.data[j] + bias[bias_idx];
|
||||
v_store(output_vPtr[i], vec);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t N>
|
||||
__global__ void scaleN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> weights)
|
||||
{
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
auto input_vPtr = vector_type::get_pointer(input.data());
|
||||
|
||||
inner_size /= vector_type::size();
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
|
||||
const index_type scale_idx = (i / inner_size) % weights.size();
|
||||
|
||||
vector_type vec;
|
||||
v_load(vec, input_vPtr[i]);
|
||||
for (int j = 0; j < vec.size(); j++)
|
||||
vec.data[j] = vec.data[j] * weights[scale_idx];
|
||||
v_store(output_vPtr[i], vec);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t N>
|
||||
__global__ void scale1_with_bias1_vec(Span<T> output, View<T> input, T alpha, T beta)
|
||||
{
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
auto input_vPtr = vector_type::get_pointer(input.data());
|
||||
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
|
||||
vector_type vec;
|
||||
v_load(vec, input_vPtr[i]);
|
||||
for (int j = 0; j < vec.size(); j++)
|
||||
vec.data[j] = alpha * vec.data[j] + beta;
|
||||
v_store(output_vPtr[i], vec);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t N>
|
||||
__global__ void scaleN_with_biasN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> weights, View<T> bias)
|
||||
{
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
auto input_vPtr = vector_type::get_pointer(input.data());
|
||||
|
||||
inner_size /= vector_type::size();
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
|
||||
const index_type scale_idx = (i / inner_size) % weights.size();
|
||||
|
||||
vector_type vec;
|
||||
v_load(vec, input_vPtr[i]);
|
||||
for (int j = 0; j < vec.size(); j++)
|
||||
vec.data[j] = vec.data[j] * weights[scale_idx] + bias[scale_idx];
|
||||
v_store(output_vPtr[i], vec);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t N> static
|
||||
void launch_biasN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> bias){
|
||||
CV_Assert(is_fully_aligned<T>(output, N));
|
||||
CV_Assert(is_fully_aligned<T>(input, N));
|
||||
CV_Assert(inner_size % N == 0);
|
||||
|
||||
auto kernel = raw::biasN_vec<T, N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, input, inner_size, bias);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN(
|
||||
const Stream& stream,
|
||||
TensorSpan<T> output,
|
||||
TensorView<T> input, std::size_t inner_size,
|
||||
TensorView<T> bias)
|
||||
{
|
||||
CV_Assert(is_shape_same(input, output));
|
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
|
||||
launch_biasN_vec_kernel<T, 4>(stream, output, input, inner_size, bias);
|
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
|
||||
launch_biasN_vec_kernel<T, 2>(stream, output, input, inner_size, bias);
|
||||
} else {
|
||||
launch_biasN_vec_kernel<T, 1>(stream, output, input, inner_size, bias);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void biasN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>);
|
||||
#endif
|
||||
template void biasN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>);
|
||||
|
||||
template <class T, std::size_t N> static
|
||||
void launch_scaleN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> weights) {
|
||||
CV_Assert(is_fully_aligned<T>(output, N));
|
||||
CV_Assert(is_fully_aligned<T>(input, N));
|
||||
CV_Assert(inner_size % N == 0);
|
||||
|
||||
auto kernel = raw::scaleN_vec<T, N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, input, inner_size, weights);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void scaleN(
|
||||
const Stream& stream,
|
||||
TensorSpan<T> output,
|
||||
TensorView<T> input, std::size_t inner_size,
|
||||
TensorView<T> weights)
|
||||
{
|
||||
CV_Assert(is_shape_same(input, output));
|
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
|
||||
launch_scaleN_vec_kernel<T, 4>(stream, output, input, inner_size, weights);
|
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
|
||||
launch_scaleN_vec_kernel<T, 2>(stream, output, input, inner_size, weights);
|
||||
} else {
|
||||
launch_scaleN_vec_kernel<T, 1>(stream, output, input, inner_size, weights);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void scaleN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>);
|
||||
#endif
|
||||
template void scaleN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>);
|
||||
|
||||
template <class T, std::size_t N> static
|
||||
void launch_scale1_with_bias1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T alpha, T beta) {
|
||||
CV_Assert(is_fully_aligned<T>(output, N));
|
||||
CV_Assert(is_fully_aligned<T>(input, N));
|
||||
|
||||
auto kernel = raw::scale1_with_bias1_vec<T, N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, input, alpha, beta);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void scale1_with_bias1(const Stream& stream, Span<T> output, View<T> input, T alpha, T beta) {
|
||||
CV_Assert(output.size() == input.size());
|
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
|
||||
launch_scale1_with_bias1_vec_kernel<T, 4>(stream, output, input, alpha, beta);
|
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
|
||||
launch_scale1_with_bias1_vec_kernel<T, 2>(stream, output, input, alpha, beta);
|
||||
} else {
|
||||
launch_scale1_with_bias1_vec_kernel<T, 1>(stream, output, input, alpha, beta);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void scale1_with_bias1<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
|
||||
#endif
|
||||
template void scale1_with_bias1<float>(const Stream&, Span<float>, View<float>, float, float);
|
||||
|
||||
template <class T, std::size_t N> static
|
||||
void launch_scaleN_with_biasN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> weights, View<T> bias) {
|
||||
CV_Assert(is_fully_aligned<T>(output, N));
|
||||
CV_Assert(is_fully_aligned<T>(input, N));
|
||||
CV_Assert(inner_size % N == 0);
|
||||
|
||||
auto kernel = raw::scaleN_with_biasN_vec<T, N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, input, inner_size, weights, bias);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void scaleN_with_biasN(
|
||||
const Stream& stream,
|
||||
TensorSpan<T> output,
|
||||
TensorView<T> input, std::size_t inner_size,
|
||||
TensorView<T> weights, TensorView<T> bias)
|
||||
{
|
||||
CV_Assert(is_shape_same(input, output));
|
||||
CV_Assert(weights.size() == bias.size());
|
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
|
||||
launch_scaleN_with_biasN_vec_kernel<T, 4>(stream, output, input, inner_size, weights, bias);
|
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
|
||||
launch_scaleN_with_biasN_vec_kernel<T, 2>(stream, output, input, inner_size, weights, bias);
|
||||
} else {
|
||||
launch_scaleN_with_biasN_vec_kernel<T, 1>(stream, output, input, inner_size, weights, bias);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void scaleN_with_biasN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>, TensorView<__half>);
|
||||
#endif
|
||||
template void scaleN_with_biasN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>, TensorView<float>);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
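The biasN/scaleN kernels above treat the tensor as a contiguous [outer, C, inner_size] block and broadcast one parameter per channel: for flattened element i, the parameter index is (i / inner_size) % C. A minimal CPU sketch of that indexing, purely illustrative and not part of the vendored sources (the function name and use of std::vector are assumptions):

#include <cstddef>
#include <vector>

// CPU reference for the biasN broadcast: data is viewed as [outer, C, inner_size],
// and every element belonging to channel c gets bias[c] added to it.
static void biasN_reference(std::vector<float>& data, std::size_t inner_size,
                            const std::vector<float>& bias)
{
    for (std::size_t i = 0; i < data.size(); i++) {
        const std::size_t bias_idx = (i / inner_size) % bias.size(); // same formula as the kernel
        data[i] += bias[bias_idx];
    }
}

The vectorized kernels divide both inner_size and the loop bound by vector_type::size(), which is why the launchers assert inner_size % N == 0 before picking a vector width.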
111
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/shortcut.cu
vendored
Normal file
@ -0,0 +1,111 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/csl/tensor.hpp"

#include <opencv2/core.hpp>

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    namespace raw {
        template <class T, std::size_t N>
        __global__ void input_shortcut_vec(
            Span<T> output,
            View<T> input, index_type c_input, /* `c_input` = number of channels in `input` */
            View<T> from, index_type c_from, /* `c_from` = number of channels in `from` */
            size_type channel_stride /* common for both `input` and `from` */)
        {
            using vector_type = get_vector_type_t<T, N>;

            auto output_vPtr = vector_type::get_pointer(output.data());
            auto input_vPtr = vector_type::get_pointer(input.data());
            auto from_vPtr = vector_type::get_pointer(from.data());

            auto batch_stride_input = c_input * channel_stride;
            auto batch_stride_from = c_from * channel_stride;

            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
                const auto actual_idx = i * vector_type::size();
                const auto b = actual_idx / batch_stride_input; /* `input` and `output` have the same shape */
                const auto c = (actual_idx % batch_stride_input) / channel_stride;
                const auto c_offset = actual_idx % channel_stride;

                vector_type vec_input;
                v_load(vec_input, input_vPtr[i]);

                /* We can break down the shortcut operation into two steps:
                 * - copy `input` to `output`
                 * - add `from` to corresponding channels in `output`
                 *
                 * In this scheme, only some channels in the `output` differ from `input`. They differ in the channels
                 * which have a corresponding channel in `from`.
                 */
                if (c < c_from) {
                    const auto from_actual_idx = b * batch_stride_from + c * channel_stride + c_offset;
                    const auto from_vec_idx = from_actual_idx / vector_type::size();

                    vector_type vec_from;
                    v_load(vec_from, from_vPtr[from_vec_idx]);
                    for (int j = 0; j < vector_type::size(); j++)
                        vec_input.data[j] += vec_from.data[j];
                }

                v_store(output_vPtr[i], vec_input);
            }
        }
    }

    template <class T, std::size_t N>
    void launch_vectorized_input_shortcut(const Stream& stream, Span<T> output, View<T> input, std::size_t c_input, View<T> from, std::size_t c_from, std::size_t channel_stride) {
        CV_Assert(is_fully_aligned<T>(output, N));
        CV_Assert(is_fully_aligned<T>(input, N));
        CV_Assert(is_fully_aligned<T>(from, N));
        CV_Assert(channel_stride % N == 0);

        auto kernel = raw::input_shortcut_vec<T, N>;
        auto policy = make_policy(kernel, output.size() / N, 0, stream);
        launch_kernel(kernel, policy, output, input, c_input, from, c_from, channel_stride);
    }

    template <class T>
    void input_shortcut(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, csl::TensorView<T> from) {
        CV_Assert(is_shape_same(output, input));
        CV_Assert(output.rank() == from.rank());
        for (int i = 0; i < output.rank(); i++) {
            if (i != 1) {
                CV_Assert(from.get_axis_size(i) == output.get_axis_size(i));
            }
        }

        auto channel_stride = output.size_range(2, output.rank()); /* same for `output`, `input` and `from` */
        auto c_input = input.get_axis_size(1);
        auto c_from = from.get_axis_size(1);

        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && is_fully_aligned<T>(from, 4) && channel_stride % 4 == 0) {
            launch_vectorized_input_shortcut<T, 4>(stream, output, input, c_input, from, c_from, channel_stride);
        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && is_fully_aligned<T>(from, 2) && channel_stride % 2 == 0) {
            launch_vectorized_input_shortcut<T, 2>(stream, output, input, c_input, from, c_from, channel_stride);
        } else {
            launch_vectorized_input_shortcut<T, 1>(stream, output, input, c_input, from, c_from, channel_stride);
        }
    }

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
    template void input_shortcut(const Stream&, TensorSpan<__half>, TensorView<__half>, TensorView<__half>);
#endif
    template void input_shortcut(const Stream&, TensorSpan<float>, TensorView<float>, TensorView<float>);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
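As the comment in input_shortcut_vec explains, the shortcut copies `input` to `output` and then adds `from` only to the channels the two tensors share. A scalar CPU sketch of the same index arithmetic, for illustration only (the function name and container types are assumptions, not OpenCV API):

#include <cstddef>
#include <vector>

// CPU reference for input_shortcut: output = input, plus `from` added to the
// channels (c < c_from) that exist in both tensors. Assumes an NCHW-style layout
// where channel_stride = H * W.
static void input_shortcut_reference(std::vector<float>& output, const std::vector<float>& input,
                                     std::size_t c_input, const std::vector<float>& from,
                                     std::size_t c_from, std::size_t channel_stride)
{
    const std::size_t batch_stride_input = c_input * channel_stride;
    const std::size_t batch_stride_from = c_from * channel_stride;
    for (std::size_t i = 0; i < input.size(); i++) {
        const std::size_t b = i / batch_stride_input;                    // batch index
        const std::size_t c = (i % batch_stride_input) / channel_stride; // channel index
        const std::size_t offset = i % channel_stride;                   // position within the channel
        output[i] = input[i];
        if (c < c_from)
            output[i] += from[b * batch_stride_from + c * channel_stride + offset];
    }
}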
203
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/slice.cu
vendored
Normal file
@ -0,0 +1,203 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "array.hpp"
#include "types.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "kernel_dispatcher.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"

#include "../cuda4dnn/kernels/fill_copy.hpp"

#include <opencv2/core.hpp>

#include <cstddef>
#include <vector>
#include <iostream>
#include <algorithm>
#include <functional>
#include <numeric>

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    namespace raw {
        template <class T, std::size_t Rank>
        __global__ void slice(
            Span<T> output, array<size_type, Rank> out_strides,
            View<T> input, array<size_type, Rank> in_strides, array<index_type, Rank> in_offset)
        {
            for (auto i : grid_stride_range(output.size())) {
                index_type out_index = i / out_strides[0];
                index_type in_index = in_offset[0] + out_index;
                index_type iidx = in_index * in_strides[0];
                for (int j = 1; j < Rank; j++) {
                    out_index = (i % out_strides[j - 1]) / out_strides[j];
                    in_index = in_offset[j] + out_index;
                    iidx += in_index * in_strides[j];
                }

                output[i] = input[iidx];
            }
        }
    }

    template <class T, std::size_t Rank> static
    void launch_slice(
        const Stream& stream,
        Span<T> output, const std::vector<std::size_t>& outStride,
        View<T> input, const std::vector<std::size_t>& inStride, const std::vector<std::size_t>& inOffset)
    {
        CV_Assert(outStride.size() == Rank);
        CV_Assert(inStride.size() == Rank);
        CV_Assert(inOffset.size() == Rank);

        array<size_type, Rank> outStride_k, inStride_k;
        outStride_k.assign(std::begin(outStride), std::end(outStride));
        inStride_k.assign(std::begin(inStride), std::end(inStride));

        array<index_type, Rank> inOffset_k;
        inOffset_k.assign(std::begin(inOffset), std::end(inOffset));

        auto kernel = raw::slice<T, Rank>;
        auto policy = make_policy(kernel, output.size(), 0, stream);
        launch_kernel(kernel, policy, output, outStride_k, input, inStride_k, inOffset_k);
    }

    GENERATE_KERNEL_DISPATCHER(slice_dispatcher, launch_slice);

    template <class T>
    void slice(const Stream& stream,
        TensorSpan<T> output, TensorView<T> input,
        std::vector<std::size_t> offsets)
    {
        CV_Assert(output.rank() == input.rank());
        CV_Assert(output.rank() == offsets.size());

        /* copy directly if no slicing is required */
        if (is_shape_same(output, input))
        {
            CV_Assert(std::all_of(std::begin(offsets), std::end(offsets), [] (std::size_t x) { return x == 0; }));
            kernels::copy<T>(stream, output, input);
            return;
        }

        /* squeezable axes at the beginning of both tensors can be eliminated
         *
         * Reasoning:
         * ----------
         * Suppose an item's indices in the output tensor are [o1, o2, ...]. The indices in the input
         * tensor will be [o1 + off1, o2 + off2, ...]. The rest of the elements in the input are ignored.
         *
         * If the size of the first axis of the input and output tensor is unity, the input and output indices
         * for all the elements will be of the form [0, o2 + off2, ...] and [0, o2, ...] respectively. Note that
         * there cannot be any ignored items since the axes have unit size. The first index does not contribute to the
         * element's address calculation and hence does nothing apart from eating up a few cycles.
         */
        while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
            CV_Assert(offsets[0] == 0);

            input.squeeze(0);
            output.squeeze(0);
            offsets.erase(std::begin(offsets));

            CV_Assert(output.rank() == input.rank());
            CV_Assert(output.rank() == offsets.size());
        }

        auto inShape = input.shape_as_vector();
        auto outShape = output.shape_as_vector();

        /* contiguous axes which do not undergo slicing can be combined into one axis
         *
         * Reasoning:
         * ----------
         * Suppose an item's indices in the output tensor are [o1, o2, o3, ...]. Let the first two axes not undergo any
         * slicing. The indices in the input tensor will be [o1, o2, o3 + off3, ...].
         *
         * Each axis in the contiguous unsliced axes sequence will add an offset of iN * strideN. In the above example,
         * the two axes add a total offset of `o1 * stride1 + o2 * stride2`. We can merge the two axes into one axis with
         * a size of `size1 * size2`. The new offset added will be `o12 * stride2` as the kernel iterates through `o12`.
         * Note that `o12` is actually `(o1 * size2 + o2)` in the original tensor.
         */
        for (int i = 0; i < inShape.size(); i++) {
            /* check if axis `i` requires any slicing */
            if (offsets[i] == 0 && inShape[i] == outShape[i]) {
                /* loop invariant: `i` is the first axis in the contiguous unsliced axis sequence */

                int j = i + 1; /* `j` is the axis which we will attempt to merge */
                while (j < inShape.size() && offsets[j] == 0 && inShape[j] == outShape[j]) {
                    /* `j` axis is also unsliced; merge `i` and `j` */
                    auto new_size = inShape[i] * inShape[j];
                    inShape[i] = new_size;
                    outShape[i] = new_size;
                    offsets[i] = 0; /* redundant */

                    /* delete axis `j` */
                    inShape.erase(std::begin(inShape) + j);
                    outShape.erase(std::begin(outShape) + j);
                    offsets.erase(std::begin(offsets) + j);

                    /* optimizations should not break the invariants */
                    CV_Assert(inShape.size() == outShape.size());
                    CV_Assert(inShape.size() == offsets.size());
                    CV_Assert(inShape[i] == outShape[i]);
                    CV_Assert(offsets[i] == 0);
                }
            }
        }

        auto rank = inShape.size();

        /* We can do a copy if the reduced rank is two and only the first axis is sliced.
         * The general requirement is that only one axis is sliced and all the axes that
         * precede the sliced axis are singleton. However, the reductions above will remove
         * all the leading singleton axes and merge the trailing unsliced axes into one, or
         * zero if there are no trailing unsliced axes. The latter is handled separately.
         */
        if (rank == 2 && offsets[0] != 0 && offsets[1] == 0)
        {
            auto stride = inShape[1];
            auto sliced_input = View<T>(input.get() + offsets[0] * stride, output.size());
            kernels::copy<T>(stream, output, sliced_input);
            return;
        }

        if (rank == 1)
        {
            auto sliced_input = View<T>(input.get() + offsets[0], output.size());
            kernels::copy<T>(stream, output, sliced_input);
            return;
        }

        std::vector<std::size_t> inStride(rank), outStride(rank);
        inStride.back() = 1;
        outStride.back() = 1;
        /* garbage, ..., garbage, 1 */

        std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
        std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
        /* dim[0], dim[1], ..., dim[-1], 1 */

        std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>());
        std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
        /* stride[0], stride[1], ..., stride[-2], 1 */

        CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
        slice_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, input, inStride, offsets);
    }

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
    template void slice(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
#endif
    template void slice(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
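The host-side stride computation in slice() (copy the shape shifted left by one axis, then suffix-multiply in place with std::partial_sum over reverse iterators) yields ordinary row-major strides. A self-contained sketch of just that step, assuming a non-empty shape and a helper name that is not part of the vendored code:

#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

// For shape [d0, d1, ..., dk], returns [d1*d2*...*dk, ..., dk, 1] (row-major strides).
static std::vector<std::size_t> make_strides(const std::vector<std::size_t>& shape)
{
    std::vector<std::size_t> stride(shape.size());
    stride.back() = 1;                                           // innermost stride is 1
    std::copy(shape.begin() + 1, shape.end(), stride.begin());   // dim[1], ..., dim[k], 1
    std::partial_sum(stride.rbegin(), stride.rend(), stride.rbegin(),
                     std::multiplies<std::size_t>());            // suffix products, in place
    return stride;
}
// e.g. make_strides({2, 3, 4}) == {12, 4, 1}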
27
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/types.hpp
vendored
Normal file
@ -0,0 +1,27 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA_TYPES_HPP
#define OPENCV_DNN_SRC_CUDA_TYPES_HPP

#include <cstdint>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {

    /* For indices, we can use 32bit variables or 64bit variables. The GPU registers are 32 bits in size.
     * Hence, a 64bit variable requires two registers and is significantly slower than the 32bit versions.
     *
     * If we do not need to handle huge tensors, we can use 32-bit indices and get better performance.
     */
#ifdef __CUDACC__
    using size_type = int;
    using index_type = int;
#else
    using size_type = std::int32_t;
    using index_type = std::int32_t;
#endif

}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */

#endif /* OPENCV_DNN_SRC_CUDA_TYPES_HPP */
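Because the kernels index with these 32-bit types, the flattened element count of any tensor they touch has to stay within int range. A caller-side guard of the following shape makes that assumption explicit (a hypothetical helper for illustration; whether and where the library enforces this is not shown in this commit):

#include <cstddef>
#include <cstdint>
#include <limits>

// True when a tensor with `total_elements` elements can be addressed with the
// 32-bit size_type/index_type defined above.
static bool fits_32bit_indexing(std::size_t total_elements)
{
    return total_elements <= static_cast<std::size_t>(std::numeric_limits<std::int32_t>::max());
}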
120
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/vector_traits.hpp
vendored
Normal file
@ -0,0 +1,120 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP
#define OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP

#include <cuda_runtime.h>

#include "types.hpp"
#include "memory.hpp"

#include "../cuda4dnn/csl/pointer.hpp"

#include <type_traits>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {

    /** \file vector_traits.hpp
     * \brief utility classes and functions for vectorized memory loads/stores
     *
     * Example:
     * using vector_type = get_vector_type_t<float, 4>;
     *
     * auto input_vPtr = vector_type::get_pointer(iptr); // iptr is of type DevicePtr<const float>
     * auto output_vPtr = vector_type::get_pointer(optr); // optr is of type DevicePtr<float>
     *
     * vector_type vec;
     * v_load(vec, input_vPtr);
     *
     * for(int i = 0; i < vector_type::size(); i++)
     *     vec.data[i] = do_something(vec.data[i]);
     *
     * v_store(output_vPtr, vec);
     */

    namespace detail {
        template <size_type N> struct raw_type_ { };
        template <> struct raw_type_<256> { typedef ulonglong4 type; };
        template <> struct raw_type_<128> { typedef uint4 type; };
        template <> struct raw_type_<64> { typedef uint2 type; };
        template <> struct raw_type_<32> { typedef uint1 type; };
        template <> struct raw_type_<16> { typedef uchar2 type; };
        template <> struct raw_type_<8> { typedef uchar1 type; };

        template <size_type N> struct raw_type {
            using type = typename raw_type_<N>::type;
            static_assert(sizeof(type) * 8 == N, "");
        };
    }

    /* \tparam T type of element in the vector
     * \tparam N "number of elements" of type T in the vector
     */
    template <class T, size_type N>
    union vector_type {
        using value_type = T;
        using raw_type = typename detail::raw_type<N * sizeof(T) * 8>::type;

        __device__ vector_type() { }

        __device__ static constexpr size_type size() { return N; }

        raw_type raw;
        T data[N];

        template <class U> static __device__
        typename std::enable_if<std::is_const<U>::value, const vector_type*>
        ::type get_pointer(csl::DevicePtr<U> ptr) {
            return reinterpret_cast<const vector_type*>(ptr.get());
        }

        template <class U> static __device__
        typename std::enable_if<!std::is_const<U>::value, vector_type*>
        ::type get_pointer(csl::DevicePtr<U> ptr) {
            return reinterpret_cast<vector_type*>(ptr.get());
        }
    };

    template <class V>
    __device__ void v_load(V& dest, const V& src) {
        dest.raw = src.raw;
    }

    template <class V>
    __device__ void v_load(V& dest, const V* src) {
        dest.raw = src->raw;
    }

    template <class V>
    __device__ void v_load_ldg(V& dest, const V& src) {
        dest.raw = load_ldg(src.raw);
    }

    template <class V>
    __device__ void v_load_ldg(V& dest, const V* src) {
        dest.raw = load_ldg(src->raw);
    }

    template <class V>
    __device__ void v_store(V* dest, const V& src) {
        dest->raw = src.raw;
    }

    template <class V>
    __device__ void v_store(V& dest, const V& src) {
        dest.raw = src.raw;
    }

    template <class T, size_type N>
    struct get_vector_type {
        typedef vector_type<T, N> type;
    };

    template <class T, size_type N>
    using get_vector_type_t = typename get_vector_type<T, N>::type;

}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */

#endif /* OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP */
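Putting the traits together, a kernel reads one vector_type per iteration, works on the `data` lanes in registers, and writes the whole vector back with a single transaction. The sketch below mirrors the kernels earlier in this commit; `double_vec` is a hypothetical example, Span, View and grid_stride_range come from the neighbouring headers (span.hpp, grid_stride_range.hpp), and it assumes, like the other kernels, that the spans are fully aligned for width N and that the element count is a multiple of N.

template <class T, std::size_t N>
__global__ void double_vec(Span<T> output, View<T> input)
{
    using vector_type = get_vector_type_t<T, N>;

    auto output_vPtr = vector_type::get_pointer(output.data());
    auto input_vPtr = vector_type::get_pointer(input.data());

    // each iteration moves N elements with one vectorized load and one vectorized store
    for (auto i : grid_stride_range(output.size() / vector_type::size())) {
        vector_type vec;
        v_load(vec, input_vPtr[i]);
        for (int j = 0; j < vector_type::size(); j++)
            vec.data[j] = vec.data[j] + vec.data[j];   // element-wise work happens in registers
        v_store(output_vPtr[i], vec);
    }
}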