feat: switch the backend to PaddleOCR-NCNN and the project to CMake
1. The project backend has been migrated wholesale to the PaddleOCR-NCNN algorithm and has passed basic compatibility tests. 2. The project is now organized with CMake; to better accommodate third-party libraries, a QMake project will no longer be provided. 3. The copyright/license notices and the code tree have been reorganized to minimize the risk of infringement. Log: switch the backend to PaddleOCR-NCNN and the project to CMake Change-Id: I4d5d2c5d37505a4a24b389b1a4c5d12f17bfa38c
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cublas.hpp (vendored, new file, 368 lines)
@@ -0,0 +1,368 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP

#include "error.hpp"
#include "stream.hpp"
#include "pointer.hpp"

#include <opencv2/core.hpp>

#include <cublas_v2.h>

#include <cstddef>
#include <memory>
#include <utility>

#define CUDA4DNN_CHECK_CUBLAS(call) \
    ::cv::dnn::cuda4dnn::csl::cublas::detail::check((call), CV_Func, __FILE__, __LINE__)

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cublas {

    /** @brief exception class for errors thrown by the cuBLAS API */
    class cuBLASException : public CUDAException {
    public:
        using CUDAException::CUDAException;
    };

    namespace detail {
        static void check(cublasStatus_t status, const char* func, const char* file, int line) {
            auto cublasGetErrorString = [](cublasStatus_t err) {
                switch (err) {
                case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
                case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
                case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
                case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
                case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
                case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
                case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
                case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
                case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
                case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";
                }
                return "UNKNOWN_CUBLAS_ERROR";
            };

            if (status != CUBLAS_STATUS_SUCCESS)
                throw cuBLASException(Error::GpuApiCallError, cublasGetErrorString(status), func, file, line);
        }
    }

    /** non-copyable cuBLAS smart handle
     *
     * UniqueHandle is a smart non-sharable wrapper for cuBLAS handle which ensures that the handle
     * is destroyed after use. The handle must always be associated with a non-default stream. The stream
     * must be specified during construction.
     *
     * Refer to stream API for more information for the choice of forcing non-default streams.
     */
    class UniqueHandle {
    public:
        UniqueHandle() noexcept : handle{ nullptr } { }
        UniqueHandle(UniqueHandle&) = delete;
        UniqueHandle(UniqueHandle&& other) noexcept {
            stream = std::move(other.stream);
            handle = other.handle;
            other.handle = nullptr;
        }

        /** creates a cuBLAS handle and associates it with the stream specified
         *
         * Exception Guarantee: Basic
         */
        UniqueHandle(Stream strm) : stream(std::move(strm)) {
            CV_Assert(stream);
            CUDA4DNN_CHECK_CUBLAS(cublasCreate(&handle));
            try {
                CUDA4DNN_CHECK_CUBLAS(cublasSetStream(handle, stream.get()));
            } catch (...) {
                /* cublasDestroy won't throw if a valid handle is passed */
                CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle));
                throw;
            }
        }

        ~UniqueHandle() noexcept {
            if (handle) {
                /* cublasDestroy won't throw if a valid handle is passed */
                CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle));
            }
        }

        UniqueHandle& operator=(const UniqueHandle&) = delete;
        UniqueHandle& operator=(UniqueHandle&& other) noexcept {
            CV_Assert(other);
            if (&other != this) {
                UniqueHandle(std::move(*this)); /* destroy current handle */
                stream = std::move(other.stream);
                handle = other.handle;
                other.handle = nullptr;
            }
            return *this;
        }

        /** returns the raw cuBLAS handle */
        cublasHandle_t get() const noexcept {
            CV_Assert(handle);
            return handle;
        }

        /** returns true if the handle is valid */
        explicit operator bool() const noexcept { return static_cast<bool>(handle); }

    private:
        Stream stream;
        cublasHandle_t handle;
    };

    /** @brief sharable cuBLAS smart handle
     *
     * Handle is a smart sharable wrapper for cuBLAS handle which ensures that the handle
     * is destroyed after all references to the handle are destroyed. The handle must always
     * be associated with a non-default stream. The stream must be specified during construction.
     *
     * @note Moving a Handle object to another invalidates the former
     */
    class Handle {
    public:
        Handle() = default;
        Handle(const Handle&) = default;
        Handle(Handle&&) = default;

        /** creates a cuBLAS handle and associates it with the stream specified
         *
         * Exception Guarantee: Basic
         */
        Handle(Stream strm) : handle(std::make_shared<UniqueHandle>(std::move(strm))) { }

        Handle& operator=(const Handle&) = default;
        Handle& operator=(Handle&&) = default;

        /** returns true if the handle is valid */
        explicit operator bool() const noexcept { return static_cast<bool>(handle); }

        /** returns the raw cuBLAS handle */
        cublasHandle_t get() const noexcept {
            CV_Assert(handle);
            return handle->get();
        }

    private:
        std::shared_ptr<UniqueHandle> handle;
    };

    /** @brief GEMM for column-major matrices
     *
     * \f$ C = \alpha AB + \beta C \f$
     *
     * @tparam T matrix element type (must be `half` or `float`)
     *
     * @param handle valid cuBLAS Handle
     * @param transa use transposed matrix of A for computation
     * @param transb use transposed matrix of B for computation
     * @param rows_c number of rows in C
     * @param cols_c number of columns in C
     * @param common_dim common dimension of A (or trans A) and B (or trans B)
     * @param alpha scale factor for AB
     * @param[in] A pointer to column-major matrix A in device memory
     * @param lda leading dimension of matrix A
     * @param[in] B pointer to column-major matrix B in device memory
     * @param ldb leading dimension of matrix B
     * @param beta scale factor for C
     * @param[in,out] C pointer to column-major matrix C in device memory
     * @param ldc leading dimension of matrix C
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void gemm(const Handle& handle,
        bool transa, bool transb,
        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
        T alpha, const DevicePtr<const T> A, std::size_t lda,
        const DevicePtr<const T> B, std::size_t ldb,
        T beta, const DevicePtr<T> C, std::size_t ldc);

    template <> inline
    void gemm<half>(const Handle& handle,
        bool transa, bool transb,
        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
        half alpha, const DevicePtr<const half> A, std::size_t lda,
        const DevicePtr<const half> B, std::size_t ldb,
        half beta, const DevicePtr<half> C, std::size_t ldc)
    {
        CV_Assert(handle);

        auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
             opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
        int irows_c = static_cast<int>(rows_c),
            icols_c = static_cast<int>(cols_c),
            icommon_dim = static_cast<int>(common_dim),
            ilda = static_cast<int>(lda),
            ildb = static_cast<int>(ldb),
            ildc = static_cast<int>(ldc);

        CUDA4DNN_CHECK_CUBLAS(
            cublasHgemm(
                handle.get(),
                opa, opb,
                irows_c, icols_c, icommon_dim,
                &alpha, A.get(), ilda,
                B.get(), ildb,
                &beta, C.get(), ildc
            )
        );
    }

    template <> inline
    void gemm<float>(const Handle& handle,
        bool transa, bool transb,
        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
        float alpha, const DevicePtr<const float> A, std::size_t lda,
        const DevicePtr<const float> B, std::size_t ldb,
        float beta, const DevicePtr<float> C, std::size_t ldc)
    {
        CV_Assert(handle);

        auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
             opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
        int irows_c = static_cast<int>(rows_c),
            icols_c = static_cast<int>(cols_c),
            icommon_dim = static_cast<int>(common_dim),
            ilda = static_cast<int>(lda),
            ildb = static_cast<int>(ldb),
            ildc = static_cast<int>(ldc);

        CUDA4DNN_CHECK_CUBLAS(
            cublasSgemm(
                handle.get(),
                opa, opb,
                irows_c, icols_c, icommon_dim,
                &alpha, A.get(), ilda,
                B.get(), ildb,
                &beta, C.get(), ildc
            )
        );
    }

    /** @brief Strided batched GEMM for column-major matrices
     *
     * \f$ C_i = \alpha A_i B_i + \beta C_i \f$ for a stack of matrices A, B and C indexed by i
     *
     * @tparam T matrix element type (must be `half` or `float`)
     *
     * @param handle valid cuBLAS Handle
     * @param transa use transposed matrix of A_i for computation
     * @param transb use transposed matrix of B_i for computation
     * @param rows_c number of rows in C_i
     * @param cols_c number of columns in C_i
     * @param common_dim common dimension of A_i (or trans A_i) and B_i (or trans B_i)
     * @param alpha scale factor for A_i B_i
     * @param[in] A pointer to stack of column-major matrices A in device memory
     * @param lda leading dimension of matrix A_i
     * @param strideA stride between matrices in A
     * @param[in] B pointer to stack of column-major matrices B in device memory
     * @param ldb leading dimension of matrix B_i
     * @param strideB stride between matrices in B
     * @param beta scale factor for C_i
     * @param[in,out] C pointer to stack of column-major matrices C in device memory
     * @param ldc leading dimension of matrix C_i
     * @param strideC stride between matrices in C
     * @param batchCount number of matrices in the batch
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void gemmStridedBatched(const Handle& handle,
        bool transa, bool transb,
        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
        T alpha, const DevicePtr<const T> A, std::size_t lda, std::size_t strideA,
        const DevicePtr<const T> B, std::size_t ldb, std::size_t strideB,
        T beta, const DevicePtr<T> C, std::size_t ldc, std::size_t strideC,
        std::size_t batchCount);

    template <> inline
    void gemmStridedBatched<half>(const Handle& handle,
        bool transa, bool transb,
        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
        half alpha, const DevicePtr<const half> A, std::size_t lda, std::size_t strideA,
        const DevicePtr<const half> B, std::size_t ldb, std::size_t strideB,
        half beta, const DevicePtr<half> C, std::size_t ldc, std::size_t strideC,
        std::size_t batchCount)
    {
        CV_Assert(handle);

        const auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
                   opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
        const auto irows_c = static_cast<int>(rows_c),
                   icols_c = static_cast<int>(cols_c),
                   icommon_dim = static_cast<int>(common_dim),
                   ilda = static_cast<int>(lda),
                   ildb = static_cast<int>(ldb),
                   ildc = static_cast<int>(ldc);

        const auto batch_count = static_cast<int>(batchCount);
        const auto stride_a = static_cast<long long int>(strideA),
                   stride_b = static_cast<long long int>(strideB),
                   stride_c = static_cast<long long int>(strideC);

        CV_Assert(stride_c >= irows_c * icols_c); // output matrices must not overlap

        CUDA4DNN_CHECK_CUBLAS(
            cublasHgemmStridedBatched(
                handle.get(),
                opa, opb,
                irows_c, icols_c, icommon_dim,
                &alpha, A.get(), ilda, stride_a,
                B.get(), ildb, stride_b,
                &beta, C.get(), ildc, stride_c,
                batch_count
            )
        );
    }

    template <> inline
    void gemmStridedBatched<float>(const Handle& handle,
        bool transa, bool transb,
        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
        float alpha, const DevicePtr<const float> A, std::size_t lda, std::size_t strideA,
        const DevicePtr<const float> B, std::size_t ldb, std::size_t strideB,
        float beta, const DevicePtr<float> C, std::size_t ldc, std::size_t strideC,
        std::size_t batchCount)
    {
        CV_Assert(handle);

        const auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
                   opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
        const auto irows_c = static_cast<int>(rows_c),
                   icols_c = static_cast<int>(cols_c),
                   icommon_dim = static_cast<int>(common_dim),
                   ilda = static_cast<int>(lda),
                   ildb = static_cast<int>(ldb),
                   ildc = static_cast<int>(ldc);

        const auto batch_count = static_cast<int>(batchCount);
        const auto stride_a = static_cast<long long int>(strideA),
                   stride_b = static_cast<long long int>(strideB),
                   stride_c = static_cast<long long int>(strideC);

        CV_Assert(stride_c >= irows_c * icols_c); // output matrices must not overlap

        CUDA4DNN_CHECK_CUBLAS(
            cublasSgemmStridedBatched(
                handle.get(),
                opa, opb,
                irows_c, icols_c, icommon_dim,
                &alpha, A.get(), ilda, stride_a,
                B.get(), ildb, stride_b,
                &beta, C.get(), ildc, stride_c,
                batch_count
            )
        );
    }

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cublas */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP */
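A minimal usage sketch of the wrappers above (not part of the vendored diff, and the shapes are illustrative); it assumes `Stream` from stream.hpp and `DevicePtr` from pointer.hpp, with the caller supplying device buffers of the right sizes:

    // Hypothetical example: C (3x2) = 1.0 * A (3x4) * B (4x2) + 0.0 * C, all column-major.
    using namespace cv::dnn::cuda4dnn::csl;

    void gemm_example(Stream stream,
                      DevicePtr<const float> A, DevicePtr<const float> B, DevicePtr<float> C)
    {
        cublas::Handle handle(std::move(stream)); /* must be a non-default stream */
        /* without transposition, each leading dimension is that matrix's row count */
        cublas::gemm<float>(handle,
                            false, false, /* transa, transb */
                            3, 2, 4,      /* rows_c, cols_c, common_dim */
                            1.0f, A, 3,   /* alpha, A, lda */
                            B, 4,         /* B, ldb */
                            0.0f, C, 3);  /* beta, C, ldc */
    }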
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cudnn.hpp (vendored, new file, 10 lines)
@@ -0,0 +1,10 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_CUDNN_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_CUDNN_HPP

#include "cudnn/cudnn.hpp"

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_CUDNN_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cudnn/activation.hpp (vendored, new file, 80 lines)
@@ -0,0 +1,80 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_ACTIVATION_HPP
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_ACTIVATION_HPP

#include <cudnn.h>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {

    class ActivationDescriptor {
    public:
        enum class ActivationType {
            IDENTITY,
            RELU,
            CLIPPED_RELU,
            TANH,
            SIGMOID,
            ELU
        };

        ActivationDescriptor() noexcept : descriptor{ nullptr } { }
        ActivationDescriptor(const ActivationDescriptor&) = delete;
        ActivationDescriptor(ActivationDescriptor&& other) noexcept
            : descriptor{ other.descriptor } {
            other.descriptor = nullptr;
        }

        /* `relu_ceiling_or_elu_alpha`:
         * - `alpha` coefficient in ELU activation
         * - `ceiling` for CLIPPED_RELU activation
         */
        ActivationDescriptor(ActivationType type, double relu_ceiling_or_elu_alpha = 0.0) {
            CUDA4DNN_CHECK_CUDNN(cudnnCreateActivationDescriptor(&descriptor));
            try {
                const auto mode = [type] {
                    switch(type) {
                    case ActivationType::IDENTITY: return CUDNN_ACTIVATION_IDENTITY;
                    case ActivationType::RELU: return CUDNN_ACTIVATION_RELU;
                    case ActivationType::CLIPPED_RELU: return CUDNN_ACTIVATION_CLIPPED_RELU;
                    case ActivationType::SIGMOID: return CUDNN_ACTIVATION_SIGMOID;
                    case ActivationType::TANH: return CUDNN_ACTIVATION_TANH;
                    case ActivationType::ELU: return CUDNN_ACTIVATION_ELU;
                    }
                    CV_Assert(0);
                    return CUDNN_ACTIVATION_IDENTITY;
                } ();

                CUDA4DNN_CHECK_CUDNN(cudnnSetActivationDescriptor(descriptor, mode, CUDNN_NOT_PROPAGATE_NAN, relu_ceiling_or_elu_alpha));
            } catch(...) {
                /* cudnnDestroyActivationDescriptor will not fail for a valid descriptor object */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyActivationDescriptor(descriptor));
                throw;
            }
        }

        ~ActivationDescriptor() noexcept {
            if (descriptor != nullptr) {
                /* cudnnDestroyActivationDescriptor will not fail */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyActivationDescriptor(descriptor));
            }
        }

        ActivationDescriptor& operator=(const ActivationDescriptor&) = delete;
        ActivationDescriptor& operator=(ActivationDescriptor&& other) noexcept {
            descriptor = other.descriptor;
            other.descriptor = nullptr;
            return *this;
        };

        cudnnActivationDescriptor_t get() const noexcept { return descriptor; }

    private:
        cudnnActivationDescriptor_t descriptor;
    };

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */

#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_ACTIVATION_HPP */
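A minimal sketch of how this descriptor is meant to be used (not part of the vendored diff; the ReLU6-style parameters are illustrative):

    using cv::dnn::cuda4dnn::csl::cudnn::ActivationDescriptor;

    /* clipped ReLU with ceiling 6.0; the second argument is the
     * `relu_ceiling_or_elu_alpha` parameter documented above */
    ActivationDescriptor relu6(ActivationDescriptor::ActivationType::CLIPPED_RELU, 6.0);
    cudnnActivationDescriptor_t raw = relu6.get(); /* e.g. for cudnnActivationForward */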
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp (vendored, new file, 637 lines)
@@ -0,0 +1,637 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP

#include "cudnn.hpp"
#include "activation.hpp"

#include "../pointer.hpp"
#include "../workspace.hpp"

#include <cudnn.h>

#include <cstddef>
#include <array>
#include <algorithm>
#include <vector>
#include <type_traits>
#include <iterator>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {

    /** describe convolution filters
     *
     * @tparam T type of elements in the kernels
     */
    template <class T>
    class FilterDescriptor {
    public:
        FilterDescriptor() noexcept : descriptor{ nullptr } { }
        FilterDescriptor(const FilterDescriptor&) = delete;
        FilterDescriptor(FilterDescriptor&& other) noexcept
            : descriptor{ other.descriptor } {
            other.descriptor = nullptr;
        }

        /** constructs a filter descriptor from the filter dimensions provided in \p shape
         *
         * Shape dimensions:
         * 0: number of filters
         * 1: number of input feature maps
         * 2..n: kernel dimensions
         *
         * Exception Guarantee: Strong
         */
        template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
        FilterDescriptor(const SequenceContainer& shape) {
            constructor(shape.begin(), shape.end());
        }

        /** constructs a filter descriptor from the filter dimensions provided in [begin, end)
         *
         * Shape dimensions:
         * 0: number of filters
         * 1: number of input feature maps
         * 2..n: kernel dimensions
         *
         * Exception Guarantee: Strong
         */
        template <class ForwardItr, typename = typename std::enable_if<!std::is_integral<ForwardItr>::value, void>::type> // TODO is_iterator
        FilterDescriptor(ForwardItr begin, ForwardItr end) {
            constructor(begin, end);
        }

        /** constructs a filter descriptor from the filter dimensions provided as arguments
         *
         * Shape dimensions:
         * 0: number of filters
         * 1: number of input feature maps
         * 2..n: kernel dimensions
         *
         * Exception Guarantee: Strong
         */
        template <class ...Sizes>
        FilterDescriptor(Sizes ...sizes) {
            static_assert(sizeof...(Sizes) >= 3, "filter descriptors must have at least three dimensions");
            static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank");
            std::array<int, sizeof...(Sizes)> dims = { static_cast<int>(sizes)... };
            constructor(std::begin(dims), std::end(dims));
        }

        ~FilterDescriptor() noexcept {
            if (descriptor != nullptr) {
                /* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor));
            }
        }

        FilterDescriptor& operator=(const FilterDescriptor&) = delete;
        FilterDescriptor& operator=(FilterDescriptor&& other) noexcept {
            descriptor = other.descriptor;
            other.descriptor = nullptr;
            return *this;
        };

        cudnnFilterDescriptor_t get() const noexcept { return descriptor; }

    private:
        template <class ForwardItr>
        void constructor(ForwardItr start, ForwardItr end) {
            CV_Assert(start != end);
            CV_Assert(std::distance(start, end) >= 3);
            CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX);

            CUDA4DNN_CHECK_CUDNN(cudnnCreateFilterDescriptor(&descriptor));
            try {
                const auto rank = std::distance(start, end);
                if (rank == 4) {
                    std::array<int, 4> dims;
                    std::copy(start, end, std::begin(dims));
                    CUDA4DNN_CHECK_CUDNN(
                        cudnnSetFilter4dDescriptor(
                            descriptor,
                            detail::get_data_type<T>(), CUDNN_TENSOR_NCHW,
                            dims[0], dims[1], dims[2], dims[3]
                        )
                    );
                } else {
                    std::vector<int> dims(start, end);
                    CUDA4DNN_CHECK_CUDNN(
                        cudnnSetFilterNdDescriptor(
                            descriptor,
                            detail::get_data_type<T>(), CUDNN_TENSOR_NCHW,
                            dims.size(), dims.data()
                        )
                    );
                }
            } catch (...) {
                /* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor));
                throw;
            }
        }

        cudnnFilterDescriptor_t descriptor;
    };

    /** describes a convolution operation
     *
     * @tparam T type of element participating in convolution
     */
    template <class T>
    class ConvolutionDescriptor {
    public:
        ConvolutionDescriptor() noexcept : descriptor{ nullptr } { }
        ConvolutionDescriptor(const ConvolutionDescriptor&) = delete;
        ConvolutionDescriptor(ConvolutionDescriptor&& other) noexcept
            : descriptor{ other.descriptor } {
            other.descriptor = nullptr;
        }

        /** constructs a convolution descriptor
         *
         * Pre-conditions:
         * - \p zero_padding, \p stride and \p dilation must have the same size
         *
         * The length of the containers is interpreted as the order of the convolution.
         *
         * Exception Guarantee: Strong
         */
        template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
        ConvolutionDescriptor(
            const SequenceContainer& zero_padding,
            const SequenceContainer& stride,
            const SequenceContainer& dilation,
            std::size_t group_count)
        {
            constructor(zero_padding, stride, dilation, group_count);
        }

        ~ConvolutionDescriptor() noexcept {
            if (descriptor != nullptr) {
                /* cudnnDestroyConvolutionDescriptor will not fail for a valid descriptor object */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor));
            }
        }

        ConvolutionDescriptor& operator=(const ConvolutionDescriptor&) = delete;
        ConvolutionDescriptor& operator=(ConvolutionDescriptor&& other) noexcept {
            descriptor = other.descriptor;
            other.descriptor = nullptr;
            return *this;
        };

        cudnnConvolutionDescriptor_t get() const noexcept { return descriptor; }

    private:
        template <class SequenceContainer>
        void constructor(
            const SequenceContainer& zero_padding,
            const SequenceContainer& stride,
            const SequenceContainer& dilation,
            std::size_t group_count)
        {
            CV_Assert(zero_padding.size() == stride.size());
            CV_Assert(zero_padding.size() == dilation.size());

            CUDA4DNN_CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&descriptor));
            try {
                const auto rank = zero_padding.size();
                if (rank == 2) {
                    CUDA4DNN_CHECK_CUDNN(
                        cudnnSetConvolution2dDescriptor(
                            descriptor,
                            zero_padding[0], zero_padding[1],
                            stride[0], stride[1],
                            dilation[0], dilation[1],
                            CUDNN_CROSS_CORRELATION,
                            detail::get_data_type<T>()
                        )
                    );
                } else {
                    std::vector<int> ipadding(std::begin(zero_padding), std::end(zero_padding));
                    std::vector<int> istride(std::begin(stride), std::end(stride));
                    std::vector<int> idilation(std::begin(dilation), std::end(dilation));
                    CUDA4DNN_CHECK_CUDNN(
                        cudnnSetConvolutionNdDescriptor(
                            descriptor,
                            rank, ipadding.data(), istride.data(), idilation.data(),
                            CUDNN_CROSS_CORRELATION,
                            detail::get_data_type<T>()
                        )
                    );
                }
                CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, group_count));

#if CUDNN_MAJOR >= 8
                /* cuDNN 7 and below use FMA math by default. cuDNN 8 includes TF32 Tensor Ops
                 * in the default setting. TF32 convolutions have lower precision than FP32.
                 * Hence, we set the math type to CUDNN_FMA_MATH to reproduce old behavior.
                 */
                CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_FMA_MATH));
#endif

                if (std::is_same<T, half>::value)
                    CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_TENSOR_OP_MATH));
            } catch (...) {
                /* cudnnDestroyConvolutionDescriptor will not fail for a valid descriptor object */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor));
                throw;
            }
        }

        cudnnConvolutionDescriptor_t descriptor;
    };

    /** wrapper around a convolution algorithm
     *
     * @tparam T type of elements being convolved
     */
    template <class T>
    class ConvolutionAlgorithm {
    public:
        ConvolutionAlgorithm() noexcept : workspace_size{ 0 } { }
        ConvolutionAlgorithm(ConvolutionAlgorithm&) = default;
        ConvolutionAlgorithm(ConvolutionAlgorithm&&) = default;

        /** selects a good algorithm for convolution for given configuration
         *
         * Exception Guarantee: Strong
         */
        ConvolutionAlgorithm(
            const Handle& handle,
            const ConvolutionDescriptor<T>& convDesc,
            const FilterDescriptor<T>& filterDesc,
            const TensorDescriptor<T>& inputDesc,
            const TensorDescriptor<T>& outputDesc)
        {
#if CUDNN_MAJOR >= 8
            int requestedAlgoCount = 0, returnedAlgoCount = 0;
            CUDA4DNN_CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithmMaxCount(handle.get(), &requestedAlgoCount));
            std::vector<cudnnConvolutionFwdAlgoPerf_t> results(requestedAlgoCount);
            CUDA4DNN_CHECK_CUDNN(
                cudnnGetConvolutionForwardAlgorithm_v7(
                    handle.get(),
                    inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
                    requestedAlgoCount,
                    &returnedAlgoCount,
                    &results[0]
                )
            );

            size_t free_memory, total_memory;
            CUDA4DNN_CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));

            bool found_conv_algorithm = false;
            for (int i = 0; i < returnedAlgoCount; i++)
            {
                if (results[i].status == CUDNN_STATUS_SUCCESS &&
                    results[i].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
                    results[i].memory < free_memory)
                {
                    found_conv_algorithm = true;
                    algo = results[i].algo;
                    workspace_size = results[i].memory;
                    break;
                }
            }

            if (!found_conv_algorithm)
                CV_Error(cv::Error::GpuApiCallError, "cuDNN did not return a suitable algorithm for convolution.");
#else
            CUDA4DNN_CHECK_CUDNN(
                cudnnGetConvolutionForwardAlgorithm(
                    handle.get(),
                    inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
                    CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
                    0, /* no memory limit */
                    &algo
                )
            );

            CUDA4DNN_CHECK_CUDNN(
                cudnnGetConvolutionForwardWorkspaceSize(
                    handle.get(),
                    inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
                    algo, &workspace_size
                )
            );
#endif
        }

        ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default;
        ConvolutionAlgorithm& operator=(ConvolutionAlgorithm&& other) = default;

        cudnnConvolutionFwdAlgo_t get() const noexcept { return algo; }

        /** number of bytes of workspace memory required by the algorithm */
        std::size_t get_workspace_size() const noexcept { return workspace_size; }

    private:
        cudnnConvolutionFwdAlgo_t algo;
        std::size_t workspace_size;
    };

    /** gives the shape of the output tensor of convolution
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void getConvolutionForwardOutputDim(
        const ConvolutionDescriptor<T>& convDesc,
        const FilterDescriptor<T>& filterDesc,
        const TensorDescriptor<T>& inputDesc,
        std::vector<int>& output)
    {
        output.clear();
        output.resize(CUDNN_DIM_MAX); /* we use `output` to hold temporaries */

        std::vector<int> temp(CUDNN_DIM_MAX);
        cudnnDataType_t tempDataType;
        CUDA4DNN_CHECK_CUDNN(
            cudnnGetTensorNdDescriptor(
                inputDesc.get(),
                CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */
                &tempDataType,
                output.data(),
                temp.data(),
                temp.data()
            )
        );

        const auto rank = output[0];
        output.resize(rank);
        CUDA4DNN_CHECK_CUDNN(
            cudnnGetConvolutionNdForwardOutputDim(
                convDesc.get(), inputDesc.get(), filterDesc.get(), rank, output.data()
            )
        );
    }

    /** @brief performs convolution
     *
     * dstValue = alpha * result + beta * priorDstValue
     *
     * @tparam T convolution element type (must be `half` or `float`)
     *
     * @param handle valid cuDNN Handle
     * @param convDesc convolution description
     * @param convAlgo algorithm to use for convolution
     * @param workspace workspace memory which meets the requirements of \p convAlgo
     * @param filterDesc filter descriptor
     * @param[in] filterPtr pointer to device memory containing the filters
     * @param inputDesc tensor descriptor describing the input
     * @param[in] inputPtr pointer to input tensor in device memory
     * @param alpha result scale factor
     * @param beta previous value scale factor
     * @param outputDesc tensor descriptor describing the output
     * @param[out] outputPtr pointer to output tensor in device memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void convolve(
        const Handle& handle,
        const ConvolutionDescriptor<T>& convDesc,
        const ConvolutionAlgorithm<T>& convAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<T>& filterDesc,
        DevicePtr<const T> filterPtr,
        const TensorDescriptor<T>& inputDesc,
        DevicePtr<const T> inputPtr,
        T alpha, T beta,
        const TensorDescriptor<T>& outputDesc,
        DevicePtr<T> outputPtr)
    {
        CV_Assert(handle);

        CUDA4DNN_CHECK_CUDNN(
            cudnnConvolutionForward(
                handle.get(),
                &alpha, inputDesc.get(), inputPtr.get(),
                filterDesc.get(), filterPtr.get(),
                convDesc.get(), convAlgo.get(),
                static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
                &beta, outputDesc.get(), outputPtr.get()
            )
        );
    }

    template <> inline
    void convolve(
        const Handle& handle,
        const ConvolutionDescriptor<half>& convDesc,
        const ConvolutionAlgorithm<half>& convAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<half>& filterDesc,
        DevicePtr<const half> filterPtr,
        const TensorDescriptor<half>& inputDesc,
        DevicePtr<const half> inputPtr,
        half alpha, half beta,
        const TensorDescriptor<half>& outputDesc,
        DevicePtr<half> outputPtr)
    {
        CV_Assert(handle);

        /* we specialize for fp16 as the scaling factors must be provided as `float` */
        float alpha_ = alpha, beta_ = beta;
        CUDA4DNN_CHECK_CUDNN(
            cudnnConvolutionForward(
                handle.get(),
                &alpha_, inputDesc.get(), inputPtr.get(),
                filterDesc.get(), filterPtr.get(),
                convDesc.get(), convAlgo.get(),
                static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
                &beta_, outputDesc.get(), outputPtr.get()
            )
        );
    }

    /** @brief performs convolution, bias addition and activation simultaneously
     *
     * dstValue = act(alpha * conv(input) + bias)
     *
     * @tparam T convolution element type (must be `half` or `float`)
     *
     * @param handle valid cuDNN Handle
     * @param convDesc convolution description
     * @param convAlgo algorithm to use for convolution
     * @param workspace workspace memory which meets the requirements of \p convAlgo
     * @param filterDesc filter descriptor
     * @param[in] filterPtr pointer to device memory containing the filters
     * @param alpha convolution scale factor
     * @param inputDesc tensor descriptor describing the input
     * @param[in] inputPtr pointer to input tensor in device memory
     * @param biasDesc tensor descriptor describing the bias
     * @param[in] biasPtr pointer to bias tensor in device memory
     * @param actDesc activation descriptor
     * @param outputDesc tensor descriptor describing the output
     * @param[out] outputPtr pointer to output tensor in device memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void convolve_with_bias_activation(
        const Handle& handle,
        T alpha,
        const ConvolutionDescriptor<T>& convDesc,
        const ConvolutionAlgorithm<T>& convAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<T>& filterDesc,
        DevicePtr<const T> filterPtr,
        const TensorDescriptor<T>& inputDesc,
        DevicePtr<const T> inputPtr,
        const TensorDescriptor<T>& biasDesc,
        DevicePtr<const T> biasPtr,
        const ActivationDescriptor& actDesc,
        const TensorDescriptor<T>& outputDesc,
        DevicePtr<T> outputPtr)
    {
        CV_Assert(handle);

        T alpha2 = 0.0;
        CUDA4DNN_CHECK_CUDNN(cudnnConvolutionBiasActivationForward(
            handle.get(),
            &alpha, inputDesc.get(), inputPtr.get(),
            filterDesc.get(), filterPtr.get(),
            convDesc.get(), convAlgo.get(),
            static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
            &alpha2, outputDesc.get(), outputPtr.get(),
            biasDesc.get(), biasPtr.get(),
            actDesc.get(),
            outputDesc.get(), outputPtr.get()));
    }

    template <> inline
    void convolve_with_bias_activation(
        const Handle& handle,
        half alpha,
        const ConvolutionDescriptor<half>& convDesc,
        const ConvolutionAlgorithm<half>& convAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<half>& filterDesc,
        DevicePtr<const half> filterPtr,
        const TensorDescriptor<half>& inputDesc,
        DevicePtr<const half> inputPtr,
        const TensorDescriptor<half>& biasDesc,
        DevicePtr<const half> biasPtr,
        const ActivationDescriptor& actDesc,
        const TensorDescriptor<half>& outputDesc,
        DevicePtr<half> outputPtr)
    {
        CV_Assert(handle);

        float alpha_ = alpha, alpha2 = 0.0;
        CUDA4DNN_CHECK_CUDNN(cudnnConvolutionBiasActivationForward(
            handle.get(),
            &alpha_, inputDesc.get(), inputPtr.get(),
            filterDesc.get(), filterPtr.get(),
            convDesc.get(), convAlgo.get(),
            static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
            &alpha2, outputDesc.get(), outputPtr.get(),
            biasDesc.get(), biasPtr.get(),
            actDesc.get(),
            outputDesc.get(), outputPtr.get()));
    }

    /** @brief performs convolution, bias addition, eltwise addition and activation simultaneously
     *
     * dstValue = act(alpha1 * conv(input) + bias + alpha2 * eltwise)
     *
     * @tparam T convolution element type (must be `half` or `float`)
     *
     * @param handle valid cuDNN Handle
     * @param convDesc convolution description
     * @param convAlgo algorithm to use for convolution
     * @param workspace workspace memory which meets the requirements of \p convAlgo
     * @param filterDesc filter descriptor
     * @param[in] filterPtr pointer to device memory containing the filters
     * @param alpha1 convolution scale factor
     * @param inputDesc tensor descriptor describing the input
     * @param[in] inputPtr pointer to input tensor in device memory
     * @param biasDesc tensor descriptor describing the bias
     * @param[in] biasPtr pointer to bias tensor in device memory
     * @param alpha2 eltwise scale factor
     * @param eltwiseDesc tensor descriptor describing the eltwise tensor
     * @param[in] eltwisePtr pointer to the eltwise tensor in device memory
     * @param actDesc activation descriptor
     * @param outputDesc tensor descriptor describing the output
     * @param[out] outputPtr pointer to output tensor in device memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void convolve_with_bias_eltwise_activation(
        const Handle& handle,
        T alpha1,
        const ConvolutionDescriptor<T>& convDesc,
        const ConvolutionAlgorithm<T>& convAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<T>& filterDesc,
        DevicePtr<const T> filterPtr,
        const TensorDescriptor<T>& inputDesc,
        DevicePtr<const T> inputPtr,
        const TensorDescriptor<T>& biasDesc,
        DevicePtr<const T> biasPtr,
        T alpha2,
        const TensorDescriptor<T>& eltwiseDesc,
        DevicePtr<const T> eltwisePtr,
        const ActivationDescriptor& actDesc,
        const TensorDescriptor<T>& outputDesc,
        DevicePtr<T> outputPtr)
    {
        CV_Assert(handle);

        CUDA4DNN_CHECK_CUDNN(cudnnConvolutionBiasActivationForward(
            handle.get(),
            &alpha1, inputDesc.get(), inputPtr.get(),
            filterDesc.get(), filterPtr.get(),
            convDesc.get(), convAlgo.get(),
            static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
            &alpha2, eltwiseDesc.get(), eltwisePtr.get(),
            biasDesc.get(), biasPtr.get(),
            actDesc.get(),
            outputDesc.get(), outputPtr.get()));
    }

    template <> inline
    void convolve_with_bias_eltwise_activation(
        const Handle& handle,
        half alpha1,
        const ConvolutionDescriptor<half>& convDesc,
        const ConvolutionAlgorithm<half>& convAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<half>& filterDesc,
        DevicePtr<const half> filterPtr,
        const TensorDescriptor<half>& inputDesc,
        DevicePtr<const half> inputPtr,
        const TensorDescriptor<half>& biasDesc,
        DevicePtr<const half> biasPtr,
        half alpha2,
        const TensorDescriptor<half>& eltwiseDesc,
        DevicePtr<const half> eltwisePtr,
        const ActivationDescriptor& actDesc,
        const TensorDescriptor<half>& outputDesc,
        DevicePtr<half> outputPtr)
    {
        CV_Assert(handle);

        float alpha1_ = alpha1, alpha2_ = alpha2;
        CUDA4DNN_CHECK_CUDNN(cudnnConvolutionBiasActivationForward(
            handle.get(),
            &alpha1_, inputDesc.get(), inputPtr.get(),
            filterDesc.get(), filterPtr.get(),
            convDesc.get(), convAlgo.get(),
            static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
            &alpha2_, eltwiseDesc.get(), eltwisePtr.get(),
            biasDesc.get(), biasPtr.get(),
            actDesc.get(),
            outputDesc.get(), outputPtr.get()));
    }

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */

#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP */
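A sketch of how the pieces above compose into a forward pass (not part of the vendored diff); the shapes are illustrative, and the caller is assumed to have sized `workspace` to at least `algo.get_workspace_size()` bytes:

    using namespace cv::dnn::cuda4dnn::csl;
    using namespace cv::dnn::cuda4dnn::csl::cudnn;

    void conv_example(const Handle& handle,
                      DevicePtr<const float> input, DevicePtr<const float> filters,
                      DevicePtr<float> output, WorkspaceInstance workspace)
    {
        TensorDescriptor<float> inputDesc(1, 3, 224, 224); /* NCHW */
        FilterDescriptor<float> filterDesc(16, 3, 3, 3);   /* 16 filters over 3 input maps */
        ConvolutionDescriptor<float> convDesc(
            std::vector<int>{1, 1}, /* zero padding */
            std::vector<int>{1, 1}, /* stride */
            std::vector<int>{1, 1}, /* dilation */
            1);                     /* group count */

        std::vector<int> outShape;
        getConvolutionForwardOutputDim(convDesc, filterDesc, inputDesc, outShape);
        TensorDescriptor<float> outputDesc(outShape);

        /* algorithm selection also determines the required workspace size */
        ConvolutionAlgorithm<float> algo(handle, convDesc, filterDesc, inputDesc, outputDesc);
        convolve<float>(handle, convDesc, algo, workspace,
                        filterDesc, filters, inputDesc, input,
                        1.0f, 0.0f, outputDesc, output);
    }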
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cudnn/cudnn.hpp (vendored, new file, 292 lines)
@@ -0,0 +1,292 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CUDNN_HPP
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CUDNN_HPP

#include "../pointer.hpp"

#include <cudnn.h>

#include <cstddef>
#include <array>
#include <algorithm>
#include <functional>
#include <numeric>
#include <vector>
#include <type_traits>
#include <iterator>

#define CUDA4DNN_CHECK_CUDNN(call) \
    ::cv::dnn::cuda4dnn::csl::cudnn::detail::check((call), CV_Func, __FILE__, __LINE__)

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {

    /** @brief exception class for errors thrown by the cuDNN API */
    class cuDNNException : public CUDAException {
    public:
        cuDNNException(cudnnStatus_t code, const std::string& msg, const std::string& func, const std::string& file, int line)
            : CUDAException(Error::GpuApiCallError, msg, func, file, line), cudnnError{code}
        {
        }

        cudnnStatus_t getCUDNNStatus() const noexcept { return cudnnError; }

    private:
        cudnnStatus_t cudnnError;
    };

    namespace detail {
        inline void check(cudnnStatus_t status, const char* func, const char* file, int line) {
            if (status != CUDNN_STATUS_SUCCESS)
                throw cuDNNException(status, cudnnGetErrorString(status), func, file, line);
        }

        /** get_data_type<T> returns the equivalent cudnn enumeration constant for type T */
        using cudnn_data_enum_type = decltype(CUDNN_DATA_FLOAT);
        template <class> cudnn_data_enum_type get_data_type();
        template <> inline cudnn_data_enum_type get_data_type<half>() { return CUDNN_DATA_HALF; }
        template <> inline cudnn_data_enum_type get_data_type<float>() { return CUDNN_DATA_FLOAT; }
    }

    /** @brief noncopyable cuDNN smart handle
     *
     * UniqueHandle is a smart non-sharable wrapper for cuDNN handle which ensures that the handle
     * is destroyed after use.
     */
    class UniqueHandle {
    public:
        UniqueHandle() noexcept : handle{ nullptr } { }
        UniqueHandle(UniqueHandle&) = delete;
        UniqueHandle(UniqueHandle&& other) noexcept {
            stream = std::move(other.stream);
            handle = other.handle;
            other.handle = nullptr;
        }

        /** creates a cuDNN handle and associates it with the stream specified
         *
         * Exception Guarantee: Basic
         */
        UniqueHandle(Stream strm) : stream(std::move(strm)) {
            CV_Assert(stream);
            CUDA4DNN_CHECK_CUDNN(cudnnCreate(&handle));
            try {
                CUDA4DNN_CHECK_CUDNN(cudnnSetStream(handle, stream.get()));
            } catch (...) {
                /* cudnnDestroy won't throw if a valid handle is passed */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroy(handle));
                throw;
            }
        }

        ~UniqueHandle() noexcept {
            if (handle != nullptr) {
                /* cudnnDestroy won't throw if a valid handle is passed */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroy(handle));
            }
        }

        UniqueHandle& operator=(const UniqueHandle&) = delete;
        UniqueHandle& operator=(UniqueHandle&& other) noexcept {
            CV_Assert(other);
            if (&other != this) {
                UniqueHandle(std::move(*this)); /* destroy current handle */
                stream = std::move(other.stream);
                handle = other.handle;
                other.handle = nullptr;
            }
            return *this;
        }

        /** returns the raw cuDNN handle */
        cudnnHandle_t get() const noexcept {
            CV_Assert(handle);
            return handle;
        }

        /** returns true if the handle is valid */
        explicit operator bool() const noexcept { return static_cast<bool>(handle); }

    private:
        Stream stream;
        cudnnHandle_t handle;
    };

    /** @brief sharable cuDNN smart handle
     *
     * Handle is a smart sharable wrapper for cuDNN handle which ensures that the handle
     * is destroyed after all references to the handle are destroyed. The handle must always
     * be associated with a non-default stream. The stream must be specified during construction.
     *
     * @note Moving a Handle object to another invalidates the former
     */
    class Handle {
    public:
        Handle() = default;
        Handle(const Handle&) = default;
        Handle(Handle&&) = default;

        /** creates a cuDNN handle and associates it with the stream specified
         *
         * Exception Guarantee: Basic
         */
        Handle(Stream strm) : handle(std::make_shared<UniqueHandle>(std::move(strm))) { }

        Handle& operator=(const Handle&) = default;
        Handle& operator=(Handle&&) = default;

        /** returns true if the handle is valid */
        explicit operator bool() const noexcept { return static_cast<bool>(handle); }

        /** returns the raw cuDNN handle */
        cudnnHandle_t get() const noexcept {
            CV_Assert(handle);
            return handle->get();
        }

    private:
        std::shared_ptr<UniqueHandle> handle;
    };

    /** describe a tensor
     *
     * @tparam T type of elements in the tensor
     */
    template <class T>
    class TensorDescriptor {
    public:
        TensorDescriptor() noexcept : descriptor{ nullptr } { }
        TensorDescriptor(const TensorDescriptor&) = delete;
        TensorDescriptor(TensorDescriptor&& other) noexcept
            : descriptor{ other.descriptor } {
            other.descriptor = nullptr;
        }

        /** constructs a tensor descriptor from the axis lengths provided in \p shape
         *
         * Exception Guarantee: Basic
         */
        template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
        TensorDescriptor(const SequenceContainer& shape) {
            constructor(shape.begin(), shape.end());
        }

        /** constructs a tensor descriptor from the axis lengths provided in [begin, end)
         *
         * Exception Guarantee: Basic
         */
        template <class ForwardItr, typename = typename std::enable_if<!std::is_integral<ForwardItr>::value, void>::type> // TODO is_iterator
        TensorDescriptor(ForwardItr begin, ForwardItr end) {
            constructor(begin, end);
        }

        /** constructs a tensor descriptor from the axis lengths provided as arguments
         *
         * Exception Guarantee: Basic
         */
        template <class ...Sizes>
        TensorDescriptor(Sizes ...sizes) {
            static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank");
            std::array<int, sizeof...(Sizes)> dims = { static_cast<int>(sizes)... };
            constructor(std::begin(dims), std::end(dims));
        }

        ~TensorDescriptor() noexcept {
            if (descriptor != nullptr) {
                /* cudnnDestroyTensorDescriptor will not fail */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorDescriptor(descriptor));
            }
        }

        TensorDescriptor& operator=(const TensorDescriptor&) = delete;
        TensorDescriptor& operator=(TensorDescriptor&& other) noexcept {
            descriptor = other.descriptor;
            other.descriptor = nullptr;
            return *this;
        };

        cudnnTensorDescriptor_t get() const noexcept { return descriptor; }

    private:
        template <class ForwardItr>
        void constructor(ForwardItr start, ForwardItr end) {
            CV_Assert(start != end);
            CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX);

            CUDA4DNN_CHECK_CUDNN(cudnnCreateTensorDescriptor(&descriptor));
            try {
                /* cuDNN documentation recommends using the 4d tensor API whenever possible;
                 * hence, we create a 4d tensor descriptor for 3d tensors
                 */
                const auto rank = std::distance(start, end);
                if (rank <= 4) {
                    std::array<int, 4> dims;
                    std::fill(std::begin(dims), std::end(dims), 1);

                    /* suppose we have a 3d tensor, the first axis is the batch axis and
                     * the second axis is the channel axis (generally)
                     *
                     * cuDNN frequently assumes that the first axis is the batch axis and the
                     * second axis is the channel axis; hence, we copy the shape of a lower rank
                     * tensor to the beginning of `dims`
                     */
                    std::copy(start, end, std::begin(dims));

                    CUDA4DNN_CHECK_CUDNN(
                        cudnnSetTensor4dDescriptor(descriptor,
                            CUDNN_TENSOR_NCHW, detail::get_data_type<T>(),
                            dims[0], dims[1], dims[2], dims[3]
                        )
                    );
                } else {
                    std::vector<int> stride(rank);
                    stride.back() = 1;
                    /* WHAT WE HAVE NOW:
                     * stride[-1] = 1
                     * stride[-2] = garbage
                     * stride[-3] = garbage
                     * stride[-4] = garbage
                     * ...
                     */

                    std::copy(start + 1, end, stride.begin());
                    /* WHAT WE HAVE NOW:
                     * stride[-1] = 1
                     * stride[-2] = dim[-1]
                     * stride[-3] = dim[-2]
                     * stride[-4] = dim[-3]
                     * ...
                     */

                    std::partial_sum(stride.rbegin(), stride.rend(), stride.rbegin(), std::multiplies<int>());
                    /* WHAT WE HAVE NOW:
                     * stride[-1] = 1
                     * stride[-2] = stride[-1] * dim[-1]
                     * stride[-3] = stride[-2] * dim[-2]
                     * stride[-4] = stride[-3] * dim[-3]
                     * ...
                     */

                    std::vector<int> dims(start, end);
                    CUDA4DNN_CHECK_CUDNN(
                        cudnnSetTensorNdDescriptor(descriptor,
                            detail::get_data_type<T>(), rank,
                            dims.data(), stride.data()
                        )
                    );
                }
            } catch (...) {
                /* cudnnDestroyTensorDescriptor will not fail */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorDescriptor(descriptor));
                throw;
            }
        }

        cudnnTensorDescriptor_t descriptor;
    };

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */

#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_HPP */
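A sketch of the two construction paths of `TensorDescriptor` (not part of the vendored diff; illustrative shapes):

    using cv::dnn::cuda4dnn::csl::cudnn::TensorDescriptor;

    TensorDescriptor<float> images(32, 3, 64, 64); /* rank 4: takes the 4d API path */
    std::vector<int> shape { 32, 3, 8, 8, 8 };
    TensorDescriptor<float> volumes(shape);        /* rank 5: packed strides computed as above */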
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cudnn/lrn.hpp (vendored, new file, 205 lines)
@@ -0,0 +1,205 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP

#include "cudnn.hpp"

#include "../pointer.hpp"
#include "../workspace.hpp"

#include <opencv2/core.hpp>

#include <cudnn.h>

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {

    class LRNDescriptor {
    public:
        enum class LRNType {
            ACROSS_CHANNELS,
            WITHIN_CHANNEL
        };

        LRNDescriptor() noexcept : descriptor{ nullptr } { }
        LRNDescriptor(const LRNDescriptor&) = delete;
        LRNDescriptor(LRNDescriptor&& other) noexcept
            : descriptor{ other.descriptor }, type{ other.type } {
            other.descriptor = nullptr;
        }

        /** sets up an LRN descriptor
         *
         * @param local_size size of the normalization window
         * @param alpha variance scaling parameter
         * @param beta power parameter
         * @param k bias parameter
         *
         * @note \p alpha is divided by the window width in across channels mode
         * @note \p alpha is divided by the (window width)^spatialDimensions in within channel mode
         *
         * @note the \p alpha, \p beta and \p k will be type-cast to the tensor datatype during operation
         *
         * Exception Guarantee: Basic
         */
        LRNDescriptor(std::size_t local_size, double alpha, double beta, double k, LRNType type_) {
            constructor(local_size, alpha, beta, k, type_);
        }

        ~LRNDescriptor() noexcept {
            if (descriptor != nullptr) {
                /* cudnnDestroyLRNDescriptor will not fail for a valid descriptor */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyLRNDescriptor(descriptor));
            }
        }

        LRNDescriptor& operator=(const LRNDescriptor&) = delete;
        LRNDescriptor& operator=(LRNDescriptor&& other) noexcept {
            descriptor = other.descriptor;
            type = other.type;
            other.descriptor = nullptr;
            return *this;
        };

        cudnnLRNDescriptor_t get() const noexcept { return descriptor; }
        LRNType getType() const noexcept { return type; }

    private:
        void constructor(std::size_t local_size, double alpha, double beta, double k, LRNType type_) {
            CV_Assert(CUDNN_LRN_MIN_N <= local_size && local_size <= CUDNN_LRN_MAX_N);

            type = type_;

            CUDA4DNN_CHECK_CUDNN(cudnnCreateLRNDescriptor(&descriptor));
            try {
                CUDA4DNN_CHECK_CUDNN(
                    cudnnSetLRNDescriptor(
                        descriptor,
                        local_size,
                        alpha,
                        beta,
                        k
                    )
                );
            } catch (...) {
                /* cudnnDestroyLRNDescriptor will not fail for a valid descriptor */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyLRNDescriptor(descriptor));
                throw;
            }
        }

        cudnnLRNDescriptor_t descriptor;
        LRNType type;
    };

    /** @brief performs local response normalization
     *
     * dstValue = alpha * result + beta * priorDstValue
     *
     * @tparam T element type (must be `half` or `float`)
     *
     * @param handle valid cuDNN Handle
     * @param lrnDesc LRN description
     * @param inputDesc tensor descriptor describing the input
     * @param[in] inputPtr pointer to input tensor in device memory
     * @param alpha result scale factor
     * @param beta previous value scale factor
     * @param outputDesc tensor descriptor describing the output
     * @param[out] outputPtr pointer to output tensor in device memory
     * @param workspace workspace memory (used by the WITHIN_CHANNEL mode)
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void LRNForward(
        const Handle& handle,
        const LRNDescriptor& lrnDesc,
        const TensorDescriptor<T>& inputDesc,
        DevicePtr<const T> inputPtr,
        T alpha, T beta,
        const TensorDescriptor<T>& outputDesc,
        DevicePtr<T> outputPtr,
        WorkspaceInstance workspace)
    {
        CV_Assert(handle);

        if (lrnDesc.getType() == LRNDescriptor::LRNType::ACROSS_CHANNELS) {
            CUDA4DNN_CHECK_CUDNN(
                cudnnLRNCrossChannelForward(
                    handle.get(),
                    lrnDesc.get(), CUDNN_LRN_CROSS_CHANNEL_DIM1,
                    &alpha, inputDesc.get(), inputPtr.get(),
                    &beta, outputDesc.get(), outputPtr.get()
                )
            );
        } else if (lrnDesc.getType() == LRNDescriptor::LRNType::WITHIN_CHANNEL) {
            std::size_t size;
            CUDA4DNN_CHECK_CUDNN(cudnnGetTensorSizeInBytes(inputDesc.get(), &size));

            DevicePtr<void> temp1 = workspace.get_span<half>(size).data();
            DevicePtr<void> temp2 = workspace.get_span<half>(size).data();

            CUDA4DNN_CHECK_CUDNN(
                cudnnDivisiveNormalizationForward(
                    handle.get(),
                    lrnDesc.get(), CUDNN_DIVNORM_PRECOMPUTED_MEANS,
                    &alpha, inputDesc.get(), inputPtr.get(),
                    NULL,
                    static_cast<void*>(temp1), static_cast<void*>(temp2),
                    &beta, outputDesc.get(), outputPtr.get()
                )
            );
        }
    }

    template <> inline
    void LRNForward(
        const Handle& handle,
        const LRNDescriptor& lrnDesc,
        const TensorDescriptor<half>& inputDesc,
        DevicePtr<const half> inputPtr,
        half alpha, half beta,
        const TensorDescriptor<half>& outputDesc,
        DevicePtr<half> outputPtr,
        WorkspaceInstance workspace)
    {
        CV_Assert(handle);

        /* we specialize for fp16 as the scaling factors must be provided as `float` */
        float alpha_ = alpha, beta_ = beta;
        if (lrnDesc.getType() == LRNDescriptor::LRNType::ACROSS_CHANNELS) {
            CUDA4DNN_CHECK_CUDNN(
                cudnnLRNCrossChannelForward(
                    handle.get(),
                    lrnDesc.get(), CUDNN_LRN_CROSS_CHANNEL_DIM1,
                    &alpha_, inputDesc.get(), inputPtr.get(),
                    &beta_, outputDesc.get(), outputPtr.get()
                )
            );
        } else if (lrnDesc.getType() == LRNDescriptor::LRNType::WITHIN_CHANNEL) {
            std::size_t size;
            CUDA4DNN_CHECK_CUDNN(cudnnGetTensorSizeInBytes(inputDesc.get(), &size));

            DevicePtr<void> temp1 = workspace.get_span<half>(size).data();
            DevicePtr<void> temp2 = workspace.get_span<half>(size).data();

            CUDA4DNN_CHECK_CUDNN(
                cudnnDivisiveNormalizationForward(
                    handle.get(),
                    lrnDesc.get(), CUDNN_DIVNORM_PRECOMPUTED_MEANS,
                    &alpha_, inputDesc.get(), inputPtr.get(),
                    NULL,
                    static_cast<void*>(temp1), static_cast<void*>(temp2),
                    &beta_, outputDesc.get(), outputPtr.get()
                )
            );
        }
    }

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */

#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP */
|
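/* A minimal usage sketch of the LRN wrappers above. It assumes a valid Handle,
 * tensor descriptor, device buffers and WorkspaceInstance are created elsewhere
 * with the csl primitives; the names and LRN parameters below are illustrative: */
namespace csl = cv::dnn::cuda4dnn::csl;

inline void lrn_example(const csl::cudnn::Handle& handle,
                        const csl::cudnn::TensorDescriptor<float>& desc,
                        csl::DevicePtr<const float> input,
                        csl::DevicePtr<float> output,
                        csl::WorkspaceInstance scratch)
{
    /* AlexNet-style LRN: window of 5, alpha = 1e-4, beta = 0.75, k = 2 */
    csl::cudnn::LRNDescriptor lrnDesc(
        5, 1e-4, 0.75, 2.0, csl::cudnn::LRNDescriptor::LRNType::ACROSS_CHANNELS);

    /* output = 1.0 * LRN(input) + 0.0 * output */
    csl::cudnn::LRNForward<float>(handle, lrnDesc, desc, input, 1.0f, 0.0f, desc, output, scratch);
}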
236
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cudnn/pooling.hpp
vendored
Normal file
@@ -0,0 +1,236 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_POOLING_HPP
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_POOLING_HPP

#include "cudnn.hpp"

#include "../pointer.hpp"

#include <opencv2/core.hpp>

#include <cudnn.h>

#include <cstddef>
#include <array>
#include <algorithm>
#include <vector>
#include <type_traits>
#include <iterator>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {

    class PoolingDescriptor {
    public:
        enum class PoolingType {
            MAX,
            MAX_DETERMINISTIC,
            AVERAGE_EXCLUDE_PADDING,
            AVERAGE_INCLUDE_PADDING
        };

        PoolingDescriptor() noexcept : descriptor{ nullptr } { }
        PoolingDescriptor(const PoolingDescriptor&) = delete;
        PoolingDescriptor(PoolingDescriptor&& other) noexcept
            : descriptor{ other.descriptor } {
            other.descriptor = nullptr;
        }

        /** constructs a pooling descriptor
         *
         * Pre-conditions:
         * - \p window_size, \p padding and \p stride must have the same size
         *
         * The length of the containers is interpreted as the order of the pooling operation.
         *
         * Exception Guarantee: Basic
         */
        template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
        PoolingDescriptor(
            const SequenceContainer& window_size,
            const SequenceContainer& padding,
            const SequenceContainer& stride,
            PoolingType type)
        {
            constructor(window_size, padding, stride, type);
        }

        ~PoolingDescriptor() noexcept {
            if (descriptor != nullptr) {
                /* cudnnDestroyPoolingDescriptor will not fail for a valid descriptor */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyPoolingDescriptor(descriptor));
            }
        }

        PoolingDescriptor& operator=(const PoolingDescriptor&) = delete;
        PoolingDescriptor& operator=(PoolingDescriptor&& other) noexcept {
            descriptor = other.descriptor;
            other.descriptor = nullptr;
            return *this;
        };

        cudnnPoolingDescriptor_t get() const noexcept { return descriptor; }

    private:
        template <class SequenceContainer>
        void constructor(
            const SequenceContainer& window_size,
            const SequenceContainer& padding,
            const SequenceContainer& stride,
            PoolingType type)
        {
            CV_Assert(window_size.size() == padding.size());
            CV_Assert(window_size.size() == stride.size());

            auto get_pooling_type = [] (PoolingType type) {
                switch (type) {
                case PoolingType::MAX:
                    return CUDNN_POOLING_MAX;
                case PoolingType::MAX_DETERMINISTIC:
                    return CUDNN_POOLING_MAX_DETERMINISTIC;
                case PoolingType::AVERAGE_EXCLUDE_PADDING:
                    return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
                case PoolingType::AVERAGE_INCLUDE_PADDING:
                    return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
                }
                CV_Error(Error::StsBadArg, "unknown pooling type");
            };

            CUDA4DNN_CHECK_CUDNN(cudnnCreatePoolingDescriptor(&descriptor));
            try {
                const auto rank = window_size.size();
                if (rank == 2) {
                    CUDA4DNN_CHECK_CUDNN(
                        cudnnSetPooling2dDescriptor(
                            descriptor,
                            get_pooling_type(type), CUDNN_PROPAGATE_NAN,
                            window_size[0], window_size[1],
                            padding[0], padding[1],
                            stride[0], stride[1]
                        )
                    );
                } else {
                    std::vector<int> iwindow_size(std::begin(window_size), std::end(window_size));
                    std::vector<int> ipadding(std::begin(padding), std::end(padding));
                    std::vector<int> istride(std::begin(stride), std::end(stride));
                    CUDA4DNN_CHECK_CUDNN(
                        cudnnSetPoolingNdDescriptor(
                            descriptor,
                            get_pooling_type(type), CUDNN_PROPAGATE_NAN,
                            rank, iwindow_size.data(), ipadding.data(), istride.data()
                        )
                    );
                }
            } catch (...) {
                /* cudnnDestroyPoolingDescriptor will not fail for a valid descriptor */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyPoolingDescriptor(descriptor));
                throw;
            }
        }

        cudnnPoolingDescriptor_t descriptor;
    };

    /** gives the shape of the output tensor after pooling
     *
     * @note it's not required to enforce this shape in the output tensor; slightly different shapes will work
     *
     * Exception Guarantee: Basic
     */
    template <class T> inline
    void getPoolingForwardOutputDim(
        const PoolingDescriptor& poolingDesc,
        const TensorDescriptor<T>& inputDesc,
        std::vector<int>& output_dim)
    {
        output_dim.clear();
        output_dim.resize(CUDNN_DIM_MAX); /* we use `output_dim` to hold temporaries */

        std::vector<int> temp(CUDNN_DIM_MAX);
        cudnnDataType_t tempDataType;
        CUDA4DNN_CHECK_CUDNN(
            cudnnGetTensorNdDescriptor(
                inputDesc.get(),
                CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */
                &tempDataType,
                output_dim.data(),
                temp.data(),
                temp.data()
            )
        );

        const auto rank = output_dim[0];
        output_dim.resize(rank);
        CUDA4DNN_CHECK_CUDNN(
            cudnnGetPoolingNdForwardOutputDim(poolingDesc.get(), inputDesc.get(), rank, output_dim.data())
        );
    }

    /** @brief performs pooling operation
     *
     * dstValue = alpha * result + beta * priorDstValue
     *
     * @tparam T pooling element type (must be `half` or `float`)
     *
     * @param handle         valid cuDNN Handle
     * @param poolingDesc    pooling description
     * @param inputDesc      tensor descriptor describing the input
     * @param[in] inputPtr   pointer to input tensor in device memory
     * @param alpha          result scale factor
     * @param beta           previous value scale factor
     * @param outputDesc     tensor descriptor describing the output
     * @param[out] outputPtr pointer to output tensor in device memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void pool(
        const Handle& handle,
        const PoolingDescriptor& poolingDesc,
        const TensorDescriptor<T>& inputDesc,
        const DevicePtr<const T> inputPtr,
        T alpha, T beta,
        const TensorDescriptor<T>& outputDesc,
        DevicePtr<T> outputPtr)
    {
        CV_Assert(handle);

        CUDA4DNN_CHECK_CUDNN(
            cudnnPoolingForward(
                handle.get(),
                poolingDesc.get(),
                &alpha, inputDesc.get(), inputPtr.get(),
                &beta, outputDesc.get(), outputPtr.get()
            )
        );
    }

    template <> inline
    void pool(
        const Handle& handle,
        const PoolingDescriptor& poolingDesc,
        const TensorDescriptor<half>& inputDesc,
        const DevicePtr<const half> inputPtr,
        half alpha, half beta,
        const TensorDescriptor<half>& outputDesc,
        DevicePtr<half> outputPtr)
    {
        CV_Assert(handle);

        /* we specialize for fp16 as the scaling factors must be provided as `float` */
        float alpha_ = alpha, beta_ = beta;
        CUDA4DNN_CHECK_CUDNN(
            cudnnPoolingForward(
                handle.get(),
                poolingDesc.get(),
                &alpha_, inputDesc.get(), inputPtr.get(),
                &beta_, outputDesc.get(), outputPtr.get()
            )
        );
    }

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */

#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_POOLING_HPP */
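/* A minimal usage sketch, assuming `handle`, descriptors and device buffers are
 * created elsewhere; the 2x2, stride-2 max-pool parameters are illustrative: */
namespace csl = cv::dnn::cuda4dnn::csl;

inline void pool_example(const csl::cudnn::Handle& handle,
                         const csl::cudnn::TensorDescriptor<float>& inputDesc,
                         csl::DevicePtr<const float> input,
                         const csl::cudnn::TensorDescriptor<float>& outputDesc,
                         csl::DevicePtr<float> output)
{
    std::array<int, 2> window{ { 2, 2 } }, padding{ { 0, 0 } }, stride{ { 2, 2 } };
    csl::cudnn::PoolingDescriptor poolDesc(
        window, padding, stride, csl::cudnn::PoolingDescriptor::PoolingType::MAX);

    /* the expected output shape can be queried before allocating the output tensor */
    std::vector<int> output_dim;
    csl::cudnn::getPoolingForwardOutputDim(poolDesc, inputDesc, output_dim);

    csl::cudnn::pool<float>(handle, poolDesc, inputDesc, input, 1.0f, 0.0f, outputDesc, output);
}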
68
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cudnn/softmax.hpp
vendored
Normal file
@@ -0,0 +1,68 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_SOFTMAX_HPP
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_SOFTMAX_HPP

#include "cudnn.hpp"

#include "../pointer.hpp"

#include <cudnn.h>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {

    /** @brief computes softmax (or log softmax)
     *
     * @tparam T element type (must be `half` or `float`)
     *
     * @param handle      valid cuDNN handle
     * @param outputDesc  tensor descriptor for the output
     * @param[out] output pointer to tensor in device memory
     * @param inputDesc   tensor descriptor for the input
     * @param[in] input   pointer to tensor in device memory
     * @param log         apply log on probabilities
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void softmax(const cudnn::Handle& handle,
        const TensorDescriptor<T>& outputDesc, DevicePtr<T> output,
        const TensorDescriptor<T>& inputDesc, DevicePtr<const T> input,
        bool log)
    {
        T alpha = 1.0, beta = 0.0;
        cudnnSoftmaxAlgorithm_t algo = log ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE;
        CUDA4DNN_CHECK_CUDNN(
            cudnnSoftmaxForward(
                handle.get(),
                algo, CUDNN_SOFTMAX_MODE_CHANNEL,
                &alpha, inputDesc.get(), input.get(),
                &beta, outputDesc.get(), output.get()
            )
        );
    }

    template <> inline
    void softmax(const cudnn::Handle& handle,
        const TensorDescriptor<half>& outputDesc, DevicePtr<half> output,
        const TensorDescriptor<half>& inputDesc, DevicePtr<const half> input,
        bool log)
    {
        /* we specialize for fp16 as the scaling factors must be provided as `float` */
        float alpha = 1.0, beta = 0.0;
        cudnnSoftmaxAlgorithm_t algo = log ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE;
        CUDA4DNN_CHECK_CUDNN(
            cudnnSoftmaxForward(
                handle.get(),
                algo, CUDNN_SOFTMAX_MODE_CHANNEL,
                &alpha, inputDesc.get(), input.get(),
                &beta, outputDesc.get(), output.get()
            )
        );
    }

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */

#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_SOFTMAX_HPP */
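/* A minimal usage sketch, assuming the descriptor and buffers exist elsewhere;
 * the softmax is applied channel-wise (CUDNN_SOFTMAX_MODE_CHANNEL): */
namespace csl = cv::dnn::cuda4dnn::csl;

inline void softmax_example(const csl::cudnn::Handle& handle,
                            const csl::cudnn::TensorDescriptor<float>& desc,
                            csl::DevicePtr<const float> logits,
                            csl::DevicePtr<float> probabilities)
{
    csl::cudnn::softmax<float>(handle, desc, probabilities, desc, logits, false);
    /* pass `true` instead to obtain log-probabilities in one fused call */
}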
142
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cudnn/transform.hpp
vendored
Normal file
@@ -0,0 +1,142 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSFORM_HPP
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSFORM_HPP

#include "../pointer.hpp"

#include "cudnn.hpp"

#include <cudnn.h>
#include <cstdint>
#include <vector>
#include <type_traits>
#include <iterator>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {

    /** describes a tensor transform operation
     *
     * Supported transformations:
     * - add or remove asymmetric padding
     */
    class TensorTransformDescriptor {
    public:
        TensorTransformDescriptor() noexcept : descriptor{ nullptr } { }
        TensorTransformDescriptor(const TensorTransformDescriptor&) = delete;
        TensorTransformDescriptor(TensorTransformDescriptor&& other) noexcept
            : descriptor{ other.descriptor } {
            other.descriptor = nullptr;
        }

        /** constructs a tensor transform descriptor
         *
         * Pre-conditions:
         * - \p padding_left and \p padding_right must have the same size
         *
         * The length of the containers is interpreted as the rank of the tensors which will be given.
         *
         * @note \p padding_left and \p padding_right may have negative values to remove padding
         *
         * Exception Guarantee: Basic
         */
        template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
        TensorTransformDescriptor(
            const SequenceContainer& padding_left,
            const SequenceContainer& padding_right)
        {
            constructor(padding_left, padding_right);
        }

        ~TensorTransformDescriptor() noexcept {
            if (descriptor != nullptr) {
                /* cudnnDestroyTensorTransformDescriptor will not fail for a valid descriptor */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorTransformDescriptor(descriptor));
            }
        }

        TensorTransformDescriptor& operator=(const TensorTransformDescriptor&) = delete;
        TensorTransformDescriptor& operator=(TensorTransformDescriptor&& other) noexcept {
            descriptor = other.descriptor;
            other.descriptor = nullptr;
            return *this;
        };

        cudnnTensorTransformDescriptor_t get() const noexcept { return descriptor; }

    private:
        template <class SequenceContainer>
        void constructor(
            const SequenceContainer& padding_left,
            const SequenceContainer& padding_right
        )
        {
            CV_Assert(padding_left.size() == padding_right.size());

            auto ipadding_left = std::vector<int32_t>(std::begin(padding_left), std::end(padding_left));
            auto ipadding_right = std::vector<int32_t>(std::begin(padding_right), std::end(padding_right));
            CUDA4DNN_CHECK_CUDNN(cudnnCreateTensorTransformDescriptor(&descriptor));
            try {
                CUDA4DNN_CHECK_CUDNN(
                    cudnnSetTensorTransformDescriptor(
                        descriptor,
                        ipadding_left.size(), CUDNN_TENSOR_NCHW,
                        ipadding_left.data(), ipadding_right.data(),
                        NULL, CUDNN_TRANSFORM_FOLD
                    )
                );
            } catch (...) {
                /* cudnnDestroyTensorTransformDescriptor will not fail for a valid descriptor */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorTransformDescriptor(descriptor));
                throw;
            }
        }

        cudnnTensorTransformDescriptor_t descriptor;
    };

    template <class T>
    void transform(
        const Handle& handle,
        const TensorTransformDescriptor& transDesc,
        const TensorDescriptor<T>& inputDesc,
        DevicePtr<const T> inputPtr,
        const TensorDescriptor<T>& outputDesc,
        DevicePtr<T> outputPtr)
    {
        T alpha = 1.0, beta = 0.0;
        CUDA4DNN_CHECK_CUDNN(
            cudnnTransformTensorEx(
                handle.get(),
                transDesc.get(),
                &alpha, inputDesc.get(), inputPtr.get(),
                &beta, outputDesc.get(), outputPtr.get()
            )
        );
    }

    template <> inline
    void transform(
        const Handle& handle,
        const TensorTransformDescriptor& transDesc,
        const TensorDescriptor<half>& inputDesc,
        DevicePtr<const half> inputPtr,
        const TensorDescriptor<half>& outputDesc,
        DevicePtr<half> outputPtr)
    {
        /* we specialize for fp16 as the scaling factors must be provided as `float` */
        float alpha = 1.0, beta = 0.0;
        CUDA4DNN_CHECK_CUDNN(
            cudnnTransformTensorEx(
                handle.get(),
                transDesc.get(),
                &alpha, inputDesc.get(), inputPtr.get(),
                &beta, outputDesc.get(), outputPtr.get()
            )
        );
    }

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */

#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSFORM_HPP */
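/* A minimal usage sketch: removing one element of padding on each spatial border
 * of an NCHW tensor (negative values remove padding, per the note above); the
 * descriptors and buffers are assumed to be created elsewhere: */
namespace csl = cv::dnn::cuda4dnn::csl;

inline void transform_example(const csl::cudnn::Handle& handle,
                              const csl::cudnn::TensorDescriptor<float>& inputDesc,
                              csl::DevicePtr<const float> input,
                              const csl::cudnn::TensorDescriptor<float>& outputDesc,
                              csl::DevicePtr<float> output)
{
    std::vector<int> padding_left{ 0, 0, -1, -1 }, padding_right{ 0, 0, -1, -1 };
    csl::cudnn::TensorTransformDescriptor transDesc(padding_left, padding_right);
    csl::cudnn::transform<float>(handle, transDesc, inputDesc, input, outputDesc, output);
}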
183
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cudnn/transpose_convolution.hpp
vendored
Normal file
@@ -0,0 +1,183 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP

#include "cudnn.hpp"
#include "convolution.hpp"

#include "../pointer.hpp"
#include "../workspace.hpp"

#include <cudnn.h>

#include <cstddef>
#include <vector>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {

    /** wrapper around a transpose convolution algorithm
     *
     * @tparam T type of elements being transpose-convolved
     */
    template <class T>
    class TransposeConvolutionAlgorithm {
    public:
        TransposeConvolutionAlgorithm() noexcept : workspace_size{ 0 } { }
        TransposeConvolutionAlgorithm(TransposeConvolutionAlgorithm&) = default;
        TransposeConvolutionAlgorithm(TransposeConvolutionAlgorithm&&) = default;

        TransposeConvolutionAlgorithm(
            const Handle& handle,
            const ConvolutionDescriptor<T>& convDesc,
            const FilterDescriptor<T>& filterDesc,
            const TensorDescriptor<T>& inputDesc,
            const TensorDescriptor<T>& outputDesc)
        {
#if CUDNN_MAJOR >= 8
            int requestedAlgoCount = 0, returnedAlgoCount = 0;
            CUDA4DNN_CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(handle.get(), &requestedAlgoCount));
            std::vector<cudnnConvolutionBwdDataAlgoPerf_t> results(requestedAlgoCount);
            CUDA4DNN_CHECK_CUDNN(
                cudnnGetConvolutionBackwardDataAlgorithm_v7(
                    handle.get(),
                    filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
                    requestedAlgoCount,
                    &returnedAlgoCount,
                    &results[0]
                )
            );

            size_t free_memory, total_memory;
            CUDA4DNN_CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));

            bool found_conv_algorithm = false;
            for (int i = 0; i < returnedAlgoCount; i++)
            {
                if (results[i].status == CUDNN_STATUS_SUCCESS &&
                    results[i].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED &&
                    results[i].memory < free_memory)
                {
                    found_conv_algorithm = true;
                    dalgo = results[i].algo;
                    workspace_size = results[i].memory;
                    break;
                }
            }

            if (!found_conv_algorithm)
                CV_Error(cv::Error::GpuApiCallError, "cuDNN did not return a suitable algorithm for transpose convolution.");
#else
            CUDA4DNN_CHECK_CUDNN(
                cudnnGetConvolutionBackwardDataAlgorithm(
                    handle.get(),
                    filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
                    CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
                    0, /* no memory limit */
                    &dalgo
                )
            );

            CUDA4DNN_CHECK_CUDNN(
                cudnnGetConvolutionBackwardDataWorkspaceSize(
                    handle.get(),
                    filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
                    dalgo, &workspace_size
                )
            );
#endif
        }

        TransposeConvolutionAlgorithm& operator=(const TransposeConvolutionAlgorithm&) = default;
        TransposeConvolutionAlgorithm& operator=(TransposeConvolutionAlgorithm&& other) = default;

        cudnnConvolutionBwdDataAlgo_t get() const noexcept { return dalgo; }

        std::size_t get_workspace_size() const noexcept { return workspace_size; }

    private:
        cudnnConvolutionBwdDataAlgo_t dalgo;
        std::size_t workspace_size;
    };

    /** @brief performs transpose convolution
     *
     * dstValue = alpha * result + beta * priorDstValue
     *
     * @tparam T transpose convolution element type (must be `half` or `float`)
     *
     * @param handle         valid cuDNN Handle
     * @param convDesc       convolution description
     * @param transConvAlgo  algorithm to use for convolution
     * @param workspace      workspace memory which meets the requirements of \p transConvAlgo
     * @param filterDesc     filter descriptor
     * @param[in] filterPtr  pointer to device memory containing the filters
     * @param inputDesc      tensor descriptor describing the input
     * @param[in] inputPtr   pointer to input tensor in device memory
     * @param alpha          result scale factor
     * @param beta           previous value scale factor
     * @param outputDesc     tensor descriptor describing the output
     * @param[out] outputPtr pointer to output tensor in device memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void transpose_convolve(
        const Handle& handle,
        const ConvolutionDescriptor<T>& convDesc,
        const TransposeConvolutionAlgorithm<T>& transConvAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<T>& filterDesc,
        DevicePtr<const T> filterPtr,
        const TensorDescriptor<T>& inputDesc,
        DevicePtr<const T> inputPtr,
        T alpha, T beta,
        const TensorDescriptor<T>& outputDesc,
        DevicePtr<T> outputPtr)
    {
        CUDA4DNN_CHECK_CUDNN(
            cudnnConvolutionBackwardData(
                handle.get(),
                &alpha,
                filterDesc.get(), filterPtr.get(),
                inputDesc.get(), inputPtr.get(),
                convDesc.get(), transConvAlgo.get(),
                static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
                &beta, outputDesc.get(), outputPtr.get()
            )
        );
    }

    template <> inline
    void transpose_convolve(
        const Handle& handle,
        const ConvolutionDescriptor<half>& convDesc,
        const TransposeConvolutionAlgorithm<half>& convAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<half>& filterDesc,
        DevicePtr<const half> filterPtr,
        const TensorDescriptor<half>& inputDesc,
        DevicePtr<const half> inputPtr,
        half alpha, half beta,
        const TensorDescriptor<half>& outputDesc,
        DevicePtr<half> outputPtr)
    {
        /* we specialize for fp16 as the scaling factors must be provided as `float` */
        float alpha_ = alpha, beta_ = beta;
        CUDA4DNN_CHECK_CUDNN(
            cudnnConvolutionBackwardData(
                handle.get(),
                &alpha_,
                filterDesc.get(), filterPtr.get(),
                inputDesc.get(), inputPtr.get(),
                convDesc.get(), convAlgo.get(),
                static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
                &beta_, outputDesc.get(), outputPtr.get()
            )
        );
    }

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */

#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP */
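/* A minimal usage sketch: the algorithm object is built once per shape
 * configuration, the caller sizes the workspace from it, and the call then runs
 * cuDNN's backward-data kernel as a transpose convolution; all descriptors and
 * buffers are assumed to exist elsewhere: */
namespace csl = cv::dnn::cuda4dnn::csl;

inline void transpose_convolve_example(
    const csl::cudnn::Handle& handle,
    const csl::cudnn::ConvolutionDescriptor<float>& convDesc,
    const csl::cudnn::FilterDescriptor<float>& filterDesc,
    csl::DevicePtr<const float> filters,
    const csl::cudnn::TensorDescriptor<float>& inputDesc,
    csl::DevicePtr<const float> input,
    const csl::cudnn::TensorDescriptor<float>& outputDesc,
    csl::DevicePtr<float> output,
    csl::WorkspaceInstance workspace)
{
    csl::cudnn::TransposeConvolutionAlgorithm<float> algo(
        handle, convDesc, filterDesc, inputDesc, outputDesc);
    /* `workspace` must provide at least algo.get_workspace_size() bytes */
    csl::cudnn::transpose_convolve<float>(handle, convDesc, algo, workspace,
        filterDesc, filters, inputDesc, input, 1.0f, 0.0f, outputDesc, output);
}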
30
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/error.hpp
vendored
Normal file
@@ -0,0 +1,30 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_ERROR_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_ERROR_HPP

#include <opencv2/core.hpp>

#include <cuda_runtime_api.h>

#define CUDA4DNN_CHECK_CUDA(call) \
    ::cv::dnn::cuda4dnn::csl::detail::check((call), CV_Func, __FILE__, __LINE__)

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
    /** @brief exception class for errors thrown by the CUDA APIs */
    class CUDAException : public cv::Exception {
    public:
        using cv::Exception::Exception;
    };

    namespace detail {
        inline void check(cudaError_t err, const char* func, const char* file, int line) {
            if (err != cudaSuccess)
                throw CUDAException(Error::GpuApiCallError, cudaGetErrorString(err), func, file, line);
        }
    }
}}}} /* namespace cv::dnn::cuda4dnn::csl */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_ERROR_HPP */
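/* A minimal usage sketch: wrapping raw CUDA runtime calls so that failures
 * surface as CUDAException (a cv::Exception) carrying function, file and line: */
inline void check_example()
{
    void* ptr = nullptr;
    CUDA4DNN_CHECK_CUDA(cudaMalloc(&ptr, 1024)); /* throws CUDAException on failure */
    CUDA4DNN_CHECK_CUDA(cudaFree(ptr));
}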
103
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/event.hpp
vendored
Normal file
@@ -0,0 +1,103 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_EVENT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_EVENT_HPP

#include "error.hpp"
#include "stream.hpp"

#include <opencv2/core/utils/logger.hpp>

#include <cuda_runtime_api.h>

#include <sstream>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {

    /** @brief sharable CUDA event
     *
     * Event is a smart sharable wrapper for CUDA event handle which ensures that
     * the handle is destroyed after use.
     *
     * @note Moving an Event object to another invalidates the former
     */
    class Event {
    public:
        Event() noexcept : event{ nullptr } { }
        Event(const Event&) = delete;
        Event(Event&& other) noexcept
            : event{ other.event } {
            other.event = nullptr;
        }

        /** if \p create is `true`, a new event will be created; otherwise, an empty event object is created */
        Event(bool create, bool timing_event = false) : event{nullptr} {
            if (create) {
                unsigned int flags = (timing_event ? 0 : cudaEventDisableTiming);
                CUDA4DNN_CHECK_CUDA(cudaEventCreateWithFlags(&event, flags));
            }
        }

        ~Event() {
            try {
                if (event != nullptr)
                    CUDA4DNN_CHECK_CUDA(cudaEventDestroy(event));
            } catch (const CUDAException& ex) {
                std::ostringstream os;
                os << "Asynchronous exception caught during CUDA event destruction.\n";
                os << ex.what();
                os << "Exception will be ignored.\n";
                CV_LOG_WARNING(0, os.str().c_str());
            }
        }

        Event& operator=(const Event&) noexcept = delete;
        Event& operator=(Event&& other) noexcept {
            event = other.event;
            other.event = nullptr;
            return *this;
        }

        /** mark a point in \p stream */
        void record(const Stream& stream) {
            CV_Assert(stream);
            CUDA4DNN_CHECK_CUDA(cudaEventRecord(event, stream.get()));
        }

        /** blocks the caller thread until all operations before the event finish */
        void synchronize() const { CUDA4DNN_CHECK_CUDA(cudaEventSynchronize(event)); }

        /** returns true if there are operations pending before the event completes */
        bool busy() const {
            auto status = cudaEventQuery(event);
            if (status == cudaErrorNotReady)
                return true;
            CUDA4DNN_CHECK_CUDA(status);
            return false;
        }

        cudaEvent_t get() const noexcept { return event; }

        /** returns true if the event is valid */
        explicit operator bool() const noexcept { return event; }

    private:
        cudaEvent_t event;
    };

    /** makes a stream wait on an event */
    inline void StreamWaitOnEvent(const Stream& stream, const Event& event) {
        CV_Assert(stream);
        CUDA4DNN_CHECK_CUDA(cudaStreamWaitEvent(stream.get(), event.get(), 0));
    }

    /** returns the time elapsed between two events in milliseconds */
    inline float TimeElapsedBetweenEvents(const Event& start, const Event& end) {
        float temp;
        CUDA4DNN_CHECK_CUDA(cudaEventElapsedTime(&temp, start.get(), end.get()));
        return temp;
    }

}}}} /* namespace cv::dnn::cuda4dnn::csl */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_EVENT_HPP */
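/* A minimal usage sketch: timing a stretch of work on a stream with a pair of
 * timing-enabled events (`stream` is assumed to be a valid non-default Stream): */
namespace csl = cv::dnn::cuda4dnn::csl;

inline float time_example(const csl::Stream& stream)
{
    csl::Event start(true, true), stop(true, true); /* create = true, timing_event = true */
    start.record(stream);
    /* ... enqueue kernels / memory transfers on `stream` here ... */
    stop.record(stream);
    stop.synchronize(); /* block until everything recorded before `stop` has finished */
    return csl::TimeElapsedBetweenEvents(start, stop); /* milliseconds */
}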
303
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/memory.hpp
vendored
Normal file
@@ -0,0 +1,303 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP

#include "error.hpp"
#include "pointer.hpp"

#include <opencv2/core.hpp>
#include <opencv2/core/utils/logger.hpp>

#include <cuda_runtime_api.h>

#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <memory>
#include <utility>
#include <algorithm>
#include <sstream>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {

    /** @brief smart device pointer with allocation/deallocation methods
     *
     * ManagedPtr is a smart shared device pointer which also handles memory allocation.
     */
    template <class T>
    class ManagedPtr {
        static_assert(!std::is_const<T>::value && !std::is_volatile<T>::value, "T cannot be cv-qualified");
        static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType");

    public:
        using element_type = T;

        using pointer = DevicePtr<element_type>;
        using const_pointer = DevicePtr<typename std::add_const<element_type>::type>;

        using size_type = std::size_t;

        ManagedPtr() noexcept : wrapped{ nullptr }, n{ 0 }, capacity{ 0 } { }
        ManagedPtr(const ManagedPtr&) noexcept = default;
        ManagedPtr(ManagedPtr&& other) noexcept
            : wrapped{ std::move(other.wrapped) }, n{ other.n }, capacity { other.capacity }
        {
            other.reset();
        }

        /** allocates device memory for \p count number of elements */
        ManagedPtr(size_type count) {
            if (count <= 0) {
                CV_Error(Error::StsBadArg, "number of elements is zero or negative");
            }

            void* temp = nullptr;
            CUDA4DNN_CHECK_CUDA(cudaMalloc(&temp, count * sizeof(element_type)));

            auto ptr = typename pointer::pointer(static_cast<element_type*>(temp));
            wrapped.reset(ptr, [](element_type* ptr) {
                if (ptr != nullptr) {
                    /* contract violation for std::shared_ptr if cudaFree throws */
                    try {
                        CUDA4DNN_CHECK_CUDA(cudaFree(ptr));
                    } catch (const CUDAException& ex) {
                        std::ostringstream os;
                        os << "Device memory deallocation failed in deleter.\n";
                        os << ex.what();
                        os << "Exception will be ignored.\n";
                        CV_LOG_WARNING(0, os.str().c_str());
                    }
                }
            });
            /* std::shared_ptr<T>::reset invokes the deleter if an exception occurs; hence, we don't
             * need to have a try-catch block to free the allocated device memory
             */

            n = capacity = count;
        }

        ManagedPtr& operator=(ManagedPtr&& other) noexcept {
            wrapped = std::move(other.wrapped);
            n = other.n;
            capacity = other.capacity;

            other.reset();
            return *this;
        }

        size_type size() const noexcept { return n; }

        void reset() noexcept { wrapped.reset(); n = capacity = 0; }

        /**
         * deallocates any previously allocated memory and allocates device memory
         * for \p count number of elements
         *
         * @note no reallocation if the previously allocated memory has no owners and the requested memory size fits in it
         * @note use move constructor to guarantee a deallocation of the previously allocated memory
         *
         * Exception Guarantee: Strong
         */
        void reset(size_type count) {
            /* we need to fully own the memory to perform optimizations */
            if (wrapped.use_count() == 1) {
                /* avoid reallocation if the existing capacity is sufficient */
                if (count <= capacity) {
                    n = count;
                    return;
                }
            }

            /* no optimization performed; allocate memory */
            ManagedPtr tmp(count);
            swap(tmp, *this);
        }

        pointer get() const noexcept { return pointer(wrapped.get()); }

        explicit operator bool() const noexcept { return static_cast<bool>(wrapped); }

        friend bool operator==(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped == rhs.wrapped; }
        friend bool operator!=(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped != rhs.wrapped; }

        friend void swap(ManagedPtr& lhs, ManagedPtr& rhs) noexcept {
            using std::swap;
            swap(lhs.wrapped, rhs.wrapped);
            swap(lhs.n, rhs.n);
            swap(lhs.capacity, rhs.capacity);
        }

    private:
        std::shared_ptr<element_type> wrapped;
        size_type n, capacity;
    };

    /** copies the entire memory block pointed to by \p src to \p dest
     *
     * \param[in] src device pointer
     * \param[out] dest host pointer
     *
     * Pre-conditions:
     * - memory pointed to by \p dest must be large enough to hold the entire block of memory held by \p src
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(T *dest, const ManagedPtr<T>& src) {
        memcpy<T>(dest, src.get(), src.size());
    }

    /** copies data from memory pointed to by \p src to fully fill \p dest
     *
     * \param[in] src host pointer
     * \param[out] dest device pointer
     *
     * Pre-conditions:
     * - memory pointed to by \p src must be at least as big as the memory block held by \p dest
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(const ManagedPtr<T>& dest, const T* src) {
        memcpy<T>(dest.get(), src, dest.size());
    }

    /** copies data from memory pointed to by \p src to \p dest
     *
     * if \p src and \p dest have different sizes, the number of elements copied is
     * equal to the size of the smaller memory block
     *
     * \param[in] src device pointer
     * \param[out] dest device pointer
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(const ManagedPtr<T>& dest, const ManagedPtr<T>& src) {
        memcpy<T>(dest.get(), src.get(), std::min(dest.size(), src.size()));
    }

    /** sets the device memory block to a specific 8-bit value
     *
     * \param[out] dest device pointer
     * \param ch 8-bit value to fill the device memory with
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memset(const ManagedPtr<T>& dest, std::int8_t ch) {
        memset<T>(dest.get(), ch, dest.size());
    }

    /** copies the entire memory block pointed to by \p src to \p dest asynchronously
     *
     * \param[in] src device pointer
     * \param[out] dest host pointer
     * \param stream CUDA stream that has to be used for the memory transfer
     *
     * Pre-conditions:
     * - memory pointed to by \p dest must be large enough to hold the entire block of memory held by \p src
     * - \p dest points to page-locked memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(T *dest, const ManagedPtr<T>& src, const Stream& stream) {
        CV_Assert(stream);
        memcpy<T>(dest, src.get(), src.size(), stream);
    }

    /** copies data from memory pointed to by \p src to \p dest asynchronously
     *
     * \param[in] src host pointer
     * \param[out] dest device pointer
     * \param stream CUDA stream that has to be used for the memory transfer
     *
     * Pre-conditions:
     * - memory pointed to by \p dest must be large enough to hold the entire block of memory held by \p src
     * - \p src points to page-locked memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(const ManagedPtr<T>& dest, const T* src, const Stream& stream) {
        CV_Assert(stream);
        memcpy<T>(dest.get(), src, dest.size(), stream);
    }

    /** copies data from memory pointed to by \p src to \p dest asynchronously
     *
     * \param[in] src device pointer
     * \param[out] dest device pointer
     * \param stream CUDA stream that has to be used for the memory transfer
     *
     * if \p src and \p dest have different sizes, the number of elements copied is
     * equal to the size of the smaller memory block
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(ManagedPtr<T>& dest, const ManagedPtr<T>& src, const Stream& stream) {
        CV_Assert(stream);
        memcpy<T>(dest.get(), src.get(), std::min(dest.size(), src.size()), stream);
    }

    /** sets the device memory block to a specific 8-bit value asynchronously
     *
     * \param[out] dest device pointer
     * \param ch 8-bit value to fill the device memory with
     * \param stream CUDA stream that has to be used for the memory operation
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memset(const ManagedPtr<T>& dest, int ch, const Stream& stream) {
        CV_Assert(stream);
        memset<T>(dest.get(), ch, dest.size(), stream);
    }

    /** @brief registers host memory as page-locked and unregisters on destruction */
    class MemoryLockGuard {
    public:
        MemoryLockGuard() noexcept : ptr { nullptr } { }
        MemoryLockGuard(const MemoryLockGuard&) = delete;
        MemoryLockGuard(MemoryLockGuard&& other) noexcept : ptr{ other.ptr } {
            other.ptr = nullptr;
        }

        /** page-locks \p size_in_bytes bytes of memory starting from \p ptr_
         *
         * Pre-conditions:
         * - host memory should be unregistered
         */
        MemoryLockGuard(void* ptr_, std::size_t size_in_bytes) {
            CUDA4DNN_CHECK_CUDA(cudaHostRegister(ptr_, size_in_bytes, cudaHostRegisterPortable));
            ptr = ptr_;
        }

        MemoryLockGuard& operator=(const MemoryLockGuard&) = delete;
        MemoryLockGuard& operator=(MemoryLockGuard&& other) noexcept {
            if (&other != this) {
                if(ptr != nullptr) {
                    /* cudaHostUnregister does not throw for a valid ptr */
                    CUDA4DNN_CHECK_CUDA(cudaHostUnregister(ptr));
                }
                ptr = other.ptr;
                other.ptr = nullptr;
            }
            return *this;
        }

        ~MemoryLockGuard() {
            if(ptr != nullptr) {
                /* cudaHostUnregister does not throw for a valid ptr */
                CUDA4DNN_CHECK_CUDA(cudaHostUnregister(ptr));
            }
        }

    private:
        void *ptr;
    };

}}}} /* namespace cv::dnn::cuda4dnn::csl */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP */
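/* A minimal usage sketch: allocate a device buffer, page-lock a host staging
 * buffer, and run asynchronous transfers on a stream (all names illustrative;
 * `stream` is assumed to be a valid non-default Stream; requires <vector>): */
namespace csl = cv::dnn::cuda4dnn::csl;

inline void managed_ptr_example(const csl::Stream& stream)
{
    csl::ManagedPtr<float> device_buf(1024);          /* allocates 1024 floats on the device */

    std::vector<float> host_buf(device_buf.size(), 0.f);
    /* async copies require page-locked host memory; the guard unregisters on scope exit */
    csl::MemoryLockGuard lock(host_buf.data(), host_buf.size() * sizeof(float));

    csl::memcpy(device_buf, host_buf.data(), stream); /* host -> device */
    csl::memcpy(host_buf.data(), device_buf, stream); /* device -> host */
    /* synchronize the stream before reading host_buf on the host */
}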
20
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/nvcc_defs.hpp
vendored
Normal file
@@ -0,0 +1,20 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_NVCC_DEFS_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_NVCC_DEFS_HPP

#include <cuda_runtime_api.h>

#ifdef __CUDACC__
# define CUDA4DNN_HOST __host__
# define CUDA4DNN_DEVICE __device__
# define CUDA4DNN_HOST_DEVICE CUDA4DNN_HOST CUDA4DNN_DEVICE
#else
# define CUDA4DNN_HOST
# define CUDA4DNN_DEVICE
# define CUDA4DNN_HOST_DEVICE
#endif

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_NVCC_DEFS_HPP */
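/* A minimal usage sketch: one definition usable from both host and device code
 * when compiled by nvcc, and as ordinary C++ otherwise: */
CUDA4DNN_HOST_DEVICE inline int clamp_example(int x, int lo, int hi)
{
    return x < lo ? lo : (x > hi ? hi : x);
}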
411
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/pointer.hpp
vendored
Normal file
@@ -0,0 +1,411 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP

#include "nvcc_defs.hpp"
#include "error.hpp"
#include "stream.hpp"

#include <opencv2/core.hpp>

#include <cuda_runtime_api.h>

#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <ostream>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {

    /** @brief provides a type-safe device pointer
     *
     * DevicePtr wraps a raw pointer and mimics its behaviour. It does not implicitly convert
     * to a raw pointer. This ensures that accidental mixing of host and device pointers does not happen.
     *
     * It is meant to point to locations in device memory. Hence, it provides dereferencing or
     * array subscript capability for device code only.
     *
     * A `const DevicePtr<T>` represents an immutable pointer to a mutable memory.
     * A `DevicePtr<const T>` represents a mutable pointer to an immutable memory.
     * A `const DevicePtr<const T>` represents an immutable pointer to an immutable memory.
     *
     * A `DevicePtr<T>` can implicitly convert to `DevicePtr<const T>`.
     *
     * Specializations:
     * - DevicePtr<void>/DevicePtr<const void> do not support pointer arithmetic (but relational operators are provided)
     * - any device pointer pointing to mutable memory is implicitly convertible to DevicePtr<void>
     * - any device pointer is implicitly convertible to DevicePtr<const void>
     * - DevicePtr<void> can be explicitly converted to any device pointer
     * - DevicePtr<const void> can be explicitly converted to any device pointer pointing to immutable memory
     */
    template <class T>
    class DevicePtr {
        static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType");

    public:
        using element_type = T;
        using difference_type = std::ptrdiff_t;
        using pointer = typename std::add_pointer<element_type>::type;
        using reference = typename std::add_lvalue_reference<element_type>::type;

        DevicePtr() = default;
        CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { }

        CUDA4DNN_HOST_DEVICE DevicePtr operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; }

        CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; };

        CUDA4DNN_DEVICE reference operator[](difference_type idx) const noexcept { return get()[idx]; }
        CUDA4DNN_DEVICE reference operator*() const noexcept { return *get(); }
        CUDA4DNN_DEVICE pointer operator->() const noexcept { return get(); }

        template<class U = T, typename std::enable_if<!std::is_const<U>::value, bool>::type = true>
        CUDA4DNN_HOST_DEVICE operator DevicePtr<typename std::add_const<U>::type>() const noexcept {
            return DevicePtr<typename std::add_const<U>::type>{ptr};
        }

        CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; }

        CUDA4DNN_HOST_DEVICE DevicePtr operator++() noexcept {
            ++ptr;
            return *this;
        }

        CUDA4DNN_HOST_DEVICE DevicePtr operator++(int) noexcept {
            auto tmp = DevicePtr(*this);
            ptr++;
            return tmp;
        }

        CUDA4DNN_HOST_DEVICE DevicePtr operator--() noexcept {
            --ptr;
            return *this;
        }

        CUDA4DNN_HOST_DEVICE DevicePtr operator--(int) noexcept {
            auto tmp = DevicePtr(*this);
            ptr--;
            return tmp;
        }

        CUDA4DNN_HOST_DEVICE DevicePtr operator+=(std::ptrdiff_t offset) noexcept {
            ptr += offset;
            return *this;
        }

        CUDA4DNN_HOST_DEVICE DevicePtr operator-=(std::ptrdiff_t offset) noexcept {
            ptr -= offset;
            return *this;
        }

        CUDA4DNN_HOST_DEVICE friend DevicePtr operator+(DevicePtr lhs, std::ptrdiff_t offset) noexcept {
            return lhs += offset;
        }

        CUDA4DNN_HOST_DEVICE friend DevicePtr operator-(DevicePtr lhs, std::ptrdiff_t offset) noexcept {
            return lhs -= offset;
        }

        CUDA4DNN_HOST_DEVICE friend difference_type operator-(DevicePtr lhs, DevicePtr rhs) noexcept {
            return lhs.ptr - rhs.ptr;
        }

        CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; }
        CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); }
        CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; }
        CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; }
        CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); }
        CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); }

        CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; }

        CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept {
            using std::swap;
            swap(lhs.ptr, rhs.ptr);
        }

        template <class U, class V>
        CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) {
            os << other.get() << " (device)";
            return os;
        }

    private:
        pointer ptr;
    };

    template <>
    class DevicePtr<const void> {
    public:
        using element_type = const void;
        using pointer = typename std::add_pointer<element_type>::type;

        DevicePtr() = default;

        /* host const void pointer to const void device pointer */
        CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { }

        /* allow any device pointer to be implicitly converted to void device pointer */
        template <class T>
        CUDA4DNN_HOST_DEVICE DevicePtr(DevicePtr<T> ptr_) noexcept : ptr{ ptr_.get() } { }

        CUDA4DNN_HOST_DEVICE DevicePtr operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; }

        CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; };

        CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; }

        CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; }
        CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); }
        CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; }
        CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; }
        CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); }
        CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); }

        /* explicit conversion into host void pointer */
        CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; }

        /* const void device pointer can be explicitly casted into any const device pointer type */
        template <class T, typename std::enable_if<std::is_const<T>::value, bool>::type = true>
        CUDA4DNN_HOST_DEVICE explicit operator DevicePtr<T>() const noexcept {
            return DevicePtr<T>(static_cast<T*>(ptr));
        }

        CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept {
            using std::swap;
            swap(lhs.ptr, rhs.ptr);
        }

        template <class U, class V>
        CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) {
            os << other.get() << " (device)";
            return os;
        }

    private:
        pointer ptr;
    };

    template <>
    class DevicePtr<void> {
    public:
        using element_type = void;
        using pointer = typename std::add_pointer<element_type>::type;

        DevicePtr() = default;

        /* host pointer to device pointer */
        CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { }

        /* allow any device pointer to mutable memory to be implicitly converted to void device pointer */
        template <class T, typename std::enable_if<!std::is_const<T>::value, bool>::type = false>
        CUDA4DNN_HOST_DEVICE DevicePtr(DevicePtr<T> ptr_) noexcept : ptr { ptr_.get() } { }

        CUDA4DNN_HOST_DEVICE DevicePtr operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; }

        CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; };

        CUDA4DNN_HOST_DEVICE operator DevicePtr<const void>() const noexcept { return DevicePtr<const void>{ptr}; }

        CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; }

        CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; }
        CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); }
        CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; }
        CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; }
        CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); }
        CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); }

        /* explicit conversion into host void pointer */
        CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; }

        /* void device pointer can be explicitly casted into any device pointer type */
        template <class T>
        CUDA4DNN_HOST_DEVICE explicit operator DevicePtr<T>() const noexcept {
            return DevicePtr<T>(static_cast<T*>(ptr));
        }

        CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept {
            using std::swap;
            swap(lhs.ptr, rhs.ptr);
        }

        template <class U, class V>
        CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) {
            os << other.get() << " (device)";
            return os;
        }

    private:
        pointer ptr;
    };

    template <class T>
    bool is_aligned(DevicePtr<const T> ptr, std::size_t alignment) {
        auto addr = reinterpret_cast<std::intptr_t>(ptr.get());
        return addr % alignment == 0;
    }
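
    /* A minimal usage sketch (within this namespace): the conversions below
     * mirror the rules documented above; `raw` is assumed to come from a device
     * allocation such as cudaMalloc: */
    inline bool device_ptr_example(float* raw)
    {
        DevicePtr<float> ptr{ raw };
        DevicePtr<const float> cptr = ptr;                            /* T -> const T: implicit */
        DevicePtr<void> vptr = ptr;                                   /* mutable T -> void: implicit */
        DevicePtr<float> back = static_cast<DevicePtr<float>>(vptr);  /* void -> T: explicit */
        (void)back;
        return is_aligned(cptr, 16);                                  /* e.g. check 16-byte alignment */
    }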

    /** copies \p n elements from \p src to \p dest
     *
     * \param[in] src device pointer
     * \param[out] dest host pointer
     *
     * Pre-conditions:
     * - memory pointed to by \p dest and \p src must be large enough to hold \p n elements
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(T *dest, DevicePtr<const T> src, std::size_t n) {
        if (n <= 0) {
            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
        }

        CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest, src.get(), n * sizeof(T), cudaMemcpyDefault));
    }

    /** copies \p n elements from \p src to \p dest
     *
     * \param[in] src host pointer
     * \param[out] dest device pointer
     *
     * Pre-conditions:
     * - memory pointed to by \p dest and \p src must be large enough to hold \p n elements
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(DevicePtr<T> dest, const T* src, std::size_t n) {
        if (n <= 0) {
            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
        }

        CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest.get(), src, n * sizeof(T), cudaMemcpyDefault));
    }

    /** copies \p n elements from \p src to \p dest
     *
     * \param[in] src device pointer
     * \param[out] dest device pointer
     *
     * Pre-conditions:
     * - memory pointed to by \p dest and \p src must be large enough to hold \p n elements
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(DevicePtr<T> dest, DevicePtr<const T> src, std::size_t n) {
        if (n <= 0) {
            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
        }

        CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest.get(), src.get(), n * sizeof(T), cudaMemcpyDefault));
    }

    /** sets \p n elements to \p ch in \p dest
     *
     * \param[out] dest device pointer
     * \param ch 8-bit value to fill the device memory with
     *
     * Pre-conditions:
     * - memory pointed to by \p dest must be large enough to hold \p n elements
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memset(DevicePtr<T> dest, std::int8_t ch, std::size_t n) {
        if (n <= 0) {
            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
        }

        CUDA4DNN_CHECK_CUDA(cudaMemset(dest.get(), ch, n * sizeof(T)));
    }

    /** copies \p n elements from \p src to \p dest asynchronously
     *
     * \param[in] src device pointer
     * \param[out] dest host pointer
     * \param stream CUDA stream that has to be used for the memory transfer
     *
     * Pre-conditions:
     * - memory pointed to by \p dest and \p src must be large enough to hold \p n elements
     * - \p dest points to page-locked memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(T *dest, DevicePtr<const T> src, std::size_t n, const Stream& stream) {
        if (n <= 0) {
            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
        }

        CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest, src.get(), n * sizeof(T), cudaMemcpyDefault, stream.get()));
    }

    /** copies \p n elements from \p src to \p dest asynchronously
     *
     * \param[in] src host pointer
     * \param[out] dest device pointer
     * \param stream CUDA stream that has to be used for the memory transfer
     *
     * Pre-conditions:
     * - memory pointed to by \p dest and \p src must be large enough to hold \p n elements
     * - \p src points to page-locked memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(DevicePtr<T> dest, const T *src, std::size_t n, const Stream& stream) {
        if (n <= 0) {
            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
        }

        CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest.get(), src, n * sizeof(T), cudaMemcpyDefault, stream.get()));
    }

    /** copies \p n elements from \p src to \p dest asynchronously
     *
     * \param[in] src device pointer
     * \param[out] dest device pointer
     * \param stream CUDA stream that has to be used for the memory transfer
     *
     * Pre-conditions:
     * - memory pointed to by \p dest and \p src must be large enough to hold \p n elements
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(DevicePtr<T> dest, DevicePtr<const T> src, std::size_t n, const Stream& stream) {
        if (n <= 0) {
            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
        }

        CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest.get(), src.get(), n * sizeof(T), cudaMemcpyDefault, stream.get()));
    }

    /** sets \p n elements to \p ch in \p dest asynchronously
     *
     * \param[out] dest device pointer
     * \param ch 8-bit value to fill the device memory with
     * \param stream CUDA stream that has to be used for the memory operation
     *
     * Pre-conditions:
     * - memory pointed to by \p dest must be large enough to hold \p n elements
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memset(DevicePtr<T> dest, std::int8_t ch, std::size_t n, const Stream& stream) {
        if (n <= 0) {
            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
        }

        CUDA4DNN_CHECK_CUDA(cudaMemsetAsync(dest.get(), ch, n * sizeof(T), stream.get()));
|
||||
}
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::csl */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP */
|
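A minimal usage sketch of the copy/fill helpers above (illustrative only: `demo`, the buffer, and the sizes are hypothetical, and the DevicePtr constructor from a raw pointer is assumed from the rest of pointer.hpp):

#include <cuda_runtime.h>
#include <vector>
#include <cstddef>

using namespace cv::dnn::cuda4dnn::csl;

int demo() {
    constexpr std::size_t n = 1024;
    std::vector<float> src(n, 1.0f), dst(n);

    float* raw = nullptr;
    if (cudaMalloc(&raw, n * sizeof(float)) != cudaSuccess)
        return -1;

    DevicePtr<float> dptr{ raw };                                 // wrap the raw device pointer
    memcpy<float>(dptr, src.data(), n);                           // host -> device
    memset<float>(dptr, 0, n);                                    // fill the device buffer with zero bytes
    memcpy<float>(dst.data(), DevicePtr<const float>{ raw }, n);  // device -> host

    cudaFree(raw);
    return 0;
}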
82
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/span.hpp
vendored
Normal file
@ -0,0 +1,82 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_SPAN_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_SPAN_HPP

#include "pointer.hpp"
#include "nvcc_defs.hpp"

#include "../../cuda/types.hpp"

#include <cstddef>
#include <type_traits>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {

/** @brief provides non-owning mutable access for device arrays
*
* const Span<T>/Span<T> provides mutable access to the elements unless T is const qualified
* const Span<T> makes the span immutable but not the elements
*/
template <class T>
class Span {
static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType");

public:
using value_type = T;
using size_type = device::size_type;
using index_type = device::index_type;

using pointer = DevicePtr<value_type>;
using const_pointer = DevicePtr<typename std::add_const<value_type>::type>;
using reference = typename std::add_lvalue_reference<value_type>::type;
using const_reference = typename std::add_lvalue_reference<typename std::add_const<value_type>::type>::type;

Span() noexcept : ptr{ nullptr }, sz{ 0 } { }
CUDA4DNN_HOST_DEVICE Span(pointer first, pointer last) noexcept : ptr{ first }, sz{ last - first } { }
CUDA4DNN_HOST_DEVICE Span(pointer first, size_type count) noexcept : ptr{ first }, sz{ count } { }

CUDA4DNN_HOST_DEVICE size_type size() const noexcept { return sz; }
CUDA4DNN_HOST_DEVICE bool empty() const noexcept { return size() == 0; }

CUDA4DNN_DEVICE reference operator[](index_type index) const { return ptr[index]; }
CUDA4DNN_HOST_DEVICE pointer data() const noexcept { return ptr; }

template<class U = T, class V = typename std::add_const<U>::type,
typename std::enable_if<!std::is_const<U>::value, bool>::type = true>
CUDA4DNN_HOST_DEVICE operator Span<V>() const noexcept { return Span<V>{ptr, sz}; }

private:
pointer ptr;
size_type sz;
};

/** @brief provides a non-owning immutable view for device arrays */
template <class T>
using View = Span<const T>;

/** returns true if the address of a span/view is aligned to \p alignment number of elements (not bytes) */
template <class T>
bool is_address_aligned(View<T> v, std::size_t alignment) {
return is_aligned(v.data(), alignment * sizeof(T));
}

/** returns true if the size of a span/view is a multiple of \p alignment */
template <class T>
bool is_size_aligned(View<T> v, std::size_t alignment) {
return v.size() % alignment == 0;
}

/** @brief returns true if both the address and the size of the span/view are aligned
* \p alignment refers to the number of elements (not bytes)
*/
template <class T>
bool is_fully_aligned(View<T> v, std::size_t alignment) {
return is_address_aligned(v, alignment) && is_size_aligned(v, alignment);
}

}}}} /* namespace cv::dnn::cuda4dnn::csl */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_SPAN_HPP */
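A small sketch of the Span/View semantics documented above (`scale_kernel` and `span_demo` are hypothetical names, shown only to illustrate the const conversion and the element-wise alignment checks):

using namespace cv::dnn::cuda4dnn::csl;

template <class T>
void scale_kernel(Span<T> output, View<T> input, T factor); // hypothetical kernel wrapper

void span_demo(DevicePtr<float> buf, std::size_t n) {
    Span<float> out(buf, n);   // mutable, non-owning
    View<float> in = out;      // Span<T> converts implicitly to Span<const T> (= View<T>)

    // the alignment helpers count elements, not bytes: true iff the address is a
    // multiple of 4 * sizeof(float) and the size is a multiple of 4
    bool vectorizable = is_fully_aligned(in, 4);
    (void)vectorizable;

    scale_kernel<float>(out, in, 2.0f);
}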
161
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/stream.hpp
vendored
Normal file
@ -0,0 +1,161 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_STREAM_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_STREAM_HPP

#include "error.hpp"

#include <opencv2/core.hpp>
#include <opencv2/core/utils/logger.hpp>

#include <cuda_runtime_api.h>

#include <memory>
#include <sstream>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {

/** \file stream.hpp
*
* Default streams are not supported as they limit flexibility. All operations are always
* carried out in non-default streams in the CUDA backend. The stream classes sacrifice
* the ability to support default streams in exchange for better error detection. That is,
* a default constructed stream represents no stream and any attempt to use it will throw an
* exception.
*/

/** @brief non-copyable smart CUDA stream
*
* UniqueStream is a smart non-sharable wrapper for a CUDA stream handle which ensures that
* the handle is destroyed after use. Unless explicitly specified by a constructor argument,
* the stream object does not represent any stream by default.
*/
class UniqueStream {
public:
UniqueStream() noexcept : stream{ 0 } { }
UniqueStream(UniqueStream&) = delete;
UniqueStream(UniqueStream&& other) noexcept {
stream = other.stream;
other.stream = 0;
}

/** creates a non-default stream if `create` is true; otherwise, no stream is created */
UniqueStream(bool create) : stream{ 0 } {
if (create) {
/* we create non-blocking streams to avoid interruptions from users using the default stream */
CUDA4DNN_CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
}
}

~UniqueStream() {
try {
/* cudaStreamDestroy does not fail for a valid stream unless a previous
* asynchronous operation errored.
*/
if (stream != 0)
CUDA4DNN_CHECK_CUDA(cudaStreamDestroy(stream));
} catch (const CUDAException& ex) {
std::ostringstream os;
os << "Asynchronous exception caught during CUDA stream destruction.\n";
os << ex.what();
os << "Exception will be ignored.\n";
CV_LOG_WARNING(0, os.str().c_str());
}
}

UniqueStream& operator=(const UniqueStream&) = delete;
UniqueStream& operator=(UniqueStream&& other) noexcept {
CV_Assert(other);
if (&other != this) {
UniqueStream(std::move(*this)); /* destroy current stream */
stream = other.stream;
other.stream = 0;
}
return *this;
}

/** returns the raw CUDA stream handle */
cudaStream_t get() const noexcept {
CV_Assert(stream);
return stream;
}

/** blocks the calling thread until all pending operations in the stream finish */
void synchronize() const {
CV_Assert(stream);
CUDA4DNN_CHECK_CUDA(cudaStreamSynchronize(stream));
}

/** returns true if there are pending operations in the stream */
bool busy() const {
CV_Assert(stream);

auto status = cudaStreamQuery(stream);
if (status == cudaErrorNotReady)
return true;
CUDA4DNN_CHECK_CUDA(status);
return false;
}

/** returns true if the stream is valid */
explicit operator bool() const noexcept { return static_cast<bool>(stream); }

private:
cudaStream_t stream;
};

/** @brief sharable smart CUDA stream
*
* Stream is a smart sharable wrapper for a CUDA stream handle which ensures that
* the handle is destroyed after use. Unless explicitly specified in the constructor,
* the stream object represents no stream.
*/
class Stream {
public:
Stream() { }
Stream(const Stream&) = default;
Stream(Stream&&) = default;

/** if \p create is `true`, a new stream will be created; otherwise, no stream is created */
Stream(bool create) {
if (create)
stream = std::make_shared<UniqueStream>(create);
}

Stream& operator=(const Stream&) = default;
Stream& operator=(Stream&&) = default;

/** blocks the caller thread until all operations in the stream are complete */
void synchronize() const {
CV_Assert(stream);
stream->synchronize();
}

/** returns true if there are operations pending in the stream */
bool busy() const {
CV_Assert(stream);
return stream->busy();
}

/** returns true if the object holds a valid stream */
explicit operator bool() const noexcept {
if (!stream)
return false;
return stream->operator bool();
}

cudaStream_t get() const noexcept {
CV_Assert(stream);
return stream->get();
}

private:
std::shared_ptr<UniqueStream> stream;
};

}}}} /* namespace cv::dnn::cuda4dnn::csl */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_STREAM_HPP */
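A brief usage sketch of the stream classes above (illustrative only; assumes the copy helpers from pointer.hpp, a CUDA device, and page-locked host memory for the asynchronous copy):

using namespace cv::dnn::cuda4dnn::csl;

void stream_demo(DevicePtr<const float> dev, float* pinned_host, std::size_t n) {
    Stream stream(true);                  // creates a non-blocking, non-default stream

    // enqueue an asynchronous device-to-host copy on the stream
    memcpy<float>(pinned_host, dev, n, stream);

    while (stream.busy()) {
        /* overlap independent host-side work here */
    }
    stream.synchronize();                 // or simply block until the stream drains
}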
1203
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/tensor.hpp
vendored
Normal file
File diff suppressed because it is too large
477
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp
vendored
Normal file
@ -0,0 +1,477 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP

#include "stream.hpp"
#include "tensor.hpp"
#include "pointer.hpp"
#include "cublas.hpp"
#include "cudnn.hpp"
#include "workspace.hpp"

#include "cudnn/convolution.hpp"
#include "cudnn/pooling.hpp"
#include "cudnn/lrn.hpp"
#include "cudnn/softmax.hpp"
#include "cudnn/transform.hpp"
#include "cudnn/transpose_convolution.hpp"

#include <opencv2/core.hpp>

#include <cstddef>
#include <array>
#include <vector>
#include <algorithm>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {

namespace tensor_ops {

/** @brief copies data between tensors
*
* Pre-conditions:
* - \p dest and \p src must have the same shape
*
* Exception Guarantee: Basic
*/
template <class T> inline
void copy(const Stream& stream, TensorSpan<T> dest, TensorView<T> src) {
CV_Assert(is_shape_same(dest, src));
if (dest.get() != src.get())
memcpy(dest.get(), src.get(), dest.size(), stream);
}

namespace detail {
template <class T>
void assertGEMMCompatibility(const TensorSpan<T>& result, bool transa, const TensorView<T>& A, bool transb, const TensorView<T>& B) {
/* check dimension requirements for matrix multiplication */
if (!transa && !transb) {
CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2));
CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-2));
CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1));
} else if (!transa && transb) {
CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2));
CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-1));
CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1));
} else if (transa && !transb) {
CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2));
CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-2));
CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1));
} else {
CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2));
CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-1));
CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1));
}
}
}

/** @brief performs generalized matrix-multiplication
*
* Pre-conditions:
* - \p A and \p B must meet the mathematical requirements for matrix multiplication
* - \p result must be large enough to hold the result
*
* Exception Guarantee: Basic
*/
template <class T> inline
void gemm(const cublas::Handle& handle, T beta, TensorSpan<T> result, T alpha, bool transa, TensorView<T> A, bool transb, TensorView<T> B) {
/* matrix operations can be performed only on tensors with rank two or below */
CV_Assert(get_effective_rank(A) <= 2);
CV_Assert(get_effective_rank(B) <= 2);
CV_Assert(get_effective_rank(result) <= 2);

const auto result_nr = result.get_axis_size(-2);
const auto result_nc = result.get_axis_size(-1);
const auto common_dim = A.get_axis_size(transa ? -2 : -1);
const auto A_nc = A.get_axis_size(-1);
const auto B_nc = B.get_axis_size(-1);

detail::assertGEMMCompatibility(result, transa, A, transb, B);

/* tensors are stored in row-major order but cublas::gemm operates on column-major matrices;
* a row-major matrix read as a column-major matrix gives the transpose of the intended matrix
*
* Required: C = AB
* what cuBLAS sees: C^T = A^T B^T = (BA)^T
*
* By reversing the operands, we effectively perform:
* C^T = B^T A^T = (AB)^T
*
* which gives C = AB
*/
cublas::gemm<T>(handle,
transb, transa,
result_nc, result_nr, common_dim,
alpha, B.get(), B_nc,
A.get(), A_nc,
beta, result.get(), result_nc);
}
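A self-contained CPU sketch of the transpose identity the comment above relies on: a row-major buffer read as column-major is the transposed matrix, so reversing the operands of a column-major GEMM yields the row-major product directly (the 2x2 values are illustrative):

#include <array>
#include <cassert>

// Multiply column-major matrices: C(m x n) = A(m x k) * B(k x n), leading dimension = rows.
static void gemm_colmajor(int m, int n, int k,
                          const float* A, const float* B, float* C) {
    for (int j = 0; j < n; ++j)
        for (int i = 0; i < m; ++i) {
            float acc = 0.f;
            for (int p = 0; p < k; ++p)
                acc += A[i + p * m] * B[p + j * k];
            C[i + j * m] = acc;
        }
}

int main() {
    // Row-major A, B and the expected row-major product C = AB.
    std::array<float, 4> A{1, 2, 3, 4}, B{5, 6, 7, 8}, C{};
    const std::array<float, 4> expected{19, 22, 43, 50};

    // Hand the buffers to a column-major GEMM with the operands reversed:
    // it computes B^T A^T = (AB)^T in column-major form, which is exactly
    // the row-major layout of AB.
    gemm_colmajor(2, 2, 2, B.data(), A.data(), C.data());

    assert(C == expected);
    return 0;
}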
/** @brief performs generalized matrix-multiplication for a strided batch of matrices
*
* Pre-conditions:
* - \p A, \p B and \p result must be rank-three tensors with dimensions (batch, rows, cols)
* - the last two axes of \p A and \p B must meet the mathematical requirements for matrix multiplication
* - \p result must be large enough to hold the result and the matrices must not overlap in memory
* - the batch dimension must be the same in \p A, \p B and \p result
*
* Exception Guarantee: Basic
*/
template <class T> inline
void gemmStridedBatched(const cublas::Handle& handle, T beta, TensorSpan<T> result, T alpha, bool transa, TensorView<T> A, bool transb, TensorView<T> B) {
CV_Assert(A.rank() == 3);
CV_Assert(B.rank() == 3);
CV_Assert(result.rank() == 3);

const auto batch_size = result.get_axis_size(0);
CV_Assert(batch_size == A.get_axis_size(0));
CV_Assert(batch_size == B.get_axis_size(0));

detail::assertGEMMCompatibility(result, transa, A, transb, B);

const auto result_nr = result.get_axis_size(-2);
const auto result_nc = result.get_axis_size(-1);
const auto common_dim = A.get_axis_size(transa ? -2 : -1);
const auto A_nc = A.get_axis_size(-1);
const auto B_nc = B.get_axis_size(-1);

std::size_t strideA = (A.size() / batch_size),
strideB = (B.size() / batch_size),
strideC = (result.size() / batch_size);

cublas::gemmStridedBatched<T>(handle,
transb, transa,
result_nc, result_nr, common_dim,
alpha, B.get(), B_nc, strideB,
A.get(), A_nc, strideA,
beta, result.get(), result_nc, strideC,
batch_size);
}

/** @brief performs softmax (or log softmax) along the channel axis
*
* Pre-conditions:
* - \p output and \p input must have the same shape
*
* Exception Guarantee: Basic
*/
template <class T> inline
void softmax(const cudnn::Handle& handle, TensorSpan<T> output, TensorView<T> input, int channel_axis, bool log) {
CV_Assert(is_shape_same(output, input));

channel_axis = clamp_axis(channel_axis, input.rank());

std::size_t outer_size = input.size_range(0, channel_axis);
auto channel_size = input.get_axis_size(channel_axis);
std::size_t inner_size = input.size_range(channel_axis + 1, input.rank());

std::array<std::size_t, 4> shape = { outer_size, channel_size, 1, inner_size };

using cudnn::TensorDescriptor;
auto inputDesc = TensorDescriptor<T>(shape);
auto outputDesc = TensorDescriptor<T>(shape);
cudnn::softmax(handle, outputDesc, output.get(), inputDesc, input.get(), log);
}
}
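The softmax wrapper above folds an arbitrary-rank tensor into a 4D (outer, channel, 1, inner) shape so cuDNN can normalize along a single axis. A minimal host-side sketch of that collapsing arithmetic (shape values illustrative):

#include <cstddef>
#include <vector>
#include <cassert>

int main() {
    // shape (N, C, H, W) = (2, 3, 4, 5), softmax along axis 1 (channels)
    std::vector<std::size_t> shape{2, 3, 4, 5};
    std::size_t channel_axis = 1;

    std::size_t outer = 1, inner = 1;
    for (std::size_t i = 0; i < channel_axis; ++i) outer *= shape[i];
    for (std::size_t i = channel_axis + 1; i < shape.size(); ++i) inner *= shape[i];
    std::size_t channel = shape[channel_axis];

    // cuDNN sees a (outer, channel, 1, inner) tensor: here (2, 3, 1, 20)
    assert(outer == 2 && channel == 3 && inner == 20);
    assert(outer * channel * inner == 2 * 3 * 4 * 5); // element count preserved
    return 0;
}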
template <class T>
class Convolution {
using TensorDescriptor = cudnn::TensorDescriptor<T>;
using FilterDescriptor = cudnn::FilterDescriptor<T>;
using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>;
using ConvolutionAlgorithm = cudnn::ConvolutionAlgorithm<T>;
using ActivationDescriptor = cudnn::ActivationDescriptor;

public:
using ActivationType = ActivationDescriptor::ActivationType;

struct params_type {
/* convolution */
std::vector<std::size_t> input_shape;
std::vector<std::size_t> filter_shape;
std::vector<std::size_t> padding;
std::vector<std::size_t> stride;
std::vector<std::size_t> dilation;
std::size_t groups;

/* bias and activation (only RELU supported) */
std::vector<std::size_t> bias_shape;
ActivationType activation_type; /* MUST BE identity if there is no bias and ReLU if there is bias */
bool eltwise;
};

Convolution() = default;
Convolution(const Convolution&) = delete;
Convolution(Convolution&&) = default;
Convolution(cudnn::Handle handle, const params_type& params) {
cudnnHandle = std::move(handle);

inputTensorDesc = TensorDescriptor(params.input_shape);
filterDesc = FilterDescriptor(params.filter_shape);
convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups);

std::vector<int> output_dims;
getConvolutionForwardOutputDim(convDesc, filterDesc, inputTensorDesc, output_dims);
outputTensorDesc = TensorDescriptor(output_dims);

algo = ConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, inputTensorDesc, outputTensorDesc);

if (!params.bias_shape.empty()) {
CV_Assert(params.activation_type == ActivationType::RELU);
biasTensorDesc = TensorDescriptor(params.bias_shape);
if (params.eltwise)
eltwiseTensorDesc = TensorDescriptor(output_dims);
activationDesc = ActivationDescriptor(params.activation_type, 0.0);
} else {
CV_Assert(params.activation_type == ActivationType::IDENTITY);
}
}

Convolution& operator=(const Convolution&) = delete;
Convolution& operator=(Convolution&&) = default;

std::size_t get_workspace_size() const noexcept {
return algo.get_workspace_size();
}

void convolve(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, WorkspaceInstance scratchpad) {
cudnn::convolve<T>(
cudnnHandle,
convDesc, algo, scratchpad,
filterDesc, filters.get(),
inputTensorDesc, input.get(),
1.0, 0.0, outputTensorDesc, output.get()
);
}

void convolve_with_bias_activation(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, TensorView<T> bias, WorkspaceInstance scratchpad) {
cudnn::convolve_with_bias_activation<T>(
cudnnHandle,
1.0, convDesc, algo, scratchpad,
filterDesc, filters.get(),
inputTensorDesc, input.get(),
biasTensorDesc, bias.get(),
activationDesc,
outputTensorDesc, output.get()
);
}

void convolve_with_bias_eltwise_activation(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, TensorView<T> bias, TensorView<T> eltwise, WorkspaceInstance scratchpad) {
cudnn::convolve_with_bias_eltwise_activation<T>(
cudnnHandle,
1.0, convDesc, algo, scratchpad,
filterDesc, filters.get(),
inputTensorDesc, input.get(),
biasTensorDesc, bias.get(),
1.0, eltwiseTensorDesc, eltwise.get(),
activationDesc,
outputTensorDesc, output.get()
);
}

private:
cudnn::Handle cudnnHandle;
TensorDescriptor inputTensorDesc, outputTensorDesc;
FilterDescriptor filterDesc;
ConvolutionDescriptor convDesc;
ConvolutionAlgorithm algo;
TensorDescriptor biasTensorDesc;
TensorDescriptor eltwiseTensorDesc;
ActivationDescriptor activationDesc;
};

template <class T>
class TransposeConvolution {
using TensorDescriptor = cudnn::TensorDescriptor<T>;
using FilterDescriptor = cudnn::FilterDescriptor<T>;
using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>;
using TransposeConvolutionAlgorithm = cudnn::TransposeConvolutionAlgorithm<T>;

public:
struct params_type {
std::vector<std::size_t> input_shape;
std::vector<std::size_t> output_shape;

std::vector<std::size_t> filter_shape;

std::vector<std::size_t> padding;
std::vector<std::size_t> stride;
std::vector<std::size_t> dilation;

std::size_t groups;
};

TransposeConvolution() = default;
TransposeConvolution(const TransposeConvolution&) = delete;
TransposeConvolution(TransposeConvolution&&) = default;
TransposeConvolution(cudnn::Handle handle, const params_type& params) {
cudnnHandle = std::move(handle);

filterDesc = FilterDescriptor(params.filter_shape);
convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups);

/* input_shape is the output shape for convolution
* output_shape is the input shape for convolution
*/
convInputTensorDesc = TensorDescriptor(params.output_shape);

std::vector<int> conv_output_dims;
getConvolutionForwardOutputDim(convDesc, filterDesc, convInputTensorDesc, conv_output_dims);

/* the convolution output must be identical to what cuDNN expects */
CV_Assert(std::equal(std::begin(conv_output_dims), std::end(conv_output_dims), std::begin(params.input_shape)));

convOutputTensorDesc = TensorDescriptor(params.input_shape);

algo = TransposeConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, convOutputTensorDesc, convInputTensorDesc);
}

TransposeConvolution& operator=(const TransposeConvolution&) = delete;
TransposeConvolution& operator=(TransposeConvolution&&) = default;

std::size_t get_workspace_size() const noexcept {
return algo.get_workspace_size();
}

void transpose_convolve(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, WorkspaceInstance scratchpad) {
cudnn::transpose_convolve<T>(
cudnnHandle,
convDesc, algo, scratchpad,
filterDesc, filters.get(),
convOutputTensorDesc, input.get(),
1.0, 0.0, convInputTensorDesc, output.get()
);
}

private:
cudnn::Handle cudnnHandle;
TensorDescriptor convInputTensorDesc, convOutputTensorDesc;
FilterDescriptor filterDesc;
ConvolutionDescriptor convDesc;
TransposeConvolutionAlgorithm algo;
};

template <class T>
class Pooling {
using TensorDescriptor = cudnn::TensorDescriptor<T>;
using PoolingDescriptor = cudnn::PoolingDescriptor;

public:
using PoolingType = PoolingDescriptor::PoolingType;

struct params_type {
std::vector<std::size_t> input_shape;
std::vector<std::size_t> output_shape;

std::vector<std::size_t> window_size;
std::vector<std::size_t> padding;
std::vector<std::size_t> stride;

PoolingType type;
};

Pooling() = default;
Pooling(const Pooling&) = delete;
Pooling(Pooling&&) = default;
Pooling(cudnn::Handle handle, const params_type& params) {
cudnnHandle = std::move(handle);

inputTensorDesc = TensorDescriptor(params.input_shape);
poolingDesc = PoolingDescriptor(params.window_size, params.padding, params.stride, params.type);

//std::vector<int> output_dim;
//getPoolingForwardOutputDim(poolingDesc, inputTensorDesc, output_dim);
outputTensorDesc = TensorDescriptor(params.output_shape);
}

Pooling& operator=(const Pooling&) = delete;
Pooling& operator=(Pooling&&) = default;

void pool(TensorView<T> input, TensorSpan<T> output) {
cudnn::pool<T>(
cudnnHandle,
poolingDesc,
inputTensorDesc, input.get(),
1.0, 0.0, outputTensorDesc, output.get()
);
}

private:
cudnn::Handle cudnnHandle;
TensorDescriptor inputTensorDesc, outputTensorDesc;
PoolingDescriptor poolingDesc;
};

template <class T>
class LRN {
using LRNDescriptor = cudnn::LRNDescriptor;
using TensorDescriptor = cudnn::TensorDescriptor<T>;

public:
using LRNType = LRNDescriptor::LRNType;

LRN() = default;
LRN(const LRN&) = delete;
LRN(LRN&&) = default;
LRN(cudnn::Handle handle, std::size_t local_size, T alpha, T beta, T k, LRNType type) {
cudnnHandle = std::move(handle);
lrnDesc = LRNDescriptor(local_size, alpha, beta, k, type);
}

LRN& operator=(const LRN&) = delete;
LRN& operator=(LRN&&) = default;

void normalize(TensorView<T> input, TensorSpan<T> output, WorkspaceInstance workspace) {
cudnn::LRNForward<T>(
cudnnHandle,
lrnDesc,
TensorDescriptor(input.shape_as_vector()), input.get(),
1.0, 0.0, TensorDescriptor(output.shape_as_vector()), output.get(),
workspace
);
}

private:
cudnn::Handle cudnnHandle;
LRNDescriptor lrnDesc;
};

template <class T>
class TensorTransform {
using TensorTransformDescriptor = cudnn::TensorTransformDescriptor;
using TensorDescriptor = cudnn::TensorDescriptor<T>;

public:
TensorTransform() = default;
TensorTransform(const TensorTransform&) = delete;
TensorTransform(TensorTransform&&) = default;

template <class SequenceContainer>
TensorTransform(cudnn::Handle handle, const SequenceContainer& paddingLeft, const SequenceContainer& paddingRight) {
cudnnHandle = std::move(handle);
transDesc = TensorTransformDescriptor(paddingLeft, paddingRight);
}

TensorTransform& operator=(const TensorTransform&) = delete;
TensorTransform& operator=(TensorTransform&&) = default;

void transform(TensorView<T> input, TensorSpan<T> output) {
cudnn::transform<T>(
cudnnHandle,
transDesc,
TensorDescriptor(input.shape_as_vector()), input.get(),
TensorDescriptor(output.shape_as_vector()), output.get()
);
}

private:
cudnn::Handle cudnnHandle;
TensorTransformDescriptor transDesc;
};

}}}} /* namespace cv::dnn::cuda4dnn::csl */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP */
166
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/workspace.hpp
vendored
Normal file
@ -0,0 +1,166 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_WORKSPACE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_WORKSPACE_HPP

#include "pointer.hpp"
#include "span.hpp"
#include "tensor.hpp"

#include <cstddef>
#include <cstdint>
#include <functional>
#include <iterator>
#include <numeric>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {

/** @brief maintains a single block of reusable device memory
*
* Each Workspace object is intended to be used by a single entity at a time but by
* different entities at different times. It maintains a single reusable block of memory which
* is sufficient for the largest consumer.
*/
class Workspace {
public:

/** @brief reserve \p bytes of memory */
void require(std::size_t bytes) {
if (bytes > ptr.size())
ptr.reset(bytes);
}

/** @brief number of bytes reserved by the largest consumer */
std::size_t size() const noexcept {
return ptr.size();
}

/** @brief returns the pointer to the workspace memory */
DevicePtr<unsigned char> get() {
return ptr.get();
}

private:
ManagedPtr<unsigned char> ptr;
};

/** used to compute the total workspace size from several workspace requests */
class WorkspaceBuilder {
public:
WorkspaceBuilder() noexcept : max_size_in_bytes{ 0 } { }

/** request memory for \p count number of elements of the type \tparam T */
template <class T = std::int8_t>
void require(std::size_t count) noexcept {
auto blocks256 = (count * sizeof(T) + 255) / 256;
max_size_in_bytes += blocks256 * 256;
}

/** returns the total workspace memory that is required */
std::size_t required_workspace_size() const noexcept { return max_size_in_bytes; }

private:
std::size_t max_size_in_bytes;
};

/** general memory block from a workspace which can be passed on to the requester */
class WorkspaceInstance {
public:

/** returns a device pointer to the workspace memory */
template <class T = void>
DevicePtr<T> get() const noexcept {
return static_cast<DevicePtr<T>>(ptr);
}

/** returns the size of the workspace memory in bytes */
std::size_t size_in_bytes() const noexcept {
return size_in_bytes_;
}

/** creates a Span<T> of \p count elements from the workspace memory */
template <class T>
Span<T> get_span(std::size_t count = 0) const {
if (count == 0)
count = size_in_bytes_ / sizeof(T);

if (count * sizeof(T) > size_in_bytes_)
CV_Error(Error::StsNoMem, "memory not sufficient");

return Span<T>(static_cast<DevicePtr<T>>(ptr), count);
}

/** creates a TensorSpan<T> of the given shape from the workspace memory */
template <class T, class ForwardItr>
TensorSpan<T> get_tensor_span(ForwardItr shape_begin, ForwardItr shape_end) const {
using ItrValueType = typename std::iterator_traits<ForwardItr>::value_type;
auto required_size = std::accumulate(shape_begin, shape_end, 1, std::multiplies<ItrValueType>());
if (required_size * sizeof(T) > size_in_bytes_)
CV_Error(Error::StsNoMem, "memory not sufficient");
return TensorSpan<T>(static_cast<DevicePtr<T>>(ptr), shape_begin, shape_end);
}

private:
DevicePtr<void> ptr;
std::size_t size_in_bytes_;

friend class WorkspaceAllocator;
WorkspaceInstance(DevicePtr<void> ptr_, std::size_t size_in_bytes__)
: ptr{ ptr_ }, size_in_bytes_{ size_in_bytes__ } { }
};

/** used to split a single workspace into constituents */
class WorkspaceAllocator {
public:
WorkspaceAllocator() = default;
WorkspaceAllocator(Workspace& workspace) noexcept
: current{ workspace.get() }, bytes_remaining { workspace.size() }
{
CV_Assert(is_aligned<void>(current, 256));
CV_Assert(bytes_remaining % 256 == 0);
}

/** allocates a Span<T> of \p count elements from the workspace memory */
template <class T>
Span<T> get_span(std::size_t count = 0) {
return acquire<T>(count);
}

/** allocates a TensorSpan<T> of the given shape from the workspace memory */
template <class T, class ForwardItr>
TensorSpan<T> get_tensor_span(ForwardItr start, ForwardItr end) {
using ItrValueType = typename std::iterator_traits<ForwardItr>::value_type;
auto required_size = std::accumulate(start, end, 1, std::multiplies<ItrValueType>());
return TensorSpan<T>(acquire<T>(required_size).data(), start, end);
}

/** allocates a WorkspaceInstance of size \p bytes from the workspace memory */
WorkspaceInstance get_instance(std::size_t bytes = 0) {
auto span = acquire(bytes);
return WorkspaceInstance(DevicePtr<void>(span.data()), span.size());
}

private:
template <class T = std::int8_t>
Span<T> acquire(std::size_t count = 0) {
auto ptr = current;

if (count == 0)
count = bytes_remaining / sizeof(T);

auto blocks256 = (count * sizeof(T) + 255) / 256;
if (bytes_remaining < blocks256 * 256)
CV_Error(Error::StsNoMem, "out of workspace memory");

bytes_remaining -= blocks256 * 256;
current = static_cast<DevicePtr<std::int8_t>>(current) + blocks256 * 256;
return Span<T>(static_cast<DevicePtr<T>>(ptr), count);
}

DevicePtr<void> current;
std::size_t bytes_remaining;
};

}}}} /* namespace cv::dnn::cuda4dnn::csl */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_WORKSPACE_HPP */
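A sketch of the intended build-then-allocate flow for the workspace classes above (sizes and names are illustrative; requests round up to 256-byte blocks, so 1000 floats reserve 4096 bytes):

using namespace cv::dnn::cuda4dnn::csl;

void workspace_demo(Workspace& workspace) {
    // 1) every consumer declares its worst-case need up front
    WorkspaceBuilder builder;
    builder.require<float>(1000);   // 4000 bytes -> 16 blocks -> 4096 bytes
    builder.require<char>(512);     // 512 bytes  -> 2 blocks  -> 512 bytes
    workspace.require(builder.required_workspace_size()); // 4608 bytes total

    // 2) at execution time the single block is split into constituents
    WorkspaceAllocator allocator(workspace);
    Span<float> scratch = allocator.get_span<float>(1000);
    WorkspaceInstance cudnn_ws = allocator.get_instance(512);
    (void)scratch; (void)cudnn_ws;
}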
31
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/cxx_utils/is_iterator.hpp
vendored
Normal file
@ -0,0 +1,31 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_IS_ITERATOR_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_IS_ITERATOR_HPP

#include <iterator>
#include <type_traits>

namespace cv { namespace dnn { namespace cuda4dnn { namespace cxx_utils {

namespace detail {
template <class T, class Tag, class = void>
struct is_iterator_helper : std::false_type {};

template <class T, class Tag>
struct is_iterator_helper<T, Tag,
typename std::enable_if<std::is_base_of<Tag, typename std::iterator_traits<T>::iterator_category>::value, void>::type
> : std::true_type {};
}

template <class T>
using is_iterator = typename detail::is_iterator_helper<T, std::input_iterator_tag>;

template <class T>
using is_forward_iterator = typename detail::is_iterator_helper<T, std::forward_iterator_tag>;

}}}} /* namespace cv::dnn::cuda4dnn::cxx_utils */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_IS_ITERATOR_HPP */
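A brief self-contained sketch of how the trait above classifies iterators by their category tag:

#include <vector>
#include <iterator>
// assumes is_iterator.hpp from above is included

using cv::dnn::cuda4dnn::cxx_utils::is_iterator;
using cv::dnn::cuda4dnn::cxx_utils::is_forward_iterator;

// random-access iterators satisfy both traits
static_assert(is_iterator<std::vector<int>::iterator>::value, "");
static_assert(is_forward_iterator<std::vector<int>::iterator>::value, "");

// an input iterator is an iterator but not a forward iterator
static_assert(is_iterator<std::istream_iterator<int>>::value, "");
static_assert(!is_forward_iterator<std::istream_iterator<int>>::value, "");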
110
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/cxx_utils/resizable_static_array.hpp
vendored
Normal file
@ -0,0 +1,110 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_RESIZABLE_STATIC_ARRAY_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_RESIZABLE_STATIC_ARRAY_HPP

#include <cstddef>
#include <array>
#include <cassert>
#include <algorithm>

namespace cv { namespace dnn { namespace cuda4dnn { namespace cxx_utils {

template <class T, std::size_t maxN>
class resizable_static_array {
using container_type = std::array<T, maxN>;

public:
using value_type = typename container_type::value_type;
using size_type = typename container_type::size_type;
using difference_type = typename container_type::difference_type;
using reference = typename container_type::reference;
using const_reference = typename container_type::const_reference;
using pointer = typename container_type::pointer;
using const_pointer = typename container_type::const_pointer;
using iterator = typename container_type::iterator;
using const_iterator = typename container_type::const_iterator;
using reverse_iterator = typename container_type::reverse_iterator;
using const_reverse_iterator = typename container_type::const_reverse_iterator;

resizable_static_array() noexcept : size_{ 0 } { }
explicit resizable_static_array(size_type sz) noexcept : size_{ sz } { }

bool empty() const noexcept { return size_ == 0; }
size_type size() const noexcept { return size_; }
size_type capacity() const noexcept { return maxN; }

void resize(size_type sz) noexcept {
assert(sz <= capacity());
size_ = sz;
}

void clear() noexcept { size_ = 0; }

template <class ForwardItr>
void assign(ForwardItr first, ForwardItr last) {
resize(std::distance(first, last));
std::copy(first, last, begin());
}

iterator begin() noexcept { return std::begin(arr); }
iterator end() noexcept { return std::begin(arr) + size(); }

const_iterator begin() const noexcept { return arr.cbegin(); }
const_iterator end() const noexcept { return arr.cbegin() + size(); }

const_iterator cbegin() const noexcept { return arr.cbegin(); }
const_iterator cend() const noexcept { return arr.cbegin() + size(); }

reverse_iterator rbegin() noexcept { return reverse_iterator(end()); }
reverse_iterator rend() noexcept { return reverse_iterator(begin()); }

const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); }
const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); }

const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(cend()); }
const_reverse_iterator crend() const noexcept { return const_reverse_iterator(cbegin()); }

reference operator[](size_type pos) {
assert(pos < size());
return arr[pos];
}

const_reference operator[](size_type pos) const {
assert(pos < size());
return arr[pos];
}

iterator insert(iterator pos, const T& value) {
resize(size() + 1);
std::move_backward(pos, end() - 1, end());
*pos = value;
return pos;
}

iterator insert(iterator pos, T&& value) {
resize(size() + 1);
std::move_backward(pos, end() - 1, end());
*pos = std::move(value);
return pos;
}

iterator erase(iterator pos) {
std::move(pos + 1, end(), pos);
resize(size() - 1);
return pos;
}

pointer data() noexcept { return arr.data(); }
const_pointer data() const noexcept { return arr.data(); }

private:
std::size_t size_;
container_type arr;
};

}}}} /* namespace cv::dnn::cuda4dnn::cxx_utils */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_RESIZABLE_STATIC_ARRAY_HPP */
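A short sketch of the container above: a fixed-capacity array with a vector-like interface and no heap allocation (values illustrative):

#include <cassert>
// assumes resizable_static_array.hpp from above is included
using cv::dnn::cuda4dnn::cxx_utils::resizable_static_array;

int main() {
    resizable_static_array<int, 8> shape;    // capacity fixed at compile time
    assert(shape.empty() && shape.capacity() == 8);

    int dims[] = {1, 3, 224, 224};
    shape.assign(dims, dims + 4);            // copies into the static storage
    assert(shape.size() == 4 && shape[1] == 3);

    shape.insert(shape.begin() + 1, 10);     // shifts elements right; size becomes 5
    shape.erase(shape.begin());              // drops the leading 1; size back to 4
    assert(shape.size() == 4 && shape[0] == 10);
    return 0;
}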
86
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/init.hpp
vendored
Normal file
@ -0,0 +1,86 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_INIT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_INIT_HPP

#include "csl/error.hpp"

#include <cuda_runtime.h>
#include <cudnn.h>

#include <opencv2/core/cuda.hpp>
#include <opencv2/core/utils/logger.hpp>
#include <sstream>

namespace cv { namespace dnn { namespace cuda4dnn {

void checkVersions()
{
// https://docs.nvidia.com/deeplearning/cudnn/developer-guide/index.html#programming-model
// cuDNN API Compatibility
// Beginning in cuDNN 7, the binary compatibility of patch and minor releases is maintained as follows:
// - Any patch release x.y.z is forward or backward-compatible with applications built against another cuDNN patch release x.y.w (meaning, of the same major and minor version number, but having w != z).
// - cuDNN minor releases beginning with cuDNN 7 are binary backward-compatible with applications built against the same or earlier patch release (meaning, an application built against cuDNN 7.x is binary compatible with cuDNN library 7.y, where y >= x).
// - Applications compiled with a cuDNN version 7.y are not guaranteed to work with the 7.x release when y > x.
auto cudnn_bversion = cudnnGetVersion();
auto cudnn_major_bversion = cudnn_bversion / 1000, cudnn_minor_bversion = cudnn_bversion % 1000 / 100;
if (cudnn_major_bversion != CUDNN_MAJOR || cudnn_minor_bversion < CUDNN_MINOR)
{
std::ostringstream oss;
oss << "cuDNN reports version " << cudnn_major_bversion << "." << cudnn_minor_bversion << " which is not compatible with the version " << CUDNN_MAJOR << "." << CUDNN_MINOR << " with which OpenCV was built";
CV_LOG_WARNING(NULL, oss.str().c_str());
}
}

int getDeviceCount()
{
return cuda::getCudaEnabledDeviceCount();
}

int getDevice()
{
int device_id = -1;
CUDA4DNN_CHECK_CUDA(cudaGetDevice(&device_id));
return device_id;
}

bool isDeviceCompatible()
{
int device_id = getDevice();
if (device_id < 0)
return false;

int major = 0, minor = 0;
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_id));
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_id));

if (cv::cuda::TargetArchs::hasEqualOrLessPtx(major, minor))
return true;

for (int i = minor; i >= 0; i--)
if (cv::cuda::TargetArchs::hasBin(major, i))
return true;

return false;
}

bool doesDeviceSupportFP16()
{
int device_id = getDevice();
if (device_id < 0)
return false;

int major = 0, minor = 0;
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_id));
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_id));

int version = major * 10 + minor;
if (version < 53)
return false;
return true;
}

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_INIT_HPP */
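A tiny self-contained check of the version decoding used in checkVersions above: cuDNN packs its version number as major*1000 + minor*100 + patch (the value is illustrative):

#include <cassert>
#include <cstddef>

int main() {
    // e.g. cudnnGetVersion() returns 8201 for cuDNN 8.2.1
    std::size_t v = 8201;
    std::size_t major = v / 1000;        // 8
    std::size_t minor = v % 1000 / 100;  // 2
    std::size_t patch = v % 100;         // 1
    assert(major == 8 && minor == 2 && patch == 1);
    return 0;
}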
40
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/activation_eltwise.hpp
vendored
Normal file
@ -0,0 +1,40 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATION_ELTWISE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATION_ELTWISE_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

/* inplace_output = activation(inplace_output) + eltwise */

template <class T>
void relu_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, csl::View<T> eltwise, T slope);

template <class T>
void clipped_relu_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, csl::View<T> eltwise, T floor, T ceiling);

template <class T>
void tanh_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, csl::View<T> eltwise);

template <class T>
void swish_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, csl::View<T> eltwise);

template <class T>
void mish_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, csl::View<T> eltwise);

template <class T>
void sigmoid_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, csl::View<T> eltwise);

template <class T>
void power_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, csl::View<T> eltwise, T exp, T scale, T shift);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATION_ELTWISE_HPP */
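The formula comment above pins down the fused semantics. A scalar CPU reference sketch of relu_eltwise_sum_2_inplace, written only to make the contract concrete (the real kernels run on the device over spans):

#include <vector>
#include <cstddef>
#include <cassert>

// reference semantics: out[i] = relu(out[i]) + eltwise[i], with leaky slope
static void relu_eltwise_sum_2_inplace_ref(std::vector<float>& out,
                                           const std::vector<float>& eltwise,
                                           float slope) {
    assert(out.size() == eltwise.size()); // no broadcasting on `eltwise`
    for (std::size_t i = 0; i < out.size(); ++i) {
        float activated = out[i] > 0 ? out[i] : slope * out[i];
        out[i] = activated + eltwise[i];
    }
}

int main() {
    std::vector<float> out{-2.f, 3.f};
    relu_eltwise_sum_2_inplace_ref(out, {1.f, 1.f}, 0.f);
    assert(out[0] == 1.f && out[1] == 4.f); // relu(-2)+1, relu(3)+1
    return 0;
}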
53
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/activations.hpp
vendored
Normal file
@ -0,0 +1,53 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATIONS_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATIONS_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void relu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T slope);

template <class T>
void clipped_relu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T floor, T ceiling);

template <class T>
void axiswise_relu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, std::size_t inner_size, csl::View<T> slope);

template <class T>
void tanh(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);

template <class T>
void swish(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);

template <class T>
void mish(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);

template <class T>
void sigmoid(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);

template <class T>
void elu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);

template <class T>
void abs(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);

template <class T>
void bnll(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);

template <class T>
void power(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T exp, T scale, T shift);

template <class T>
void exp(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T normScale, T normShift);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATIONS_HPP */
38
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/bias_activation.hpp
vendored
Normal file
@ -0,0 +1,38 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void biasN_relu_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T slope);

template <class T>
void biasN_clipped_relu_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T floor, T ceiling);

template <class T>
void biasN_tanh_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);

template <class T>
void biasN_swish_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);

template <class T>
void biasN_mish_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);

template <class T>
void biasN_sigmoid_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);

template <class T>
void biasN_power_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T exp, T scale, T shift);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_HPP */
42
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/bias_activation_eltwise.hpp
vendored
Normal file
@ -0,0 +1,42 @@
|
||||
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_ELTWISE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_ELTWISE_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    /* inplace_output = activation(inplace_output + bias) + eltwise
     * broadcasting on `bias` is allowed but not on `eltwise`
     */

    template <class T>
    void biasN_relu_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise, T slope);

    template <class T>
    void biasN_clipped_relu_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise, T floor, T ceiling);

    template <class T>
    void biasN_tanh_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise);

    template <class T>
    void biasN_sigmoid_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise);

    template <class T>
    void biasN_swish_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise);

    template <class T>
    void biasN_mish_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise);

    template <class T>
    void biasN_power_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise, T exp, T scale, T shift);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_ELTWISE_HPP */
45
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/bias_eltwise_activation.hpp
vendored
Normal file
@ -0,0 +1,45 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ELTWISE_ACTIVATION_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ELTWISE_ACTIVATION_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    /* inplace_output = activation(inplace_output + bias + eltwise)
     * broadcasting on `bias` is allowed but not on `eltwise`
     */

    template <class T>
    void biasN_eltwise_sum_2_identity_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise);

    template <class T>
    void biasN_eltwise_sum_2_relu_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise, T slope);

    template <class T>
    void biasN_eltwise_sum_2_clipped_relu_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise, T floor, T ceiling);

    template <class T>
    void biasN_eltwise_sum_2_tanh_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise);

    template <class T>
    void biasN_eltwise_sum_2_swish_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise);

    template <class T>
    void biasN_eltwise_sum_2_mish_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise);

    template <class T>
    void biasN_eltwise_sum_2_sigmoid_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise);

    template <class T>
    void biasN_eltwise_sum_2_power_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise, T exp, T scale, T shift);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ELTWISE_ACTIVATION_HPP */
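
This header differs from the previous one only in the order of the fused operations: here the eltwise tensor is added before the activation, there it is added after. A CPU sketch contrasting the two orders with ReLU as the activation (illustrative helpers, not library code):

#include <algorithm>
#include <cstddef>
#include <vector>

template <class T>
T relu_ref(T x) { return std::max(T(0), x); }

/* bias_activation_eltwise: out = act(out + bias) + eltwise */
template <class T>
void cpu_bias_act_then_eltwise(std::vector<T>& out, std::size_t inner_size,
                               const std::vector<T>& bias, const std::vector<T>& eltwise)
{
    for (std::size_t i = 0; i < out.size(); i++)
        out[i] = relu_ref(out[i] + bias[(i / inner_size) % bias.size()]) + eltwise[i];
}

/* bias_eltwise_activation: out = act(out + bias + eltwise) */
template <class T>
void cpu_bias_eltwise_then_act(std::vector<T>& out, std::size_t inner_size,
                               const std::vector<T>& bias, const std::vector<T>& eltwise)
{
    for (std::size_t i = 0; i < out.size(); i++)
        out[i] = relu_ref(out[i] + bias[(i / inner_size) % bias.size()] + eltwise[i]);
}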
27
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/concat.hpp
vendored
Normal file
@ -0,0 +1,27 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CONCAT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CONCAT_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>
#include <vector>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    template <class T>
    void concat(
        const csl::Stream& stream,
        csl::TensorSpan<T> output, std::size_t output_axis_offset,
        csl::TensorView<T> input, std::size_t axis);

    template <class T>
    void concat_with_offsets(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, std::vector<std::size_t> axis_offsets);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CONCAT_HPP */
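
A CPU sketch of the `concat` semantics: the input occupies the slice of the output that starts at `output_axis_offset` along `axis`, and the caller advances the offset by the input's extent along that axis after each call. The sketch assumes every dimension other than `axis` matches between input and output; it is an illustrative helper, not library code:

#include <cstddef>
#include <vector>

template <class T>
void cpu_concat(std::vector<T>& output, const std::vector<std::size_t>& out_shape,
                std::size_t output_axis_offset,
                const std::vector<T>& input, const std::vector<std::size_t>& in_shape,
                std::size_t axis)
{
    /* outer = product of extents before `axis`, inner = product of extents after it */
    std::size_t outer = 1, inner = 1;
    for (std::size_t d = 0; d < axis; d++) outer *= in_shape[d];
    for (std::size_t d = axis + 1; d < in_shape.size(); d++) inner *= in_shape[d];

    for (std::size_t o = 0; o < outer; o++)
        for (std::size_t a = 0; a < in_shape[axis]; a++)
            for (std::size_t i = 0; i < inner; i++)
                output[(o * out_shape[axis] + output_axis_offset + a) * inner + i] =
                    input[(o * in_shape[axis] + a) * inner + i];
}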
19
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/crop_and_resize.hpp
vendored
Normal file
@ -0,0 +1,19 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CROP_AND_RESIZE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CROP_AND_RESIZE_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"
#include "../csl/span.hpp"

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    template <class T>
    void crop_and_resize(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, csl::View<T> boxes);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CROP_AND_RESIZE_HPP */
42
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/detection_output.hpp
vendored
Normal file
@ -0,0 +1,42 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_DETECTION_OUTPUT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_DETECTION_OUTPUT_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"
#include "../csl/tensor.hpp"

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    template <class T>
    void decode_bboxes(const csl::Stream& stream, csl::Span<T> output, csl::View<T> locations, csl::View<T> priors,
        std::size_t num_loc_classes, bool share_location, std::size_t background_label_id,
        bool transpose_location, bool variance_encoded_in_target,
        bool corner_true_or_center_false, bool normalized_bbox,
        bool clip_box, float clip_width, float clip_height);

    template <class T>
    void findTopK(const csl::Stream& stream, csl::TensorSpan<int> indices, csl::TensorSpan<int> count, csl::TensorView<T> scores, std::size_t background_label_id, float threshold);

    template <class T>
    void box_collect(const csl::Stream& stream, csl::TensorSpan<T> collected_bboxes, csl::TensorView<T> decoded_bboxes, csl::TensorView<int> indices, csl::TensorView<int> count, bool share_location, std::size_t background_label_id);

    template <class T>
    void blockwise_class_nms(const csl::Stream& stream, csl::TensorSpan<int> indices, csl::TensorSpan<int> count, csl::TensorView<T> collected_bboxes,
        bool normalized_bbox, std::size_t background_label_id, float nms_threshold);

    template <class T>
    void nms_collect(const csl::Stream& stream, csl::TensorSpan<int> kept_indices, csl::TensorSpan<int> kept_count,
        csl::TensorView<int> indices, csl::TensorView<int> count, csl::TensorView<T> scores, float, std::size_t background_label_id);

    template <class T>
    void consolidate_detections(const csl::Stream& stream, csl::TensorSpan<T> output,
        csl::TensorView<int> kept_indices, csl::TensorView<int> kept_count,
        csl::TensorView<T> decoded_bboxes, csl::TensorView<T> scores, bool share_location, csl::DevicePtr<int> num_detections);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_DETECTION_OUTPUT_HPP */
40
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/eltwise_activation.hpp
vendored
Normal file
@ -0,0 +1,40 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_ACTIVATION_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_ACTIVATION_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    /* output = activation(x + y) */

    template <class T>
    void eltwise_sum_2_relu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y, T slope);

    template <class T>
    void eltwise_sum_2_clipped_relu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y, T floor, T ceiling);

    template <class T>
    void eltwise_sum_2_tanh(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);

    template <class T>
    void eltwise_sum_2_swish(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);

    template <class T>
    void eltwise_sum_2_mish(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);

    template <class T>
    void eltwise_sum_2_sigmoid(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);

    template <class T>
    void eltwise_sum_2_power(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y, T exp, T scale, T shift);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_ACTIVATION_HPP */
35
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp
vendored
Normal file
@ -0,0 +1,35 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    template <class T>
    void eltwise_max_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);

    template <class T>
    void eltwise_min_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);

    template <class T>
    void eltwise_sum_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);

    template <class T>
    void eltwise_sum_coeff_2(const csl::Stream& stream, csl::TensorSpan<T> output, T coeff_x, csl::TensorView<T> x, T coeff_y, csl::TensorView<T> y);

    template <class T>
    void eltwise_prod_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);

    template <class T>
    void eltwise_div_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP */
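
These are element-wise binary kernels; `eltwise_sum_coeff_2`, for example, computes a weighted sum of its two operands. A one-loop CPU reference (illustrative helper, not library code):

#include <cstddef>
#include <vector>

/* output[i] = coeff_x * x[i] + coeff_y * y[i] */
template <class T>
void cpu_eltwise_sum_coeff_2(std::vector<T>& output, T coeff_x, const std::vector<T>& x,
                             T coeff_y, const std::vector<T>& y)
{
    for (std::size_t i = 0; i < output.size(); i++)
        output[i] = coeff_x * x[i] + coeff_y * y[i];
}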
21
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/fill_copy.hpp
vendored
Normal file
@ -0,0 +1,21 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_COPY_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_COPY_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    template <class T>
    void fill(const csl::Stream& stream, csl::Span<T> output, T value);

    template <class T>
    void copy(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_COPY_HPP */
18
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/fp_conversion.hpp
vendored
Normal file
@ -0,0 +1,18 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FP_CONVERSION_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FP_CONVERSION_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    void fp32_to_fp16(const csl::Stream& stream, csl::Span<half> output, csl::View<float> input);
    void fp16_to_fp32(const csl::Stream& stream, csl::Span<float> output, csl::View<half> input);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FP_CONVERSION_HPP */
21
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/grid_nms.hpp
vendored
Normal file
@ -0,0 +1,21 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_GRID_NMS_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_GRID_NMS_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"
#include "../csl/tensor.hpp"

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    std::size_t getGridNMSWorkspaceSizePerBatchItem(std::size_t num_classes, std::size_t classwise_topK);

    template <class T>
    void grid_nms(const csl::Stream& stream, csl::Span<unsigned int> workspace, csl::TensorSpan<int> indices, csl::TensorSpan<int> count, csl::TensorView<T> bboxes, int background_class_id, bool normalized_bbox, float nms_threshold);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_GRID_NMS_HPP */
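
The `PerBatchItem` suffix suggests that a caller sizes the workspace by multiplying the returned value by the batch size before invoking `grid_nms`. That reading is an inference from the name rather than a documented contract, so treat the helper below as a hypothetical sketch:

#include <cstddef>

/* Hypothetical sizing helper; the units are whatever the callee returns. */
std::size_t grid_nms_workspace_size(std::size_t batch_size, std::size_t num_classes,
                                    std::size_t classwise_topK)
{
    return batch_size * getGridNMSWorkspaceSizePerBatchItem(num_classes, classwise_topK);
}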
32
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/max_unpooling.hpp
vendored
Normal file
@ -0,0 +1,32 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MAX_UNPOOLING_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MAX_UNPOOLING_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>
#include <vector>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    template <class T>
    void max_pooling_with_indices(
        const csl::Stream& stream,
        csl::TensorSpan<T> output, csl::TensorSpan<T> indices, csl::TensorView<T> input,
        const std::vector<std::size_t>& kernel_size, const std::vector<std::size_t>& strides,
        const std::vector<std::size_t>& padding_left);

    template <class T>
    void max_unpooling(
        const csl::Stream& stream,
        csl::TensorSpan<T> output, csl::TensorView<T> input, csl::TensorView<T> indices,
        const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
        const std::vector<std::size_t>& padding_left);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MAX_UNPOOLING_HPP */
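
The two declarations form a pair: the pooling pass records, for each pooled output element, the location of the winning input element, and the unpooling pass scatters values back to those locations, leaving everything else at zero. A simplified CPU sketch of the scatter step using flat indices (the actual kernels keep indices in a tensor of the same element type `T`; this is an illustrative helper):

#include <algorithm>
#include <cstddef>
#include <vector>

template <class T>
void cpu_max_unpool(std::vector<T>& output, const std::vector<T>& input,
                    const std::vector<std::size_t>& indices)
{
    std::fill(output.begin(), output.end(), T(0));
    for (std::size_t i = 0; i < input.size(); i++)
        output[indices[i]] = input[i];   /* indices[i] addresses the output buffer */
}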
31
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/mvn.hpp
vendored
Normal file
@ -0,0 +1,31 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MVN_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MVN_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    template <class T>
    void reduce_mean(const csl::Stream& stream, csl::Span<float> means, csl::View<T> input, std::size_t inner_size);

    template <class T>
    void reduce_mean_sqr_sum(const csl::Stream& stream, csl::Span<float> means, csl::Span<float> sum_sqrs, csl::View<T> input, std::size_t inner_size);

    void compute_normalization_scale(const csl::Stream& stream, csl::Span<float> scale, csl::View<float> means, csl::View<float> sum_sqrs, std::size_t inner_size, float eps);

    template <class T>
    void normalize_mean(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, csl::View<float> means, std::size_t inner_size);

    template <class T>
    void normalize_mean_variance(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, csl::View<float> means, csl::View<float> scale, std::size_t inner_size);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MVN_HPP */
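
The five declarations implement mean-variance normalization as a pipeline: reduce each row to a mean (and mean of squares), convert those into a scale of 1/sqrt(variance + eps), then normalize. A CPU sketch of the whole pipeline for a single row of `inner_size` elements, assuming the usual E[x^2] - E[x]^2 variance (illustrative helper, not library code):

#include <cmath>
#include <cstddef>

void cpu_mvn_row(float* row, std::size_t inner_size, float eps)
{
    float mean = 0.f, sqr_mean = 0.f;
    for (std::size_t i = 0; i < inner_size; i++)
    {
        mean += row[i];
        sqr_mean += row[i] * row[i];
    }
    mean /= inner_size;
    sqr_mean /= inner_size;

    const float variance = sqr_mean - mean * mean;
    const float scale = 1.f / std::sqrt(variance + eps);

    for (std::size_t i = 0; i < inner_size; i++)
        row[i] = (row[i] - mean) * scale;
}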
24
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/normalize.hpp
vendored
Normal file
@ -0,0 +1,24 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_NORMALIZE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_NORMALIZE_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    template <class T>
    void normalize(
        const csl::Stream& stream,
        csl::Span<T> output, csl::View<T> input,
        std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, std::size_t norm, T epsilon,
        csl::Span<T> workspace);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_NORMALIZE_HPP */
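
A CPU sketch of the per-vector semantics, assuming `norm` selects the p of an Lp norm (p = 1 or 2) and that `epsilon` guards against division by zero; whether the kernel floors the norm at epsilon or adds it is an implementation detail, so the floor shown here is only an illustration:

#include <algorithm>
#include <cmath>
#include <cstddef>

void cpu_lp_normalize(float* v, std::size_t n, std::size_t p, float epsilon)
{
    float norm = 0.f;
    for (std::size_t i = 0; i < n; i++)
        norm += (p == 1) ? std::fabs(v[i]) : v[i] * v[i];
    if (p == 2)
        norm = std::sqrt(norm);
    norm = std::max(norm, epsilon);

    for (std::size_t i = 0; i < n; i++)
        v[i] /= norm;
}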
25
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/padding.hpp
vendored
Normal file
@ -0,0 +1,25 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PADDING_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PADDING_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>
#include <vector>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    template <class T>
    void copy_with_reflection101(
        const csl::Stream& stream,
        csl::TensorSpan<T> output, csl::TensorView<T> input,
        std::vector<std::pair<std::size_t, std::size_t>> ranges);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PADDING_HPP */
24
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/permute.hpp
vendored
Normal file
@ -0,0 +1,24 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PERMUTE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PERMUTE_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>
#include <vector>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    template <class T>
    void permute(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, std::vector<std::size_t> order);

    template <class T>
    void transpose(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, std::size_t in_width, std::size_t out_width);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PERMUTE_HPP */
28
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/prior_box.hpp
vendored
Normal file
@ -0,0 +1,28 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PRIOR_BOX_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PRIOR_BOX_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    template <class T>
    void generate_prior_boxes(
        const csl::Stream& stream,
        csl::Span<T> output,
        csl::View<float> boxWidth, csl::View<float> boxHeight, csl::View<float> offsetX, csl::View<float> offsetY, float stepX, float stepY,
        std::vector<float> variance,
        std::size_t numPriors,
        std::size_t layerWidth, std::size_t layerHeight,
        std::size_t imageWidth, std::size_t imageHeight,
        bool normalize, bool clip);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PRIOR_BOX_HPP */
25
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/region.hpp
vendored
Normal file
@ -0,0 +1,25 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_REGION_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_REGION_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    template <class T>
    void region(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, csl::View<T> bias,
        T object_prob_cutoff, T class_prob_cutoff,
        std::size_t boxes_per_cell, std::size_t box_size,
        std::size_t rows, std::size_t cols, T scale_x_y,
        std::size_t height_norm, std::size_t width_norm,
        bool if_true_sigmoid_else_softmax, bool new_coords);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_REGION_HPP */
21
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/resize.hpp
vendored
Normal file
@ -0,0 +1,21 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_RESIZE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_RESIZE_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    template <class T>
    void resize_nn(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, float scale_y, float scale_x, bool round, bool half_pixel_centers);

    template <class T>
    void resize_bilinear(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, float scale_y, float scale_x, bool half_pixel_centers);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_RESIZE_HPP */
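
The `half_pixel_centers` flag selects between the two standard coordinate mappings used by resize operators: with half-pixel centers, sample points sit at pixel centers (the TensorFlow/ONNX "half_pixel" convention); without it, coordinates are scaled directly. A sketch, assuming `scale` maps output coordinates to input coordinates (the precise convention of the `scale_x`/`scale_y` arguments is not spelled out in this header):

#include <cstddef>

float source_coordinate(std::size_t dst, float scale, bool half_pixel_centers)
{
    return half_pixel_centers ? (static_cast<float>(dst) + 0.5f) * scale - 0.5f
                              : static_cast<float>(dst) * scale;
}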
19
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/roi_pooling.hpp
vendored
Normal file
@ -0,0 +1,19 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ROI_POOLING_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ROI_POOLING_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"
#include "../csl/span.hpp"

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    template <class T>
    void roi_pooling(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, csl::View<T> rois, float spatial_scale);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ROI_POOLING_HPP */
39
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/scale_shift.hpp
vendored
Normal file
@ -0,0 +1,39 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SCALE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SCALE_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    template <class T>
    void biasN(const csl::Stream& stream,
        csl::TensorSpan<T> output,
        csl::TensorView<T> input, std::size_t inner_size,
        csl::TensorView<T> bias);

    template <class T>
    void scaleN(const csl::Stream& stream,
        csl::TensorSpan<T> output,
        csl::TensorView<T> input, std::size_t inner_size,
        csl::TensorView<T> weights);

    template <class T>
    void scale1_with_bias1(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T alpha, T beta);

    template <class T>
    void scaleN_with_biasN(
        const csl::Stream& stream,
        csl::TensorSpan<T> output,
        csl::TensorView<T> input, std::size_t inner_size,
        csl::TensorView<T> weights, csl::TensorView<T> bias);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SCALE_HPP */
18
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/shortcut.hpp
vendored
Normal file
@ -0,0 +1,18 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SHORTCUT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SHORTCUT_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    template <class T>
    void input_shortcut(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, csl::TensorView<T> from);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SHORTCUT_HPP */
22
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/slice.hpp
vendored
Normal file
@ -0,0 +1,22 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SLICE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SLICE_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    template <class T>
    void slice(const csl::Stream& stream,
        csl::TensorSpan<T> output, csl::TensorView<T> input,
        std::vector<std::size_t> offsets);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SLICE_HPP */
376
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/activation.hpp
vendored
Normal file
@ -0,0 +1,376 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ACTIVATION_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ACTIVATION_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include "../kernels/activations.hpp"

#include <opencv2/core.hpp>

#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    template <class T>
    class ReLUOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        ReLUOp(csl::Stream stream_, T slope_)
            : stream(std::move(stream_)), slope{ slope_ } { }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            for (int i = 0; i < inputs.size(); i++)
            {
                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                auto input = input_wrapper->getView();

                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output = output_wrapper->getSpan();

                kernels::relu<T>(stream, output, input, slope);
            }
        }

    private:
        csl::Stream stream;
        const T slope;
    };

    template <class T>
    class ClippedReLUOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        ClippedReLUOp(csl::Stream stream_, T min_, T max_)
            : stream(std::move(stream_)), min{ min_ }, max{ max_ } { }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            for (int i = 0; i < inputs.size(); i++)
            {
                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                auto input = input_wrapper->getView();

                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output = output_wrapper->getSpan();

                kernels::clipped_relu<T>(stream, output, input, min, max);
            }
        }

    private:
        csl::Stream stream;
        const T min, max;
    };

    template <class T>
    class ChannelwiseReLUOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        ChannelwiseReLUOp(csl::Stream stream_, const Mat& slope)
            : stream(std::move(stream_))
        {
            CV_Assert(!slope.empty());
            slopeTensor = csl::makeTensorHeader<T>(slope);
            csl::copyMatToTensor<T>(slope, slopeTensor, stream);
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            for (int i = 0; i < inputs.size(); i++)
            {
                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                auto input = input_wrapper->getView();

                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output = output_wrapper->getSpan();

                CV_Assert(input.get_axis_size(1) == slopeTensor.size());
                std::size_t inner_size = input.size_range(2, input.rank());
                kernels::axiswise_relu<T>(stream, output, input, inner_size, slopeTensor);
            }
        }

    private:
        csl::Stream stream;
        csl::Tensor<T> slopeTensor;
    };

    template <class T>
    class TanHOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        TanHOp(csl::Stream stream_) : stream(std::move(stream_)) { }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            for (int i = 0; i < inputs.size(); i++)
            {
                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                auto input = input_wrapper->getView();

                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output = output_wrapper->getSpan();

                kernels::tanh<T>(stream, output, input);
            }
        }

    private:
        csl::Stream stream;
    };

    template <class T>
    class SwishOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        SwishOp(csl::Stream stream_) : stream(std::move(stream_)) { }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            for (int i = 0; i < inputs.size(); i++)
            {
                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                auto input = input_wrapper->getView();

                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output = output_wrapper->getSpan();

                kernels::swish<T>(stream, output, input);
            }
        }

    private:
        csl::Stream stream;
    };

    template <class T>
    class MishOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        MishOp(csl::Stream stream_) : stream(std::move(stream_)) { }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            for (int i = 0; i < inputs.size(); i++)
            {
                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                auto input = input_wrapper->getView();

                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output = output_wrapper->getSpan();

                kernels::mish<T>(stream, output, input);
            }
        }

    private:
        csl::Stream stream;
    };

    template <class T>
    class SigmoidOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        SigmoidOp(csl::Stream stream_) : stream(std::move(stream_)) { }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            for (int i = 0; i < inputs.size(); i++)
            {
                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                auto input = input_wrapper->getView();

                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output = output_wrapper->getSpan();

                kernels::sigmoid<T>(stream, output, input);
            }
        }

    private:
        csl::Stream stream;
    };

    template <class T>
    class ELUOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        ELUOp(csl::Stream stream_) : stream(std::move(stream_)) { }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            for (int i = 0; i < inputs.size(); i++)
            {
                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                auto input = input_wrapper->getView();

                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output = output_wrapper->getSpan();

                kernels::elu<T>(stream, output, input);
            }
        }

    private:
        csl::Stream stream;
    };

    template <class T>
    class AbsValOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        AbsValOp(csl::Stream stream_) : stream(std::move(stream_)) { }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            for (int i = 0; i < inputs.size(); i++)
            {
                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                auto input = input_wrapper->getView();

                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output = output_wrapper->getSpan();

                kernels::abs<T>(stream, output, input);
            }
        }

    private:
        csl::Stream stream;
    };

    template <class T>
    class BNLLOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        BNLLOp(csl::Stream stream_) : stream(std::move(stream_)) { }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            for (int i = 0; i < inputs.size(); i++)
            {
                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                auto input = input_wrapper->getView();

                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output = output_wrapper->getSpan();

                kernels::bnll<T>(stream, output, input);
            }
        }

    private:
        csl::Stream stream;
    };

    template <class T>
    class PowerOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        PowerOp(csl::Stream stream_, T exp_, T scale_, T shift_)
            : stream(std::move(stream_)), exp{ exp_ }, scale{ scale_ }, shift{ shift_ } { }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            for (int i = 0; i < inputs.size(); i++)
            {
                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                auto input = input_wrapper->getView();

                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output = output_wrapper->getSpan();

                kernels::power<T>(stream, output, input, exp, scale, shift);
            }
        }

    private:
        csl::Stream stream;
        const T exp, scale, shift;
    };

    template <class T>
    class ExpOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        ExpOp(csl::Stream stream_, T nScale_, T nShift_)
            : stream(std::move(stream_)), normScale{ nScale_ }, normShift{ nShift_ } { }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            for (int i = 0; i < inputs.size(); i++)
            {
                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                auto input = input_wrapper->getView();

                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output = output_wrapper->getSpan();

                kernels::exp<T>(stream, output, input, normScale, normShift);
            }
        }

    private:
        csl::Stream stream;
        const T normScale, normShift;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ACTIVATION_HPP */
58
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/batch_norm.hpp
vendored
Normal file
@ -0,0 +1,58 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_BATCH_NORM_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_BATCH_NORM_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include "../kernels/scale_shift.hpp"

#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    template <class T>
    class BatchNormOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        BatchNormOp(csl::Stream stream_, const cv::Mat& weights, const cv::Mat& bias)
            : stream(std::move(stream_))
        {
            biasTensor = csl::makeTensorHeader<T>(bias);
            csl::copyMatToTensor<T>(bias, biasTensor, stream);

            weightsTensor = csl::makeTensorHeader<T>(weights);
            csl::copyMatToTensor<T>(weights, weightsTensor, stream);
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(inputs.size() == 1 && outputs.size() == 1);

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            std::size_t inner_size = input.size_range(2, input.rank());
            kernels::scaleN_with_biasN<T>(stream, output, input, inner_size, weightsTensor, biasTensor);
        }

    private:
        csl::Stream stream;
        csl::Tensor<T> weightsTensor, biasTensor;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_BATCH_NORM_HPP */
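
A note on why a batch-norm node reduces to `scaleN_with_biasN`: per channel, gamma * (x - mean) / sqrt(var + eps) + beta rewrites to w * x + b with w = gamma / sqrt(var + eps) and b = beta - mean * w. The `weights` and `bias` Mats passed to the constructor are expected to carry these already-folded values; the sketch below only illustrates the folding arithmetic and is not library code:

#include <cmath>
#include <cstddef>
#include <vector>

void fold_batch_norm(const std::vector<float>& gamma, const std::vector<float>& beta,
                     const std::vector<float>& mean, const std::vector<float>& var,
                     float eps, std::vector<float>& w, std::vector<float>& b)
{
    w.resize(gamma.size());
    b.resize(gamma.size());
    for (std::size_t c = 0; c < gamma.size(); c++)
    {
        w[c] = gamma[c] / std::sqrt(var[c] + eps);   /* folded scale */
        b[c] = beta[c] - mean[c] * w[c];             /* folded bias  */
    }
}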
90
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/concat.hpp
vendored
Normal file
@ -0,0 +1,90 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONCAT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONCAT_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/pointer.hpp"

#include "../kernels/fill_copy.hpp"
#include "../kernels/concat.hpp"

#include <opencv2/core.hpp>

#include <cstddef>
#include <vector>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    template <class T>
    class ConcatOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        ConcatOp(csl::Stream stream_, std::size_t concat_axis, bool zero_padding)
            : stream(std::move(stream_)), concat_axis{ concat_axis }, zero_padding{ zero_padding }
        {
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(outputs.size() == 1);

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            if(zero_padding)
            {
                auto output_shape = output_wrapper->getShape();

                kernels::fill<T>(stream, output, 0.0);

                std::size_t output_concat_axis_offset = 0;
                for (int i = 0; i < inputs.size(); i++)
                {
                    auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                    auto input = input_wrapper->getView();
                    auto input_shape = input_wrapper->getShape();

                    std::vector<std::size_t> offsets(input_shape.size());
                    for (int j = 0; j < offsets.size(); j++)
                        offsets[j] = (output_shape[j] - input_shape[j]) / 2;
                    offsets[concat_axis] = output_concat_axis_offset;

                    kernels::concat_with_offsets(stream, output, input, offsets);

                    output_concat_axis_offset += input.get_axis_size(concat_axis);
                }
            }
            else
            {
                std::size_t output_axis_offset = 0;
                for (int i = 0; i < inputs.size(); i++)
                {
                    auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                    auto input = input_wrapper->getView();

                    kernels::concat(stream, output, output_axis_offset, input, concat_axis);

                    output_axis_offset += input.get_axis_size(concat_axis);
                }
            }
        }

    private:
        csl::Stream stream;
        std::size_t concat_axis;
        bool zero_padding;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONCAT_HPP */
51
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/const.hpp
vendored
Normal file
@ -0,0 +1,51 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONST_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONST_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"
#include "../csl/tensor_ops.hpp"

#include <opencv2/core.hpp>

#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    template <class T>
    class ConstOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        ConstOp(csl::Stream stream_, const cv::Mat& data)
            : stream(std::move(stream_))
        {
            constTensor = csl::makeTensorHeader<T>(data);
            csl::copyMatToTensor<T>(data, constTensor, stream);
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(outputs.size() == 1 && inputs.size() == 0);

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();
            csl::tensor_ops::copy<T>(stream, output, constTensor);
        }

    private:
        csl::Stream stream;
        csl::Tensor<T> constTensor;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONST_HPP */
608
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/convolution.hpp
vendored
Normal file
@ -0,0 +1,608 @@
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP
|
||||
|
||||
#include "../../op_cuda.hpp"
|
||||
|
||||
#include "../csl/cudnn.hpp"
|
||||
#include "../csl/stream.hpp"
|
||||
#include "../csl/tensor.hpp"
|
||||
#include "../csl/tensor_ops.hpp"
|
||||
|
||||
#include "../kernels/scale_shift.hpp"
|
||||
#include "../kernels/activations.hpp"
|
||||
#include "../kernels/activation_eltwise.hpp"
|
||||
#include "../kernels/bias_activation.hpp"
|
||||
#include "../kernels/bias_eltwise_activation.hpp"
|
||||
#include "../kernels/bias_activation_eltwise.hpp"
|
||||
#include "../kernels/activation_eltwise.hpp"
|
||||
#include "../kernels/eltwise_activation.hpp"
|
||||
#include "../kernels/eltwise_ops.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <algorithm>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn {
|
||||
|
||||
struct ConvolutionConfiguration {
|
||||
/* the size of the following vectors must be equal to the kernel size */
|
||||
std::vector<std::size_t> kernel_size;
|
||||
std::vector<std::size_t> dilations, strides;
|
||||
|
||||
enum class PaddingMode {
|
||||
MANUAL, /* uses explicit padding values provided in `pads_begin` and `pads_end` */
|
||||
VALID, /* no padding is added */
|
||||
SAME /* TensorFlow logic is used for same padding */
|
||||
};
|
||||
|
||||
/* explicit paddings are used if and only if padMode is set to manual */
|
||||
PaddingMode padMode;
|
||||
std::vector<std::size_t> pads_begin, pads_end;
|
||||
|
||||
/* full shape inclusive of channel and batch axis */
|
||||
std::vector<std::size_t> input_shape;
|
||||
std::vector<std::size_t> output_shape;
|
||||
|
||||
/* group count for grouped convolution */
|
||||
std::size_t groups;
|
||||
|
||||
enum class FusionMode {
|
||||
NONE,
|
||||
ACTIVATION, /* act(conv) */
|
||||
ELTWISE_SUM, /* eltwise + conv */ /* eltwise tensor is passed as second input to forward */
|
||||
ELTWISE_SUM_THEN_ACTIVATION, /* act(conv + eltwise) */
|
||||
ACTIVATION_THEN_ELTWISE_SUM, /* act(conv) + eltwise */
|
||||
};
|
||||
|
||||
FusionMode fusion_mode;
|
||||
|
||||
enum class ActivationType {
|
||||
IDENTITY,
|
||||
RELU, /* uses value provided in `relu_negative_slope` */
|
||||
CLIPPED_RELU, /* uses values provided in `crelu_floor` and `crelu_ceil` */
|
||||
POWER,
|
||||
TANH,
|
||||
SIGMOID,
|
||||
SWISH,
|
||||
MISH
|
||||
};
|
||||
|
||||
ActivationType activation_type;
|
||||
float relu_negative_slope, crelu_floor, crelu_ceil;
|
||||
float power_exp, power_scale, power_shift;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
class ConvolutionOp final : public CUDABackendNode {
|
||||
public:
|
||||
using wrapper_type = GetCUDABackendWrapperType<T>;
|
||||
|
||||
ConvolutionOp(csl::Stream stream_, csl::cudnn::Handle handle_, const ConvolutionConfiguration& config, const Mat& filters, const Mat& bias)
|
||||
: stream(std::move(stream_)), cudnnHandle(std::move(handle_))
|
||||
{
|
||||
const auto& kernel_size = config.kernel_size;
|
||||
const auto& dilations = config.dilations;
|
||||
const auto& strides = config.strides;
|
||||
|
||||
const auto convolution_order = kernel_size.size();
|
||||
CV_Assert(convolution_order == dilations.size());
|
||||
CV_Assert(convolution_order == strides.size());
|
||||
|
||||
const auto& input_shape = config.input_shape;
|
||||
const auto& output_shape = config.output_shape;
|
||||
CV_Assert(input_shape.size() == output_shape.size());
|
||||
CV_Assert(input_shape.size() == convolution_order + 2);
|
||||
|
||||
const auto groups = config.groups;
|
||||
|
||||
CV_Assert (1 <= convolution_order && convolution_order <= 3);
|
||||
|
||||
const auto rank = input_shape.size();
|
||||
const auto output_feature_maps = output_shape[1];
|
||||
const auto input_feature_maps = input_shape[1];
|
||||
const auto input_feature_maps_per_group = input_feature_maps / groups;
|
||||
CV_Assert(input_feature_maps % groups == 0);
|
||||
|
||||
filtersTensor = csl::makeTensorHeader<T>(filters);
|
||||
csl::copyMatToTensor<T>(filters, filtersTensor, stream);
|
||||
|
||||
if (!bias.empty())
|
||||
{
|
||||
biasTensor = csl::makeTensorHeader<T>(bias);
|
||||
csl::copyMatToTensor<T>(bias, biasTensor, stream);
|
||||
}
|
||||
|
||||
            /* `left` and `right` are misleading as the padding is applicable for any number of dimensions,
             * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end`
             *
             * `common_padding` contains the amount of padding that has to be added to both sides
             * `padding_left` and `padding_right` contain the amount of padding that needs to be added
             * to a particular side in addition to the common padding
             */
            std::vector<std::size_t> common_padding(rank, 0);
            std::vector<std::size_t> padding_left(rank, 0), padding_right(rank, 0);
            if (config.padMode == ConvolutionConfiguration::PaddingMode::MANUAL)
            {
                const auto& pads_begin = config.pads_begin;
                const auto& pads_end = config.pads_end;

                CV_Assert(convolution_order == pads_begin.size());
                CV_Assert(convolution_order == pads_end.size());

                for (int i = 2; i < common_padding.size(); i++)
                {
                    common_padding[i] = std::min(pads_begin[i - 2], pads_end[i - 2]);
                    padding_left[i] = pads_begin[i - 2] - common_padding[i];
                    padding_right[i] = pads_end[i - 2] - common_padding[i];
                }
            }
            else if (config.padMode == ConvolutionConfiguration::PaddingMode::VALID)
            {
                /* nothing to do as the paddings are already preset to zero */
            }
            else if (config.padMode == ConvolutionConfiguration::PaddingMode::SAME)
            {
                /* TensorFlow Logic:
                 * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i]
                 *
                 * if total padding is odd, the extra is added towards the end
                 */
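                /* illustrative example (values assumed, not from the source): i = 7, k = 3, s = 2, d = 1
                 * gives effective_k = 3, o = 4, total_padding = (4 - 1) * 2 + 3 - 7 = 2,
                 * i.e. one unit of common padding on each side and no extra at the end
                 */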
                for (int i = 2; i < rank; i++)
                {
                    const auto j = i - 2; /* filter index */
                    const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
                    const auto required_total_padding =
                        std::max<std::int64_t>(0, (output_shape[i] - 1) * strides[j] + effective_kernel_size - input_shape[i]);

                    common_padding[i] = required_total_padding / 2;
                    padding_left[i] = 0;
                    padding_right[i] = required_total_padding % 2;
                }
            }

            /* in some scenarios, the extra padding at the end may not change the output at all */
            for (int i = 2; i < rank; i++) {
                const auto j = i - 2; /* filter idx */
                const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i];
                const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
                std::int64_t rem = (input_shape[i] + total_padding - effective_kernel_size) % strides[j];

                /* the output shape doesn't change if we decrease the total padding by at most `rem`
                 * provided that we decrease from the right
                 */
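                /* e.g. (assumed values) total_padding = 3, padding_right = 1, rem = 1:
                 * dropping the trailing unit leaves the strided quotient, and hence the output size, unchanged
                 */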
                if (rem && padding_right[i] > 0)
                    padding_right[i] = std::max<std::int64_t>(0, padding_right[i] - rem);
            }

            auto is_not_zero = [](std::size_t i) { return i != 0; };
            if (std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) ||
                std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero))
            {
                /* csl::Convolution supports symmetric padding only; hence, we deal with asymmetric padding by
                 * copying the input to a bigger tensor and padding the ends manually
                 */
                transformed_shape = input_shape;
                for (int i = 0; i < rank; i++)
                    transformed_shape[i] += padding_left[i] + padding_right[i];

                inputTransformer = csl::TensorTransform<T>(cudnnHandle, padding_left, padding_right);
            }

            typename csl::Convolution<T>::params_type params;
            if (transformed_shape.empty())
            {
                params.input_shape.assign(std::begin(input_shape), std::end(input_shape));
            }
            else
            {
                /* the convolution operation will be seeing the transformed input */
                params.input_shape.assign(std::begin(transformed_shape), std::end(transformed_shape));
            }

            auto& fshape = params.filter_shape;
            fshape.resize(rank);
            fshape[0] = output_feature_maps;
            fshape[1] = input_feature_maps_per_group;
            std::copy(std::begin(kernel_size), std::end(kernel_size), std::begin(fshape) + 2);
            CV_Assert(fshape.size() == kernel_size.size() + 2);

            params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding));
            params.stride = strides;
            params.dilation = dilations;
            params.groups = config.groups;

            fusion_mode = config.fusion_mode;
            activation = config.activation_type;
            relu_negative_slope = config.relu_negative_slope;
            crelu_floor = config.crelu_floor;
            crelu_ceil = config.crelu_ceil;
            power_exp = config.power_exp;
            power_scale = config.power_scale;
            power_shift = config.power_shift;

            /* we normally use cuDNN for convolution and perform bias, activation and eltwise ops ourselves
             * hence, the activation for cuDNN is IDENTITY by default
             */
            fusion_location = InternalFusionLocation::NATIVE; /* i.e. we perform bias, act and eltwise */
            params.eltwise = false;
            params.activation_type = csl::Convolution<T>::ActivationType::IDENTITY;

            /* cuDNN can fuse the operations with convolution in some cases; try if it's possible */
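            /* note: the constant `0` in the condition below keeps this cuDNN fusion path disabled,
             * so the condition can never be satisfied as written
             */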
            if (!biasTensor.empty() && 0 &&
                biasTensor.size() == output_feature_maps && /* cuDNN requirement */
                activation == ConvolutionConfiguration::ActivationType::RELU && /* cuDNN requirement */
                relu_negative_slope == 0.0 && /* cuDNN requirement */
                (fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION || /* act(conv + bias) */
                 fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION) /* act(conv + bias + eltwise) */
               )
            {
                bool do_not_fuse = false;
                if (std::is_same<T, half>::value)
                {
                    /* performance degrades if fused with tensor core based convolutions in most cases */
                    int device;
                    CUDA4DNN_CHECK_CUDA(cudaGetDevice(&device));

                    int cc_major;
                    CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device));

                    if (cc_major >= 7)
                        do_not_fuse = true;
                }

                if (!do_not_fuse)
                {
                    fusion_location = InternalFusionLocation::CUDNN;
                    auto bias_shape = std::vector<std::size_t>(rank, 1);
                    bias_shape[1] = output_feature_maps;
                    params.bias_shape = bias_shape;

                    if (config.fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION)
                        params.eltwise = true;

                    params.activation_type = csl::Convolution<T>::ActivationType::RELU;
                }
            }

            convoluter = csl::Convolution<T>(cudnnHandle, params);

            csl::WorkspaceBuilder builder;
            if (!transformed_shape.empty())
            {
                auto& shape = transformed_shape;
                /* accumulate with a std::size_t seed so the product does not narrow to int */
                auto sz = std::accumulate(std::begin(shape), std::end(shape), std::size_t(1), std::multiplies<std::size_t>());
                builder.require<T>(sz);
            }
            builder.require(convoluter.get_workspace_size());
            scratch_mem_in_bytes = builder.required_workspace_size();
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            /* input[0] = conv input, input[1] = bias (from a fused eltwise layer) */
            CV_Assert(inputs.size() == 1 || inputs.size() == 2);
            CV_Assert(outputs.size() == 1);

            csl::WorkspaceAllocator allocator(workspace);

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            if (!transformed_shape.empty())
            {
                auto& shape = transformed_shape;
                auto transformed_input = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
                inputTransformer.transform(input, transformed_input);
                input = transformed_input;
            }

            auto conv_scratchpad = allocator.get_instance();

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            if (fusion_location == InternalFusionLocation::CUDNN)
            {
                try
                {
                    if (fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION)
                        convoluter.convolve_with_bias_activation(output, input, filtersTensor, biasTensor, conv_scratchpad);
                    else if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION)
                    {
                        auto eltwise_wrapper = inputs[1].dynamicCast<wrapper_type>();
                        auto eltwise = eltwise_wrapper->getView();
                        CV_Assert(is_shape_same(eltwise, output));

                        convoluter.convolve_with_bias_eltwise_activation(output, input, filtersTensor, biasTensor, eltwise, conv_scratchpad);
                    }
                }
                catch (const csl::cudnn::cuDNNException& ex)
                {
                    if (ex.getCUDNNStatus() == CUDNN_STATUS_NOT_SUPPORTED)
                    {
                        /* drop cuDNN fusion and use the native fusion path */
                        fusion_location = InternalFusionLocation::NATIVE;
                    }
                    else
                        throw;
                }
            }

            if (fusion_location == InternalFusionLocation::NATIVE)
            {
                convoluter.convolve(output, input, filtersTensor, conv_scratchpad);

                if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM ||
                    fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION ||
                    fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM)
                {
                    CV_Assert(inputs.size() == 2);
                }

                if (!biasTensor.empty() && inputs.size() == 2)
                {
                    /* bias and eltwise */
                    CV_Assert(fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM ||
                              fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION ||
                              fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM);

                    auto eltwise_wrapper = inputs[1].dynamicCast<wrapper_type>();
                    auto eltwise = eltwise_wrapper->getView();
                    CV_Assert(is_shape_same(eltwise, output));

                    std::size_t inner_size = output.size_range(2, output.rank());
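                    /* inner_size spans the dimensions after the channel axis, e.g. H * W for an NCHW output */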

                    if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM)
                    {
                        kernels::biasN_eltwise_sum_2_identity_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
                    }
                    else if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION)
                    {
                        /* activation(conv + bias + eltwise) */
                        switch (activation)
                        {
                        case ConvolutionConfiguration::ActivationType::IDENTITY:
                            kernels::biasN_eltwise_sum_2_identity_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
                            break;
                        case ConvolutionConfiguration::ActivationType::RELU:
                            kernels::biasN_eltwise_sum_2_relu_inplace<T>(stream, output, inner_size, biasTensor, eltwise, relu_negative_slope);
                            break;
                        case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
                            kernels::biasN_eltwise_sum_2_clipped_relu_inplace<T>(stream, output, inner_size, biasTensor, eltwise, crelu_floor, crelu_ceil);
                            break;
                        case ConvolutionConfiguration::ActivationType::POWER:
                            kernels::biasN_eltwise_sum_2_power_inplace<T>(stream, output, inner_size, biasTensor, eltwise, power_exp, power_scale, power_shift);
                            break;
                        case ConvolutionConfiguration::ActivationType::TANH:
                            kernels::biasN_eltwise_sum_2_tanh_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
                            break;
                        case ConvolutionConfiguration::ActivationType::SIGMOID:
                            kernels::biasN_eltwise_sum_2_sigmoid_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
                            break;
                        case ConvolutionConfiguration::ActivationType::SWISH:
                            kernels::biasN_eltwise_sum_2_swish_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
                            break;
                        case ConvolutionConfiguration::ActivationType::MISH:
                            kernels::biasN_eltwise_sum_2_mish_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
                            break;
                        }
                    }
                    else if (fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM)
                    {
                        /* activation(conv + bias) + eltwise */
                        switch (activation)
                        {
                        case ConvolutionConfiguration::ActivationType::IDENTITY:
                            kernels::biasN_eltwise_sum_2_identity_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
                            break;
                        case ConvolutionConfiguration::ActivationType::RELU:
                            kernels::biasN_relu_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise, relu_negative_slope);
                            break;
                        case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
                            kernels::biasN_clipped_relu_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise, crelu_floor, crelu_ceil);
                            break;
                        case ConvolutionConfiguration::ActivationType::POWER:
                            kernels::biasN_power_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise, power_exp, power_scale, power_shift);
                            break;
                        case ConvolutionConfiguration::ActivationType::TANH:
                            kernels::biasN_tanh_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
                            break;
                        case ConvolutionConfiguration::ActivationType::SIGMOID:
                            kernels::biasN_sigmoid_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
                            break;
                        case ConvolutionConfiguration::ActivationType::SWISH:
                            kernels::biasN_swish_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
                            break;
                        case ConvolutionConfiguration::ActivationType::MISH:
                            kernels::biasN_mish_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
                            break;
                        }
                    }
                }
                else if (!biasTensor.empty() && inputs.size() == 1)
                {
                    /* bias but no eltwise */
                    CV_Assert(fusion_mode == ConvolutionConfiguration::FusionMode::NONE ||
                              fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION);

                    std::size_t inner_size = output.size_range(2, output.rank());
                    switch (activation)
                    {
                    case ConvolutionConfiguration::ActivationType::IDENTITY:
                        kernels::biasN<T>(stream, output, output, inner_size, biasTensor);
                        break;
                    case ConvolutionConfiguration::ActivationType::RELU:
                        kernels::biasN_relu_inplace<T>(stream, output, inner_size, biasTensor, relu_negative_slope);
                        break;
                    case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
                        kernels::biasN_clipped_relu_inplace<T>(stream, output, inner_size, biasTensor, crelu_floor, crelu_ceil);
                        break;
                    case ConvolutionConfiguration::ActivationType::POWER:
                        kernels::biasN_power_inplace<T>(stream, output, inner_size, biasTensor, power_exp, power_scale, power_shift);
                        break;
                    case ConvolutionConfiguration::ActivationType::TANH:
                        kernels::biasN_tanh_inplace<T>(stream, output, inner_size, biasTensor);
                        break;
                    case ConvolutionConfiguration::ActivationType::SIGMOID:
                        kernels::biasN_sigmoid_inplace<T>(stream, output, inner_size, biasTensor);
                        break;
                    case ConvolutionConfiguration::ActivationType::SWISH:
                        kernels::biasN_swish_inplace<T>(stream, output, inner_size, biasTensor);
                        break;
                    case ConvolutionConfiguration::ActivationType::MISH:
                        kernels::biasN_mish_inplace<T>(stream, output, inner_size, biasTensor);
                        break;
                    }
                }
                else if (biasTensor.empty() && inputs.size() == 2)
                {
                    /* no bias but eltwise */
                    CV_Assert(fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM ||
                              fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION ||
                              fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM);

                    auto eltwise_wrapper = inputs[1].dynamicCast<wrapper_type>();
                    auto eltwise = eltwise_wrapper->getView();
                    CV_Assert(is_shape_same(eltwise, output));

                    /* we pass `eltwise` as `bias` (with `inner_size` as one) to bias-activation kernels */

                    if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM)
                    {
                        kernels::eltwise_sum_2<T>(stream, output, output, eltwise);
                    }
                    else if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION)
                    {
                        switch (activation)
                        {
                        case ConvolutionConfiguration::ActivationType::IDENTITY:
                            kernels::eltwise_sum_2<T>(stream, output, output, eltwise);
                            break;
                        case ConvolutionConfiguration::ActivationType::RELU:
                            kernels::eltwise_sum_2_relu<T>(stream, output, output, eltwise, relu_negative_slope);
                            break;
                        case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
                            kernels::eltwise_sum_2_clipped_relu<T>(stream, output, output, eltwise, crelu_floor, crelu_ceil);
                            break;
                        case ConvolutionConfiguration::ActivationType::POWER:
                            kernels::eltwise_sum_2_power<T>(stream, output, output, eltwise, power_exp, power_scale, power_shift);
                            break;
                        case ConvolutionConfiguration::ActivationType::TANH:
                            kernels::eltwise_sum_2_tanh<T>(stream, output, output, eltwise);
                            break;
                        case ConvolutionConfiguration::ActivationType::SIGMOID:
                            kernels::eltwise_sum_2_sigmoid<T>(stream, output, output, eltwise);
                            break;
                        case ConvolutionConfiguration::ActivationType::SWISH:
                            kernels::eltwise_sum_2_swish<T>(stream, output, output, eltwise);
                            break;
                        case ConvolutionConfiguration::ActivationType::MISH:
                            kernels::eltwise_sum_2_mish<T>(stream, output, output, eltwise);
                            break;
                        }
                    }
                    else if (fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM)
                    {
                        switch (activation)
                        {
                        case ConvolutionConfiguration::ActivationType::IDENTITY:
                            kernels::eltwise_sum_2<T>(stream, output, output, eltwise);
                            break;
                        case ConvolutionConfiguration::ActivationType::RELU:
                            kernels::relu_eltwise_sum_2_inplace<T>(stream, output, eltwise, relu_negative_slope);
                            break;
                        case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
                            kernels::clipped_relu_eltwise_sum_2_inplace<T>(stream, output, eltwise, crelu_floor, crelu_ceil);
                            break;
                        case ConvolutionConfiguration::ActivationType::POWER:
                            kernels::power_eltwise_sum_2_inplace<T>(stream, output, eltwise, power_exp, power_scale, power_shift);
                            break;
                        case ConvolutionConfiguration::ActivationType::TANH:
                            kernels::tanh_eltwise_sum_2_inplace<T>(stream, output, eltwise);
                            break;
                        case ConvolutionConfiguration::ActivationType::SIGMOID:
                            kernels::sigmoid_eltwise_sum_2_inplace<T>(stream, output, eltwise);
                            break;
                        case ConvolutionConfiguration::ActivationType::SWISH:
                            kernels::swish_eltwise_sum_2_inplace<T>(stream, output, eltwise);
                            break;
                        case ConvolutionConfiguration::ActivationType::MISH:
                            kernels::mish_eltwise_sum_2_inplace<T>(stream, output, eltwise);
                            break;
                        }
                    }
                }
                else if (biasTensor.empty() && inputs.size() == 1)
                {
                    /* no bias and no eltwise */
                    CV_Assert(fusion_mode == ConvolutionConfiguration::FusionMode::NONE ||
                              fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION);

                    switch (activation)
                    {
                    case ConvolutionConfiguration::ActivationType::IDENTITY:
                        break;
                    case ConvolutionConfiguration::ActivationType::RELU:
                        kernels::relu<T>(stream, output, output, relu_negative_slope);
                        break;
                    case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
                        kernels::clipped_relu<T>(stream, output, output, crelu_floor, crelu_ceil);
                        break;
                    case ConvolutionConfiguration::ActivationType::POWER:
                        kernels::power<T>(stream, output, output, power_exp, power_scale, power_shift);
                        break;
                    case ConvolutionConfiguration::ActivationType::TANH:
                        kernels::tanh<T>(stream, output, output);
                        break;
                    case ConvolutionConfiguration::ActivationType::SIGMOID:
                        kernels::sigmoid<T>(stream, output, output);
                        break;
                    case ConvolutionConfiguration::ActivationType::SWISH:
                        kernels::swish<T>(stream, output, output);
                        break;
                    case ConvolutionConfiguration::ActivationType::MISH:
                        kernels::mish<T>(stream, output, output);
                        break;
                    }
                }
            }
        }

        std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }

    private:
        csl::Stream stream;
        csl::cudnn::Handle cudnnHandle;
        csl::Tensor<T> filtersTensor, biasTensor;
        csl::Convolution<T> convoluter;

        std::vector<std::size_t> transformed_shape;
        csl::TensorTransform<T> inputTransformer;

        std::size_t scratch_mem_in_bytes;

        ConvolutionConfiguration::FusionMode fusion_mode;
        ConvolutionConfiguration::ActivationType activation;
        float relu_negative_slope, crelu_floor, crelu_ceil;
        float power_exp, power_scale, power_shift;

        enum class InternalFusionLocation {
            CUDNN,
            NATIVE
        } fusion_location;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP */
51
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/crop_and_resize.hpp
vendored
Normal file
@ -0,0 +1,51 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CROP_AND_RESIZE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CROP_AND_RESIZE_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include "../kernels/crop_and_resize.hpp"

#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    template <class T>
    class CropAndResizeOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        CropAndResizeOp(csl::Stream stream_) : stream(std::move(stream_)) { }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(inputs.size() == 2 && outputs.size() == 1);

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            auto box_wrapper = inputs[1].dynamicCast<wrapper_type>();
            auto boxes = box_wrapper->getView();

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            kernels::crop_and_resize(stream, output, input, static_cast<csl::View<T>>(boxes));
        }

    private:
        csl::Stream stream;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CROP_AND_RESIZE_HPP */
282
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/detection_output.hpp
vendored
Normal file
@ -0,0 +1,282 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_DETECTION_OUTPUT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_DETECTION_OUTPUT_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include "../kernels/fill_copy.hpp"
#include "../kernels/permute.hpp"
#include "../kernels/detection_output.hpp"
#include "../kernels/grid_nms.hpp"

#include <cstddef>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    struct DetectionOutputConfiguration {
        std::size_t batch_size;

        enum class CodeType {
            CORNER,
            CENTER_SIZE
        };
        CodeType code_type;

        bool share_location;
        std::size_t num_priors;
        std::size_t num_classes;
        std::size_t background_class_id;

        bool transpose_location;
        bool variance_encoded_in_target;
        bool normalized_bbox;
        bool clip_box;

        std::size_t classwise_topK;
        float confidence_threshold;
        float nms_threshold;

        int keepTopK;
    };

    template <class T>
    class DetectionOutputOp final : public CUDABackendNode {
    private:
        /* We have a block level NMS kernel where each block handles one class of one batch item.
         * If the number of classes and batch size together is very low, the blockwise NMS kernel
         * won't be able to fully saturate the GPU with work.
         *
         * We also have a grid level NMS kernel where multiple blocks handle each class of every batch item.
         * This performs better in the worst case and utilizes resources better when the block level kernel isn't
         * able to saturate the GPU with enough work. However, this is not efficient in the average case where
         * the block level kernel is able to saturate the GPU. It does better when the blockwise NMS barely
         * saturates the GPU.
         *
         * `GRID_NMS_CUTOFF` is the cutoff for `num_classes * batch_size` above which we will switch from grid
         * level NMS to block level NMS.
         */
        static constexpr int GRID_NMS_CUTOFF = 32;
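
        /* e.g. (illustrative values) batch_size = 1 with num_classes = 21 gives 21 <= 32, so grid NMS
         * is used; batch_size = 8 with num_classes = 81 gives 648 > 32, so blockwise NMS is used
         */
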
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        DetectionOutputOp(csl::Stream stream_, const DetectionOutputConfiguration& config)
            : stream(std::move(stream_))
        {
            corner_true_or_center_false = (config.code_type == DetectionOutputConfiguration::CodeType::CORNER);

            share_location = config.share_location;
            num_priors = config.num_priors;
            num_classes = config.num_classes;
            background_class_id = config.background_class_id;

            transpose_location = config.transpose_location;
            variance_encoded_in_target = config.variance_encoded_in_target;
            normalized_bbox = config.normalized_bbox;
            clip_box = config.clip_box;

            classwise_topK = config.classwise_topK;
            confidence_threshold = config.confidence_threshold;
            nms_threshold = config.nms_threshold;

            keepTopK = config.keepTopK;
            CV_Assert(keepTopK > 0);

            if (classwise_topK == -1)
            {
                classwise_topK = num_priors;
                if (keepTopK > 0 && keepTopK < num_priors)
                    classwise_topK = keepTopK;
            }

            auto batch_size = config.batch_size;
            auto num_loc_classes = (share_location ? 1 : num_classes);

            csl::WorkspaceBuilder builder;
            builder.require<T>(batch_size * num_priors * num_loc_classes * 4); /* decoded boxes */
            builder.require<T>(batch_size * num_classes * num_priors); /* transposed scores */
            builder.require<int>(batch_size * num_classes * classwise_topK); /* indices */
            builder.require<int>(batch_size * num_classes); /* classwise topK count */
            builder.require<T>(batch_size * num_classes * classwise_topK * 4); /* topK decoded boxes */

            if (batch_size * num_classes <= GRID_NMS_CUTOFF)
            {
                auto workspace_per_batch_item = kernels::getGridNMSWorkspaceSizePerBatchItem(num_classes, classwise_topK);
                builder.require(batch_size * workspace_per_batch_item);
            }

            builder.require<int>(batch_size * keepTopK); /* final kept indices */
            builder.require<int>(batch_size); /* kept indices count */
            builder.require<int>(1); /* total number of detections */

            scratch_mem_in_bytes = builder.required_workspace_size();
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            /* locations, scores and priors make the first three inputs in order */
            /* the 4th input is used to obtain the shape for clipping */
            CV_Assert((inputs.size() == 3 || inputs.size() == 4) && outputs.size() == 1);

            // locations: [batch_size, num_priors, num_loc_classes, 4]
            auto locations_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto locations = locations_wrapper->getView();

            // scores: [batch_size, num_priors, num_classes]
            auto scores_wrapper = inputs[1].dynamicCast<wrapper_type>();
            auto scores = scores_wrapper->getView();
            scores.unsqueeze();
            scores.reshape(-1, num_priors, num_classes);

            // priors: [1, 2, num_priors, 4]
            auto priors_wrapper = inputs[2].dynamicCast<wrapper_type>();
            auto priors = priors_wrapper->getView();

            // output: [1, 1, batch_size * keepTopK, 7]
            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            auto batch_size = locations.get_axis_size(0);
            auto num_loc_classes = (share_location ? 1 : num_classes);
            while (locations.rank() < 4)
                locations.unsqueeze();
            locations.reshape(batch_size, num_priors, num_loc_classes, 4);

            float clip_width = 0.0, clip_height = 0.0;
            if (clip_box)
            {
                if (normalized_bbox)
                {
                    clip_width = clip_height = 1.0f;
                }
                else
                {
                    auto image_wrapper = inputs[3].dynamicCast<wrapper_type>();
                    auto image_shape = image_wrapper->getShape();

                    CV_Assert(image_shape.size() == 4);
                    clip_width = image_shape[3] - 1;
                    clip_height = image_shape[2] - 1;
                }
            }

            csl::WorkspaceAllocator allocator(workspace);

            // decoded_boxes: [batch_size, num_priors, num_loc_classes, 4]
            csl::TensorSpan<T> decoded_boxes;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_priors, num_loc_classes, 4};
                decoded_boxes = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
                CV_Assert(is_shape_same(decoded_boxes, locations));
            }

            kernels::decode_bboxes<T>(stream, decoded_boxes, locations, priors,
                num_loc_classes, share_location, background_class_id,
                transpose_location, variance_encoded_in_target,
                corner_true_or_center_false, normalized_bbox,
                clip_box, clip_width, clip_height);

            // scores_permuted: [batch_size, num_classes, num_priors]
            csl::TensorSpan<T> scores_permuted;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes, num_priors};
                scores_permuted = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
            }

            kernels::permute<T>(stream, scores_permuted, scores, {0, 2, 1});

            // indices: [batch_size, num_classes, classwise_topK]
            csl::TensorSpan<int> indices;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes, classwise_topK};
                indices = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

            // count: [batch_size, num_classes]
            csl::TensorSpan<int> count;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes};
                count = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

            kernels::findTopK<T>(stream, indices, count, scores_permuted, background_class_id, confidence_threshold);

            // collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
            csl::TensorSpan<T> collected_bboxes;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes, classwise_topK, 4};
                collected_bboxes = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
            }

            kernels::box_collect<T>(stream, collected_bboxes, decoded_boxes, indices, count, share_location, background_class_id);

            if (batch_size * num_classes <= GRID_NMS_CUTOFF)
            {
                auto workspace_per_batch_item = kernels::getGridNMSWorkspaceSizePerBatchItem(num_classes, classwise_topK);
                auto workspace = allocator.get_span<unsigned int>(batch_size * workspace_per_batch_item / sizeof(unsigned int));
                kernels::grid_nms<T>(stream, workspace, indices, count, collected_bboxes, background_class_id, normalized_bbox, nms_threshold);
            }
            else
            {
                kernels::blockwise_class_nms<T>(stream, indices, count, collected_bboxes, normalized_bbox, background_class_id, nms_threshold);
            }

            // kept_indices: [batch_size, keepTopK]
            csl::TensorSpan<int> kept_indices;
            {
                auto shape = std::vector<std::size_t>{batch_size, static_cast<std::size_t>(keepTopK)};
                kept_indices = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

            // kept_count: [batch_size]
            csl::TensorSpan<int> kept_count;
            {
                auto shape = std::vector<std::size_t>{batch_size};
                kept_count = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

            kernels::nms_collect<T>(stream, kept_indices, kept_count, indices, count, scores_permuted, confidence_threshold, background_class_id);

            auto num_detections = allocator.get_span<int>(1);
            kernels::fill<int>(stream, num_detections, 0);
            kernels::fill<T>(stream, output, 0.0);
            kernels::consolidate_detections<T>(stream, output, kept_indices, kept_count, decoded_boxes, scores_permuted, share_location, num_detections.data());
        }

        std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }

    private:
        csl::Stream stream;
        std::size_t scratch_mem_in_bytes;

        bool share_location;
        std::size_t num_priors;
        std::size_t num_classes;
        std::size_t background_class_id;

        bool transpose_location;
        bool variance_encoded_in_target;
        bool corner_true_or_center_false;
        bool normalized_bbox;
        bool clip_box;

        std::size_t classwise_topK;
        float confidence_threshold;
        float nms_threshold;

        int keepTopK;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_DETECTION_OUTPUT_HPP */
130
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp
vendored
Normal file
@ -0,0 +1,130 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ELTWISE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ELTWISE_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"
#include "../csl/tensor_ops.hpp"

#include "../kernels/eltwise_ops.hpp"

#include <opencv2/core.hpp>

#include <cstddef>
#include <vector>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    enum class EltwiseOpType {
        MAX,
        SUM,
        PRODUCT,
        DIV,
        MIN,
    };

    class EltwiseOpBase : public CUDABackendNode {
    public:
        EltwiseOpBase(csl::Stream stream_, EltwiseOpType op_, std::vector<float> coeffs_)
            : stream(std::move(stream_)), op(op_), coeffs(std::move(coeffs_))
        {
        }

    protected:
        csl::Stream stream;

    public:
        EltwiseOpType op;
        std::vector<float> coeffs;
    };

    template <class T>
    class EltwiseOp final : public EltwiseOpBase {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        EltwiseOp(csl::Stream stream_, EltwiseOpType op_, std::vector<float> coeffs_)
            : EltwiseOpBase(std::move(stream_), op_, std::move(coeffs_))
        {
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(inputs.size() >= 2);
            CV_Assert(outputs.size() == 1);

            CV_Assert(coeffs.size() == 0 || op == EltwiseOpType::SUM);
            CV_Assert(coeffs.size() == 0 || inputs.size() == coeffs.size());

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            if (inputs.size() == 2)
            {
                auto input_wrapper_x = inputs[0].dynamicCast<wrapper_type>();
                auto input_x = input_wrapper_x->getView();

                auto input_wrapper_y = inputs[1].dynamicCast<wrapper_type>();
                auto input_y = input_wrapper_y->getView();

                switch (op)
                {
                case EltwiseOpType::MAX: kernels::eltwise_max_2<T>(stream, output, input_x, input_y); break;
                case EltwiseOpType::MIN: kernels::eltwise_min_2<T>(stream, output, input_x, input_y); break;
                case EltwiseOpType::PRODUCT: kernels::eltwise_prod_2<T>(stream, output, input_x, input_y); break;
                case EltwiseOpType::DIV: kernels::eltwise_div_2<T>(stream, output, input_x, input_y); break;
                case EltwiseOpType::SUM:
                    if (coeffs.empty() || (coeffs[0] == 1 && coeffs[1] == 1))
                        kernels::eltwise_sum_2<T>(stream, output, input_x, input_y);
                    else
                        kernels::eltwise_sum_coeff_2<T>(stream, output, coeffs[0], input_x, coeffs[1], input_y);
                    break;
                }
            }
            else
            {
                auto input_wrapper_0 = inputs[0].dynamicCast<wrapper_type>();
                auto input_0 = input_wrapper_0->getView();

                /* we first make a copy and then apply EltwiseOp cumulatively */
                csl::tensor_ops::copy(stream, output, input_0);

                for (int i = 1; i < inputs.size(); i++)
                {
                    auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                    auto input = input_wrapper->getView();

                    switch (op)
                    {
                    case EltwiseOpType::MAX: kernels::eltwise_max_2<T>(stream, output, output, input); break;
                    case EltwiseOpType::MIN: kernels::eltwise_min_2<T>(stream, output, output, input); break;
                    case EltwiseOpType::PRODUCT: kernels::eltwise_prod_2<T>(stream, output, output, input); break;
                    case EltwiseOpType::DIV: kernels::eltwise_div_2<T>(stream, output, output, input); break;
                    case EltwiseOpType::SUM:
                        if (coeffs.empty() || coeffs[i] == 1)
                            kernels::eltwise_sum_2<T>(stream, output, output, input);
                        else
                        {
                            /* if this is the first op, we must scale output too */
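                            /* e.g. c0*x0 + c1*x1 + c2*x2 accumulates as (c0*x0 + c1*x1) + c2*x2,
                             * so c0 is applied exactly once, in the i == 1 step
                             */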
                            T coeff_x = (i == 1) ? coeffs[0] : 1.0;
                            kernels::eltwise_sum_coeff_2<T>(stream, output, coeff_x, output, coeffs[i], input);
                        }
                        break;
                    }
                }
            }
        }
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ELTWISE_HPP */
92
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/inner_product.hpp
vendored
Normal file
@ -0,0 +1,92 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_INNER_PRODUCT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_INNER_PRODUCT_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/cublas.hpp"
#include "../csl/tensor.hpp"
#include "../csl/tensor_ops.hpp"

#include "../kernels/scale_shift.hpp"

#include <opencv2/core.hpp>

#include <cstddef>
#include <vector>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    template <class T>
    class InnerProductOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        InnerProductOp(csl::Stream stream_, csl::cublas::Handle handle, std::size_t axis, const Mat& weights, const Mat& bias)
            : stream(std::move(stream_)), cublasHandle(std::move(handle)), axis{ axis }
        {
            weightsTensor = csl::makeTensorHeader<T>(weights);
            CV_Assert(get_effective_rank(weightsTensor) <= 2);
            csl::copyMatToTensor<T>(weights, weightsTensor, stream);

            if (!bias.empty())
            {
                biasTensor = csl::makeTensorHeader<T>(bias);
                csl::copyMatToTensor<T>(bias, biasTensor, stream);
                CV_Assert(weightsTensor.get_axis_size(-2) == biasTensor.size());
            }
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            for (int i = 0; i < inputs.size(); i++)
            {
                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                auto input = input_wrapper->getView();

                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output = output_wrapper->getSpan();

                std::size_t batch_size = input.size_range(0, axis);

                auto input_size = input.size() / batch_size;
                CV_Assert(input_size == weightsTensor.get_axis_size(-1));

                auto output_size = output.size() / batch_size;
                CV_Assert(output_size == weightsTensor.get_axis_size(-2));

                /* we treat the input and output as matrices with dimensions (batch_size, input_size)
                 * and (batch_size, output_size) respectively
                 *
                 * weight matrix dimensions: (output_size, input_size)
                 *
                 * I * W^T = O
                 * (batch_size, input_size) * (input_size, output_size) = (batch_size, output_size)
                 */
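                /* e.g. with (illustrative sizes) batch_size = 32, input_size = 512, output_size = 10:
                 * (32, 512) * (512, 10) = (32, 10)
                 */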
                input.reshape(batch_size, input_size);
                output.reshape(batch_size, output_size);
                csl::tensor_ops::gemm<T>(cublasHandle, 0.0, output, 1.0, false, input, true, weightsTensor);

                if (!biasTensor.empty())
                    kernels::biasN<T>(stream, output, output, 1, biasTensor);
            }
        }

    private:
        csl::Stream stream;
        csl::cublas::Handle cublasHandle;
        csl::Tensor<T> weightsTensor, biasTensor;
        std::size_t axis;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_INNER_PRODUCT_HPP */
75
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/lrn.hpp
vendored
Normal file
@ -0,0 +1,75 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_LRN_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_LRN_HPP

#include "../../op_cuda.hpp"

#include "../csl/cudnn.hpp"
#include "../csl/tensor_ops.hpp"

#include <cstddef>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    enum class LRNType {
        ACROSS_CHANNELS,
        WITHIN_CHANNEL
    };

    template <class T>
    class LRNOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        LRNOp(csl::cudnn::Handle handle, LRNType type_, std::size_t local_size, T alpha, T beta, T bias, std::size_t largestInputSize)
            : scratch_mem_in_bytes { 0 }
        {
            typename csl::LRN<T>::LRNType type{};
            switch (type_) {
            case LRNType::ACROSS_CHANNELS: type = csl::LRN<T>::LRNType::ACROSS_CHANNELS; break;
            case LRNType::WITHIN_CHANNEL: type = csl::LRN<T>::LRNType::WITHIN_CHANNEL; break;
            }
            lrn = csl::LRN<T>(std::move(handle), local_size, alpha, beta, bias, type);

            csl::WorkspaceBuilder builder;
            if (type_ == LRNType::WITHIN_CHANNEL) {
                /* this is not a bug; we require two of these */
                builder.require<T>(largestInputSize);
                builder.require<T>(largestInputSize);
            }

            scratch_mem_in_bytes = builder.required_workspace_size();
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            for (int i = 0; i < inputs.size(); i++)
            {
                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                auto input = input_wrapper->getView();

                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output = output_wrapper->getSpan();

                csl::WorkspaceAllocator allocator(workspace);
                lrn.normalize(input, output, allocator.get_instance());
            }
        }

        std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }

    private:
        csl::LRN<T> lrn;
        std::size_t scratch_mem_in_bytes;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_LRN_HPP */
95
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/matmul.hpp
vendored
Normal file
@ -0,0 +1,95 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MATMUL_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MATMUL_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/cublas.hpp"
#include "../csl/tensor.hpp"
#include "../csl/tensor_ops.hpp"

#include <opencv2/core.hpp>

#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    template <class T>
    class MatMulOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        MatMulOp(csl::Stream stream_, csl::cublas::Handle handle)
            : stream(std::move(stream_)), cublasHandle(std::move(handle))
        {
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(inputs.size() == 2 && outputs.size() == 1);

            auto input1_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input1 = input1_wrapper->getView();

            auto input2_wrapper = inputs[1].dynamicCast<wrapper_type>();
            auto input2 = input2_wrapper->getView();

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            auto rank = output.rank();
            CV_Assert(rank == input1.rank());
            CV_Assert(rank == input2.rank());
            CV_Assert(rank >= 2); // 1D MatMul not supported

            for (int i = 0; i < rank - 2; i++)
            {
                // broadcasting not supported
                auto size = output.get_axis_size(i);
                CV_Assert(input1.get_axis_size(i) == size);
                CV_Assert(input2.get_axis_size(i) == size);
            }

            auto m = input1.get_axis_size(-2);
            auto n = input1.get_axis_size(-1);
            auto k = input2.get_axis_size(-1);
            auto b = input1.size() / m / n;
            CV_Assert(input2.get_axis_size(-2) == n);
            CV_Assert(output.get_axis_size(-2) == m);
            CV_Assert(output.get_axis_size(-1) == k);
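
            /* e.g. (illustrative shapes) input1 [2, 3, 4, 5] and input2 [2, 3, 5, 6] give output [2, 3, 4, 6]:
             * m = 4, n = 5, k = 6, and b = (2 * 3 * 4 * 5) / (4 * 5) = 6 batched GEMMs
             */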
            if (get_effective_rank(output) <= 2)
            {
                CV_Assert(b == 1);
                CV_Assert(get_effective_rank(input1) <= 2);
                CV_Assert(get_effective_rank(input2) <= 2);
                csl::tensor_ops::gemm<T>(cublasHandle, 0.0, output, 1.0, false, input1, false, input2);
            }
            else
            {
                CV_Assert(rank >= 3);
                input1.reshape(b, m, n);
                input2.reshape(b, n, k);
                output.reshape(b, m, k);
                input1.squeeze_to(3);
                input2.squeeze_to(3);
                output.squeeze_to(3);
                csl::tensor_ops::gemmStridedBatched<T>(cublasHandle, 0.0, output, 1.0, false, input1, false, input2);
            }
        }

    private:
        csl::Stream stream;
        csl::cublas::Handle cublasHandle;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MATMUL_HPP */
181
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/max_unpooling.hpp
vendored
Normal file
@ -0,0 +1,181 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MAX_UNPOOLING_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MAX_UNPOOLING_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"

#include "../kernels/max_unpooling.hpp"

#include <opencv2/core.hpp>

#include <cstddef>
#include <vector>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    struct MaxPoolingConfiguration {
        /* the size of the following vectors must be equal to the pooling order */
        std::vector<std::size_t> window_size;
        std::vector<std::size_t> strides;

        enum class PaddingMode {
            MANUAL, /* uses explicit padding values provided in `pads_begin` and `pads_end` */
            VALID,  /* no padding is added */
            SAME    /* TensorFlow logic is used for same padding */
        };

        PaddingMode padMode;

        /* explicit paddings are used if and only if padMode is set to manual */
        std::vector<std::size_t> pads_begin;

        /* full shape inclusive of channel and batch axis */
        std::vector<std::size_t> input_shape;
    };

    template <class T>
    class MaxPoolingOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        MaxPoolingOp(csl::Stream stream_, const MaxPoolingConfiguration& config)
            : stream(std::move(stream_))
        {
            window_size = config.window_size;

            const auto pooling_order = window_size.size();

            strides = config.strides;
            CV_Assert(pooling_order == strides.size());

            if (pooling_order < 1 || pooling_order > 3)
                CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D max-pooling are supported.");

            padding_left.resize(pooling_order);
            if (config.padMode == MaxPoolingConfiguration::PaddingMode::MANUAL)
            {
                const auto& pads_begin = config.pads_begin;
                CV_Assert(pooling_order == pads_begin.size());

                padding_left.assign(std::begin(pads_begin), std::end(pads_begin));
            }
            else if (config.padMode == MaxPoolingConfiguration::PaddingMode::VALID)
            {
                /* nothing to do as the paddings are already preset to zero */
            }
            else if (config.padMode == MaxPoolingConfiguration::PaddingMode::SAME)
            {
                /* TensorFlow Logic:
                 * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i]
                 *
                 * if total padding is odd, the extra is added towards the end
                 */
                const auto& input_shape = config.input_shape;
                CV_Assert(input_shape.size() == pooling_order + 2);
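
                /* illustrative example (values assumed): i = 7, k = 3, s = 2
                 * gives output_dim = (7 - 1 + 2) / 2 = 4, total padding = (4 - 1) * 2 + 3 - 7 = 2,
                 * and padding_left = 1
                 */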
                for (int i = 0; i < pooling_order; i++)
                {
                    const auto output_dim = (input_shape[i + 2] - 1 + strides[i]) / strides[i];
                    const auto required_total_padding =
                        std::max<std::int64_t>(0, (output_dim - 1) * strides[i] + window_size[i] - input_shape[i + 2]);

                    padding_left[i] = required_total_padding / 2;
                }
            }
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(inputs.size() == 1 && outputs.size() == 2);

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input_data = input_wrapper->getView();

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output_data = output_wrapper->getSpan();

            auto indices_wrapper = outputs[1].dynamicCast<wrapper_type>();
            auto output_indices = indices_wrapper->getSpan();

            kernels::max_pooling_with_indices<T>(
                stream, output_data, output_indices, input_data, window_size, strides, padding_left
            );
        }

    private:
        csl::Stream stream;

        std::vector<std::size_t> window_size, strides, padding_left;
    };

    struct MaxUnpoolingConfiguration {
        /* the size of the following vectors must be equal to the unpooling order */
        std::vector<std::size_t> window_size;
        std::vector<std::size_t> strides;
        std::vector<std::size_t> pads_begin;
    };

    template <class T>
    class MaxUnpoolingOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        MaxUnpoolingOp(csl::Stream stream_, const MaxUnpoolingConfiguration& config)
            : stream(std::move(stream_))
        {
            window_size = config.window_size;

            const auto pooling_order = window_size.size();
            CV_Assert(pooling_order >= 1);

            strides = config.strides;
            padding_left = config.pads_begin;
            CV_Assert(strides.size() == pooling_order);
            CV_Assert(padding_left.size() == pooling_order);

            if (pooling_order != 2 && pooling_order != 3)
                CV_Error(Error::StsNotImplemented, "Only 2D/3D max-unpooling are supported.");
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            /* sometimes a third input is passed to provide the output shape; we won't need it */
            CV_Assert(inputs.size() == 2 || inputs.size() == 3);
            CV_Assert(outputs.size() >= 1);

            for (int i = 0; i < outputs.size(); i++)
            {
                auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
                auto input_data = input_wrapper->getView();

                auto indices_wrapper = inputs[1].dynamicCast<wrapper_type>();
                auto input_indices = indices_wrapper->getView();

                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output_data = output_wrapper->getSpan();

                kernels::max_unpooling<T>(stream, output_data, input_data, input_indices, window_size, strides, padding_left);
            }
        }

    private:
        csl::Stream stream;

        std::vector<std::size_t> window_size, strides, padding_left;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MAX_UNPOOLING_HPP */
134
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/mvn.hpp
vendored
Normal file
@ -0,0 +1,134 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MVN_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MVN_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/span.hpp"
#include "../csl/tensor.hpp"
#include "../csl/workspace.hpp"

#include "../kernels/fill_copy.hpp"
#include "../kernels/mvn.hpp"

#include <opencv2/core.hpp>

#include <cstddef>
#include <vector>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    struct MVNConfiguration {
        std::vector<std::vector<std::size_t>> input_shapes;

        /*
         * [0, split_axis) = outer range
         * [split_axis, -1] = inner range
         *
         * for each location in the outer range, all the values in the inner range are normalized as a group
         */
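        /* e.g. (illustrative) for an NCHW input with split_axis = 2, the outer range covers the
         * (batch, channel) pairs and each H x W plane is normalized as one group
         */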
        std::size_t split_axis;

        /* The group (described above) is always centered. The following parameter controls whether the variance
         * is also normalized.
         */
        bool normalize_variance;
        float epsilon;
    };

template <class T>
|
||||
class MVNOp final : public CUDABackendNode {
|
||||
public:
|
||||
using wrapper_type = GetCUDABackendWrapperType<T>;
|
||||
|
||||
MVNOp(csl::Stream stream_, const MVNConfiguration& config)
|
||||
: stream(std::move(stream_))
|
||||
{
|
||||
split_axis = config.split_axis;
|
||||
normalize_variance = config.normalize_variance;
|
||||
epsilon = config.epsilon;
|
||||
|
||||
std::size_t max_outer_size = 0;
|
||||
const auto& input_shapes = config.input_shapes;
|
||||
for (int i = 0; i < input_shapes.size(); i++)
|
||||
{
|
||||
std::size_t outer_size = 1;
|
||||
for (int j = 0; j < split_axis; j++)
|
||||
outer_size *= input_shapes[i][j];
|
||||
max_outer_size = std::max(max_outer_size, outer_size);
|
||||
}
|
||||
|
||||
csl::WorkspaceBuilder builder;
|
||||
builder.require<float>(max_outer_size);
|
||||
if (normalize_variance)
|
||||
builder.require<float>(max_outer_size);
|
||||
scratch_mem_in_bytes = builder.required_workspace_size();
|
||||
}
|
||||
|
||||
void forward(
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs,
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
|
||||
csl::Workspace& workspace) override
|
||||
{
|
||||
CV_Assert(inputs.size() == outputs.size());
|
||||
|
||||
for (int i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
|
||||
auto input = input_wrapper->getView();
|
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
|
||||
auto output = output_wrapper->getSpan();
|
||||
|
||||
auto outer_size = input.size_range(0, split_axis);
|
||||
auto inner_size = input.size_range(split_axis, input.rank());
|
||||
if (inner_size == 1)
|
||||
{
|
||||
kernels::fill<T>(stream, output, 0.0f);
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto ws_allocator = csl::WorkspaceAllocator(workspace);
|
||||
|
||||
auto means = ws_allocator.get_span<float>(outer_size);
|
||||
kernels::fill<float>(stream, means, 0);
|
||||
|
||||
if (normalize_variance)
|
||||
{
|
||||
auto scales = ws_allocator.get_span<float>(outer_size);
|
||||
kernels::fill<float>(stream, scales, 0);
|
||||
|
||||
kernels::reduce_mean_sqr_sum<T>(stream, means, scales, input, inner_size);
|
||||
kernels::compute_normalization_scale(stream, scales, means, scales, inner_size, epsilon);
|
||||
kernels::normalize_mean_variance<T>(stream, output, input, means, scales, inner_size);
|
||||
}
|
||||
else
|
||||
{
|
||||
kernels::reduce_mean<T>(stream, means, input, inner_size);
|
||||
kernels::normalize_mean<T>(stream, output, input, means, inner_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }
|
||||
|
||||
private:
|
||||
csl::Stream stream;
|
||||
|
||||
bool normalize_variance;
|
||||
float epsilon;
|
||||
std::size_t split_axis;
|
||||
|
||||
std::size_t scratch_mem_in_bytes;
|
||||
};
|
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MVN_HPP */
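Per outer-range location, the kernel sequence above amounts to ordinary mean-variance normalization. A CPU sketch of the same math (a hypothetical reference; the variance is derived as E[x^2] - mean^2, as the reduce_mean_sqr_sum/compute_normalization_scale pairing suggests):

#include <cmath>
#include <cstddef>
#include <vector>

// CPU reference for the per-group normalization: subtract the mean of the
// inner range and, optionally, divide by the standard deviation (epsilon is
// added to the variance for numerical stability).
void mvn_ref(std::vector<float>& data, std::size_t outer_size, std::size_t inner_size,
             bool normalize_variance, float epsilon)
{
    for (std::size_t o = 0; o < outer_size; o++)
    {
        float* group = data.data() + o * inner_size;

        float mean = 0.f;
        for (std::size_t i = 0; i < inner_size; i++)
            mean += group[i];
        mean /= inner_size;

        float scale = 1.f;
        if (normalize_variance)
        {
            float sqr_sum = 0.f;
            for (std::size_t i = 0; i < inner_size; i++)
                sqr_sum += group[i] * group[i];
            const float variance = sqr_sum / inner_size - mean * mean;
            scale = 1.f / std::sqrt(variance + epsilon);
        }

        for (std::size_t i = 0; i < inner_size; i++)
            group[i] = (group[i] - mean) * scale;
    }
}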
142
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/normalize_bbox.hpp
vendored
Normal file
@ -0,0 +1,142 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_NORMALIZE_BBOX_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_NORMALIZE_BBOX_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/span.hpp"
#include "../csl/tensor.hpp"
#include "../csl/workspace.hpp"

#include "../kernels/scale_shift.hpp"
#include "../kernels/normalize.hpp"

#include <opencv2/core.hpp>

#include <cstddef>
#include <vector>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    template <class T>
    struct NormalizeConfiguration {
        std::vector<std::size_t> input_shape;

        /* axis range across which values are normalized
         *
         * [0, axis_start) = outer range
         * [axis_start, axis_end) = mid range
         * [axis_end + 1, -1) = inner range
         *
         * for each location in the outer and inner range, all the values in the mid range are
         * normalized together
         */
        std::size_t axis_start, axis_end;

        /* 1 for L1 norm, 2 for L2 norm */
        std::size_t norm;

        /* epsilon to use to avoid division by zero */
        T eps;
    };

    template <class T>
    class NormalizeOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        template <class V>
        NormalizeOp(csl::Stream stream_, const Mat& weights, const NormalizeConfiguration<V>& config)
            : stream(std::move(stream_)), weight{ 1.0 }
        {
            norm_order = config.norm;
            epsilon = config.eps;
            axis_start = config.axis_start;
            axis_end = config.axis_end;

            if (!weights.empty())
            {
                if (weights.total() == 1)
                {
                    CV_Assert(weights.type() == CV_32F);
                    weight = weights.at<float>(0, 0);
                }
                else
                {
                    weightsTensor = csl::makeTensorHeader<T>(weights);
                    csl::copyMatToTensor<T>(weights, weightsTensor, stream);
                }
            }

            std::size_t outer_size = 1;
            for (int i = 0; i < axis_start; i++)
                outer_size *= config.input_shape[i];

            std::size_t inner_size = 1;
            for (int i = axis_end; i < config.input_shape.size(); i++)
                inner_size *= config.input_shape[i];

            csl::WorkspaceBuilder builder;
            builder.require<T>(outer_size * inner_size);
            scratch_mem_in_bytes = builder.required_workspace_size();
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(inputs.size() == 1 && outputs.size() == 1);

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            std::size_t outer_size = input.size_range(0, axis_start);
            std::size_t mid_size = input.size_range(axis_start, axis_end);
            std::size_t inner_size = input.size_range(axis_end, input.rank());

            auto ws_allocator = csl::WorkspaceAllocator(workspace);
            auto scratch = ws_allocator.get_span<T>();
            kernels::normalize<T>(stream, output, input, outer_size, mid_size, inner_size, norm_order, epsilon, scratch);

            /* there might be a single weight, in which case `weight` will not be equal to 1.0,
             * or there might be several weights,
             * or we don't have to scale at all
             */
            if (weight != 1.0)
            {
                kernels::scale1_with_bias1<T>(stream, output, input, weight, 1.0);
            }
            else if (!weightsTensor.empty())
            {
                CV_Assert(weightsTensor.size() != 1); /* constructor should have set up to use `weight` */
                CV_Assert(weightsTensor.size() == mid_size);
                kernels::scaleN<T>(stream, output, input, inner_size, weightsTensor);
            }
        }

        std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }

    private:
        csl::Stream stream;
        csl::Tensor<T> weightsTensor;
        T weight; /* if there is only one weight, we use this */

        T epsilon;
        std::size_t norm_order;
        std::size_t axis_start, axis_end;

        std::size_t scratch_mem_in_bytes;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_NORMALIZE_BBOX_HPP */
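A CPU sketch of the indexing implied by the normalize kernel's arguments: for every (outer, inner) pair, the mid_size values strided by inner_size form one group that is scaled by the reciprocal of its Lp norm. Exactly where epsilon enters is an assumption here (added to the norm before the division):

#include <cmath>
#include <cstddef>
#include <vector>

// CPU sketch of Lp normalization across the mid range (p = 1 or 2).
void normalize_ref(std::vector<float>& data, std::size_t outer_size,
                   std::size_t mid_size, std::size_t inner_size,
                   std::size_t norm_order, float epsilon)
{
    for (std::size_t o = 0; o < outer_size; o++)
    for (std::size_t i = 0; i < inner_size; i++)
    {
        float sum = 0.f;
        for (std::size_t m = 0; m < mid_size; m++)
        {
            const float v = data[(o * mid_size + m) * inner_size + i];
            sum += (norm_order == 1) ? std::abs(v) : v * v;
        }
        const float norm = (norm_order == 1) ? sum : std::sqrt(sum);
        const float scale = 1.f / (norm + epsilon); // assumed epsilon placement
        for (std::size_t m = 0; m < mid_size; m++)
            data[(o * mid_size + m) * inner_size + i] *= scale;
    }
}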
118
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/padding.hpp
vendored
Normal file
@ -0,0 +1,118 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PADDING_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PADDING_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include "../kernels/fill_copy.hpp"
#include "../kernels/concat.hpp"
#include "../kernels/padding.hpp"

#include <opencv2/core.hpp>

#include <cstddef>
#include <vector>
#include <algorithm>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    enum class PaddingType {
        CONSTANT,
        REFLECTION101
    };

    template <class T>
    class PaddingOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        /* `ranges` is indexed by axis and contains the range in the output where the input is copied to */
        PaddingOp(csl::Stream stream_, PaddingType type_, T value_, std::vector<cv::Range> ranges)
            : stream(std::move(stream_)), type{ type_ }, value{ value_ }, dstRanges(std::move(ranges))
        {
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(inputs.size() == 1 && outputs.size() == 1);

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            /* suppose we require padding for the first spatial axis (H in NCHW or D in NCDHW)
             *
             * there could be a case where the batch axis, channel axis, and the first spatial axis are all one
             * this would result in effective rank being less than the number of axes requiring padding
             */
            /* the effective rank of the input may be smaller than the effective rank of the output but the converse is never true
             * input: [1, 1, 1, 3]; effective rank = 1
             * output: [1, 1, 3, 3]; effective rank = 2
             *
             * hence, we use the effective rank of the output tensor for the padding operation
             */
            auto effective_rank = get_effective_rank(output);
            CV_Assert(get_effective_rank(input) <= effective_rank);
            effective_rank = std::max(effective_rank, dstRanges.size());

            for (int i = effective_rank - dstRanges.size(); i < effective_rank; i++)
            {
                if (dstRanges[i] == Range::all())
                    CV_Assert(input.get_axis_size(i) == output.get_axis_size(i));
                else
                    CV_Assert(input.get_axis_size(i) == dstRanges[i].size());
            }

            if (type == PaddingType::CONSTANT)
            {
                kernels::fill<T>(stream, output, value);

                std::vector<std::size_t> offsets(effective_rank, 0);
                for (int i = 0; i < dstRanges.size(); i++)
                {
                    const auto delta = effective_rank - dstRanges.size();
                    if (dstRanges[i] != Range::all())
                        offsets[delta + i] = dstRanges[i].start;
                }

                kernels::concat_with_offsets<T>(stream, output, input, offsets);
            }
            else if (type == PaddingType::REFLECTION101)
            {
                std::vector<std::pair<std::size_t, std::size_t>> ranges(effective_rank);
                for (int i = 0; i < effective_rank; i++)
                {
                    const auto delta = effective_rank - dstRanges.size();
                    if (i < delta || dstRanges[i - delta] == Range::all())
                        ranges[i] = { 0, input.get_axis_size(i) };
                    else
                        ranges[i] = { dstRanges[i].start, dstRanges[i].end };
                }

                kernels::copy_with_reflection101<T>(stream, output, input, ranges);
            }
        }

    private:
        csl::Stream stream;
        PaddingType type;
        T value;

        std::vector<cv::Range> dstRanges;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PADDING_HPP */
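Under REFLECTION101 the border samples themselves are not repeated: an out-of-range coordinate is reflected about the edge sample. A one-axis sketch of that index mapping (a hypothetical helper; valid while the padding is smaller than the input, so a single reflection suffices):

// Maps an out-of-range coordinate to the input coordinate it mirrors under
// reflection-101 padding. For input indices 0 1 2 3, coordinate -2 reflects
// to 2 and coordinate 5 reflects to 1 (the border sample is not duplicated).
int reflect101(int coord, int size)
{
    if (coord < 0)
        coord = -coord;                 // reflect about index 0
    if (coord >= size)
        coord = 2 * (size - 1) - coord; // reflect about the last index
    return coord;
}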
70
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/permute.hpp
vendored
Normal file
@ -0,0 +1,70 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PERMUTE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PERMUTE_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/tensor_ops.hpp"

#include "../kernels/permute.hpp"

#include <opencv2/core.hpp>

#include <cstddef>
#include <vector>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    template <class T>
    class PermuteOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        PermuteOp(csl::Stream stream_, std::vector<std::size_t> order_)
            : stream(std::move(stream_)), order(std::move(order_)) { }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            for (int i = 0; i < inputs.size(); i++)
            {
                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                auto input = input_wrapper->getView();

                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output = output_wrapper->getSpan();

                auto needsPermute = [&] {
                    for (int i = 0; i < order.size(); i++)
                        if (order[i] != i)
                            return true;
                    return false;
                }();

                if (needsPermute)
                {
                    kernels::permute(stream, output, input, order);
                }
                else
                {
                    if (input.get() != output.get())
                        csl::tensor_ops::copy(stream, output, input);
                }
            }
        }

    private:
        csl::Stream stream;
        std::vector<std::size_t> order;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PERMUTE_HPP */
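The needsPermute check above skips the kernel only when every axis maps to itself. For reference, a naive CPU permute over flat row-major buffers (a sketch; output axis i takes input axis order[i]):

#include <cstddef>
#include <vector>

// CPU sketch of an N-d permute; assumes rank >= 1 and row-major layout.
std::vector<float> permute_ref(const std::vector<float>& in,
                               const std::vector<std::size_t>& in_shape,
                               const std::vector<std::size_t>& order)
{
    const std::size_t rank = in_shape.size();

    /* row-major strides of the input */
    std::vector<std::size_t> in_stride(rank, 1);
    for (std::size_t i = rank - 1; i > 0; i--)
        in_stride[i - 1] = in_stride[i] * in_shape[i];

    std::vector<std::size_t> out_shape(rank);
    for (std::size_t i = 0; i < rank; i++)
        out_shape[i] = in_shape[order[i]];

    std::vector<float> out(in.size());
    std::vector<std::size_t> idx(rank, 0); /* current output index */
    for (std::size_t o = 0; o < out.size(); o++)
    {
        std::size_t in_offset = 0;
        for (std::size_t i = 0; i < rank; i++)
            in_offset += idx[i] * in_stride[order[i]];
        out[o] = in[in_offset];

        /* increment the row-major output index */
        for (std::size_t i = rank; i-- > 0;)
        {
            if (++idx[i] < out_shape[i]) break;
            idx[i] = 0;
        }
    }
    return out;
}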
258
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/pooling.hpp
vendored
Normal file
@ -0,0 +1,258 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_POOLING_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_POOLING_HPP

#include "../../op_cuda.hpp"

#include "../csl/cudnn.hpp"
#include "../csl/tensor.hpp"
#include "../csl/tensor_ops.hpp"

#include <opencv2/core.hpp>

#include <cstddef>
#include <cstdint>
#include <vector>
#include <utility>
#include <algorithm>

namespace cv { namespace dnn { namespace cuda4dnn {

    struct PoolingConfiguration {
        enum class PoolingMode {
            MAX,
            AVERAGE_INCLUDE_PADDING, /* include padding while calculating average */
            AVERAGE_EXCLUDE_PADDING /* exclude padding while calculating average */
        };

        PoolingMode poolMode;

        /* the size of the following vectors must be equal to the pooling order */
        std::vector<std::size_t> window_size;
        std::vector<std::size_t> strides;

        enum class PaddingMode {
            MANUAL, /* uses explicit padding values provided in `pads_begin` and `pads_end` */
            VALID, /* no padding is added */
            SAME /* TensorFlow logic is used for same padding */
        };

        PaddingMode padMode;

        /* explicit paddings are used if and only if padMode is set to manual */
        std::vector<std::size_t> pads_begin, pads_end;

        /* the output shape is calculated using the following formula:
         * output_dim = func[(input_dim + padding_left + padding_right - kernel_dim)/stride] + 1
         *
         * rounding mode decides what is used as `func`
         */
        enum class RoundingMode {
            CEIL, /* uses ceil */
            FLOOR
        };

        RoundingMode roundMode;

        /* full shape inclusive of channel and batch axis */
        std::vector<std::size_t> input_shape;
    };

    template <class T>
    class PoolingOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        PoolingOp(csl::cudnn::Handle handle, const PoolingConfiguration& config)
            : cudnnHandle(std::move(handle))
        {
            const auto& window_size = config.window_size;

            const auto pooling_order = window_size.size();
            CV_Assert(pooling_order >= 1);

            const auto& strides = config.strides;
            CV_Assert(pooling_order == strides.size());

            const auto& input_shape = config.input_shape;
            CV_Assert(input_shape.size() == pooling_order + 2);

            if (pooling_order > 3)
                CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D pooling are supported.");

            const auto rank = input_shape.size();

            /* left and right are misleading as the padding is applicable for any number of dimensions
             * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end`
             *
             * `common_padding` contains the amount of padding that has to be added to both sides
             * `padding_left` and `padding_right` contain the amount of padding that needs to be added
             * to a particular side in addition to the common padding
             */
            std::vector<std::size_t> common_padding(rank, 0);
            std::vector<std::size_t> padding_left(rank, 0), padding_right(rank, 0);
            if (config.padMode == PoolingConfiguration::PaddingMode::MANUAL)
            {
                const auto& pads_begin = config.pads_begin;
                const auto& pads_end = config.pads_end;

                CV_Assert(pooling_order == pads_begin.size());
                CV_Assert(pooling_order == pads_end.size());

                /* cuDNN rounds down by default; hence, if ceilMode is false, we do nothing
                 * otherwise, we add extra padding towards the end so that the convolution arithmetic yields
                 * the correct output size without having to deal with fancy fractional sizes
                 */
                auto pads_end_modified = pads_end;
                if (config.roundMode == PoolingConfiguration::RoundingMode::CEIL)
                {
                    for (int i = 0; i < window_size.size(); i++) {
                        auto rem = (input_shape[i + 2] + pads_begin[i] + pads_end[i] - window_size[i]) % strides[i];
                        if (rem)
                            pads_end_modified[i] += strides[i] - rem;
                    }
                }

                for (int i = 2; i < common_padding.size(); i++)
                {
                    common_padding[i] = std::min(pads_begin[i - 2], pads_end_modified[i - 2]);
                    padding_left[i] = pads_begin[i - 2] - common_padding[i];
                    padding_right[i] = pads_end_modified[i - 2] - common_padding[i];
                }
            }
            else if (config.padMode == PoolingConfiguration::PaddingMode::VALID)
            {
                /* nothing to do as the paddings are already preset to zero */
            }
            else if (config.padMode == PoolingConfiguration::PaddingMode::SAME)
            {
                /* TensorFlow Logic:
                 * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i]
                 *
                 * if total padding is odd, the extra is added towards the end
                 */
                for (int i = 2; i < rank; i++)
                {
                    const auto j = i - 2; /* filter index */
                    const auto output_dim = (input_shape[i] - 1 + strides[j]) / strides[j];
                    const auto required_total_padding =
                        std::max<std::int64_t>(0, (output_dim - 1) * strides[j] + window_size[j] - input_shape[i]);

                    common_padding[i] = required_total_padding / 2;
                    padding_left[i] = 0;
                    padding_right[i] = required_total_padding % 2;
                }
            }

            /* in some scenarios, the extra padding at the end may not change the output at all */
            for (int i = 2; i < rank; i++) {
                const auto j = i - 2; /* filter idx */
                const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i];
                std::int64_t rem = (input_shape[i] + total_padding - window_size[j]) % strides[j];

                /* the output shape doesn't change if we decrease the total padding by at most `rem`
                 * provided that we decrease from the right
                 */
                if (rem && padding_right[i] > 0)
                    padding_right[i] = std::max<std::int64_t>(0, padding_right[i] - rem);
            }

            auto is_not_zero = [](std::size_t i) { return i != 0; };
            if (std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) ||
                std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero))
            {
                /* csl::Pooling does not fully support asymmetric padding; hence, we deal with asymmetric padding by
                 * copying the input to a bigger tensor and padding the ends manually
                 *
                 * But we first try to avoid the transformation using cuDNN's flexibility. cuDNN can accept a smaller or
                 * a bigger output shape. This effectively allows us to have arbitrary padding at the right.
                 */
                if (std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero))
                {
                    /* there is padding on the left and we are forced to transform */
                    auto transformed_input_shape = input_shape;
                    for (int i = 0; i < rank; i++)
                        transformed_input_shape[i] += padding_left[i] + padding_right[i];

                    transformedInput.resize(std::begin(transformed_input_shape), std::end(transformed_input_shape));
                    inputTransformer = csl::TensorTransform<T>(cudnnHandle, padding_left, padding_right);
                }
            }

            typename csl::Pooling<T>::params_type params;
            if (transformedInput.empty())
            {
                /* no transform => use original input shape */
                params.input_shape.assign(std::begin(input_shape), std::end(input_shape));
            }
            else
            {
                /* the pooling operation will be seeing the transformed input */
                auto transformed_input_shape = transformedInput.shape_as_vector();
                params.input_shape.assign(std::begin(transformed_input_shape), std::end(transformed_input_shape));
            }

            auto output_shape = input_shape;
            for (int i = 2; i < rank; i++)
            {
                auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i];
                output_shape[i] = (params.input_shape[i] + total_padding - window_size[i - 2]) / strides[i - 2] + 1;
            }

            params.output_shape.assign(std::begin(output_shape), std::end(output_shape));
            params.window_size = window_size;
            params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding));
            params.stride = strides;

            if (config.poolMode == PoolingConfiguration::PoolingMode::MAX)
            {
                params.type = csl::Pooling<T>::PoolingType::MAX;
            }
            else if (config.poolMode == PoolingConfiguration::PoolingMode::AVERAGE_INCLUDE_PADDING)
            {
                params.type = csl::Pooling<T>::PoolingType::AVERAGE_INCLUDE_PADDING;
            }
            else if (config.poolMode == PoolingConfiguration::PoolingMode::AVERAGE_EXCLUDE_PADDING)
            {
                params.type = csl::Pooling<T>::PoolingType::AVERAGE_EXCLUDE_PADDING;
            }

            pooler = csl::Pooling<T>(cudnnHandle, params);
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(inputs.size() == 1 && outputs.size() == 1);

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            if (!transformedInput.empty())
            {
                inputTransformer.transform(input, transformedInput);
                input = csl::TensorView<T>(transformedInput);
            }

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            pooler.pool(input, output);
        }

    private:
        csl::cudnn::Handle cudnnHandle;
        csl::Pooling<T> pooler;

        csl::Tensor<T> transformedInput;
        csl::TensorTransform<T> inputTransformer;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_POOLING_HPP */
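The two padding rules spelled out in the comments above reduce to a few lines of integer arithmetic. A sketch of both, under the formulas stated in the comments (output size with FLOOR/CEIL rounding, and TensorFlow SAME with o = ceil(i / s)):

#include <algorithm>
#include <cstdint>

// Output size along one axis for a manually padded pooling window.
std::int64_t pooled_output_dim(std::int64_t input_dim, std::int64_t kernel,
                               std::int64_t pad_begin, std::int64_t pad_end,
                               std::int64_t stride, bool ceil_mode)
{
    const auto numer = input_dim + pad_begin + pad_end - kernel;
    const auto extra = ceil_mode ? (stride - 1) : 0; // ceil(a/b) == floor((a+b-1)/b)
    return (numer + extra) / stride + 1;
}

// TensorFlow-style SAME: total_padding = (o - 1) * s + k - i with o = ceil(i / s).
std::int64_t same_padding_total(std::int64_t input_dim, std::int64_t kernel, std::int64_t stride)
{
    const auto output_dim = (input_dim + stride - 1) / stride;
    return std::max<std::int64_t>(0, (output_dim - 1) * stride + kernel - input_dim);
}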
136
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/prior_box.hpp
vendored
Normal file
@ -0,0 +1,136 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PRIOR_BOX_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PRIOR_BOX_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/span.hpp"
#include "../csl/tensor.hpp"

#include "../kernels/prior_box.hpp"

#include <cstddef>
#include <vector>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    struct PriorBoxConfiguration {
        std::size_t feature_map_width, feature_map_height;
        std::size_t image_width, image_height;

        /* parameters for prior boxes for each feature point */
        std::vector<float> box_widths, box_heights;
        std::vector<float> offsets_x, offsets_y;
        float stepX, stepY;

        std::vector<float> variance;

        /* number of priors per feature point */
        std::size_t num_priors;

        /* clamps the box coordinates to [0, 1] range */
        bool clip;

        /* normalizes the box coordinates using the image dimensions */
        bool normalize;
    };

    template <class T>
    class PriorBoxOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        PriorBoxOp(csl::Stream stream_, const PriorBoxConfiguration& config)
            : stream(std::move(stream_))
        {
            feature_map_width = config.feature_map_width;
            feature_map_height = config.feature_map_height;

            image_width = config.image_width;
            image_height = config.image_height;

            const auto& box_widths = config.box_widths;
            const auto& box_heights = config.box_heights;
            CV_Assert(box_widths.size() == box_heights.size());

            box_size = box_widths.size();

            const auto& offsets_x = config.offsets_x;
            const auto& offsets_y = config.offsets_y;
            CV_Assert(offsets_x.size() == offsets_y.size());

            offset_size = offsets_x.size();

            /* for better memory utilization and presumably better cache performance, we merge
             * the four vectors and put them in a single tensor
             */
            auto total = box_widths.size() * 2 + offsets_x.size() * 2;
            std::vector<float> merged_params;
            merged_params.insert(std::end(merged_params), std::begin(box_widths), std::end(box_widths));
            merged_params.insert(std::end(merged_params), std::begin(box_heights), std::end(box_heights));
            merged_params.insert(std::end(merged_params), std::begin(offsets_x), std::end(offsets_x));
            merged_params.insert(std::end(merged_params), std::begin(offsets_y), std::end(offsets_y));
            CV_Assert(merged_params.size() == total);

            paramsTensor.resize(total);
            csl::memcpy(paramsTensor.get(), merged_params.data(), total, stream); /* synchronous copy */

            const auto& variance_ = config.variance;
            variance.assign(std::begin(variance_), std::end(variance_));

            num_priors = config.num_priors;
            stepX = config.stepX;
            stepY = config.stepY;
            clip = config.clip;
            normalize = config.normalize;
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(inputs.size() == 2); /* we don't need the inputs but they are given */
            CV_Assert(outputs.size() == 1);

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            /* we had stored all the parameters in a single tensor; now we create appropriate views
             * for each of the parameter arrays from the single tensor
             */
            auto boxWidths = csl::View<float>(paramsTensor.get(), box_size);
            auto boxHeights = csl::View<float>(paramsTensor.get() + box_size, box_size);
            auto offsetsX = csl::View<float>(paramsTensor.get() + 2 * box_size, offset_size);
            auto offsetsY = csl::View<float>(paramsTensor.get() + 2 * box_size + offset_size, offset_size);

            kernels::generate_prior_boxes<T>(stream, output,
                boxWidths, boxHeights, offsetsX, offsetsY, stepX, stepY,
                variance, num_priors, feature_map_width, feature_map_height, image_width, image_height, normalize, clip);
        }

    private:
        csl::Stream stream;
        csl::Tensor<float> paramsTensor; /* widths, heights, offsetsX, offsetsY */

        std::size_t feature_map_width, feature_map_height;
        std::size_t image_width, image_height;

        std::size_t box_size, offset_size;
        float stepX, stepY;

        std::vector<float> variance;

        std::size_t num_priors;
        bool clip, normalize;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PRIOR_BOX_HPP */
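The kernel consumes the four merged parameter views to emit one box per (cell, prior) pair. A hypothetical sketch of how a single prior is conventionally formed from these parameters (SSD-style; the authoritative logic lives in kernels::generate_prior_boxes):

#include <algorithm>
#include <array>

// Sketch: one prior box for feature-map cell (x, y), normalized to [0, 1].
std::array<float, 4> make_prior(int x, int y, float box_w, float box_h,
                                float offset_x, float offset_y,
                                float stepX, float stepY,
                                int image_w, int image_h, bool clip)
{
    const float cx = (x + offset_x) * stepX; // box center in image pixels
    const float cy = (y + offset_y) * stepY;
    std::array<float, 4> box = {
        (cx - box_w * 0.5f) / image_w,  // xmin
        (cy - box_h * 0.5f) / image_h,  // ymin
        (cx + box_w * 0.5f) / image_w,  // xmax
        (cy + box_h * 0.5f) / image_h   // ymax
    };
    if (clip)
        for (auto& v : box) v = std::min(std::max(v, 0.f), 1.f);
    return box;
}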
187
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/region.hpp
vendored
Normal file
@ -0,0 +1,187 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REGION_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REGION_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/cudnn.hpp"
#include "../csl/tensor_ops.hpp"

#include "../kernels/region.hpp"

#include "../../nms.inl.hpp"

#include <opencv2/core.hpp>

#include <cstddef>
#include <utility>
#include <vector>

namespace cv { namespace dnn { namespace cuda4dnn {

    enum class SquashMethod {
        SOFTMAX,
        SIGMOID
    };

    template <class T>
    struct RegionConfiguration {
        /* The image is divided into (H, W) cells.
         *
         * Each cell is interested in exactly one object and predicts `boxes_per_cell` bounding boxes
         * for that object.
         *
         * Each bounding box contains:
         * - 4 box coordinates
         * - objectness confidence score
         * - `classes` number of class scores
         *
         * The object score is reduced to a probability using sigmoid and the class scores are reduced to
         * probabilities by either applying sigmoid or softmax (which is a configuration option).
         *
         * object_prob = sigmoid(object_score)
         * conditional_class_prob = sigmoid or softmax across all classes
         *
         * actual class probability = conditional_class_prob * object_prob
         */
        std::size_t classes, boxes_per_cell;
        std::size_t width_norm, height_norm;
        T scale_x_y;

        /* method for reducing class scores to probabilities */
        SquashMethod squash_method;

        /* prob cutoffs below which the prediction is nulled */
        T object_prob_cutoff;
        T class_prob_cutoff;

        T nms_iou_threshold;
        bool new_coords;
    };

    template <class T>
    class RegionOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        template <class V>
        RegionOp(csl::Stream stream_, const cv::Mat& bias, const RegionConfiguration<V>& config)
            : stream(std::move(stream_))
        {
            biasTensor = csl::makeTensorHeader<T>(bias);
            csl::copyMatToTensor<T>(bias, biasTensor, stream);

            classes = config.classes;
            boxes_per_cell = config.boxes_per_cell;

            width_norm = config.width_norm;
            height_norm = config.height_norm;

            scale_x_y = config.scale_x_y;

            squash_type = config.squash_method;
            object_prob_cutoff = config.object_prob_cutoff;
            class_prob_cutoff = config.class_prob_cutoff;

            nms_iou_threshold = config.nms_iou_threshold;
            new_coords = config.new_coords;
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(outputs.size() == 1);

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            auto rows = input.get_axis_size(1);
            auto cols = input.get_axis_size(2);

            auto cell_box_size = classes + 4 + 1;

            /* we squash class scores into probabilities using softmax or sigmoid */
            bool if_true_sigmoid_else_softmax = (squash_type == SquashMethod::SIGMOID);

            kernels::region<T>(stream, output, input, biasTensor,
                object_prob_cutoff, class_prob_cutoff,
                boxes_per_cell, cell_box_size,
                rows, cols, scale_x_y,
                height_norm, width_norm,
                if_true_sigmoid_else_softmax,
                new_coords
            );

            if (nms_iou_threshold > 0) {
                auto output_mat = output_wrapper->getMutableHostMat();
                CV_Assert(output_mat.type() == CV_32F);
                for (int i = 0; i < input.get_axis_size(0); i++) {
                    auto sample_size = rows * cols * boxes_per_cell * cell_box_size;
                    do_nms_sort(reinterpret_cast<float*>(output_mat.data) + i * sample_size, rows * cols * boxes_per_cell, class_prob_cutoff, nms_iou_threshold);
                }
            }
        }

    private:
        void do_nms_sort(float *detections, int total, float score_thresh, float nms_thresh)
        {
            std::vector<Rect2d> boxes(total);
            std::vector<float> scores(total);

            for (int i = 0; i < total; ++i)
            {
                Rect2d &b = boxes[i];
                int box_index = i * (classes + 4 + 1);
                b.width = detections[box_index + 2];
                b.height = detections[box_index + 3];
                b.x = detections[box_index + 0] - b.width / 2;
                b.y = detections[box_index + 1] - b.height / 2;
            }

            std::vector<int> indices;
            for (int k = 0; k < classes; ++k)
            {
                for (int i = 0; i < total; ++i)
                {
                    int box_index = i * (classes + 4 + 1);
                    int class_index = box_index + 5;
                    scores[i] = detections[class_index + k];
                    detections[class_index + k] = 0;
                }
                NMSBoxes(boxes, scores, score_thresh, nms_thresh, indices);
                for (int i = 0, n = indices.size(); i < n; ++i)
                {
                    int box_index = indices[i] * (classes + 4 + 1);
                    int class_index = box_index + 5;
                    detections[class_index + k] = scores[indices[i]];
                }
            }
        }

    private:
        csl::Stream stream;

        csl::Tensor<T> biasTensor;
        std::size_t classes, boxes_per_cell;
        std::size_t width_norm, height_norm;
        T scale_x_y;

        SquashMethod squash_type;
        T object_prob_cutoff, class_prob_cutoff;

        T nms_iou_threshold;
        bool new_coords;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REGION_HPP */
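do_nms_sort above addresses the output with box_index = i * (classes + 4 + 1) and class scores at box_index + 5. Small helpers that make that layout explicit (a sketch; the flat ordering of boxes over cells and priors is an assumption):

#include <cstddef>

// Each box occupies classes + 4 + 1 floats: [x, y, w, h, objectness, class scores...].
struct RegionLayout {
    std::size_t classes, boxes_per_cell, cols;

    std::size_t box_offset(std::size_t row, std::size_t col, std::size_t box) const {
        const auto cell_box_size = classes + 4 + 1;
        return ((row * cols + col) * boxes_per_cell + box) * cell_box_size;
    }
    std::size_t objectness_offset(std::size_t row, std::size_t col, std::size_t box) const {
        return box_offset(row, col, box) + 4;
    }
    std::size_t class_offset(std::size_t row, std::size_t col, std::size_t box, std::size_t k) const {
        return box_offset(row, col, box) + 5 + k;
    }
};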
75
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/reorg.hpp
vendored
Normal file
@ -0,0 +1,75 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REORG_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REORG_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../kernels/permute.hpp"

#include <opencv2/core.hpp>

#include <vector>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    template <class T>
    class ReorgOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        ReorgOp(csl::Stream stream_, std::size_t stride_)
            : stream(std::move(stream_)), stride{ stride_ } { }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(inputs.size() == 1 && outputs.size() == 1);

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            const std::size_t permute_input_shape[] = {
                input.get_axis_size(0),
                input.get_axis_size(1) * input.get_axis_size(2) / (stride * stride),
                stride,
                input.get_axis_size(3),
                stride
            };

            constexpr std::size_t order[] = { 0, 2, 4, 1, 3 };

            const std::size_t permute_output_shape[] = {
                permute_input_shape[order[0]],
                permute_input_shape[order[1]],
                permute_input_shape[order[2]],
                permute_input_shape[order[3]],
                permute_input_shape[order[4]]
            };

            input.unsqueeze();
            input.reshape(std::begin(permute_input_shape), std::end(permute_input_shape));

            output.unsqueeze();
            output.reshape(std::begin(permute_output_shape), std::end(permute_output_shape));

            kernels::permute(stream, output, input, { std::begin(order), std::end(order) });
        }

    private:
        csl::Stream stream;
        std::size_t stride;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REORG_HPP */
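The reorg is expressed as a 5-d reshape followed by the permute {0, 2, 4, 1, 3}. A quick sketch of the shape bookkeeping for an NCHW input (reading the result as the usual darknet-style [N, C*s*s, H/s, W/s] output is an assumption here):

#include <cassert>
#include <cstddef>

// For input [N, C, H, W] and stride s, the view is [N, C*H/(s*s), s, W, s];
// the permute yields [N, s, s, C*H/(s*s), W], which the layer's output buffer
// then reinterprets with its own 4-d shape.
void check_reorg_shapes(std::size_t N, std::size_t C, std::size_t H, std::size_t W, std::size_t s)
{
    const std::size_t in_view[5] = { N, C * H / (s * s), s, W, s };
    const std::size_t order[5]   = { 0, 2, 4, 1, 3 };

    std::size_t out_view[5];
    for (int i = 0; i < 5; i++)
        out_view[i] = in_view[order[i]];

    /* the total element count is preserved by the reshape/permute pair */
    assert(out_view[0] * out_view[1] * out_view[2] * out_view[3] * out_view[4] == N * C * H * W);
}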
61
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/reshape.hpp
vendored
Normal file
@ -0,0 +1,61 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESHAPE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESHAPE_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"
#include "../csl/tensor_ops.hpp"

#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    template <class T>
    class ReshapeOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        ReshapeOp(csl::Stream stream_) : stream(std::move(stream_)) { }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            /* sometimes the output shape is passed as extra inputs; hence, >= instead of == */
            CV_Assert(inputs.size() >= outputs.size());

            for (int i = 0; i < outputs.size(); i++)
            {
                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                auto input = input_wrapper->getView();

                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output = output_wrapper->getSpan();

                if (input.get() != output.get())
                {
                    while (input.rank() < output.rank())
                        input.unsqueeze();

                    while (output.rank() < input.rank())
                        output.unsqueeze();

                    input.reshape_as(output);
                    csl::tensor_ops::copy(stream, output, input);
                }
            }
        }

    private:
        csl::Stream stream;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESHAPE_HPP */
81
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/resize.hpp
vendored
Normal file
@ -0,0 +1,81 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESIZE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESIZE_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"

#include "../kernels/resize.hpp"

#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    enum class InterpolationType {
        NEAREST_NEIGHBOUR,
        BILINEAR
    };

    struct ResizeConfiguration {
        InterpolationType type;
        bool align_corners;
        bool half_pixel_centers;
    };

    template <class T>
    class ResizeOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        ResizeOp(csl::Stream stream_, const ResizeConfiguration& config)
            : stream(std::move(stream_))
        {
            type = config.type;
            align_corners = config.align_corners;
            half_pixel_centers = config.half_pixel_centers;
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            // sometimes the target shape is taken from the second input; we don't use it however
            CV_Assert((inputs.size() == 1 || inputs.size() == 2) && outputs.size() == 1);

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            const auto compute_scale = [this](std::size_t input_size, std::size_t output_size) {
                return (align_corners && output_size > 1) ?
                    static_cast<float>(input_size - 1) / (output_size - 1) :
                    static_cast<float>(input_size) / output_size;
            };

            auto out_height = output.get_axis_size(-2), out_width = output.get_axis_size(-1);
            auto in_height = input.get_axis_size(-2), in_width = input.get_axis_size(-1);
            float scale_height = compute_scale(in_height, out_height),
                  scale_width = compute_scale(in_width, out_width);

            if (type == InterpolationType::NEAREST_NEIGHBOUR)
                kernels::resize_nn<T>(stream, output, input, scale_height, scale_width, align_corners, half_pixel_centers);
            else if (type == InterpolationType::BILINEAR)
                kernels::resize_bilinear<T>(stream, output, input, scale_height, scale_width, half_pixel_centers);
        }

    private:
        csl::Stream stream;
        InterpolationType type;
        bool align_corners, half_pixel_centers;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESIZE_HPP */
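The scale factors computed above feed the kernels' destination-to-source coordinate mapping. A sketch of the usual rules (assumed semantics): with align_corners, scale = (in - 1) / (out - 1), so dst * scale hits both endpoints exactly; with half-pixel centers, pixel centers are aligned instead:

#include <cstddef>

// Maps a destination index to a (fractional) source coordinate.
float source_coordinate(std::size_t dst, float scale, bool half_pixel_centers)
{
    return half_pixel_centers ? (static_cast<float>(dst) + 0.5f) * scale - 0.5f
                              : static_cast<float>(dst) * scale;
}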
52
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/roi_pooling.hpp
vendored
Normal file
@ -0,0 +1,52 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ROI_POOLING_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ROI_POOLING_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"

#include "../kernels/roi_pooling.hpp"

#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    template <class T>
    class ROIPoolingOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        ROIPoolingOp(csl::Stream stream_, float spatial_scale)
            : stream(std::move(stream_)), spatial_scale{spatial_scale} { }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(inputs.size() == 2 && outputs.size() == 1);

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            auto rois_wrapper = inputs[1].dynamicCast<wrapper_type>();
            auto rois = rois_wrapper->getView();

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            kernels::roi_pooling<T>(stream, output, input, rois, spatial_scale);
        }

    private:
        csl::Stream stream;
        float spatial_scale;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ROI_POOLING_HPP */
158
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/scale_shift.hpp
vendored
Normal file
@ -0,0 +1,158 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include "../kernels/scale_shift.hpp"

#include <opencv2/core.hpp>

#include <cstddef>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    struct ScaleShiftConfiguration {
        enum class OpMode {
            NONE,
            TRAINABLE, /* use a pretrained blob */
            UNTRAINABLE /* use another input */
        };

        OpMode scaleMode;
        OpMode shiftMode;

        std::size_t axis;
    };

    template <class T>
    class ScaleShiftOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        ScaleShiftOp(csl::Stream stream_, const ScaleShiftConfiguration& config, const cv::Mat& weights, const cv::Mat& bias)
            : stream(std::move(stream_)), axis{ config.axis }
        {
            scaleMode = config.scaleMode;
            if (scaleMode == ScaleShiftConfiguration::OpMode::TRAINABLE)
            {
                CV_Assert(!weights.empty());
                weightsTensor = csl::makeTensorHeader<T>(weights);
                csl::copyMatToTensor<T>(weights, weightsTensor, stream);
            }

            shiftMode = config.shiftMode;
            if (shiftMode == ScaleShiftConfiguration::OpMode::TRAINABLE)
            {
                CV_Assert(!bias.empty());
                biasTensor = csl::makeTensorHeader<T>(bias);
                csl::copyMatToTensor<T>(bias, biasTensor, stream);
            }

            CV_Assert(scaleMode != ScaleShiftConfiguration::OpMode::NONE ||
                      shiftMode != ScaleShiftConfiguration::OpMode::NONE);

            if (scaleMode == ScaleShiftConfiguration::OpMode::UNTRAINABLE &&
                shiftMode == ScaleShiftConfiguration::OpMode::UNTRAINABLE)
            {
                CV_Error(cv::Error::StsNotImplemented, "scale and shift both in untrainable mode is not supported");
            }
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(outputs.size() == 1);

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            /* number of batches in the weights/bias
             * trainable mode: same for all batches
             * untrainable mode: could be different for different batch samples
             */
            std::size_t parameter_batch_size = 1;

            csl::TensorView<T> weights;
            if (scaleMode == ScaleShiftConfiguration::OpMode::TRAINABLE)
            {
                CV_Assert(!weightsTensor.empty());
                weights = csl::TensorView<T>(weightsTensor);
            }
            else if (scaleMode == ScaleShiftConfiguration::OpMode::UNTRAINABLE)
            {
                CV_Assert(inputs.size() == 2);
                auto wrapper = inputs[1].dynamicCast<wrapper_type>();
                weights = wrapper->getView();

                parameter_batch_size = weights.get_axis_size(0);
                CV_Assert(parameter_batch_size == input.get_axis_size(0));
            }

            csl::TensorView<T> bias;
            if (shiftMode == ScaleShiftConfiguration::OpMode::TRAINABLE)
            {
                CV_Assert(!biasTensor.empty());
                bias = csl::TensorView<T>(biasTensor);
            }
            else if (shiftMode == ScaleShiftConfiguration::OpMode::UNTRAINABLE)
            {
                CV_Assert(inputs.size() == 2);
                auto wrapper = inputs[1].dynamicCast<wrapper_type>();
                bias = wrapper->getView();

                parameter_batch_size = bias.get_axis_size(0);
                CV_Assert(parameter_batch_size == input.get_axis_size(0));
            }

            CV_Assert(!weights.empty() || !bias.empty());
            if (!weights.empty() && !bias.empty())
            {
                CV_CheckEQ(weights.size(), bias.size(), "different broadcasting options for weights and bias is not supported");
            }

            const auto num_parameters = !weights.empty() ? weights.size() : bias.size();
            const auto mid_size = num_parameters / parameter_batch_size;

            /* the scale shift operation might require broadcasting */
            const int end_axis = [&] {
                for (int endAxis = axis + 1; endAxis <= input.rank(); endAxis++) {
                    if (input.size_range(axis, endAxis) == mid_size)
                        return endAxis;
                }
                CV_Assert(0 /* failed to find a broadcast config */);
            }();

            std::size_t inner_size = input.size_range(end_axis, input.rank());

            if (!weights.empty() && !bias.empty())
                kernels::scaleN_with_biasN<T>(stream, output, input, inner_size, weights, bias);
            else if (!weights.empty())
                kernels::scaleN<T>(stream, output, input, inner_size, weights);
            else
                kernels::biasN<T>(stream, output, input, inner_size, bias);
        }

    private:
        csl::Stream stream;
        csl::Tensor<T> weightsTensor, biasTensor;
        std::size_t axis;

        ScaleShiftConfiguration::OpMode scaleMode, shiftMode;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP */
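The end_axis search above picks the axis range whose element count matches the parameter count, so parameter j applies to one mid-slice of inner_size elements. A CPU sketch of the resulting broadcast (parameter batching is folded into weights.size() here for brevity):

#include <cstddef>
#include <vector>

// The flat input is viewed as [outer, mid, inner]; parameter j scales and
// shifts every element of the j-th mid-slice.
void scale_bias_ref(std::vector<float>& data,
                    const std::vector<float>& weights, const std::vector<float>& bias,
                    std::size_t inner_size)
{
    const std::size_t mid = weights.size();
    for (std::size_t n = 0; n < data.size(); n++)
    {
        const std::size_t j = (n / inner_size) % mid; // parameter index for this element
        data[n] = data[n] * weights[j] + bias[j];
    }
}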
76
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/shortcut.hpp
vendored
Normal file
@ -0,0 +1,76 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SHORTCUT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SHORTCUT_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"
#include "../csl/tensor_ops.hpp"

#include "../kernels/shortcut.hpp"

#include <opencv2/core.hpp>

#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    template <class T>
    class ShortcutOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        ShortcutOp(csl::Stream stream_) : stream(std::move(stream_)) { }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(outputs.size() == 1);

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            /* output shape is determined by the input shape */
            CV_Assert(is_shape_same(output, input));

            for (int i = 1; i < inputs.size(); i++)
            {
                auto from_wrapper = inputs[i].dynamicCast<wrapper_type>();
                auto from = from_wrapper->getView();

                /* all axes except the channel axis must match; the inner loop variable is
                 * named `j` so that it does not shadow the input index `i` used below */
                CV_Assert(output.rank() == from.rank());
                for (int j = 0; j < output.rank(); j++) {
                    if (j != 1) {
                        CV_Assert(from.get_axis_size(j) == output.get_axis_size(j));
                    }
                }

                if (i == 1)
                {
                    /* optimized path for first two inputs */
                    kernels::input_shortcut<T>(stream, output, input, from);
                }
                else
                {
                    kernels::input_shortcut<T>(stream, output, output, from);
                }
            }
        }

    private:
        csl::Stream stream;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SHORTCUT_HPP */
79
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/shuffle_channel.hpp
vendored
Normal file
@ -0,0 +1,79 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SHUFFLE_CHANNEL_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SHUFFLE_CHANNEL_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/tensor_ops.hpp"

#include "../kernels/permute.hpp"

#include <opencv2/core.hpp>

#include <vector>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    template <class T>
    class ShuffleChannelOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        ShuffleChannelOp(csl::Stream stream_, std::size_t group_)
            : stream(std::move(stream_)), group{ group_ } { }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(inputs.size() == 1 && outputs.size() == 1);

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            if (group == 1) {
                /* permute is redundant; check else branch to know why */
                if (input.get() != output.get()) {
                    input.reshape_as(output);
                    csl::tensor_ops::copy(stream, output, input);
                }
            } else {
                const std::size_t permute_input_shape[] = {
                    input.get_axis_size(0),
                    group,
                    input.get_axis_size(1) / group,
                    input.get_axis_size(2) * input.get_axis_size(3)
                };

                constexpr std::size_t order[] = { 0, 2, 1, 3 };

                const std::size_t permute_output_shape[] = {
                    permute_input_shape[order[0]],
                    permute_input_shape[order[1]],
                    permute_input_shape[order[2]],
                    permute_input_shape[order[3]],
                };

                input.reshape(std::begin(permute_input_shape), std::end(permute_input_shape));
                output.reshape(std::begin(permute_output_shape), std::end(permute_output_shape));
                kernels::permute(stream, output, input, { std::begin(order), std::end(order) });
            }
        }

    private:
        csl::Stream stream;
        std::size_t group;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SHUFFLE_CHANNEL_HPP */
|
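The reshape-permute pair above realizes the usual channel-shuffle index mapping: output channel k reads input channel (k % group) * (C / group) + k / group. A CPU sketch for batch size 1 (illustrative only, not part of the vendored source); with C = 6 and group = 2 the source order becomes 0, 3, 1, 4, 2, 5:

#include <cstddef>
#include <vector>

/* CPU sketch of the shuffle performed by the reshape + permute above. */
std::vector<float> shuffle_channels_cpu(const std::vector<float>& input,
                                        std::size_t C, std::size_t group,
                                        std::size_t inner_size /* H * W */)
{
    std::vector<float> output(input.size());
    for (std::size_t k = 0; k < C; k++) {
        const std::size_t src = (k % group) * (C / group) + k / group;
        for (std::size_t j = 0; j < inner_size; j++)
            output[k * inner_size + j] = input[src * inner_size + j];
    }
    return output;
}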
66
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/slice.hpp
vendored
Normal file
@@ -0,0 +1,66 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SLICE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SLICE_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"

#include "../kernels/slice.hpp"
#include "../kernels/fill_copy.hpp"

#include <opencv2/core.hpp>

#include <cstddef>
#include <vector>
#include <algorithm>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    template <class T>
    class SliceOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        /* offsets is indexed by output number and each subvector is indexed by axis number */
        SliceOp(csl::Stream stream_, std::vector<std::vector<std::size_t>> offsets)
            : stream(std::move(stream_)), offsets(std::move(offsets))
        {
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            /* sometimes the output shape is passed in the form of a second input tensor;
             * it's only required for initialization and not here
             */
            CV_Assert(inputs.size() == 1 || inputs.size() == 2);

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            CV_Assert(offsets.size() == outputs.size());

            for (int i = 0; i < outputs.size(); ++i)
            {
                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output = output_wrapper->getSpan();

                kernels::slice<T>(stream, output, input, offsets[i]);
            }
        }

    private:
        csl::Stream stream;
        std::vector<std::vector<std::size_t>> offsets;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SLICE_HPP */
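Each `kernels::slice` call above copies the sub-block of the input that starts at that output's per-axis offsets and has the output's own shape. A 2D CPU sketch of the assumed indexing (illustrative only, not part of the vendored source):

#include <cstddef>
#include <vector>

/* Sketch: output (r, c) reads input (r + off_row, c + off_col). */
void slice2d_cpu(std::vector<float>& out, std::size_t out_rows, std::size_t out_cols,
                 const std::vector<float>& in, std::size_t in_cols,
                 std::size_t off_row, std::size_t off_col)
{
    for (std::size_t r = 0; r < out_rows; r++)
        for (std::size_t c = 0; c < out_cols; c++)
            out[r * out_cols + c] = in[(r + off_row) * in_cols + (c + off_col)];
}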
53
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/softmax.hpp
vendored
Normal file
@@ -0,0 +1,53 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SOFTMAX_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SOFTMAX_HPP

#include "../../op_cuda.hpp"

#include "../csl/cudnn.hpp"
#include "../csl/tensor_ops.hpp"

#include <cstddef>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    template <class T>
    class SoftmaxOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        SoftmaxOp(csl::cudnn::Handle handle, std::size_t axis_, bool log_)
            : cudnnHandle(std::move(handle)), channel_axis{ axis_ }, log{ log_ }
        {
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            for (int i = 0; i < inputs.size(); i++)
            {
                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
                auto input = input_wrapper->getView();

                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output = output_wrapper->getSpan();

                csl::tensor_ops::softmax<T>(cudnnHandle, output, input, channel_axis, log);
            }
        }

    private:
        csl::cudnn::Handle cudnnHandle;
        std::size_t channel_axis;
        bool log;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SOFTMAX_HPP */
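For reference, the cuDNN call above computes softmax (or log-softmax when `log` is set) independently over each slice along `channel_axis`. A numerically stable CPU sketch over a single slice (illustrative only, not part of the vendored source):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

/* Stable (log-)softmax over one slice: subtracting the maximum before
 * exponentiating keeps exp() from overflowing. */
std::vector<float> softmax_cpu(const std::vector<float>& x, bool log_variant)
{
    const float x_max = *std::max_element(x.begin(), x.end());
    std::vector<float> y(x.size());
    float sum = 0.f;
    for (std::size_t i = 0; i < x.size(); i++)
        sum += y[i] = std::exp(x[i] - x_max);
    for (auto& v : y)
        v = log_variant ? std::log(v / sum) : v / sum;
    return y;
}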
54
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/split.hpp
vendored
Normal file
@@ -0,0 +1,54 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SPLIT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SPLIT_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/tensor_ops.hpp"

#include <opencv2/core.hpp>

#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    template <class T>
    class SplitOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        SplitOp(csl::Stream stream_)
            : stream(std::move(stream_))
        {
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(inputs.size() == 1);

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            for (int i = 0; i < outputs.size(); i++)
            {
                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
                auto output = output_wrapper->getSpan();

                csl::tensor_ops::copy<T>(stream, output, input);
            }
        }

    private:
        csl::Stream stream;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SPLIT_HPP */
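The loop above gives every output a full, independent copy of the single input; a trivial CPU sketch of that contract (illustrative only, not part of the vendored source):

#include <cstddef>
#include <vector>

/* Sketch: a split node duplicates its single input into each output. */
std::vector<std::vector<float>> split_cpu(const std::vector<float>& input,
                                          std::size_t num_outputs)
{
    return std::vector<std::vector<float>>(num_outputs, input);
}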
230
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/transpose_convolution.hpp
vendored
Normal file
@@ -0,0 +1,230 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_TRANSPOSE_CONVOLUTION_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_TRANSPOSE_CONVOLUTION_HPP

#include "../../op_cuda.hpp"

#include "../csl/cudnn.hpp"
#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"
#include "../csl/tensor_ops.hpp"

#include "../kernels/scale_shift.hpp"

#include <opencv2/core.hpp>

#include <cstddef>
#include <cstdint>
#include <vector>
#include <utility>
#include <algorithm>

namespace cv { namespace dnn { namespace cuda4dnn {

    struct TransposeConvolutionConfiguration {
        /* other than `input_shape` and `output_shape`, all the configuration values must be provided
         * for the corresponding convolution operation (not transpose convolution)
         */

        /* the size of the following vectors must be equal to the kernel size */
        std::vector<std::size_t> kernel_size;
        std::vector<std::size_t> dilations, strides;

        enum class PaddingMode {
            MANUAL, /* uses explicit padding values provided in `pads_begin` and `pads_end` */
            VALID,  /* no padding is added */
            SAME    /* TensorFlow logic is used for same padding */
        };

        /* explicit paddings are used if and only if padMode is set to manual */
        PaddingMode padMode;
        std::vector<std::size_t> pads_begin, pads_end;

        /* full shape inclusive of channel and batch axis */
        std::vector<std::size_t> input_shape;
        std::vector<std::size_t> output_shape;

        /* group count for grouped convolution */
        std::size_t groups;
    };
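A hedged example of filling this configuration for a 2D case; the numbers are illustrative and not taken from the original source. Note that `kernel_size`, `strides`, `dilations` and the padding describe the underlying convolution, while `input_shape`/`output_shape` describe the transpose convolution itself:

/* illustrative: 3x3 kernel, stride 2, no padding, single group;
 * shapes are NCHW, so H' = (H - 1) * stride + kernel = 7 * 2 + 3 = 17 */
TransposeConvolutionConfiguration config;
config.kernel_size  = { 3, 3 };
config.dilations    = { 1, 1 };
config.strides      = { 2, 2 };
config.padMode      = TransposeConvolutionConfiguration::PaddingMode::VALID;
config.input_shape  = { 1, 16, 8, 8 };   /* N, C_in, H, W */
config.output_shape = { 1, 8, 17, 17 };  /* N, C_out, H', W' */
config.groups       = 1;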

    template <class T>
    class TransposeConvolutionOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        TransposeConvolutionOp(csl::Stream stream_, csl::cudnn::Handle handle, const TransposeConvolutionConfiguration& config, const Mat& filters, const Mat& bias)
            : stream(std::move(stream_)), cudnnHandle(std::move(handle))
        {
            /* we make use of the backward pass of convolution to perform the forward pass
             * of transpose convolution; hence, we must set up the configuration for the
             * convolution operation and perform its backward pass
             */
            const auto& kernel_size = config.kernel_size;
            const auto& dilations = config.dilations;
            const auto& strides = config.strides;

            const auto convolution_order = kernel_size.size();
            CV_Assert(convolution_order >= 1);

            CV_Assert(convolution_order == dilations.size());
            CV_Assert(convolution_order == strides.size());

            const auto& input_shape = config.input_shape;
            const auto& output_shape = config.output_shape;
            CV_Assert(input_shape.size() == output_shape.size());
            CV_Assert(input_shape.size() == convolution_order + 2);

            const auto groups = config.groups;

            if (convolution_order > 3)
                CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D transpose convolution is supported.");

            const auto rank = input_shape.size();
            const auto input_feature_maps = input_shape[1];
            const auto output_feature_maps = output_shape[1];
            const auto output_feature_maps_per_group = output_feature_maps / groups;
            CV_Assert(output_feature_maps % groups == 0);

            filtersTensor = csl::makeTensorHeader<T>(filters);
            csl::copyMatToTensor<T>(filters, filtersTensor, stream);

            if (!bias.empty())
            {
                CV_Assert(bias.total() == output_feature_maps);
                biasTensor = csl::makeTensorHeader<T>(bias);
                csl::copyMatToTensor<T>(bias, biasTensor, stream);
            }

            /* `left` and `right` are misleading as the padding is applicable to any number of dimensions,
             * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end`
             *
             * `common_padding` contains the amount of padding that has to be added to both sides
             * `padding_left` and `padding_right` contain the amount of padding that needs to be added
             * to a particular side in addition to the common padding
             *
             * note that we compute the padding for the convolution operation
             */
            std::vector<std::size_t> common_padding(rank, 0);
            std::vector<std::size_t> padding_left(rank, 0), padding_right(rank, 0);
            if (config.padMode == TransposeConvolutionConfiguration::PaddingMode::MANUAL)
            {
                const auto& pads_begin = config.pads_begin;
                const auto& pads_end = config.pads_end;

                CV_Assert(convolution_order == pads_begin.size());
                CV_Assert(convolution_order == pads_end.size());

                for (int i = 2; i < common_padding.size(); i++)
                {
                    common_padding[i] = std::min(pads_begin[i - 2], pads_end[i - 2]);
                    padding_left[i] = pads_begin[i - 2] - common_padding[i];
                    padding_right[i] = pads_end[i - 2] - common_padding[i];
                }
            }
            else if (config.padMode == TransposeConvolutionConfiguration::PaddingMode::VALID)
            {
                /* nothing to do as the paddings are already preset to zero */
            }
            else if (config.padMode == TransposeConvolutionConfiguration::PaddingMode::SAME)
            {
                /* TensorFlow logic:
                 * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i]
                 *
                 * if the total padding is odd, the extra is added towards the end
                 */
                for (int i = 2; i < rank; i++)
                {
                    const auto j = i - 2; /* filter index */
                    const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
                    const auto required_total_padding =
                        std::max<std::int64_t>(0, (input_shape[i] - 1) * strides[j] + effective_kernel_size - output_shape[i]);

                    common_padding[i] = required_total_padding / 2;
                    padding_left[i] = 0;
                    padding_right[i] = required_total_padding % 2;
                }
            }
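            /* worked example (illustrative, not from the original source):
             * for an axis with input_shape[i] = 5, output_shape[i] = 10,
             * stride 2 and effective kernel size 3,
             *   required_total_padding = max(0, (5 - 1) * 2 + 3 - 10) = 1
             * giving common_padding[i] = 0, padding_left[i] = 0 and the odd
             * extra element, padding_right[i] = 1, added towards the end.
             */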

            /* in some scenarios, the extra padding at the end may not change the output at all */
            for (int i = 2; i < rank; i++) {
                const auto j = i - 2; /* filter index */
                const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i];
                const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
                std::int64_t rem = (input_shape[i] + total_padding - effective_kernel_size) % strides[j];

                /* the output shape doesn't change if we decrease the total padding by at most `rem`,
                 * provided that we decrease it from the right
                 */
                if (rem && padding_right[i] > 0)
                    padding_right[i] = std::max<std::int64_t>(0, padding_right[i] - rem);
            }

            auto is_not_zero = [](std::size_t i) { return i != 0; };
            if (std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) ||
                std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero))
            {
                CV_Error(Error::StsNotImplemented, "Padding configuration requires asymmetric padding and hence is not supported.");
            }

            typename csl::TransposeConvolution<T>::params_type params;
            params.input_shape.assign(std::begin(input_shape), std::end(input_shape));
            params.output_shape.assign(std::begin(output_shape), std::end(output_shape));

            auto& fshape = params.filter_shape;
            fshape.resize(rank);
            fshape[0] = input_feature_maps;
            fshape[1] = output_feature_maps_per_group;
            std::copy(std::begin(kernel_size), std::end(kernel_size), std::begin(fshape) + 2);
            CV_Assert(fshape.size() == kernel_size.size() + 2);

            params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding));
            params.stride = strides;
            params.dilation = dilations;
            params.groups = config.groups;

            convoluter = csl::TransposeConvolution<T>(cudnnHandle, params);

            csl::WorkspaceBuilder builder;
            builder.require(convoluter.get_workspace_size());
            scratch_mem_in_bytes = builder.required_workspace_size();
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(inputs.size() == 1 && outputs.size() == 1);

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            csl::WorkspaceAllocator allocator(workspace);
            convoluter.transpose_convolve(output, input, filtersTensor, allocator.get_instance());
            if (!biasTensor.empty())
            {
                std::size_t inner_size = total(output_wrapper->getShape(), 2, -1);
                kernels::biasN<T>(stream, output, output, inner_size, biasTensor);
            }
        }

        std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }

    private:
        csl::Stream stream;
        csl::cudnn::Handle cudnnHandle;
        csl::Tensor<T> filtersTensor, biasTensor;
        csl::TransposeConvolution<T> convoluter;

        std::size_t scratch_mem_in_bytes;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_TRANSPOSE_CONVOLUTION_HPP */