feat: switch backend to PaddleOCR-NCNN; switch the project to CMake

1. The project backend has been migrated to the PaddleOCR-NCNN pipeline and has passed basic compatibility tests.
2. The project is now organized with CMake; to better accommodate third-party libraries, a QMake project is no longer provided.
3. Reworked the copyright/notice files and reorganized the code tree to minimize the risk of license infringement.

Log: switch backend to PaddleOCR-NCNN; switch the project to CMake
Change-Id: I4d5d2c5d37505a4a24b389b1a4c5d12f17bfa38c
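For context only (not part of this diff): inference through the ncnn runtime that backs PaddleOCR-NCNN typically follows the pattern sketched below. The model file names and the blob names "input"/"output" are placeholders assumed for illustration, not the project's actual identifiers.

// Hedged sketch of an ncnn-based forward pass; paths and blob names are placeholders.
#include "net.h"   // ncnn

static int run_detector_once(const unsigned char* bgr, int w, int h)
{
    ncnn::Net det;
    if (det.load_param("det.param") != 0 || det.load_model("det.bin") != 0)
        return -1;  // model files not found

    // Wrap the BGR buffer as an ncnn::Mat (preprocessing/normalization omitted).
    ncnn::Mat in = ncnn::Mat::from_pixels(bgr, ncnn::Mat::PIXEL_BGR, w, h);

    ncnn::Extractor ex = det.create_extractor();
    ex.input("input", in);         // placeholder blob name
    ncnn::Mat out;
    ex.extract("output", out);     // placeholder blob name
    return out.w * out.h * out.c;  // caller post-processes the raw output
}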
3rdparty/opencv-4.5.4/modules/dnn/src/caffe/caffe_importer.cpp (vendored, new file, 593 lines)
@@ -0,0 +1,593 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "../precomp.hpp"

#ifdef HAVE_PROTOBUF
#include <iostream>
#include <fstream>
#include <sstream>
#include <algorithm>
#include <google/protobuf/message.h>
#include <google/protobuf/text_format.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include "caffe_io.hpp"
#endif

namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN

#ifdef HAVE_PROTOBUF
using ::google::protobuf::RepeatedField;
using ::google::protobuf::RepeatedPtrField;
using ::google::protobuf::Message;
using ::google::protobuf::Descriptor;
using ::google::protobuf::FieldDescriptor;
using ::google::protobuf::Reflection;

namespace
{

template<typename T>
static cv::String toString(const T &v)
{
    std::ostringstream ss;
    ss << v;
    return ss.str();
}

static inline
MatShape parseBlobShape(const caffe::BlobShape& _input_shape)
{
    MatShape shape;
    for (int i = 0; i < _input_shape.dim_size(); i++)
    {
        shape.push_back((int)_input_shape.dim(i));
    }
    return shape;
}

class CaffeImporter
{
    caffe::NetParameter net;
    caffe::NetParameter netBinary;

public:

    CaffeImporter(const char *pototxt, const char *caffeModel)
    {
        CV_TRACE_FUNCTION();

        ReadNetParamsFromTextFileOrDie(pototxt, &net);

        if (caffeModel && caffeModel[0])
            ReadNetParamsFromBinaryFileOrDie(caffeModel, &netBinary);
    }

    CaffeImporter(const char *dataProto, size_t lenProto,
                  const char *dataModel, size_t lenModel)
    {
        CV_TRACE_FUNCTION();

        ReadNetParamsFromTextBufferOrDie(dataProto, lenProto, &net);

        if (dataModel != NULL && lenModel > 0)
            ReadNetParamsFromBinaryBufferOrDie(dataModel, lenModel, &netBinary);
    }

    void extractCustomParams(const google::protobuf::UnknownFieldSet& unknownFields, cv::dnn::LayerParams &params)
    {
        const int numFields = unknownFields.field_count();
        for (int i = 0; i < numFields; ++i)
        {
            const google::protobuf::UnknownField& field = unknownFields.field(i);
            CV_Assert(field.type() == google::protobuf::UnknownField::TYPE_GROUP);
            std::string fieldName = field.group().field(0).length_delimited();
            std::string fieldValue = field.group().field(1).length_delimited();
            params.set(fieldName, fieldValue);
        }
    }

    void addParam(const Message &msg, const FieldDescriptor *field, cv::dnn::LayerParams &params)
    {
        const Reflection *refl = msg.GetReflection();
        int type = field->cpp_type();
        bool isRepeated = field->is_repeated();
        const std::string &name = field->name();

#define SET_UP_FILED(getter, arrayConstr, gtype) \
        if (isRepeated) { \
            const RepeatedField<gtype> &v = refl->GetRepeatedField<gtype>(msg, field); \
            params.set(name, DictValue::arrayConstr(v.begin(), (int)v.size())); \
        } \
        else { \
            params.set(name, refl->getter(msg, field)); \
        }

        switch (type)
        {
        case FieldDescriptor::CPPTYPE_INT32:
            SET_UP_FILED(GetInt32, arrayInt, ::google::protobuf::int32);
            break;
        case FieldDescriptor::CPPTYPE_UINT32:
            SET_UP_FILED(GetUInt32, arrayInt, ::google::protobuf::uint32);
            break;
        case FieldDescriptor::CPPTYPE_INT64:
            SET_UP_FILED(GetInt32, arrayInt, ::google::protobuf::int64);
            break;
        case FieldDescriptor::CPPTYPE_UINT64:
            SET_UP_FILED(GetUInt32, arrayInt, ::google::protobuf::uint64);
            break;
        case FieldDescriptor::CPPTYPE_BOOL:
            SET_UP_FILED(GetBool, arrayInt, bool);
            break;
        case FieldDescriptor::CPPTYPE_DOUBLE:
            SET_UP_FILED(GetDouble, arrayReal, double);
            break;
        case FieldDescriptor::CPPTYPE_FLOAT:
            SET_UP_FILED(GetFloat, arrayReal, float);
            break;
        case FieldDescriptor::CPPTYPE_STRING:
            if (isRepeated) {
                const RepeatedPtrField<std::string> &v = refl->GetRepeatedPtrField<std::string>(msg, field);
                params.set(name, DictValue::arrayString(v.begin(), (int)v.size()));
            }
            else {
                params.set(name, refl->GetString(msg, field));
            }
            break;
        case FieldDescriptor::CPPTYPE_ENUM:
            if (isRepeated) {
                int size = refl->FieldSize(msg, field);
                std::vector<cv::String> buf(size);
                for (int i = 0; i < size; i++)
                    buf[i] = refl->GetRepeatedEnum(msg, field, i)->name();
                params.set(name, DictValue::arrayString(buf.begin(), size));
            }
            else {
                params.set(name, refl->GetEnum(msg, field)->name());
            }
            break;
        default:
            CV_Error(Error::StsError, "Unknown type \"" + String(field->type_name()) + "\" in prototxt");
            break;
        }
    }

    inline static bool ends_with_param(const std::string &str)
    {
        static const std::string _param("_param");
        return (str.size() >= _param.size()) && str.compare(str.size() - _param.size(), _param.size(), _param) == 0;
    }

    void extractLayerParams(const Message &msg, cv::dnn::LayerParams &params, bool isInternal = false)
    {
        const Descriptor *msgDesc = msg.GetDescriptor();
        const Reflection *msgRefl = msg.GetReflection();

        for (int fieldId = 0; fieldId < msgDesc->field_count(); fieldId++)
        {
            const FieldDescriptor *fd = msgDesc->field(fieldId);

            if (!isInternal && !ends_with_param(fd->name()))
                continue;

            const google::protobuf::UnknownFieldSet& unknownFields = msgRefl->GetUnknownFields(msg);
            bool hasData = fd->is_required() ||
                           (fd->is_optional() && msgRefl->HasField(msg, fd)) ||
                           (fd->is_repeated() && msgRefl->FieldSize(msg, fd) > 0) ||
                           !unknownFields.empty();
            if (!hasData)
                continue;

            extractCustomParams(unknownFields, params);
            if (fd->cpp_type() == FieldDescriptor::CPPTYPE_MESSAGE)
            {
                if (fd->is_repeated()) //Extract only first item!
                    extractLayerParams(msgRefl->GetRepeatedMessage(msg, fd, 0), params, true);
                else
                    extractLayerParams(msgRefl->GetMessage(msg, fd), params, true);
            }
            else
            {
                addParam(msg, fd, params);
            }
        }
    }

    void blobShapeFromProto(const caffe::BlobProto &pbBlob, MatShape& shape)
    {
        shape.clear();
        if (pbBlob.has_num() || pbBlob.has_channels() || pbBlob.has_height() || pbBlob.has_width())
        {
            shape.push_back(pbBlob.num());
            shape.push_back(pbBlob.channels());
            shape.push_back(pbBlob.height());
            shape.push_back(pbBlob.width());
        }
        else if (pbBlob.has_shape())
        {
            shape = parseBlobShape(pbBlob.shape());
        }
        else
            shape.resize(1, 1);  // Is a scalar.
    }

    void blobFromProto(const caffe::BlobProto &pbBlob, cv::Mat &dstBlob)
    {
        MatShape shape;
        blobShapeFromProto(pbBlob, shape);

        dstBlob.create((int)shape.size(), &shape[0], CV_32F);
        if (pbBlob.data_size())
        {
            // Single precision floats.
            CV_Assert(pbBlob.data_size() == (int)dstBlob.total());

            CV_DbgAssert(pbBlob.GetDescriptor()->FindFieldByLowercaseName("data")->cpp_type() == FieldDescriptor::CPPTYPE_FLOAT);
            Mat(dstBlob.dims, &dstBlob.size[0], CV_32F, (void*)pbBlob.data().data()).copyTo(dstBlob);
        }
        else
        {
            CV_Assert(pbBlob.has_raw_data());
            const std::string& raw_data = pbBlob.raw_data();
            if (pbBlob.raw_data_type() == caffe::FLOAT16)
            {
                // Half precision floats.
                CV_Assert(raw_data.size() / 2 == (int)dstBlob.total());

                Mat halfs((int)shape.size(), &shape[0], CV_16SC1, (void*)raw_data.c_str());
                convertFp16(halfs, dstBlob);
            }
            else if (pbBlob.raw_data_type() == caffe::FLOAT)
            {
                CV_Assert(raw_data.size() / 4 == (int)dstBlob.total());
                Mat((int)shape.size(), &shape[0], CV_32FC1, (void*)raw_data.c_str()).copyTo(dstBlob);
            }
            else
                CV_Error(Error::StsNotImplemented, "Unexpected blob data type");
        }
    }

    void extractBinaryLayerParams(const caffe::LayerParameter& layer, LayerParams& layerParams)
    {
        const std::string &name = layer.name();

        int li;
        for (li = 0; li != netBinary.layer_size(); li++)
        {
            const caffe::LayerParameter& binLayer = netBinary.layer(li);
            // Break if the layer name is the same and the blobs are not cleared
            if (binLayer.name() == name && binLayer.blobs_size() != 0)
                break;
        }

        if (li == netBinary.layer_size())
            return;

        caffe::LayerParameter* binLayer = netBinary.mutable_layer(li);
        const int numBlobs = binLayer->blobs_size();
        std::vector<caffe::BlobProto*> blobs(numBlobs);
        binLayer->mutable_blobs()->ExtractSubrange(0, numBlobs, blobs.data());
        layerParams.blobs.resize(numBlobs);
        for (int bi = 0; bi < numBlobs; bi++)
        {
            blobFromProto(*blobs[bi], layerParams.blobs[bi]);
            delete blobs[bi];
        }
    }

    struct BlobNote
    {
        BlobNote(const std::string &_name, int _layerId, int _outNum) :
            name(_name), layerId(_layerId), outNum(_outNum) {}

        std::string name;
        int layerId, outNum;
    };

    std::vector<BlobNote> addedBlobs;
    std::map<String, int> layerCounter;

    void populateNet(Net dstNet)
    {
        CV_TRACE_FUNCTION();

        int layersSize = net.layer_size();
        layerCounter.clear();
        addedBlobs.clear();
        addedBlobs.reserve(layersSize + 1);

        //setup input layer names
        std::vector<String> netInputs(net.input_size());
        std::vector<MatShape> inp_shapes;
        {
            int net_input_size = net.input_size();
            for (int inNum = 0; inNum < net_input_size; inNum++)
            {
                addedBlobs.push_back(BlobNote(net.input(inNum), 0, inNum));
                netInputs[inNum] = net.input(inNum);
            }

            if (net.input_dim_size() > 0)  // deprecated in Caffe proto
            {
                int net_input_dim_size = net.input_dim_size();
                CV_Check(net_input_dim_size, net_input_dim_size % 4 == 0, "");
                CV_CheckEQ(net_input_dim_size, net_input_size * 4, "");
                for (int inp_id = 0; inp_id < net_input_size; inp_id++)
                {
                    int dim = inp_id * 4;
                    MatShape shape(4);
                    shape[0] = net.input_dim(dim);
                    shape[1] = net.input_dim(dim+1);
                    shape[2] = net.input_dim(dim+2);
                    shape[3] = net.input_dim(dim+3);
                    inp_shapes.push_back(shape);
                }
            }
            else if (net.input_shape_size() > 0)  // deprecated in Caffe proto
            {
                int net_input_shape_size = net.input_shape_size();
                CV_CheckEQ(net_input_shape_size, net_input_size, "");
                for (int inp_id = 0; inp_id < net_input_shape_size; inp_id++)
                {
                    MatShape shape = parseBlobShape(net.input_shape(inp_id));
                    inp_shapes.push_back(shape);
                }
            }
            else
            {
                for (int inp_id = 0; inp_id < net_input_size; inp_id++)
                {
                    MatShape shape; // empty
                    inp_shapes.push_back(shape);
                }
            }
        }

        for (int li = 0; li < layersSize; li++)
        {
            const caffe::LayerParameter &layer = net.layer(li);
            String name = layer.name();
            String type = layer.type();
            LayerParams layerParams;

            extractLayerParams(layer, layerParams);
            extractBinaryLayerParams(layer, layerParams);

            int repetitions = layerCounter[name]++;
            if (repetitions)
                name += String("_") + toString(repetitions);

            if (type == "Input")
            {
                for (int outNum = 0; outNum < layer.top_size(); outNum++)
                {
                    addOutput(layer, 0, outNum);
                    addedBlobs.back().outNum = netInputs.size();
                    netInputs.push_back(addedBlobs.back().name);
                }
                if (layer.has_input_param())
                {
                    const caffe::InputParameter &inputParameter = layer.input_param();
                    int input_shape_size = inputParameter.shape_size();
                    CV_CheckEQ(input_shape_size, layer.top_size(), "");
                    for (int inp_id = 0; inp_id < input_shape_size; inp_id++)
                    {
                        MatShape shape = parseBlobShape(inputParameter.shape(inp_id));
                        inp_shapes.push_back(shape);
                    }
                }
                continue;
            }
            else if (type == "BatchNorm")
            {
                if (!layerParams.get<bool>("use_global_stats", true))
                {
                    CV_Assert_N(layer.bottom_size() == 1, layer.top_size() == 1);

                    LayerParams mvnParams;
                    mvnParams.set("eps", layerParams.get<float>("eps", 1e-5));
                    std::string mvnName = name + "/mvn";

                    int repetitions = layerCounter[mvnName]++;
                    if (repetitions)
                        mvnName += String("_") + toString(repetitions);

                    int mvnId = dstNet.addLayer(mvnName, "MVN", mvnParams);
                    addInput(layer.bottom(0), mvnId, 0, dstNet);
                    addOutput(layer, mvnId, 0);
                    net.mutable_layer(li)->set_bottom(0, layer.top(0));
                    layerParams.blobs[0].setTo(0);  // mean
                    layerParams.blobs[1].setTo(1);  // std
                }
            }
            else if (type == "Axpy")
            {
                CV_Assert_N(layer.bottom_size() == 3, layer.top_size() == 1);

                std::string scaleName = name + "/scale";
                int repetitions = layerCounter[scaleName]++;
                if (repetitions) {
                    scaleName += String("_") + toString(repetitions);
                }

                LayerParams scaleParams;
                scaleParams.set("axis", 1);
                scaleParams.set("has_bias", false);
                int scaleId = dstNet.addLayer(scaleName, "Scale", scaleParams);
                addInput(layer.bottom(2), scaleId, 0, dstNet);
                addInput(layer.bottom(0), scaleId, 1, dstNet);
                addOutput(layer, scaleId, 0);
                net.mutable_layer(li)->set_bottom(0, layer.top(0));
                net.mutable_layer(li)->mutable_bottom()->RemoveLast();
                type = "Eltwise";
            }
            else if (type == "Resample")
            {
                CV_Assert(layer.bottom_size() == 1 || layer.bottom_size() == 2);
                type = "Resize";
                String interp = toLowerCase(layerParams.get<String>("type"));
                layerParams.set("interpolation", interp == "linear" ? "bilinear" : interp);

                if (layerParams.has("factor"))
                {
                    float factor = layerParams.get<float>("factor");
                    CV_Assert(layer.bottom_size() != 2 || factor == 1.0);
                    layerParams.set("zoom_factor", factor);

                    if ((interp == "linear" && factor != 1.0) ||
                        (interp == "nearest" && factor < 1.0))
                        CV_Error(Error::StsNotImplemented, "Unsupported Resample mode");
                }
            }
            else if ("Convolution" == type)
            {
                CV_Assert(layer.bottom_size() == layer.top_size());
                for (int i = 0; i < layer.bottom_size(); i++)
                {
                    int conv_id = dstNet.addLayer(layer.top(i), type, layerParams);
                    addInput(layer.bottom(i), conv_id, 0, dstNet);
                    addedBlobs.push_back(BlobNote(layer.top(i), conv_id, 0));
                }
                continue;
            }
            else if ("ConvolutionDepthwise" == type)
            {
                type = "Convolution";
            }

            int id = dstNet.addLayer(name, type, layerParams);

            for (int inNum = 0; inNum < layer.bottom_size(); inNum++)
                addInput(layer.bottom(inNum), id, inNum, dstNet);

            for (int outNum = 0; outNum < layer.top_size(); outNum++)
                addOutput(layer, id, outNum);
        }
        dstNet.setInputsNames(netInputs);

        if (inp_shapes.size() > 0)
        {
            CV_CheckEQ(inp_shapes.size(), netInputs.size(), "");
            for (int inp_id = 0; inp_id < inp_shapes.size(); inp_id++)
                dstNet.setInputShape(netInputs[inp_id], inp_shapes[inp_id]);
        }

        addedBlobs.clear();
    }

    void addOutput(const caffe::LayerParameter &layer, int layerId, int outNum)
    {
        const std::string &name = layer.top(outNum);

        bool haveDups = false;
        for (int idx = (int)addedBlobs.size() - 1; idx >= 0; idx--)
        {
            if (addedBlobs[idx].name == name)
            {
                haveDups = true;
                break;
            }
        }

        if (haveDups)
        {
            bool isInplace = layer.bottom_size() > outNum && layer.bottom(outNum) == name;
            if (!isInplace)
                CV_Error(Error::StsBadArg, "Duplicate blobs produced by multiple sources");
        }

        addedBlobs.push_back(BlobNote(name, layerId, outNum));
    }

    void addInput(const std::string &name, int layerId, int inNum, Net &dstNet)
    {
        int idx;
        for (idx = (int)addedBlobs.size() - 1; idx >= 0; idx--)
        {
            if (addedBlobs[idx].name == name)
                break;
        }

        if (idx < 0)
        {
            CV_Error(Error::StsObjectNotFound, "Can't find output blob \"" + name + "\"");
            return;
        }

        dstNet.connect(addedBlobs[idx].layerId, addedBlobs[idx].outNum, layerId, inNum);
    }
};

}

Net readNetFromCaffe(const String &prototxt, const String &caffeModel /*= String()*/)
{
    CaffeImporter caffeImporter(prototxt.c_str(), caffeModel.c_str());
    Net net;
    caffeImporter.populateNet(net);
    return net;
}

Net readNetFromCaffe(const char *bufferProto, size_t lenProto,
                     const char *bufferModel, size_t lenModel)
{
    CaffeImporter caffeImporter(bufferProto, lenProto, bufferModel, lenModel);
    Net net;
    caffeImporter.populateNet(net);
    return net;
}

Net readNetFromCaffe(const std::vector<uchar>& bufferProto, const std::vector<uchar>& bufferModel)
{
    const char* bufferProtoPtr = reinterpret_cast<const char*>(&bufferProto[0]);
    const char* bufferModelPtr = bufferModel.empty() ? NULL :
        reinterpret_cast<const char*>(&bufferModel[0]);
    return readNetFromCaffe(bufferProtoPtr, bufferProto.size(),
                            bufferModelPtr, bufferModel.size());
}

#endif  //HAVE_PROTOBUF

CV__DNN_INLINE_NS_END
}}  // namespace
3rdparty/opencv-4.5.4/modules/dnn/src/caffe/caffe_io.cpp (vendored, new file, 1190 lines)
(File diff suppressed because it is too large)
3rdparty/opencv-4.5.4/modules/dnn/src/caffe/caffe_io.hpp (vendored, new file, 129 lines)
@@ -0,0 +1,129 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/*M///////////////////////////////////////////////////////////////////////////////////////
//COPYRIGHT
//
//All contributions by the University of California:
//Copyright (c) 2014, The Regents of the University of California (Regents)
//All rights reserved.
//
//All other contributions:
//Copyright (c) 2014, the respective contributors
//All rights reserved.
//
//Caffe uses a shared copyright model: each contributor holds copyright over
//their contributions to Caffe. The project versioning records all such
//contribution and copyright details. If a contributor wants to further mark
//their specific copyright on a particular contribution, they should indicate
//their copyright solely in the commit message of the change when it is
//committed.
//
//LICENSE
//
//Redistribution and use in source and binary forms, with or without
//modification, are permitted provided that the following conditions are met:
//
//1. Redistributions of source code must retain the above copyright notice, this
//   list of conditions and the following disclaimer.
//2. Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
//THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
//ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
//WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
//DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
//ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
//(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
//LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
//ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
//(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
//SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//CONTRIBUTION AGREEMENT
//
//By contributing to the BVLC/caffe repository through pull-request, comment,
//or otherwise, the contributor releases their content to the
//license and copyright terms herein.
//
//M*/

#ifndef __OPENCV_DNN_CAFFE_IO_HPP__
#define __OPENCV_DNN_CAFFE_IO_HPP__
#ifdef HAVE_PROTOBUF

#if defined(__GNUC__) && __GNUC__ >= 5
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsuggest-override"
#endif
#include "opencv-caffe.pb.h"
#if defined(__GNUC__) && __GNUC__ >= 5
#pragma GCC diagnostic pop
#endif

namespace caffe { using namespace opencv_caffe; }  // avoid massive renames from caffe proto package

namespace cv {
namespace dnn {

// Read parameters from a file into a NetParameter proto message.
void ReadNetParamsFromTextFileOrDie(const char* param_file,
                                    caffe::NetParameter* param);
void ReadNetParamsFromBinaryFileOrDie(const char* param_file,
                                      caffe::NetParameter* param);

// Read parameters from a memory buffer into a NetParameter proto message.
void ReadNetParamsFromBinaryBufferOrDie(const char* data, size_t len,
                                        caffe::NetParameter* param);
void ReadNetParamsFromTextBufferOrDie(const char* data, size_t len,
                                      caffe::NetParameter* param);

// Utility functions used internally by Caffe and TensorFlow loaders
bool ReadProtoFromTextFile(const char* filename, ::google::protobuf::Message* proto);
bool ReadProtoFromBinaryFile(const char* filename, ::google::protobuf::Message* proto);
bool ReadProtoFromTextBuffer(const char* data, size_t len, ::google::protobuf::Message* proto);
bool ReadProtoFromBinaryBuffer(const char* data, size_t len, ::google::protobuf::Message* proto);

}
}
#endif
#endif
3rdparty/opencv-4.5.4/modules/dnn/src/caffe/caffe_shrinker.cpp (vendored, new file, 80 lines)
@@ -0,0 +1,80 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.

#include "../precomp.hpp"

#ifdef HAVE_PROTOBUF
#include <fstream>
#include "caffe_io.hpp"
#endif

namespace cv { namespace dnn {
CV__DNN_INLINE_NS_BEGIN

#ifdef HAVE_PROTOBUF

void shrinkCaffeModel(const String& src, const String& dst, const std::vector<String>& layersTypes)
{
    CV_TRACE_FUNCTION();

    std::vector<String> types(layersTypes);
    if (types.empty())
    {
        types.push_back("Convolution");
        types.push_back("InnerProduct");
    }

    caffe::NetParameter net;
    ReadNetParamsFromBinaryFileOrDie(src.c_str(), &net);

    for (int i = 0; i < net.layer_size(); ++i)
    {
        caffe::LayerParameter* lp = net.mutable_layer(i);
        if (std::find(types.begin(), types.end(), lp->type()) == types.end())
        {
            continue;
        }
        for (int j = 0; j < lp->blobs_size(); ++j)
        {
            caffe::BlobProto* blob = lp->mutable_blobs(j);
            CV_Assert(blob->data_size() != 0);  // float32 array.

            Mat floats(1, blob->data_size(), CV_32FC1, (void*)blob->data().data());
            Mat halfs(1, blob->data_size(), CV_16SC1);
            convertFp16(floats, halfs);  // Convert to float16.

            blob->clear_data();  // Clear float32 data.

            // Set float16 data.
            blob->set_raw_data(halfs.data, halfs.total() * halfs.elemSize());
            blob->set_raw_data_type(caffe::FLOAT16);
        }
    }
#if GOOGLE_PROTOBUF_VERSION < 3005000
    size_t msgSize = saturate_cast<size_t>(net.ByteSize());
#else
    size_t msgSize = net.ByteSizeLong();
#endif
    std::vector<uint8_t> output(msgSize);
    net.SerializeWithCachedSizesToArray(&output[0]);

    std::ofstream ofs(dst.c_str(), std::ios::binary);
    ofs.write((const char*)&output[0], msgSize);
    ofs.close();
}

#else

void shrinkCaffeModel(const String& src, const String& dst, const std::vector<String>& types)
{
    CV_Error(cv::Error::StsNotImplemented, "libprotobuf required to import data from Caffe models");
}

#endif  // HAVE_PROTOBUF

CV__DNN_INLINE_NS_END
}}  // namespace
3rdparty/opencv-4.5.4/modules/dnn/src/caffe/glog_emulator.hpp (vendored, new file, 106 lines)
@@ -0,0 +1,106 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_DNN_CAFFE_GLOG_EMULATOR_HPP__
#define __OPENCV_DNN_CAFFE_GLOG_EMULATOR_HPP__
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <opencv2/core.hpp>

#define CHECK(cond) for(cv::dnn::GLogWrapper _logger(__FILE__, CV_Func, __LINE__, "CHECK", #cond, cond); _logger.exit(); _logger.check()) _logger.stream()
#define CHECK_EQ(a, b) for(cv::dnn::GLogWrapper _logger(__FILE__, CV_Func, __LINE__, "CHECK", #a"="#b, ((a) == (b))); _logger.exit(); _logger.check()) _logger.stream()
#define LOG(TYPE) for(cv::dnn::GLogWrapper _logger(__FILE__, CV_Func, __LINE__, #TYPE); _logger.exit(); _logger.check()) _logger.stream()

namespace cv
{
namespace dnn
{

class GLogWrapper
{
    const char *file, *func, *type, *cond_str;
    int line;
    bool cond_status, exit_loop;
    std::stringstream sstream;

public:

    GLogWrapper(const char *_file, const char *_func, int _line,
                const char *_type,
                const char *_cond_str = NULL, bool _cond_status = true
               ) :
        file(_file), func(_func), type(_type), cond_str(_cond_str),
        line(_line), cond_status(_cond_status), exit_loop(true) {}

    std::iostream &stream()
    {
        return sstream;
    }

    bool exit()
    {
        return exit_loop;
    }

    void check()
    {
        exit_loop = false;

        if (cond_str && !cond_status)
        {
            cv::error(cv::Error::StsError, "FAILED: " + String(cond_str) + ". " + sstream.str(), func, file, line);
        }
        else if (!cond_str && strcmp(type, "CHECK"))
        {
#ifndef NDEBUG
            if (!std::strcmp(type, "INFO"))
                std::cout << sstream.str() << std::endl;
            else
                std::cerr << sstream.str() << std::endl;
#endif
        }
    }
};

}
}
#endif
3rdparty/opencv-4.5.4/modules/dnn/src/caffe/opencv-caffe.proto (vendored, new file, 1649 lines)
(File diff suppressed because it is too large)
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/activation_eltwise.cu (vendored, new file, 121 lines)
@@ -0,0 +1,121 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "functors.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

namespace raw {

    template <class T, class ActivationOp, class EltwiseOp, std::size_t N>
    __global__ void generic_op_eltwise_op_inplace_vec(Span<T> inplace_output, View<T> eltwise, const typename ActivationOp::Params act_params, const typename EltwiseOp::Params eltwise_params) {
        using vector_type = get_vector_type_t<T, N>;

        auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());
        auto eltwise_vPtr = vector_type::get_pointer(eltwise.data());

        ActivationOp activation_op(act_params);
        EltwiseOp eltwise_op(eltwise_params);

        for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
            vector_type output_vec, eltwise_vec;
            v_load(output_vec, inplace_output_vPtr[i]);
            v_load(eltwise_vec, eltwise_vPtr[i]);
            for(int j = 0; j < output_vec.size(); j++)
                output_vec.data[j] = eltwise_op(activation_op(output_vec.data[j]), eltwise_vec.data[j]);
            v_store(inplace_output_vPtr[i], output_vec);
        }
    }
}

template <class T, class ActivationOp, class EltwiseOp, std::size_t N> static
void launch_vectorized_generic_op_eltwise_op_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, const typename ActivationOp::Params& act_params, const typename EltwiseOp::Params& eltwise_params) {
    CV_Assert(is_fully_aligned<T>(inplace_output, N));
    CV_Assert(is_fully_aligned<T>(eltwise, N));

    auto kernel = raw::generic_op_eltwise_op_inplace_vec<T, ActivationOp, EltwiseOp, N>;
    auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
    launch_kernel(kernel, policy, inplace_output, eltwise, act_params, eltwise_params);
}

template <class T, class ActivationOp, class EltwiseOp> static
void generic_op_eltwise_op_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, const typename ActivationOp::Params& act_params = {}, const typename EltwiseOp::Params& eltwise_params = {}) {
    CV_Assert(inplace_output.size() == eltwise.size());

    if (is_fully_aligned<T>(inplace_output, 4) && is_fully_aligned<T>(eltwise, 4)) {
        launch_vectorized_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 4>(stream, inplace_output, eltwise, act_params, eltwise_params);
    } else if (is_fully_aligned<T>(inplace_output, 2) && is_fully_aligned<T>(eltwise, 2)) {
        launch_vectorized_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 2>(stream, inplace_output, eltwise, act_params, eltwise_params);
    } else {
        launch_vectorized_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 1>(stream, inplace_output, eltwise, act_params, eltwise_params);
    }
}

template <class T>
void relu_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, T slope) {
    generic_op_eltwise_op_inplace<T, ReLUFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise, {slope});
}

template <class T>
void clipped_relu_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, T floor, T ceiling) {
    CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
    generic_op_eltwise_op_inplace<T, ClippedReLUFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise, {floor, ceiling});
}

template <class T>
void tanh_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
    generic_op_eltwise_op_inplace<T, TanHFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}

template <class T>
void swish_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
    generic_op_eltwise_op_inplace<T, SwishFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}

template <class T>
void mish_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
    generic_op_eltwise_op_inplace<T, MishFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}

template <class T>
void sigmoid_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
    generic_op_eltwise_op_inplace<T, SigmoidFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}

template <class T>
void power_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, T exp, T scale, T shift) {
    generic_op_eltwise_op_inplace<T, PowerFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise, {exp, scale, shift});
}

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>, __half);
template void clipped_relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
template void tanh_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void swish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void mish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void sigmoid_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void power_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half);
#endif

template void relu_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>, float);
template void clipped_relu_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>, float, float);
template void tanh_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void swish_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void mish_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void sigmoid_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void power_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>, float, float, float);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/activations.cu (vendored, new file, 209 lines)
@@ -0,0 +1,209 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "functors.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"

#include "../cuda4dnn/kernels/scale_shift.hpp"

#include <opencv2/core.hpp>

#include <cstddef>

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

namespace raw {
    template <class T, class ActivationOp, std::size_t N>
    __global__ void generic_op_vec(Span<T> output, View<T> input, const typename ActivationOp::Params params) {
        using vector_type = get_vector_type_t<T, N>;

        auto output_vPtr = vector_type::get_pointer(output.data());
        auto input_vPtr = vector_type::get_pointer(input.data());

        ActivationOp activation_op(params);

        for (auto i : grid_stride_range(output.size() / vector_type::size())) {
            vector_type vec;
            v_load(vec, input_vPtr[i]);
            for (int j = 0; j < vector_type::size(); j++)
                vec.data[j] = activation_op(vec.data[j]);
            v_store(output_vPtr[i], vec);
        }
    }

    template <class T, std::size_t N>
    __global__ void axiswise_relu_vec(Span<T> output, View<T> input, size_type inner_size, View<T> slope) {
        using vector_type = get_vector_type_t<T, N>;

        auto output_vPtr = vector_type::get_pointer(output.data());
        auto input_vPtr = vector_type::get_pointer(input.data());

        for (auto i : grid_stride_range(output.size() / vector_type::size())) {
            const index_type c = (i / inner_size) % slope.size();

            vector_type vec;
            v_load(vec, input_vPtr[i]);
            for (int j = 0; j < vector_type::size(); j++)
                vec.data[j] = vec.data[j] > T(0) ? vec.data[j] : vec.data[j] * slope[c];
            v_store(output_vPtr[i], vec);
        }
    }

} /* namespace raw */

template <class T, class ActivationOp, std::size_t N> static
void launch_vectorized_generic_op(const Stream& stream, Span<T> output, View<T> input, const typename ActivationOp::Params& params) {
    CV_Assert(is_fully_aligned<T>(output, N));
    CV_Assert(is_fully_aligned<T>(input, N));

    auto kernel = raw::generic_op_vec<T, ActivationOp, N>;
    auto policy = make_policy(kernel, output.size() / N, 0, stream);
    launch_kernel(kernel, policy, output, input, params);
}

template <class T, class ActivationOp> static
void generic_op(const Stream& stream, Span<T> output, View<T> input, const typename ActivationOp::Params& params = {}) {
    CV_Assert(input.size() == output.size());

    if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
        launch_vectorized_generic_op<T, ActivationOp, 4>(stream, output, input, params);
    } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
        launch_vectorized_generic_op<T, ActivationOp, 2>(stream, output, input, params);
    } else {
        launch_vectorized_generic_op<T, ActivationOp, 1>(stream, output, input, params);
    }
}

template <class T>
void relu(const Stream& stream, Span<T> output, View<T> input, T slope) {
    generic_op<T, ReLUFunctor<T>>(stream, output, input, {slope});
}

template <class T>
void clipped_relu(const Stream& stream, Span<T> output, View<T> input, T floor, T ceiling) {
    CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
    generic_op<T, ClippedReLUFunctor<T>>(stream, output, input, {floor, ceiling});
}

template <class T>
void tanh(const Stream& stream, Span<T> output, View<T> input) {
    generic_op<T, TanHFunctor<T>>(stream, output, input);
}

template <class T>
void swish(const Stream& stream, Span<T> output, View<T> input) {
    generic_op<T, SwishFunctor<T>>(stream, output, input);
}

template <class T>
void mish(const Stream& stream, Span<T> output, View<T> input) {
    generic_op<T, MishFunctor<T>>(stream, output, input);
}

template <class T>
void sigmoid(const Stream& stream, Span<T> output, View<T> input) {
    generic_op<T, SigmoidFunctor<T>>(stream, output, input);
}

template <class T>
void elu(const Stream& stream, Span<T> output, View<T> input) {
    generic_op<T, ELUFunctor<T>>(stream, output, input);
}

template <class T>
void bnll(const Stream& stream, Span<T> output, View<T> input) {
    generic_op<T, BNLLFunctor<T>>(stream, output, input);
}

template <class T>
void abs(const Stream& stream, Span<T> output, View<T> input) {
    generic_op<T, AbsFunctor<T>>(stream, output, input);
}

template <class T>
void power(const Stream& stream, Span<T> output, View<T> input, T exp, T scale, T shift) {
    CV_Assert(input.size() == output.size());

    if (static_cast<float>(exp) == 1.0f) {
        scale1_with_bias1(stream, output, input, scale, shift);
        return;
    }

    generic_op<T, PowerFunctor<T>>(stream, output, input, {exp, scale, shift});
}

template <class T>
void exp(const Stream& stream, Span<T> output, View<T> input, T normScale, T normShift) {
    generic_op<T, ExpFunctor<T>>(stream, output, input, {normScale, normShift});
}

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void relu<__half>(const Stream&, Span<__half>, View<__half>, __half);
template void clipped_relu<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
template void tanh<__half>(const Stream&, Span<__half>, View<__half>);
template void swish<__half>(const Stream&, Span<__half>, View<__half>);
template void mish<__half>(const Stream&, Span<__half>, View<__half>);
template void sigmoid<__half>(const Stream&, Span<__half>, View<__half>);
template void elu<__half>(const Stream&, Span<__half>, View<__half>);
template void abs<__half>(const Stream& stream, Span<__half> output, View<__half> input);
template void bnll<__half>(const Stream&, Span<__half>, View<__half>);
template void power<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half);
template void exp<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
#endif


template void relu<float>(const Stream&, Span<float>, View<float>, float);
template void clipped_relu<float>(const Stream&, Span<float>, View<float>, float, float);
template void tanh<float>(const Stream&, Span<float>, View<float>);
template void swish<float>(const Stream&, Span<float>, View<float>);
template void mish<float>(const Stream&, Span<float>, View<float>);
template void sigmoid<float>(const Stream&, Span<float>, View<float>);
template void elu<float>(const Stream&, Span<float>, View<float>);
template void abs<float>(const Stream& stream, Span<float> output, View<float> input);
template void bnll<float>(const Stream&, Span<float>, View<float>);
template void power<float>(const Stream&, Span<float>, View<float>, float, float, float);
template void exp<float>(const Stream&, Span<float>, View<float>, float, float);

template <class T, std::size_t N> static
void launch_vectorized_axiswise_relu(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> slope) {
    CV_Assert(is_fully_aligned<T>(output, N));
    CV_Assert(is_fully_aligned<T>(input, N));
    CV_Assert(inner_size % N == 0);

    auto kernel = raw::axiswise_relu_vec<T, N>;
    auto policy = make_policy(kernel, output.size() / N, 0, stream);
    launch_kernel(kernel, policy, output, input, inner_size / N, slope);
}

template <class T>
void axiswise_relu(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> slope) {
    CV_Assert(input.size() == output.size());

    if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
        launch_vectorized_axiswise_relu<T, 4>(stream, output, input, inner_size, slope);
    } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
        launch_vectorized_axiswise_relu<T, 2>(stream, output, input, inner_size, slope);
    } else {
        launch_vectorized_axiswise_relu<T, 1>(stream, output, input, inner_size, slope);
    }
}

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void axiswise_relu<__half>(const Stream&, Span<__half>, View<__half>, std::size_t, View<__half>);
#endif
template void axiswise_relu<float>(const Stream&, Span<float>, View<float>, std::size_t, View<float>);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/array.hpp (vendored, new file, 73 lines)
@@ -0,0 +1,73 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA_ARRAY_HPP
#define OPENCV_DNN_SRC_CUDA_ARRAY_HPP

#include <cuda_runtime.h>

#include "types.hpp"

#include <cstddef>
#include <type_traits>
#include <iterator>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {

    template <class T, std::size_t N>
    struct array {
        using value_type = T;
        using size_type = device::size_type;
        using difference_type = std::ptrdiff_t;
        using reference = typename std::add_lvalue_reference<value_type>::type;
        using const_reference = typename std::add_lvalue_reference<typename std::add_const<value_type>::type>::type;
        using pointer = typename std::add_pointer<value_type>::type;
        using const_pointer = typename std::add_pointer<typename std::add_const<value_type>::type>::type;
        using iterator = pointer;
        using const_iterator = const_pointer;
        using reverse_iterator = std::reverse_iterator<iterator>;
        using const_reverse_iterator = std::reverse_iterator<const_iterator>;

        __host__ __device__ bool empty() const noexcept { return N == 0; }
        __host__ __device__ size_type size() const noexcept { return N; }

        __host__ __device__ iterator begin() noexcept { return ptr; }
        __host__ __device__ iterator end() noexcept { return ptr + N; }
        __host__ __device__ const_iterator begin() const noexcept { return ptr; }
        __host__ __device__ const_iterator end() const noexcept { return ptr + N; }

        __host__ __device__ const_iterator cbegin() const noexcept { return ptr; }
        __host__ __device__ const_iterator cend() const noexcept { return ptr + N; }

        __host__ __device__ reverse_iterator rbegin() noexcept { return ptr + N; }
        __host__ __device__ reverse_iterator rend() noexcept { return ptr; }
        __host__ __device__ const_reverse_iterator rbegin() const noexcept { return ptr + N; }
        __host__ __device__ const_reverse_iterator rend() const noexcept { return ptr; }

        __host__ __device__ const_reverse_iterator crbegin() const noexcept { return ptr + N; }
        __host__ __device__ const_reverse_iterator crend() const noexcept { return ptr; }

        template <class InputItr>
        __host__ void assign(InputItr first, InputItr last) {
            std::copy(first, last, std::begin(ptr));
        }

        __host__ __device__ reference operator[](int idx) { return ptr[idx]; }
        __host__ __device__ const_reference operator[](int idx) const { return ptr[idx]; }

        __host__ __device__ reference front() { return ptr[0]; }
        __host__ __device__ const_reference front() const { return ptr[0]; }

        __host__ __device__ reference back() { return ptr[N - 1]; }
        __host__ __device__ const_reference back() const { return ptr[N - 1]; }

        __host__ __device__ pointer data() noexcept { return ptr; }
        __host__ __device__ const_pointer data() const noexcept { return ptr; }

        T ptr[N];
    };

}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */

#endif /* OPENCV_DNN_SRC_CUDA_ARRAY_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/atomics.hpp (vendored, new file, 38 lines)
@@ -0,0 +1,38 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA_ATOMICS_HPP
#define OPENCV_DNN_SRC_CUDA_ATOMICS_HPP

#include <cuda_runtime.h>
#include <cuda_fp16.h>

// The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
// This function was introduced in CUDA 10.
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicadd
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700 && CUDART_VERSION >= 10000)
// And half-precision floating-point operations are not supported by devices of compute capability strictly lower than 5.3
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
#elif __CUDA_ARCH__ < 530
#else
inline __device__ void atomicAdd(__half* address, __half val) {
    unsigned int* address_as_ui = (unsigned int *)((char *)address - ((size_t)address & 2));
    unsigned int old = *address_as_ui;
    unsigned int assumed;

    do {
        assumed = old;

        __half_raw hsum;
        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
        __half tmpres = hsum + val;
        hsum = __half_raw(tmpres);

        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
        old = atomicCAS(address_as_ui, assumed, old);
    } while (assumed != old);
}
#endif

#endif /* OPENCV_DNN_SRC_CUDA_ATOMICS_HPP */
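The fallback above emulates a 16-bit atomic add by running atomicCAS on the aligned 32-bit word that contains the target __half and splicing the updated half back into the correct 16 bits of that word. Usage is transparent: callers simply write atomicAdd on a __half pointer, as in the sketch below (a hypothetical kernel, not part of the commit; it assumes the header is on the include path).

// Illustrative sketch only: accumulating a sum of __half values.
// On sm_70+ with CUDA 10+, the native atomicAdd(__half*, __half) is used; on sm_53..sm_6x
// the CAS-based fallback above kicks in; below sm_53 half arithmetic is unavailable.
#include <cuda_fp16.h>
#include "atomics.hpp"

__global__ void accumulate(__half* sum, const __half* values, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        atomicAdd(sum, values[i]);   // resolves to the native or the emulated overload
}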
39
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/bbox_utils.hpp
vendored
Normal file
@ -0,0 +1,39 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA_BBOX_UTILS_HPP
#define OPENCV_DNN_SRC_CUDA_BBOX_UTILS_HPP

#include "math.hpp"

#include <cuda_runtime.h>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    struct BoundingBox
    {
        float xmin, ymin, xmax, ymax;
    };

    template <bool NORMALIZED_BBOX>
    __device__ __forceinline__ float compute_bbox_size(BoundingBox bbox)
    {
        float width = bbox.xmax - bbox.xmin;
        float height = bbox.ymax - bbox.ymin;
        if (width < 0 || height < 0)
            return 0.0;

        if (!NORMALIZED_BBOX)
        {
            width += 1;
            height += 1;
        }

        using csl::device::mul_ftz;
        return mul_ftz(width, height);
    }

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA_BBOX_UTILS_HPP */
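For normalized boxes (coordinates in [0, 1]) the size is simply width times height, while unnormalized pixel-coordinate boxes are treated as inclusive ranges, hence the +1 per side. A host-side mirror of the same logic, with a hypothetical name, makes the distinction explicit (illustrative only, not part of the commit):

// Host-side reference of compute_bbox_size.
inline float bbox_size_reference(float xmin, float ymin, float xmax, float ymax, bool normalized)
{
    float width = xmax - xmin, height = ymax - ymin;
    if (width < 0 || height < 0)
        return 0.f;                      // degenerate box
    if (!normalized) {
        width += 1;                      // e.g. a box spanning columns 3..5 is 3 pixels wide
        height += 1;
    }
    return width * height;               // the device version uses mul_ftz (flush-to-zero multiply)
}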
120
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/bias_activation.cu
vendored
Normal file
@ -0,0 +1,120 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "functors.hpp"
|
||||
#include "types.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, class ActivationOp, std::size_t N>
|
||||
__global__ void biasN_generic_op_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias, const typename ActivationOp::Params params) {
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
|
||||
auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());
|
||||
|
||||
ActivationOp activation_op(params);
|
||||
|
||||
for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
|
||||
const index_type bias_idx = (i / inner_size) % bias.size();
|
||||
|
||||
vector_type vec;
|
||||
v_load(vec, inplace_output_vPtr[i]);
|
||||
for(int j = 0; j < vec.size(); j++)
|
||||
vec.data[j] = activation_op(vec.data[j] + bias[bias_idx]);
|
||||
v_store(inplace_output_vPtr[i], vec);
|
||||
}
|
||||
}
|
||||
|
||||
} /* namespace raw */
|
||||
|
||||
template <class T, class ActivationOp, std::size_t N> static
|
||||
void launch_vectorized_biasN_generic_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, const typename ActivationOp::Params& params) {
|
||||
CV_Assert(inplace_output.size() % inner_size == 0);
|
||||
CV_Assert(is_fully_aligned<T>(inplace_output, N));
|
||||
CV_Assert(inner_size % N == 0);
|
||||
|
||||
auto kernel = raw::biasN_generic_op_inplace_vec<T, ActivationOp, N>;
|
||||
auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, inplace_output, inner_size / N, bias, params);
|
||||
}
|
||||
|
||||
template <class T, class ActivationOp> static
|
||||
void biasN_generic_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, const typename ActivationOp::Params& params = {}) {
|
||||
if (is_fully_aligned<T>(inplace_output, 4) && inner_size % 4 == 0) {
|
||||
launch_vectorized_biasN_generic_op_inplace<T, ActivationOp, 4>(stream, inplace_output, inner_size, bias, params);
|
||||
} else if (is_fully_aligned<T>(inplace_output, 2) && inner_size % 2 == 0) {
|
||||
launch_vectorized_biasN_generic_op_inplace<T, ActivationOp, 2>(stream, inplace_output, inner_size, bias, params);
|
||||
} else {
|
||||
launch_vectorized_biasN_generic_op_inplace<T, ActivationOp, 1>(stream, inplace_output, inner_size, bias, params);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T slope) {
|
||||
biasN_generic_op_inplace<T, ReLUFunctor<T>>(stream, inplace_output, inner_size, bias, {slope});
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_clipped_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T floor, T ceil) {
|
||||
CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceil));
|
||||
biasN_generic_op_inplace<T, ClippedReLUFunctor<T>>(stream, inplace_output, inner_size, bias, {floor, ceil});
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_tanh_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
|
||||
biasN_generic_op_inplace<T, TanHFunctor<T>>(stream, inplace_output, inner_size, bias);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_swish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
|
||||
biasN_generic_op_inplace<T, SwishFunctor<T>>(stream, inplace_output, inner_size, bias);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_mish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
|
||||
biasN_generic_op_inplace<T, MishFunctor<T>>(stream, inplace_output, inner_size, bias);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_sigmoid_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
|
||||
biasN_generic_op_inplace<T, SigmoidFunctor<T>>(stream, inplace_output, inner_size, bias);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_power_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T power, T scale, T shift) {
|
||||
biasN_generic_op_inplace<T, PowerFunctor<T>>(stream, inplace_output, inner_size, bias, {power, scale, shift});
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void biasN_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half);
|
||||
template void biasN_clipped_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half, __half);
|
||||
template void biasN_tanh_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
|
||||
template void biasN_swish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
|
||||
template void biasN_mish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
|
||||
template void biasN_sigmoid_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
|
||||
template void biasN_power_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half, __half, __half);
|
||||
#endif
|
||||
|
||||
template void biasN_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, float);
|
||||
template void biasN_clipped_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, float, float);
|
||||
template void biasN_tanh_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
|
||||
template void biasN_swish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
|
||||
template void biasN_mish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
|
||||
template void biasN_sigmoid_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
|
||||
template void biasN_power_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, float, float, float);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
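In the file above, inner_size is the per-channel spatial extent (H*W for an NCHW tensor), so (i / inner_size) % bias.size() selects the bias of the channel that element i belongs to; the dispatcher then merely picks a vector width of 4, 2, or 1 depending on pointer alignment and divisibility of inner_size. A scalar CPU reference of the fused semantics, under those assumptions and with a hypothetical name, looks like this:

// CPU reference of the fused kernel semantics, for illustration only (not part of the commit):
// output[i] = act(output[i] + bias[(i / inner_size) % bias.size()]).
#include <cstddef>
#include <vector>

void biasN_relu_inplace_reference(std::vector<float>& output, std::size_t inner_size,
                                  const std::vector<float>& bias, float slope)
{
    for (std::size_t i = 0; i < output.size(); i++) {
        const std::size_t c = (i / inner_size) % bias.size();
        const float x = output[i] + bias[c];
        output[i] = x >= 0 ? x : slope * x;   // ReLU / leaky ReLU with the given slope
    }
}

The GPU kernels compute exactly this, except that they load and store 2- or 4-element vectors per iteration when alignment allows.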
125
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/bias_activation_eltwise.cu
vendored
Normal file
@ -0,0 +1,125 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "functors.hpp"
|
||||
#include "types.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
|
||||
template <class T, class ActivationOp, class EltwiseOp, std::size_t N>
|
||||
__global__ void biasN_generic_op_eltwise_op_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias, View<T> eltwise, const typename ActivationOp::Params act_params, const typename EltwiseOp::Params eltwise_params) {
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
|
||||
auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());
|
||||
auto eltwise_vPtr = vector_type::get_pointer(eltwise.data());
|
||||
|
||||
ActivationOp activation_op(act_params);
|
||||
EltwiseOp eltwise_op(eltwise_params);
|
||||
|
||||
for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
|
||||
const index_type bias_idx = (i / inner_size) % bias.size();
|
||||
|
||||
vector_type output_vec, eltwise_vec;
|
||||
v_load(output_vec, inplace_output_vPtr[i]);
|
||||
v_load(eltwise_vec, eltwise_vPtr[i]);
|
||||
for(int j = 0; j < output_vec.size(); j++)
|
||||
output_vec.data[j] = eltwise_op(activation_op(output_vec.data[j] + bias[bias_idx]), eltwise_vec.data[j]);
|
||||
v_store(inplace_output_vPtr[i], output_vec);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, class ActivationOp, class EltwiseOp, std::size_t N> static
|
||||
void launch_vectorized_biasN_generic_op_eltwise_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, const typename ActivationOp::Params& act_params, const typename EltwiseOp::Params& eltwise_params) {
|
||||
CV_Assert(is_fully_aligned<T>(inplace_output, N));
|
||||
CV_Assert(is_fully_aligned<T>(eltwise, N));
|
||||
CV_Assert(inner_size % N == 0);
|
||||
|
||||
auto kernel = raw::biasN_generic_op_eltwise_op_inplace_vec<T, ActivationOp, EltwiseOp, N>;
|
||||
auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, inplace_output, inner_size / N, bias, eltwise, act_params, eltwise_params);
|
||||
}
|
||||
|
||||
template <class T, class ActivationOp, class EltwiseOp> static
|
||||
void biasN_generic_op_eltwise_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, const typename ActivationOp::Params& act_params = {}, const typename EltwiseOp::Params& eltwise_params = {}) {
|
||||
CV_Assert(inplace_output.size() == eltwise.size());
|
||||
|
||||
if (is_fully_aligned<T>(inplace_output, 4) && is_fully_aligned<T>(eltwise, 4) && inner_size % 4 == 0) {
|
||||
launch_vectorized_biasN_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 4>(stream, inplace_output, inner_size, bias, eltwise, act_params, eltwise_params);
|
||||
} else if (is_fully_aligned<T>(inplace_output, 2) && is_fully_aligned<T>(eltwise, 2) && inner_size % 2 == 0) {
|
||||
launch_vectorized_biasN_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 2>(stream, inplace_output, inner_size, bias, eltwise, act_params, eltwise_params);
|
||||
} else {
|
||||
launch_vectorized_biasN_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 1>(stream, inplace_output, inner_size, bias, eltwise, act_params, eltwise_params);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_relu_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T slope) {
|
||||
biasN_generic_op_eltwise_op_inplace<T, ReLUFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {slope});
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_clipped_relu_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T floor, T ceiling) {
|
||||
CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
|
||||
biasN_generic_op_eltwise_op_inplace<T, ClippedReLUFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {floor, ceiling});
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_tanh_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
|
||||
biasN_generic_op_eltwise_op_inplace<T, TanHFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_swish_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
|
||||
biasN_generic_op_eltwise_op_inplace<T, SwishFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_mish_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
|
||||
biasN_generic_op_eltwise_op_inplace<T, MishFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_sigmoid_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
|
||||
biasN_generic_op_eltwise_op_inplace<T, SigmoidFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_power_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T exp, T scale, T shift) {
|
||||
biasN_generic_op_eltwise_op_inplace<T, PowerFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {exp, scale, shift});
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void biasN_relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half);
|
||||
template void biasN_clipped_relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half);
|
||||
template void biasN_tanh_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
|
||||
template void biasN_swish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
|
||||
template void biasN_mish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
|
||||
template void biasN_sigmoid_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
|
||||
template void biasN_power_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half, __half);
|
||||
#endif
|
||||
|
||||
template void biasN_relu_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float);
|
||||
template void biasN_clipped_relu_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float, float);
|
||||
template void biasN_tanh_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
|
||||
template void biasN_swish_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
|
||||
template void biasN_mish_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
|
||||
template void biasN_sigmoid_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
|
||||
template void biasN_power_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float, float, float);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
132
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/bias_eltwise_activation.cu
vendored
Normal file
@ -0,0 +1,132 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "functors.hpp"
|
||||
#include "types.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, class EltwiseOp, class ActivationOp, std::size_t N>
|
||||
__global__ void biasN_eltwise_op_generic_op_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias, View<T> eltwise, const typename EltwiseOp::Params eltwise_params, const typename ActivationOp::Params act_params) {
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
|
||||
auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());
|
||||
auto eltwise_vPtr = vector_type::get_pointer(eltwise.data());
|
||||
|
||||
EltwiseOp eltwise_op(eltwise_params);
|
||||
ActivationOp activation_op(act_params);
|
||||
|
||||
for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
|
||||
const index_type bias_idx = (i / inner_size) % bias.size();
|
||||
|
||||
vector_type output_vec, eltwise_vec;
|
||||
v_load(output_vec, inplace_output_vPtr[i]);
|
||||
v_load(eltwise_vec, eltwise_vPtr[i]);
|
||||
for(int j = 0; j < output_vec.size(); j++)
|
||||
output_vec.data[j] = activation_op(eltwise_op(output_vec.data[j] + bias[bias_idx], eltwise_vec.data[j]));
|
||||
v_store(inplace_output_vPtr[i], output_vec);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, class EltwiseOp, class ActivationOp, std::size_t N> static
|
||||
void launch_vectorized_biasN_eltwise_op_generic_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, const typename EltwiseOp::Params& eltwise_params, const typename ActivationOp::Params& act_params) {
|
||||
CV_Assert(is_fully_aligned<T>(inplace_output, N));
|
||||
CV_Assert(inplace_output.size() % bias.size() == 0);
|
||||
CV_Assert(is_fully_aligned<T>(eltwise, N));
|
||||
CV_Assert(inner_size % N == 0);
|
||||
|
||||
auto kernel = raw::biasN_eltwise_op_generic_op_inplace_vec<T, EltwiseOp, ActivationOp, N>;
|
||||
auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, inplace_output, inner_size / N, bias, eltwise, eltwise_params, act_params);
|
||||
}
|
||||
|
||||
template <class T, class EltwiseOp, class ActivationOp> static
|
||||
void biasN_eltwise_op_generic_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, const typename EltwiseOp::Params& eltwise_params = {}, const typename ActivationOp::Params& act_params = {}) {
|
||||
CV_Assert(inplace_output.size() == eltwise.size());
|
||||
|
||||
if (is_fully_aligned<T>(inplace_output, 4) && is_fully_aligned<T>(eltwise, 4) && inner_size % 4 == 0) {
|
||||
launch_vectorized_biasN_eltwise_op_generic_op_inplace<T, EltwiseOp, ActivationOp, 4>(stream, inplace_output, inner_size, bias, eltwise, eltwise_params, act_params);
|
||||
} else if (is_fully_aligned<T>(inplace_output, 2) && is_fully_aligned<T>(eltwise, 2) && inner_size % 2 == 0) {
|
||||
launch_vectorized_biasN_eltwise_op_generic_op_inplace<T, EltwiseOp, ActivationOp, 2>(stream, inplace_output, inner_size, bias, eltwise, eltwise_params, act_params);
|
||||
} else {
|
||||
launch_vectorized_biasN_eltwise_op_generic_op_inplace<T, EltwiseOp, ActivationOp, 1>(stream, inplace_output, inner_size, bias, eltwise, eltwise_params, act_params);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_eltwise_sum_2_identity_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
|
||||
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, IdentityFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_eltwise_sum_2_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T slope) {
|
||||
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, ReLUFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {}, {slope});
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_eltwise_sum_2_clipped_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T floor, T ceiling) {
|
||||
CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
|
||||
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, ClippedReLUFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {}, {floor, ceiling});
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_eltwise_sum_2_tanh_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
|
||||
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, TanHFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_eltwise_sum_2_swish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
|
||||
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, SwishFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_eltwise_sum_2_mish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
|
||||
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, MishFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_eltwise_sum_2_sigmoid_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
|
||||
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, SigmoidFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN_eltwise_sum_2_power_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T exp, T scale, T shift) {
|
||||
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, PowerFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {}, {exp, scale, shift});
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void biasN_eltwise_sum_2_identity_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
|
||||
template void biasN_eltwise_sum_2_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half);
|
||||
template void biasN_eltwise_sum_2_clipped_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half);
|
||||
template void biasN_eltwise_sum_2_tanh_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
|
||||
template void biasN_eltwise_sum_2_swish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
|
||||
template void biasN_eltwise_sum_2_mish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
|
||||
template void biasN_eltwise_sum_2_sigmoid_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
|
||||
template void biasN_eltwise_sum_2_power_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half, __half);
|
||||
#endif
|
||||
|
||||
template void biasN_eltwise_sum_2_identity_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
|
||||
template void biasN_eltwise_sum_2_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float);
|
||||
template void biasN_eltwise_sum_2_clipped_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float, float);
|
||||
template void biasN_eltwise_sum_2_tanh_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
|
||||
template void biasN_eltwise_sum_2_swish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
|
||||
template void biasN_eltwise_sum_2_mish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
|
||||
template void biasN_eltwise_sum_2_sigmoid_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
|
||||
template void biasN_eltwise_sum_2_power_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float, float, float);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
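The two previous files differ only in the order in which the fused stages are applied: bias_activation_eltwise.cu computes eltwise(act(x + b), e), whereas bias_eltwise_activation.cu computes act(eltwise(x + b, e)). The scalar illustration below (plain floats, hypothetical function names, ReLU as the activation and sum as the eltwise op) shows the difference; it is not part of the commit.

float bias_act_eltwise(float x, float b, float e, float slope) {   // bias_activation_eltwise.cu
    float t = x + b;
    t = t >= 0 ? t : slope * t;      // activation first (ReLU shown)
    return t + e;                    // then the eltwise sum
}
float bias_eltwise_act(float x, float b, float e, float slope) {   // bias_eltwise_activation.cu
    float t = (x + b) + e;           // eltwise sum first
    return t >= 0 ? t : slope * t;   // then the activation
}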
71
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/block_stride_range.hpp
vendored
Normal file
@ -0,0 +1,71 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA_BLOCK_STRIDE_RANGE_HPP
#define OPENCV_DNN_SRC_CUDA_BLOCK_STRIDE_RANGE_HPP

#include "types.hpp"
#include "index_helpers.hpp"

#include <cuda_runtime.h>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {

    template <int dim, int BLOCK_SIZE = 0, class index_type = device::index_type, class size_type = device::size_type>
    class block_stride_range_generic {
    public:
        __device__ block_stride_range_generic(index_type to_) : from(0), to(to_) { }
        __device__ block_stride_range_generic(index_type from_, index_type to_) : from(from_), to(to_) { }

        class iterator
        {
        public:
            __device__ iterator(index_type pos_) : pos(pos_) {}

            /* these iterators return the index when dereferenced; this allows us to loop
             * through the indices using a range based for loop
             */
            __device__ index_type operator*() const { return pos; }

            __device__ iterator& operator++() {
                const index_type block_size = BLOCK_SIZE == 0 ? getBlockDim<dim>() : BLOCK_SIZE;
                pos += block_size;
                return *this;
            }

            __device__ bool operator!=(const iterator& other) const {
                /* NOTE HACK
                 * 'pos' can move in large steps (see operator++)
                 * expansion of range for loop uses != as the loop condition
                 * => operator!= must return false if 'pos' crosses the end
                 */
                return pos < other.pos;
            }

        private:
            index_type pos;
        };

        __device__ iterator begin() const {
            return iterator(from + getThreadIdx<dim>());
        }

        __device__ iterator end() const {
            return iterator(to);
        }

    private:
        index_type from, to;
    };

    using block_stride_range_x = block_stride_range_generic<0>;
    using block_stride_range_y = block_stride_range_generic<1>;
    using block_stride_range_z = block_stride_range_generic<2>;

    template <size_type BLOCK_SIZE = 0>
    using block_stride_range = block_stride_range_generic<0, BLOCK_SIZE>;

}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */

#endif /* OPENCV_DNN_SRC_CUDA_BLOCK_STRIDE_RANGE_HPP */
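Whereas grid_stride_range spreads work across the whole grid, block_stride_range lets the threads of a single block stride over a range with a step equal to the block size (a compile-time constant when BLOCK_SIZE is given), and the deliberately lax operator!= keeps the range-for loop valid even when a thread overshoots the end. A sketch of a typical use, a block-wide reduction over one row, follows; the kernel, its launch configuration, and the shared-memory reduction are illustrative only and not part of the commit.

// Illustrative sketch: one block reduces one row of a matrix, with the block's
// threads striding across the row via block_stride_range.
#include "block_stride_range.hpp"

__global__ void row_sums(const float* matrix, float* sums, int row_length)
{
    __shared__ float partial[256];
    float acc = 0.f;
    for (auto i : cv::dnn::cuda4dnn::csl::device::block_stride_range<256>(row_length))
        acc += matrix[blockIdx.x * row_length + i];   // thread t handles i = t, t+256, t+512, ...
    partial[threadIdx.x] = acc;
    __syncthreads();
    for (int offset = 128; offset > 0; offset /= 2) {  // standard shared-memory tree reduction
        if (threadIdx.x < offset)
            partial[threadIdx.x] += partial[threadIdx.x + offset];
        __syncthreads();
    }
    if (threadIdx.x == 0)
        sums[blockIdx.x] = partial[0];
}
// Launch with <<<num_rows, 256>>> so that block b produces sums[b].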
277
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/concat.cu
vendored
Normal file
@ -0,0 +1,277 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "array.hpp"
|
||||
#include "types.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "kernel_dispatcher.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include "../cuda4dnn/kernels/fill_copy.hpp"
|
||||
|
||||
#include <cstddef>
|
||||
#include <vector>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, std::size_t N>
|
||||
__global__ void concat_vec(
|
||||
Span<T> output, size_type output_axis_size, index_type output_axis_offset,
|
||||
View<T> input, size_type input_axis_size, size_type concat_size)
|
||||
{
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
auto input_vPtr = vector_type::get_pointer(input.data());
|
||||
|
||||
/* we need to copy all the elements of input to some location in the output
|
||||
* we copy blocks of size `total_concat_size` to some location in the output
|
||||
*/
|
||||
const auto total_concat_size = concat_size * input_axis_size;
|
||||
|
||||
for (auto in_idx : grid_stride_range(input.size() / vector_type::size())) {
|
||||
const index_type idx = in_idx * vector_type::size();
|
||||
const index_type concat_num = idx / total_concat_size;
|
||||
const index_type concat_index = idx % total_concat_size;
|
||||
const index_type top_index = concat_index +
|
||||
(concat_num * output_axis_size + output_axis_offset) * concat_size;
|
||||
|
||||
const auto out_idx = top_index / vector_type::size();
|
||||
|
||||
vector_type vec;
|
||||
v_load(vec, input_vPtr[in_idx]);
|
||||
v_store(output_vPtr[out_idx], vec);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t Rank>
|
||||
__global__ void concat_with_offsets(
|
||||
Span<T> output, array<size_type, Rank> out_strides, array<index_type, Rank> out_offset,
|
||||
View<T> input, array<size_type, Rank> in_strides)
|
||||
{
|
||||
for (auto i : grid_stride_range(input.size())) {
|
||||
index_type in_index = i / in_strides[0];
|
||||
index_type out_index = out_offset[0] + in_index;
|
||||
index_type oidx = out_index * out_strides[0];
|
||||
for (int j = 1; j < Rank; j++) {
|
||||
in_index = (i % in_strides[j - 1]) / in_strides[j];
|
||||
out_index = out_offset[j] + in_index;
|
||||
oidx += out_index * out_strides[j];
|
||||
}
|
||||
|
||||
output[oidx] = input[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t N> static
|
||||
void launch_vectorized_concat(const Stream& stream,
|
||||
Span<T> output, size_type output_axis_size, index_type output_axis_offset,
|
||||
View<T> input, size_type input_axis_size, size_type concat_size)
|
||||
{
|
||||
CV_Assert(is_fully_aligned<T>(output, N));
|
||||
CV_Assert(is_fully_aligned<T>(input, N));
|
||||
/* more assertions are required to fully check for vectorization possibility; check concat() */
|
||||
|
||||
auto kernel = raw::concat_vec<T, N>;
|
||||
auto policy = make_policy(kernel, input.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void concat(
|
||||
const Stream& stream,
|
||||
TensorSpan<T> output, std::size_t output_axis_offset,
|
||||
TensorView<T> input, std::size_t axis)
|
||||
{
|
||||
CV_Assert(output.rank() == input.rank());
|
||||
CV_Assert(output_axis_offset < output.get_axis_size(axis));
|
||||
|
||||
/* if axes preceding the concat axis are all singleton, the concat blocks are contiguous
|
||||
* in the output and we can copy each block directly
|
||||
*/
|
||||
if (output.size_range(0, axis) == 1)
|
||||
{
|
||||
auto stride = output.size_range(axis + 1, output.rank());
|
||||
auto sliced_output = Span<T>(output.get() + output_axis_offset * stride, input.size());
|
||||
kernels::copy<T>(stream, sliced_output, input);
|
||||
return;
|
||||
}
|
||||
|
||||
/* let's call the axis of interest the channel axis for the purpose of the following discussion
|
||||
* even though it can be any axis
|
||||
*
|
||||
* for each batch item:
|
||||
* we move all the channels from the input (which together, for a single batch item, is contiguous)
|
||||
* of a batch item to its corresponding contiguous place in the output
|
||||
*
|
||||
* for a valid vector operation:
|
||||
* - the size of each copy block must be aligned
|
||||
* - input must be aligned
|
||||
* - all the destination locations in the output must be aligned
|
||||
*/
|
||||
std::size_t concat_size = output.size_range(axis + 1, output.rank());
|
||||
|
||||
std::size_t input_axis_size = input.get_axis_size(axis);
|
||||
std::size_t output_axis_size = output.get_axis_size(axis);
|
||||
|
||||
std::size_t copy_block_size = concat_size * input_axis_size;
|
||||
std::size_t copy_block_stride = concat_size * output_axis_size;
|
||||
std::size_t starting_offset = output_axis_offset * concat_size;
|
||||
|
||||
/* in a nutshell, all this concat operation does is copy several blocks of size `copy_block_size`
|
||||
* to the output starting from `starting_offset` with blocks in the output strided by `copy_block_stride`
|
||||
*/
|
||||
|
||||
bool is_aligned_4 = copy_block_size % 4 == 0 && copy_block_stride % 4 == 0 && starting_offset % 4 == 0;
|
||||
bool is_aligned_2 = copy_block_size % 2 == 0 && copy_block_stride % 2 == 0 && starting_offset % 2 == 0;
|
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && is_aligned_4) {
|
||||
launch_vectorized_concat<T, 4>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
|
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && is_aligned_2) {
|
||||
launch_vectorized_concat<T, 2>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
|
||||
} else {
|
||||
launch_vectorized_concat<T, 1>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void concat<__half>(const Stream&, TensorSpan<__half>, std::size_t, TensorView<__half>, std::size_t);
|
||||
#endif
|
||||
template void concat<float>(const Stream&, TensorSpan<float>, std::size_t, TensorView<float>, std::size_t);
|
||||
|
||||
template <class T, std::size_t Rank> static
|
||||
void launch_concat_with_offsets(
|
||||
const Stream& stream,
|
||||
Span<T> output, const std::vector<std::size_t>& outStride, const std::vector<std::size_t>& outOffset,
|
||||
View<T> input, const std::vector<std::size_t>& inStride)
|
||||
{
|
||||
CV_Assert(outStride.size() == Rank);
|
||||
CV_Assert(outOffset.size() == Rank);
|
||||
CV_Assert(inStride.size() == Rank);
|
||||
|
||||
array<size_type, Rank> outStride_k, inStride_k;
|
||||
outStride_k.assign(std::begin(outStride), std::end(outStride));
|
||||
inStride_k.assign(std::begin(inStride), std::end(inStride));
|
||||
|
||||
array<index_type, Rank> outOffset_k;
|
||||
outOffset_k.assign(std::begin(outOffset), std::end(outOffset));
|
||||
|
||||
auto kernel = raw::concat_with_offsets<T, Rank>;
|
||||
auto policy = make_policy(kernel, input.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, output, outStride_k, outOffset_k, input, inStride_k);
|
||||
}
|
||||
|
||||
GENERATE_KERNEL_DISPATCHER(concat_with_offsets_dispatcher, launch_concat_with_offsets);
|
||||
|
||||
template <class T>
|
||||
void concat_with_offsets(
|
||||
const Stream& stream,
|
||||
TensorSpan<T> output, TensorView<T> input,
|
||||
std::vector<std::size_t> offsets)
|
||||
{
|
||||
CV_Assert(output.rank() == input.rank());
|
||||
CV_Assert(output.rank() == offsets.size());
|
||||
|
||||
/* squeezable axes at the beginning of both tensors can be eliminated
|
||||
*
|
||||
* Reasoning:
|
||||
* ----------
|
||||
* Suppose an item's indices in the input tensor is [i1, i2, ...]. The indices in the output
|
||||
* tensor will be [i1 + off1, i2 + off2, ...]. The concat operation essentially copies items
|
||||
* from the input tensor to new locations in the output tensor.
|
||||
*
|
||||
* If the size of the first axis of the input and output tensor is unity, the input and output
|
||||
* indices for all the elements will be of the form [0, i2, ...] and [0, i2 + off2, ...]
* respectively. The first index does not contribute to the element's address calculation and
* hence does nothing apart from eating up a few cycles.
|
||||
*/
|
||||
while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
|
||||
CV_Assert(offsets[0] == 0);
|
||||
|
||||
input.squeeze(0);
|
||||
output.squeeze(0);
|
||||
offsets.erase(std::begin(offsets));
|
||||
|
||||
CV_Assert(output.rank() == input.rank());
|
||||
CV_Assert(output.rank() == offsets.size());
|
||||
}
|
||||
|
||||
auto inShape = input.shape_as_vector();
|
||||
auto outShape = output.shape_as_vector();
|
||||
|
||||
/* contiguous axes that undergo full copy can be combined into one axis
|
||||
*
|
||||
* Reasoning:
|
||||
* ----------
|
||||
* Suppose an item's indices in the input tensor is [i1, i2, i3, ...]. Let the first two axes not undergo any
|
||||
* concatenation. The indices in the output tensor will be [i1, i2, i3 + off3, ...].
|
||||
*
|
||||
* Each axis in the contiguous axes sequence will add an offset of iN * strideN. In the above example,
|
||||
* the two axes add a total offset of `i1 * stride1 + i2 * stride2`. We can merge the two axes into one axis with
|
||||
* a size of `size1 * size2`. The new offset added will be `i12 * stride2` as the kernel iterates through `i12`.
|
||||
* Note that `i12` is actually `(i1 * size2 + i2)` in the original tensor.
|
||||
*/
|
||||
for (int i = 0; i < inShape.size(); i++) {
|
||||
/* check if axis `i` requires any slicing */
|
||||
if (offsets[i] == 0 && inShape[i] == outShape[i]) {
|
||||
/* loop invariant: `i` is the first axis in the contiguous unsliced axis sequence */
|
||||
|
||||
int j = i + 1; /* `j` is the axis which we will attempt to merge */
|
||||
while (j < inShape.size() && offsets[j] == 0 && inShape[j] == outShape[j]) {
|
||||
/* `j` axis is also copied fully; merge `i` and `j` */
|
||||
auto new_size = inShape[i] * inShape[j];
|
||||
inShape[i] = new_size;
|
||||
outShape[i] = new_size;
|
||||
offsets[i] = 0; /* redundant */
|
||||
|
||||
/* delete axis `j` */
|
||||
inShape.erase(std::begin(inShape) + j);
|
||||
outShape.erase(std::begin(outShape) + j);
|
||||
offsets.erase(std::begin(offsets) + j);
|
||||
|
||||
/* optimizations should not break the invariants */
|
||||
CV_Assert(inShape.size() == outShape.size());
|
||||
CV_Assert(inShape.size() == offsets.size());
|
||||
CV_Assert(inShape[i] == outShape[i]);
|
||||
CV_Assert(offsets[i] == 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto rank = inShape.size();
|
||||
|
||||
std::vector<std::size_t> inStride(rank), outStride(rank);
|
||||
inStride.back() = 1;
|
||||
outStride.back() = 1;
|
||||
/* garbage, ..., garbage, 1 */
|
||||
|
||||
std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
|
||||
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
|
||||
/* dim[0], dim[1], ..., dim[-1], 1 */
|
||||
|
||||
std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<int>());
|
||||
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<int>());
|
||||
/* stride[0], stride[1], ..., stride[-2], 1 */
|
||||
|
||||
CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
|
||||
concat_with_offsets_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, offsets, input, inStride);
|
||||
}
|
||||
|
||||
template void concat_with_offsets(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
|
||||
template void concat_with_offsets(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
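To make the squeeze-and-merge preprocessing in concat_with_offsets concrete (the shapes and numbers below are illustrative, not from the commit): concatenating an input of shape [1, 32, 4, 6] into an output of shape [1, 32, 4, 14] along the last axis with offsets [0, 0, 0, 8] first drops the leading singleton axis (shapes [32, 4, 6] and [32, 4, 14], offsets [0, 0, 8]), then merges the two leading fully-copied axes into one (shapes [128, 6] and [128, 14], offsets [0, 8]). The resulting strides are [6, 1] for the input and [14, 1] for the output, so input element i lands at output index (i / 6) * 14 + (i % 6) + 8, which is exactly the per-axis accumulation that concat_with_offsets performs.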
171
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/crop_and_resize.cu
vendored
Normal file
@ -0,0 +1,171 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "math.hpp"
|
||||
#include "types.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "memory.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
|
||||
template <class T, std::size_t CHANNELS_PER_ITER>
|
||||
__global__ void crop_and_resize(
|
||||
Span<T> output, size_type out_height, size_type out_width,
|
||||
View<T> input, size_type in_height, size_type in_width,
|
||||
View<T> boxes,
|
||||
size_type num_channels)
|
||||
{
|
||||
// input [1, num_channels, in_height, in_width]
|
||||
// output [boxes, num_channels, out_height, out_width]
|
||||
|
||||
const auto in_image_size = in_height * in_width;
|
||||
const auto out_image_size = out_height * out_width;
|
||||
const auto out_box_size = num_channels * out_image_size;
|
||||
|
||||
/* we have to compute the output value for every combination of (box, c, y, x) in the output
|
||||
*
|
||||
* the computation involving (y, x) is identical for all non-spatial dimensions
* the computation and memory requests involving the box are identical for the remaining three axes
|
||||
*
|
||||
* we process multiple channels every iteration to reuse the identical computation
|
||||
* and memory requests involved with the box and spatial dimensions
|
||||
*/
|
||||
|
||||
/*
|
||||
* if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need
|
||||
* (num_channels / CHANNELS_PER_ITER) iterations per (box, x, y)
|
||||
*/
|
||||
auto num_channel_iters_per_box_xy = num_channels / CHANNELS_PER_ITER;
|
||||
|
||||
/* we need `num_channel_iters_per_box_xy` iterations per (box, x, y) and there are
|
||||
* `num_boxes` boxes and `out_image_size` combinations of (x, y)
|
||||
*/
|
||||
auto num_boxes = boxes.size() / 7; /* 7 values per box */
|
||||
auto iters_per_box = num_channel_iters_per_box_xy * out_image_size;
|
||||
auto iters_required = num_boxes * iters_per_box;
|
||||
|
||||
for (auto iter : grid_stride_range(iters_required)) {
|
||||
const index_type box_no = iter / iters_per_box;
|
||||
const index_type c_start = ((iter % iters_per_box) / out_image_size) * CHANNELS_PER_ITER;
|
||||
|
||||
/* note here that consecutive `iter` values will often have consecutive `x` values
|
||||
* => stores into output will be coalesced across threads
|
||||
*/
|
||||
const index_type y = (iter % out_image_size) / out_width;
|
||||
const index_type x = iter % out_width;
|
||||
|
||||
const index_type box_offset = box_no * 7;
|
||||
const auto left = boxes[box_offset + 3],
|
||||
top = boxes[box_offset + 4],
|
||||
right = boxes[box_offset + 5],
|
||||
bottom = boxes[box_offset + 6];
|
||||
|
||||
const auto box_width = right - left;
|
||||
const auto box_height = bottom - top;
|
||||
|
||||
const auto o2i_fy = static_cast<T>(in_height - 1) / static_cast<T>(out_height - 1);
|
||||
const auto o2i_fx = static_cast<T>(in_width - 1) / static_cast<T>(out_width - 1);
|
||||
|
||||
const auto height_scale = box_height * o2i_fy;
|
||||
const auto width_scale = box_width * o2i_fx;
|
||||
|
||||
const auto in_y = top * static_cast<T>(in_height - 1) + static_cast<T>(y) * height_scale;
|
||||
const auto in_x = left * static_cast<T>(in_width - 1) + static_cast<T>(x) * width_scale;
|
||||
|
||||
const auto in_y0 = static_cast<index_type>(in_y);
|
||||
const auto in_x0 = static_cast<index_type>(in_x);
|
||||
|
||||
using device::min;
|
||||
const auto in_x1 = min<index_type>(in_x0 + 1, in_width - 1);
|
||||
const auto in_y1 = min<index_type>(in_y0 + 1, in_height - 1);
|
||||
|
||||
index_type in_offset_r0 = c_start * in_image_size + in_y0 * in_width;
|
||||
index_type in_offset_r1 = c_start * in_image_size + in_y1 * in_width;
|
||||
index_type out_idx = box_no * out_box_size + c_start * out_image_size + y * out_width + x;
|
||||
|
||||
#pragma unroll 1 /* disable unrolling */
|
||||
for (int i = 0; i < CHANNELS_PER_ITER; i++) {
|
||||
auto v_00 = load_ldg(input[in_offset_r0 + in_x0]),
|
||||
v_01 = load_ldg(input[in_offset_r0 + in_x1]),
|
||||
v_10 = load_ldg(input[in_offset_r1 + in_x0]),
|
||||
v_11 = load_ldg(input[in_offset_r1 + in_x1]);
|
||||
|
||||
output[out_idx] =
|
||||
v_00 +
|
||||
T(in_y - T(in_y0)) * T(v_10 - v_00) +
|
||||
T(in_x - T(in_x0)) * T(v_01 - v_00) +
|
||||
T(in_y - T(in_y0)) * T(in_x - T(in_x0)) * T(v_11 - v_01 - v_10 + v_00);
|
||||
|
||||
in_offset_r0 += in_image_size;
|
||||
in_offset_r1 += in_image_size;
|
||||
out_idx += out_image_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t CHANNELS_PER_ITER> static
|
||||
void launch_multichannel_crop_and_resize(const Stream& stream,
|
||||
Span<T> output, size_type out_height, size_type out_width,
|
||||
View<T> input, size_type in_height, size_type in_width,
|
||||
View<T> boxes, size_type num_channels)
|
||||
{
|
||||
auto kernel = raw::crop_and_resize<T, CHANNELS_PER_ITER>;
|
||||
auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
|
||||
launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void crop_and_resize(const Stream& stream, TensorSpan<T> output, TensorView<T> input, View<T> boxes) {
|
||||
CV_Assert(input.get_axis_size(0) == 1); /* batch not supported */
|
||||
CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
|
||||
|
||||
auto out_height = output.get_axis_size(-2);
|
||||
auto out_width = output.get_axis_size(-1);
|
||||
|
||||
auto in_height = input.get_axis_size(-2);
|
||||
auto in_width = input.get_axis_size(-1);
|
||||
|
||||
auto num_channels = input.get_axis_size(1);
|
||||
|
||||
if (num_channels % 64 == 0) {
|
||||
launch_multichannel_crop_and_resize<T, 64>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
|
||||
} else if (num_channels % 32 == 0) {
|
||||
launch_multichannel_crop_and_resize<T, 32>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
|
||||
} else if (num_channels % 16 == 0) {
|
||||
launch_multichannel_crop_and_resize<T, 16>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
|
||||
} else if (num_channels % 8 == 0) {
|
||||
launch_multichannel_crop_and_resize<T, 8>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
|
||||
} else if (num_channels % 4 == 0) {
|
||||
launch_multichannel_crop_and_resize<T, 4>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
|
||||
} else if (num_channels % 2 == 0) {
|
||||
launch_multichannel_crop_and_resize<T, 2>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
|
||||
} else {
|
||||
launch_multichannel_crop_and_resize<T, 1>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void crop_and_resize<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, View<__half> boxes);
|
||||
#endif
|
||||
template void crop_and_resize<float>(const Stream&, TensorSpan<float>, TensorView<float>, View<float> boxes);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
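Each entry of boxes occupies 7 values, of which the kernel reads only entries 3..6 as the normalized left, top, right and bottom of the crop; every output pixel is then interpolated bilinearly from its four integer neighbours in the source image. A host-side mirror of that per-pixel interpolation, with a hypothetical name, is shown below (illustrative only, not part of the commit).

#include <algorithm>

// Bilinear sample of one channel at fractional source coordinates (in_y, in_x).
inline float sample_bilinear(const float* channel, int in_h, int in_w, float in_y, float in_x)
{
    const int y0 = static_cast<int>(in_y), x0 = static_cast<int>(in_x);
    const int y1 = std::min(y0 + 1, in_h - 1), x1 = std::min(x0 + 1, in_w - 1);
    const float dy = in_y - y0, dx = in_x - x0;
    const float v00 = channel[y0 * in_w + x0], v01 = channel[y0 * in_w + x1];
    const float v10 = channel[y1 * in_w + x0], v11 = channel[y1 * in_w + x1];
    return v00 + dy * (v10 - v00) + dx * (v01 - v00) + dy * dx * (v11 - v01 - v10 + v00);
}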
897
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/detection_output.cu
vendored
Normal file
@ -0,0 +1,897 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "math.hpp"
|
||||
#include "bbox_utils.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "block_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
#include "memory.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
|
||||
template <class T, bool SHARE_LOCATION, bool VARIANCE_ENCODED_IN_TARGET, bool CORNER_TRUE_CENTER_FALSE, bool CLIP_BBOX>
|
||||
__global__ void decode_bbox(Span<T> decoded_bboxes, View<T> locations, View<T> priors,
|
||||
bool transpose_location, bool normalized_bbox,
|
||||
size_type num_loc_classes, index_type background_class_id,
|
||||
float clip_width, float clip_height)
|
||||
{
|
||||
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
|
||||
// locations: [batch_size, num_priors, num_loc_classes, 4]
|
||||
// priors: [1, C, num_priors, 4]
|
||||
// C = 2 if !VARIANCE_ENCODED_IN_TARGET; otherwise, 1
|
||||
|
||||
/* 4 bbox values + 4 variance values per prior */
|
||||
constexpr int PRIOR_BOX_SIZE = VARIANCE_ENCODED_IN_TARGET ? 4 : 8;
|
||||
const size_type num_priors = priors.size() / PRIOR_BOX_SIZE;
|
||||
|
||||
using vector_type = get_vector_type_t<T, 4>;
|
||||
auto locations_vPtr = vector_type::get_pointer(locations.data());
|
||||
auto priors_vPtr = vector_type::get_pointer(priors.data());
|
||||
auto decoded_bboxes_vPtr = vector_type::get_pointer(decoded_bboxes.data());
|
||||
|
||||
const auto boxes_per_batch = num_priors * num_loc_classes;
|
||||
for (auto idx : grid_stride_range(decoded_bboxes.size() / 4))
|
||||
{
|
||||
index_type p;
|
||||
index_type c;
|
||||
|
||||
if (SHARE_LOCATION)
|
||||
{
|
||||
// locations are shared across all classes => num_loc_classes = 1
|
||||
p = idx % boxes_per_batch;
|
||||
c = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
p = (idx % boxes_per_batch) / num_loc_classes;
|
||||
c = idx % num_loc_classes;
|
||||
}
|
||||
|
||||
if (!SHARE_LOCATION && c == background_class_id)
|
||||
continue;
|
||||
|
||||
BoundingBox bbox;
|
||||
{
|
||||
vector_type location;
|
||||
v_load(location, locations_vPtr[idx]);
|
||||
|
||||
if (transpose_location)
|
||||
{
|
||||
bbox.ymin = location.data[0];
|
||||
bbox.xmin = location.data[1];
|
||||
bbox.ymax = location.data[2];
|
||||
bbox.xmax = location.data[3];
|
||||
}
|
||||
else
|
||||
{
|
||||
bbox.xmin = location.data[0];
|
||||
bbox.ymin = location.data[1];
|
||||
bbox.xmax = location.data[2];
|
||||
bbox.ymax = location.data[3];
|
||||
}
|
||||
}
|
||||
|
||||
if (!VARIANCE_ENCODED_IN_TARGET)
|
||||
{
|
||||
vector_type prior_variance;
|
||||
v_load_ldg(prior_variance, priors_vPtr[num_priors + p]);
|
||||
|
||||
bbox.xmin *= static_cast<float>(prior_variance.data[0]);
|
||||
bbox.ymin *= static_cast<float>(prior_variance.data[1]);
|
||||
bbox.xmax *= static_cast<float>(prior_variance.data[2]);
|
||||
bbox.ymax *= static_cast<float>(prior_variance.data[3]);
|
||||
}
|
||||
|
||||
BoundingBox prior;
|
||||
{
|
||||
vector_type prior_box;
|
||||
v_load_ldg(prior_box, priors_vPtr[p]);
|
||||
|
||||
prior.xmin = prior_box.data[0];
|
||||
prior.ymin = prior_box.data[1];
|
||||
prior.xmax = prior_box.data[2];
|
||||
prior.ymax = prior_box.data[3];
|
||||
}
|
||||
|
||||
BoundingBox decoded_bbox;
|
||||
if (CORNER_TRUE_CENTER_FALSE)
|
||||
{
|
||||
decoded_bbox.xmin = prior.xmin + bbox.xmin;
|
||||
decoded_bbox.ymin = prior.ymin + bbox.ymin;
|
||||
decoded_bbox.xmax = prior.xmax + bbox.xmax;
|
||||
decoded_bbox.ymax = prior.ymax + bbox.ymax;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto prior_width = prior.xmax - prior.xmin;
|
||||
auto prior_height = prior.ymax - prior.ymin;
|
||||
if (!normalized_bbox)
|
||||
{
|
||||
prior_width += 1;
|
||||
prior_height += 1;
|
||||
}
|
||||
|
||||
auto prior_center_x = prior.xmin + prior_width * 0.5f;
|
||||
auto prior_center_y = prior.ymin + prior_height * 0.5f;
|
||||
|
||||
auto decode_bbox_center_x = bbox.xmin * prior_width + prior_center_x;
|
||||
auto decode_bbox_center_y = bbox.ymin * prior_height + prior_center_y;
|
||||
|
||||
using device::exp;
|
||||
float decode_bbox_width = exp(bbox.xmax) * prior_width;
|
||||
float decode_bbox_height = exp(bbox.ymax) * prior_height;
|
||||
|
||||
decoded_bbox.xmin = decode_bbox_center_x - decode_bbox_width * 0.5f;
|
||||
decoded_bbox.ymin = decode_bbox_center_y - decode_bbox_height * 0.5f;
|
||||
decoded_bbox.xmax = decode_bbox_center_x + decode_bbox_width * 0.5f;
|
||||
decoded_bbox.ymax = decode_bbox_center_y + decode_bbox_height * 0.5f;
|
||||
}
|
||||
|
||||
vector_type decoded_bbox_vec;
|
||||
if (CLIP_BBOX)
|
||||
{
|
||||
decoded_bbox_vec.data[0] = clamp(decoded_bbox.xmin, 0.0f, clip_width);
|
||||
decoded_bbox_vec.data[1] = clamp(decoded_bbox.ymin, 0.0f, clip_height);
|
||||
decoded_bbox_vec.data[2] = clamp(decoded_bbox.xmax, 0.0f, clip_width);
|
||||
decoded_bbox_vec.data[3] = clamp(decoded_bbox.ymax, 0.0f, clip_height);
|
||||
}
|
||||
else
|
||||
{
|
||||
decoded_bbox_vec.data[0] = decoded_bbox.xmin;
|
||||
decoded_bbox_vec.data[1] = decoded_bbox.ymin;
|
||||
decoded_bbox_vec.data[2] = decoded_bbox.xmax;
|
||||
decoded_bbox_vec.data[3] = decoded_bbox.ymax;
|
||||
}
|
||||
|
||||
v_store(decoded_bboxes_vPtr[idx], decoded_bbox_vec);
|
||||
}
|
||||
}
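/* Illustrative host-side sketch (not part of OpenCV): the CENTER_SIZE decoding branch above,
 * rewritten for a single box. `SimpleBox` and `decode_center_size_sketch` are hypothetical
 * names; assumes <cmath> is available for std::exp.
 */
struct SimpleBox { float xmin, ymin, xmax, ymax; };

static inline SimpleBox decode_center_size_sketch(SimpleBox prior, SimpleBox delta)
{
    const float prior_w  = prior.xmax - prior.xmin;
    const float prior_h  = prior.ymax - prior.ymin;
    const float prior_cx = prior.xmin + 0.5f * prior_w;
    const float prior_cy = prior.ymin + 0.5f * prior_h;

    const float cx = delta.xmin * prior_w + prior_cx;  /* xmin/ymin of the location carry center offsets */
    const float cy = delta.ymin * prior_h + prior_cy;
    const float w  = std::exp(delta.xmax) * prior_w;   /* xmax/ymax carry log-scale width/height */
    const float h  = std::exp(delta.ymax) * prior_h;

    return { cx - 0.5f * w, cy - 0.5f * h, cx + 0.5f * w, cy + 0.5f * h };
}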
|
||||
|
||||
template <class T, int BINS, int BLOCK_SIZE>
|
||||
__launch_bounds__(BLOCK_SIZE)
|
||||
__global__ void findTopK(Span<int> indices_, Span<int> count_, View<T> scores_, float threshold, size_type classwise_topK, size_type num_classes, size_type num_priors, index_type background_class_id)
|
||||
{
|
||||
/* We need to sort boxes based on their confidence scores. The confidence scores fall in
|
||||
* the range [0.0, 1.0]. We break the range into bins and perform count sort. This is an
|
||||
* approximate algorithm.
|
||||
*
|
||||
* Each block handles a particular class of a particular batch item.
|
||||
*/
|
||||
const auto c = blockIdx.x;
|
||||
const auto b = blockIdx.y;
|
||||
|
||||
if (c == background_class_id)
|
||||
return;
|
||||
|
||||
// indices: [batch_size, num_classes, classwise_topK]
|
||||
// count: [batch_size, num_classes]
|
||||
// scores: [batch_size, num_classes, num_priors]
|
||||
|
||||
auto count = count_.data() + b * num_classes + c;
|
||||
auto scores = scores_.data() + (b * num_classes + c) * num_priors;
|
||||
auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
|
||||
|
||||
/* We do not require a large number of bins to find the top K confidence scores. We will use
|
||||
* a reasonable number of bins which will fit in the shared memory.
|
||||
*
|
||||
* Note that smaller scores will have a smaller index, i.e. the `bins` are ordered in
|
||||
* ascending order.
|
||||
*/
|
||||
|
||||
__shared__ int bins[BINS];
|
||||
|
||||
#pragma unroll
|
||||
for (int unroll = 0; unroll < BINS / BLOCK_SIZE; unroll++)
|
||||
bins[unroll * BLOCK_SIZE + threadIdx.x] = 0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (auto i : block_stride_range<BLOCK_SIZE>(num_priors))
|
||||
{
|
||||
const float confidence = load_ldg(scores[i]);
|
||||
if (confidence > threshold)
|
||||
{
|
||||
using device::fast_divide_ftz;
|
||||
auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);
|
||||
|
||||
using device::clamp;
|
||||
int bin_index = conf_scaled * BINS;
|
||||
|
||||
/* We store counts of confidence scores in the bins. Our ultimate goal is to store the indices
|
||||
* of the `classwise_topK` confidence values in the `indices` array.
|
||||
*
|
||||
* We use a little trick to parallelize the process of filling up the `indices` array.
|
||||
* We want every thread in the block to participate in the process. To do so, we want the
|
||||
* bins array to be shifted by one place to the left. We will be computing the suffix sum
|
||||
* of the bins array later. Details and reasons for doing so will be explained later.
|
||||
*/
|
||||
bin_index = clamp<int>(bin_index, 0, BINS - 1) - 1; // shift left by one
|
||||
|
||||
if (bin_index >= 0)
|
||||
atomicAdd(&bins[bin_index], 1);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
constexpr int WARP_SIZE = 32; /* must be equal to warpSize */
|
||||
// FORWARD_COMPATIBILITY_TAG: WARP_SIZE_DEPENDENT_CODE
|
||||
|
||||
if (threadIdx.x < WARP_SIZE)
|
||||
{
|
||||
/* We can compute suffix sum of an array in groups of N numbers.
|
||||
* Let N be 4 for this example.
|
||||
*
|
||||
* 1) Last 4 numbers
|
||||
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
|
||||
* group suffix sum: 42 33 23 12
|
||||
*
|
||||
* 2) Middle 4 numbers
|
||||
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
|
||||
* group suffix sum: | 26 21 15 8 |
|
||||
*
|
||||
* We add `42` (first element in the previous group) to each element to get:
|
||||
*
|
||||
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
|
||||
* | 68 63 57 50 | 42 33 23 12
|
||||
* 3) First 4 numbers
|
||||
*
|
||||
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
|
||||
* group suffix sum: 10 9 7 4 |
|
||||
*
|
||||
* We add `68` (first element in the previous group) to each element to get:
|
||||
*
|
||||
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
|
||||
* group suffix sum: 78 77 75 72 | 68 63 57 50 | 42 33 23 12
|
||||
*
|
||||
* What we are left with now is the suffix sum of the entire array.
|
||||
*
|
||||
* We use the aforementioned logic in the code below but work in groups of `warpSize`.
|
||||
*/
|
||||
|
||||
/* We calculate suffix sums WARP_SIZE elements at a time starting from the right end.
|
||||
* Hence, we will need BINS / WARP_SIZE number of iterations.
|
||||
*
|
||||
* Each iteration uses shuffle instructions to exchange data between threads. Shuffle
|
||||
* instructions cannot be used in warp-divergent code. If the number of bins is a multiple of
* the warp size, all the threads in the warp will participate.
|
||||
*/
|
||||
static_assert(BINS % WARP_SIZE == 0, "number of bins must be a multiple of warp size");
|
||||
|
||||
const int thread_id = threadIdx.x;
|
||||
const int inverse_lane_id = WARP_SIZE - thread_id - 1;
|
||||
|
||||
int previous_group_first_element = 0;
|
||||
for (int iter = BINS / WARP_SIZE - 1; iter >= 0; iter--)
|
||||
{
|
||||
const index_type idx = iter * WARP_SIZE + thread_id;
|
||||
auto value = bins[idx];
|
||||
|
||||
for (int i = 1; i < WARP_SIZE; i *= 2)
|
||||
{
|
||||
auto n = __shfl_down_sync(0xFFFFFFFF, value, i);
|
||||
if (inverse_lane_id >= i)
|
||||
value += n;
|
||||
}
|
||||
|
||||
value += previous_group_first_element;
|
||||
bins[idx] = value;
|
||||
|
||||
previous_group_first_element = __shfl_sync(0xFFFFFFFF, value, 0);
|
||||
}
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
*count = 0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (auto i : block_stride_range<BLOCK_SIZE>(num_priors))
|
||||
{
|
||||
const float confidence = load_ldg(scores[i]);
|
||||
if (confidence > threshold)
|
||||
{
|
||||
using device::fast_divide_ftz;
|
||||
auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);
|
||||
|
||||
int bin_index = conf_scaled * BINS;
|
||||
bin_index = clamp<int>(bin_index, 0, BINS - 1);
|
||||
|
||||
/* This bounding box is a candidate for selection if it falls within the `classwise_topK`
 * highest-scoring boxes. If it does, we have to compute the location in `indices` where
 * it must be stored.
|
||||
*
|
||||
* Suppose we had just 4 bins and say the following were the counts:
|
||||
* BIN0 2
|
||||
* BIN1 1
|
||||
* BIN2 3
|
||||
* BIN3 0 (last bin is always zero as we shift left by one while populating the bins)
|
||||
*
|
||||
* We will try our best to store the boxes in a sorted order in the `indices` array.
|
||||
* This requires that the boxes in later bins (higher confidence scores) must be
|
||||
* stored earlier.
|
||||
*
|
||||
* We compute the suffix sum of the array. This gives us:
|
||||
* BIN0 6
|
||||
* BIN1 4
|
||||
* BIN2 3
|
||||
* BIN3 0
|
||||
*
|
||||
* The bins now give us the location in the `indices` array from which the indices of the
|
||||
* scores corresponding to that bin would be stored. We atomically increment the bin count
|
||||
* every time we store a box corresponding to that bin. Therefore, the value in the bins
|
||||
* gives the index in the `indices` array where the next box corresponding to that bin must
|
||||
* be put.
|
||||
*/
|
||||
|
||||
const index_type idx = atomicAdd(&bins[bin_index], 1);
|
||||
if (idx < classwise_topK)
|
||||
{
|
||||
indices[idx] = i;
|
||||
atomicAdd(&count[0], 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
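/* Host-side sketch of the approximate top-K scheme used above (not part of OpenCV): scores are
 * histogrammed into bins, a suffix sum turns the bins into write offsets, and indices are emitted
 * until `k` slots are filled. `topk_by_bins_sketch` is a hypothetical name; assumes <vector>.
 */
static std::vector<int> topk_by_bins_sketch(const std::vector<float>& scores, float threshold, int k, int num_bins = 256)
{
    std::vector<int> bins(num_bins + 1, 0);
    auto bin_of = [&](float s) {
        int b = static_cast<int>((s - threshold) / (1.0f - threshold) * num_bins);
        return b < 0 ? 0 : (b > num_bins - 1 ? num_bins - 1 : b);
    };

    for (float s : scores)
        if (s > threshold)
            bins[bin_of(s)]++;                      /* histogram pass */

    for (int i = num_bins - 1; i >= 0; i--)
        bins[i] += bins[i + 1];                     /* suffix sum: bins[i] = #scores falling in bins >= i */

    /* bins[b + 1] is the first output slot for bin `b`, so higher-confidence bins land earlier */
    std::vector<int> out(k, -1);
    std::vector<int> cursor(bins.begin() + 1, bins.end());
    for (int idx = 0; idx < static_cast<int>(scores.size()); idx++) {
        if (scores[idx] > threshold) {
            const int slot = cursor[bin_of(scores[idx])]++;
            if (slot < k)
                out[slot] = idx;
        }
    }
    return out;
}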
|
||||
|
||||
template <class T>
|
||||
__global__ void box_collect(Span<T> collected_bboxes_, View<T> decoded_bboxes_, View<int> indices_, View<int> count_, bool share_location, size_type num_priors, size_type num_classes, size_type classwise_topK, index_type background_class_id)
|
||||
{
|
||||
const index_type c = blockIdx.x;
|
||||
if (c == background_class_id)
|
||||
return;
|
||||
|
||||
const index_type b = blockIdx.y;
|
||||
|
||||
// collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
|
||||
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
|
||||
// indices: [batch_size, num_classes, classwise_topK]
|
||||
// count: [batch_size, num_classes]
|
||||
|
||||
const auto num_loc_classes = share_location ? 1 : num_classes;
|
||||
|
||||
auto collected_bboxes = collected_bboxes_.data() + (b * num_classes + c) * classwise_topK * 4;
|
||||
auto decoded_bboxes = decoded_bboxes_.data() + b * num_priors * num_loc_classes * 4;
|
||||
auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
|
||||
auto count = count_.data() + b * num_classes + c;
|
||||
|
||||
const auto boxes = load_ldg(&count[0]);
|
||||
if (boxes == 0)
|
||||
return;
|
||||
|
||||
using vector_type = get_vector_type_t<T, 4>;
|
||||
auto decoded_bboxes_vPtr = vector_type::get_pointer(decoded_bboxes);
|
||||
auto collected_bboxes_vPtr = vector_type::get_pointer(collected_bboxes);
|
||||
|
||||
for (auto i : block_stride_range<>(boxes))
|
||||
{
|
||||
const auto prior_id = indices[i];
|
||||
const index_type idx = share_location ? prior_id : (prior_id * num_classes + c);
|
||||
|
||||
vector_type box;
|
||||
v_load(box, decoded_bboxes_vPtr[idx]);
|
||||
v_store(collected_bboxes_vPtr[i], box);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, bool NORMALIZED_BBOX>
|
||||
__global__ void blockwise_class_nms(Span<int> indices_, Span<int> count_, View<T> collected_bboxes_, size_type num_classes, size_type classwise_topK, index_type background_class_id, float nms_threshold)
|
||||
{
|
||||
const index_type b = blockIdx.x / num_classes;
|
||||
const index_type c = blockIdx.x % num_classes;
|
||||
if (c == background_class_id)
|
||||
return;
|
||||
|
||||
// indices: [batch_size, num_classes, classwise_topK]
|
||||
// count: [batch_size, num_classes]
|
||||
// collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
|
||||
|
||||
auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
|
||||
auto count = count_.data() + b * num_classes + c;
|
||||
auto collected_bboxes = collected_bboxes_.data() + (b * num_classes + c) * classwise_topK * 4;
|
||||
|
||||
const auto boxes = count[0];
|
||||
if (boxes == 0)
|
||||
return;
|
||||
|
||||
using vector_type = get_vector_type_t<T, 4>;
|
||||
auto collected_bboxes_vPtr = vector_type::get_pointer(collected_bboxes);
|
||||
|
||||
for (int i = 0; i < boxes; i++)
|
||||
{
|
||||
auto prior_id = indices[i];
|
||||
if (prior_id != -1)
|
||||
{
|
||||
BoundingBox bbox1;
|
||||
{
|
||||
vector_type box;
|
||||
v_load(box, collected_bboxes_vPtr[i]);
|
||||
|
||||
bbox1.xmin = box.data[0];
|
||||
bbox1.ymin = box.data[1];
|
||||
bbox1.xmax = box.data[2];
|
||||
bbox1.ymax = box.data[3];
|
||||
}
|
||||
|
||||
for (auto j : block_stride_range<>(i + 1, boxes))
|
||||
{
|
||||
prior_id = indices[j];
|
||||
if (prior_id == -1)
|
||||
continue;
|
||||
|
||||
BoundingBox bbox2;
|
||||
{
|
||||
vector_type box;
|
||||
v_load_ldg(box, collected_bboxes_vPtr[j]);
|
||||
|
||||
bbox2.xmin = box.data[0];
|
||||
bbox2.ymin = box.data[1];
|
||||
bbox2.xmax = box.data[2];
|
||||
bbox2.ymax = box.data[3];
|
||||
}
|
||||
|
||||
using device::min;
|
||||
using device::max;
|
||||
|
||||
BoundingBox intersect_bbox;
|
||||
intersect_bbox.xmin = max(bbox1.xmin, bbox2.xmin);
|
||||
intersect_bbox.ymin = max(bbox1.ymin, bbox2.ymin);
|
||||
intersect_bbox.xmax = min(bbox1.xmax, bbox2.xmax);
|
||||
intersect_bbox.ymax = min(bbox1.ymax, bbox2.ymax);
|
||||
|
||||
float intersect_size = compute_bbox_size<NORMALIZED_BBOX>(intersect_bbox);
|
||||
float bbox1_size = compute_bbox_size<NORMALIZED_BBOX>(bbox1);
|
||||
float bbox2_size = compute_bbox_size<NORMALIZED_BBOX>(bbox2);
|
||||
|
||||
using device::fast_divide_ftz;
|
||||
float iou = fast_divide_ftz(intersect_size, bbox1_size + bbox2_size - intersect_size);
|
||||
if (iou > nms_threshold)
|
||||
indices[j] = -1;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
count[0] = 0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (auto i : block_stride_range<>(boxes))
|
||||
{
|
||||
auto prior_id = indices[i];
|
||||
if(prior_id != -1)
|
||||
{
|
||||
const index_type idx = atomicAdd(&count[0], 1);
|
||||
indices[idx] = prior_id;
|
||||
}
|
||||
}
|
||||
}
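/* Host-side sketch (not part of OpenCV) of the IoU test applied in the kernel above for
 * normalized boxes (size = width * height; the non-normalized variant adds one to each
 * dimension). `iou_sketch` is a hypothetical name.
 */
static inline float iou_sketch(float xmin1, float ymin1, float xmax1, float ymax1,
                               float xmin2, float ymin2, float xmax2, float ymax2)
{
    const float iw = (xmax1 < xmax2 ? xmax1 : xmax2) - (xmin1 > xmin2 ? xmin1 : xmin2);
    const float ih = (ymax1 < ymax2 ? ymax1 : ymax2) - (ymin1 > ymin2 ? ymin1 : ymin2);
    const float inter = (iw > 0 && ih > 0) ? iw * ih : 0.f;   /* empty intersection if no overlap */
    const float area1 = (xmax1 - xmin1) * (ymax1 - ymin1);
    const float area2 = (xmax2 - xmin2) * (ymax2 - ymin2);
    return inter / (area1 + area2 - inter);                   /* box `j` is suppressed when IoU > nms_threshold */
}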
|
||||
|
||||
template <class T, std::size_t BINS, int BLOCK_SIZE>
|
||||
__launch_bounds__(BLOCK_SIZE)
|
||||
__global__ void nms_collect(
|
||||
Span<int> kept_indices, Span<int> kept_count, View<int> indices_, View<int> count, View<T> scores_, float threshold,
|
||||
size_type num_classes, size_type num_priors, size_type classwise_topK, size_type keepTopK, index_type background_class_id)
|
||||
{
|
||||
// sorting algorithm is documented in detail in findTopK kernel comments
|
||||
// no explanations are provided here
|
||||
|
||||
// kept_indices: [batch_size, keepTopK]
|
||||
// kept_count: [batch_size]
|
||||
|
||||
const auto b = blockIdx.x;
|
||||
|
||||
__shared__ int bins[BINS];
|
||||
|
||||
#pragma unroll
|
||||
for (int unroll = 0; unroll < BINS / BLOCK_SIZE; unroll++)
|
||||
bins[unroll * BLOCK_SIZE + threadIdx.x] = 0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (int c = 0; c < num_classes; c++)
|
||||
{
|
||||
if (c == background_class_id)
|
||||
continue;
|
||||
|
||||
// indices: [batch_size, num_classes, classwise_topK]
|
||||
// count: [batch_size, num_classes]
|
||||
// scores: [batch_size, num_classes, num_priors]
|
||||
|
||||
const auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
|
||||
const auto scores = scores_.data() + (b * num_classes + c) * num_priors;
|
||||
|
||||
auto boxes = count[b * num_classes + c];
|
||||
|
||||
for (auto i : block_stride_range<BLOCK_SIZE>(boxes))
|
||||
{
|
||||
auto prior_id = indices[i];
|
||||
const float confidence = load_ldg(scores[prior_id]);
|
||||
if (confidence > threshold)
|
||||
{
|
||||
using device::fast_divide_ftz;
|
||||
auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);
|
||||
|
||||
using device::clamp;
|
||||
int bin_index = conf_scaled * BINS;
|
||||
bin_index = clamp<int>(bin_index, 0, BINS - 1) - 1; // shift left by one
|
||||
|
||||
if (bin_index >= 0)
|
||||
atomicAdd(&bins[bin_index], 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
constexpr int WARP_SIZE = 32; /* must be equal to warpSize */
|
||||
// FORWARD_COMPATIBILITY_TAG: WARP_SIZE_DEPENDENT_CODE
|
||||
|
||||
if (threadIdx.x < WARP_SIZE)
|
||||
{
|
||||
static_assert(BINS % WARP_SIZE == 0, "number of bins must be a multiple of warp size");
|
||||
|
||||
const int thread_id = threadIdx.x;
|
||||
const int inverse_lane_id = WARP_SIZE - thread_id - 1;
|
||||
|
||||
int previous_group_first_element = 0;
|
||||
for (int iter = BINS / WARP_SIZE - 1; iter >= 0; iter--)
|
||||
{
|
||||
const index_type idx = iter * WARP_SIZE + thread_id;
|
||||
auto value = bins[idx];
|
||||
|
||||
for (int i = 1; i < WARP_SIZE; i *= 2)
|
||||
{
|
||||
auto n = __shfl_down_sync(0xFFFFFFFF, value, i);
|
||||
if (inverse_lane_id >= i)
|
||||
value += n;
|
||||
}
|
||||
|
||||
value += previous_group_first_element;
|
||||
bins[idx] = value;
|
||||
|
||||
previous_group_first_element = __shfl_sync(0xFFFFFFFF, value, 0);
|
||||
}
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
kept_count[b] = 0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (int c = 0; c < num_classes; c++)
|
||||
{
|
||||
if (c == background_class_id)
|
||||
continue;
|
||||
|
||||
const auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
|
||||
const auto scores = scores_.data() + (b * num_classes + c) * num_priors;
|
||||
|
||||
auto boxes = count[b * num_classes + c];
|
||||
|
||||
for (auto i : block_stride_range<BLOCK_SIZE>(boxes))
|
||||
{
|
||||
auto prior_id = indices[i];
|
||||
const float confidence = load_ldg(scores[prior_id]);
|
||||
if (confidence > threshold)
|
||||
{
|
||||
using device::fast_divide_ftz;
|
||||
auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);
|
||||
|
||||
using device::clamp;
|
||||
int bin_index = conf_scaled * BINS;
|
||||
bin_index = clamp<int>(bin_index, 0, BINS - 1);
|
||||
|
||||
const index_type idx = atomicAdd(&bins[bin_index], 1);
|
||||
if (idx < keepTopK)
|
||||
{
|
||||
kept_indices[b * keepTopK + idx] = c * num_priors + prior_id;
|
||||
atomicAdd(&kept_count[b], 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void consolidate_detections(Span<T> output,
|
||||
View<int> kept_indices, View<int> kept_count, View<T> decoded_bboxes, View<T> scores, bool share_location,
|
||||
size_type batch_size, size_type num_classes, size_type num_priors, size_type keepTopK, DevicePtr<int> num_detections)
|
||||
{
|
||||
using vector_type = get_vector_type_t<T, 4>;
|
||||
auto decoded_bboxes_vPtr = vector_type::get_pointer(decoded_bboxes.data());
|
||||
|
||||
// output: [1, 1, batch_size * keepTopK, 7]
|
||||
// kept_indices: [batch_size, keepTopK]
|
||||
// kept_count: [batch_size]
|
||||
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
|
||||
// scores: [batch_size, num_classes, num_priors]
|
||||
|
||||
for (int b = 0; b < batch_size; b++)
|
||||
{
|
||||
for (auto i : grid_stride_range(kept_count[b]))
|
||||
{
|
||||
auto score_id = kept_indices[b * keepTopK + i];
|
||||
auto c = score_id / num_priors;
|
||||
auto prior_id = score_id % num_priors;
|
||||
|
||||
const auto confidence = scores[b * num_classes * num_priors + score_id];
|
||||
|
||||
index_type bbox_id;
|
||||
if (share_location)
|
||||
{
|
||||
// decoded_bboxes: [batch_size, num_priors, 1, 4]
|
||||
bbox_id = b * num_priors + prior_id;
|
||||
}
|
||||
else
|
||||
{
|
||||
// decoded_bboxes: [batch_size, num_priors, num_classes, 4]
|
||||
bbox_id = (b * num_priors + prior_id) * num_classes + c;
|
||||
}
|
||||
|
||||
vector_type bbox;
|
||||
v_load(bbox, decoded_bboxes_vPtr[bbox_id]);
|
||||
|
||||
auto output_id = atomicAdd(num_detections.get(), 1);
|
||||
output[output_id * 7 + 0] = b;
|
||||
output[output_id * 7 + 1] = c;
|
||||
output[output_id * 7 + 2] = confidence;
|
||||
output[output_id * 7 + 3] = bbox.data[0];
|
||||
output[output_id * 7 + 4] = bbox.data[1];
|
||||
output[output_id * 7 + 5] = bbox.data[2];
|
||||
output[output_id * 7 + 6] = bbox.data[3];
|
||||
}
|
||||
}
|
||||
}
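/* Sketch (not part of OpenCV): the layout of one detection row written by the loop above.
 * Each kept detection occupies 7 consecutive values. `DetectionRowSketch` is a hypothetical name.
 */
struct DetectionRowSketch {
    float batch_id;      /* index of the image within the batch */
    float class_id;      /* predicted class label */
    float confidence;    /* score of the kept box */
    float xmin, ymin, xmax, ymax;
};
static_assert(sizeof(DetectionRowSketch) == 7 * sizeof(float), "one detection row is 7 floats");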
|
||||
}
|
||||
|
||||
template <class T, bool SHARE_LOCATION, bool VARIANCE_ENCODED_IN_TARGET, bool CORNER_TRUE_CENTER_FALSE, bool CLIP_BBOX> static
|
||||
void launch_decode_boxes_kernel(const Stream& stream, Span<T> decoded_bboxes, View<T> locations, View<T> priors,
|
||||
bool transpose_location, bool normalized_bbox,
|
||||
size_type num_loc_classes, index_type background_class_id,
|
||||
float clip_width, float clip_height)
|
||||
{
|
||||
auto kernel = raw::decode_bbox<T, SHARE_LOCATION, VARIANCE_ENCODED_IN_TARGET, CORNER_TRUE_CENTER_FALSE, CLIP_BBOX>;
|
||||
auto policy = make_policy(kernel, decoded_bboxes.size() / 4, 0, stream);
|
||||
launch_kernel(kernel, policy, decoded_bboxes, locations, priors, transpose_location, normalized_bbox, num_loc_classes, background_class_id, clip_width, clip_height);
|
||||
}
|
||||
|
||||
template <class T, unsigned int current, class ...Args> static
|
||||
typename std::enable_if<current == 0, void>
|
||||
::type dispatch_decode_bboxes(int selector, Args&& ...args) {
|
||||
if(selector == 0)
|
||||
launch_decode_boxes_kernel<T, 0, 0, 0, 0>(std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
template <class T, unsigned int current, class ...Args> static
|
||||
typename std::enable_if<current != 0, void>
|
||||
::type dispatch_decode_bboxes(int selector, Args&& ...args) {
|
||||
if(selector == current)
|
||||
launch_decode_boxes_kernel<T,
|
||||
static_cast<bool>(current & 8),
|
||||
static_cast<bool>(current & 4),
|
||||
static_cast<bool>(current & 2),
|
||||
static_cast<bool>(current & 1)>(std::forward<Args>(args)...);
|
||||
else
|
||||
dispatch_decode_bboxes<T, current - 1, Args...>(selector, std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void decode_bboxes(const Stream& stream, Span<T> output, View<T> locations, View<T> priors,
|
||||
std::size_t num_loc_classes,
|
||||
bool share_location, std::size_t background_class_id,
|
||||
bool transpose_location, bool variance_encoded_in_target,
|
||||
bool corner_true_or_center_false, bool normalized_bbox,
|
||||
bool clip_box, float clip_width, float clip_height)
|
||||
{
|
||||
/* `config` combines three kernel template options into one number using which a bit of TMP code can
|
||||
* run through all possible combinations and instantiate the correct template
|
||||
*/
|
||||
unsigned int config = (share_location << 3 | variance_encoded_in_target << 2 | corner_true_or_center_false << 1 | clip_box);
|
||||
dispatch_decode_bboxes<T, 15>(config, stream, output, locations, priors, transpose_location, normalized_bbox, num_loc_classes, background_class_id, clip_width, clip_height);
|
||||
}
|
||||
|
||||
template void decode_bboxes(const Stream&, Span<__half>, View<__half>, View<__half>, std::size_t, bool, std::size_t, bool, bool, bool, bool, bool, float, float);
|
||||
template void decode_bboxes(const Stream&, Span<float>, View<float>, View<float>, std::size_t, bool, std::size_t, bool, bool, bool, bool, bool, float, float);
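/* Sketch (not part of OpenCV) of how the 4-bit selector used above is assembled: each boolean
 * kernel option occupies one bit, so dispatch_decode_bboxes<T, 15> walks all 16 combinations at
 * compile time and launches exactly one instantiation at run time. Hypothetical helper name.
 */
static inline unsigned int make_decode_config_sketch(bool share_location, bool variance_encoded_in_target,
                                                     bool corner_true_or_center_false, bool clip_box)
{
    return (share_location << 3) | (variance_encoded_in_target << 2) |
           (corner_true_or_center_false << 1) | (clip_box << 0);
}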
|
||||
|
||||
template <class T>
|
||||
void findTopK(const Stream& stream, TensorSpan<int> indices, TensorSpan<int> count, TensorView<T> scores, std::size_t background_class_id, float threshold)
|
||||
{
|
||||
// indices: [batch_size, num_classes, classwise_topK]
|
||||
// count: [batch_size, num_classes]
|
||||
// scores: [batch_size, num_classes, num_priors]
|
||||
|
||||
const auto batch_size = indices.get_axis_size(0);
|
||||
CV_Assert(count.get_axis_size(0) == batch_size);
|
||||
CV_Assert(scores.get_axis_size(0) == batch_size);
|
||||
|
||||
const auto num_classes = indices.get_axis_size(1);
|
||||
CV_Assert(count.get_axis_size(1) == num_classes);
|
||||
CV_Assert(scores.get_axis_size(1) == num_classes);
|
||||
|
||||
const auto classwise_topK = indices.get_axis_size(2);
|
||||
const auto num_priors = scores.get_axis_size(2);
|
||||
|
||||
/* each block processes one class from each batch */
|
||||
constexpr auto BLOCK_SIZE = 256;
|
||||
|
||||
dim3 grid_size(num_classes, batch_size);
|
||||
dim3 block_size(BLOCK_SIZE);
|
||||
auto policy = execution_policy(grid_size, block_size, stream);
|
||||
|
||||
auto kernel = raw::findTopK<T, 2048, BLOCK_SIZE>;
|
||||
launch_kernel(kernel, policy, indices, count, scores, threshold, classwise_topK, num_classes, num_priors, background_class_id);
|
||||
}
|
||||
|
||||
template void findTopK(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<__half>, std::size_t, float);
|
||||
template void findTopK(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<float>, std::size_t, float);
|
||||
|
||||
template <class T>
|
||||
void box_collect(const Stream& stream, TensorSpan<T> collected_bboxes, TensorView<T> decoded_bboxes, TensorView<int> indices, TensorView<int> count, bool share_location, std::size_t background_class_id)
|
||||
{
|
||||
// collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
|
||||
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
|
||||
// indices: [batch_size, num_classes, classwise_topK]
|
||||
// count: [batch_size, num_classes]
|
||||
|
||||
const auto batch_size = collected_bboxes.get_axis_size(0);
|
||||
CV_Assert(decoded_bboxes.get_axis_size(0) == batch_size);
|
||||
CV_Assert(indices.get_axis_size(0) == batch_size);
|
||||
CV_Assert(count.get_axis_size(0) == batch_size);
|
||||
|
||||
const auto num_classes = collected_bboxes.get_axis_size(1);
|
||||
CV_Assert(indices.get_axis_size(1) == num_classes);
|
||||
CV_Assert(count.get_axis_size(1) == num_classes);
|
||||
|
||||
const auto classwise_topK = collected_bboxes.get_axis_size(2);
|
||||
CV_Assert(indices.get_axis_size(2) == classwise_topK);
|
||||
|
||||
const auto num_priors = decoded_bboxes.get_axis_size(1);
|
||||
|
||||
CV_Assert(!share_location || decoded_bboxes.get_axis_size(2) == 1);
|
||||
|
||||
constexpr int BLOCK_SIZE = 256;
|
||||
|
||||
/* each block processes one class from each batch */
|
||||
dim3 grid_size(num_classes, batch_size);
|
||||
dim3 block_size(BLOCK_SIZE);
|
||||
auto policy = execution_policy(grid_size, block_size, stream);
|
||||
|
||||
auto kernel = raw::box_collect<T>;
|
||||
launch_kernel(kernel, policy, collected_bboxes, decoded_bboxes, indices, count, share_location, num_priors, num_classes, classwise_topK, background_class_id);
|
||||
}
|
||||
|
||||
template void box_collect(const Stream&, TensorSpan<float>, TensorView<float>, TensorView<int>, TensorView<int>, bool, std::size_t);
|
||||
template void box_collect(const Stream&, TensorSpan<__half>, TensorView<__half>, TensorView<int>, TensorView<int>, bool, std::size_t);
|
||||
|
||||
template <class T>
|
||||
void blockwise_class_nms(const Stream& stream, TensorSpan<int> indices, TensorSpan<int> count, TensorView<T> collected_bboxes,
|
||||
bool normalized_bbox, std::size_t background_class_id, float nms_threshold)
|
||||
{
|
||||
// indices: [batch_size, num_classes, classwise_topK]
|
||||
// count: [batch_size, num_classes]
|
||||
// collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
|
||||
|
||||
const auto batch_size = indices.get_axis_size(0);
|
||||
CV_Assert(count.get_axis_size(0) == batch_size);
|
||||
CV_Assert(collected_bboxes.get_axis_size(0) == batch_size);
|
||||
|
||||
const auto num_classes = indices.get_axis_size(1);
|
||||
CV_Assert(count.get_axis_size(1) == num_classes);
|
||||
CV_Assert(collected_bboxes.get_axis_size(1) == num_classes);
|
||||
|
||||
const auto classwise_topK = indices.get_axis_size(2);
|
||||
CV_Assert(collected_bboxes.get_axis_size(2) == classwise_topK);
|
||||
|
||||
/* each block processes one class from each batch */
|
||||
auto num_blocks = batch_size * num_classes;
|
||||
auto num_threads = std::max<std::size_t>(std::min<std::size_t>(1024, classwise_topK), 32);
|
||||
|
||||
dim3 grid_size(num_blocks);
|
||||
dim3 block_size(num_threads);
|
||||
auto policy = execution_policy(grid_size, block_size, stream);
|
||||
|
||||
if (normalized_bbox)
|
||||
{
|
||||
auto kernel = raw::blockwise_class_nms<T, true>;
|
||||
launch_kernel(kernel, policy, indices, count, collected_bboxes, num_classes, classwise_topK, background_class_id, nms_threshold);
|
||||
}
|
||||
else
|
||||
{
|
||||
auto kernel = raw::blockwise_class_nms<T, false>;
|
||||
launch_kernel(kernel, policy, indices, count, collected_bboxes, num_classes, classwise_topK, background_class_id, nms_threshold);
|
||||
}
|
||||
}
|
||||
|
||||
template void blockwise_class_nms(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<__half>, bool, std::size_t, float);
|
||||
template void blockwise_class_nms(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<float>, bool, std::size_t, float);
|
||||
|
||||
template <class T>
|
||||
void nms_collect(const Stream& stream, TensorSpan<int> kept_indices, TensorSpan<int> kept_count,
|
||||
TensorView<int> indices, TensorView<int> count, TensorView<T> scores, float threshold, std::size_t background_class_id)
|
||||
{
|
||||
// kept_indices: [batch_size, keepTopK]
|
||||
// kept_count: [batch_size]
|
||||
|
||||
// indices: [batch_size, num_classes, classwise_topK]
|
||||
// count: [batch_size, num_classes]
|
||||
// scores: [batch_size, num_classes, num_priors]
|
||||
|
||||
auto batch_size = kept_indices.get_axis_size(0);
|
||||
CV_Assert(kept_count.get_axis_size(0) == batch_size);
|
||||
CV_Assert(indices.get_axis_size(0) == batch_size);
|
||||
CV_Assert(count.get_axis_size(0) == batch_size);
|
||||
CV_Assert(scores.get_axis_size(0) == batch_size);
|
||||
|
||||
auto keepTopK = kept_indices.get_axis_size(1);
|
||||
|
||||
auto num_classes = indices.get_axis_size(1);
|
||||
CV_Assert(count.get_axis_size(1) == num_classes);
|
||||
CV_Assert(scores.get_axis_size(1) == num_classes);
|
||||
|
||||
auto classwise_topK = indices.get_axis_size(2);
|
||||
auto num_priors = scores.get_axis_size(2);
|
||||
|
||||
auto num_blocks = batch_size;
|
||||
constexpr int BLOCK_SIZE = 1024;
|
||||
|
||||
dim3 grid_size(num_blocks);
|
||||
dim3 block_size(BLOCK_SIZE);
|
||||
auto policy = execution_policy(grid_size, block_size, stream);
|
||||
|
||||
auto kernel = raw::nms_collect<T, 1024, BLOCK_SIZE>;
|
||||
launch_kernel(kernel, policy, kept_indices, kept_count, indices, count, scores, threshold, num_classes, num_priors, classwise_topK, keepTopK, background_class_id);
|
||||
}
|
||||
|
||||
template void nms_collect(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<int>, TensorView<int>, TensorView<__half>, float, std::size_t);
|
||||
template void nms_collect(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<int>, TensorView<int>, TensorView<float>, float, std::size_t);
|
||||
|
||||
template <class T>
|
||||
void consolidate_detections(const Stream& stream, TensorSpan<T> output,
|
||||
TensorView<int> kept_indices, TensorView<int> kept_count,
|
||||
TensorView<T> decoded_bboxes, TensorView<T> scores, bool share_location, DevicePtr<int> num_detections)
|
||||
{
|
||||
// output: [1, 1, batch_size * keepTopK, 7]
|
||||
// kept_indices: [batch_size, keepTopK]
|
||||
// kept_count: [batch_size]
|
||||
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
|
||||
// scores: [batch_size, num_classes, num_priors]
|
||||
|
||||
auto batch_size = kept_indices.get_axis_size(0);
|
||||
CV_Assert(kept_count.get_axis_size(0) == batch_size);
|
||||
CV_Assert(decoded_bboxes.get_axis_size(0) == batch_size);
|
||||
CV_Assert(scores.get_axis_size(0) == batch_size);
|
||||
|
||||
auto keepTopK = kept_indices.get_axis_size(1);
|
||||
|
||||
auto num_classes = scores.get_axis_size(1);
|
||||
auto num_priors = scores.get_axis_size(2);
|
||||
|
||||
CV_Assert(batch_size * keepTopK * 7 == output.size());
|
||||
|
||||
auto kernel = raw::consolidate_detections<T>;
|
||||
auto policy = make_policy(kernel, keepTopK, 0, stream);
|
||||
launch_kernel(kernel, policy, output, kept_indices, kept_count, decoded_bboxes, scores, share_location, batch_size, num_classes, num_priors, keepTopK, num_detections);
|
||||
}
|
||||
|
||||
template void consolidate_detections(const Stream&, TensorSpan<__half>, TensorView<int>, TensorView<int>, TensorView<__half>, TensorView<__half>, bool, DevicePtr<int>);
|
||||
template void consolidate_detections(const Stream&, TensorSpan<float>, TensorView<int>, TensorView<int>, TensorView<float>, TensorView<float>, bool, DevicePtr<int>);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
125
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/eltwise_activation.cu
vendored
Normal file
@ -0,0 +1,125 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "functors.hpp"
|
||||
#include "types.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
|
||||
template <class T, class EltwiseOp, class ActivationOp, std::size_t N>
|
||||
__global__ void eltwise_op_generic_op_vec(Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params eltwise_params, const typename ActivationOp::Params act_params) {
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
auto x_vPtr = vector_type::get_pointer(x.data());
|
||||
auto y_vPtr = vector_type::get_pointer(y.data());
|
||||
|
||||
EltwiseOp eltwise_op(eltwise_params);
|
||||
ActivationOp activation_op(act_params);
|
||||
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
|
||||
vector_type vec_x, vec_y;
|
||||
v_load(vec_x, x_vPtr[i]);
|
||||
v_load(vec_y, y_vPtr[i]);
|
||||
for(int j = 0; j < vec_x.size(); j++)
|
||||
vec_x.data[j] = activation_op(eltwise_op(vec_x.data[j], vec_y.data[j]));
|
||||
v_store(output_vPtr[i], vec_x);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, class EltwiseOp, class ActivationOp, std::size_t N> static
|
||||
void launch_vectorized_eltwise_op_generic_op(const Stream& stream, Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params& eltwise_params, const typename ActivationOp::Params& act_params) {
|
||||
CV_Assert(is_fully_aligned<T>(output, N));
|
||||
CV_Assert(is_fully_aligned<T>(x, N));
|
||||
CV_Assert(is_fully_aligned<T>(y, N));
|
||||
|
||||
auto kernel = raw::eltwise_op_generic_op_vec<T, EltwiseOp, ActivationOp, N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, x, y, eltwise_params, act_params);
|
||||
}
|
||||
|
||||
template <class T, class EltwiseOp, class ActivationOp> static
|
||||
void eltwise_op_generic_op(const Stream& stream, Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params& eltwise_params = {}, const typename ActivationOp::Params& act_params = {}) {
|
||||
CV_Assert(output.size() == x.size());
|
||||
CV_Assert(output.size() == y.size());
|
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
|
||||
launch_vectorized_eltwise_op_generic_op<T, EltwiseOp, ActivationOp, 4>(stream, output, x, y, eltwise_params, act_params);
|
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
|
||||
launch_vectorized_eltwise_op_generic_op<T, EltwiseOp, ActivationOp, 2>(stream, output, x, y, eltwise_params, act_params);
|
||||
} else {
|
||||
launch_vectorized_eltwise_op_generic_op<T, EltwiseOp, ActivationOp, 1>(stream, output, x, y, eltwise_params, act_params);
|
||||
}
|
||||
}
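/* Scalar sketch (not part of OpenCV) of what the fused kernel computes per element:
 * output[i] = activation(eltwise(x[i], y[i])). Fusing the functors avoids materializing the
 * intermediate sum in global memory. `fused_sum_relu_sketch` is a hypothetical name.
 */
static inline float fused_sum_relu_sketch(float x, float y, float slope)
{
    const float s = x + y;              /* eltwise op (sum) */
    return s >= 0.f ? s : slope * s;    /* activation (ReLU with the given negative slope) */
}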
|
||||
|
||||
template <class T>
|
||||
void eltwise_sum_2_relu(const Stream& stream, Span<T> output, View<T> x, View<T> y, T slope) {
|
||||
eltwise_op_generic_op<T, SumFunctor<T>, ReLUFunctor<T>>(stream, output, x, y, {}, {slope});
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_sum_2_clipped_relu(const Stream& stream, Span<T> output, View<T> x, View<T> y, T floor, T ceiling) {
|
||||
CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
|
||||
eltwise_op_generic_op<T, SumFunctor<T>, ClippedReLUFunctor<T>>(stream, output, x, y, {}, {floor, ceiling});
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_sum_2_tanh(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
|
||||
eltwise_op_generic_op<T, SumFunctor<T>, TanHFunctor<T>>(stream, output, x, y);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_sum_2_swish(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
|
||||
eltwise_op_generic_op<T, SumFunctor<T>, SwishFunctor<T>>(stream, output, x, y);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_sum_2_mish(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
|
||||
eltwise_op_generic_op<T, SumFunctor<T>, MishFunctor<T>>(stream, output, x, y);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_sum_2_sigmoid(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
|
||||
eltwise_op_generic_op<T, SumFunctor<T>, SigmoidFunctor<T>>(stream, output, x, y);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_sum_2_power(const Stream& stream, Span<T> output, View<T> x, View<T> y, T exp, T scale, T shift) {
|
||||
eltwise_op_generic_op<T, SumFunctor<T>, PowerFunctor<T>>(stream, output, x, y, {}, {exp, scale, shift});
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void eltwise_sum_2_relu<__half>(const Stream&, Span<__half>, View<__half>, View<__half>, __half);
|
||||
template void eltwise_sum_2_clipped_relu<__half>(const Stream&, Span<__half>, View<__half>, View<__half>, __half, __half);
|
||||
template void eltwise_sum_2_tanh<__half>(const Stream&, Span<__half>, View<__half>, View<__half>);
|
||||
template void eltwise_sum_2_swish<__half>(const Stream&, Span<__half>, View<__half>, View<__half>);
|
||||
template void eltwise_sum_2_mish<__half>(const Stream&, Span<__half>, View<__half>, View<__half>);
|
||||
template void eltwise_sum_2_sigmoid<__half>(const Stream&, Span<__half>, View<__half>, View<__half>);
|
||||
template void eltwise_sum_2_power<__half>(const Stream&, Span<__half>, View<__half>, View<__half>, __half, __half, __half);
|
||||
#endif
|
||||
|
||||
template void eltwise_sum_2_relu<float>(const Stream&, Span<float>, View<float>, View<float>, float);
|
||||
template void eltwise_sum_2_clipped_relu<float>(const Stream&, Span<float>, View<float>, View<float>, float, float);
|
||||
template void eltwise_sum_2_tanh<float>(const Stream&, Span<float>, View<float>, View<float>);
|
||||
template void eltwise_sum_2_swish<float>(const Stream&, Span<float>, View<float>, View<float>);
|
||||
template void eltwise_sum_2_mish<float>(const Stream&, Span<float>, View<float>, View<float>);
|
||||
template void eltwise_sum_2_sigmoid<float>(const Stream&, Span<float>, View<float>, View<float>);
|
||||
template void eltwise_sum_2_power<float>(const Stream&, Span<float>, View<float>, View<float>, float, float, float);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
334
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/eltwise_ops.cu
vendored
Normal file
@ -0,0 +1,334 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "array.hpp"
|
||||
#include "functors.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
#include "kernel_dispatcher.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, class EltwiseOp, std::size_t N>
|
||||
__global__ void eltwise_op_vec(Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params params) {
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
auto x_vPtr = vector_type::get_pointer(x.data());
|
||||
auto y_vPtr = vector_type::get_pointer(y.data());
|
||||
|
||||
EltwiseOp eltwise_op(params);
|
||||
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
|
||||
vector_type vec_x, vec_y;
|
||||
v_load(vec_x, x_vPtr[i]);
|
||||
v_load(vec_y, y_vPtr[i]);
|
||||
for (int j = 0; j < vector_type::size(); j++)
|
||||
vec_x.data[j] = eltwise_op(vec_x.data[j], vec_y.data[j]);
|
||||
v_store(output_vPtr[i], vec_x);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, class EltwiseOp, std::size_t Rank>
|
||||
__global__ void eltwise_op_bcast(
|
||||
Span<T> output, array<size_type, Rank> out_strides,
|
||||
View<T> x, array<size_type, Rank> x_strides, array<bool, Rank> x_bcast,
|
||||
View<T> y, array<size_type, Rank> y_strides, array<bool, Rank> y_bcast,
|
||||
const typename EltwiseOp::Params params) {
|
||||
EltwiseOp eltwise_op(params);
|
||||
|
||||
for (auto i : grid_stride_range(output.size())) {
|
||||
index_type out_index = i / out_strides[0];
|
||||
index_type x_index = x_bcast[0] ? 0 : out_index * x_strides[0];
|
||||
index_type y_index = y_bcast[0] ? 0 : out_index * y_strides[0];
|
||||
|
||||
for (int j = 1; j < Rank; j++)
|
||||
{
|
||||
out_index = (i % out_strides[j - 1]) / out_strides[j];
|
||||
if (!x_bcast[j])
|
||||
x_index += out_index * x_strides[j];
|
||||
if (!y_bcast[j])
|
||||
y_index += out_index * y_strides[j];
|
||||
}
|
||||
|
||||
output[i] = eltwise_op(x[x_index], y[y_index]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, class EltwiseOp, std::size_t N> static
|
||||
void launch_vectorized_eltwise_op(const Stream& stream, Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params& params) {
|
||||
CV_Assert(x.size() == y.size());
|
||||
CV_Assert(x.size() == output.size());
|
||||
CV_Assert(is_fully_aligned<T>(output, N));
|
||||
CV_Assert(is_fully_aligned<T>(x, N));
|
||||
CV_Assert(is_fully_aligned<T>(y, N));
|
||||
|
||||
auto kernel = raw::eltwise_op_vec<T, EltwiseOp, N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, x, y, params);
|
||||
}
|
||||
|
||||
template <class T, class EltwiseOp, std::size_t Rank> static
|
||||
void launch_eltwise_op_bcast(
|
||||
const Stream& stream,
|
||||
Span<T> output, const std::vector<std::size_t>& outStride,
|
||||
View<T> x, const std::vector<std::size_t>& inStride1, const std::vector<int>& inBcast1,
|
||||
View<T> y, const std::vector<std::size_t>& inStride2, const std::vector<int>& inBcast2,
|
||||
const typename EltwiseOp::Params& params)
|
||||
{
|
||||
CV_Assert(outStride.size() == Rank);
|
||||
CV_Assert(inStride1.size() == Rank);
|
||||
CV_Assert(inStride2.size() == Rank);
|
||||
CV_Assert(inBcast1.size() == Rank);
|
||||
CV_Assert(inBcast2.size() == Rank);
|
||||
|
||||
array<size_type, Rank> outStride_k, inStride1_k, inStride2_k;
|
||||
outStride_k.assign(std::begin(outStride), std::end(outStride));
|
||||
inStride1_k.assign(std::begin(inStride1), std::end(inStride1));
|
||||
inStride2_k.assign(std::begin(inStride2), std::end(inStride2));
|
||||
|
||||
array<bool, Rank> inBcast1_k, inBcast2_k;
|
||||
inBcast1_k.assign(std::begin(inBcast1), std::end(inBcast1));
|
||||
inBcast2_k.assign(std::begin(inBcast2), std::end(inBcast2));
|
||||
|
||||
auto kernel = raw::eltwise_op_bcast<T, EltwiseOp, Rank>;
|
||||
auto policy = make_policy(kernel, output.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, output, outStride_k, x, inStride1_k, inBcast1_k, y, inStride2_k, inBcast2_k, params);
|
||||
}
|
||||
|
||||
GENERATE_KERNEL_DISPATCHER_2TP(eltwise_op_bcast_dispatcher, launch_eltwise_op_bcast);
|
||||
|
||||
template <class T, class EltwiseOp> static
|
||||
void eltwise_op(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y, const typename EltwiseOp::Params& params = {}) {
|
||||
if (is_shape_same(output, x) && is_shape_same(output, y))
|
||||
{
|
||||
/* no broadcasting; use fast path */
|
||||
CV_Assert(x.size() == y.size());
|
||||
CV_Assert(x.size() == output.size());
|
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
|
||||
launch_vectorized_eltwise_op<T, EltwiseOp, 4>(stream, output, x, y, params);
|
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
|
||||
launch_vectorized_eltwise_op<T, EltwiseOp, 2>(stream, output, x, y, params);
|
||||
} else {
|
||||
launch_vectorized_eltwise_op<T, EltwiseOp, 1>(stream, output, x, y, params);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
CV_Assert(is_shape_compatible(output, x));
|
||||
CV_Assert(is_shape_compatible(output, y));
|
||||
|
||||
/* matching singleton axes in both input tensors can be eliminated
|
||||
*
|
||||
* Reasoning:
|
||||
* ----------
|
||||
* Singleton axes do not contribute towards address calculation. They are redundant
|
||||
* unless there is broadcasting. If both input tensors have singleton axis at a
|
||||
* specified position, there is no broadcasting on that axis.
|
||||
*
|
||||
* Example:
|
||||
* ---------
|
||||
* x: [1, 256, 32, 32] -> [256, 32, 32]
|
||||
* y: [1, 256, 1, 1] -> [256, 1, 1]
|
||||
*/
|
||||
for (int r = 0; r < output.rank(); r++)
|
||||
{
|
||||
while (x.get_axis_size(r) == 1 && y.get_axis_size(r) == 1) {
|
||||
CV_Assert(output.get_axis_size(r) == 1);
|
||||
|
||||
x.squeeze(r);
|
||||
y.squeeze(r);
|
||||
output.squeeze(r);
|
||||
}
|
||||
}
|
||||
|
||||
auto inShape1 = x.shape_as_vector();
|
||||
auto inShape2 = y.shape_as_vector();
|
||||
auto outShape = output.shape_as_vector();
|
||||
|
||||
/* contiguous axes that do not broadcast can be merged into one axis
|
||||
*
|
||||
* Example:
|
||||
* ---------
|
||||
* x: [32, 8, 8] -> [32, 64]
|
||||
* y: [1, 8, 8] -> [1, 64]
|
||||
*/
|
||||
for (int i = 0; i < inShape1.size(); i++) {
|
||||
/* check if axis `i` requires any broadcasting */
|
||||
if (inShape1[i] == inShape2[i]) {
|
||||
/* loop invariant: `i` is the first axis in the contiguous axis sequence */
|
||||
|
||||
int j = i + 1; /* `j` is the axis which we will attempt to merge */
|
||||
while (j < inShape1.size() && inShape1[j] == inShape2[j]) {
|
||||
CV_Assert(outShape[j] == inShape1[j]);
|
||||
|
||||
/* `j` axis is also used fully; merge `i` and `j` */
|
||||
auto new_size = inShape1[i] * inShape1[j];
|
||||
inShape1[i] = new_size;
|
||||
inShape2[i] = new_size;
|
||||
|
||||
/* delete axis `j` */
|
||||
inShape1.erase(std::begin(inShape1) + j);
|
||||
inShape2.erase(std::begin(inShape2) + j);
|
||||
outShape.erase(std::begin(outShape) + j);
|
||||
|
||||
/* optimizations should not break the invariants */
|
||||
CV_Assert(inShape1.size() == outShape.size());
|
||||
CV_Assert(inShape2.size() == outShape.size());
|
||||
CV_Assert(inShape1[i] == outShape[i]);
|
||||
CV_Assert(inShape2[i] == outShape[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* contiguous broadcasting axes on the same tensor can be merged into one axis
|
||||
*
|
||||
* Example:
|
||||
* ---------
|
||||
* x: [256, 8, 8] -> [256, 64]
|
||||
* y: [256, 1, 1] -> [256, 1]
|
||||
*/
|
||||
for (int i = 0; i < inShape1.size(); i++) {
|
||||
/* check if axis `i` requires any broadcasting in tensor 1 */
|
||||
if (inShape1[i] == 1 && inShape2[i] != 1) {
|
||||
/* loop invariant: `i` is the first axis in the contiguous axis sequence */
|
||||
|
||||
int j = i + 1; /* `j` is the axis which we will attempt to merge */
|
||||
while (j < inShape1.size() && inShape1[j] == 1 && inShape2[j] != 1) {
|
||||
CV_Assert(outShape[j] == inShape2[j]);
|
||||
|
||||
/* `j` axis is also used fully; merge `i` and `j` */
|
||||
inShape1[i] = 1;
|
||||
inShape2[i] = inShape2[i] * inShape2[j];
|
||||
outShape[i] = inShape2[i];
|
||||
|
||||
/* delete axis `j` */
|
||||
inShape1.erase(std::begin(inShape1) + j);
|
||||
inShape2.erase(std::begin(inShape2) + j);
|
||||
outShape.erase(std::begin(outShape) + j);
|
||||
|
||||
/* optimizations should not break the invariants */
|
||||
CV_Assert(inShape1.size() == outShape.size());
|
||||
CV_Assert(inShape2.size() == outShape.size());
|
||||
CV_Assert(inShape1[i] == 1);
|
||||
CV_Assert(inShape2[i] == outShape[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/* check if axis `i` requires any broadcasting in tensor 2 */
|
||||
if (inShape1[i] != 1 && inShape2[i] == 1) {
|
||||
/* loop invariant: `i` is the first axis in the contiguous axis sequence */
|
||||
|
||||
int j = i + 1; /* `j` is the axis which we will attempt to merge */
|
||||
while (j < inShape1.size() && inShape1[j] != 1 && inShape2[j] == 1) {
|
||||
CV_Assert(outShape[j] == inShape1[j]);
|
||||
|
||||
/* `j` axis is also used fully; merge `i` and `j` */
|
||||
inShape1[i] = inShape1[i] * inShape1[j];
|
||||
inShape2[i] = 1;
|
||||
outShape[i] = inShape1[i];
|
||||
|
||||
/* delete axis `j` */
|
||||
inShape1.erase(std::begin(inShape1) + j);
|
||||
inShape2.erase(std::begin(inShape2) + j);
|
||||
outShape.erase(std::begin(outShape) + j);
|
||||
|
||||
/* optimizations should not break the invariants */
|
||||
CV_Assert(inShape1.size() == outShape.size());
|
||||
CV_Assert(inShape2.size() == outShape.size());
|
||||
CV_Assert(inShape1[i] == outShape[i]);
|
||||
CV_Assert(inShape2[i] == 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto rank = outShape.size();
|
||||
|
||||
std::vector<std::size_t> inStride1(rank), inStride2(rank), outStride(rank);
|
||||
inStride1.back() = 1;
|
||||
inStride2.back() = 1;
|
||||
outStride.back() = 1;
|
||||
/* garbage, ..., garbage, 1 */
|
||||
|
||||
std::copy(std::begin(inShape1) + 1, std::end(inShape1), std::begin(inStride1));
|
||||
std::copy(std::begin(inShape2) + 1, std::end(inShape2), std::begin(inStride2));
|
||||
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
|
||||
/* dim[0], dim[1], ..., dim[-1], 1 */
|
||||
|
||||
std::partial_sum(inStride1.rbegin(), inStride1.rend(), inStride1.rbegin(), std::multiplies<std::size_t>());
|
||||
std::partial_sum(inStride2.rbegin(), inStride2.rend(), inStride2.rbegin(), std::multiplies<std::size_t>());
|
||||
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
|
||||
/* stride[0], stride[1], ..., stride[-2], 1 */
|
||||
|
||||
std::vector<int> inBcast1(rank), inBcast2(rank);
|
||||
std::transform(std::begin(inShape1), std::end(inShape1), std::begin(inBcast1), [](std::size_t sz) { return sz == 1; });
|
||||
std::transform(std::begin(inShape2), std::end(inShape2), std::begin(inBcast2), [](std::size_t sz) { return sz == 1; });
|
||||
|
||||
CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
|
||||
eltwise_op_bcast_dispatcher<T, EltwiseOp, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, x, inStride1, inBcast1, y, inStride2, inBcast2, params);
|
||||
}
|
||||
}
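/* Host-side sketch (not part of OpenCV) of the stride/broadcast bookkeeping above: given the
 * already-squeezed, equal-rank output and input shapes, compute row-major strides and per-axis
 * broadcast flags; a broadcast axis contributes no offset in the kernel's index computation.
 * `BroadcastPlanSketch` and `broadcast_plan_sketch` are hypothetical names; assumes <vector>.
 */
struct BroadcastPlanSketch {
    std::vector<std::size_t> out_stride, in_stride;
    std::vector<int> in_bcast;
};

static BroadcastPlanSketch broadcast_plan_sketch(const std::vector<std::size_t>& out_shape,
                                                 const std::vector<std::size_t>& in_shape)
{
    const std::size_t rank = out_shape.size();
    BroadcastPlanSketch plan;
    plan.out_stride.assign(rank, 1);
    plan.in_stride.assign(rank, 1);
    plan.in_bcast.assign(rank, 0);

    for (std::size_t i = rank; i-- > 1; ) {      /* right-to-left running product of axis sizes */
        plan.out_stride[i - 1] = plan.out_stride[i] * out_shape[i];
        plan.in_stride[i - 1]  = plan.in_stride[i] * in_shape[i];
    }

    for (std::size_t i = 0; i < rank; i++)
        plan.in_bcast[i] = (in_shape[i] == 1);   /* broadcast axes are indexed at 0 by the kernel */

    return plan;
}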
|
||||
|
||||
template <class T>
|
||||
void eltwise_max_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
|
||||
eltwise_op<T, MaxFunctor<T>>(stream, output, x, y);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_min_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
|
||||
eltwise_op<T, MinFunctor<T>>(stream, output, x, y);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_sum_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
|
||||
eltwise_op<T, SumFunctor<T>>(stream, output, x, y);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_sum_coeff_2(const Stream& stream, TensorSpan<T> output, T coeff_x, TensorView<T> x, T coeff_y, TensorView<T> y) {
|
||||
eltwise_op<T, ScaledSumFunctor<T>>(stream, output, x, y, {coeff_x, coeff_y});
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_prod_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
|
||||
eltwise_op<T, ProductFunctor<T>>(stream, output, x, y);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void eltwise_div_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
|
||||
eltwise_op<T, DivFunctor<T>>(stream, output, x, y);
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void eltwise_div_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
|
||||
template void eltwise_prod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
|
||||
template void eltwise_sum_coeff_2(const Stream&, TensorSpan<__half>, __half, TensorView<__half>, __half, TensorView<__half>);
|
||||
template void eltwise_sum_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
|
||||
template void eltwise_max_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
|
||||
template void eltwise_min_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
|
||||
#endif
|
||||
template void eltwise_div_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
|
||||
template void eltwise_prod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
|
||||
template void eltwise_sum_coeff_2(const Stream&, TensorSpan<float>, float, TensorView<float>, float, TensorView<float>);
|
||||
template void eltwise_sum_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
|
||||
template void eltwise_max_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
|
||||
template void eltwise_min_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
81
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/execution.hpp
vendored
Normal file
@ -0,0 +1,81 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_EXECUTION_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA_EXECUTION_HPP
|
||||
|
||||
#include "../cuda4dnn/csl/error.hpp"
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
|
||||
|
||||
struct execution_policy {
|
||||
execution_policy(dim3 grid_size, dim3 block_size)
|
||||
: grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ 0 } { }
|
||||
|
||||
execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem)
|
||||
: grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ nullptr } { }
|
||||
|
||||
execution_policy(dim3 grid_size, dim3 block_size, const Stream& strm)
|
||||
: grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ strm.get() } { }
|
||||
|
||||
execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem, const Stream& strm)
|
||||
: grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ strm.get() } { }
|
||||
|
||||
dim3 grid;
|
||||
dim3 block;
|
||||
std::size_t sharedMem;
|
||||
cudaStream_t stream;
|
||||
};
|
||||
|
||||
/* this overload shouldn't be necessary; we should always provide a bound on the number of threads */
|
||||
/*
|
||||
template <class Kernel> inline
|
||||
execution_policy make_policy(Kernel kernel, std::size_t sharedMem = 0, const Stream& stream = 0) {
|
||||
int grid_size, block_size;
|
||||
CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem));
|
||||
return execution_policy(grid_size, block_size, sharedMem, stream);
|
||||
}*/
|
||||
|
||||
template <class Kernel> inline
|
||||
execution_policy make_policy(Kernel kernel, std::size_t max_threads, std::size_t sharedMem = 0, const Stream& stream = 0) {
|
||||
CV_Assert(max_threads > 0);
|
||||
|
||||
int grid_size = 0, block_size = 0;
|
||||
CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem));
|
||||
if (grid_size * block_size > max_threads) {
|
||||
grid_size = (max_threads + block_size - 1) / block_size;
|
||||
if (block_size > max_threads)
|
||||
block_size = max_threads;
|
||||
}
|
||||
|
||||
CV_Assert(grid_size >= 1 && block_size >= 1);
|
||||
return execution_policy(grid_size, block_size, sharedMem, stream);
|
||||
}
|
||||
|
||||
template <class Kernel, typename ...Args> inline
|
||||
void launch_kernel(Kernel kernel, Args ...args) {
|
||||
auto policy = make_policy(kernel);
|
||||
kernel <<<policy.grid, policy.block>>> (args...);
|
||||
}
|
||||
|
||||
template <class Kernel, typename ...Args> inline
|
||||
void launch_kernel(Kernel kernel, dim3 grid, dim3 block, Args ...args) {
|
||||
kernel <<<grid, block>>> (args...);
|
||||
}
|
||||
|
||||
template <class Kernel, typename ...Args> inline
|
||||
void launch_kernel(Kernel kernel, execution_policy policy, Args ...args) {
|
||||
kernel <<<policy.grid, policy.block, policy.sharedMem, policy.stream>>> (args...);
|
||||
}
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::csl */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_EXECUTION_HPP */
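// --- Usage sketch (illustrative only, not part of the vendored sources) ---
// `scale_kernel` below is a hypothetical kernel: make_policy sizes the launch via the
// occupancy API with an upper bound on the total number of threads, and launch_kernel
// forwards the resulting policy to the triple-chevron launch.
__global__ void scale_kernel(float* data, std::size_t n, float alpha) {
    for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
        data[i] *= alpha;
}

void scale(const cv::dnn::cuda4dnn::csl::Stream& stream, float* data, std::size_t n, float alpha) {
    using namespace cv::dnn::cuda4dnn::csl;
    auto kernel = scale_kernel;
    auto policy = make_policy(kernel, /* max_threads */ n, /* sharedMem */ 0, stream);
    launch_kernel(kernel, policy, data, n, alpha);
}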
|
98
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/fill_copy.cu
vendored
Normal file
@ -0,0 +1,98 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, std::size_t N>
|
||||
__global__ void fill_vec(Span<T> output, T value) {
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
|
||||
vector_type vec;
|
||||
for (int j = 0; j < vector_type::size(); j++)
|
||||
vec.data[j] = value;
|
||||
v_store(output_vPtr[i], vec);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t N>
|
||||
__global__ void copy_vec(Span<T> output, View<T> input) {
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
auto input_vPtr = vector_type::get_pointer(input.data());
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
|
||||
vector_type vec;
|
||||
v_load(vec, input_vPtr[i]);
|
||||
v_store(output_vPtr[i], vec);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t N> static
|
||||
void launch_vectorized_fill(const Stream& stream, Span<T> output, T value) {
|
||||
CV_Assert(is_fully_aligned<T>(output, N));
|
||||
|
||||
auto kernel = raw::fill_vec<T, N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, value);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void fill(const Stream& stream, Span<T> output, T value) {
|
||||
if (is_fully_aligned<T>(output, 4)) {
|
||||
launch_vectorized_fill<T, 4>(stream, output, value);
|
||||
} else if (is_fully_aligned<T>(output, 2)) {
|
||||
launch_vectorized_fill<T, 2>(stream, output, value);
|
||||
} else {
|
||||
launch_vectorized_fill<T, 1>(stream, output, value);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void fill(const Stream&, Span<__half>, __half);
|
||||
#endif
|
||||
template void fill(const Stream&, Span<float>, float);
|
||||
template void fill(const Stream&, Span<int>, int);
|
||||
|
||||
template <class T, std::size_t N> static
|
||||
void launch_vectorized_copy(const Stream& stream, Span<T> output, View<T> input) {
|
||||
CV_Assert(is_fully_aligned<T>(output, N));
|
||||
CV_Assert(is_fully_aligned<T>(input, N));
|
||||
|
||||
auto kernel = raw::copy_vec<T, N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, input);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void copy(const Stream& stream, Span<T> output, View<T> input) {
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
|
||||
launch_vectorized_copy<T, 4>(stream, output, input);
|
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
|
||||
launch_vectorized_copy<T, 2>(stream, output, input);
|
||||
} else {
|
||||
launch_vectorized_copy<T, 1>(stream, output, input);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void copy(const Stream&, Span<__half>, View<__half>);
|
||||
#endif
|
||||
template void copy(const Stream&, Span<float>, View<float>);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
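// --- Usage sketch (illustrative; assumes a csl::Stream and device buffers already wrapped
// in Span/View by the caller) ---
void reset_then_clone(const Stream& stream, Span<float> scratch, Span<float> dst, View<float> src) {
    cv::dnn::cuda4dnn::kernels::fill<float>(stream, scratch, 0.f);   /* scratch <- 0 */
    cv::dnn::cuda4dnn::kernels::copy<float>(stream, dst, src);       /* dst <- src */
}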
|
102
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/fp_conversion.cu
vendored
Normal file
@ -0,0 +1,102 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <std::size_t N>
|
||||
__global__ void fp32_to_fp16(Span<__half> output, View<float> input) {
|
||||
using output_vector_type = get_vector_type_t<__half, N>;
|
||||
using input_vector_type = get_vector_type_t<float, N>;
|
||||
|
||||
auto output_vPtr = output_vector_type::get_pointer(output.data());
|
||||
auto input_vPtr = input_vector_type::get_pointer(input.data());
|
||||
|
||||
for (auto i : grid_stride_range(output.size() / output_vector_type::size())) {
|
||||
input_vector_type in_vec;
|
||||
v_load(in_vec, input_vPtr[i]);
|
||||
|
||||
output_vector_type out_vec;
|
||||
for (int j = 0; j < output_vector_type::size(); j++)
|
||||
out_vec.data[j] = __float2half(in_vec.data[j]);
|
||||
|
||||
v_store(output_vPtr[i], out_vec);
|
||||
}
|
||||
}
|
||||
|
||||
template <std::size_t N>
|
||||
__global__ void fp16_to_fp32(Span<float> output, View<__half> input) {
|
||||
using output_vector_type = get_vector_type_t<float, N>;
|
||||
using input_vector_type = get_vector_type_t<__half, N>;
|
||||
|
||||
auto output_vPtr = output_vector_type::get_pointer(output.data());
|
||||
auto input_vPtr = input_vector_type::get_pointer(input.data());
|
||||
|
||||
for (auto i : grid_stride_range(output.size() / output_vector_type::size())) {
|
||||
input_vector_type in_vec;
|
||||
v_load(in_vec, input_vPtr[i]);
|
||||
|
||||
output_vector_type out_vec;
|
||||
for (int j = 0; j < output_vector_type::size(); j++)
|
||||
out_vec.data[j] = __half2float(in_vec.data[j]);
|
||||
|
||||
v_store(output_vPtr[i], out_vec);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <std::size_t N> static
|
||||
void launch_vectorized_fp32_to_fp16(const Stream& stream, Span<__half> output, View<float> input) {
|
||||
CV_Assert(is_fully_aligned<__half>(output, N));
|
||||
CV_Assert(is_fully_aligned<float>(input, N));
|
||||
|
||||
auto kernel = raw::fp32_to_fp16<N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, input);
|
||||
}
|
||||
|
||||
void fp32_to_fp16(const Stream& stream, Span<__half> output, View<float> input) {
|
||||
if (is_fully_aligned<__half>(output, 4) && is_fully_aligned<float>(input, 4)) {
|
||||
launch_vectorized_fp32_to_fp16<4>(stream, output, input);
|
||||
} else if (is_fully_aligned<__half>(output, 2) && is_fully_aligned<float>(input, 2)) {
|
||||
launch_vectorized_fp32_to_fp16<2>(stream, output, input);
|
||||
} else {
|
||||
launch_vectorized_fp32_to_fp16<1>(stream, output, input);
|
||||
}
|
||||
}
|
||||
|
||||
template <std::size_t N> static
|
||||
void launch_vectorized_fp16_to_fp32(const Stream& stream, Span<float> output, View<__half> input) {
|
||||
CV_Assert(is_fully_aligned<float>(output, N));
|
||||
CV_Assert(is_fully_aligned<__half>(input, N));
|
||||
|
||||
auto kernel = raw::fp16_to_fp32<N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, input);
|
||||
}
|
||||
|
||||
void fp16_to_fp32(const Stream& stream, Span<float> output, View<__half> input) {
|
||||
if (is_fully_aligned<float>(output, 4) && is_fully_aligned<__half>(input, 4)) {
|
||||
launch_vectorized_fp16_to_fp32<4>(stream, output, input);
|
||||
} else if (is_fully_aligned<float>(output, 2) && is_fully_aligned<__half>(input, 2)) {
|
||||
launch_vectorized_fp16_to_fp32<2>(stream, output, input);
|
||||
} else {
|
||||
launch_vectorized_fp16_to_fp32<1>(stream, output, input);
|
||||
}
|
||||
}
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
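// --- Usage sketch (illustrative; buffer setup assumed) --- the two entry points above can be
// paired when activations move between fp32 and fp16 storage:
void demote_then_promote(const Stream& stream,
                         Span<__half> half_out, View<float> float_in,
                         Span<float> float_out, View<__half> half_in)
{
    cv::dnn::cuda4dnn::kernels::fp32_to_fp16(stream, half_out, float_in);
    cv::dnn::cuda4dnn::kernels::fp16_to_fp32(stream, float_out, half_in);
}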
|
334
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/functors.hpp
vendored
Normal file
@ -0,0 +1,334 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include "math.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/nvcc_defs.hpp"
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
template <class T>
|
||||
struct IdentityFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE IdentityFunctor() { }
|
||||
CUDA4DNN_DEVICE IdentityFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
return value;
|
||||
};
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct ReLUFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() : slope(0) { }
|
||||
CUDA4DNN_HOST_DEVICE Params(T slope_) : slope(slope_) { }
|
||||
T slope;
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE ReLUFunctor() : ReLUFunctor(Params{}) { }
|
||||
CUDA4DNN_DEVICE ReLUFunctor(const Params& params) : slope(params.slope) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::log1pexp;
|
||||
return value >= T(0) ? value : slope * value;
|
||||
}
|
||||
|
||||
T slope;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct ClippedReLUFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() : floor(0), ceiling(6) { }
|
||||
CUDA4DNN_HOST_DEVICE Params(T floor_, T ceiling_) : floor(floor_), ceiling(ceiling_) { }
|
||||
T floor, ceiling;
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE ClippedReLUFunctor() : ClippedReLUFunctor(Params{}) { }
|
||||
CUDA4DNN_DEVICE ClippedReLUFunctor(const Params& params) : floor{params.floor}, ceiling{params.ceiling} { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::clamp;
|
||||
return clamp(value, floor, ceiling);
|
||||
}
|
||||
|
||||
T floor, ceiling;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct TanHFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE TanHFunctor() { }
|
||||
CUDA4DNN_DEVICE TanHFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::tanh;
|
||||
return tanh(value);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct SwishFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE SwishFunctor() { }
|
||||
CUDA4DNN_DEVICE SwishFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
// f(x) = x * sigmoid(x)
|
||||
using csl::device::fast_divide;
|
||||
using csl::device::fast_exp;
|
||||
return fast_divide(value, static_cast<T>(1) + fast_exp(-value));
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct MishFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE MishFunctor() { }
|
||||
CUDA4DNN_DEVICE MishFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::tanh;
|
||||
using csl::device::log1pexp;
|
||||
return value * tanh(log1pexp(value));
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct MishFunctor<float> {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE MishFunctor() { }
|
||||
CUDA4DNN_DEVICE MishFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE float operator()(float value) {
|
||||
// f(x) = x * tanh(log1pexp(x));
|
||||
using csl::device::fast_divide;
|
||||
using csl::device::fast_exp;
|
||||
|
||||
auto e = fast_exp(value);
|
||||
auto n = e * e + 2 * e;
|
||||
if (value <= -0.6f)
|
||||
return value * fast_divide(n, n + 2);
|
||||
return value - 2 * fast_divide(value, n + 2);
|
||||
}
|
||||
};
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <>
|
||||
struct MishFunctor<__half> {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE MishFunctor() { }
|
||||
CUDA4DNN_DEVICE MishFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE __half operator()(__half value) {
|
||||
return MishFunctor<float>()(value);
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
template <class T>
|
||||
struct SigmoidFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE SigmoidFunctor() { }
|
||||
CUDA4DNN_DEVICE SigmoidFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::fast_sigmoid;
|
||||
return fast_sigmoid(value);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct ELUFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE ELUFunctor() { }
|
||||
CUDA4DNN_DEVICE ELUFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::expm1;
|
||||
return value >= T(0) ? value : expm1(value);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct AbsFunctor {
|
||||
struct Params { };
|
||||
|
||||
CUDA4DNN_DEVICE AbsFunctor() { }
|
||||
CUDA4DNN_DEVICE AbsFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::abs;
|
||||
return abs(value);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct BNLLFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE BNLLFunctor() { }
|
||||
CUDA4DNN_DEVICE BNLLFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::log1pexp;
|
||||
return value > T(0) ? value + log1pexp(-value) : log1pexp(value);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct PowerFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() : exp(1), scale(1), shift(0) { }
|
||||
CUDA4DNN_HOST_DEVICE Params(T exp_, T scale_, T shift_) : exp(exp_), scale(scale_), shift(shift_) { }
|
||||
T exp, scale, shift;
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE PowerFunctor() : PowerFunctor(Params{}) { }
|
||||
CUDA4DNN_DEVICE PowerFunctor(const Params& params) : exp{params.exp}, scale{params.scale}, shift{params.shift} { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::pow;
|
||||
return pow(shift + scale * value, exp);
|
||||
}
|
||||
|
||||
T exp, scale, shift;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct ExpFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() : normScale(1), normShift(0) { }
|
||||
CUDA4DNN_HOST_DEVICE Params(T nScale_, T nShift_) : normScale(nScale_), normShift(nShift_) { }
|
||||
T normScale, normShift;
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE ExpFunctor() : ExpFunctor(Params{}) { }
|
||||
CUDA4DNN_DEVICE ExpFunctor(const Params& params) : normScale{params.normScale}, normShift{params.normShift} { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T value) {
|
||||
using csl::device::fast_exp;
|
||||
return fast_exp(normShift + normScale * value);
|
||||
}
|
||||
|
||||
T normScale, normShift;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct MaxFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE MaxFunctor() { }
|
||||
CUDA4DNN_DEVICE MaxFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T x, T y) {
|
||||
using csl::device::max;
|
||||
return max(x, y);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct MinFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE MinFunctor() { }
|
||||
CUDA4DNN_DEVICE MinFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T x, T y) {
|
||||
using csl::device::min;
|
||||
return min(x, y);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct SumFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE SumFunctor() { }
|
||||
CUDA4DNN_DEVICE SumFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T x, T y) { return x + y; }
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct ScaledSumFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() : scale_x(1), scale_y(1) { }
|
||||
CUDA4DNN_HOST_DEVICE Params(T scale_x_, T scale_y_) : scale_x(scale_x_), scale_y(scale_y_) { }
|
||||
T scale_x, scale_y;
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE ScaledSumFunctor() : scale_x(1), scale_y(1) { }
|
||||
CUDA4DNN_DEVICE ScaledSumFunctor(const Params& params) : scale_x{params.scale_x}, scale_y{params.scale_y} { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T x, T y) { return scale_x * x + scale_y * y; }
|
||||
|
||||
T scale_x, scale_y;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct ProductFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE ProductFunctor() { }
|
||||
CUDA4DNN_DEVICE ProductFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T x, T y) { return x * y; }
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct DivFunctor {
|
||||
struct Params {
|
||||
CUDA4DNN_HOST_DEVICE Params() { }
|
||||
};
|
||||
|
||||
CUDA4DNN_DEVICE DivFunctor() { }
|
||||
CUDA4DNN_DEVICE DivFunctor(const Params& params) { }
|
||||
|
||||
CUDA4DNN_DEVICE T operator()(T x, T y) { return x / y; }
|
||||
};
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP */
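// --- Usage sketch (illustrative) --- the generic kernel below is hypothetical, but the
// activation kernels in this module consume these functors in a similar way: construct the
// functor on the device from its Params and apply it per element.
template <class T, class ActivationOp>
__global__ void apply_unary(T* output, const T* input, std::size_t n, typename ActivationOp::Params params) {
    ActivationOp op(params);
    for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
        output[i] = op(input[i]);
}

// e.g. leaky ReLU with slope 0.1:
//   apply_unary<float, ReLUFunctor<float>><<<blocks, threads>>>(out, in, n, ReLUFunctor<float>::Params(0.1f));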
|
467
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/grid_nms.cu
vendored
Normal file
@ -0,0 +1,467 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "math.hpp"
|
||||
#include "bbox_utils.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "block_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
#include "memory.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
|
||||
template <class T, bool NORMALIZED_BBOX, int BLOCK_SIZE>
|
||||
__launch_bounds__(BLOCK_SIZE)
|
||||
__global__ void grid_nms(Span<unsigned int> mask_, Span<int> count_, View<T> bboxes_, size_type num_classes, index_type background_class_id, size_type topK, size_type topK_gs, float nms_threshold)
|
||||
{
|
||||
// topK_gs is topK rounded upwards to a multiple of the group size (GROUP_SIZE)
|
||||
|
||||
// mask: [batch_size, num_classes, topK_gs, topK_gs / 32]
|
||||
// bboxes: [batch_size, num_classes, topK, 4]
|
||||
// count: [batch_size, num_classes]
|
||||
|
||||
const index_type c = blockIdx.y;
|
||||
const index_type b = blockIdx.z;
|
||||
|
||||
if (c == background_class_id)
|
||||
return;
|
||||
|
||||
auto mask = mask_.data() + (b * num_classes + c) * topK_gs * topK_gs / 32;
|
||||
auto bboxes = bboxes_.data() + (b * num_classes + c) * topK * 4;
|
||||
auto count = count_.data() + b * num_classes + c;
|
||||
|
||||
const auto boxes = *count;
|
||||
if (boxes == 0)
|
||||
return;
|
||||
|
||||
/* We divide the set of boxes into groups containing BLOCK_SIZE boxes */
|
||||
const auto num_groups = (boxes + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||
|
||||
/* We need to calculate IOUs for every pair of boxes. We can generalize and say that
|
||||
* we need to compute IOUs of every group with every other group including itself.
|
||||
*/
|
||||
// Each block processes a pair of groups.
|
||||
const index_type group_i = blockIdx.x % num_groups;
|
||||
const index_type group_j = blockIdx.x / num_groups;
|
||||
|
||||
/* we use __syncthreads() later but note that the following condition will cause all threads
|
||||
* in the block to exit; hence, no thread will execute a divergent __syncthreads()
|
||||
*/
|
||||
if (group_i >= num_groups || group_j >= num_groups)
|
||||
return;
|
||||
|
||||
/* Note that IOU(A, B) = IOU(B, A). Hence, if we compute IOU(GROUP_A, GROUP_B), we do not need
|
||||
* to compute IOU(GROUP_B, GROUP_A). We still have to compute IOU(GROUP_A, GROUP_A) though since
|
||||
* each group has many boxes and we need IOUs amongst boxes within a group.
|
||||
*
|
||||
* We arbitrarily choose a scheme to exit: exit if group_i is greater than group_j. This way we only
|
||||
* compute IOUs between groups once. While nearly half the blocks are wasted, it's ok since they exit
|
||||
* early on and the working blocks are compute heavy.
|
||||
*/
|
||||
if (group_i > group_j)
|
||||
return;
|
||||
|
||||
/* the following variables contain the absolute box number of the first box of their respective groups */
|
||||
const auto group_i_offset = group_i * BLOCK_SIZE;
|
||||
const auto group_j_offset = group_j * BLOCK_SIZE;
|
||||
|
||||
/* MAIN LOOP LOGIC:
|
||||
* We compare a box `i` from group_i with all boxes in group_j in each iteration. The box `j` is fixed
|
||||
* for each thread. The `j` exactly maps to the thread index. Hence, the `j` is a loop invariant. Each
|
||||
* thread of the block computes the overlap between box `i` and its box `j`.
|
||||
*
|
||||
* for (int i = 0; i < BLOCK_SIZE; i++)
|
||||
* {
|
||||
* // i = box 1
|
||||
* // j = threadIdx.x = box 2
|
||||
* }
|
||||
*/
|
||||
|
||||
/* The `j` box is fixed for each thread. All `i` boxes will be required for every thread.
|
||||
* We store the `i` boxes in shared memory to allow global memory coalescing.
|
||||
*/
|
||||
using vector_type = get_vector_type_t<T, 4>;
|
||||
__shared__ vector_type group_i_boxes[BLOCK_SIZE];
|
||||
|
||||
/* We will precompute the sizes of `i` boxes in the code where we load them. The size computation
|
||||
* is distributed across the block. Otherwise, all threads will have to compute the size of the same
|
||||
* box simultaneously in the main loop. The size is computed while the memory subsystem is busy
|
||||
* servicing requests for box coordinates; the compute resources would otherwise be idle in this phase.
|
||||
*/
|
||||
/* we store the size as a float since the size can exceed fp16 limits for unnormalized boxes */
|
||||
__shared__ float group_i_size[BLOCK_SIZE];
|
||||
|
||||
const auto bboxes_vPtr = vector_type::get_pointer(bboxes);
|
||||
|
||||
// load `i` boxes and precompute their sizes
|
||||
{
|
||||
int i = threadIdx.x;
|
||||
if (group_i_offset + i < boxes)
|
||||
{
|
||||
vector_type box;
|
||||
v_load(box, bboxes_vPtr[group_i_offset + i]);
|
||||
v_store(group_i_boxes[i], box);
|
||||
|
||||
BoundingBox bbox;
|
||||
bbox.xmin = box.data[0];
|
||||
bbox.ymin = box.data[1];
|
||||
bbox.xmax = box.data[2];
|
||||
bbox.ymax = box.data[3];
|
||||
|
||||
group_i_size[i] = compute_bbox_size<NORMALIZED_BBOX>(bbox);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
/* We compute overlap between boxes and check if the IOU exceeds the nms threshold.
|
||||
* We store the result (exceeds or is below nms_threshold) in a two-dimensional matrix.
|
||||
* (i, j) is set to one if the overlap between i and j is within the nms threshold.
|
||||
* We pack 32 results into one 32-bit integer. The effective memory layout of the
|
||||
* matrix hence is (BLOCK_SIZE, BLOCK_SIZE / 32).
|
||||
*/
|
||||
__shared__ unsigned int mask_shared[BLOCK_SIZE * BLOCK_SIZE / 32];
|
||||
|
||||
// load box `j` and precompute its size (fixed per thread)
|
||||
BoundingBox bbox_j;
|
||||
float bbox_j_size = 0;
|
||||
if (group_j_offset + threadIdx.x < boxes)
|
||||
{
|
||||
vector_type box;
|
||||
v_load(box, bboxes_vPtr[group_j_offset + threadIdx.x]);
|
||||
|
||||
bbox_j.xmin = box.data[0];
|
||||
bbox_j.ymin = box.data[1];
|
||||
bbox_j.xmax = box.data[2];
|
||||
bbox_j.ymax = box.data[3];
|
||||
|
||||
bbox_j_size = compute_bbox_size<NORMALIZED_BBOX>(bbox_j);
|
||||
}
|
||||
|
||||
/* Each thread computes a predicate which is broadcasted across the warp to obtain a 32-bit mask.
|
||||
* The lane zero thread of each warp saves the mask. We store the offset to the mask array beforehand
|
||||
* to save cycles in the compute-intensive main loop.
|
||||
*/
|
||||
auto mask_offset = threadIdx.x / 32;
|
||||
|
||||
/* The main loop is compute intensive and causes the kernel to be overall compute-bound. Hence,
|
||||
* this loop has been highly tuned. Please profile and verify carefully before making changes.
|
||||
*/
|
||||
/* UNROLL_SIZE is the number of boxes that must be processed per iteration. We manually unroll
|
||||
* the loop since the compiler cannot effectively unroll on its own, presumably due to the presence
|
||||
* of instructions forcing warp synchronization.
|
||||
*/
|
||||
constexpr int UNROLL_SIZE = 4;
|
||||
|
||||
#pragma unroll 8
|
||||
for (int s = 0; s < BLOCK_SIZE; s += UNROLL_SIZE)
|
||||
{
|
||||
bool do_not_reject_j[UNROLL_SIZE];
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < UNROLL_SIZE; k++)
|
||||
{
|
||||
int i = s + k;
|
||||
|
||||
/* The number of boxes need not necessarily be a multiple of BLOCK_SIZE.
|
||||
* However, the shared memory allocated can hold BLOCK_SIZE boxes from
|
||||
* each group. Accessing the undefined regions of shared memory is
|
||||
* a valid memory operation as long as the memory has been allocated.
|
||||
*
|
||||
* The condition below is only required when one of the groups is not
* fully filled with valid boxes. Such situations are relatively rare. It's
|
||||
* more common to see both groups completely filled.
|
||||
*
|
||||
* We comment out this condition to improve the performance of the common case.
|
||||
* This leads to a net improvement.
|
||||
*/
|
||||
// if (group_i_offset + i < boxes && group_j_offset + threadIdx.x < boxes)
|
||||
{
|
||||
BoundingBox bbox_i;
|
||||
float bbox_i_size;
|
||||
{
|
||||
vector_type box;
|
||||
v_load(box, group_i_boxes[i]);
|
||||
bbox_i.xmin = box.data[0];
|
||||
bbox_i.ymin = box.data[1];
|
||||
bbox_i.xmax = box.data[2];
|
||||
bbox_i.ymax = box.data[3];
|
||||
|
||||
bbox_i_size = group_i_size[i];
|
||||
}
|
||||
|
||||
using device::min;
|
||||
using device::max;
|
||||
|
||||
BoundingBox intersect_bbox;
|
||||
intersect_bbox.xmin = max(bbox_i.xmin, bbox_j.xmin);
|
||||
intersect_bbox.ymin = max(bbox_i.ymin, bbox_j.ymin);
|
||||
intersect_bbox.xmax = min(bbox_i.xmax, bbox_j.xmax);
|
||||
intersect_bbox.ymax = min(bbox_i.ymax, bbox_j.ymax);
|
||||
|
||||
float intersect_size = compute_bbox_size<NORMALIZED_BBOX>(intersect_bbox);
|
||||
|
||||
using device::fast_divide_ftz;
|
||||
float iou = fast_divide_ftz(intersect_size, bbox_i_size + bbox_j_size - intersect_size);
|
||||
do_not_reject_j[k] = iou <= nms_threshold;
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < UNROLL_SIZE; k++)
|
||||
{
|
||||
// FORWARD_COMPATIBILITY_TAG: WARP_SIZE_DEPENDENT_CODE
|
||||
auto predicate = __ballot_sync(0xFFFFFFFF, do_not_reject_j[k]);
|
||||
if (threadIdx.x % 32 == 0)
|
||||
mask_shared[mask_offset] = predicate;
|
||||
|
||||
/* The following operation should logically be inside the previous if branch. Note that `mask_offset`
|
||||
* is only used by lane zero threads. Hence, there is no harm in executing it in other threads as it is
|
||||
* unused there.
|
||||
*
|
||||
* Keeping it inside prevents the compiler from treating it as a constexpr addition to the address in
|
||||
* successive unrolled iterations. A register is used and instructions are emitted to multiply the
|
||||
* addend by four to obtain the byte offset. Pulling it out of the branch makes the compiler do constexpr
|
||||
* addition on the address in successive unrolled iterations.
|
||||
*/
|
||||
mask_offset += BLOCK_SIZE / 32;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
/* The mask data is organized as a two-dimensional bit matrix of size topK_gs * topK_gs.
|
||||
* (i, j) is set to true if the overlap between `i` and `j` is within the nms threshold.
|
||||
* We pack 32 results into one 32-bit integer. So the effective memory layout is topK_gs * topK_gs / 32.
|
||||
*/
|
||||
|
||||
/* Each box `i` was compared with BLOCK_SIZE `j` boxes. This amounts to BLOCK_SIZE / 32
|
||||
* 32-bit integers per box `i`.
|
||||
*/
|
||||
using mask_vector_type = get_vector_type_t<unsigned int, BLOCK_SIZE / 32>;
|
||||
|
||||
const int i = threadIdx.x;
|
||||
|
||||
auto mask_shared_vPtr = mask_vector_type::get_pointer(DevicePtr<unsigned>(mask_shared));
|
||||
mask_vector_type temp;
|
||||
v_load(temp, mask_shared_vPtr[i]);
|
||||
for (int i = 0; i < mask_vector_type::size(); i++)
|
||||
temp.data[i] = __brev(temp.data[i]);
|
||||
|
||||
auto mask_vPtr = mask_vector_type::get_pointer(mask);
|
||||
v_store(mask_vPtr[((group_i_offset + i) * topK_gs + group_j_offset) / 32 / mask_vector_type::size()], temp);
|
||||
}
|
||||
|
||||
template <int ITEMS_PER_THREAD, int BLOCK_SIZE>
|
||||
__launch_bounds__(BLOCK_SIZE)
|
||||
__global__ void grid_nms_collect(Span<int> indices_, Span<int> count_, View<unsigned int> mask_, size_type num_classes, index_type background_class_id, size_type topK, size_type topK_gs_by32)
|
||||
{
|
||||
const index_type c = blockIdx.x;
|
||||
if (c == background_class_id)
|
||||
return;
|
||||
|
||||
const index_type b = blockIdx.y;
|
||||
|
||||
// topK_gs is topK rounded upwards to a multiple of the group size (GROUP_SIZE)
|
||||
|
||||
// indices: [batch_size, num_classes, topK]
|
||||
// count: [batch_size, num_classes]
|
||||
// mask: [batch_size, num_classes, topK_gs, topK_gs / 32]
|
||||
|
||||
auto indices = indices_.data() + (b * num_classes + c) * topK;
|
||||
auto count = count_.data() + (b * num_classes + c);
|
||||
auto mask = mask_.data() + (b * num_classes + c) * topK_gs_by32 * 32 * topK_gs_by32;
|
||||
|
||||
const auto boxes = *count;
|
||||
if (boxes == 0)
|
||||
return;
|
||||
|
||||
/* We have a fixed number of threads and an arbitrary number of boxes. We use an array of
|
||||
* bits to store which boxes haven't been eliminated and which are still active. We organize
|
||||
* the array of bits into a matrix of bits of the shape (num_rows, BLOCK_SIZE, 32) which
|
||||
* is equivalent to (num_rows, BLOCK_SIZE) where the type is a 32-bit unsigned integer.
|
||||
* `num_rows` is the minimum number of rows required to cover all the boxes.
|
||||
*
|
||||
* Each thread handles a specific column in the matrix. To improve performance, we process
|
||||
* `ITEMS_PER_THREAD` number of elements per thread. This changes the shape to (num_rows,
|
||||
* ROW_WIDTH) where ROW_WIDTH is BLOCK_SIZE * ITEMS_PER_THREAD.
|
||||
*/
|
||||
constexpr int ROW_WIDTH = BLOCK_SIZE * ITEMS_PER_THREAD;
|
||||
|
||||
const index_type num_32b_masks = static_cast<unsigned>(boxes + 31) / 32;
|
||||
const index_type num_rows = static_cast<unsigned>(num_32b_masks + ROW_WIDTH - 1) / ROW_WIDTH;
|
||||
|
||||
extern __shared__ unsigned int active_boxes[]; // the matrix described earlier
|
||||
|
||||
#pragma unroll 1
|
||||
for (auto idx : block_stride_range<BLOCK_SIZE>(num_32b_masks))
|
||||
active_boxes[idx] = (idx == num_32b_masks - 1) ? __brev((1u << (boxes % 32)) - 1) : 0xFFFFFFFF;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
using vector_type = get_vector_type_t<unsigned int, ITEMS_PER_THREAD>;
|
||||
auto mask_vPtr = vector_type::get_pointer(mask);
|
||||
auto shared_vPtr = vector_type::get_pointer(DevicePtr<unsigned>(active_boxes));
|
||||
|
||||
int index_temp;
|
||||
int thread0_count = 0;
|
||||
int thread_id = threadIdx.x;
|
||||
|
||||
for (int step = 0; step < num_32b_masks; step++)
|
||||
{
|
||||
auto current_active = active_boxes[step];
|
||||
while (current_active)
|
||||
{
|
||||
const index_type bit = __clz(current_active);
|
||||
const index_type i = step * 32 + bit;
|
||||
|
||||
const int mask_offset = static_cast<unsigned>(i * topK_gs_by32) / ITEMS_PER_THREAD;
|
||||
|
||||
/* We fetch the index from memory and store it in a register. We will not use it until
|
||||
* much later. This helps avoid a long scoreboard stall.
|
||||
*/
|
||||
if (thread_id == 0)
|
||||
index_temp = indices[i];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
active_boxes[step] = current_active ^ (0x80000000 >> bit);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll 1
|
||||
for (int r = 0; r < num_rows; r++)
|
||||
{
|
||||
const int idx = r * BLOCK_SIZE + thread_id;
|
||||
if ((step & ~(ITEMS_PER_THREAD - 1)) <= idx * ITEMS_PER_THREAD && idx * ITEMS_PER_THREAD < num_32b_masks)
|
||||
{
|
||||
auto active_boxes_vec = shared_vPtr[idx];
|
||||
auto mask_vec = mask_vPtr[mask_offset + idx];
|
||||
for (int i = 0; i < vector_type::size(); i++)
|
||||
active_boxes_vec.data[i] &= mask_vec.data[i];
|
||||
shared_vPtr[idx] = active_boxes_vec;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (thread_id == 0)
|
||||
{
|
||||
indices[thread0_count] = index_temp;
|
||||
thread0_count++;
|
||||
}
|
||||
|
||||
current_active = active_boxes[step];
|
||||
}
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
*count = thread0_count;
|
||||
}
|
||||
}
|
||||
|
||||
constexpr int GROUP_SIZE = 128;
|
||||
|
||||
static std::size_t getAlignedTopK(std::size_t topK)
|
||||
{
|
||||
auto remainder = topK % GROUP_SIZE;
|
||||
if (remainder == 0)
|
||||
return topK;
|
||||
return topK + (GROUP_SIZE - remainder);
|
||||
}
|
||||
|
||||
std::size_t getGridNMSWorkspaceSizePerBatchItem(std::size_t num_classes, std::size_t classwise_topK)
|
||||
{
|
||||
auto topK_gs = getAlignedTopK(classwise_topK);
|
||||
return num_classes * topK_gs * topK_gs / 32 * sizeof(unsigned int);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<T> bboxes, int background_class_id, bool normalized_bbox, float nms_threshold)
|
||||
{
|
||||
// workspace: [batch_size, num_classes, topK_gs, topK_gs / 32]
|
||||
// indices: [batch_size, num_classes, topK]
|
||||
// count: [batch_size, num_classes]
|
||||
// bboxes: [batch_size, num_classes, topK, 4] (only first count[b][c] boxes are read)
|
||||
|
||||
const auto batch_size = indices.get_axis_size(0);
|
||||
CV_Assert(count.get_axis_size(0) == batch_size);
|
||||
CV_Assert(bboxes.get_axis_size(0) == batch_size);
|
||||
|
||||
const auto num_classes = indices.get_axis_size(1);
|
||||
CV_Assert(count.get_axis_size(1) == num_classes);
|
||||
CV_Assert(bboxes.get_axis_size(1) == num_classes);
|
||||
|
||||
const auto topK = indices.get_axis_size(2);
|
||||
CV_Assert(bboxes.get_axis_size(2) == topK);
|
||||
|
||||
CV_Assert(bboxes.get_axis_size(3) == 4);
|
||||
|
||||
const auto topK_gs = getAlignedTopK(topK);
|
||||
CV_Assert(workspace.size() >= topK_gs * topK_gs / 32);
|
||||
|
||||
const auto boxes = topK;
|
||||
const auto num_groups = (boxes + GROUP_SIZE - 1) / GROUP_SIZE;
|
||||
|
||||
{
|
||||
// grid = (num_groups * num_groups, num_classes, batch_size)
|
||||
// if the background class is the last class, we can reduce grid y dim by one
|
||||
auto grid_num_classes = num_classes; //(background_class_id == num_classes - 1) ? num_classes - 1 : num_classes;
|
||||
|
||||
constexpr int BLOCK_SIZE = GROUP_SIZE;
|
||||
|
||||
dim3 grid_size(num_groups * num_groups, grid_num_classes, batch_size);
|
||||
dim3 block_size(BLOCK_SIZE);
|
||||
auto policy = execution_policy(grid_size, block_size, stream);
|
||||
|
||||
if (normalized_bbox)
|
||||
{
|
||||
auto kernel = raw::grid_nms<T, true, BLOCK_SIZE>;
|
||||
launch_kernel(kernel, policy, workspace, count, bboxes, num_classes, background_class_id, topK, topK_gs, nms_threshold);
|
||||
}
|
||||
else
|
||||
{
|
||||
auto kernel = raw::grid_nms<T, false, BLOCK_SIZE>;
|
||||
launch_kernel(kernel, policy, workspace, count, bboxes, num_classes, background_class_id, topK, topK_gs, nms_threshold);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
// grid = (num_classes, batch_size)
|
||||
// if the background class is the last class, we can reduce grid x dim by one
|
||||
auto grid_num_classes = num_classes; //(background_class_id == num_classes - 1) ? num_classes - 1 : num_classes;
|
||||
|
||||
constexpr int BLOCK_SIZE = 64;
|
||||
|
||||
constexpr int ITEMS_PER_THREAD = 4;
|
||||
auto kernel = raw::grid_nms_collect<ITEMS_PER_THREAD, BLOCK_SIZE>;
|
||||
|
||||
dim3 grid_size(grid_num_classes, batch_size);
|
||||
|
||||
auto sharedMem = topK_gs / 32 * 4;
|
||||
auto policy = execution_policy(grid_size, BLOCK_SIZE, sharedMem, stream);
|
||||
launch_kernel(kernel, policy, indices, count, workspace, num_classes, background_class_id, topK, topK_gs / 32);
|
||||
}
|
||||
}
|
||||
|
||||
std::size_t getGridNMSWorkspaceSizePerBatchItem(std::size_t num_classes, std::size_t classwise_topK);
|
||||
|
||||
template void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<__half> bboxes, int, bool normalized_bbox, float nms_threshold);
|
||||
template void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<float> bboxes, int, bool normalized_bbox, float nms_threshold);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
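// Worked example of the workspace sizing above (illustrative): with GROUP_SIZE = 128, a
// classwise_topK of 200 is aligned up to topK_gs = 256, so each (batch item, class) pair
// needs 256 * 256 / 32 = 2048 mask words, i.e. num_classes * 8192 bytes per batch item.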
|
68
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/grid_stride_range.hpp
vendored
Normal file
@ -0,0 +1,68 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP
|
||||
|
||||
#include "types.hpp"
|
||||
#include "index_helpers.hpp"
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
|
||||
|
||||
template <int dim, class index_type = device::index_type, class size_type = device::size_type>
|
||||
class grid_stride_range_generic {
|
||||
public:
|
||||
__device__ grid_stride_range_generic(index_type to_) : from(0), to(to_) { }
|
||||
__device__ grid_stride_range_generic(index_type from_, index_type to_) : from(from_), to(to_) { }
|
||||
|
||||
class iterator
|
||||
{
|
||||
public:
|
||||
__device__ iterator(index_type pos_) : pos(pos_) {}
|
||||
|
||||
/* these iterators return the index when dereferenced; this allows us to loop
|
||||
* through the indices using a range-based for loop
|
||||
*/
|
||||
__device__ index_type operator*() const { return pos; }
|
||||
|
||||
__device__ iterator& operator++() {
|
||||
pos += getGridDim<dim>() * static_cast<index_type>(getBlockDim<dim>());
|
||||
return *this;
|
||||
}
|
||||
|
||||
__device__ bool operator!=(const iterator& other) const {
|
||||
/* NOTE HACK
|
||||
* 'pos' can move in large steps (see operator++)
|
||||
* expansion of the range-based for loop uses != as the loop condition
|
||||
* => operator!= must return false if 'pos' crosses the end
|
||||
*/
|
||||
return pos < other.pos;
|
||||
}
|
||||
|
||||
private:
|
||||
index_type pos;
|
||||
};
|
||||
|
||||
__device__ iterator begin() const {
|
||||
return iterator(from + getBlockDim<dim>() * getBlockIdx<dim>() + getThreadIdx<dim>());
|
||||
}
|
||||
|
||||
__device__ iterator end() const {
|
||||
return iterator(to);
|
||||
}
|
||||
|
||||
private:
|
||||
index_type from, to;
|
||||
};
|
||||
|
||||
using grid_stride_range_x = grid_stride_range_generic<0>;
|
||||
using grid_stride_range_y = grid_stride_range_generic<1>;
|
||||
using grid_stride_range_z = grid_stride_range_generic<2>;
|
||||
using grid_stride_range = grid_stride_range_x;
|
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP */
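// --- Usage sketch (hypothetical kernel, illustrative only) --- the range-based for loop
// visits every index in [0, n) exactly once across the whole grid, independent of the
// launch configuration.
__global__ void add_one(float* data, cv::dnn::cuda4dnn::csl::device::size_type n) {
    using namespace cv::dnn::cuda4dnn::csl::device;
    for (auto i : grid_stride_range(n))
        data[i] += 1.f;
}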
|
41
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/index_helpers.hpp
vendored
Normal file
@ -0,0 +1,41 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_INDEX_HELPERS_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA_INDEX_HELPERS_HPP
|
||||
|
||||
#include "types.hpp"
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
|
||||
|
||||
namespace detail {
|
||||
using dim3_member_type = decltype(dim3::x);
|
||||
using uint3_member_type = decltype(uint3::x);
|
||||
}
|
||||
|
||||
template <int> __device__ detail::dim3_member_type getGridDim();
|
||||
template <> inline __device__ detail::dim3_member_type getGridDim<0>() { return gridDim.x; }
|
||||
template <> inline __device__ detail::dim3_member_type getGridDim<1>() { return gridDim.y; }
|
||||
template <> inline __device__ detail::dim3_member_type getGridDim<2>() { return gridDim.z; }
|
||||
|
||||
template <int> __device__ detail::dim3_member_type getBlockDim();
|
||||
template <> inline __device__ detail::dim3_member_type getBlockDim<0>() { return blockDim.x; }
|
||||
template <> inline __device__ detail::dim3_member_type getBlockDim<1>() { return blockDim.y; }
|
||||
template <> inline __device__ detail::dim3_member_type getBlockDim<2>() { return blockDim.z; }
|
||||
|
||||
template <int> __device__ detail::uint3_member_type getBlockIdx();
|
||||
template <> inline __device__ detail::uint3_member_type getBlockIdx<0>() { return blockIdx.x; }
|
||||
template <> inline __device__ detail::uint3_member_type getBlockIdx<1>() { return blockIdx.y; }
|
||||
template <> inline __device__ detail::uint3_member_type getBlockIdx<2>() { return blockIdx.z; }
|
||||
|
||||
template <int> __device__ detail::uint3_member_type getThreadIdx();
|
||||
template <> inline __device__ detail::uint3_member_type getThreadIdx<0>() { return threadIdx.x; }
|
||||
template <> inline __device__ detail::uint3_member_type getThreadIdx<1>() { return threadIdx.y; }
|
||||
template <> inline __device__ detail::uint3_member_type getThreadIdx<2>() { return threadIdx.z; }
|
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_INDEX_HELPERS_HPP */
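// --- Illustrative only --- the helpers compose into a dimension-generic global index, which
// is how grid_stride_range derives the starting position of each thread (hypothetical helper):
template <int dim>
__device__ inline cv::dnn::cuda4dnn::csl::device::index_type global_index() {
    using namespace cv::dnn::cuda4dnn::csl::device;
    return getBlockDim<dim>() * getBlockIdx<dim>() + getThreadIdx<dim>();
}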
|
94
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/kernel_dispatcher.hpp
vendored
Normal file
@ -0,0 +1,94 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP
|
||||
|
||||
#include <cstddef>
|
||||
#include <type_traits>
|
||||
|
||||
/* The performance of many kernels is highly dependent on the tensor rank. Instead of having
|
||||
* one kernel which can work with the maximally ranked tensors, we make one kernel for each supported
|
||||
* tensor rank. This is to ensure that the requirements of the maximally ranked tensors do not take a
|
||||
* toll on the performance of the operation for low ranked tensors. Hence, many kernels take the tensor
|
||||
* rank as a template parameter.
|
||||
*
|
||||
* The kernel is a template and we have different instantiations for each rank. This causes the following pattern
|
||||
* to arise frequently:
|
||||
*
|
||||
* if(rank == 3)
|
||||
* kernel<T, 3>();
|
||||
* else if(rank == 2)
|
||||
* kernel<T, 2>();
|
||||
* else
|
||||
* kernel<T, 1>();
|
||||
*
|
||||
* The rank is a runtime variable. To facilitate creation of such structures, we use GENERATE_KERNEL_DISPATCHER.
|
||||
* This macro creates a function which selects the correct kernel instantiation at runtime.
|
||||
*
|
||||
* Example:
|
||||
*
|
||||
* // function which setups the kernel and launches it
|
||||
* template <class T, std::size_t Rank>
|
||||
* void launch_some_kernel(...);
|
||||
*
|
||||
* // creates the dispatcher named "some_dispatcher" which invokes the correct instantiation of "launch_some_kernel"
|
||||
* GENERATE_KERNEL_DISPATCHER(some_dispatcher, launch_some_kernel);
|
||||
*
|
||||
* // internal API function
|
||||
* template <class T>
|
||||
* void some(...) {
|
||||
* // ...
|
||||
* auto rank = input.rank();
|
||||
* some_dispatcher<T, MIN_RANK, MAX_RANK>(rank, ...);
|
||||
* }
|
||||
*/
|
||||
|
||||
/*
|
||||
* name name of the dispatcher function that is generated
|
||||
* func template function that requires runtime selection
|
||||
*
|
||||
* T first template parameter to `func`
|
||||
* start starting rank
|
||||
* end ending rank (inclusive)
|
||||
*
|
||||
* Executes func<T, selector> based on the runtime `selector` argument, given that `selector` lies
|
||||
* within the range [start, end]. If outside the range, no instantiation of `func` is executed.
|
||||
*/
|
||||
#define GENERATE_KERNEL_DISPATCHER(name,func); \
|
||||
template <class T, std::size_t start, std::size_t end, class... Args> static \
|
||||
typename std::enable_if<start == end, void> \
|
||||
::type name(int selector, Args&& ...args) { \
|
||||
if(selector == start) \
|
||||
func<T, start>(std::forward<Args>(args)...); \
|
||||
} \
|
||||
\
|
||||
template <class T, std::size_t start, std::size_t end, class... Args> static \
|
||||
typename std::enable_if<start != end, void> \
|
||||
::type name(int selector, Args&& ...args) { \
|
||||
if(selector == start) \
|
||||
func<T, start>(std::forward<Args>(args)...); \
|
||||
else \
|
||||
name<T, start + 1, end, Args...>(selector, std::forward<Args>(args)...); \
|
||||
}
|
||||
|
||||
// Same as GENERATE_KERNEL_DISPATCHER but takes two class template parameters TP1 and TP2 instead of just T
|
||||
#define GENERATE_KERNEL_DISPATCHER_2TP(name,func); \
|
||||
template <class TP1, class TP2, std::size_t start, std::size_t end, class... Args> static \
|
||||
typename std::enable_if<start == end, void> \
|
||||
::type name(int selector, Args&& ...args) { \
|
||||
if(selector == start) \
|
||||
func<TP1, TP2, start>(std::forward<Args>(args)...); \
|
||||
} \
|
||||
\
|
||||
template <class TP1, class TP2, std::size_t start, std::size_t end, class... Args> static \
|
||||
typename std::enable_if<start != end, void> \
|
||||
::type name(int selector, Args&& ...args) { \
|
||||
if(selector == start) \
|
||||
func<TP1, TP2, start>(std::forward<Args>(args)...); \
|
||||
else \
|
||||
name<TP1, TP2, start + 1, end, Args...>(selector, std::forward<Args>(args)...); \
|
||||
}
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP */
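// --- Concrete sketch of the pattern documented above (hypothetical names) ---
// launch_strided_slice is a made-up rank-templated launcher; the generated dispatcher picks
// the matching instantiation from a runtime rank in the inclusive range [1, 4].
template <class T, std::size_t Rank>
void launch_strided_slice(int arg) { /* set up and launch a Rank-specific kernel */ }

GENERATE_KERNEL_DISPATCHER(strided_slice_dispatcher, launch_strided_slice);

// at the call site, with `rank` known only at runtime:
//   strided_slice_dispatcher<float, 1, 4>(rank, some_int_arg);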
|
36
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/limits.hpp
vendored
Normal file
@ -0,0 +1,36 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_LIMITS_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA_LIMITS_HPP
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include <cfloat>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
|
||||
|
||||
template <class T>
|
||||
struct numeric_limits;
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <>
|
||||
struct numeric_limits<__half> {
|
||||
__device__ static __half min() { return 0.0000610; }
|
||||
__device__ static __half max() { return 65504.0; }
|
||||
__device__ static __half lowest() { return -65504.0; }
|
||||
};
|
||||
#endif
|
||||
|
||||
template <>
|
||||
struct numeric_limits<float> {
|
||||
__device__ static float min() { return FLT_MIN; }
|
||||
__device__ static float max() { return FLT_MAX; }
|
||||
__device__ static float lowest() { return -FLT_MAX; }
|
||||
};
|
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_LIMITS_HPP */
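// --- Typical use (illustrative; assumes device::max from math.hpp is also in scope) ---
// lowest() seeds a running maximum so that the first comparison always succeeds.
template <class T>
__device__ T max_of(const T* data, cv::dnn::cuda4dnn::csl::device::size_type n) {
    using namespace cv::dnn::cuda4dnn::csl::device;
    T best = numeric_limits<T>::lowest();
    for (size_type i = 0; i < n; i++)
        best = max(best, data[i]);
    return best;
}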
|
154
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/math.hpp
vendored
Normal file
@ -0,0 +1,154 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_MATH_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA_MATH_HPP
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
|
||||
|
||||
template <class T> __device__ T abs(T val) { return (val < T(0) ? -val : val); }
|
||||
template <> inline __device__ float abs(float val) { return fabsf(val); }
|
||||
template <> inline __device__ double abs(double val) { return fabs(val); }
|
||||
|
||||
template <class T> __device__ T exp(T val);
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half exp(__half val) { return hexp(val); }
|
||||
#endif
|
||||
template <> inline __device__ float exp(float val) { return expf(val); }
|
||||
template <> inline __device__ double exp(double val) { return ::exp(val); }
|
||||
|
||||
template <class T> __device__ T expm1(T val);
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half expm1(__half val) { return hexp(val) - __half(1); }
|
||||
#endif
|
||||
template <> inline __device__ float expm1(float val) { return expm1f(val); }
|
||||
template <> inline __device__ double expm1(double val) { return ::expm1(val); }
|
||||
|
||||
template <class T> __device__ T max(T x, T y) { return (x > y ? x : y); }
|
||||
template <> inline __device__ float max(float x, float y) { return fmaxf(x, y); }
|
||||
template <> inline __device__ double max(double x, double y) { return fmax(x, y); }
|
||||
|
||||
template <class T> __device__ T min(T x, T y) { return (x > y ? y : x); }
|
||||
template <> inline __device__ float min(float x, float y) { return fminf(x, y); }
|
||||
template <> inline __device__ double min(double x, double y) { return fmin(x, y); }
|
||||
|
||||
template <class T> __device__ T log1p(T val);
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half log1p(__half val) { return hlog(__half(1) + val); }
|
||||
#endif
|
||||
template <> inline __device__ float log1p(float val) { return log1pf(val); }
|
||||
|
||||
template <class T> __device__ T log1pexp(T val);
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half log1pexp(__half val) {
|
||||
if (val <= __half(-4.0))
|
||||
return exp(val);
|
||||
else if (val <= __half(8.0))
|
||||
return log1p(exp(val));
|
||||
else if (val <= __half(8.7))
|
||||
return val + exp(-val);
|
||||
else
|
||||
return val;
|
||||
}
|
||||
#endif
|
||||
template <> inline __device__ float log1pexp(float val) {
|
||||
if (val <= -20)
|
||||
return expf(val);
|
||||
else if (val <= 9.0)
|
||||
return log1pf(expf(val));
|
||||
else if (val <= 14.6)
|
||||
return val + exp(-val);
|
||||
else
|
||||
return val;
|
||||
}
|
||||
template <> inline __device__ double log1pexp(double val) {
|
||||
if (val <= -37)
|
||||
return exp(val);
|
||||
else if (val <= 18)
|
||||
return log1p(exp(val));
|
||||
else if (val <= 33.3)
|
||||
return val + exp(-val);
|
||||
else
|
||||
return val;
|
||||
}
|
||||
|
||||
template <class T> __device__ T tanh(T val);
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half tanh(__half val) { return tanhf(val); }
|
||||
#endif
|
||||
template <> inline __device__ float tanh(float val) { return tanhf(val); }
|
||||
template <> inline __device__ double tanh(double val) { return ::tanh(val); }
|
||||
|
||||
template <class T> __device__ T pow(T val, T exp);
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half pow(__half val, __half exp) { return powf(val, exp); }
|
||||
#endif
|
||||
template <> inline __device__ float pow(float val, float exp) { return powf(val, exp); }
|
||||
template <> inline __device__ double pow(double val, double exp) { return ::pow(val, exp); }
|
||||
|
||||
template <class T> __device__ T sqrt(T val);
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half sqrt(__half val) { return hsqrt(val); }
|
||||
#endif
|
||||
template <> inline __device__ float sqrt(float val) { return sqrtf(val); }
|
||||
template <> inline __device__ double sqrt(double val) { return ::sqrt(val); }
|
||||
|
||||
template <class T> __device__ T rsqrt(T val);
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half rsqrt(__half val) { return hrsqrt(val); }
|
||||
#endif
|
||||
template <> inline __device__ float rsqrt(float val) { return rsqrtf(val); }
|
||||
template <> inline __device__ double rsqrt(double val) { return ::rsqrt(val); }
|
||||
|
||||
template <class T> __device__ T sigmoid(T val) { return T(1) / (T(1) + exp(-val)); }
|
||||
|
||||
template <class T> __device__ T clamp(T value, T lower, T upper) { return min(max(value, lower), upper); }
|
||||
|
||||
template <class T> __device__ long lround(T value);
|
||||
template <> inline __device__ long lround(double value) { return ::lround(value); }
|
||||
template <> inline __device__ long lround(float value) { return lroundf(value); }
|
||||
|
||||
template <class T> __device__ T round(T value);
|
||||
template <> inline __device__ double round(double value) { return ::round(value); }
|
||||
template <> inline __device__ float round(float value) { return roundf(value); }
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half round(__half value) { return hrint(value); }
|
||||
#endif
|
||||
|
||||
template <class T> __device__ T ceil(T value);
|
||||
template <> inline __device__ double ceil(double value) { return ::ceil(value); }
|
||||
template <> inline __device__ float ceil(float value) { return ceilf(value); }
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template <> inline __device__ __half ceil(__half value) { return hceil(value); }
|
||||
#endif
|
||||
|
||||
template <class T> __device__ T mul_ftz(T x, T y) { return x * y; }
|
||||
template <> inline __device__ float mul_ftz(float x, float y) {
|
||||
float result;
|
||||
asm("mul.ftz.f32 %0, %1, %2;" : "=f"(result) : "f"(x), "f"(y));
|
||||
return result;
|
||||
}
|
||||
|
||||
template <class T> __device__ T fast_divide(T x, T y) { return x / y; }
|
||||
template <> inline __device__ float fast_divide(float x, float y) { return __fdividef(x, y); }
|
||||
|
||||
template <class T> __device__ T fast_divide_ftz(T x, T y) { return fast_divide(x, y); }
|
||||
template <> inline __device__ float fast_divide_ftz(float x, float y) {
|
||||
float result;
|
||||
asm("div.approx.ftz.f32 %0, %1, %2;" : "=f"(result) : "f"(x), "f"(y));
|
||||
return result;
|
||||
}
|
||||
|
||||
template <class T> __device__ T fast_exp(T value) { return exp(value); }
|
||||
template <> inline __device__ float fast_exp(float value) { return __expf(value); }
|
||||
|
||||
template <class T> __device__ T fast_sigmoid(T value) { return sigmoid(value); }
|
||||
template <> inline __device__ float fast_sigmoid(float value) { return __fdividef(1, 1 + __expf(-value)); }
|
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_MATH_HPP */
|
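The piecewise log1pexp specializations listed above switch formulas at the usual cutoffs so that log(1 + exp(x)) never overflows or loses precision. The following host-side sketch (ours, not part of the vendored file) mirrors the same branch structure for double precision and can be compiled with any C++ compiler to check the values.

// Host-side sketch of the piecewise log1pexp(x) = log(1 + exp(x)) evaluation used above.
// exp(x) for very negative x, log1p(exp(x)) in the middle, x + exp(-x) near the top,
// and plain x once exp(-x) is negligible relative to x. Illustrative only.
#include <cmath>
#include <cstdio>

static double log1pexp_ref(double x) {
    if (x <= -37.0) return std::exp(x);          // exp(x) already equals the result to double precision
    if (x <= 18.0)  return std::log1p(std::exp(x));
    if (x <= 33.3)  return x + std::exp(-x);     // log(1+e^x) = x + log(1+e^-x) ~= x + e^-x
    return x;                                    // e^-x underflows relative to x
}

int main() {
    const double xs[] = {-50.0, -10.0, 0.0, 20.0, 40.0};
    for (double x : xs)
        std::printf("log1pexp(%g) = %.17g\n", x, log1pexp_ref(x));
}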
328
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/max_unpooling.cu
vendored
Normal file
328
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/max_unpooling.cu
vendored
Normal file
@ -0,0 +1,328 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "math.hpp"
|
||||
#include "array.hpp"
|
||||
#include "limits.hpp"
|
||||
#include "types.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include "../cuda4dnn/kernels/fill_copy.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
#include <vector>
|
||||
#include <type_traits>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, std::size_t Order,
|
||||
typename std::enable_if<Order == 1 || Order == 2 || Order == 3, bool>::type = true> /* Order has been hardcoded; see code */
|
||||
__global__ void max_pooling_with_indices(
|
||||
Span<T> output, Span<T> indices, View<T> input, size_type channels,
|
||||
array<size_type, Order> out_spatial_dims, array<size_type, Order> in_spatial_dims,
|
||||
array<size_type, Order> window_size, array<size_type, Order> strides, array<size_type, Order> padding_left)
|
||||
{
|
||||
/* every element in the output is mapped to a window in the input and each thread processes several windows */
|
||||
for (auto idx : grid_stride_range(output.size())) {
|
||||
size_type out_spatial_size = 1;
|
||||
array<index_type, Order> window_idx;
|
||||
for (int i = Order - 1; i >= 0; i--) {
|
||||
window_idx[i] = (idx / out_spatial_size) % out_spatial_dims[i];
|
||||
out_spatial_size *= out_spatial_dims[i];
|
||||
}
|
||||
|
||||
const index_type n = idx / (out_spatial_size * channels);
|
||||
const index_type c = (idx / out_spatial_size) % channels;
|
||||
|
||||
array<index_type, Order> start;
|
||||
for(int i = 0; i < Order; i++)
|
||||
start[i] = window_idx[i] * strides[i] - padding_left[i];
|
||||
|
||||
array<index_type, Order> end;
|
||||
for (int i = 0; i < Order; i++) {
|
||||
using device::min;
|
||||
end[i] = min<index_type>(start[i] + window_size[i], in_spatial_dims[i]);
|
||||
}
|
||||
|
||||
for (int i = 0; i < Order; i++) {
|
||||
using device::max;
|
||||
start[i] = max(start[i], 0);
|
||||
}
|
||||
|
||||
T max_value = numeric_limits<T>::lowest();
|
||||
index_type max_idx = -1;
|
||||
|
||||
size_type in_spatial_size = 1;
|
||||
for (int i = 0; i < Order; i++)
|
||||
in_spatial_size *= in_spatial_dims[i];
|
||||
|
||||
const auto outer_offset = (n * channels + c) * in_spatial_size;
|
||||
if (Order == 1) {
|
||||
array<index_type, Order> idx;
|
||||
for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
|
||||
index_type offset = 0;
|
||||
index_type stride = 1;
|
||||
for (int i = Order - 1; i >= 0; i--) {
|
||||
offset += stride * idx[i];
|
||||
stride *= in_spatial_dims[i];
|
||||
}
|
||||
|
||||
if (input[outer_offset + offset] > max_value) {
|
||||
max_idx = offset;
|
||||
max_value = input[outer_offset + offset];
|
||||
}
|
||||
}
|
||||
} else if (Order == 2) {
|
||||
array<index_type, Order> idx;
|
||||
for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
|
||||
for (idx[1] = start[1]; idx[1] != end[1]; idx[1]++) {
|
||||
index_type offset = 0;
|
||||
index_type stride = 1;
|
||||
for (int i = Order - 1; i >= 0; i--) {
|
||||
offset += stride * idx[i];
|
||||
stride *= in_spatial_dims[i];
|
||||
}
|
||||
|
||||
if (input[outer_offset + offset] > max_value) {
|
||||
max_idx = offset;
|
||||
max_value = input[outer_offset + offset];
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if(Order == 3) {
|
||||
array<index_type, Order> idx;
|
||||
for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
|
||||
for (idx[1] = start[1]; idx[1] != end[1]; idx[1]++) {
|
||||
for (idx[2] = start[2]; idx[2] != end[2]; idx[2]++) {
|
||||
index_type offset = 0;
|
||||
index_type stride = 1;
|
||||
for (int i = Order - 1; i >= 0; i--) {
|
||||
offset += stride * idx[i];
|
||||
stride *= in_spatial_dims[i];
|
||||
}
|
||||
|
||||
if (input[outer_offset + offset] > max_value) {
|
||||
max_idx = offset;
|
||||
max_value = input[outer_offset + offset];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
output[idx] = max_value;
|
||||
indices[idx] = max_idx;
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t Order>
|
||||
__global__ void max_unpooling(
|
||||
Span<T> output, View<T> input, View<T> indices, size_type channels,
|
||||
array<size_type, Order> out_spatial_dims, array<size_type, Order> in_spatial_dims,
|
||||
array<size_type, Order> window_size, array<size_type, Order> strides, array<size_type, Order> padding_left)
|
||||
{
|
||||
/* the output has already been zero filled */
|
||||
/* Every input value represents a window in the output. The max unpooling operation
|
||||
* copies the input value to exactly one location in the output window which is given
|
||||
* by the indices tensor.
|
||||
*/
|
||||
for (auto idx : grid_stride_range(input.size())) {
|
||||
size_type in_spatial_size = 1;
|
||||
array<index_type, Order> window_idx;
|
||||
for (int i = Order - 1; i >= 0; i--) {
|
||||
window_idx[i] = (idx / in_spatial_size) % in_spatial_dims[i];
|
||||
in_spatial_size *= in_spatial_dims[i];
|
||||
}
|
||||
|
||||
const index_type n = idx / (in_spatial_size * channels);
|
||||
const index_type c = (idx / in_spatial_size) % channels;
|
||||
|
||||
array<index_type, Order> start;
|
||||
for (int i = 0; i < Order; i++) {
|
||||
using device::min;
|
||||
using device::max;
|
||||
start[i] = max(0, min(window_idx[i] * strides[i] - padding_left[i], out_spatial_dims[i] - 1));
|
||||
}
|
||||
|
||||
size_type out_spatial_size = 1;
|
||||
for (int i = 0; i < Order; i++)
|
||||
out_spatial_size *= out_spatial_dims[i];
|
||||
|
||||
index_type outer_offset = (n * channels + c) * out_spatial_size;
|
||||
output[outer_offset + static_cast<index_type>(indices[idx])] = input[idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t Order> static
|
||||
void launch_max_pooling_kernel(
|
||||
const Stream& stream,
|
||||
Span<T> output, Span<T> indices, View<T> input, std::size_t channels,
|
||||
const std::vector<std::size_t>& out_spatial_dims, const std::vector<std::size_t>& in_spatial_dims,
|
||||
const std::vector<std::size_t>& window_size,
|
||||
const std::vector<std::size_t>& strides, const std::vector<std::size_t>& padding_left)
|
||||
{
|
||||
CV_Assert(indices.size() == output.size());
|
||||
CV_Assert(out_spatial_dims.size() == Order);
|
||||
CV_Assert(in_spatial_dims.size() == Order);
|
||||
CV_Assert(window_size.size() == Order);
|
||||
CV_Assert(strides.size() == Order);
|
||||
CV_Assert(padding_left.size() == Order);
|
||||
|
||||
array<size_type, Order> out_spatial_dims_k, in_spatial_dims_k;
|
||||
out_spatial_dims_k.assign(std::begin(out_spatial_dims), std::end(out_spatial_dims));
|
||||
in_spatial_dims_k.assign(std::begin(in_spatial_dims), std::end(in_spatial_dims));
|
||||
|
||||
array<size_type, Order> window_size_k, strides_k, padding_left_k;
|
||||
window_size_k.assign(std::begin(window_size), std::end(window_size));
|
||||
strides_k.assign(std::begin(strides), std::end(strides));
|
||||
padding_left_k.assign(std::begin(padding_left), std::end(padding_left));
|
||||
|
||||
auto kernel = raw::max_pooling_with_indices<T, Order>;
|
||||
auto policy = make_policy(kernel, output.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, output, indices, input, channels,
|
||||
out_spatial_dims_k, in_spatial_dims_k, window_size_k, strides_k, padding_left_k);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void max_pooling_with_indices(
|
||||
const Stream& stream,
|
||||
TensorSpan<T> output, TensorSpan<T> indices, TensorView<T> input,
|
||||
const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
|
||||
const std::vector<std::size_t>& padding_left)
|
||||
{
|
||||
CV_Assert(is_shape_same(output, indices));
|
||||
CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
|
||||
|
||||
auto order = window_size.size();
|
||||
CV_Assert(strides.size() == order);
|
||||
CV_Assert(padding_left.size() == order);
|
||||
CV_Assert(output.rank() == order + 2);
|
||||
CV_Assert(input.rank() == order + 2);
|
||||
|
||||
std::vector<std::size_t> out_spatial_dims(order), in_spatial_dims(order);
|
||||
for (int i = 0; i < order; i++) {
|
||||
in_spatial_dims[i] = input.get_axis_size(2 + i);
|
||||
out_spatial_dims[i] = output.get_axis_size(2 + i);
|
||||
}
|
||||
|
||||
CV_Assert(1 <= order && order <= 3);
|
||||
std::size_t channels = input.get_axis_size(1);
|
||||
if (order == 3) {
|
||||
launch_max_pooling_kernel<T, 3>(stream, output, indices, input, channels,
|
||||
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
|
||||
} else if (order == 2) {
|
||||
launch_max_pooling_kernel<T, 2>(stream, output, indices, input, channels,
|
||||
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
|
||||
} else if (order == 1) {
|
||||
launch_max_pooling_kernel<T, 1>(stream, output, indices, input, channels,
|
||||
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void max_pooling_with_indices(const Stream&,
|
||||
TensorSpan<__half>, TensorSpan<__half>, TensorView<__half>,
|
||||
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
|
||||
const std::vector<std::size_t>&);
|
||||
#endif
|
||||
|
||||
template void max_pooling_with_indices(const Stream&,
|
||||
TensorSpan<float>, TensorSpan<float>, TensorView<float>,
|
||||
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
|
||||
const std::vector<std::size_t>&);
|
||||
|
||||
template <class T, std::size_t Order> static
|
||||
void launch_max_unpooling_kernel(
|
||||
const Stream& stream,
|
||||
Span<T> output, View<T> input, View<T> indices, std::size_t channels,
|
||||
const std::vector<std::size_t>& out_spatial_dims, const std::vector<std::size_t>& in_spatial_dims,
|
||||
const std::vector<std::size_t>& window_size,
|
||||
const std::vector<std::size_t>& strides, const std::vector<std::size_t>& padding_left)
|
||||
{
|
||||
CV_Assert(out_spatial_dims.size() == Order);
|
||||
CV_Assert(in_spatial_dims.size() == Order);
|
||||
CV_Assert(window_size.size() == Order);
|
||||
CV_Assert(strides.size() == Order);
|
||||
CV_Assert(padding_left.size() == Order);
|
||||
CV_Assert(indices.size() == input.size());
|
||||
|
||||
array<size_type, Order> out_spatial_dims_k, in_spatial_dims_k;
|
||||
out_spatial_dims_k.assign(std::begin(out_spatial_dims), std::end(out_spatial_dims));
|
||||
in_spatial_dims_k.assign(std::begin(in_spatial_dims), std::end(in_spatial_dims));
|
||||
|
||||
array<size_type, Order> window_size_k, strides_k, padding_left_k;
|
||||
window_size_k.assign(std::begin(window_size), std::end(window_size));
|
||||
strides_k.assign(std::begin(strides), std::end(strides));
|
||||
padding_left_k.assign(std::begin(padding_left), std::end(padding_left));
|
||||
|
||||
auto kernel = raw::max_unpooling<T, Order>;
|
||||
auto policy = make_policy(kernel, input.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, output, input, indices, channels,
|
||||
out_spatial_dims_k, in_spatial_dims_k, window_size_k, strides_k, padding_left_k);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void max_unpooling(
|
||||
const Stream& stream,
|
||||
TensorSpan<T> output, TensorView<T> input, TensorView<T> indices,
|
||||
const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
|
||||
const std::vector<std::size_t>& padding_left)
|
||||
{
|
||||
CV_Assert(is_shape_same(input, indices));
|
||||
CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
|
||||
|
||||
auto order = window_size.size();
|
||||
CV_Assert(strides.size() == order);
|
||||
CV_Assert(padding_left.size() == order);
|
||||
CV_Assert(output.rank() == order + 2);
|
||||
CV_Assert(input.rank() == order + 2);
|
||||
|
||||
std::vector<std::size_t> out_spatial_dims(order), in_spatial_dims(order);
|
||||
for (int i = 0; i < order; i++) {
|
||||
in_spatial_dims[i] = input.get_axis_size(2 + i);
|
||||
out_spatial_dims[i] = output.get_axis_size(2 + i);
|
||||
}
|
||||
|
||||
kernels::fill<T>(stream, output, 0.0);
|
||||
|
||||
/* only max_unpooling2d and max_unpooling3d are supported */
|
||||
CV_Assert(2 <= order && order <= 3);
|
||||
std::size_t channels = input.get_axis_size(1);
|
||||
if (order == 3) {
|
||||
launch_max_unpooling_kernel<T, 3>(stream, output, input, indices, channels,
|
||||
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
|
||||
} else if (order == 2) {
|
||||
launch_max_unpooling_kernel<T, 2>(stream, output, input, indices, channels,
|
||||
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void max_unpooling(const Stream&,
|
||||
TensorSpan<__half>, TensorView<__half>, TensorView<__half>,
|
||||
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
|
||||
const std::vector<std::size_t>&);
|
||||
#endif
|
||||
|
||||
template void max_unpooling(const Stream&,
|
||||
TensorSpan<float>, TensorView<float>, TensorView<float>,
|
||||
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
|
||||
const std::vector<std::size_t>&);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
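The two kernels above form a pair: max_pooling_with_indices records, for every output element, the flattened spatial offset of the maximum inside its window, and max_unpooling scatters each value back to exactly that offset in a zero-filled output. The CPU reference sketch below (ours, reduced to one channel and one spatial dimension) shows the same index bookkeeping.

// CPU reference sketch of pooling-with-indices followed by unpooling. Illustrative only;
// variable names are ours and do not come from the vendored sources.
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> in = {1, 3, 2, 9, 4, 8};          // input row
    const int window = 2, stride = 2;

    std::vector<float> pooled;
    std::vector<int> indices;
    for (int start = 0; start + window <= (int)in.size(); start += stride) {
        int best = start;
        for (int k = start; k < start + window; k++)
            if (in[k] > in[best]) best = k;
        pooled.push_back(in[best]);
        indices.push_back(best);                         // flattened spatial offset of the maximum
    }

    std::vector<float> unpooled(in.size(), 0.f);         // output is zero-filled first
    for (std::size_t i = 0; i < pooled.size(); i++)
        unpooled[indices[i]] = pooled[i];                // scatter to the recorded offset

    for (float v : unpooled) std::printf("%g ", v);      // prints: 0 3 0 9 0 8
    std::printf("\n");
}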
32
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/memory.hpp
vendored
Normal file
32
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/memory.hpp
vendored
Normal file
@ -0,0 +1,32 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_MEMORY_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA_MEMORY_HPP
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
|
||||
|
||||
template <class T>
|
||||
__device__ T load_ldg(const T& src) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
|
||||
return __ldg(&src);
|
||||
#else
|
||||
return src;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__device__ T load_ldg(const T* src) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
|
||||
return __ldg(src);
|
||||
#else
|
||||
return *src;
|
||||
#endif
|
||||
}
|
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_MEMORY_HPP */
|
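load_ldg above routes a read-only load through the read-only data cache (__ldg) on sm_35 and newer devices while falling back to an ordinary load elsewhere. A minimal usage sketch (ours, not from the vendored tree; assumes this header is included and the file is compiled as CUDA):

// Reads of a buffer that the kernel only ever reads can go through load_ldg; the code
// remains portable because older architectures take the plain-load fallback.
namespace device = cv::dnn::cuda4dnn::csl::device;

__global__ void scale_with_ldg(float* out, const float* in, float alpha, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = alpha * device::load_ldg(in + i);   // read-only cached load where supported
}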
145
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/mvn.cu
vendored
Normal file
145
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/mvn.cu
vendored
Normal file
@ -0,0 +1,145 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "math.hpp"
|
||||
#include "types.hpp"
|
||||
#include "atomics.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T>
|
||||
__global__ void reduce_mean(Span<float> means, View<T> input, size_type inner_size) {
|
||||
for (auto idx : grid_stride_range(input.size())) {
|
||||
const index_type outer_idx = idx / inner_size;
|
||||
atomicAdd(&means[outer_idx], static_cast<float>(input[idx]) / inner_size);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void reduce_mean_sqr_sum(Span<float> means, Span<float> sum_sqrs, View<T> input, size_type inner_size) {
|
||||
for (auto idx : grid_stride_range(input.size())) {
|
||||
const index_type outer_idx = idx / inner_size;
|
||||
auto x = static_cast<float>(input[idx]);
|
||||
atomicAdd(&means[outer_idx], x / inner_size);
|
||||
atomicAdd(&sum_sqrs[outer_idx], x * x);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void compute_normalization_scale(Span<float> scale, View<float> means, View<float> sums_sqr, size_type inner_size, float eps) {
|
||||
for (auto idx : grid_stride_range(scale.size())) {
|
||||
auto mean = means[idx];
|
||||
auto var = sums_sqr[idx] / inner_size - mean * mean;
|
||||
using device::rsqrt;
|
||||
scale[idx] = rsqrt(eps + var);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void normalize_mean(Span<T> output, View<T> input, View<float> means, size_type inner_size) {
|
||||
for (auto idx : grid_stride_range(output.size())) {
|
||||
const index_type outer_idx = idx / inner_size;
|
||||
output[idx] = static_cast<float>(input[idx]) - means[outer_idx];
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void normalize_mean_variance(Span<T> output, View<T> input, View<float> means, View<float> scale, size_type inner_size) {
|
||||
for (auto idx : grid_stride_range(output.size())) {
|
||||
const index_type outer_idx = idx / inner_size;
|
||||
output[idx] = (static_cast<float>(input[idx]) - means[outer_idx]) * scale[outer_idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void reduce_mean(const Stream& stream, Span<float> means, View<T> input, std::size_t inner_size)
|
||||
{
|
||||
CV_Assert(input.size() / inner_size == means.size());
|
||||
|
||||
auto kernel = raw::reduce_mean<T>;
|
||||
auto policy = make_policy(kernel, input.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, means, input, inner_size);
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void reduce_mean(const Stream&, Span<float>, View<__half>, std::size_t);
|
||||
#endif
|
||||
template void reduce_mean(const Stream&, Span<float>, View<float>, std::size_t);
|
||||
|
||||
template <class T>
|
||||
void reduce_mean_sqr_sum(const Stream& stream, Span<float> means, Span<float> sum_sqrs, View<T> input, std::size_t inner_size)
|
||||
{
|
||||
CV_Assert(input.size() / inner_size == means.size());
|
||||
CV_Assert(input.size() / inner_size == sum_sqrs.size());
|
||||
|
||||
auto kernel = raw::reduce_mean_sqr_sum<T>;
|
||||
auto policy = make_policy(kernel, input.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, means, sum_sqrs, input, inner_size);
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void reduce_mean_sqr_sum(const Stream&, Span<float>, Span<float>, View<__half>, std::size_t);
|
||||
#endif
|
||||
template void reduce_mean_sqr_sum(const Stream&, Span<float>, Span<float>, View<float>, std::size_t);
|
||||
|
||||
void compute_normalization_scale(const Stream& stream, Span<float> scale, View<float> means, View<float> sum_sqrs, std::size_t inner_size, float eps)
|
||||
{
|
||||
CV_Assert(scale.size() == means.size());
|
||||
CV_Assert(scale.size() == sum_sqrs.size());
|
||||
|
||||
auto kernel = raw::compute_normalization_scale;
|
||||
auto policy = make_policy(kernel, scale.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, scale, means, sum_sqrs, inner_size, eps);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void normalize_mean(const Stream& stream, Span<T> output, View<T> input, View<float> means, std::size_t inner_size)
|
||||
{
|
||||
CV_Assert(output.size() == input.size());
|
||||
CV_Assert(input.size() / inner_size == means.size());
|
||||
|
||||
auto kernel = raw::normalize_mean<T>;
|
||||
auto policy = make_policy(kernel, output.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, output, input, means, inner_size);
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void normalize_mean(const Stream&, Span<__half>, View<__half>, View<float>, std::size_t);
|
||||
#endif
|
||||
template void normalize_mean(const Stream&, Span<float>, View<float>, View<float>, std::size_t);
|
||||
|
||||
template <class T>
|
||||
void normalize_mean_variance(const Stream& stream, Span<T> output, View<T> input, View<float> means, View<float> scale, std::size_t inner_size)
|
||||
{
|
||||
CV_Assert(input.size() == output.size());
|
||||
CV_Assert(input.size() / inner_size == means.size());
|
||||
CV_Assert(input.size() / inner_size == scale.size());
|
||||
|
||||
auto kernel = raw::normalize_mean_variance<T>;
|
||||
auto policy = make_policy(kernel, output.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, output, input, means, scale, inner_size);
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void normalize_mean_variance(const Stream&, Span<__half>, View<__half>, View<float>, View<float>, std::size_t);
|
||||
#endif
|
||||
template void normalize_mean_variance(const Stream&, Span<float>, View<float>, View<float>, View<float>, std::size_t);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
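The MVN kernels above work in two passes: atomics accumulate the per-slice mean and sum of squares, compute_normalization_scale derives scale = rsqrt(eps + var) with var = E[x^2] - mean^2, and the normalize kernels apply (x - mean) * scale. A host reference sketch of the same arithmetic for a single slice (ours, illustrative only):

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> x = {1.f, 2.f, 3.f, 4.f};         // one "outer" slice of inner_size elements
    const float eps = 1e-5f;

    float mean = 0.f, sum_sqr = 0.f;
    for (float v : x) { mean += v / x.size(); sum_sqr += v * v; }

    const float var = sum_sqr / x.size() - mean * mean;  // E[x^2] - mean^2
    const float scale = 1.f / std::sqrt(eps + var);

    for (float v : x) std::printf("%g ", (v - mean) * scale);   // zero mean, ~unit variance
    std::printf("\n");
}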
123
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/normalize.cu
vendored
Normal file
123
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/normalize.cu
vendored
Normal file
@ -0,0 +1,123 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "array.hpp"
|
||||
#include "math.hpp"
|
||||
#include "types.hpp"
|
||||
#include "atomics.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include "../cuda4dnn/kernels/fill_copy.hpp"
|
||||
#include "../cuda4dnn/kernels/scale_shift.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T>
|
||||
__global__ void reduce_sum_abs(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride) {
|
||||
for (auto idx : grid_stride_range(input.size())) {
|
||||
const index_type outer_idx = idx / outer_stride;
|
||||
const index_type inner_idx = idx % mid_stride;
|
||||
|
||||
const index_type sum_idx = outer_idx * mid_stride + inner_idx;
|
||||
atomicAdd(&output[sum_idx], device::abs(input[idx]));
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void reciprocal(Span<T> output, T epsilon) {
|
||||
for (auto idx : grid_stride_range(output.size()))
|
||||
output[idx] = T(1) / (output[idx] + epsilon);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void reduce_sum_squared(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride) {
|
||||
for (auto idx : grid_stride_range(input.size())) {
|
||||
const index_type outer_idx = idx / outer_stride;
|
||||
const index_type inner_idx = idx % mid_stride;
|
||||
|
||||
const index_type sum_idx = outer_idx * mid_stride + inner_idx;
|
||||
atomicAdd(&output[sum_idx], input[idx] * input[idx]);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void rsqrt(Span<T> output, T epsilon) {
|
||||
for (auto idx : grid_stride_range(output.size())) {
|
||||
using device::sqrt;
|
||||
output[idx] = T(1) / sqrt(output[idx] + epsilon);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void apply_norm(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride, View<T> sums) {
|
||||
for (auto idx : grid_stride_range(output.size())) {
|
||||
const index_type outer_idx = idx / outer_stride;
|
||||
const index_type inner_idx = idx % mid_stride;
|
||||
|
||||
const index_type sum_idx = outer_idx * mid_stride + inner_idx;
|
||||
output[idx] = input[idx] * sums[sum_idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void normalize(
|
||||
const Stream& stream,
|
||||
Span<T> output,
|
||||
View<T> input, std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, std::size_t norm, T epsilon,
|
||||
Span<T> workspace)
|
||||
{
|
||||
CV_Assert(output.size() == input.size());
|
||||
CV_Assert(output.size() == outer_size * mid_size * inner_size);
|
||||
CV_Assert(norm == 1 || norm == 2);
|
||||
CV_Assert(workspace.size() >= outer_size * inner_size);
|
||||
|
||||
auto sums = Span<T>(workspace.data(), outer_size * inner_size);
|
||||
|
||||
fill<T>(stream, sums, 0.0);
|
||||
|
||||
if (norm == 1) {
|
||||
auto reduce_kernel = raw::reduce_sum_abs<T>;
|
||||
auto policy = make_policy(reduce_kernel, input.size(), 0, stream);
|
||||
launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size);
|
||||
|
||||
auto reciprocal_kernel = raw::reciprocal<T>;
|
||||
policy = make_policy(reciprocal_kernel, sums.size(), 0, stream);
|
||||
launch_kernel(reciprocal_kernel, policy, sums, epsilon);
|
||||
} else {
|
||||
auto reduce_kernel = raw::reduce_sum_squared<T>;
|
||||
auto policy = make_policy(reduce_kernel, input.size(), 0, stream);
|
||||
launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size);
|
||||
|
||||
auto rsqrt_kernel = raw::rsqrt<T>;
|
||||
policy = make_policy(rsqrt_kernel, sums.size(), 0, stream);
|
||||
launch_kernel(rsqrt_kernel, policy, sums, epsilon);
|
||||
}
|
||||
|
||||
auto scale_kernel = raw::apply_norm<T>;
|
||||
auto policy = make_policy(scale_kernel, output.size(), 0, stream);
|
||||
launch_kernel(scale_kernel, policy, output, input, mid_size * inner_size, inner_size, sums);
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void normalize(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t, std::size_t, std::size_t, __half, Span<__half>);
|
||||
#endif
|
||||
template void normalize(const Stream&, Span<float>, View<float>, std::size_t, std::size_t, std::size_t, std::size_t, float, Span<float>);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
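normalize above reduces |x| (L1) or x^2 (L2) into a per-(outer, inner) workspace, converts those sums to reciprocals or inverse square roots with an epsilon, and then multiplies every element by its column's factor. The host sketch below (ours) shows the L2 branch for a single column.

// L2 branch for one (outer, inner) position: sum the squares along the mid axis,
// form 1/sqrt(sum + eps), and scale every element of that column. Illustrative only;
// the CUDA version does the same through a shared workspace and atomics.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> column = {3.f, 4.f};              // elements sharing one (outer, inner) index
    const float eps = 1e-12f;

    float sum_sq = 0.f;
    for (float v : column) sum_sq += v * v;
    const float inv_norm = 1.f / std::sqrt(sum_sq + eps);

    for (float v : column) std::printf("%g ", v * inv_norm);    // prints: 0.6 0.8
    std::printf("\n");
}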
201
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/padding.cu
vendored
Normal file
201
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/padding.cu
vendored
Normal file
@ -0,0 +1,201 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "array.hpp"
|
||||
#include "math.hpp"
|
||||
#include "types.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "kernel_dispatcher.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, std::size_t Rank>
|
||||
__global__ void copy_with_reflection101(
|
||||
Span<T> output, array<size_type, Rank> out_strides, array<index_type, Rank> start, array<index_type, Rank> end,
|
||||
View<T> input, array<size_type, Rank> in_strides)
|
||||
{
|
||||
for (auto i : grid_stride_range(output.size())) {
|
||||
/* compute output axis indices corresponding to element 'i' */
|
||||
array<index_type, Rank> out_index;
|
||||
out_index[0] = i / out_strides[0];
|
||||
for (int j = 1; j < Rank; j++)
|
||||
out_index[j] = (i % out_strides[j - 1]) / out_strides[j];
|
||||
|
||||
/* compute input axis indices corresponding to output axis indices */
|
||||
array<index_type, Rank> in_index;
|
||||
for (int j = 0; j < Rank; j++) {
|
||||
/* if out_index < start, the point is in the left reflection region
|
||||
* the reflected value's index is the absolute value of the difference
|
||||
*
|
||||
* otherwise, if the value is in the copy region, out_index - start gives the input index
|
||||
*/
|
||||
using device::abs;
|
||||
in_index[j] = abs(out_index[j] - start[j]);
|
||||
|
||||
/* if out_index >= end, it's in the right reflection region */
|
||||
if (out_index[j] >= end[j])
|
||||
in_index[j] = (end[j] - start[j]) - (out_index[j] - end[j]) - 2;
|
||||
}
|
||||
|
||||
/* compute input element number from input axis indices */
|
||||
index_type iidx = 0;
|
||||
for (int j = 0; j < Rank; j++)
|
||||
iidx += in_index[j] * in_strides[j];
|
||||
|
||||
output[i] = input[iidx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t Rank> static
|
||||
void launch_copy_with_reflection101(
|
||||
const Stream& stream,
|
||||
Span<T> output, const std::vector<std::size_t>& outStride,
|
||||
View<T> input, const std::vector<std::size_t>& inStride,
|
||||
const std::vector<std::pair<std::size_t, std::size_t>>& ranges)
|
||||
{
|
||||
CV_Assert(outStride.size() == Rank);
|
||||
CV_Assert(inStride.size() == Rank);
|
||||
CV_Assert(ranges.size() == Rank);
|
||||
|
||||
array<size_type, Rank> outStride_k, inStride_k;
|
||||
outStride_k.assign(std::begin(outStride), std::end(outStride));
|
||||
inStride_k.assign(std::begin(inStride), std::end(inStride));
|
||||
|
||||
array<index_type, Rank> start_k, end_k;
|
||||
for (int i = 0; i < Rank; i++) {
|
||||
start_k[i] = ranges[i].first;
|
||||
end_k[i] = ranges[i].second;
|
||||
}
|
||||
|
||||
auto kernel = raw::copy_with_reflection101<T, Rank>;
|
||||
auto policy = make_policy(kernel, output.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, output, outStride_k, start_k, end_k, input, inStride_k);
|
||||
}
|
||||
|
||||
GENERATE_KERNEL_DISPATCHER(copy_with_reflection101_dispatcher, launch_copy_with_reflection101);
|
||||
|
||||
template <class T>
|
||||
void copy_with_reflection101(
|
||||
const Stream& stream,
|
||||
TensorSpan<T> output, TensorView<T> input,
|
||||
std::vector<std::pair<std::size_t, std::size_t>> ranges)
|
||||
{
|
||||
CV_Assert(output.rank() == input.rank());
|
||||
CV_Assert(output.rank() == ranges.size());
|
||||
|
||||
/* squeezable axes at the beginning of both tensors can be eliminated
|
||||
*
|
||||
* Reasoning:
|
||||
* ----------
|
||||
* Suppose an item's indices in the input tensor are [i1, i2, ...]. The indices in the
|
||||
* output tensor will be [i1 + off1, i2 + off2, ...]. The rest of the elements in the output are padding.
|
||||
* The padding operation essentially copies items from the input tensor to new locations in the output tensor
|
||||
* and pads the remaining.
|
||||
*
|
||||
* If the size of the first axis of the input and output tensor is unity, the input and output indices
|
||||
* for all the elements will be of the form [0, i2, ...] and [0, i2 + off2, ...] respectively. Note that
|
||||
* there cannot be extra padding since the axes have unit size. The first index does not contribute to the
|
||||
* element's address calculation and hence does nothing apart from eating up a few cycles.
|
||||
*/
|
||||
while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
|
||||
CV_Assert(ranges[0].first == 0 && ranges[0].second == 1);
|
||||
|
||||
input.squeeze(0);
|
||||
output.squeeze(0);
|
||||
ranges.erase(std::begin(ranges));
|
||||
|
||||
CV_Assert(output.rank() == input.rank());
|
||||
CV_Assert(output.rank() == ranges.size());
|
||||
}
|
||||
|
||||
auto inShape = input.shape_as_vector();
|
||||
auto outShape = output.shape_as_vector();
|
||||
|
||||
/* contiguous axes which do not have any padding can be combined into one axis
|
||||
*
|
||||
* Reasoning:
|
||||
* ----------
|
||||
* Suppose an item's indices in the input tensor are [i1, i2, i3, ...]. Let the first two axes not have any
|
||||
* padding. The indices in the output tensor will be [i1, i2, i3 + off3, ...].
|
||||
*
|
||||
* Each axis in the contiguous unpadded axes sequence will add an offset of iN * strideN. In the above example,
|
||||
* the two axes add a total offset of `i1 * stride1 + i2 * stride2`. We can merge the two axes into one axis with
|
||||
* a size of `size1 * size2`. The new offset added will be `i12 * stride2` as the kernel iterates through `i12`.
|
||||
* Note that `i12` is actually `(i1 * size2 + i2)` in the original tensor.
|
||||
*/
|
||||
for (int i = 0; i < inShape.size(); i++) {
|
||||
/* check if axis `i` requires any padding */
|
||||
if (ranges[i].first == 0 && ranges[i].second == inShape[i]) {
|
||||
/* loop invariant: `i` is the first axis in the contiguous unpadded axis sequence */
|
||||
CV_Assert(inShape[i] == outShape[i]);
|
||||
|
||||
/* we now iterate through the axes which follow and try to merge */
|
||||
int j = i + 1; /* `j` is the axis which we will attempt to merge */
|
||||
while (j < inShape.size() && ranges[j].first == 0 && ranges[j].second == inShape[j]) {
|
||||
CV_Assert(inShape[j] == outShape[j]);
|
||||
|
||||
/* `j` is also unpadded; merge `i` and `j` */
|
||||
auto new_size = inShape[i] * inShape[j];
|
||||
inShape[i] = new_size;
|
||||
outShape[i] = new_size;
|
||||
ranges[i].second = new_size;
|
||||
|
||||
/* delete axis `j` */
|
||||
inShape.erase(std::begin(inShape) + j);
|
||||
outShape.erase(std::begin(outShape) + j);
|
||||
ranges.erase(std::begin(ranges) + j);
|
||||
|
||||
/* optimizations should not break the invariants */
|
||||
CV_Assert(inShape.size() == outShape.size());
|
||||
CV_Assert(inShape.size() == ranges.size());
|
||||
CV_Assert(inShape[i] == outShape[i]);
|
||||
CV_Assert(ranges[i].first == 0 && ranges[i].second == inShape[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto rank = inShape.size();
|
||||
|
||||
std::vector<std::size_t> inStride(rank), outStride(rank);
|
||||
inStride.back() = 1;
|
||||
outStride.back() = 1;
|
||||
/* garbage, ..., garbage, 1 */
|
||||
|
||||
std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
|
||||
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
|
||||
/* dim[0], dim[1], ..., dim[-1], 1 */
|
||||
|
||||
std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<int>());
|
||||
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<int>());
|
||||
/* stride[0], stride[1], ..., stride[-2], 1 */
|
||||
|
||||
CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
|
||||
copy_with_reflection101_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, input, inStride, ranges);
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void copy_with_reflection101(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
|
||||
#endif
|
||||
template void copy_with_reflection101(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
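copy_with_reflection101 above maps every output index back to a source index: inside the copy region the mapping is out_index - start, in the left reflection region it is |out_index - start|, and in the right region it is (end - start) - (out_index - end) - 2. The 1-D host sketch below (ours) reproduces that mapping.

// Reflection-101 index mapping in 1-D. Illustrative only; names are ours.
#include <cstdio>
#include <cstdlib>
#include <vector>

int main() {
    std::vector<char> in = {'a', 'b', 'c', 'd'};
    const int pad = 2, start = pad, end = start + (int)in.size();

    for (int o = 0; o < (int)in.size() + 2 * pad; o++) {
        int i = std::abs(o - start);                     // left reflection or direct copy
        if (o >= end)
            i = (end - start) - (o - end) - 2;           // right reflection
        std::printf("%c ", in[i]);                       // prints: c b a b c d c b
    }
    std::printf("\n");
}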
288
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/permute.cu
vendored
Normal file
288
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/permute.cu
vendored
Normal file
@ -0,0 +1,288 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "array.hpp"
|
||||
#include "types.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "kernel_dispatcher.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include "../cuda4dnn/kernels/fill_copy.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
#include <vector>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, std::size_t Rank>
|
||||
__global__ void permute(
|
||||
array<index_type, Rank> axis_order,
|
||||
Span<T> output, array<size_type, Rank> outStrides,
|
||||
View<T> input, array<size_type, Rank> inStrides)
|
||||
{
|
||||
for (auto i : grid_stride_range(input.size())) {
|
||||
index_type oldPosition = 0;
|
||||
index_type newPosition = i;
|
||||
|
||||
for (int j = 0; j < Rank; j++)
|
||||
{
|
||||
auto order = axis_order[j];
|
||||
oldPosition += (newPosition / outStrides[j]) * inStrides[order];
|
||||
newPosition %= outStrides[j];
|
||||
}
|
||||
|
||||
output[i] = input[oldPosition];
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, int TILE_SIZE, int ROWS_PER_THREAD>
|
||||
__global__ void transpose(Span<T> output, View<T> input, size_type in_width, size_type out_width)
|
||||
{
|
||||
__shared__ T tile[TILE_SIZE][TILE_SIZE + 1];
|
||||
|
||||
/* blockDim.y = TILE_SIZE / ROWS_PER_THREAD, blockDim.x = TILE_SIZE */
|
||||
const index_type in_x = blockIdx.x * TILE_SIZE + threadIdx.x;
|
||||
const index_type in_y_begin = blockIdx.y * TILE_SIZE + threadIdx.y;
|
||||
|
||||
/* Every valid input location has a corresponding output location and vice versa.
|
||||
* Hence, if we do not load values into the shared memory for a given location, we
|
||||
* also won't read them for storing in the output.
|
||||
*/
|
||||
for (int j = 0; j < TILE_SIZE; j += TILE_SIZE / ROWS_PER_THREAD)
|
||||
{
|
||||
const auto in_y_current = in_y_begin + j;
|
||||
if (in_x < in_width && in_y_current < out_width)
|
||||
tile[threadIdx.y + j][threadIdx.x] = input[in_y_current * in_width + in_x];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
/* We interchange `threadIdx.x` and `threadIdx.y` so that consecutive output indices map to
|
||||
* consecutive threads. This would allow writes across threads in a warp to be coalesced.
|
||||
*/
|
||||
const index_type out_x = blockIdx.y * TILE_SIZE + threadIdx.x;
|
||||
const index_type out_y_begin = blockIdx.x * TILE_SIZE + threadIdx.y;
|
||||
|
||||
for (int j = 0; j < TILE_SIZE; j += TILE_SIZE / ROWS_PER_THREAD)
|
||||
{
|
||||
const auto out_y_current = out_y_begin + j;
|
||||
if (out_x < out_width && out_y_current < in_width)
|
||||
output[out_y_current * out_width + out_x] = tile[threadIdx.x][threadIdx.y + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void transpose(const Stream& stream, Span<T> output, View<T> input, std::size_t in_width, std::size_t out_width)
|
||||
{
|
||||
/* Each block processes a TILE_SIZE x TILE_SIZE piece */
|
||||
constexpr int TILE_SIZE = 32;
|
||||
|
||||
/* Each thread processes ROWS_PER_THREAD rows. We do this to decrease the number of threads required
|
||||
* in a block so that the cost of the block-wide synchronization is minimized.
|
||||
*/
|
||||
constexpr int ROWS_PER_THREAD = 4;
|
||||
|
||||
dim3 grid_size((in_width + TILE_SIZE - 1) / TILE_SIZE, (out_width + TILE_SIZE - 1) / TILE_SIZE);
|
||||
dim3 block_size(TILE_SIZE, TILE_SIZE / ROWS_PER_THREAD);
|
||||
auto policy = execution_policy(grid_size, block_size, stream);
|
||||
|
||||
auto kernel = raw::transpose<T, TILE_SIZE, ROWS_PER_THREAD>;
|
||||
launch_kernel(kernel, policy, output, input, in_width, out_width);
|
||||
}
|
||||
|
||||
template void transpose(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t);
|
||||
template void transpose(const Stream&, Span<float>, View<float>, std::size_t, std::size_t);
|
||||
|
||||
template <class T, std::size_t Rank> static
|
||||
void launch_permute_kernel(
|
||||
const Stream& stream,
|
||||
const std::vector<std::size_t>& order,
|
||||
Span<T> output, const std::vector<std::size_t>& outStride,
|
||||
View<T> input, const std::vector<std::size_t>& inStride)
|
||||
{
|
||||
CV_Assert(order.size() == Rank);
|
||||
CV_Assert(outStride.size() == Rank);
|
||||
CV_Assert(inStride.size() == Rank);
|
||||
|
||||
array<index_type, Rank> order_k;
|
||||
order_k.assign(std::begin(order), std::end(order));
|
||||
|
||||
array<size_type, Rank> outStride_k, inStride_k;
|
||||
outStride_k.assign(std::begin(outStride), std::end(outStride));
|
||||
inStride_k.assign(std::begin(inStride), std::end(inStride));
|
||||
|
||||
auto kernel = raw::permute<T, Rank>;
|
||||
auto policy = make_policy(kernel, input.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, order_k, output, outStride_k, input, inStride_k);
|
||||
}
|
||||
|
||||
GENERATE_KERNEL_DISPATCHER(permute_dispatcher, launch_permute_kernel);
|
||||
|
||||
template <class T>
|
||||
void permute(
|
||||
const Stream& stream,
|
||||
TensorSpan<T> output, TensorView<T> input,
|
||||
std::vector<std::size_t> order)
|
||||
{
|
||||
CV_Assert(output.rank() == input.rank());
|
||||
CV_Assert(input.rank() == order.size());
|
||||
CV_Assert(input.size() == output.size());
|
||||
|
||||
auto rank = output.rank();
|
||||
auto inShape = input.shape_as_vector();
|
||||
auto outShape = output.shape_as_vector();
|
||||
|
||||
/* singleton axes do not contribute towards address calculation
|
||||
*
|
||||
* Reasoning:
|
||||
* ----------
|
||||
* Suppose an item's indices in the input tensor are [i1, i2, ...]. The indices in the
|
||||
* output tensor will be some permutation of the input tensor indices. Let the output
|
||||
* tensor indices be [o1, o2, ...]. The permutation operation essentially copies items
|
||||
* from the input tensor to new locations in the output tensor as dictated by the indices.
|
||||
*
|
||||
* If the size of the nth axis (say i2) of the input is one, the input and output indices for
|
||||
* all the elements will be of the form [i1, 0, ...] and [..., 0, ...] respectively.
|
||||
* The index does not contribute to the element's address calculation and hence would give
|
||||
* an identical result if it weren't there.
|
||||
*/
|
||||
for (int i = 0; i < rank; i++)
|
||||
{
|
||||
/* index `i` corresponds to the axis index in the output; order[i] has the corresponding axis index in the input */
|
||||
while (i < rank && outShape[i] == 1)
|
||||
{
|
||||
int in_i = order[i];
|
||||
CV_Assert(inShape[in_i] == 1);
|
||||
|
||||
/* delete axis `i` */
|
||||
inShape.erase(std::begin(inShape) + in_i);
|
||||
outShape.erase(std::begin(outShape) + i);
|
||||
|
||||
/* deletion of an axis reduces an axis in the input tensor which would cause the indices
|
||||
* of the axes that come after the deleted axis to reduce by one
|
||||
*/
|
||||
order.erase(order.begin() + i);
|
||||
for (auto& axis : order)
|
||||
if (axis > in_i)
|
||||
axis--;
|
||||
|
||||
rank--;
|
||||
|
||||
/* optimizations should not break the invariants */
|
||||
CV_Assert(rank == order.size());
|
||||
CV_Assert(inShape.size() == order.size());
|
||||
CV_Assert(outShape.size() == order.size());
|
||||
CV_Assert(input.size() == output.size());
|
||||
}
|
||||
}
|
||||
|
||||
/* contiguous axes whose relative ordering stays same before and after permutation can be merged into one axis
|
||||
* example: in permute order 0 2 3 1, axes 2 and 3 can be grouped into a single axis
|
||||
*
|
||||
* Reasoning:
|
||||
* ----------
|
||||
* Suppose an item's indices in the input tensor are [i0, i1, i2, i3, ...]. Let the permutation order be [0, 3, 1, 2, ...].
|
||||
* Note that i1 and i2 are adjacent axes in the same order in input as well as output. The indices in the output tensor
|
||||
* will be [i0, i3, i1, i2, ...].
|
||||
*
|
||||
* Each axis in the contiguous axes sequence will add an offset of iN * strideN. In the above example,
|
||||
* the two axes add a total offset of `i1 * (size2 * stride2) + i2 * stride2` which is `(i1 * size2 + i2) * stride2`,
|
||||
* in both input and output. Note stride2 can be different in the input and output. We can merge the two axes into one axis
|
||||
* with a size of `size1 * size2`. The new offset added will be `i12 * stride12` as the kernel iterates through `i12`. Note
|
||||
* that `i12` is actually `(i1 * size2 + i2)` and `stride12` is `stride2`.
|
||||
*/
|
||||
for (int i = 0; i < rank; i++) {
|
||||
/* the indices used in the loops such as `i` and `j` are axis indices in the output tensor */
|
||||
/* the corresponding input axis indices are `order[i]` and `order[j]`*/
|
||||
|
||||
/* loop invariant: `i` is the first axis in the contiguous unpermuted axis sequence */
|
||||
|
||||
int j = i + 1; /* `j` is the axis which we will attempt to merge */
|
||||
while (j < rank && (order[i] + 1) == order[j]) {
|
||||
/* axis `i` and axis `j` do not change relative order */
|
||||
|
||||
auto in_i = order[i], in_j = order[j];
|
||||
|
||||
auto new_size = inShape[in_i] * inShape[in_j];
|
||||
inShape[in_i] = new_size;
|
||||
outShape[i] = new_size;
|
||||
|
||||
/* delete axis `j` */
|
||||
inShape.erase(std::begin(inShape) + in_j);
|
||||
outShape.erase(std::begin(outShape) + j);
|
||||
|
||||
/* deletion of an axis reduces an axis in the input tensor which would cause the indices
|
||||
* of the axes that come after the deleted axis to reduce by one
|
||||
*/
|
||||
order.erase(order.begin() + j);
|
||||
for (auto& axis : order)
|
||||
if (axis > order[i])
|
||||
axis--;
|
||||
|
||||
rank--;
|
||||
|
||||
/* optimizations should not break the invariants */
|
||||
CV_Assert(rank == order.size());
|
||||
CV_Assert(inShape.size() == order.size());
|
||||
CV_Assert(outShape.size() == order.size());
|
||||
CV_Assert(input.size() == output.size());
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::size_t> inStride(rank), outStride(rank);
|
||||
inStride.back() = 1;
|
||||
outStride.back() = 1;
|
||||
/* garbage, ..., garbage, 1 */
|
||||
|
||||
std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
|
||||
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
|
||||
/* dim[0], dim[1], ..., dim[-1], 1 */
|
||||
|
||||
std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>());
|
||||
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
|
||||
/* stride[0], stride[1], ..., stride[-2], 1 */
|
||||
|
||||
const bool is_in_order = [&order] {
|
||||
for (int i = 0; i < order.size(); i++)
|
||||
if (order[i] != i)
|
||||
return false;
|
||||
return true;
|
||||
}();
|
||||
|
||||
if (is_in_order)
|
||||
{
|
||||
kernels::copy<T>(stream, output, input);
|
||||
}
|
||||
else if(rank == 2)
|
||||
{
|
||||
/* use the more efficient transpose kernel */
|
||||
transpose<T>(stream, output, input, inShape[1], outShape[1]);
|
||||
}
|
||||
else
|
||||
{
|
||||
CV_Assert(3 <= rank && rank <= CSL_MAX_TENSOR_RANK);
|
||||
permute_dispatcher<T, 3, CSL_MAX_TENSOR_RANK>(rank, stream, order, output, outStride, input, inStride);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void permute(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
|
||||
#endif
|
||||
template void permute(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
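raw::permute above computes, for every output linear index, the corresponding input offset by decomposing the index with the output strides and re-accumulating the digits with the input strides of the permuted axes. The host sketch below (ours) applies the same arithmetic to a 2x3 -> 3x2 transpose with order = {1, 0}.

// Generic permute index arithmetic on the CPU. Illustrative only.
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> in = {1, 2, 3, 4, 5, 6};            // shape 2x3, row-major
    std::vector<int> in_strides  = {3, 1};
    std::vector<int> out_strides = {2, 1};               // output shape 3x2
    std::vector<int> order = {1, 0};                     // output axis j comes from input axis order[j]

    std::vector<int> out(in.size());
    for (int i = 0; i < (int)in.size(); i++) {
        int remaining = i, old_pos = 0;
        for (int j = 0; j < (int)order.size(); j++) {
            old_pos   += (remaining / out_strides[j]) * in_strides[order[j]];
            remaining %= out_strides[j];
        }
        out[i] = in[old_pos];
    }
    for (int v : out) std::printf("%d ", v);             // prints: 1 4 2 5 3 6
    std::printf("\n");
}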
176
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/prior_box.cu
vendored
Normal file
176
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/prior_box.cu
vendored
Normal file
@ -0,0 +1,176 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "array.hpp"
|
||||
#include "math.hpp"
|
||||
#include "types.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, bool Normalize>
|
||||
__global__ void prior_box(
|
||||
Span<T> output,
|
||||
View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
|
||||
size_type layerWidth, size_type layerHeight,
|
||||
size_type imageWidth, size_type imageHeight)
|
||||
{
|
||||
/* each box consists of two pairs of coordinates and hence 4 values in total */
|
||||
/* since the entire output (the first channel at least) consists of these boxes,
|
||||
* we are guaranteed that the output is aligned to a boundary of 4 values
|
||||
*/
|
||||
using vector_type = get_vector_type_t<T, 4>;
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
|
||||
/* num_points contains the number of points in the feature map of interest
|
||||
* each iteration of the stride loop selects a point and generates prior boxes for it
|
||||
*/
|
||||
size_type num_points = layerWidth * layerHeight;
|
||||
for (auto idx : grid_stride_range(num_points)) {
|
||||
const index_type x = idx % layerWidth,
|
||||
y = idx / layerWidth;
|
||||
|
||||
index_type output_offset_v4 = idx * offsetX.size() * boxWidth.size();
|
||||
for (int i = 0; i < boxWidth.size(); i++) {
|
||||
for (int j = 0; j < offsetX.size(); j++) {
|
||||
float center_x = (x + offsetX[j]) * stepX;
|
||||
float center_y = (y + offsetY[j]) * stepY;
|
||||
|
||||
vector_type vec;
|
||||
if(Normalize) {
|
||||
vec.data[0] = (center_x - boxWidth[i] * 0.5f) / imageWidth;
|
||||
vec.data[1] = (center_y - boxHeight[i] * 0.5f) / imageHeight;
|
||||
vec.data[2] = (center_x + boxWidth[i] * 0.5f) / imageWidth;
|
||||
vec.data[3] = (center_y + boxHeight[i] * 0.5f) / imageHeight;
|
||||
} else {
|
||||
vec.data[0] = center_x - boxWidth[i] * 0.5f;
|
||||
vec.data[1] = center_y - boxHeight[i] * 0.5f;
|
||||
vec.data[2] = center_x + boxWidth[i] * 0.5f - 1.0f;
|
||||
vec.data[3] = center_y + boxHeight[i] * 0.5f - 1.0f;
|
||||
}
|
||||
|
||||
v_store(output_vPtr[output_offset_v4], vec);
|
||||
output_offset_v4++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void prior_box_clip(Span<T> output) {
|
||||
for (auto i : grid_stride_range(output.size())) {
|
||||
using device::clamp;
|
||||
output[i] = clamp<T>(output[i], 0.0, 1.0);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void prior_box_set_variance1(Span<T> output, float variance) {
|
||||
using vector_type = get_vector_type_t<T, 4>;
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
for (auto i : grid_stride_range(output.size() / 4)) {
|
||||
vector_type vec;
|
||||
for (int j = 0; j < 4; j++)
|
||||
vec.data[j] = variance;
|
||||
v_store(output_vPtr[i], vec);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void prior_box_set_variance4(Span<T> output, array<float, 4> variance) {
|
||||
using vector_type = get_vector_type_t<T, 4>;
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
for (auto i : grid_stride_range(output.size() / 4)) {
|
||||
vector_type vec;
|
||||
for(int j = 0; j < 4; j++)
|
||||
vec.data[j] = variance[j];
|
||||
v_store(output_vPtr[i], vec);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, bool Normalize> static
|
||||
void launch_prior_box_kernel(
|
||||
const Stream& stream,
|
||||
Span<T> output, View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
|
||||
std::size_t layerWidth, std::size_t layerHeight, std::size_t imageWidth, std::size_t imageHeight)
|
||||
{
|
||||
auto num_points = layerWidth * layerHeight;
|
||||
auto kernel = raw::prior_box<T, Normalize>;
|
||||
auto policy = make_policy(kernel, num_points, 0, stream);
|
||||
launch_kernel(kernel, policy,
|
||||
output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
|
||||
layerWidth, layerHeight, imageWidth, imageHeight);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void generate_prior_boxes(
|
||||
const Stream& stream,
|
||||
Span<T> output,
|
||||
View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
|
||||
std::vector<float> variance,
|
||||
std::size_t numPriors,
|
||||
std::size_t layerWidth, std::size_t layerHeight,
|
||||
std::size_t imageWidth, std::size_t imageHeight,
|
||||
bool normalize, bool clip)
|
||||
{
|
||||
if (normalize) {
|
||||
launch_prior_box_kernel<T, true>(
|
||||
stream, output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
|
||||
layerWidth, layerHeight, imageWidth, imageHeight
|
||||
);
|
||||
} else {
|
||||
launch_prior_box_kernel<T, false>(
|
||||
stream, output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
|
||||
layerWidth, layerHeight, imageWidth, imageHeight
|
||||
);
|
||||
}
|
||||
|
||||
std::size_t channel_size = layerHeight * layerWidth * numPriors * 4;
|
||||
CV_Assert(channel_size * 2 == output.size());
|
||||
|
||||
if (clip) {
|
||||
auto output_span_c1 = Span<T>(output.data(), channel_size);
|
||||
auto kernel = raw::prior_box_clip<T>;
|
||||
auto policy = make_policy(kernel, output_span_c1.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, output_span_c1);
|
||||
}
|
||||
|
||||
auto output_span_c2 = Span<T>(output.data() + channel_size, channel_size);
|
||||
if (variance.size() == 1) {
|
||||
auto kernel = raw::prior_box_set_variance1<T>;
|
||||
auto policy = make_policy(kernel, output_span_c2.size() / 4, 0, stream);
|
||||
launch_kernel(kernel, policy, output_span_c2, variance[0]);
|
||||
} else {
|
||||
array<float, 4> variance_k;
|
||||
variance_k.assign(std::begin(variance), std::end(variance));
|
||||
auto kernel = raw::prior_box_set_variance4<T>;
|
||||
auto policy = make_policy(kernel, output_span_c2.size() / 4, 0, stream);
|
||||
launch_kernel(kernel, policy, output_span_c2, variance_k);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void generate_prior_boxes(const Stream&, Span<__half>, View<float>, View<float>, View<float>, View<float>, float, float,
|
||||
std::vector<float>, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool, bool);
|
||||
#endif
|
||||
|
||||
template void generate_prior_boxes(const Stream&, Span<float>, View<float>, View<float>, View<float>, View<float>, float, float,
|
||||
std::vector<float>, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool, bool);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
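In the normalized branch of raw::prior_box above, every feature-map cell and every (box size, offset) pair produces one 4-tuple (xmin, ymin, xmax, ymax) scaled to [0, 1] by the image size; the second output channel is then filled with the variances. The host sketch below (ours, with made-up sample numbers) evaluates one such box.

// One prior box for a single cell in the normalized branch. Illustrative only.
#include <cstdio>

int main() {
    const float stepX = 16.f, stepY = 16.f;              // feature-map stride in pixels
    const float imageW = 320.f, imageH = 320.f;
    const int x = 3, y = 5;                              // cell coordinates
    const float boxW = 32.f, boxH = 64.f;                // one prior size
    const float offX = 0.5f, offY = 0.5f;                // one offset pair

    const float cx = (x + offX) * stepX, cy = (y + offY) * stepY;
    const float box[4] = {
        (cx - 0.5f * boxW) / imageW, (cy - 0.5f * boxH) / imageH,
        (cx + 0.5f * boxW) / imageW, (cy + 0.5f * boxH) / imageH
    };
    std::printf("%g %g %g %g\n", box[0], box[1], box[2], box[3]);
}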
216
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/region.cu
vendored
Normal file
@ -0,0 +1,216 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "math.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "limits.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
|
||||
template <class T>
|
||||
__global__ void region_box(
|
||||
Span<T> output, View<T> input, View<T> bias,
|
||||
size_type boxes_per_cell, size_type box_size,
|
||||
size_type rows, size_type cols, T scale_x_y,
|
||||
size_type height_norm, size_type width_norm,
|
||||
T object_prob_cutoff, bool new_coords)
|
||||
{
|
||||
using vector2_type = get_vector_type_t<T, 2>;
|
||||
auto bias_vPtr = vector2_type::get_pointer(bias.data());
|
||||
|
||||
for (auto box_index : grid_stride_range(output.size() / box_size)) {
|
||||
const auto box_of_the_cell = box_index % boxes_per_cell; /* box number within a cell */
|
||||
const auto box_offset = box_index * box_size;
|
||||
|
||||
const auto batch_inner_size = rows * cols * boxes_per_cell;
|
||||
const auto row_inner_size = cols * boxes_per_cell;
|
||||
const auto col_inner_size = boxes_per_cell;
|
||||
|
||||
const auto y = (box_index % batch_inner_size) / row_inner_size;
|
||||
const auto x = (box_index % row_inner_size) / col_inner_size;
|
||||
|
||||
/* When new_coords is true, we shouldn't use logistic activation again */
|
||||
T objectness_prob;
|
||||
if (new_coords)
|
||||
{
|
||||
const auto tmp_x = (input[box_offset + 0] - static_cast<T>(0.5)) * scale_x_y + static_cast<T>(0.5);
|
||||
const auto tmp_y = (input[box_offset + 1] - static_cast<T>(0.5)) * scale_x_y + static_cast<T>(0.5);
|
||||
|
||||
output[box_offset + 0] = fast_divide_ftz(static_cast<T>(x) + tmp_x, static_cast<T>(cols));
|
||||
output[box_offset + 1] = fast_divide_ftz(static_cast<T>(y) + tmp_y, static_cast<T>(rows));
|
||||
|
||||
vector2_type bias_xy;
|
||||
v_load(bias_xy, bias_vPtr[box_of_the_cell]);
|
||||
|
||||
output[box_offset + 2] = input[box_offset + 2] * input[box_offset + 2] *
|
||||
static_cast<T>(4) * bias_xy.data[0] / static_cast<T>(width_norm);
|
||||
output[box_offset + 3] = input[box_offset + 3] * input[box_offset + 3] *
|
||||
static_cast<T>(4) * bias_xy.data[1] / static_cast<T>(height_norm);
|
||||
|
||||
objectness_prob = input[box_offset + 4];
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto tmp_x = (fast_sigmoid(input[box_offset + 0]) - static_cast<T>(0.5)) * scale_x_y + static_cast<T>(0.5);
|
||||
const auto tmp_y = (fast_sigmoid(input[box_offset + 1]) - static_cast<T>(0.5)) * scale_x_y + static_cast<T>(0.5);
|
||||
|
||||
output[box_offset + 0] = fast_divide_ftz(static_cast<T>(x) + tmp_x, static_cast<T>(cols));
|
||||
output[box_offset + 1] = fast_divide_ftz(static_cast<T>(y) + tmp_y, static_cast<T>(rows));
|
||||
|
||||
vector2_type bias_xy;
|
||||
v_load(bias_xy, bias_vPtr[box_of_the_cell]);
|
||||
|
||||
output[box_offset + 2] = fast_exp(input[box_offset + 2]) * bias_xy.data[0] / static_cast<T>(width_norm);
|
||||
output[box_offset + 3] = fast_exp(input[box_offset + 3]) * bias_xy.data[1] / static_cast<T>(height_norm);
|
||||
|
||||
/* squash objectness score into a probability */
|
||||
objectness_prob = fast_sigmoid(input[box_offset + 4]);
|
||||
}
|
||||
|
||||
/* ignore prediction if the objectness probability is less than the cutoff */
|
||||
if (objectness_prob < object_prob_cutoff)
|
||||
objectness_prob = 0;
|
||||
|
||||
output[box_offset + 4] = objectness_prob;
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void region_sigmoid_class_score(Span<T> output, View<T> input, T class_prob_cutoff,
|
||||
size_type box_size, bool new_coords)
|
||||
{
|
||||
for (auto idx : grid_stride_range(output.size())) {
|
||||
const index_type box_no = idx / box_size;
|
||||
const index_type start_of_box = box_no * box_size;
|
||||
const index_type box_offset = idx % box_size;
|
||||
|
||||
if (box_offset < 5) {
|
||||
/* continue as we have already processed these in region_box */
|
||||
continue;
|
||||
}
|
||||
|
||||
auto objectness_prob = output[start_of_box + 4];
|
||||
|
||||
/* the class probabilities we currently have are conditional class probabilities
|
||||
* given the object
|
||||
*
|
||||
* to obtain the actual class probability, we multiply the conditional probability
|
||||
* with the object probability
|
||||
*
|
||||
* when new_coords is true, we shouldn't use logistic activation again.
|
||||
*/
|
||||
|
||||
T actual_class_prob;
|
||||
if (new_coords)
|
||||
{
|
||||
actual_class_prob = objectness_prob * input[idx];
|
||||
}
|
||||
else
|
||||
{
|
||||
actual_class_prob = objectness_prob * fast_sigmoid(input[idx]);
|
||||
}
|
||||
|
||||
if (actual_class_prob <= class_prob_cutoff)
|
||||
actual_class_prob = T(0);
|
||||
output[idx] = actual_class_prob;
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void region_softmax_class_score(Span<T> output, View<T> input, T class_prob_cutoff, size_type box_size) {
|
||||
for (auto box_no : grid_stride_range(output.size() / box_size)) {
|
||||
const index_type start_of_box = box_no * box_size;
|
||||
const index_type start_idx = start_of_box + 5;
|
||||
const index_type end_idx = start_of_box + box_size;
|
||||
|
||||
auto largest = numeric_limits<T>::lowest();
|
||||
for (int idx = start_idx; idx < end_idx; idx++) {
|
||||
using device::max;
|
||||
largest = max(largest, input[idx]);
|
||||
}
|
||||
|
||||
auto sum = T(0);
|
||||
for (int idx = start_idx; idx < end_idx; idx++) {
|
||||
using device::exp;
|
||||
auto temp = exp(input[idx] - largest);
|
||||
sum += temp;
|
||||
output[idx] = temp;
|
||||
}
|
||||
|
||||
for (int idx = start_idx; idx < end_idx; idx++) {
|
||||
auto softmax_score = output[idx] / sum;
|
||||
|
||||
/* the class probabilities we currently have are conditional class probabilities
|
||||
* given the object
|
||||
*
|
||||
* to obtain the actual class probability, we multiply the conditional probability
|
||||
* with the object probability
|
||||
*/
|
||||
auto objectness_prob = output[start_of_box + 4];
|
||||
auto actual_class_prob = objectness_prob * softmax_score;
|
||||
if (actual_class_prob <= class_prob_cutoff)
|
||||
actual_class_prob = T(0);
|
||||
output[idx] = actual_class_prob;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void region(const Stream& stream, Span<T> output, View<T> input, View<T> bias,
|
||||
T object_prob_cutoff, T class_prob_cutoff,
|
||||
std::size_t boxes_per_cell, std::size_t box_size,
|
||||
std::size_t rows, std::size_t cols, T scale_x_y,
|
||||
std::size_t height_norm, std::size_t width_norm,
|
||||
bool if_true_sigmoid_else_softmax, /* true = sigmoid, false = softmax */
|
||||
bool new_coords)
|
||||
{
|
||||
CV_Assert(output.size() == input.size());
|
||||
CV_Assert(output.size() % box_size == 0);
|
||||
CV_Assert(is_fully_aligned(bias, 2));
|
||||
|
||||
auto box_kernel = raw::region_box<T>;
|
||||
auto box_policy = make_policy(box_kernel, output.size() / box_size, 0, stream);
|
||||
launch_kernel(box_kernel, box_policy,
|
||||
output, input, bias, boxes_per_cell, box_size,
|
||||
rows, cols, scale_x_y, height_norm, width_norm,
|
||||
object_prob_cutoff, new_coords);
|
||||
|
||||
if (if_true_sigmoid_else_softmax) {
|
||||
auto kernel_score = raw::region_sigmoid_class_score<T>;
|
||||
auto policy_score = make_policy(kernel_score, output.size(), 0, stream);
|
||||
launch_kernel(kernel_score, policy_score, output, input, class_prob_cutoff, box_size, new_coords);
|
||||
} else {
|
||||
auto kernel_score = raw::region_softmax_class_score<T>;
|
||||
auto policy_score = make_policy(kernel_score, output.size(), 0, stream);
|
||||
launch_kernel(kernel_score, policy_score, output, input, class_prob_cutoff, box_size);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void region(const Stream&, Span<__half>, View<__half>, View<__half>,
|
||||
__half, __half, std::size_t, std::size_t, std::size_t, std::size_t, __half, std::size_t, std::size_t, bool, bool);
|
||||
#endif
|
||||
|
||||
template void region(const Stream&, Span<float>, View<float>, View<float>,
|
||||
float, float, std::size_t, std::size_t, std::size_t, std::size_t, float, std::size_t, std::size_t, bool, bool);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
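For reference, the decoding that region_box performs for a single prediction when new_coords is false can be written as a short host-side function. This is a sketch; the function and struct names are illustrative and not part of the OpenCV API:

#include <cmath>

struct DecodedBox { float x, y, w, h, objectness; };

// Decodes one YOLO-style box the way the kernel above does when new_coords is
// false: logistic activation on x/y/objectness, exponential on w/h with anchors.
static DecodedBox decode_region_box(const float* box /* tx, ty, tw, th, to */,
                                    float anchor_w, float anchor_h,
                                    int col, int row, int cols, int rows,
                                    int width_norm, int height_norm,
                                    float scale_x_y, float object_prob_cutoff)
{
    auto sigmoid = [](float v) { return 1.0f / (1.0f + std::exp(-v)); };
    DecodedBox out;
    const float tx = (sigmoid(box[0]) - 0.5f) * scale_x_y + 0.5f;
    const float ty = (sigmoid(box[1]) - 0.5f) * scale_x_y + 0.5f;
    out.x = (col + tx) / cols;
    out.y = (row + ty) / rows;
    out.w = std::exp(box[2]) * anchor_w / width_norm;
    out.h = std::exp(box[3]) * anchor_h / height_norm;
    out.objectness = sigmoid(box[4]);
    if (out.objectness < object_prob_cutoff)
        out.objectness = 0.0f;
    return out;
}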
|
245
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/resize.cu
vendored
Normal file
@ -0,0 +1,245 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "math.hpp"
|
||||
#include "types.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "memory.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, std::size_t CHANNELS_PER_ITER>
|
||||
__global__ void resize_nn(
|
||||
Span<T> output, size_type out_height, size_type out_width,
|
||||
View<T> input, size_type in_height, size_type in_width,
|
||||
float o2i_fy, float o2i_fx, bool round, bool half_pixel_centers)
|
||||
{
|
||||
auto in_image_size = in_height * in_width;
|
||||
auto out_image_size = out_height * out_width;
|
||||
|
||||
/* think of the output and input as a collection of 2d images with the last axis
|
||||
* representing the width and the last but one axis representing the height
|
||||
*
|
||||
* the remaining axes together form a collection of these images/channels
|
||||
*/
|
||||
auto num_effective_channels = output.size() / out_image_size;
|
||||
|
||||
/* we process multiple channels every iteration to reuse the identical computation
|
||||
* involved with the spatial dimensions
|
||||
*
|
||||
* if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need
|
||||
* (num_effective_channels / CHANNELS_PER_ITER) iterations per (x, y) location
|
||||
*/
|
||||
auto num_channel_iters_per_xy = (num_effective_channels / CHANNELS_PER_ITER);
|
||||
|
||||
/* we need `num_channel_iters_per_xy` iterations per (x, y) and there are `out_image_size`
|
||||
* combinations of (x, y); hence, we'll need `num_channel_iters_per_xy * out_image_size`
|
||||
* iterations in total to finish the resize operation
|
||||
*/
|
||||
auto iters_required = num_channel_iters_per_xy * out_image_size;
|
||||
for (auto iter : grid_stride_range(iters_required)) {
|
||||
const index_type c_start = (iter / out_image_size) * CHANNELS_PER_ITER;
|
||||
|
||||
/* note here that consecutive `iter` values will often have consecutive `x` values
|
||||
* => stores into output will be coalesced across threads
|
||||
*/
|
||||
const index_type y = (iter % out_image_size) / out_width;
|
||||
const index_type x = iter % out_width;
|
||||
|
||||
auto in_yf = half_pixel_centers ? (y + 0.5f) * o2i_fy : y * o2i_fy;
|
||||
auto in_xf = half_pixel_centers ? (x + 0.5f) * o2i_fx : x * o2i_fx;
|
||||
|
||||
using device::lround;
|
||||
index_type in_y = round ? lround(in_yf) : static_cast<index_type>(in_yf);
|
||||
index_type in_x = round ? lround(in_xf) : static_cast<index_type>(in_xf);
|
||||
|
||||
using device::min;
|
||||
in_y = min(in_y, in_height - 1);
|
||||
in_x = min(in_x, in_width - 1);
|
||||
|
||||
index_type in_idx = c_start * in_image_size + in_y * in_width + in_x;
|
||||
index_type out_idx = c_start * out_image_size + y * out_width + x;
|
||||
|
||||
for (int i = 0; i < CHANNELS_PER_ITER; i++) {
|
||||
output[out_idx] = load_ldg(input[in_idx]);
|
||||
|
||||
in_idx += in_image_size;
|
||||
out_idx += out_image_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t CHANNELS_PER_ITER>
|
||||
__global__ void resize_bilinear(
|
||||
Span<T> output, size_type out_height, size_type out_width,
|
||||
View<T> input, size_type in_height, size_type in_width,
|
||||
float o2i_fy, float o2i_fx, bool half_pixel_centers)
|
||||
{
|
||||
auto in_image_size = in_height * in_width;
|
||||
auto out_image_size = out_height * out_width;
|
||||
|
||||
/* think of the output and input as a collection of 2d images with the last axis
|
||||
* representing the width and the last but one axis representing the height
|
||||
*
|
||||
* the remaining axes together form a collection of these images/channels
|
||||
*/
|
||||
auto num_effective_channels = output.size() / out_image_size;
|
||||
|
||||
/* we process multiple channels every iteration to reuse the identical computation
|
||||
* involved with the spatial dimensions
|
||||
*
|
||||
* if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need
|
||||
* (num_effective_channels / CHANNELS_PER_ITER) iterations per (x, y) location
|
||||
*/
|
||||
auto num_channel_iters_per_xy = (num_effective_channels / CHANNELS_PER_ITER);
|
||||
|
||||
/* we need `num_channel_iters_per_xy` iterations per (x, y) and there are `out_image_size`
|
||||
* combinations of (x, y); hence, we'll need `num_channel_iters_per_xy * out_image_size`
|
||||
* iterations in total to finish the resize operation
|
||||
*/
|
||||
auto iters_required = num_channel_iters_per_xy * out_image_size;
|
||||
|
||||
for (auto iter : grid_stride_range(iters_required)) {
|
||||
const index_type c_start = (iter / out_image_size) * CHANNELS_PER_ITER;
|
||||
const index_type c_end = c_start + CHANNELS_PER_ITER;
|
||||
|
||||
/* note here that consecutive `iter` values will often have consecutive `x` values
|
||||
* => stores into output will be coalesced across threads
|
||||
*/
|
||||
const index_type y = (iter % out_image_size) / out_width;
|
||||
const index_type x = iter % out_width;
|
||||
|
||||
using device::max;
|
||||
auto in_x = half_pixel_centers ? max<float>((x + 0.5f) * o2i_fx - 0.5f, 0.0f) : x * o2i_fx;
|
||||
auto in_y = half_pixel_centers ? max<float>((y + 0.5f) * o2i_fy - 0.5f, 0.0f) : y * o2i_fy;
|
||||
|
||||
auto in_x0 = static_cast<index_type>(in_x);
|
||||
auto in_y0 = static_cast<index_type>(in_y);
|
||||
|
||||
using device::min;
|
||||
auto in_x1 = min<index_type>(in_x0 + 1, in_width - 1);
|
||||
auto in_y1 = min<index_type>(in_y0 + 1, in_height - 1);
|
||||
|
||||
index_type in_offset_r0 = c_start * in_image_size + in_y0 * in_width;
|
||||
index_type in_offset_r1 = c_start * in_image_size + in_y1 * in_width;
|
||||
index_type out_idx = c_start * out_image_size + y * out_width + x;
|
||||
|
||||
#pragma unroll 1 /* disable unrolling to reduce register pressure; the benefit is empirical */
|
||||
for (auto c = c_start; c < c_end; c++) {
|
||||
auto v_00 = load_ldg(input[in_offset_r0 + in_x0]),
|
||||
v_01 = load_ldg(input[in_offset_r0 + in_x1]),
|
||||
v_10 = load_ldg(input[in_offset_r1 + in_x0]),
|
||||
v_11 = load_ldg(input[in_offset_r1 + in_x1]);
|
||||
|
||||
output[out_idx] =
|
||||
v_00 +
|
||||
T(in_y - in_y0) * T(v_10 - v_00) +
|
||||
T(in_x - in_x0) * T(v_01 - v_00) +
|
||||
T(in_y - in_y0) * T(in_x - in_x0) * T(v_11 - v_01 - v_10 + v_00);
|
||||
|
||||
in_offset_r0 += in_image_size;
|
||||
in_offset_r1 += in_image_size;
|
||||
out_idx += out_image_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t CHANNELS_PER_ITER> static
|
||||
void launch_multichannel_resize_nn(const Stream& stream,
|
||||
Span<T> output, size_type out_height, size_type out_width,
|
||||
View<T> input, size_type in_height, size_type in_width,
|
||||
float scale_y, float scale_x, bool round, bool half_pixel_centers)
|
||||
{
|
||||
auto kernel = raw::resize_nn<T, CHANNELS_PER_ITER>;
|
||||
auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
|
||||
launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void resize_nn(const Stream& stream, TensorSpan<T> output, TensorView<T> input, float scale_y, float scale_x, bool round, bool half_pixel_centers) {
|
||||
auto out_height = output.get_axis_size(-2);
|
||||
auto out_width = output.get_axis_size(-1);
|
||||
|
||||
auto in_height = input.get_axis_size(-2);
|
||||
auto in_width = input.get_axis_size(-1);
|
||||
|
||||
auto num_effective_channels = input.size_range(0, 2);
|
||||
auto num_iters = num_effective_channels * out_height * out_width;
|
||||
|
||||
if (num_effective_channels % 32 == 0 && num_iters > 655360) {
|
||||
launch_multichannel_resize_nn<T, 32>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
|
||||
} else if (num_effective_channels % 16 == 0 && num_iters > 327680) {
|
||||
launch_multichannel_resize_nn<T, 16>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
|
||||
} else if (num_effective_channels % 8 == 0 && num_iters > 163840) {
|
||||
launch_multichannel_resize_nn<T, 8>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
|
||||
} else if (num_effective_channels % 4 == 0 && num_iters > 81920) {
|
||||
launch_multichannel_resize_nn<T, 4>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
|
||||
} else if (num_effective_channels % 2 == 0) {
|
||||
launch_multichannel_resize_nn<T, 2>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
|
||||
} else {
|
||||
launch_multichannel_resize_nn<T, 1>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void resize_nn<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, float, float, bool, bool);
|
||||
#endif
|
||||
template void resize_nn<float>(const Stream&, TensorSpan<float>, TensorView<float>, float, float, bool, bool);
|
||||
|
||||
template <class T, std::size_t CHANNELS_PER_ITER> static
|
||||
void launch_multichannel_resize_bilinear(const Stream& stream,
|
||||
Span<T> output, size_type out_height, size_type out_width,
|
||||
View<T> input, size_type in_height, size_type in_width,
|
||||
float scale_y, float scale_x, bool half_pixel_centers)
|
||||
{
|
||||
auto kernel = raw::resize_bilinear<T, CHANNELS_PER_ITER>;
|
||||
auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
|
||||
launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void resize_bilinear(const Stream& stream, TensorSpan<T> output, TensorView<T> input, float scale_y, float scale_x, bool half_pixel_centers) {
|
||||
auto out_height = output.get_axis_size(-2);
|
||||
auto out_width = output.get_axis_size(-1);
|
||||
|
||||
auto in_height = input.get_axis_size(-2);
|
||||
auto in_width = input.get_axis_size(-1);
|
||||
|
||||
auto num_effective_channels = input.size_range(0, 2);
|
||||
auto num_iters = num_effective_channels * out_height * out_width;
|
||||
|
||||
if (num_effective_channels % 16 == 0 && num_iters > 163840) {
|
||||
launch_multichannel_resize_bilinear<T, 16>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
|
||||
} else if (num_effective_channels % 8 == 0 && num_iters > 81920) {
|
||||
launch_multichannel_resize_bilinear<T, 8>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
|
||||
} else if (num_effective_channels % 4 == 0 && num_iters > 40960) {
|
||||
launch_multichannel_resize_bilinear<T, 4>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
|
||||
} else if (num_effective_channels % 2 == 0) {
|
||||
launch_multichannel_resize_bilinear<T, 2>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
|
||||
} else {
|
||||
launch_multichannel_resize_bilinear<T, 1>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void resize_bilinear<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, float, float, bool);
|
||||
#endif
|
||||
template void resize_bilinear<float>(const Stream&, TensorSpan<float>, TensorView<float>, float, float, bool);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
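The bilinear kernel above combines the four neighbouring input pixels v_00, v_01, v_10 and v_11 with the usual bilinear weights. The same arithmetic for one output location of a single-channel float image, as a host-side sketch (the helper name is an assumption):

#include <algorithm>

// Bilinear sample at fractional input coordinates (in_y, in_x), matching the
// v_00/v_01/v_10/v_11 combination used by the kernel above.
static float bilinear_sample(const float* image, int in_height, int in_width,
                             float in_y, float in_x)
{
    const int y0 = static_cast<int>(in_y);
    const int x0 = static_cast<int>(in_x);
    const int y1 = std::min(y0 + 1, in_height - 1);
    const int x1 = std::min(x0 + 1, in_width - 1);

    const float v00 = image[y0 * in_width + x0];
    const float v01 = image[y0 * in_width + x1];
    const float v10 = image[y1 * in_width + x0];
    const float v11 = image[y1 * in_width + x1];

    const float dy = in_y - y0, dx = in_x - x0;
    return v00 + dy * (v10 - v00) + dx * (v01 - v00)
               + dy * dx * (v11 - v01 - v10 + v00);
}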
|
181
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/roi_pooling.cu
vendored
Normal file
@ -0,0 +1,181 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "math.hpp"
|
||||
#include "limits.hpp"
|
||||
#include "types.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "memory.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
|
||||
template <class T, std::size_t CHANNELS_PER_ITER>
|
||||
__global__ void roi_pooling(
|
||||
Span<T> output, size_type pooled_height, size_type pooled_width,
|
||||
View<T> input, size_type in_height, size_type in_width,
|
||||
View<T> rois, size_type num_channels, float spatial_scale)
|
||||
{
|
||||
// input: [1, num_channels, in_height, in_width]
|
||||
const auto in_image_size = in_height * in_width;
|
||||
|
||||
// rois: [num_rois, 5]
|
||||
auto num_rois = rois.size() / 5;
|
||||
|
||||
// output: [num_rois, num_channels, pooled_height, pooled_width]
|
||||
const auto out_spatial_size = pooled_height * pooled_width;
|
||||
const auto out_roi_size = num_channels * out_spatial_size;
|
||||
|
||||
/* we have to compute the output value for every combination of (roi, c, y, x) in the output
|
||||
*
|
||||
* the computations involving (y, x) are identical for all non-spatial dimensions
|
||||
* the computation and memory requests involving the roi are identical for the remaining three axes
|
||||
*
|
||||
* we process multiple channels every iteration to reuse the identical computation
|
||||
* and memory requests involved with the roi and spatial dimensions
|
||||
*/
|
||||
/*
|
||||
* if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need
|
||||
* (num_channels / CHANNELS_PER_ITER) iterations per (roi, x, y)
|
||||
*/
|
||||
auto num_channel_iters_per_roi_xy = num_channels / CHANNELS_PER_ITER;
|
||||
|
||||
/* we need `num_channel_iters_per_roi_xy` iterations per (roi, x, y) and there are
|
||||
* `num_rois` rois and `out_spatial_size` combinations of (x, y)
|
||||
*/
|
||||
auto iters_per_roi = num_channel_iters_per_roi_xy * out_spatial_size;
|
||||
auto iters_required = num_rois * iters_per_roi;
|
||||
|
||||
for (auto iter : grid_stride_range(iters_required))
|
||||
{
|
||||
const index_type roi_no = iter / iters_per_roi;
|
||||
const index_type c_start = ((iter % iters_per_roi) / out_spatial_size) * CHANNELS_PER_ITER;
|
||||
|
||||
/* note here that consecutive `iter` values will often have consecutive `x` values
|
||||
* => stores into output will be coalesced across threads
|
||||
*/
|
||||
const index_type y = (iter % out_spatial_size) / pooled_width;
|
||||
const index_type x = iter % pooled_width;
|
||||
|
||||
const index_type roi_offset = roi_no * 5;
|
||||
|
||||
using device::round;
|
||||
const index_type batch_id = rois[roi_offset + 0];
|
||||
const index_type x_start_roi = round(static_cast<float>(rois[roi_offset + 1]) * spatial_scale);
|
||||
const index_type y_start_roi = round(static_cast<float>(rois[roi_offset + 2]) * spatial_scale);
|
||||
const index_type x_end_roi = round(static_cast<float>(rois[roi_offset + 3]) * spatial_scale);
|
||||
const index_type y_end_roi = round(static_cast<float>(rois[roi_offset + 4]) * spatial_scale);
|
||||
|
||||
using device::max;
|
||||
const auto roi_width = max<index_type>(x_end_roi - x_start_roi + 1, 1);
|
||||
const auto roi_height = max<index_type>(y_end_roi - y_start_roi + 1, 1);
|
||||
|
||||
const auto roi_width_ratio = static_cast<float>(roi_width) / pooled_width;
|
||||
const auto roi_height_ratio = static_cast<float>(roi_height) / pooled_height;
|
||||
|
||||
auto x_start = x_start_roi + static_cast<index_type>(x * roi_width_ratio);
|
||||
auto y_start = y_start_roi + static_cast<index_type>(y * roi_height_ratio);
|
||||
|
||||
using device::ceil;
|
||||
auto x_end = x_start_roi + static_cast<index_type>(ceil((x + 1) * roi_width_ratio));
|
||||
auto y_end = y_start_roi + static_cast<index_type>(ceil((y + 1) * roi_height_ratio));
|
||||
|
||||
using device::max;
|
||||
x_start = max<index_type>(x_start, 0);
|
||||
y_start = max<index_type>(y_start, 0);
|
||||
|
||||
using device::min;
|
||||
x_end = min<index_type>(x_end, in_width);
|
||||
y_end = min<index_type>(y_end, in_height);
|
||||
|
||||
index_type in_offset = (batch_id * num_channels + c_start) * in_height * in_width;
|
||||
index_type out_idx = roi_no * out_roi_size + c_start * out_spatial_size + y * pooled_width + x;
|
||||
|
||||
for (int i = 0; i < CHANNELS_PER_ITER; i++)
|
||||
{
|
||||
/* We have to set the output to zero if (x_start >= x_end) or (y_start >= y_end). If either
|
||||
* condition is true, the loops below won't execute even a single iteration. Hence, by setting
|
||||
* `max_val` to zero in this case, we can combine it with the `else` code.
|
||||
*/
|
||||
T max_val = (x_start >= x_end || y_start >= y_end) ? T(0) : device::numeric_limits<T>::lowest();
|
||||
|
||||
for (auto iy = y_start; iy < y_end; iy++)
|
||||
{
|
||||
const auto in_idx = in_offset + iy * in_width;
|
||||
for (auto ix = x_start; ix < x_end; ix++)
|
||||
{
|
||||
max_val = max(max_val, load_ldg(input[in_idx + ix]));
|
||||
}
|
||||
}
|
||||
|
||||
output[out_idx] = max_val;
|
||||
|
||||
in_offset += in_image_size;
|
||||
out_idx += out_spatial_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t CHANNELS_PER_ITER> static
|
||||
void launch_multichannel_roi_pooling(const Stream& stream,
|
||||
Span<T> output, size_type pooled_height, size_type pooled_width,
|
||||
View<T> input, size_type in_height, size_type in_width,
|
||||
View<T> rois, size_type num_channels, float spatial_scale)
|
||||
{
|
||||
auto kernel = raw::roi_pooling<T, CHANNELS_PER_ITER>;
|
||||
auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
|
||||
launch_kernel(kernel, policy, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void roi_pooling(const Stream& stream, TensorSpan<T> output, TensorView<T> input, View<T> rois, float spatial_scale)
|
||||
{
|
||||
CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
|
||||
|
||||
size_type num_channels = output.get_axis_size(1);
|
||||
|
||||
size_type pooled_height = output.get_axis_size(2);
|
||||
size_type pooled_width = output.get_axis_size(3);
|
||||
|
||||
size_type in_height = input.get_axis_size(2);
|
||||
size_type in_width = input.get_axis_size(3);
|
||||
|
||||
if (num_channels % 64 == 0) {
|
||||
launch_multichannel_roi_pooling<T, 64>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
|
||||
} else if (num_channels % 32 == 0) {
|
||||
launch_multichannel_roi_pooling<T, 32>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
|
||||
} else if (num_channels % 16 == 0) {
|
||||
launch_multichannel_roi_pooling<T, 16>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
|
||||
} else if (num_channels % 8 == 0) {
|
||||
launch_multichannel_roi_pooling<T, 8>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
|
||||
} else if (num_channels % 4 == 0) {
|
||||
launch_multichannel_roi_pooling<T, 4>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
|
||||
} else if (num_channels % 2 == 0) {
|
||||
launch_multichannel_roi_pooling<T, 2>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
|
||||
} else {
|
||||
launch_multichannel_roi_pooling<T, 1>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void roi_pooling(const Stream& stream, TensorSpan<__half> output, TensorView<__half> input, View<__half> rois, float spatial_scale);
|
||||
#endif
|
||||
template void roi_pooling(const Stream& stream, TensorSpan<float> output, TensorView<float> input, View<float> rois, float spatial_scale);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
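Each roi_pooling output element is the maximum of the input values inside one bin of the scaled ROI, with empty bins producing zero. A compact host-side equivalent for one bin of one channel (a sketch under the assumption of a flat row-major channel buffer):

#include <algorithm>
#include <limits>

// Max over the input window [y_start, y_end) x [x_start, x_end); returns 0 for
// an empty window, mirroring the kernel's handling of degenerate bins.
static float roi_bin_max(const float* channel, int in_width,
                         int x_start, int x_end, int y_start, int y_end)
{
    if (x_start >= x_end || y_start >= y_end)
        return 0.0f;
    float max_val = std::numeric_limits<float>::lowest();
    for (int iy = y_start; iy < y_end; iy++)
        for (int ix = x_start; ix < x_end; ix++)
            max_val = std::max(max_val, channel[iy * in_width + ix]);
    return max_val;
}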
|
235
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/scale_shift.cu
vendored
Normal file
@ -0,0 +1,235 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "types.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, std::size_t N>
|
||||
__global__ void biasN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> bias) {
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
auto input_vPtr = vector_type::get_pointer(input.data());
|
||||
|
||||
inner_size /= vector_type::size();
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
|
||||
const index_type bias_idx = (i / inner_size) % bias.size();
|
||||
|
||||
vector_type vec;
|
||||
v_load(vec, input_vPtr[i]);
|
||||
for (int j = 0; j < vec.size(); j++)
|
||||
vec.data[j] = vec.data[j] + bias[bias_idx];
|
||||
v_store(output_vPtr[i], vec);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t N>
|
||||
__global__ void scaleN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> weights)
|
||||
{
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
auto input_vPtr = vector_type::get_pointer(input.data());
|
||||
|
||||
inner_size /= vector_type::size();
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
|
||||
const index_type scale_idx = (i / inner_size) % weights.size();
|
||||
|
||||
vector_type vec;
|
||||
v_load(vec, input_vPtr[i]);
|
||||
for (int j = 0; j < vec.size(); j++)
|
||||
vec.data[j] = vec.data[j] * weights[scale_idx];
|
||||
v_store(output_vPtr[i], vec);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t N>
|
||||
__global__ void scale1_with_bias1_vec(Span<T> output, View<T> input, T alpha, T beta)
|
||||
{
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
auto input_vPtr = vector_type::get_pointer(input.data());
|
||||
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
|
||||
vector_type vec;
|
||||
v_load(vec, input_vPtr[i]);
|
||||
for (int j = 0; j < vec.size(); j++)
|
||||
vec.data[j] = alpha * vec.data[j] + beta;
|
||||
v_store(output_vPtr[i], vec);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t N>
|
||||
__global__ void scaleN_with_biasN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> weights, View<T> bias)
|
||||
{
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
auto input_vPtr = vector_type::get_pointer(input.data());
|
||||
|
||||
inner_size /= vector_type::size();
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
|
||||
const index_type scale_idx = (i / inner_size) % weights.size();
|
||||
|
||||
vector_type vec;
|
||||
v_load(vec, input_vPtr[i]);
|
||||
for (int j = 0; j < vec.size(); j++)
|
||||
vec.data[j] = vec.data[j] * weights[scale_idx] + bias[scale_idx];
|
||||
v_store(output_vPtr[i], vec);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t N> static
|
||||
void launch_biasN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> bias){
|
||||
CV_Assert(is_fully_aligned<T>(output, N));
|
||||
CV_Assert(is_fully_aligned<T>(input, N));
|
||||
CV_Assert(inner_size % N == 0);
|
||||
|
||||
auto kernel = raw::biasN_vec<T, N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, input, inner_size, bias);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void biasN(
|
||||
const Stream& stream,
|
||||
TensorSpan<T> output,
|
||||
TensorView<T> input, std::size_t inner_size,
|
||||
TensorView<T> bias)
|
||||
{
|
||||
CV_Assert(is_shape_same(input, output));
|
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
|
||||
launch_biasN_vec_kernel<T, 4>(stream, output, input, inner_size, bias);
|
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
|
||||
launch_biasN_vec_kernel<T, 2>(stream, output, input, inner_size, bias);
|
||||
} else {
|
||||
launch_biasN_vec_kernel<T, 1>(stream, output, input, inner_size, bias);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void biasN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>);
|
||||
#endif
|
||||
template void biasN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>);
|
||||
|
||||
template <class T, std::size_t N> static
|
||||
void launch_scaleN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> weights) {
|
||||
CV_Assert(is_fully_aligned<T>(output, N));
|
||||
CV_Assert(is_fully_aligned<T>(input, N));
|
||||
CV_Assert(inner_size % N == 0);
|
||||
|
||||
auto kernel = raw::scaleN_vec<T, N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, input, inner_size, weights);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void scaleN(
|
||||
const Stream& stream,
|
||||
TensorSpan<T> output,
|
||||
TensorView<T> input, std::size_t inner_size,
|
||||
TensorView<T> weights)
|
||||
{
|
||||
CV_Assert(is_shape_same(input, output));
|
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
|
||||
launch_scaleN_vec_kernel<T, 4>(stream, output, input, inner_size, weights);
|
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
|
||||
launch_scaleN_vec_kernel<T, 2>(stream, output, input, inner_size, weights);
|
||||
} else {
|
||||
launch_scaleN_vec_kernel<T, 1>(stream, output, input, inner_size, weights);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void scaleN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>);
|
||||
#endif
|
||||
template void scaleN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>);
|
||||
|
||||
template <class T, std::size_t N> static
|
||||
void launch_scale1_with_bias1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T alpha, T beta) {
|
||||
CV_Assert(is_fully_aligned<T>(output, N));
|
||||
CV_Assert(is_fully_aligned<T>(input, N));
|
||||
|
||||
auto kernel = raw::scale1_with_bias1_vec<T, N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, input, alpha, beta);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void scale1_with_bias1(const Stream& stream, Span<T> output, View<T> input, T alpha, T beta) {
|
||||
CV_Assert(output.size() == input.size());
|
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
|
||||
launch_scale1_with_bias1_vec_kernel<T, 4>(stream, output, input, alpha, beta);
|
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
|
||||
launch_scale1_with_bias1_vec_kernel<T, 2>(stream, output, input, alpha, beta);
|
||||
} else {
|
||||
launch_scale1_with_bias1_vec_kernel<T, 1>(stream, output, input, alpha, beta);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void scale1_with_bias1<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
|
||||
#endif
|
||||
template void scale1_with_bias1<float>(const Stream&, Span<float>, View<float>, float, float);
|
||||
|
||||
template <class T, std::size_t N> static
|
||||
void launch_scaleN_with_biasN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> weights, View<T> bias) {
|
||||
CV_Assert(is_fully_aligned<T>(output, N));
|
||||
CV_Assert(is_fully_aligned<T>(input, N));
|
||||
CV_Assert(inner_size % N == 0);
|
||||
|
||||
auto kernel = raw::scaleN_with_biasN_vec<T, N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, input, inner_size, weights, bias);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void scaleN_with_biasN(
|
||||
const Stream& stream,
|
||||
TensorSpan<T> output,
|
||||
TensorView<T> input, std::size_t inner_size,
|
||||
TensorView<T> weights, TensorView<T> bias)
|
||||
{
|
||||
CV_Assert(is_shape_same(input, output));
|
||||
CV_Assert(weights.size() == bias.size());
|
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
|
||||
launch_scaleN_with_biasN_vec_kernel<T, 4>(stream, output, input, inner_size, weights, bias);
|
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
|
||||
launch_scaleN_with_biasN_vec_kernel<T, 2>(stream, output, input, inner_size, weights, bias);
|
||||
} else {
|
||||
launch_scaleN_with_biasN_vec_kernel<T, 1>(stream, output, input, inner_size, weights, bias);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void scaleN_with_biasN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>, TensorView<__half>);
|
||||
#endif
|
||||
template void scaleN_with_biasN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>, TensorView<float>);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
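All of the per-channel kernels above derive the channel index of a flat element as (i / inner_size) % num_channels and then broadcast the corresponding weight and bias. A host-side sketch of scaleN_with_biasN using exactly that indexing (plain C++, illustrative names):

#include <cstddef>
#include <vector>

// output[i] = input[i] * weights[c] + bias[c], where c is derived from the flat
// index exactly as in the CUDA kernel: c = (i / inner_size) % weights.size().
static void scale_with_bias_ref(std::vector<float>& output,
                                const std::vector<float>& input,
                                std::size_t inner_size,
                                const std::vector<float>& weights,
                                const std::vector<float>& bias)
{
    output.resize(input.size());
    for (std::size_t i = 0; i < input.size(); i++) {
        const std::size_t c = (i / inner_size) % weights.size();
        output[i] = input[i] * weights[c] + bias[c];
    }
}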
|
111
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/shortcut.cu
vendored
Normal file
@ -0,0 +1,111 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "vector_traits.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, std::size_t N>
|
||||
__global__ void input_shortcut_vec(
|
||||
Span<T> output,
|
||||
View<T> input, index_type c_input, /* `c_input` = number of channels in `input` */
|
||||
View<T> from, index_type c_from, /* `c_from` = number of channels in `from` */
|
||||
size_type channel_stride /* common for both `input` and `from` */)
|
||||
{
|
||||
using vector_type = get_vector_type_t<T, N>;
|
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data());
|
||||
auto input_vPtr = vector_type::get_pointer(input.data());
|
||||
auto from_vPtr = vector_type::get_pointer(from.data());
|
||||
|
||||
auto batch_stride_input = c_input * channel_stride;
|
||||
auto batch_stride_from = c_from * channel_stride;
|
||||
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
|
||||
const auto actual_idx = i * vector_type::size();
|
||||
const auto b = actual_idx / batch_stride_input; /* `input` and `output` have the same shape */
|
||||
const auto c = (actual_idx % batch_stride_input) / channel_stride;
|
||||
const auto c_offset = actual_idx % channel_stride;
|
||||
|
||||
vector_type vec_input;
|
||||
v_load(vec_input, input_vPtr[i]);
|
||||
|
||||
/* We can break down the shortcut operation into two steps:
|
||||
* - copy `input` to `output`
|
||||
* - add `from` to corresponding channels in `output`
|
||||
*
|
||||
* In this scheme, only some channels in the `output` differ from `input`. They differ in the channels
|
||||
* which have a corresponding channel in `from`.
|
||||
*/
|
||||
if (c < c_from) {
|
||||
const auto from_actual_idx = b * batch_stride_from + c * channel_stride + c_offset;
|
||||
const auto from_vec_idx = from_actual_idx / vector_type::size();
|
||||
|
||||
vector_type vec_from;
|
||||
v_load(vec_from, from_vPtr[from_vec_idx]);
|
||||
for (int j = 0; j < vector_type::size(); j++)
|
||||
vec_input.data[j] += vec_from.data[j];
|
||||
}
|
||||
|
||||
v_store(output_vPtr[i], vec_input);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t N>
|
||||
void launch_vectorized_input_shortcut(const Stream& stream, Span<T> output, View<T> input, std::size_t c_input, View<T> from, std::size_t c_from, std::size_t channel_stride) {
|
||||
CV_Assert(is_fully_aligned<T>(output, N));
|
||||
CV_Assert(is_fully_aligned<T>(input, N));
|
||||
CV_Assert(is_fully_aligned<T>(from, N));
|
||||
CV_Assert(channel_stride % N == 0);
|
||||
|
||||
auto kernel = raw::input_shortcut_vec<T, N>;
|
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream);
|
||||
launch_kernel(kernel, policy, output, input, c_input, from, c_from, channel_stride);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void input_shortcut(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, csl::TensorView<T> from) {
|
||||
CV_Assert(is_shape_same(output, input));
|
||||
CV_Assert(output.rank() == from.rank());
|
||||
for (int i = 0; i < output.rank(); i++) {
|
||||
if (i != 1) {
|
||||
CV_Assert(from.get_axis_size(i) == output.get_axis_size(i));
|
||||
}
|
||||
}
|
||||
|
||||
auto channel_stride = output.size_range(2, output.rank()); /* same for `output`, `input` and `from` */
|
||||
auto c_input = input.get_axis_size(1);
|
||||
auto c_from = from.get_axis_size(1);
|
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && is_fully_aligned<T>(from, 4) && channel_stride % 4 == 0) {
|
||||
launch_vectorized_input_shortcut<T, 4>(stream, output, input, c_input, from, c_from, channel_stride);
|
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && is_fully_aligned<T>(from, 2) && channel_stride % 2 == 0) {
|
||||
launch_vectorized_input_shortcut<T, 2>(stream, output, input, c_input, from, c_from, channel_stride);
|
||||
} else {
|
||||
launch_vectorized_input_shortcut<T, 1>(stream, output, input, c_input, from, c_from, channel_stride);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void input_shortcut(const Stream&, TensorSpan<__half>, TensorView<__half>, TensorView<__half>);
|
||||
#endif
|
||||
template void input_shortcut(const Stream&, TensorSpan<float>, TensorView<float>, TensorView<float>);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
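Conceptually, the shortcut above copies input to output and then adds from to the channels the two tensors have in common. A host-side sketch over flat NCHW buffers (the function name and explicit loop structure are illustrative):

#include <algorithm>
#include <cstddef>
#include <vector>

// output has the shape of input; only the first min(c_input, c_from) channels
// receive the extra contribution from `from`.
static void input_shortcut_ref(std::vector<float>& output,
                               const std::vector<float>& input,
                               const std::vector<float>& from,
                               std::size_t batch, std::size_t c_input,
                               std::size_t c_from, std::size_t channel_stride)
{
    output = input;
    const std::size_t c_common = std::min(c_input, c_from);
    for (std::size_t b = 0; b < batch; b++)
        for (std::size_t c = 0; c < c_common; c++)
            for (std::size_t k = 0; k < channel_stride; k++)
                output[(b * c_input + c) * channel_stride + k] +=
                    from[(b * c_from + c) * channel_stride + k];
}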
|
203
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/slice.cu
vendored
Normal file
@ -0,0 +1,203 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "array.hpp"
|
||||
#include "types.hpp"
|
||||
#include "grid_stride_range.hpp"
|
||||
#include "execution.hpp"
|
||||
#include "kernel_dispatcher.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp"
|
||||
#include "../cuda4dnn/csl/tensor.hpp"
|
||||
#include "../cuda4dnn/csl/span.hpp"
|
||||
|
||||
#include "../cuda4dnn/kernels/fill_copy.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl;
|
||||
using namespace cv::dnn::cuda4dnn::csl::device;
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
namespace raw {
|
||||
template <class T, std::size_t Rank>
|
||||
__global__ void slice(
|
||||
Span<T> output, array<size_type, Rank> out_strides,
|
||||
View<T> input, array<size_type, Rank> in_strides, array<index_type, Rank> in_offset)
|
||||
{
|
||||
for (auto i : grid_stride_range(output.size())) {
|
||||
index_type out_index = i / out_strides[0];
|
||||
index_type in_index = in_offset[0] + out_index;
|
||||
index_type iidx = in_index * in_strides[0];
|
||||
for (int j = 1; j < Rank; j++) {
|
||||
out_index = (i % out_strides[j - 1]) / out_strides[j];
|
||||
in_index = in_offset[j] + out_index;
|
||||
iidx += in_index * in_strides[j];
|
||||
}
|
||||
|
||||
output[i] = input[iidx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, std::size_t Rank> static
|
||||
void launch_slice(
|
||||
const Stream& stream,
|
||||
Span<T> output, const std::vector<std::size_t>& outStride,
|
||||
View<T> input, const std::vector<std::size_t>& inStride, const std::vector<std::size_t>& inOffset)
|
||||
{
|
||||
CV_Assert(outStride.size() == Rank);
|
||||
CV_Assert(inStride.size() == Rank);
|
||||
CV_Assert(inOffset.size() == Rank);
|
||||
|
||||
array<size_type, Rank> outStride_k, inStride_k;
|
||||
outStride_k.assign(std::begin(outStride), std::end(outStride));
|
||||
inStride_k.assign(std::begin(inStride), std::end(inStride));
|
||||
|
||||
array<index_type, Rank> inOffset_k;
|
||||
inOffset_k.assign(std::begin(inOffset), std::end(inOffset));
|
||||
|
||||
auto kernel = raw::slice<T, Rank>;
|
||||
auto policy = make_policy(kernel, output.size(), 0, stream);
|
||||
launch_kernel(kernel, policy, output, outStride_k, input, inStride_k, inOffset_k);
|
||||
}
|
||||
|
||||
GENERATE_KERNEL_DISPATCHER(slice_dispatcher, launch_slice);
|
||||
|
||||
template <class T>
|
||||
void slice(const Stream& stream,
|
||||
TensorSpan<T> output, TensorView<T> input,
|
||||
std::vector<std::size_t> offsets)
|
||||
{
|
||||
CV_Assert(output.rank() == input.rank());
|
||||
CV_Assert(output.rank() == offsets.size());
|
||||
|
||||
/* copy directly if no slicing is required */
|
||||
if (is_shape_same(output, input))
|
||||
{
|
||||
CV_Assert(std::all_of(std::begin(offsets), std::end(offsets), [] (std::size_t x) { return x == 0; }));
|
||||
kernels::copy<T>(stream, output, input);
|
||||
return;
|
||||
}
|
||||
|
||||
/* squeezable axes at the beginning of both tensors can be eliminated
|
||||
*
|
||||
* Reasoning:
|
||||
* ----------
|
||||
* Suppose an item's indices in the output tensor is [o1, o2, ...]. The indices in the input
|
||||
* tensor will be [o1 + off1, o2 + off2, ...]. The rest of the elements in the input are ignored.
|
||||
*
|
||||
* If the size of the first axis of the input and output tensor is unity, the input and output indices
|
||||
* for all the elements will be of the form be [0, o2 + off2, ...] and [0, o2, ...] respectively. Note that
|
||||
* there cannot be any ignored items since the axes have unit size. The first index does not contribute to the
|
||||
* element's address calculation and hence does nothing apart from eating up few cycles.
|
||||
*/
|
||||
while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
|
||||
CV_Assert(offsets[0] == 0);
|
||||
|
||||
input.squeeze(0);
|
||||
output.squeeze(0);
|
||||
offsets.erase(std::begin(offsets));
|
||||
|
||||
CV_Assert(output.rank() == input.rank());
|
||||
CV_Assert(output.rank() == offsets.size());
|
||||
}
|
||||
|
||||
auto inShape = input.shape_as_vector();
|
||||
auto outShape = output.shape_as_vector();
|
||||
|
||||
/* contiguous axes which do not undergo slicing can be combined into one axis
|
||||
*
|
||||
* Reasoning:
|
||||
* ----------
|
||||
* Suppose an item's indices in the output tensor is [o1, o2, o3, ...]. Let the first two axes not undergo any
|
||||
* slicing. The indices in the input tensor will be [o1, o2, o3 + off3, ...].
|
||||
*
|
||||
* Each axis in the contiguous unsliced axes sequence will add an offset of iN * strideN. In the above example,
|
||||
* the two axes add a total offset of `o1 * stride1 + o2 * stride2`. We can merge the two axes into one axis with
|
||||
* a size of `size1 * size2`. The new offset added will be `o12 * stride2` as the kernel iterates through `o12`.
|
||||
* Note that `o12` is actually `(o1 * size2 + o2)` in the original tensor.
|
||||
*/
|
||||
for (int i = 0; i < inShape.size(); i++) {
|
||||
/* check if axis `i` requires any slicing */
|
||||
if (offsets[i] == 0 && inShape[i] == outShape[i]) {
|
||||
/* loop invariant: `i` is the first axis in the contiguous unsliced axis sequence */
|
||||
|
||||
int j = i + 1; /* `j` is the axis which we will attempt to merge */
|
||||
while (j < inShape.size() && offsets[j] == 0 && inShape[j] == outShape[j]) {
|
||||
/* `j` axis is also unsliced; merge `i` and `j` */
|
||||
auto new_size = inShape[i] * inShape[j];
|
||||
inShape[i] = new_size;
|
||||
outShape[i] = new_size;
|
||||
offsets[i] = 0; /* redundant */
|
||||
|
||||
/* delete axis `j` */
|
||||
inShape.erase(std::begin(inShape) + j);
|
||||
outShape.erase(std::begin(outShape) + j);
|
||||
offsets.erase(std::begin(offsets) + j);
|
||||
|
||||
/* optimizations should not break the invariants */
|
||||
CV_Assert(inShape.size() == outShape.size());
|
||||
CV_Assert(inShape.size() == offsets.size());
|
||||
CV_Assert(inShape[i] == outShape[i]);
|
||||
CV_Assert(offsets[i] == 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto rank = inShape.size();
|
||||
|
||||
/* We can do a copy if the reduced rank is two and only the first axis is sliced.
|
||||
* The general requirement is that only one axis is sliced and all the axes that
|
||||
* precede the sliced axis are singleton. However, the reductions above will remove
|
||||
* all the leading singleton axes and merge the trailing unsliced axes into one, or
|
||||
* zero if there are no trailing unsliced axes. The latter is handled separately.
|
||||
*/
|
||||
if (rank == 2 && offsets[0] != 0 && offsets[1] == 0)
|
||||
{
|
||||
auto stride = inShape[1];
|
||||
auto sliced_input = View<T>(input.get() + offsets[0] * stride, output.size());
|
||||
kernels::copy<T>(stream, output, sliced_input);
|
||||
return;
|
||||
}
|
||||
|
||||
if (rank == 1)
|
||||
{
|
||||
auto sliced_input = View<T>(input.get() + offsets[0], output.size());
|
||||
kernels::copy<T>(stream, output, sliced_input);
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<std::size_t> inStride(rank), outStride(rank);
|
||||
inStride.back() = 1;
|
||||
outStride.back() = 1;
|
||||
/* garbage, ..., garbage, 1 */
|
||||
|
||||
std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
|
||||
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
|
||||
/* dim[1], dim[2], ..., dim[-1], 1 */
|
||||
|
||||
std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>());
|
||||
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
|
||||
/* stride[0], stride[1], ..., stride[-2], 1 */
|
||||
|
||||
CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
|
||||
slice_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, input, inStride, offsets);
|
||||
}
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
||||
template void slice(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
|
||||
#endif
|
||||
template void slice(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
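The stride preparation above builds row-major strides as a reverse running product of the shape via std::partial_sum. The same computation as a standalone host-side helper (a sketch; the function name is an assumption):

#include <algorithm>
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

// Row-major strides: stride[last] = 1, stride[i] = shape[i + 1] * stride[i + 1].
static std::vector<std::size_t> row_major_strides(const std::vector<std::size_t>& shape)
{
    if (shape.empty())
        return {};
    std::vector<std::size_t> stride(shape.size());
    stride.back() = 1;
    std::copy(shape.begin() + 1, shape.end(), stride.begin());
    std::partial_sum(stride.rbegin(), stride.rend(), stride.rbegin(),
                     std::multiplies<std::size_t>());
    return stride;
}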
|
27
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/types.hpp
vendored
Normal file
@ -0,0 +1,27 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_TYPES_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA_TYPES_HPP
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
|
||||
|
||||
/* For indices, we can use 32-bit or 64-bit variables. The GPU registers are 32 bits in size.
|
||||
* Hence, a 64-bit variable requires two registers and is significantly slower than its 32-bit counterpart.
|
||||
*
|
||||
* If we do not need to handle huge tensors, we can use 32-bit indices and get better performance.
|
||||
*/
|
||||
#ifdef __CUDACC__
|
||||
using size_type = int;
|
||||
using index_type = int;
|
||||
#else
|
||||
using size_type = std::int32_t;
|
||||
using index_type = std::int32_t;
|
||||
#endif
|
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_TYPES_HPP */
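Because host code usually carries sizes as std::size_t while the device code above uses 32-bit indices, values are narrowed at the kernel boundary. A small sketch of a checked narrowing helper one might use there (the helper name is hypothetical):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <limits>

// Narrow a host-side std::size_t to the 32-bit device index type, asserting
// that the value actually fits.
static std::int32_t to_device_index(std::size_t value)
{
    assert(value <= static_cast<std::size_t>(std::numeric_limits<std::int32_t>::max()));
    return static_cast<std::int32_t>(value);
}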
|
120
3rdparty/opencv-4.5.4/modules/dnn/src/cuda/vector_traits.hpp
vendored
Normal file
@ -0,0 +1,120 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include "types.hpp"
|
||||
#include "memory.hpp"
|
||||
|
||||
#include "../cuda4dnn/csl/pointer.hpp"
|
||||
|
||||
#include <type_traits>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
|
||||
|
||||
/** \file vector_traits.hpp
|
||||
* \brief utility classes and functions for vectorized memory loads/stores
|
||||
*
|
||||
* Example:
|
||||
* using vector_type = get_vector_type_t<float, 4>;
|
||||
*
|
||||
* auto input_vPtr = type::get_pointer(iptr); // iptr is of type DevicePtr<const float>
|
||||
* auto output_vPtr = type::get_pointer(optr); // optr is of type DevicePtr<float>
|
||||
*
|
||||
* vector_type vec;
|
||||
* v_load(vec, input_vPtr);
|
||||
*
|
||||
* for(int i = 0; i < vector_type::size(); i++)
|
||||
* vec[i] = do_something(vec[i]);
|
||||
*
|
||||
* v_store(output_vPtr, vec);
|
||||
*/
|
||||
|
||||
namespace detail {
|
||||
template <size_type N> struct raw_type_ { };
|
||||
template <> struct raw_type_<256> { typedef ulonglong4 type; };
|
||||
template <> struct raw_type_<128> { typedef uint4 type; };
|
||||
template <> struct raw_type_<64> { typedef uint2 type; };
|
||||
template <> struct raw_type_<32> { typedef uint1 type; };
|
||||
template <> struct raw_type_<16> { typedef uchar2 type; };
|
||||
template <> struct raw_type_<8> { typedef uchar1 type; };
|
||||
|
||||
template <size_type N> struct raw_type {
|
||||
using type = typename raw_type_<N>::type;
|
||||
static_assert(sizeof(type) * 8 == N, "");
|
||||
};
|
||||
}
|
||||
|
||||
/* \tparam T type of element in the vector
|
||||
* \tparam N "number of elements" of type T in the vector
|
||||
*/
|
||||
template <class T, size_type N>
|
||||
union vector_type {
|
||||
using value_type = T;
|
||||
using raw_type = typename detail::raw_type<N * sizeof(T) * 8>::type;
|
||||
|
||||
__device__ vector_type() { }
|
||||
|
||||
__device__ static constexpr size_type size() { return N; }
|
||||
|
||||
raw_type raw;
|
||||
T data[N];
|
||||
|
||||
template <class U> static __device__
|
||||
typename std::enable_if<std::is_const<U>::value, const vector_type*>
|
||||
::type get_pointer(csl::DevicePtr<U> ptr) {
|
||||
return reinterpret_cast<const vector_type*>(ptr.get());
|
||||
}
|
||||
|
||||
template <class U> static __device__
|
||||
typename std::enable_if<!std::is_const<U>::value, vector_type*>
|
||||
::type get_pointer(csl::DevicePtr<U> ptr) {
|
||||
return reinterpret_cast<vector_type*>(ptr.get());
|
||||
}
|
||||
};
|
||||
|
||||
template <class V>
|
||||
__device__ void v_load(V& dest, const V& src) {
|
||||
dest.raw = src.raw;
|
||||
}
|
||||
|
||||
template <class V>
|
||||
__device__ void v_load(V& dest, const V* src) {
|
||||
dest.raw = src->raw;
|
||||
}
|
||||
|
||||
template <class V>
|
||||
__device__ void v_load_ldg(V& dest, const V& src) {
|
||||
dest.raw = load_ldg(src.raw);
|
||||
}
|
||||
|
||||
template <class V>
|
||||
__device__ void v_load_ldg(V& dest, const V* src) {
|
||||
dest.raw = load_ldg(src->raw);
|
||||
}
|
||||
|
||||
template <class V>
|
||||
__device__ void v_store(V* dest, const V& src) {
|
||||
dest->raw = src.raw;
|
||||
}
|
||||
|
||||
template <class V>
|
||||
__device__ void v_store(V& dest, const V& src) {
|
||||
dest.raw = src.raw;
|
||||
}
|
||||
|
||||
template <class T, size_type N>
|
||||
struct get_vector_type {
|
||||
typedef vector_type<T, N> type;
|
||||
};
|
||||
|
||||
template <class T, size_type N>
|
||||
using get_vector_type_t = typename get_vector_type<T, N>::type;
|
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP */
|
368
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cublas.hpp
vendored
Normal file
@ -0,0 +1,368 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP
|
||||
|
||||
#include "error.hpp"
|
||||
#include "stream.hpp"
|
||||
#include "pointer.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cublas_v2.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#define CUDA4DNN_CHECK_CUBLAS(call) \
|
||||
::cv::dnn::cuda4dnn::csl::cublas::detail::check((call), CV_Func, __FILE__, __LINE__)
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cublas {
|
||||
|
||||
/** @brief exception class for errors thrown by the cuBLAS API */
|
||||
class cuBLASException : public CUDAException {
|
||||
public:
|
||||
using CUDAException::CUDAException;
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
static void check(cublasStatus_t status, const char* func, const char* file, int line) {
|
||||
auto cublasGetErrorString = [](cublasStatus_t err) {
|
||||
switch (err) {
|
||||
case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
|
||||
case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
|
||||
case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
|
||||
case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
|
||||
case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
|
||||
case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
|
||||
case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
|
||||
case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
|
||||
case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
|
||||
case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";
|
||||
}
|
||||
return "UNKNOWN_CUBLAS_ERROR";
|
||||
};
|
||||
|
||||
if (status != CUBLAS_STATUS_SUCCESS)
|
||||
throw cuBLASException(Error::GpuApiCallError, cublasGetErrorString(status), func, file, line);
|
||||
}
|
||||
}
|
||||
|
||||
/** non-copyable cuBLAS smart handle
|
||||
*
|
||||
* UniqueHandle is a smart non-sharable wrapper for cuBLAS handle which ensures that the handle
|
||||
* is destroyed after use. The handle must always be associated with a non-default stream. The stream
|
||||
* must be specified during construction.
|
||||
*
|
||||
* Refer to stream API for more information for the choice of forcing non-default streams.
|
||||
*/
|
||||
class UniqueHandle {
|
||||
public:
|
||||
UniqueHandle() noexcept : handle{ nullptr } { }
|
||||
UniqueHandle(UniqueHandle&) = delete;
|
||||
UniqueHandle(UniqueHandle&& other) noexcept {
|
||||
stream = std::move(other.stream);
|
||||
handle = other.handle;
|
||||
other.handle = nullptr;
|
||||
}
|
||||
|
||||
/** creates a cuBLAS handle and associates it with the stream specified
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
UniqueHandle(Stream strm) : stream(std::move(strm)) {
|
||||
CV_Assert(stream);
|
||||
CUDA4DNN_CHECK_CUBLAS(cublasCreate(&handle));
|
||||
try {
|
||||
CUDA4DNN_CHECK_CUBLAS(cublasSetStream(handle, stream.get()));
|
||||
} catch (...) {
|
||||
/* cublasDestroy won't throw if a valid handle is passed */
|
||||
CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle));
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
~UniqueHandle() noexcept {
|
||||
if (handle) {
|
||||
/* cublasDestroy won't throw if a valid handle is passed */
|
||||
CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle));
|
||||
}
|
||||
}
|
||||
|
||||
UniqueHandle& operator=(const UniqueHandle&) = delete;
|
||||
UniqueHandle& operator=(UniqueHandle&& other) noexcept {
|
||||
CV_Assert(other);
|
||||
if (&other != this) {
|
||||
UniqueHandle(std::move(*this)); /* destroy current handle */
|
||||
stream = std::move(other.stream);
|
||||
handle = other.handle;
|
||||
other.handle = nullptr;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
/** returns the raw cuBLAS handle */
|
||||
cublasHandle_t get() const noexcept {
|
||||
CV_Assert(handle);
|
||||
return handle;
|
||||
}
|
||||
|
||||
/** returns true if the handle is valid */
|
||||
explicit operator bool() const noexcept { return static_cast<bool>(handle); }
|
||||
|
||||
private:
|
||||
Stream stream;
|
||||
cublasHandle_t handle;
|
||||
};
|
||||
|
||||
/** @brief sharable cuBLAS smart handle
|
||||
*
|
||||
* Handle is a smart sharable wrapper for cuBLAS handle which ensures that the handle
|
||||
* is destroyed after all references to the handle are destroyed. The handle must always
|
||||
* be associated with a non-default stream. The stream must be specified during construction.
|
||||
*
|
||||
* @note Moving a Handle object to another invalidates the former
|
||||
*/
|
||||
class Handle {
|
||||
public:
|
||||
Handle() = default;
|
||||
Handle(const Handle&) = default;
|
||||
Handle(Handle&&) = default;
|
||||
|
||||
/** creates a cuBLAS handle and associates it with the stream specified
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
Handle(Stream strm) : handle(std::make_shared<UniqueHandle>(std::move(strm))) { }
|
||||
|
||||
Handle& operator=(const Handle&) = default;
|
||||
Handle& operator=(Handle&&) = default;
|
||||
|
||||
/** returns true if the handle is valid */
|
||||
explicit operator bool() const noexcept { return static_cast<bool>(handle); }
|
||||
|
||||
/** returns the raw cuBLAS handle */
|
||||
cublasHandle_t get() const noexcept {
|
||||
CV_Assert(handle);
|
||||
return handle->get();
|
||||
}
|
||||
|
||||
private:
|
||||
std::shared_ptr<UniqueHandle> handle;
|
||||
};
|
||||
|
||||
* @brief GEMM for column-major matrices
|
||||
*
|
||||
* \f$ C = \alpha AB + \beta C \f$
|
||||
*
|
||||
* @tparam T matrix element type (must be `half` or `float`)
|
||||
*
|
||||
* @param handle valid cuBLAS Handle
|
||||
* @param transa use transposed matrix of A for computation
|
||||
* @param transb use transposed matrix of B for computation
|
||||
* @param rows_c number of rows in C
|
||||
* @param cols_c number of columns in C
|
||||
* @param common_dim common dimension of A (or trans A) and B (or trans B)
|
||||
* @param alpha scale factor for AB
|
||||
* @param[in] A pointer to column-major matrix A in device memory
|
||||
* @param lda leading dimension of matrix A
|
||||
* @param[in] B pointer to column-major matrix B in device memory
|
||||
* @param ldb leading dimension of matrix B
|
||||
* @param beta scale factor for C
|
||||
* @param[in,out] C pointer to column-major matrix C in device memory
|
||||
* @param ldc leading dimension of matrix C
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class T>
|
||||
void gemm(const Handle& handle,
|
||||
bool transa, bool transb,
|
||||
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
|
||||
T alpha, const DevicePtr<const T> A, std::size_t lda,
|
||||
const DevicePtr<const T> B, std::size_t ldb,
|
||||
T beta, const DevicePtr<T> C, std::size_t ldc);
|
||||
|
||||
template <> inline
|
||||
void gemm<half>(const Handle& handle,
|
||||
bool transa, bool transb,
|
||||
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
|
||||
half alpha, const DevicePtr<const half> A, std::size_t lda,
|
||||
const DevicePtr<const half> B, std::size_t ldb,
|
||||
half beta, const DevicePtr<half> C, std::size_t ldc)
|
||||
{
|
||||
CV_Assert(handle);
|
||||
|
||||
auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
|
||||
opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
|
||||
int irows_c = static_cast<int>(rows_c),
|
||||
icols_c = static_cast<int>(cols_c),
|
||||
icommon_dim = static_cast<int>(common_dim),
|
||||
ilda = static_cast<int>(lda),
|
||||
ildb = static_cast<int>(ldb),
|
||||
ildc = static_cast<int>(ldc);
|
||||
|
||||
CUDA4DNN_CHECK_CUBLAS(
|
||||
cublasHgemm(
|
||||
handle.get(),
|
||||
opa, opb,
|
||||
irows_c, icols_c, icommon_dim,
|
||||
&alpha, A.get(), ilda,
|
||||
B.get(), ildb,
|
||||
&beta, C.get(), ildc
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
template <> inline
|
||||
void gemm<float>(const Handle& handle,
|
||||
bool transa, bool transb,
|
||||
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
|
||||
float alpha, const DevicePtr<const float> A, std::size_t lda,
|
||||
const DevicePtr<const float> B, std::size_t ldb,
|
||||
float beta, const DevicePtr<float> C, std::size_t ldc)
|
||||
{
|
||||
CV_Assert(handle);
|
||||
|
||||
auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
|
||||
opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
|
||||
int irows_c = static_cast<int>(rows_c),
|
||||
icols_c = static_cast<int>(cols_c),
|
||||
icommon_dim = static_cast<int>(common_dim),
|
||||
ilda = static_cast<int>(lda),
|
||||
ildb = static_cast<int>(ldb),
|
||||
ildc = static_cast<int>(ldc);
|
||||
|
||||
CUDA4DNN_CHECK_CUBLAS(
|
||||
cublasSgemm(
|
||||
handle.get(),
|
||||
opa, opb,
|
||||
irows_c, icols_c, icommon_dim,
|
||||
&alpha, A.get(), ilda,
|
||||
B.get(), ildb,
|
||||
&beta, C.get(), ildc
|
||||
)
|
||||
);
|
||||
}
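/* Usage sketch (illustrative only; `stream`, `A`, `B` and `C` are assumed to be a
 * valid Stream and device pointers to column-major MxK, KxN and MxN matrices):
 *
 *   Handle handle(stream);
 *   // C = 1.0 * A * B + 0.0 * C
 *   gemm<float>(handle, false, false, M, N, K, 1.0f, A, M, B, K, 0.0f, C, M);
 *
 * The leading dimensions equal the number of rows because the matrices are assumed
 * to be stored without padding.
 */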
|
||||
|
||||
* @brief Strided batched GEMM for column-major matrices
|
||||
*
|
||||
* \f$ C_i = \alpha A_i B_i + \beta C_i \f$ for a stack of matrices A, B and C indexed by i
|
||||
*
|
||||
* @tparam T matrix element type (must be `half` or `float`)
|
||||
*
|
||||
* @param handle valid cuBLAS Handle
|
||||
* @param transa use transposed matrix of A_i for computation
|
||||
* @param transb use transposed matrix of B_i for computation
|
||||
* @param rows_c number of rows in C_i
|
||||
* @param cols_c number of columns in C_i
|
||||
* @param common_dim common dimension of A_i (or trans A_i) and B_i (or trans B_i)
|
||||
* @param alpha scale factor for A_i B_i
|
||||
* @param[in] A pointer to stack of column-major matrices A in device memory
|
||||
* @param lda leading dimension of matrix A_i
|
||||
* @param strideA stride between matrices in A
|
||||
* @param[in] B pointer to stack of column-major matrices B in device memory
|
||||
* @param ldb leading dimension of matrix B_i
|
||||
* @param strideB stride between matrices in B
|
||||
* @param beta scale factor for C_i
|
||||
* @param[in,out] C pointer to stack of column-major matrices C in device memory
|
||||
* @param ldc leading dimension of matrix C_i
|
||||
* @param strideC stride between matrices in C
|
||||
* @param batchCount number of matrices in the batch
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class T>
|
||||
void gemmStridedBatched(const Handle& handle,
|
||||
bool transa, bool transb,
|
||||
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
|
||||
T alpha, const DevicePtr<const T> A, std::size_t lda, std::size_t strideA,
|
||||
const DevicePtr<const T> B, std::size_t ldb, std::size_t strideB,
|
||||
T beta, const DevicePtr<T> C, std::size_t ldc, std::size_t strideC,
|
||||
std::size_t batchCount);
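/* Usage sketch (illustrative only; `handle` is a valid Handle and `A`, `B`, `C` are
 * device pointers to `batch` column-major MxK, KxN and MxN matrices stored back to back):
 *
 *   // C_i = A_i * B_i for i = 0, ..., batch - 1
 *   gemmStridedBatched<float>(handle, false, false, M, N, K,
 *                             1.0f, A, M, M * K,
 *                             B, K, K * N,
 *                             0.0f, C, M, M * N, batch);
 */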
|
||||
|
||||
template <> inline
|
||||
void gemmStridedBatched<half>(const Handle& handle,
|
||||
bool transa, bool transb,
|
||||
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
|
||||
half alpha, const DevicePtr<const half> A, std::size_t lda, std::size_t strideA,
|
||||
const DevicePtr<const half> B, std::size_t ldb, std::size_t strideB,
|
||||
half beta, const DevicePtr<half> C, std::size_t ldc, std::size_t strideC,
|
||||
std::size_t batchCount)
|
||||
{
|
||||
CV_Assert(handle);
|
||||
|
||||
const auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
|
||||
opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
|
||||
const auto irows_c = static_cast<int>(rows_c),
|
||||
icols_c = static_cast<int>(cols_c),
|
||||
icommon_dim = static_cast<int>(common_dim),
|
||||
ilda = static_cast<int>(lda),
|
||||
ildb = static_cast<int>(ldb),
|
||||
ildc = static_cast<int>(ldc);
|
||||
|
||||
const auto batch_count = static_cast<int>(batchCount);
|
||||
const auto stride_a = static_cast<long long int>(strideA),
|
||||
stride_b = static_cast<long long int>(strideB),
|
||||
stride_c = static_cast<long long int>(strideC);
|
||||
|
||||
CV_Assert(stride_c >= irows_c * icols_c); // output matrices must not overlap
|
||||
|
||||
CUDA4DNN_CHECK_CUBLAS(
|
||||
cublasHgemmStridedBatched(
|
||||
handle.get(),
|
||||
opa, opb,
|
||||
irows_c, icols_c, icommon_dim,
|
||||
&alpha, A.get(), ilda, stride_a,
|
||||
B.get(), ildb, stride_b,
|
||||
&beta, C.get(), ildc, stride_c,
|
||||
batch_count
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
template <> inline
|
||||
void gemmStridedBatched<float>(const Handle& handle,
|
||||
bool transa, bool transb,
|
||||
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
|
||||
float alpha, const DevicePtr<const float> A, std::size_t lda, std::size_t strideA,
|
||||
const DevicePtr<const float> B, std::size_t ldb, std::size_t strideB,
|
||||
float beta, const DevicePtr<float> C, std::size_t ldc, std::size_t strideC,
|
||||
std::size_t batchCount)
|
||||
{
|
||||
CV_Assert(handle);
|
||||
|
||||
const auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
|
||||
opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
|
||||
const auto irows_c = static_cast<int>(rows_c),
|
||||
icols_c = static_cast<int>(cols_c),
|
||||
icommon_dim = static_cast<int>(common_dim),
|
||||
ilda = static_cast<int>(lda),
|
||||
ildb = static_cast<int>(ldb),
|
||||
ildc = static_cast<int>(ldc);
|
||||
|
||||
const auto batch_count = static_cast<int>(batchCount);
|
||||
const auto stride_a = static_cast<long long int>(strideA),
|
||||
stride_b = static_cast<long long int>(strideB),
|
||||
stride_c = static_cast<long long int>(strideC);
|
||||
|
||||
CV_Assert(stride_c >= irows_c * icols_c); // output matrices must not overlap
|
||||
|
||||
CUDA4DNN_CHECK_CUBLAS(
|
||||
cublasSgemmStridedBatched(
|
||||
handle.get(),
|
||||
opa, opb,
|
||||
irows_c, icols_c, icommon_dim,
|
||||
&alpha, A.get(), ilda, stride_a,
|
||||
B.get(), ildb, stride_b,
|
||||
&beta, C.get(), ildc, stride_c,
|
||||
batch_count
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cublas */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP */
|
10
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cudnn.hpp
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_CUDNN_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_CUDNN_HPP
|
||||
|
||||
#include "cudnn/cudnn.hpp"
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_CUDNN_HPP */
|
80
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cudnn/activation.hpp
vendored
Normal file
@ -0,0 +1,80 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_ACTIVATION_HPP
|
||||
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_ACTIVATION_HPP
|
||||
|
||||
#include <cudnn.h>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
|
||||
|
||||
class ActivationDescriptor {
|
||||
public:
|
||||
enum class ActivationType {
|
||||
IDENTITY,
|
||||
RELU,
|
||||
CLIPPED_RELU,
|
||||
TANH,
|
||||
SIGMOID,
|
||||
ELU
|
||||
};
|
||||
|
||||
ActivationDescriptor() noexcept : descriptor{ nullptr } { }
|
||||
ActivationDescriptor(const ActivationDescriptor&) = delete;
|
||||
ActivationDescriptor(ActivationDescriptor&& other) noexcept
|
||||
: descriptor{ other.descriptor } {
|
||||
other.descriptor = nullptr;
|
||||
}
|
||||
|
||||
/* `relu_ceiling_or_elu_alpha`:
|
||||
* - `alpha` coefficient in ELU activation
|
||||
* - `ceiling` for CLIPPED_RELU activation
|
||||
*/
|
||||
ActivationDescriptor(ActivationType type, double relu_ceiling_or_elu_alpha = 0.0) {
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnCreateActivationDescriptor(&descriptor));
|
||||
try {
|
||||
const auto mode = [type] {
|
||||
switch(type) {
|
||||
case ActivationType::IDENTITY: return CUDNN_ACTIVATION_IDENTITY;
|
||||
case ActivationType::RELU: return CUDNN_ACTIVATION_RELU;
|
||||
case ActivationType::CLIPPED_RELU: return CUDNN_ACTIVATION_CLIPPED_RELU;
|
||||
case ActivationType::SIGMOID: return CUDNN_ACTIVATION_SIGMOID;
|
||||
case ActivationType::TANH: return CUDNN_ACTIVATION_TANH;
|
||||
case ActivationType::ELU: return CUDNN_ACTIVATION_ELU;
|
||||
}
|
||||
CV_Assert(0);
|
||||
return CUDNN_ACTIVATION_IDENTITY;
|
||||
} ();
|
||||
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnSetActivationDescriptor(descriptor, mode, CUDNN_NOT_PROPAGATE_NAN, relu_ceiling_or_elu_alpha));
|
||||
} catch(...) {
|
||||
/* cudnnDestroyActivationDescriptor will not fail for a valid descriptor object */
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyActivationDescriptor(descriptor));
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
~ActivationDescriptor() noexcept {
|
||||
if (descriptor != nullptr) {
|
||||
/* cudnnDestroyActivationDescriptor will not fail */
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyActivationDescriptor(descriptor));
|
||||
}
|
||||
}
|
||||
|
||||
ActivationDescriptor& operator=(const ActivationDescriptor&) = delete;
|
||||
ActivationDescriptor& operator=(ActivationDescriptor&& other) noexcept {
|
||||
descriptor = other.descriptor;
|
||||
other.descriptor = nullptr;
|
||||
return *this;
|
||||
};
|
||||
|
||||
cudnnActivationDescriptor_t get() const noexcept { return descriptor; }
|
||||
|
||||
private:
|
||||
cudnnActivationDescriptor_t descriptor;
|
||||
};
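/* Example (illustrative): a "ReLU6"-style clipped ReLU descriptor
 *
 *   ActivationDescriptor relu6(ActivationDescriptor::ActivationType::CLIPPED_RELU, 6.0);
 *
 * The second argument acts as the clipping ceiling here; for ELU it would be the
 * `alpha` coefficient instead.
 */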
|
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
|
||||
|
||||
#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_ACTIVATION_HPP */
|
637
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp
vendored
Normal file
@ -0,0 +1,637 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP
|
||||
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP
|
||||
|
||||
#include "cudnn.hpp"
|
||||
#include "activation.hpp"
|
||||
|
||||
#include "../pointer.hpp"
|
||||
#include "../workspace.hpp"
|
||||
|
||||
#include <cudnn.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <array>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <type_traits>
|
||||
#include <iterator>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
|
||||
|
||||
/** describe convolution filters
|
||||
*
|
||||
* @tparam T type of elements in the kernels
|
||||
*/
|
||||
template <class T>
|
||||
class FilterDescriptor {
|
||||
public:
|
||||
FilterDescriptor() noexcept : descriptor{ nullptr } { }
|
||||
FilterDescriptor(const FilterDescriptor&) = delete;
|
||||
FilterDescriptor(FilterDescriptor&& other) noexcept
|
||||
: descriptor{ other.descriptor } {
|
||||
other.descriptor = nullptr;
|
||||
}
|
||||
|
||||
/** constructs a filter descriptor from the filter dimensions provided in \p shape
|
||||
*
|
||||
* Shape dimensions:
|
||||
* 0: number of filters
|
||||
* 1: number of input feature maps
|
||||
* 2..n: kernel dimensions
|
||||
*
|
||||
* Exception Guarantee: Strong
|
||||
*/
|
||||
template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
|
||||
FilterDescriptor(const SequenceContainer& shape) {
|
||||
constructor(shape.begin(), shape.end());
|
||||
}
|
||||
|
||||
/** constructs a filter descriptor from the filter dimensions provided in [begin, end)
|
||||
*
|
||||
* Shape dimensions:
|
||||
* 0: number of filters
|
||||
* 1: number of input feature maps
|
||||
* 2..n: kernel dimensions
|
||||
*
|
||||
* Exception Guarantee: Strong
|
||||
*/
|
||||
template <class ForwardItr, typename = typename std::enable_if<!std::is_integral<ForwardItr>::value, void>::type> // TODO is_iterator
|
||||
FilterDescriptor(ForwardItr begin, ForwardItr end) {
|
||||
constructor(begin, end);
|
||||
}
|
||||
|
||||
/** constructs a filter descriptor from the filter dimensions provided as arguments
|
||||
*
|
||||
* Shape dimensions:
|
||||
* 0: number of filters
|
||||
* 1: number of input feature maps
|
||||
* 2..n: kernel dimensions
|
||||
*
|
||||
* Exception Guarantee: Strong
|
||||
*/
|
||||
template <class ...Sizes>
|
||||
FilterDescriptor(Sizes ...sizes) {
|
||||
static_assert(sizeof...(Sizes) >= 3, "filter descriptors must have at least three dimensions");
|
||||
static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank");
|
||||
std::array<int, sizeof...(Sizes)> dims = { static_cast<int>(sizes)... };
|
||||
constructor(std::begin(dims), std::end(dims));
|
||||
}
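/* Example (illustrative): descriptor for 64 filters operating on 3-channel input
 * with 3x3 kernels
 *
 *   FilterDescriptor<float> filters(64, 3, 3, 3);
 */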
|
||||
|
||||
~FilterDescriptor() noexcept {
|
||||
if (descriptor != nullptr) {
|
||||
/* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor));
|
||||
}
|
||||
}
|
||||
|
||||
FilterDescriptor& operator=(const FilterDescriptor&) = delete;
|
||||
FilterDescriptor& operator=(FilterDescriptor&& other) noexcept {
|
||||
descriptor = other.descriptor;
|
||||
other.descriptor = nullptr;
|
||||
return *this;
|
||||
};
|
||||
|
||||
cudnnFilterDescriptor_t get() const noexcept { return descriptor; }
|
||||
|
||||
private:
|
||||
template <class ForwardItr>
|
||||
void constructor(ForwardItr start, ForwardItr end) {
|
||||
CV_Assert(start != end);
|
||||
CV_Assert(std::distance(start, end) >= 3);
|
||||
CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX);
|
||||
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnCreateFilterDescriptor(&descriptor));
|
||||
try {
|
||||
const auto rank = std::distance(start, end);
|
||||
if (rank == 4) {
|
||||
std::array<int, 4> dims;
|
||||
std::copy(start, end, std::begin(dims));
|
||||
CUDA4DNN_CHECK_CUDNN(
|
||||
cudnnSetFilter4dDescriptor(
|
||||
descriptor,
|
||||
detail::get_data_type<T>(), CUDNN_TENSOR_NCHW,
|
||||
dims[0], dims[1], dims[2], dims[3]
|
||||
)
|
||||
);
|
||||
} else {
|
||||
std::vector<int> dims(start, end);
|
||||
CUDA4DNN_CHECK_CUDNN(
|
||||
cudnnSetFilterNdDescriptor(
|
||||
descriptor,
|
||||
detail::get_data_type<T>(), CUDNN_TENSOR_NCHW,
|
||||
dims.size(), dims.data()
|
||||
)
|
||||
);
|
||||
}
|
||||
} catch (...) {
|
||||
/* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor));
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
cudnnFilterDescriptor_t descriptor;
|
||||
};
|
||||
|
||||
/** describes a convolution operation
|
||||
*
|
||||
* @tparam T type of element participating in convolution
|
||||
*/
|
||||
template <class T>
|
||||
class ConvolutionDescriptor {
|
||||
public:
|
||||
ConvolutionDescriptor() noexcept : descriptor{ nullptr } { }
|
||||
ConvolutionDescriptor(const ConvolutionDescriptor&) = delete;
|
||||
ConvolutionDescriptor(ConvolutionDescriptor&& other) noexcept
|
||||
: descriptor{ other.descriptor } {
|
||||
other.descriptor = nullptr;
|
||||
}
|
||||
|
||||
/** constructs a convolution descriptor
|
||||
*
|
||||
* Pre-conditions:
|
||||
* - \p zero_padding, \p stride and \p dilation must have the same size
|
||||
*
|
||||
* The length of the containers is interpreted as the order of the convolution.
|
||||
*
|
||||
* Exception Guarantee: Strong
|
||||
*/
|
||||
template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
|
||||
ConvolutionDescriptor(
|
||||
const SequenceContainer& zero_padding,
|
||||
const SequenceContainer& stride,
|
||||
const SequenceContainer& dilation,
|
||||
std::size_t group_count)
|
||||
{
|
||||
constructor(zero_padding, stride, dilation, group_count);
|
||||
}
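/* Example (illustrative): a 2D convolution with one pixel of zero padding,
 * unit stride, no dilation and no grouping
 *
 *   std::vector<std::size_t> padding{1, 1}, stride{1, 1}, dilation{1, 1};
 *   ConvolutionDescriptor<float> conv(padding, stride, dilation, 1);
 */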
|
||||
|
||||
~ConvolutionDescriptor() noexcept {
|
||||
if (descriptor != nullptr) {
|
||||
/* cudnnDestroyConvolutionDescriptor will not fail for a valid descriptor object */
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor));
|
||||
}
|
||||
}
|
||||
|
||||
ConvolutionDescriptor& operator=(const ConvolutionDescriptor&) = delete;
|
||||
ConvolutionDescriptor& operator=(ConvolutionDescriptor&& other) noexcept {
|
||||
descriptor = other.descriptor;
|
||||
other.descriptor = nullptr;
|
||||
return *this;
|
||||
};
|
||||
|
||||
cudnnConvolutionDescriptor_t get() const noexcept { return descriptor; }
|
||||
|
||||
private:
|
||||
template <class SequenceContainer>
|
||||
void constructor(
|
||||
const SequenceContainer& zero_padding,
|
||||
const SequenceContainer& stride,
|
||||
const SequenceContainer& dilation,
|
||||
std::size_t group_count)
|
||||
{
|
||||
CV_Assert(zero_padding.size() == stride.size());
|
||||
CV_Assert(zero_padding.size() == dilation.size());
|
||||
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&descriptor));
|
||||
try {
|
||||
const auto rank = zero_padding.size();
|
||||
if (rank == 2) {
|
||||
CUDA4DNN_CHECK_CUDNN(
|
||||
cudnnSetConvolution2dDescriptor(
|
||||
descriptor,
|
||||
zero_padding[0], zero_padding[1],
|
||||
stride[0], stride[1],
|
||||
dilation[0], dilation[1],
|
||||
CUDNN_CROSS_CORRELATION,
|
||||
detail::get_data_type<T>()
|
||||
)
|
||||
);
|
||||
} else {
|
||||
std::vector<int> ipadding(std::begin(zero_padding), std::end(zero_padding));
|
||||
std::vector<int> istride(std::begin(stride), std::end(stride));
|
||||
std::vector<int> idilation(std::begin(dilation), std::end(dilation));
|
||||
CUDA4DNN_CHECK_CUDNN(
|
||||
cudnnSetConvolutionNdDescriptor(
|
||||
descriptor,
|
||||
rank, ipadding.data(), istride.data(), idilation.data(),
|
||||
CUDNN_CROSS_CORRELATION,
|
||||
detail::get_data_type<T>()
|
||||
)
|
||||
);
|
||||
}
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, group_count));
|
||||
|
||||
#if CUDNN_MAJOR >= 8
|
||||
/* cuDNN 7 and below use FMA math by default. cuDNN 8 includes TF32 Tensor Ops
|
||||
* in the default setting. TF32 convolutions have lower precision than FP32.
|
||||
* Hence, we set the math type to CUDNN_FMA_MATH to reproduce old behavior.
|
||||
*/
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_FMA_MATH));
|
||||
#endif
|
||||
|
||||
if (std::is_same<T, half>::value)
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_TENSOR_OP_MATH));
|
||||
} catch (...) {
|
||||
/* cudnnDestroyConvolutionDescriptor will not fail for a valid descriptor object */
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor));
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
cudnnConvolutionDescriptor_t descriptor;
|
||||
};
|
||||
|
||||
/** wrapper around a convolution algorithm
|
||||
*
|
||||
* @tparam T type of elements being convolved
|
||||
*/
|
||||
template <class T>
|
||||
class ConvolutionAlgorithm {
|
||||
public:
|
||||
ConvolutionAlgorithm() noexcept : workspace_size{ 0 } { }
|
||||
ConvolutionAlgorithm(ConvolutionAlgorithm&) = default;
|
||||
ConvolutionAlgorithm(ConvolutionAlgorithm&&) = default;
|
||||
|
||||
/** selects a good algorithm for convolution for given configuration
|
||||
*
|
||||
* Exception Guarantee: Strong
|
||||
*/
|
||||
ConvolutionAlgorithm(
|
||||
const Handle& handle,
|
||||
const ConvolutionDescriptor<T>& convDesc,
|
||||
const FilterDescriptor<T>& filterDesc,
|
||||
const TensorDescriptor<T>& inputDesc,
|
||||
const TensorDescriptor<T>& outputDesc)
|
||||
{
|
||||
#if CUDNN_MAJOR >= 8
|
||||
int requestedAlgoCount = 0, returnedAlgoCount = 0;
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithmMaxCount(handle.get(), &requestedAlgoCount));
|
||||
std::vector<cudnnConvolutionFwdAlgoPerf_t> results(requestedAlgoCount);
|
||||
CUDA4DNN_CHECK_CUDNN(
|
||||
cudnnGetConvolutionForwardAlgorithm_v7(
|
||||
handle.get(),
|
||||
inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
|
||||
requestedAlgoCount,
|
||||
&returnedAlgoCount,
|
||||
&results[0]
|
||||
)
|
||||
);
|
||||
|
||||
size_t free_memory, total_memory;
|
||||
CUDA4DNN_CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));
|
||||
|
||||
bool found_conv_algorithm = false;
|
||||
for (int i = 0; i < returnedAlgoCount; i++)
|
||||
{
|
||||
if (results[i].status == CUDNN_STATUS_SUCCESS &&
|
||||
results[i].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
|
||||
results[i].memory < free_memory)
|
||||
{
|
||||
found_conv_algorithm = true;
|
||||
algo = results[i].algo;
|
||||
workspace_size = results[i].memory;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found_conv_algorithm)
|
||||
CV_Error (cv::Error::GpuApiCallError, "cuDNN did not return a suitable algorithm for convolution.");
|
||||
#else
|
||||
CUDA4DNN_CHECK_CUDNN(
|
||||
cudnnGetConvolutionForwardAlgorithm(
|
||||
handle.get(),
|
||||
inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
|
||||
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
|
||||
0, /* no memory limit */
|
||||
&algo
|
||||
)
|
||||
);
|
||||
|
||||
CUDA4DNN_CHECK_CUDNN(
|
||||
cudnnGetConvolutionForwardWorkspaceSize(
|
||||
handle.get(),
|
||||
inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
|
||||
algo, &workspace_size
|
||||
)
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default;
|
||||
ConvolutionAlgorithm& operator=(ConvolutionAlgorithm&& other) = default;
|
||||
|
||||
cudnnConvolutionFwdAlgo_t get() const noexcept { return algo; }
|
||||
|
||||
/** number of bytes of workspace memory required by the algorithm */
|
||||
std::size_t get_workspace_size() const noexcept { return workspace_size; }
|
||||
|
||||
private:
|
||||
cudnnConvolutionFwdAlgo_t algo;
|
||||
std::size_t workspace_size;
|
||||
};
|
||||
|
||||
/** gives the shape of the output tensor of convolution
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class T>
|
||||
void getConvolutionForwardOutputDim(
|
||||
const ConvolutionDescriptor<T>& convDesc,
|
||||
const FilterDescriptor<T>& filterDesc,
|
||||
const TensorDescriptor<T>& inputDesc,
|
||||
std::vector<int>& output)
|
||||
{
|
||||
output.clear();
|
||||
output.resize(CUDNN_DIM_MAX); /* we use `output` to hold temporaries */
|
||||
|
||||
std::vector<int> temp(CUDNN_DIM_MAX);
|
||||
cudnnDataType_t tempDataType;
|
||||
CUDA4DNN_CHECK_CUDNN(
|
||||
cudnnGetTensorNdDescriptor(
|
||||
inputDesc.get(),
|
||||
CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */
|
||||
&tempDataType,
|
||||
output.data(),
|
||||
temp.data(),
|
||||
temp.data()
|
||||
)
|
||||
);
|
||||
|
||||
const auto rank = output[0];
|
||||
output.resize(rank);
|
||||
CUDA4DNN_CHECK_CUDNN(
|
||||
cudnnGetConvolutionNdForwardOutputDim(
|
||||
convDesc.get(), inputDesc.get(), filterDesc.get(), rank, output.data()
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
/** @brief performs convolution
|
||||
*
|
||||
* dstValue = alpha * result + beta * priorDstValue
|
||||
*
|
||||
* @tparam T convolution element type (must be `half` or `float`)
|
||||
*
|
||||
* @param handle valid cuDNN Handle
|
||||
* @param convDesc convolution description
|
||||
* @param convAlgo algorithm to use for convolution
|
||||
* @param workspace workspace memory which meets the requirements of \p convAlgo
|
||||
* @param filterDesc filter descriptor
|
||||
* @param[in] filterPtr pointer to device memory containing the filters
|
||||
* @param inputDesc tensor descriptor describing the input
|
||||
* @param[in] inputPtr pointer to input tensor in device memory
|
||||
* @param alpha result scale factor
|
||||
* @param beta previous value scale factor
|
||||
* @param outputDesc tensor descriptor describing the output
|
||||
* @param[out] outputPtr pointer to output tensor in device memory
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class T>
|
||||
void convolve(
|
||||
const Handle& handle,
|
||||
const ConvolutionDescriptor<T>& convDesc,
|
||||
const ConvolutionAlgorithm<T>& convAlgo,
|
||||
WorkspaceInstance workspace,
|
||||
const FilterDescriptor<T>& filterDesc,
|
||||
DevicePtr<const T> filterPtr,
|
||||
const TensorDescriptor<T>& inputDesc,
|
||||
DevicePtr<const T> inputPtr,
|
||||
T alpha, T beta,
|
||||
const TensorDescriptor<T>& outputDesc,
|
||||
DevicePtr<T> outputPtr)
|
||||
{
|
||||
CV_Assert(handle);
|
||||
|
||||
CUDA4DNN_CHECK_CUDNN(
|
||||
cudnnConvolutionForward(
|
||||
handle.get(),
|
||||
&alpha, inputDesc.get(), inputPtr.get(),
|
||||
filterDesc.get(), filterPtr.get(),
|
||||
convDesc.get(), convAlgo.get(),
|
||||
static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
|
||||
&beta, outputDesc.get(), outputPtr.get()
|
||||
)
|
||||
);
|
||||
}
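/* Putting the pieces together (a rough sketch; the descriptors, workspace and
 * device pointers are assumed to have been created as shown above):
 *
 *   ConvolutionAlgorithm<float> algo(handle, convDesc, filterDesc, inputDesc, outputDesc);
 *   // ... allocate a workspace of at least algo.get_workspace_size() bytes ...
 *   convolve<float>(handle, convDesc, algo, workspace,
 *                   filterDesc, filterPtr, inputDesc, inputPtr,
 *                   1.0f, 0.0f, outputDesc, outputPtr);
 */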
|
||||
|
||||
template <> inline
|
||||
void convolve(
|
||||
const Handle& handle,
|
||||
const ConvolutionDescriptor<half>& convDesc,
|
||||
const ConvolutionAlgorithm<half>& convAlgo,
|
||||
WorkspaceInstance workspace,
|
||||
const FilterDescriptor<half>& filterDesc,
|
||||
DevicePtr<const half> filterPtr,
|
||||
const TensorDescriptor<half>& inputDesc,
|
||||
DevicePtr<const half> inputPtr,
|
||||
half alpha, half beta,
|
||||
const TensorDescriptor<half>& outputDesc,
|
||||
DevicePtr<half> outputPtr)
|
||||
{
|
||||
CV_Assert(handle);
|
||||
|
||||
/* we specialize for fp16 as the scaling factors must be provided as `float` */
|
||||
float alpha_ = alpha, beta_ = beta;
|
||||
CUDA4DNN_CHECK_CUDNN(
|
||||
cudnnConvolutionForward(
|
||||
handle.get(),
|
||||
&alpha_, inputDesc.get(), inputPtr.get(),
|
||||
filterDesc.get(), filterPtr.get(),
|
||||
convDesc.get(), convAlgo.get(),
|
||||
static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
|
||||
&beta_, outputDesc.get(), outputPtr.get()
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
/** @brief performs convolution, bias addition and activation simultaneously
|
||||
*
|
||||
* dstValue = act(alpha * conv(input) + bias)
|
||||
*
|
||||
* @tparam T convolution element type (must be `half` or `float`)
|
||||
*
|
||||
* @param handle valid cuDNN Handle
|
||||
* @param convDesc convolution description
|
||||
* @param convAlgo algorithm to use for convolution
|
||||
* @param workspace workspace memory which meets the requirements of \p convAlgo
|
||||
* @param filterDesc filter descriptor
|
||||
* @param[in] filterPtr pointer to device memory containing the filters
|
||||
* @param alpha convolution scale factor
|
||||
* @param inputDesc tensor descriptor describing the input
|
||||
* @param[in] inputPtr pointer to input tensor in device memory
|
||||
* @param biasDesc tensor descriptor describing the bias
|
||||
* @param[in] biasPtr pointer to bias tensor in device memory
|
||||
* @param actDesc activation descriptor
|
||||
* @param outputDesc tensor descriptor describing the output
|
||||
* @param[out] outputPtr pointer to output tensor in device memory
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class T>
|
||||
void convolve_with_bias_activation(
|
||||
const Handle& handle,
|
||||
T alpha,
|
||||
const ConvolutionDescriptor<T>& convDesc,
|
||||
const ConvolutionAlgorithm<T>& convAlgo,
|
||||
WorkspaceInstance workspace,
|
||||
const FilterDescriptor<T>& filterDesc,
|
||||
DevicePtr<const T> filterPtr,
|
||||
const TensorDescriptor<T>& inputDesc,
|
||||
DevicePtr<const T> inputPtr,
|
||||
const TensorDescriptor<T>& biasDesc,
|
||||
DevicePtr<const T> biasPtr,
|
||||
const ActivationDescriptor& actDesc,
|
||||
const TensorDescriptor<T>& outputDesc,
|
||||
DevicePtr<T> outputPtr)
|
||||
{
|
||||
CV_Assert(handle);
|
||||
|
||||
T alpha2 = 0.0;
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnConvolutionBiasActivationForward(
|
||||
handle.get(),
|
||||
&alpha, inputDesc.get(), inputPtr.get(),
|
||||
filterDesc.get(), filterPtr.get(),
|
||||
convDesc.get(), convAlgo.get(),
|
||||
static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
|
||||
&alpha2, outputDesc.get(), outputPtr.get(),
|
||||
biasDesc.get(), biasPtr.get(),
|
||||
actDesc.get(),
|
||||
outputDesc.get(), outputPtr.get()));
|
||||
}
|
||||
|
||||
template <> inline
|
||||
void convolve_with_bias_activation(
|
||||
const Handle& handle,
|
||||
half alpha,
|
||||
const ConvolutionDescriptor<half>& convDesc,
|
||||
const ConvolutionAlgorithm<half>& convAlgo,
|
||||
WorkspaceInstance workspace,
|
||||
const FilterDescriptor<half>& filterDesc,
|
||||
DevicePtr<const half> filterPtr,
|
||||
const TensorDescriptor<half>& inputDesc,
|
||||
DevicePtr<const half> inputPtr,
|
||||
const TensorDescriptor<half>& biasDesc,
|
||||
DevicePtr<const half> biasPtr,
|
||||
const ActivationDescriptor& actDesc,
|
||||
const TensorDescriptor<half>& outputDesc,
|
||||
DevicePtr<half> outputPtr)
|
||||
{
|
||||
CV_Assert(handle);
|
||||
|
||||
float alpha_ = alpha, alpha2 = 0.0;
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnConvolutionBiasActivationForward(
|
||||
handle.get(),
|
||||
&alpha_, inputDesc.get(), inputPtr.get(),
|
||||
filterDesc.get(), filterPtr.get(),
|
||||
convDesc.get(), convAlgo.get(),
|
||||
static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
|
||||
&alpha2, outputDesc.get(), outputPtr.get(),
|
||||
biasDesc.get(), biasPtr.get(),
|
||||
actDesc.get(),
|
||||
outputDesc.get(), outputPtr.get()));
|
||||
}
|
||||
|
||||
/** @brief performs convolution, bias addition, eltwise addition and activation simultaneously
|
||||
*
|
||||
* dstValue = act(alpha1 * conv(input) + bias + alpha2 * eltwise)
|
||||
*
|
||||
* @tparam T convolution element type (must be `half` or `float`)
|
||||
*
|
||||
* @param handle valid cuDNN Handle
|
||||
* @param convDesc convolution description
|
||||
* @param convAlgo algorithm to use for convolution
|
||||
* @param workspace workspace memory which meets the requirements of \p convAlgo
|
||||
* @param filterDesc filter descriptor
|
||||
* @param[in] filterPtr pointer to device memory containing the filters
|
||||
* @param alpha1 convolution scale factor
|
||||
* @param inputDesc tensor descriptor describing the input
|
||||
* @param[in] inputPtr pointer to input tensor in device memory
|
||||
* @param biasDesc tensor descriptor describing the bias
|
||||
* @param[in] biasPtr pointer to bias tensor in device memory
|
||||
* @param alpha2 eltwise scale factor
|
||||
* @param eltwiseDesc tensor descriptor describing the eltwise tensor
|
||||
* @param[in] eltwisePtr pointer to the eltwise tensor in device memory
|
||||
* @param actDesc activation descriptor
|
||||
* @param outputDesc tensor descriptor describing the output
|
||||
* @param[out] outputPtr pointer to output tensor in device memory
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class T>
|
||||
void convolve_with_bias_eltwise_activation(
|
||||
const Handle& handle,
|
||||
T alpha1,
|
||||
const ConvolutionDescriptor<T>& convDesc,
|
||||
const ConvolutionAlgorithm<T>& convAlgo,
|
||||
WorkspaceInstance workspace,
|
||||
const FilterDescriptor<T>& filterDesc,
|
||||
DevicePtr<const T> filterPtr,
|
||||
const TensorDescriptor<T>& inputDesc,
|
||||
DevicePtr<const T> inputPtr,
|
||||
const TensorDescriptor<T>& biasDesc,
|
||||
DevicePtr<const T> biasPtr,
|
||||
T alpha2,
|
||||
const TensorDescriptor<T>& eltwiseDesc,
|
||||
DevicePtr<const T> eltwisePtr,
|
||||
const ActivationDescriptor& actDesc,
|
||||
const TensorDescriptor<T>& outputDesc,
|
||||
DevicePtr<T> outputPtr)
|
||||
{
|
||||
CV_Assert(handle);
|
||||
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnConvolutionBiasActivationForward(
|
||||
handle.get(),
|
||||
&alpha1, inputDesc.get(), inputPtr.get(),
|
||||
filterDesc.get(), filterPtr.get(),
|
||||
convDesc.get(), convAlgo.get(),
|
||||
static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
|
||||
&alpha2, eltwiseDesc.get(), eltwisePtr.get(),
|
||||
biasDesc.get(), biasPtr.get(),
|
||||
actDesc.get(),
|
||||
outputDesc.get(), outputPtr.get()));
|
||||
}
|
||||
|
||||
template <> inline
|
||||
void convolve_with_bias_eltwise_activation(
|
||||
const Handle& handle,
|
||||
half alpha1,
|
||||
const ConvolutionDescriptor<half>& convDesc,
|
||||
const ConvolutionAlgorithm<half>& convAlgo,
|
||||
WorkspaceInstance workspace,
|
||||
const FilterDescriptor<half>& filterDesc,
|
||||
DevicePtr<const half> filterPtr,
|
||||
const TensorDescriptor<half>& inputDesc,
|
||||
DevicePtr<const half> inputPtr,
|
||||
const TensorDescriptor<half>& biasDesc,
|
||||
DevicePtr<const half> biasPtr,
|
||||
half alpha2,
|
||||
const TensorDescriptor<half>& eltwiseDesc,
|
||||
DevicePtr<const half> eltwisePtr,
|
||||
const ActivationDescriptor& actDesc,
|
||||
const TensorDescriptor<half>& outputDesc,
|
||||
DevicePtr<half> outputPtr)
|
||||
{
|
||||
CV_Assert(handle);
|
||||
|
||||
float alpha1_ = alpha1, alpha2_ = alpha2;
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnConvolutionBiasActivationForward(
|
||||
handle.get(),
|
||||
&alpha1_, inputDesc.get(), inputPtr.get(),
|
||||
filterDesc.get(), filterPtr.get(),
|
||||
convDesc.get(), convAlgo.get(),
|
||||
static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
|
||||
&alpha2_, eltwiseDesc.get(), eltwisePtr.get(),
|
||||
biasDesc.get(), biasPtr.get(),
|
||||
actDesc.get(),
|
||||
outputDesc.get(), outputPtr.get()));
|
||||
}
|
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
|
||||
|
||||
#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP */
|
292
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cudnn/cudnn.hpp
vendored
Normal file
@ -0,0 +1,292 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CUDNN_HPP
|
||||
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CUDNN_HPP
|
||||
|
||||
#include "../pointer.hpp"
|
||||
|
||||
#include <cudnn.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <array>
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
#include <type_traits>
|
||||
#include <iterator>
|
||||
|
||||
#define CUDA4DNN_CHECK_CUDNN(call) \
|
||||
::cv::dnn::cuda4dnn::csl::cudnn::detail::check((call), CV_Func, __FILE__, __LINE__)
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
|
||||
|
||||
/** @brief exception class for errors thrown by the cuDNN API */
|
||||
class cuDNNException : public CUDAException {
|
||||
public:
|
||||
cuDNNException(cudnnStatus_t code, const std::string& msg, const std::string& func, const std::string& file, int line)
|
||||
: CUDAException(Error::GpuApiCallError, msg, func, file, line), cudnnError{code}
|
||||
{
|
||||
}
|
||||
|
||||
cudnnStatus_t getCUDNNStatus() const noexcept { return cudnnError; }
|
||||
|
||||
private:
|
||||
cudnnStatus_t cudnnError;
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
inline void check(cudnnStatus_t status, const char* func, const char* file, int line) {
|
||||
if (status != CUDNN_STATUS_SUCCESS)
|
||||
throw cuDNNException(status, cudnnGetErrorString(status), func, file, line);
|
||||
}
|
||||
|
||||
/** get_data_type<T> returns the equivalent cudnn enumeration constant for type T */
|
||||
using cudnn_data_enum_type = decltype(CUDNN_DATA_FLOAT);
|
||||
template <class> cudnn_data_enum_type get_data_type();
|
||||
template <> inline cudnn_data_enum_type get_data_type<half>() { return CUDNN_DATA_HALF; }
|
||||
template <> inline cudnn_data_enum_type get_data_type<float>() { return CUDNN_DATA_FLOAT; }
|
||||
}
|
||||
|
||||
/** @brief noncopyable cuDNN smart handle
|
||||
*
|
||||
* UniqueHandle is a smart non-sharable wrapper for cuDNN handle which ensures that the handle
|
||||
* is destroyed after use.
|
||||
*/
|
||||
class UniqueHandle {
|
||||
public:
|
||||
UniqueHandle() noexcept : handle{ nullptr } { }
|
||||
UniqueHandle(UniqueHandle&) = delete;
|
||||
UniqueHandle(UniqueHandle&& other) noexcept {
|
||||
stream = std::move(other.stream);
|
||||
handle = other.handle;
|
||||
other.handle = nullptr;
|
||||
}
|
||||
|
||||
/** creates a cuDNN handle and associates it with the stream specified
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
UniqueHandle(Stream strm) : stream(std::move(strm)) {
|
||||
CV_Assert(stream);
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnCreate(&handle));
|
||||
try {
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnSetStream(handle, stream.get()));
|
||||
} catch (...) {
|
||||
/* cudnnDestroy won't throw if a valid handle is passed */
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroy(handle));
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
~UniqueHandle() noexcept {
|
||||
if (handle != nullptr) {
|
||||
/* cudnnDestroy won't throw if a valid handle is passed */
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroy(handle));
|
||||
}
|
||||
}
|
||||
|
||||
UniqueHandle& operator=(const UniqueHandle&) = delete;
|
||||
UniqueHandle& operator=(UniqueHandle&& other) noexcept {
|
||||
CV_Assert(other);
|
||||
if (&other != this) {
|
||||
UniqueHandle(std::move(*this)); /* destroy current handle */
|
||||
stream = std::move(other.stream);
|
||||
handle = other.handle;
|
||||
other.handle = nullptr;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
/** returns the raw cuDNN handle */
|
||||
cudnnHandle_t get() const noexcept {
|
||||
CV_Assert(handle);
|
||||
return handle;
|
||||
}
|
||||
|
||||
/** returns true if the handle is valid */
|
||||
explicit operator bool() const noexcept { return static_cast<bool>(handle); }
|
||||
|
||||
private:
|
||||
Stream stream;
|
||||
cudnnHandle_t handle;
|
||||
};
|
||||
|
||||
/** @brief sharable cuDNN smart handle
|
||||
*
|
||||
* Handle is a smart sharable wrapper for cuDNN handle which ensures that the handle
|
||||
* is destroyed after all references to the handle are destroyed. The handle must always
|
||||
* be associated with a non-default stream. The stream must be specified during construction.
|
||||
*
|
||||
* @note Moving a Handle object to another invalidates the former
|
||||
*/
|
||||
class Handle {
|
||||
public:
|
||||
Handle() = default;
|
||||
Handle(const Handle&) = default;
|
||||
Handle(Handle&&) = default;
|
||||
|
||||
/** creates a cuDNN handle and associates it with the stream specified
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
Handle(Stream strm) : handle(std::make_shared<UniqueHandle>(std::move(strm))) { }
|
||||
|
||||
Handle& operator=(const Handle&) = default;
|
||||
Handle& operator=(Handle&&) = default;
|
||||
|
||||
/** returns true if the handle is valid */
|
||||
explicit operator bool() const noexcept { return static_cast<bool>(handle); }
|
||||
|
||||
/** returns the raw cuDNN handle */
|
||||
cudnnHandle_t get() const noexcept {
|
||||
CV_Assert(handle);
|
||||
return handle->get();
|
||||
}
|
||||
|
||||
private:
|
||||
std::shared_ptr<UniqueHandle> handle;
|
||||
};
|
||||
|
||||
/** describe a tensor
|
||||
*
|
||||
* @tparam T type of elements in the tensor
|
||||
*/
|
||||
template <class T>
|
||||
class TensorDescriptor {
|
||||
public:
|
||||
TensorDescriptor() noexcept : descriptor{ nullptr } { }
|
||||
TensorDescriptor(const TensorDescriptor&) = delete;
|
||||
TensorDescriptor(TensorDescriptor&& other) noexcept
|
||||
: descriptor{ other.descriptor } {
|
||||
other.descriptor = nullptr;
|
||||
}
|
||||
|
||||
/** constructs a tensor descriptor from the axis lengths provided in \p shape
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
|
||||
TensorDescriptor(const SequenceContainer& shape) {
|
||||
constructor(shape.begin(), shape.end());
|
||||
}
|
||||
|
||||
/** constructs a tensor descriptor from the axis lengths provided in [begin, end)
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class ForwardItr, typename = typename std::enable_if<!std::is_integral<ForwardItr>::value, void>::type> // TODO is_iterator
|
||||
TensorDescriptor(ForwardItr begin, ForwardItr end) {
|
||||
constructor(begin, end);
|
||||
}
|
||||
|
||||
/** constructs a tensor descriptor from the axis lengths provided as arguments
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class ...Sizes>
|
||||
TensorDescriptor(Sizes ...sizes) {
|
||||
static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank");
|
||||
std::array<int, sizeof...(Sizes)> dims = { static_cast<int>(sizes)... };
|
||||
constructor(std::begin(dims), std::end(dims));
|
||||
}
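/* Example (illustrative): an NCHW tensor holding a batch of 32 three-channel
 * 224x224 images
 *
 *   TensorDescriptor<float> desc(32, 3, 224, 224);
 */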
|
||||
|
||||
~TensorDescriptor() noexcept {
|
||||
if (descriptor != nullptr) {
|
||||
/* cudnnDestroyTensorDescriptor will not fail */
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorDescriptor(descriptor));
|
||||
}
|
||||
}
|
||||
|
||||
TensorDescriptor& operator=(const TensorDescriptor&) = delete;
|
||||
TensorDescriptor& operator=(TensorDescriptor&& other) noexcept {
|
||||
descriptor = other.descriptor;
|
||||
other.descriptor = nullptr;
|
||||
return *this;
|
||||
};
|
||||
|
||||
cudnnTensorDescriptor_t get() const noexcept { return descriptor; }
|
||||
|
||||
private:
|
||||
template <class ForwardItr>
|
||||
void constructor(ForwardItr start, ForwardItr end) {
|
||||
CV_Assert(start != end);
|
||||
CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX);
|
||||
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnCreateTensorDescriptor(&descriptor));
|
||||
try {
|
||||
/* cuDNN documentation recommends using the 4d tensor API whenever possible
|
||||
* hence, we create a 4d tensor descriptor for 3d tensors
|
||||
*/
|
||||
const auto rank = std::distance(start, end);
|
||||
if (rank <= 4) {
|
||||
std::array<int, 4> dims;
|
||||
std::fill(std::begin(dims), std::end(dims), 1);
|
||||
|
||||
/* suppose we have a 3d tensor, the first axis is the batch axis and
|
||||
* the second axis is the channel axis (generally)
|
||||
*
|
||||
* cuDNN frequently assumes that the first axis is the batch axis and the
|
||||
* second axis is the channel axis; hence, we copy the shape of a lower rank
|
||||
* tensor to the beginning of `dims`
|
||||
*/
|
||||
std::copy(start, end, std::begin(dims));
|
||||
|
||||
CUDA4DNN_CHECK_CUDNN(
|
||||
cudnnSetTensor4dDescriptor(descriptor,
|
||||
CUDNN_TENSOR_NCHW, detail::get_data_type<T>(),
|
||||
dims[0], dims[1], dims[2], dims[3]
|
||||
)
|
||||
);
|
||||
} else {
|
||||
std::vector<int> stride(rank);
|
||||
stride.back() = 1;
|
||||
/* WHAT WE HAVE NOW:
|
||||
* stride[-1] = 1
|
||||
* stride[-2] = garbage
|
||||
* stride[-3] = garbage
|
||||
* stride[-4] = garbage
|
||||
* ...
|
||||
*/
|
||||
|
||||
std::copy(start + 1, end, stride.begin());
|
||||
/* WHAT WE HAVE NOW:
|
||||
* stride[-1] = 1
|
||||
* stride[-2] = dim[-1]
|
||||
* stride[-3] = dim[-2]
|
||||
* stride[-4] = dim[-3]
|
||||
* ...
|
||||
*/
|
||||
|
||||
std::partial_sum(stride.rbegin(), stride.rend(), stride.rbegin(), std::multiplies<int>());
|
||||
/* WHAT WE HAVE NOW:
|
||||
* stride[-1] = 1
|
||||
* stride[-2] = stride[-1] * dim[-1]
|
||||
* stride[-3] = stride[-2] * dim[-2]
|
||||
* stride[-4] = stride[-3] * dim[-3]
|
||||
* ...
|
||||
*/
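/* e.g. for dims = [1, 3, 2, 4, 5] the two steps above yield
 * stride = [120, 40, 20, 5, 1], i.e. a fully packed layout
 */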
|
||||
|
||||
std::vector<int> dims(start, end);
|
||||
CUDA4DNN_CHECK_CUDNN(
|
||||
cudnnSetTensorNdDescriptor(descriptor,
|
||||
detail::get_data_type<T>(), rank,
|
||||
dims.data(), stride.data()
|
||||
)
|
||||
);
|
||||
}
|
||||
} catch (...) {
|
||||
/* cudnnDestroyTensorDescriptor will not fail */
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorDescriptor(descriptor));
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
cudnnTensorDescriptor_t descriptor;
|
||||
};
|
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
|
||||
|
||||
#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_HPP */
|
205
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cudnn/lrn.hpp
vendored
Normal file
@ -0,0 +1,205 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP

#include "cudnn.hpp"

#include "../pointer.hpp"
#include "../workspace.hpp"

#include <opencv2/core.hpp>

#include <cudnn.h>

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {

    class LRNDescriptor {
    public:
        enum class LRNType {
            ACROSS_CHANNELS,
            WITHIN_CHANNEL
        };

        LRNDescriptor() noexcept : descriptor{ nullptr } { }
        LRNDescriptor(const LRNDescriptor&) = delete;
        LRNDescriptor(LRNDescriptor&& other) noexcept
            : descriptor{ other.descriptor }, type{ other.type } {
            other.descriptor = nullptr;
        }

        /** sets up a LRN descriptor
         *
         * @param local_size size of the normalization window
         * @param alpha variance scaling parameter
         * @param beta power parameter
         * @param k bias parameter
         *
         * @note \p alpha is divided by the window width in across channels mode
         * @note \p alpha is divided by the (window width)^spatialDimensions in within channel mode
         *
         * @note the \p alpha, \p beta and \p k will be type casted to the tensor datatype during operation
         *
         * Exception Guarantee: Basic
         */
        LRNDescriptor(std::size_t local_size, double alpha, double beta, double k, LRNType type_) {
            constructor(local_size, alpha, beta, k, type_);
        }

        ~LRNDescriptor() noexcept {
            if (descriptor != nullptr) {
                /* cudnnDestroyLRNDescriptor will not fail for a valid descriptor */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyLRNDescriptor(descriptor));
            }
        }

        LRNDescriptor& operator=(const LRNDescriptor&) = delete;
        LRNDescriptor& operator=(LRNDescriptor&& other) noexcept {
            descriptor = other.descriptor;
            type = other.type;
            other.descriptor = nullptr;
            return *this;
        };

        cudnnLRNDescriptor_t get() const noexcept { return descriptor; }
        LRNType getType() const noexcept { return type; }

    private:
        void constructor(std::size_t local_size, double alpha, double beta, double k, LRNType type_) {
            CV_Assert(CUDNN_LRN_MIN_N <= local_size && local_size <= CUDNN_LRN_MAX_N);

            type = type_;

            CUDA4DNN_CHECK_CUDNN(cudnnCreateLRNDescriptor(&descriptor));
            try {
                CUDA4DNN_CHECK_CUDNN(
                    cudnnSetLRNDescriptor(
                        descriptor,
                        local_size,
                        alpha,
                        beta,
                        k
                    )
                );
            } catch (...) {
                /* cudnnDestroyLRNDescriptor will not fail for a valid descriptor */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyLRNDescriptor(descriptor));
                throw;
            }
        }

        cudnnLRNDescriptor_t descriptor;
        LRNType type;
    };

    /** @brief performs local response normalization
     *
     * dstValue = alpha * result + beta * priorDstValue
     *
     * @tparam T element type (must be `half` or `float`)
     *
     * @param handle valid cuDNN Handle
     * @param lrnDesc LRN description
     * @param inputDesc tensor descriptor describing the input
     * @param[in] inputPtr pointer to input tensor in device memory
     * @param alpha result scale factor
     * @param beta previous value scale factor
     * @param outputDesc tensor descriptor describing the output
     * @param[out] outputPtr pointer to output tensor in device memory
     * @param workspace workspace memory which meets the requirements of \p convAlgo
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void LRNForward(
        const Handle& handle,
        const LRNDescriptor& lrnDesc,
        const TensorDescriptor<T>& inputDesc,
        DevicePtr<const T> inputPtr,
        T alpha, T beta,
        const TensorDescriptor<T>& outputDesc,
        DevicePtr<T> outputPtr,
        WorkspaceInstance workspace)
    {
        CV_Assert(handle);

        if (lrnDesc.getType() == LRNDescriptor::LRNType::ACROSS_CHANNELS) {
            CUDA4DNN_CHECK_CUDNN(
                cudnnLRNCrossChannelForward(
                    handle.get(),
                    lrnDesc.get(), CUDNN_LRN_CROSS_CHANNEL_DIM1,
                    &alpha, inputDesc.get(), inputPtr.get(),
                    &beta, outputDesc.get(), outputPtr.get()
                )
            );
        } else if (lrnDesc.getType() == LRNDescriptor::LRNType::WITHIN_CHANNEL) {
            std::size_t size;
            CUDA4DNN_CHECK_CUDNN(cudnnGetTensorSizeInBytes(inputDesc.get(), &size));

            DevicePtr<void> temp1 = workspace.get_span<half>(size).data();
            DevicePtr<void> temp2 = workspace.get_span<half>(size).data();

            CUDA4DNN_CHECK_CUDNN(
                cudnnDivisiveNormalizationForward(
                    handle.get(),
                    lrnDesc.get(), CUDNN_DIVNORM_PRECOMPUTED_MEANS,
                    &alpha, inputDesc.get(), inputPtr.get(),
                    NULL,
                    static_cast<void*>(temp1), static_cast<void*>(temp2),
                    &beta, outputDesc.get(), outputPtr.get()
                )
            );
        }
    }

    template <> inline
    void LRNForward(
        const Handle& handle,
        const LRNDescriptor& lrnDesc,
        const TensorDescriptor<half>& inputDesc,
        DevicePtr<const half> inputPtr,
        half alpha, half beta,
        const TensorDescriptor<half>& outputDesc,
        DevicePtr<half> outputPtr,
        WorkspaceInstance workspace)
    {
        CV_Assert(handle);

        /* we specialize for fp16 as the scaling factors must be provided as `float` */
        float alpha_ = alpha, beta_ = beta;
        if (lrnDesc.getType() == LRNDescriptor::LRNType::ACROSS_CHANNELS) {
            CUDA4DNN_CHECK_CUDNN(
                cudnnLRNCrossChannelForward(
                    handle.get(),
                    lrnDesc.get(), CUDNN_LRN_CROSS_CHANNEL_DIM1,
                    &alpha_, inputDesc.get(), inputPtr.get(),
                    &beta_, outputDesc.get(), outputPtr.get()
                )
            );
        } else if (lrnDesc.getType() == LRNDescriptor::LRNType::WITHIN_CHANNEL) {
            std::size_t size;
            CUDA4DNN_CHECK_CUDNN(cudnnGetTensorSizeInBytes(inputDesc.get(), &size));

            DevicePtr<void> temp1 = workspace.get_span<half>(size).data();
            DevicePtr<void> temp2 = workspace.get_span<half>(size).data();

            CUDA4DNN_CHECK_CUDNN(
                cudnnDivisiveNormalizationForward(
                    handle.get(),
                    lrnDesc.get(), CUDNN_DIVNORM_PRECOMPUTED_MEANS,
                    &alpha_, inputDesc.get(), inputPtr.get(),
                    NULL,
                    static_cast<void*>(temp1), static_cast<void*>(temp2),
                    &beta_, outputDesc.get(), outputPtr.get()
                )
            );
        }
    }

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */

#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP */
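For intuition, across-channels LRN divides each activation by a power of a windowed sum of squares; per the descriptor comment above, cuDNN already treats alpha as scaled by the window width. A rough CPU reference for a single spatial location's channel vector (an illustrative sketch only, not the cuDNN kernel):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// naive across-channels LRN for one spatial location; `n` is the window size
std::vector<float> lrn_across_channels(const std::vector<float>& x, std::size_t n,
                                       double alpha, double beta, double k) {
    std::vector<float> y(x.size());
    for (std::size_t c = 0; c < x.size(); c++) {
        std::size_t lo = (c >= n / 2) ? c - n / 2 : 0;
        std::size_t hi = std::min(x.size() - 1, c + n / 2);
        double sum_sq = 0.0;
        for (std::size_t i = lo; i <= hi; i++) sum_sq += x[i] * x[i];
        y[c] = static_cast<float>(x[c] / std::pow(k + (alpha / n) * sum_sq, beta));
    }
    return y;
}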
236
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cudnn/pooling.hpp
vendored
Normal file
@ -0,0 +1,236 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_POOLING_HPP
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_POOLING_HPP

#include "cudnn.hpp"

#include "../pointer.hpp"

#include <opencv2/core.hpp>

#include <cudnn.h>

#include <cstddef>
#include <array>
#include <algorithm>
#include <vector>
#include <type_traits>
#include <iterator>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {

    class PoolingDescriptor {
    public:
        enum class PoolingType {
            MAX,
            MAX_DETERMINISTIC,
            AVERAGE_EXCLUDE_PADDING,
            AVERAGE_INCLUDE_PADDING
        };

        PoolingDescriptor() noexcept : descriptor{ nullptr } { }
        PoolingDescriptor(const PoolingDescriptor&) = delete;
        PoolingDescriptor(PoolingDescriptor&& other) noexcept
            : descriptor{ other.descriptor } {
            other.descriptor = nullptr;
        }

        /** constructs a pooling descriptor
         *
         * Pre-conditions:
         * - \p window_size, \p padding and \p stride must have the same size
         *
         * The length of the containers is interpreted as the order of the pooling operation.
         *
         * Exception Guarantee: Basic
         */
        template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
        PoolingDescriptor(
            const SequenceContainer& window_size,
            const SequenceContainer& padding,
            const SequenceContainer& stride,
            PoolingType type)
        {
            constructor(window_size, padding, stride, type);
        }

        ~PoolingDescriptor() noexcept {
            if (descriptor != nullptr) {
                /* cudnnDestroyPoolingDescriptor will not fail for a valid descriptor */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyPoolingDescriptor(descriptor));
            }
        }

        PoolingDescriptor& operator=(const PoolingDescriptor&) = delete;
        PoolingDescriptor& operator=(PoolingDescriptor&& other) noexcept {
            descriptor = other.descriptor;
            other.descriptor = nullptr;
            return *this;
        };

        cudnnPoolingDescriptor_t get() const noexcept { return descriptor; }

    private:
        template <class SequenceContainer>
        void constructor(
            const SequenceContainer& window_size,
            const SequenceContainer& padding,
            const SequenceContainer& stride,
            PoolingType type)
        {
            CV_Assert(window_size.size() == padding.size());
            CV_Assert(window_size.size() == stride.size());

            auto get_pooling_type = [] (PoolingType type) {
                switch (type) {
                case PoolingType::MAX:
                    return CUDNN_POOLING_MAX;
                case PoolingType::MAX_DETERMINISTIC:
                    return CUDNN_POOLING_MAX_DETERMINISTIC;
                case PoolingType::AVERAGE_EXCLUDE_PADDING:
                    return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
                case PoolingType::AVERAGE_INCLUDE_PADDING:
                    return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
                }
                CV_Error(Error::StsBadArg, "unknown pooling type");
            };

            CUDA4DNN_CHECK_CUDNN(cudnnCreatePoolingDescriptor(&descriptor));
            try {
                const auto rank = window_size.size();
                if (rank == 2) {
                    CUDA4DNN_CHECK_CUDNN(
                        cudnnSetPooling2dDescriptor(
                            descriptor,
                            get_pooling_type(type), CUDNN_PROPAGATE_NAN,
                            window_size[0], window_size[1],
                            padding[0], padding[1],
                            stride[0], stride[1]
                        )
                    );
                } else {
                    std::vector<int> iwindow_size(std::begin(window_size), std::end(window_size));
                    std::vector<int> ipadding(std::begin(padding), std::end(padding));
                    std::vector<int> istride(std::begin(stride), std::end(stride));
                    CUDA4DNN_CHECK_CUDNN(
                        cudnnSetPoolingNdDescriptor(
                            descriptor,
                            get_pooling_type(type), CUDNN_PROPAGATE_NAN,
                            rank, iwindow_size.data(), ipadding.data(), istride.data()
                        )
                    );
                }
            } catch (...) {
                /* cudnnDestroyPoolingDescriptor will not fail for a valid descriptor */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyPoolingDescriptor(descriptor));
                throw;
            }
        }

        cudnnPoolingDescriptor_t descriptor;
    };

    /** gives the shape of the output tensor after pooling
     *
     * @note it's not required to enforce this shape in the output tensor; slightly different shapes will work
     *
     * Exception Guarantee: Basic
     */
    template <class T> inline
    void getPoolingForwardOutputDim(
        const PoolingDescriptor& poolingDesc,
        const TensorDescriptor<T>& inputDesc,
        std::vector<int>& output_dim)
    {
        output_dim.clear();
        output_dim.resize(CUDNN_DIM_MAX); /* we use `output_dim` to hold temporaries */

        std::vector<int> temp(CUDNN_DIM_MAX);
        cudnnDataType_t tempDataType;
        CUDA4DNN_CHECK_CUDNN(
            cudnnGetTensorNdDescriptor(
                inputDesc.get(),
                CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */
                &tempDataType,
                output_dim.data(),
                temp.data(),
                temp.data()
            )
        );

        const auto rank = output_dim[0];
        output_dim.resize(rank);
        CUDA4DNN_CHECK_CUDNN(
            cudnnGetPoolingNdForwardOutputDim(poolingDesc.get(), inputDesc.get(), rank, output_dim.data())
        );
    }

    /** @brief performs pooling operation
     *
     * dstValue = alpha * result + beta * priorDstValue
     *
     * @tparam T pooling element type (must be `half` or `float`)
     *
     * @param handle valid cuDNN Handle
     * @param poolingDesc pooling description
     * @param inputDesc tensor descriptor describing the input
     * @param[in] inputPtr pointer to input tensor in device memory
     * @param alpha result scale factor
     * @param beta previous value scale factor
     * @param outputDesc tensor descriptor describing the output
     * @param[out] outputPtr pointer to output tensor in device memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void pool(
        const Handle& handle,
        const PoolingDescriptor& poolingDesc,
        const TensorDescriptor<T>& inputDesc,
        const DevicePtr<const T> inputPtr,
        T alpha, T beta,
        const TensorDescriptor<T>& outputDesc,
        DevicePtr<T> outputPtr)
    {
        CV_Assert(handle);

        CUDA4DNN_CHECK_CUDNN(
            cudnnPoolingForward(
                handle.get(),
                poolingDesc.get(),
                &alpha, inputDesc.get(), inputPtr.get(),
                &beta, outputDesc.get(), outputPtr.get()
            )
        );
    }

    template <> inline
    void pool(
        const Handle& handle,
        const PoolingDescriptor& poolingDesc,
        const TensorDescriptor<half>& inputDesc,
        const DevicePtr<const half> inputPtr,
        half alpha, half beta,
        const TensorDescriptor<half>& outputDesc,
        DevicePtr<half> outputPtr)
    {
        CV_Assert(handle);

        /* we specialize for fp16 as the scaling factors must be provided as `float` */
        float alpha_ = alpha, beta_ = beta;
        CUDA4DNN_CHECK_CUDNN(
            cudnnPoolingForward(
                handle.get(),
                poolingDesc.get(),
                &alpha_, inputDesc.get(), inputPtr.get(),
                &beta_, outputDesc.get(), outputPtr.get()
            )
        );
    }

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */

#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_POOLING_HPP */
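getPoolingForwardOutputDim queries cuDNN for the output shape; for the common floor-mode case it matches the usual formula. A tiny host-side sketch of that formula (assumed for illustration only, values are an example):

#include <iostream>

// floor-mode pooling output size along one spatial axis
int pooled_size(int in, int window, int pad, int stride) {
    return (in + 2 * pad - window) / stride + 1;   // integer division acts as floor for non-negative values
}

int main() {
    std::cout << pooled_size(224, 3, 1, 2) << '\n';   // prints 112
}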
68
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cudnn/softmax.hpp
vendored
Normal file
@ -0,0 +1,68 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_SOFTMAX_HPP
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_SOFTMAX_HPP

#include "cudnn.hpp"

#include "../pointer.hpp"

#include <cudnn.h>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {

    /** @brief computes softmax (or log softmax)
     *
     * @tparam T element type (must be `half` or `float`)
     *
     * @param handle valid cuDNN handle
     * @param outputDesc tensor descriptor for the output
     * @param[out] output pointer to tensor in device memory
     * @param inputDesc tensor descriptor for the input
     * @param[in] input pointer to tensor in device memory
     * @param log apply log on probabilities
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void softmax(const cudnn::Handle& handle,
        const TensorDescriptor<T>& outputDesc, DevicePtr<T> output,
        const TensorDescriptor<T>& inputDesc, DevicePtr<const T> input,
        bool log)
    {
        T alpha = 1.0, beta = 0.0;
        cudnnSoftmaxAlgorithm_t algo = log ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE;
        CUDA4DNN_CHECK_CUDNN(
            cudnnSoftmaxForward(
                handle.get(),
                algo, CUDNN_SOFTMAX_MODE_CHANNEL,
                &alpha, inputDesc.get(), input.get(),
                &beta, outputDesc.get(), output.get()
            )
        );
    }

    template <> inline
    void softmax(const cudnn::Handle& handle,
        const TensorDescriptor<half>& outputDesc, DevicePtr<half> output,
        const TensorDescriptor<half>& inputDesc, DevicePtr<const half> input,
        bool log)
    {
        /* we specialize for fp16 as the scaling factors must be provided as `float` */
        float alpha = 1.0, beta = 0.0;
        cudnnSoftmaxAlgorithm_t algo = log ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE;
        CUDA4DNN_CHECK_CUDNN(
            cudnnSoftmaxForward(
                handle.get(),
                algo, CUDNN_SOFTMAX_MODE_CHANNEL,
                &alpha, inputDesc.get(), input.get(),
                &beta, outputDesc.get(), output.get()
            )
        );
    }

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */

#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_SOFTMAX_HPP */
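The wrapper defers the channel-wise (log) softmax to cudnnSoftmaxForward; a numerically stable CPU reference over a single channel vector looks roughly like this (illustrative sketch only):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> softmax_ref(const std::vector<float>& x, bool log_softmax = false) {
    const float m = *std::max_element(x.begin(), x.end());   // subtract the max for stability
    float sum = 0.f;
    for (std::size_t i = 0; i < x.size(); i++) sum += std::exp(x[i] - m);
    std::vector<float> y(x.size());
    for (std::size_t i = 0; i < x.size(); i++)
        y[i] = log_softmax ? (x[i] - m - std::log(sum)) : (std::exp(x[i] - m) / sum);
    return y;
}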
142
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cudnn/transform.hpp
vendored
Normal file
@ -0,0 +1,142 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSFORM_HPP
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSFORM_HPP

#include "../pointer.hpp"

#include "cudnn.hpp"

#include <cudnn.h>
#include <vector>
#include <type_traits>
#include <iterator>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {

    /** describes a tensor transform operation
     *
     * Supported transformations:
     * - add or remove asymmetric padding
     */
    class TensorTransformDescriptor {
    public:
        TensorTransformDescriptor() noexcept : descriptor{ nullptr } { }
        TensorTransformDescriptor(const TensorTransformDescriptor&) = delete;
        TensorTransformDescriptor(TensorTransformDescriptor&& other) noexcept
            : descriptor{ other.descriptor } {
            other.descriptor = nullptr;
        }

        /** constructs a tensor transform descriptor
         *
         * Pre-conditions:
         * - \p padding_left and \p padding_right must have the same size
         *
         * The length of the containers is interpreted as the rank of the tensors which will be given.
         *
         * @note \p padding_left and \p padding_right may have negative values to remove padding
         *
         * Exception Guarantee: Basic
         */
        template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
        TensorTransformDescriptor(
            const SequenceContainer& padding_left,
            const SequenceContainer& padding_right)
        {
            constructor(padding_left, padding_right);
        }

        ~TensorTransformDescriptor() noexcept {
            if (descriptor != nullptr) {
                /* cudnnDestroyTensorTransformDescriptor will not fail for a valid descriptor */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorTransformDescriptor(descriptor));
            }
        }

        TensorTransformDescriptor& operator=(const TensorTransformDescriptor&) = delete;
        TensorTransformDescriptor& operator=(TensorTransformDescriptor&& other) noexcept {
            descriptor = other.descriptor;
            other.descriptor = nullptr;
            return *this;
        };

        cudnnTensorTransformDescriptor_t get() const noexcept { return descriptor; }

    private:
        template <class SequenceContainer>
        void constructor(
            const SequenceContainer& padding_left,
            const SequenceContainer& padding_right
        )
        {
            CV_Assert(padding_left.size() == padding_right.size());

            auto ipadding_left = std::vector<int32_t>(std::begin(padding_left), std::end(padding_left));
            auto ipadding_right = std::vector<int32_t>(std::begin(padding_right), std::end(padding_right));
            CUDA4DNN_CHECK_CUDNN(cudnnCreateTensorTransformDescriptor(&descriptor));
            try {
                CUDA4DNN_CHECK_CUDNN(
                    cudnnSetTensorTransformDescriptor(
                        descriptor,
                        ipadding_left.size(), CUDNN_TENSOR_NCHW,
                        ipadding_left.data(), ipadding_right.data(),
                        NULL, CUDNN_TRANSFORM_FOLD
                    )
                );
            } catch (...) {
                /* cudnnDestroyTensorTransformDescriptor will not fail for a valid descriptor */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorTransformDescriptor(descriptor));
                throw;
            }
        }

        cudnnTensorTransformDescriptor_t descriptor;
    };

    template <class T>
    void transform(
        const Handle& handle,
        const TensorTransformDescriptor& transDesc,
        const TensorDescriptor<T>& inputDesc,
        DevicePtr<const T> inputPtr,
        const TensorDescriptor<T>& outputDesc,
        DevicePtr<T> outputPtr)
    {
        T alpha = 1.0, beta = 0.0;
        CUDA4DNN_CHECK_CUDNN(
            cudnnTransformTensorEx(
                handle.get(),
                transDesc.get(),
                &alpha, inputDesc.get(), inputPtr.get(),
                &beta, outputDesc.get(), outputPtr.get()
            )
        );
    }

    template <> inline
    void transform(
        const Handle& handle,
        const TensorTransformDescriptor& transDesc,
        const TensorDescriptor<half>& inputDesc,
        DevicePtr<const half> inputPtr,
        const TensorDescriptor<half>& outputDesc,
        DevicePtr<half> outputPtr)
    {
        /* we specialize for fp16 as the scaling factors must be provided as `float` */
        float alpha = 1.0, beta = 0.0;
        CUDA4DNN_CHECK_CUDNN(
            cudnnTransformTensorEx(
                handle.get(),
                transDesc.get(),
                &alpha, inputDesc.get(), inputPtr.get(),
                &beta, outputDesc.get(), outputPtr.get()
            )
        );
    }

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */

#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSFORM_HPP */
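Since the transform descriptor above only adds or removes asymmetric padding, the expected output extent per axis is just the input extent plus the two (possibly negative) pads. A one-line host-side check, kept as an assumption for illustration:

// expected output extent along one axis after the padding transform; negative pads crop
inline int transformed_size(int in, int pad_left, int pad_right) {
    return in + pad_left + pad_right;
}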
183
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/cudnn/transpose_convolution.hpp
vendored
Normal file
@ -0,0 +1,183 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP

#include "cudnn.hpp"
#include "convolution.hpp"

#include "../pointer.hpp"
#include "../workspace.hpp"

#include <cudnn.h>

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {

    /** wrapper around a transpose convolution algorithm
     *
     * @tparam T type of elements being transpose-convolved
     */
    template <class T>
    class TransposeConvolutionAlgorithm {
    public:
        TransposeConvolutionAlgorithm() noexcept : workspace_size{ 0 } { }
        TransposeConvolutionAlgorithm(TransposeConvolutionAlgorithm&) = default;
        TransposeConvolutionAlgorithm(TransposeConvolutionAlgorithm&&) = default;

        TransposeConvolutionAlgorithm(
            const Handle& handle,
            const ConvolutionDescriptor<T>& convDesc,
            const FilterDescriptor<T>& filterDesc,
            const TensorDescriptor<T>& inputDesc,
            const TensorDescriptor<T>& outputDesc)
        {
#if CUDNN_MAJOR >= 8
            int requestedAlgoCount = 0, returnedAlgoCount = 0;
            CUDA4DNN_CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(handle.get(), &requestedAlgoCount));
            std::vector<cudnnConvolutionBwdDataAlgoPerf_t> results(requestedAlgoCount);
            CUDA4DNN_CHECK_CUDNN(
                cudnnGetConvolutionBackwardDataAlgorithm_v7(
                    handle.get(),
                    filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
                    requestedAlgoCount,
                    &returnedAlgoCount,
                    &results[0]
                )
            );

            size_t free_memory, total_memory;
            CUDA4DNN_CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));

            bool found_conv_algorithm = false;
            for (int i = 0; i < returnedAlgoCount; i++)
            {
                if (results[i].status == CUDNN_STATUS_SUCCESS &&
                    results[i].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED &&
                    results[i].memory < free_memory)
                {
                    found_conv_algorithm = true;
                    dalgo = results[i].algo;
                    workspace_size = results[i].memory;
                    break;
                }
            }

            if (!found_conv_algorithm)
                CV_Error(cv::Error::GpuApiCallError, "cuDNN did not return a suitable algorithm for transpose convolution.");
#else
            CUDA4DNN_CHECK_CUDNN(
                cudnnGetConvolutionBackwardDataAlgorithm(
                    handle.get(),
                    filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
                    CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
                    0, /* no memory limit */
                    &dalgo
                )
            );

            CUDA4DNN_CHECK_CUDNN(
                cudnnGetConvolutionBackwardDataWorkspaceSize(
                    handle.get(),
                    filterDesc.get(), inputDesc.get(), convDesc.get(), outputDesc.get(),
                    dalgo, &workspace_size
                )
            );
#endif
        }

        TransposeConvolutionAlgorithm& operator=(const TransposeConvolutionAlgorithm&) = default;
        TransposeConvolutionAlgorithm& operator=(TransposeConvolutionAlgorithm&& other) = default;

        cudnnConvolutionBwdDataAlgo_t get() const noexcept { return dalgo; }

        std::size_t get_workspace_size() const noexcept { return workspace_size; }

    private:
        cudnnConvolutionBwdDataAlgo_t dalgo;
        std::size_t workspace_size;
    };

    /** @brief performs transpose convolution
     *
     * dstValue = alpha * result + beta * priorDstValue
     *
     * @tparam T transpose convolution element type (must be `half` or `float`)
     *
     * @param handle valid cuDNN Handle
     * @param convDesc convolution description
     * @param transConvAlgo algorithm to use for convolution
     * @param workspace workspace memory which meets the requirements of \p convAlgo
     * @param filterDesc filter descriptor
     * @param[in] filterPtr pointer to device memory containing the filters
     * @param inputDesc tensor descriptor describing the input
     * @param[in] inputPtr pointer to input tensor in device memory
     * @param alpha result scale factor
     * @param beta previous value scale factor
     * @param outputDesc tensor descriptor describing the output
     * @param[out] outputPtr pointer to output tensor in device memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void transpose_convolve(
        const Handle& handle,
        const ConvolutionDescriptor<T>& convDesc,
        const TransposeConvolutionAlgorithm<T>& transConvAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<T>& filterDesc,
        DevicePtr<const T> filterPtr,
        const TensorDescriptor<T>& inputDesc,
        DevicePtr<const T> inputPtr,
        T alpha, T beta,
        const TensorDescriptor<T>& outputDesc,
        DevicePtr<T> outputPtr)
    {
        CUDA4DNN_CHECK_CUDNN(
            cudnnConvolutionBackwardData(
                handle.get(),
                &alpha,
                filterDesc.get(), filterPtr.get(),
                inputDesc.get(), inputPtr.get(),
                convDesc.get(), transConvAlgo.get(),
                static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
                &beta, outputDesc.get(), outputPtr.get()
            )
        );
    }

    template <> inline
    void transpose_convolve(
        const Handle& handle,
        const ConvolutionDescriptor<half>& convDesc,
        const TransposeConvolutionAlgorithm<half>& convAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<half>& filterDesc,
        DevicePtr<const half> filterPtr,
        const TensorDescriptor<half>& inputDesc,
        DevicePtr<const half> inputPtr,
        half alpha, half beta,
        const TensorDescriptor<half>& outputDesc,
        DevicePtr<half> outputPtr)
    {
        /* we specialize for fp16 as the scaling factors must be provided as `float` */
        float alpha_ = alpha, beta_ = beta;
        CUDA4DNN_CHECK_CUDNN(
            cudnnConvolutionBackwardData(
                handle.get(),
                &alpha_,
                filterDesc.get(), filterPtr.get(),
                inputDesc.get(), inputPtr.get(),
                convDesc.get(), convAlgo.get(),
                static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
                &beta_, outputDesc.get(), outputPtr.get()
            )
        );
    }

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */

#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP */
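For reference, the spatial output size of a plain (non-dilated) transposed convolution is the inverse of the convolution formula. A small host-side sketch, with assumed example parameters for illustration only:

// output size of a transposed convolution along one axis (no dilation, no output padding)
inline int deconv_size(int in, int kernel, int pad, int stride) {
    return (in - 1) * stride - 2 * pad + kernel;
}
// e.g. deconv_size(56, 4, 1, 2) == 112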
30
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/error.hpp
vendored
Normal file
@ -0,0 +1,30 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_ERROR_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_ERROR_HPP

#include <opencv2/core.hpp>

#include <cuda_runtime_api.h>

#define CUDA4DNN_CHECK_CUDA(call) \
    ::cv::dnn::cuda4dnn::csl::detail::check((call), CV_Func, __FILE__, __LINE__)

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
    /** @brief exception class for errors thrown by the CUDA APIs */
    class CUDAException : public cv::Exception {
    public:
        using cv::Exception::Exception;
    };

    namespace detail {
        inline void check(cudaError_t err, const char* func, const char* file, int line) {
            if (err != cudaSuccess)
                throw CUDAException(Error::GpuApiCallError, cudaGetErrorString(err), func, file, line);
        }
    }
}}}} /* namespace cv::dnn::cuda4dnn::csl */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_ERROR_HPP */
103
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/event.hpp
vendored
Normal file
@ -0,0 +1,103 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_EVENT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_EVENT_HPP

#include "error.hpp"
#include "stream.hpp"

#include <opencv2/core/utils/logger.hpp>

#include <cuda_runtime_api.h>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {

    /** @brief sharable CUDA event
     *
     * Event is a smart sharable wrapper for CUDA event handle which ensures that
     * the handle is destroyed after use.
     *
     * @note Moving an Event object to another invalidates the former
     */
    class Event {
    public:
        Event() noexcept : event{ nullptr } { }
        Event(const Event&) = delete;
        Event(Event&& other) noexcept
            : event{ other.event } {
            other.event = nullptr;
        }

        /** if \p create is `true`, a new event will be created; otherwise, an empty event object is created */
        Event(bool create, bool timing_event = false) : event{nullptr} {
            if (create) {
                unsigned int flags = (timing_event ? 0 : cudaEventDisableTiming);
                CUDA4DNN_CHECK_CUDA(cudaEventCreateWithFlags(&event, flags));
            }
        }

        ~Event() {
            try {
                if (event != nullptr)
                    CUDA4DNN_CHECK_CUDA(cudaEventDestroy(event));
            } catch (const CUDAException& ex) {
                std::ostringstream os;
                os << "Asynchronous exception caught during CUDA event destruction.\n";
                os << ex.what();
                os << "Exception will be ignored.\n";
                CV_LOG_WARNING(0, os.str().c_str());
            }
        }

        Event& operator=(const Event&) noexcept = delete;
        Event& operator=(Event&& other) noexcept {
            event = other.event;
            other.event = nullptr;
            return *this;
        }

        /** mark a point in \p stream */
        void record(const Stream& stream) {
            CV_Assert(stream);
            CUDA4DNN_CHECK_CUDA(cudaEventRecord(event, stream.get()));
        }

        /** blocks the caller thread until all operations before the event finish */
        void synchronize() const { CUDA4DNN_CHECK_CUDA(cudaEventSynchronize(event)); }

        /** returns true if there are operations pending before the event completes */
        bool busy() const {
            auto status = cudaEventQuery(event);
            if (status == cudaErrorNotReady)
                return true;
            CUDA4DNN_CHECK_CUDA(status);
            return false;
        }

        cudaEvent_t get() const noexcept { return event; }

        /** returns true if the event is valid */
        explicit operator bool() const noexcept { return event; }

    private:
        cudaEvent_t event;
    };

    /** makes a stream wait on an event */
    inline void StreamWaitOnEvent(const Stream& stream, const Event& event) {
        CV_Assert(stream);
        CUDA4DNN_CHECK_CUDA(cudaStreamWaitEvent(stream.get(), event.get(), 0));
    }

    /** returns the time elapsed between two events in milliseconds */
    inline float TimeElapsedBetweenEvents(const Event& start, const Event& end) {
        float temp;
        CUDA4DNN_CHECK_CUDA(cudaEventElapsedTime(&temp, start.get(), end.get()));
        return temp;
    }

}}}} /* namespace cv::dnn::cuda4dnn::csl */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_EVENT_HPP */
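A typical use of this wrapper is timing work submitted to a stream with two timing-capable events. A hedged usage sketch built only from the API above (assumes a valid Stream `stream` and an available CUDA device):

using namespace cv::dnn::cuda4dnn::csl;

float time_region(const Stream& stream) {
    Event start(true, true), stop(true, true);      // create events with timing enabled
    start.record(stream);
    // ... enqueue kernels / memory transfers on `stream` here ...
    stop.record(stream);
    stop.synchronize();                             // wait until the recorded work has finished
    return TimeElapsedBetweenEvents(start, stop);   // elapsed time in milliseconds
}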
303
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/memory.hpp
vendored
Normal file
@ -0,0 +1,303 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP

#include "error.hpp"
#include "pointer.hpp"

#include <opencv2/core.hpp>

#include <cuda_runtime_api.h>

#include <cstddef>
#include <type_traits>
#include <memory>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {

    /* @brief smart device pointer with allocation/deallocation methods
     *
     * ManagedPtr is a smart shared device pointer which also handles memory allocation.
     */
    template <class T>
    class ManagedPtr {
        static_assert(!std::is_const<T>::value && !std::is_volatile<T>::value, "T cannot be cv-qualified");
        static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType");

    public:
        using element_type = T;

        using pointer = DevicePtr<element_type>;
        using const_pointer = DevicePtr<typename std::add_const<element_type>::type>;

        using size_type = std::size_t;

        ManagedPtr() noexcept : wrapped{ nullptr }, n{ 0 }, capacity{ 0 } { }
        ManagedPtr(const ManagedPtr&) noexcept = default;
        ManagedPtr(ManagedPtr&& other) noexcept
            : wrapped{ std::move(other.wrapped) }, n{ other.n }, capacity { other.capacity }
        {
            other.reset();
        }

        /** allocates device memory for \p count number of elements */
        ManagedPtr(size_type count) {
            if (count <= 0) {
                CV_Error(Error::StsBadArg, "number of elements is zero or negative");
            }

            void* temp = nullptr;
            CUDA4DNN_CHECK_CUDA(cudaMalloc(&temp, count * sizeof(element_type)));

            auto ptr = typename pointer::pointer(static_cast<element_type*>(temp));
            wrapped.reset(ptr, [](element_type* ptr) {
                if (ptr != nullptr) {
                    /* contract violation for std::shared_ptr if cudaFree throws */
                    try {
                        CUDA4DNN_CHECK_CUDA(cudaFree(ptr));
                    } catch (const CUDAException& ex) {
                        std::ostringstream os;
                        os << "Device memory deallocation failed in deleter.\n";
                        os << ex.what();
                        os << "Exception will be ignored.\n";
                        CV_LOG_WARNING(0, os.str().c_str());
                    }
                }
            });
            /* std::shared_ptr<T>::reset invokes the deleter if an exception occurs; hence, we don't
             * need to have a try-catch block to free the allocated device memory
             */

            n = capacity = count;
        }

        ManagedPtr& operator=(ManagedPtr&& other) noexcept {
            wrapped = std::move(other.wrapped);
            n = other.n;
            capacity = other.capacity;

            other.reset();
            return *this;
        }

        size_type size() const noexcept { return n; }

        void reset() noexcept { wrapped.reset(); n = capacity = 0; }

        /**
         * deallocates any previously allocated memory and allocates device memory
         * for \p count number of elements
         *
         * @note no reallocation if the previously allocated memory has no owners and the requested memory size fits in it
         * @note use move constructor to guarantee a deallocation of the previously allocated memory
         *
         * Exception Guarantee: Strong
         */
        void reset(size_type count) {
            /* we need to fully own the memory to perform optimizations */
            if (wrapped.use_count() == 1) {
                /* avoid reallocation if the existing capacity is sufficient */
                if (count <= capacity) {
                    n = count;
                    return;
                }
            }

            /* no optimization performed; allocate memory */
            ManagedPtr tmp(count);
            swap(tmp, *this);
        }

        pointer get() const noexcept { return pointer(wrapped.get()); }

        explicit operator bool() const noexcept { return wrapped; }

        friend bool operator==(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped == rhs.wrapped; }
        friend bool operator!=(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped != rhs.wrapped; }

        friend void swap(ManagedPtr& lhs, ManagedPtr& rhs) noexcept {
            using std::swap;
            swap(lhs.wrapped, rhs.wrapped);
            swap(lhs.n, rhs.n);
            swap(lhs.capacity, rhs.capacity);
        }

    private:
        std::shared_ptr<element_type> wrapped;
        size_type n, capacity;
    };

    /** copies entire memory block pointed by \p src to \p dest
     *
     * \param[in] src device pointer
     * \param[out] dest host pointer
     *
     * Pre-conditions:
     * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(T *dest, const ManagedPtr<T>& src) {
        memcpy<T>(dest, src.get(), src.size());
    }

    /** copies data from memory pointed by \p src to fully fill \p dest
     *
     * \param[in] src host pointer
     * \param[out] dest device pointer
     *
     * Pre-conditions:
     * - memory pointed by \p src must be at least as big as the memory block held by \p dest
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(const ManagedPtr<T>& dest, const T* src) {
        memcpy<T>(dest.get(), src, dest.size());
    }

    /** copies data from memory pointed by \p src to \p dest
     *
     * if the two \p src and \p dest have different sizes, the number of elements copied is
     * equal to the size of the smaller memory block
     *
     * \param[in] src device pointer
     * \param[out] dest device pointer
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(const ManagedPtr<T>& dest, const ManagedPtr<T>& src) {
        memcpy<T>(dest.get(), src.get(), std::min(dest.size(), src.size()));
    }

    /** sets device memory block to a specific 8-bit value
     *
     * \param[in] src device pointer
     * \param[out] ch 8-bit value to fill the device memory with
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memset(const ManagedPtr<T>& dest, std::int8_t ch) {
        memset<T>(dest.get(), ch, dest.size());
    }

    /** copies entire memory block pointed by \p src to \p dest asynchronously
     *
     * \param[in] src device pointer
     * \param[out] dest host pointer
     * \param stream CUDA stream that has to be used for the memory transfer
     *
     * Pre-conditions:
     * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src
     * - \p dest points to page-locked memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(T *dest, const ManagedPtr<T>& src, const Stream& stream) {
        CV_Assert(stream);
        memcpy<T>(dest, src.get(), src.size(), stream);
    }

    /** copies data from memory pointed by \p src to \p dest asynchronously
     *
     * \param[in] src host pointer
     * \param[out] dest device pointer
     * \param stream CUDA stream that has to be used for the memory transfer
     *
     * Pre-conditions:
     * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src
     * - \p src points to page-locked memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(const ManagedPtr<T>& dest, const T* src, const Stream& stream) {
        CV_Assert(stream);
        memcpy<T>(dest.get(), src, dest.size(), stream);
    }

    /** copies data from memory pointed by \p src to \p dest asynchronously
     *
     * \param[in] src device pointer
     * \param[out] dest device pointer
     * \param stream CUDA stream that has to be used for the memory transfer
     *
     * if the two \p src and \p dest have different sizes, the number of elements copied is
     * equal to the size of the smaller memory block
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memcpy(ManagedPtr<T>& dest, const ManagedPtr<T>& src, const Stream& stream) {
        CV_Assert(stream);
        memcpy<T>(dest.get(), src.get(), std::min(dest.size(), src.size()), stream);
    }

    /** sets device memory block to a specific 8-bit value asynchronously
     *
     * \param[in] src device pointer
     * \param[out] ch 8-bit value to fill the device memory with
     * \param stream CUDA stream that has to be used for the memory operation
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void memset(const ManagedPtr<T>& dest, int ch, const Stream& stream) {
        CV_Assert(stream);
        memset<T>(dest.get(), ch, dest.size(), stream);
    }

    /** @brief registers host memory as page-locked and unregisters on destruction */
    class MemoryLockGuard {
    public:
        MemoryLockGuard() noexcept : ptr { nullptr } { }
        MemoryLockGuard(const MemoryLockGuard&) = delete;
        MemoryLockGuard(MemoryLockGuard&& other) noexcept : ptr{ other.ptr } {
            other.ptr = nullptr;
        }

        /** page-locks \p size_in_bytes bytes of memory starting from \p ptr_
         *
         * Pre-conditions:
         * - host memory should be unregistered
         */
        MemoryLockGuard(void* ptr_, std::size_t size_in_bytes) {
            CUDA4DNN_CHECK_CUDA(cudaHostRegister(ptr_, size_in_bytes, cudaHostRegisterPortable));
            ptr = ptr_;
        }

        MemoryLockGuard& operator=(const MemoryLockGuard&) = delete;
        MemoryLockGuard& operator=(MemoryLockGuard&& other) noexcept {
            if (&other != this) {
                if(ptr != nullptr) {
                    /* cudaHostUnregister does not throw for a valid ptr */
                    CUDA4DNN_CHECK_CUDA(cudaHostUnregister(ptr));
                }
                ptr = other.ptr;
                other.ptr = nullptr;
            }
            return *this;
        }

        ~MemoryLockGuard() {
            if(ptr != nullptr) {
                /* cudaHostUnregister does not throw for a valid ptr */
                CUDA4DNN_CHECK_CUDA(cudaHostUnregister(ptr));
            }
        }

    private:
        void *ptr;
    };

}}}} /* namespace cv::dnn::cuda4dnn::csl */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP */
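Putting the pieces together, a round trip through device memory with the helpers above might look like this (a sketch only; assumes a CUDA device is present and uses the synchronous memcpy overloads defined in this header):

#include <vector>

using namespace cv::dnn::cuda4dnn::csl;

void roundtrip() {
    std::vector<float> host(1024, 1.f), back(1024);
    ManagedPtr<float> device(host.size());   // allocates 1024 floats on the device
    memcpy<float>(device, host.data());      // host -> device
    memcpy<float>(back.data(), device);      // device -> host
}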
20
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/nvcc_defs.hpp
vendored
Normal file
@ -0,0 +1,20 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_NVCC_DEFS_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_NVCC_DEFS_HPP

#include <cuda_runtime_api.h>

#ifdef __CUDACC__
#   define CUDA4DNN_HOST __host__
#   define CUDA4DNN_DEVICE __device__
#   define CUDA4DNN_HOST_DEVICE CUDA4DNN_HOST CUDA4DNN_DEVICE
#else
#   define CUDA4DNN_HOST
#   define CUDA4DNN_DEVICE
#   define CUDA4DNN_HOST_DEVICE
#endif

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_NVCC_DEFS_HPP */
411
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/pointer.hpp
vendored
Normal file
@ -0,0 +1,411 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP
|
||||
|
||||
#include "nvcc_defs.hpp"
|
||||
#include "error.hpp"
|
||||
#include "stream.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <type_traits>
|
||||
#include <ostream>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
|
||||
|
||||
/** @brief provides a type-safe device pointer
|
||||
*
|
||||
* DevicePtr wraps a raw pointer and mimics its behaviour. It does not implicitly convert
|
||||
* to a raw pointer. This ensures that accidental mixing of host and device pointers do not happen.
|
||||
*
|
||||
* It is meant to point to locations in device memory. Hence, it provides dereferencing or
|
||||
* array subscript capability for device code only.
|
||||
*
|
||||
* A `const DevicePtr<T>` represents an immutable pointer to a mutable memory.
|
||||
* A `DevicePtr<const T>` represents a mutable pointer to an immutable memory.
|
||||
* A `const DevicePtr<const T>` represents an immutable pointer to an immutable memory.
|
||||
*
|
||||
* A `DevicePtr<T>` can implicitly convert to `DevicePtr<const T>`.
|
||||
*
|
||||
* Specializations:
|
||||
* - DevicePtr<void>/DevicePtr<const void> do not support pointer arithmetic (but relational operators are provided)
|
||||
* - any device pointer pointing to mutable memory is implicitly convertible to DevicePtr<void>
|
||||
* - any device pointer is implicitly convertible to DevicePtr<const void>
|
||||
* - DevicePtr<void> can be explicitly converted to any device pointer
|
||||
* - DevicePtr<const void> can be explicitly converted to any device pointer pointing to immutable memory
|
||||
*/
|
||||
template <class T>
|
||||
class DevicePtr {
|
||||
static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType");
|
||||
|
||||
public:
|
||||
using element_type = T;
|
||||
using difference_type = std::ptrdiff_t;
|
||||
using pointer = typename std::add_pointer<element_type>::type;
|
||||
using reference = typename std::add_lvalue_reference<element_type>::type;
|
||||
|
||||
DevicePtr() = default;
|
||||
CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { }
|
||||
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; }
|
||||
|
||||
CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; };
|
||||
|
||||
CUDA4DNN_DEVICE reference operator[](difference_type idx) const noexcept { return get()[idx]; }
|
||||
CUDA4DNN_DEVICE reference operator*() const noexcept { return *get(); }
|
||||
CUDA4DNN_DEVICE pointer operator->() const noexcept { return get(); }
|
||||
|
||||
template<class U = T, typename std::enable_if<!std::is_const<U>::value, bool>::type = true>
|
||||
CUDA4DNN_HOST_DEVICE operator DevicePtr<typename std::add_const<U>::type>() const noexcept {
|
||||
return DevicePtr<typename std::add_const<U>::type>{ptr};
|
||||
}
|
||||
|
||||
CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; }
|
||||
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr operator++() noexcept {
|
||||
++ptr;
|
||||
return *this;
|
||||
}
|
||||
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr operator++(int) noexcept {
|
||||
auto tmp = DevicePtr(*this);
|
||||
ptr++;
|
||||
return tmp;
|
||||
}
|
||||
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr operator--() noexcept {
|
||||
--ptr;
|
||||
return *this;
|
||||
}
|
||||
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr operator--(int) noexcept {
|
||||
auto tmp = DevicePtr(*this);
|
||||
ptr--;
|
||||
return tmp;
|
||||
}
|
||||
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr operator+=(std::ptrdiff_t offset) noexcept {
|
||||
ptr += offset;
|
||||
return *this;
|
||||
}
|
||||
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr operator-=(std::ptrdiff_t offset) noexcept {
|
||||
ptr -= offset;
|
||||
return *this;
|
||||
}
|
||||
|
||||
CUDA4DNN_HOST_DEVICE friend DevicePtr operator+(DevicePtr lhs, std::ptrdiff_t offset) noexcept {
|
||||
return lhs += offset;
|
||||
}
|
||||
|
||||
CUDA4DNN_HOST_DEVICE friend DevicePtr operator-(DevicePtr lhs, std::ptrdiff_t offset) noexcept {
|
||||
return lhs -= offset;
|
||||
}
|
||||
|
||||
CUDA4DNN_HOST_DEVICE friend difference_type operator-(DevicePtr lhs, DevicePtr rhs) noexcept {
|
||||
return lhs.ptr - rhs.ptr;
|
||||
}
|
||||
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; }
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); }
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; }
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; }
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); }
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); }
|
||||
|
||||
CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; }
|
||||
|
||||
CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept {
|
||||
using std::swap;
|
||||
swap(lhs.ptr, rhs.ptr);
|
||||
}
|
||||
|
||||
template <class U, class V>
|
||||
CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) {
|
||||
os << other.get() << " (device)";
|
||||
return os;
|
||||
}
|
||||
|
||||
private:
|
||||
pointer ptr;
|
||||
};
|
||||
|
||||
template <>
|
||||
class DevicePtr<const void> {
|
||||
public:
|
||||
using element_type = const void;
|
||||
using pointer = typename std::add_pointer<element_type>::type;
|
||||
|
||||
DevicePtr() = default;
|
||||
|
||||
/* host const void pointer to const void device pointer */
|
||||
CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { }
|
||||
|
||||
/* allow any device pointer to be implicitly convereted to void device pointer */
|
||||
template <class T>
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr(DevicePtr<T> ptr_) noexcept : ptr{ ptr_.get() } { }
|
||||
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; }
|
||||
|
||||
CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; };
|
||||
|
||||
CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; }
|
||||
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; }
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); }
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; }
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; }
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); }
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); }
|
||||
|
||||
/* explicit conversion into host void pointer */
|
||||
CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; }
|
||||
|
||||
/* const void device pointer can be explicitly casted into any const device pointer type */
|
||||
template <class T, typename std::enable_if<std::is_const<T>::value, bool>::type = true>
|
||||
CUDA4DNN_HOST_DEVICE explicit operator DevicePtr<T>() const noexcept {
|
||||
return static_cast<T*>(ptr);
|
||||
}
|
||||
|
||||
CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept {
|
||||
using std::swap;
|
||||
swap(lhs.ptr, rhs.ptr);
|
||||
}
|
||||
|
||||
template <class U, class V>
|
||||
CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) {
|
||||
os << other.get() << " (device)";
|
||||
return os;
|
||||
}
|
||||
|
||||
private:
|
||||
pointer ptr;
|
||||
};
|
||||
|
||||
template <>
|
||||
class DevicePtr<void> {
|
||||
public:
|
||||
using element_type = void;
|
||||
using pointer = typename std::add_pointer<element_type>::type;
|
||||
|
||||
DevicePtr() = default;
|
||||
|
||||
/* host pointer to device pointer */
|
||||
CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { }
|
||||
|
||||
/* allow any device pointer to mutable memory to be implicitly convereted to void device pointer */
|
||||
template <class T, typename std::enable_if<!std::is_const<T>::value, bool>::type = false>
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr(DevicePtr<T> ptr_) noexcept : ptr { ptr_.get() } { }
|
||||
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; }
|
||||
|
||||
CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; };
|
||||
|
||||
CUDA4DNN_HOST_DEVICE operator DevicePtr<const void>() const noexcept { return DevicePtr<const void>{ptr}; }
|
||||
|
||||
CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; }
|
||||
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; }
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); }
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; }
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; }
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); }
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); }
|
||||
|
||||
/* explicit conversion into host void pointer */
|
||||
CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; }
|
||||
|
||||
/* void device pointer can be explicitly cast into any device pointer type */
|
||||
template <class T>
|
||||
CUDA4DNN_HOST_DEVICE explicit operator DevicePtr<T>() const noexcept {
|
||||
return DevicePtr<T>(static_cast<T*>(ptr));
|
||||
}
|
||||
|
||||
CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept {
|
||||
using std::swap;
|
||||
swap(lhs.ptr, rhs.ptr);
|
||||
}
|
||||
|
||||
template <class U, class V>
|
||||
CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) {
|
||||
os << other.get() << " (device)";
|
||||
return os;
|
||||
}
|
||||
|
||||
private:
|
||||
pointer ptr;
|
||||
};
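/* Editorial usage sketch (not part of the upstream header): given a typed device
 * pointer `buf` (a DevicePtr<float>, assumed to come from an allocator elsewhere),
 * the conversions defined above compose as follows.
 *
 *     DevicePtr<void> erased = buf;                        // implicit: mutable T* -> void*
 *     DevicePtr<const void> cerased = erased;              // implicit: void* -> const void*
 *     auto typed = static_cast<DevicePtr<float>>(erased);  // explicit cast back to DevicePtr<float>
 */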
|
||||
|
||||
template <class T>
|
||||
bool is_aligned(DevicePtr<const T> ptr, std::size_t alignment) {
|
||||
auto addr = reinterpret_cast<std::intptr_t>(ptr.get());
|
||||
return addr % alignment == 0;
|
||||
}
|
||||
|
||||
/** copies \p n elements from \p src to \p dest
|
||||
*
|
||||
* \param[in] src device pointer
|
||||
* \param[out] dest host pointer
|
||||
*
|
||||
* Pre-conditions:
|
||||
* - memory pointed by \p dest and \p src must be large enough to hold \p n elements
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class T>
|
||||
void memcpy(T *dest, DevicePtr<const T> src, std::size_t n) {
|
||||
if (n <= 0) {
|
||||
CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
|
||||
}
|
||||
|
||||
CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest, src.get(), n * sizeof(T), cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
/** copies \p n elements from \p src to \p dest
|
||||
*
|
||||
* \param[in] src host pointer
|
||||
* \param[out] dest device pointer
|
||||
*
|
||||
* Pre-conditions:
|
||||
* - memory pointed by \p dest and \p src must be large enough to hold \p n elements
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class T>
|
||||
void memcpy(DevicePtr<T> dest, const T* src, std::size_t n) {
|
||||
if (n <= 0) {
|
||||
CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
|
||||
}
|
||||
|
||||
CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest.get(), src, n * sizeof(T), cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
/** copies \p n elements from \p src to \p dest
|
||||
*
|
||||
* \param[in] src device pointer
|
||||
* \param[out] dest device pointer
|
||||
*
|
||||
* Pre-conditions:
|
||||
* - memory pointed by \p dest and \p src must be large enough to hold \p n elements
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class T>
|
||||
void memcpy(DevicePtr<T> dest, DevicePtr<const T> src, std::size_t n) {
|
||||
if (n <= 0) {
|
||||
CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
|
||||
}
|
||||
|
||||
CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest.get(), src.get(), n * sizeof(T), cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
/** sets \p n elements to \p ch in \p dest
|
||||
*
|
||||
* \param[out] dest device pointer
|
||||
* \param[in] ch 8-bit value to fill the device memory with
|
||||
*
|
||||
* Pre-conditions:
|
||||
* - memory pointed by \p dest must be large enough to hold \p n elements
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class T>
|
||||
void memset(DevicePtr<T> dest, std::int8_t ch, std::size_t n) {
|
||||
if (n <= 0) {
|
||||
CV_Error(Error::StsBadArg, "number of elements to set is zero or negative");
|
||||
}
|
||||
|
||||
CUDA4DNN_CHECK_CUDA(cudaMemset(dest.get(), ch, n * sizeof(T)));
|
||||
}
|
||||
|
||||
/** copies \p n elements from \p src to \p dest asynchronously
|
||||
*
|
||||
* \param[in] src device pointer
|
||||
* \param[out] dest host pointer
|
||||
* \param stream CUDA stream that has to be used for the memory transfer
|
||||
*
|
||||
* Pre-conditions:
|
||||
* - memory pointed by \p dest and \p src must be large enough to hold \p n elements
|
||||
* - \p dest points to page-locked memory
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class T>
|
||||
void memcpy(T *dest, DevicePtr<const T> src, std::size_t n, const Stream& stream) {
|
||||
if (n <= 0) {
|
||||
CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
|
||||
}
|
||||
|
||||
CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest, src.get(), n * sizeof(T), cudaMemcpyDefault, stream.get()));
|
||||
}
|
||||
|
||||
/** copies data from memory pointed by \p src to \p dest asynchronously
|
||||
*
|
||||
* \param[in] src host pointer
|
||||
* \param[out] dest device pointer
|
||||
* \param stream CUDA stream that has to be used for the memory transfer
|
||||
*
|
||||
* Pre-conditions:
|
||||
* - memory pointed by \p dest and \p src must be large enough to hold \p n elements
|
||||
* - \p src points to page-locked memory
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class T>
|
||||
void memcpy(DevicePtr<T> dest, const T *src, std::size_t n, const Stream& stream) {
|
||||
if (n <= 0) {
|
||||
CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
|
||||
}
|
||||
|
||||
CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest.get(), src, n * sizeof(T), cudaMemcpyDefault, stream.get()));
|
||||
}
|
||||
|
||||
/** copies \p n elements from \p src to \p dest asynchronously
|
||||
*
|
||||
* \param[in] src device pointer
|
||||
* \param[out] dest device pointer
|
||||
* \param stream CUDA stream that has to be used for the memory transfer
|
||||
*
|
||||
* Pre-conditions:
|
||||
* - memory pointed by \p dest and \p src must be large enough to hold \p n elements
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class T>
|
||||
void memcpy(DevicePtr<T> dest, DevicePtr<const T> src, std::size_t n, const Stream& stream) {
|
||||
if (n <= 0) {
|
||||
CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
|
||||
}
|
||||
|
||||
CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest.get(), src.get(), n * sizeof(T), cudaMemcpyDefault, stream.get()));
|
||||
}
|
||||
|
||||
/** sets \p n elements to \p ch in \p dest asynchronously
|
||||
*
|
||||
* \param[out] dest device pointer
|
||||
* \param[in] ch 8-bit value to fill the device memory with
|
||||
* \param stream CUDA stream that has to be used for the memory operation
|
||||
*
|
||||
* Pre-conditions:
|
||||
* - memory pointed by \p dest must be large enough to hold \p n elements
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class T>
|
||||
void memset(DevicePtr<T> dest, std::int8_t ch, std::size_t n, const Stream& stream) {
|
||||
if (n <= 0) {
|
||||
CV_Error(Error::StsBadArg, "number of elements to set is zero or negative");
|
||||
}
|
||||
|
||||
CUDA4DNN_CHECK_CUDA(cudaMemsetAsync(dest.get(), ch, n * sizeof(T), stream.get()));
|
||||
}
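/* Editorial usage sketch (not part of the upstream header): a host -> device -> host
 * round trip with the asynchronous overloads above. The device buffer `device`
 * (DevicePtr<float>) and the page-locked host buffers `pinned_src`/`pinned_dst`
 * (float*) are assumptions; `n` is the common element count.
 *
 *     Stream stream(true);                            // non-default stream
 *     memcpy<float>(device, pinned_src, n, stream);   // host -> device, asynchronous
 *     memcpy<float>(pinned_dst, device, n, stream);   // device -> host, asynchronous
 *     stream.synchronize();                           // wait for both transfers
 */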
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::csl */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP */
|
82
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/span.hpp
vendored
Normal file
@ -0,0 +1,82 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_SPAN_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_SPAN_HPP
|
||||
|
||||
#include "pointer.hpp"
|
||||
#include "nvcc_defs.hpp"
|
||||
|
||||
#include "../../cuda/types.hpp"
|
||||
|
||||
#include <cstddef>
|
||||
#include <type_traits>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
|
||||
|
||||
/** @brief provides non-owning mutable access for device arrays
|
||||
*
|
||||
* const Span<T>/Span<T> provides mutable access to the elements unless T is const qualified
|
||||
* const Span<T> makes the span immutable but not the elements
|
||||
*/
|
||||
template <class T>
|
||||
class Span {
|
||||
static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType");
|
||||
|
||||
public:
|
||||
using value_type = T;
|
||||
using size_type = device::size_type;
|
||||
using index_type = device::index_type;
|
||||
|
||||
using pointer = DevicePtr<value_type>;
|
||||
using const_pointer = DevicePtr<typename std::add_const<value_type>::type>;
|
||||
using reference = typename std::add_lvalue_reference<value_type>::type;
|
||||
using const_reference = typename std::add_lvalue_reference<typename std::add_const<value_type>::type>::type;
|
||||
|
||||
Span() noexcept : ptr{ nullptr }, sz{ 0 } { }
|
||||
CUDA4DNN_HOST_DEVICE Span(pointer first, pointer last) noexcept : ptr{ first }, sz{ last - first } { }
|
||||
CUDA4DNN_HOST_DEVICE Span(pointer first, size_type count) noexcept : ptr{ first }, sz{ count } { }
|
||||
|
||||
CUDA4DNN_HOST_DEVICE size_type size() const noexcept { return sz; }
|
||||
CUDA4DNN_HOST_DEVICE bool empty() const noexcept { return size() == 0; }
|
||||
|
||||
CUDA4DNN_DEVICE reference operator[](index_type index) const { return ptr[index]; }
|
||||
CUDA4DNN_HOST_DEVICE pointer data() const noexcept { return ptr; }
|
||||
|
||||
template<class U = T, class V = typename std::add_const<U>::type,
|
||||
typename std::enable_if<!std::is_const<U>::value, bool>::type = true>
|
||||
CUDA4DNN_HOST_DEVICE operator Span<V>() const noexcept { return Span<V>{ptr, sz}; }
|
||||
|
||||
private:
|
||||
pointer ptr;
|
||||
size_type sz;
|
||||
};
|
||||
|
||||
/** @brief provides non-owning immutable view for device arrays */
|
||||
template <class T>
|
||||
using View = Span<const T>;
|
||||
|
||||
/** returns true if the address of a span/view is aligned to \p alignment number of elements (not bytes) */
|
||||
template <class T>
|
||||
bool is_address_aligned(View<T> v, std::size_t alignment) {
|
||||
return is_aligned(v.data(), alignment * sizeof(T));
|
||||
}
|
||||
|
||||
/** returns true if the size of a span/view is a multiple of \p alignment */
|
||||
template <class T>
|
||||
bool is_size_aligned(View<T> v, std::size_t alignment) {
|
||||
return v.size() % alignment == 0;
|
||||
}
|
||||
|
||||
/** @brief returns true if the address and the size of the span/view is aligned
|
||||
* \p alignment refers to the number of elements (not bytes)
|
||||
*/
|
||||
template <class T>
|
||||
bool is_fully_aligned(View<T> v, std::size_t alignment) {
|
||||
return is_address_aligned(v, alignment) && is_size_aligned(v, alignment);
|
||||
}
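/* Editorial usage sketch (not part of the upstream header): Span/View are thin
 * (pointer, size) pairs over device memory. `ptr` is an assumed DevicePtr<float>
 * referring to at least `n` valid elements.
 *
 *     Span<float> span(ptr, n);                              // mutable, non-owning
 *     View<float> view = span;                               // implicit Span<T> -> Span<const T>
 *     bool ok = !view.empty() && is_fully_aligned(view, 4);  // 4-element alignment check
 */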
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::csl */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_SPAN_HPP */
|
161
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/stream.hpp
vendored
Normal file
@ -0,0 +1,161 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_STREAM_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_STREAM_HPP
|
||||
|
||||
#include "error.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
#include <opencv2/core/utils/logger.hpp>
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <utility>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
|
||||
|
||||
/** \file stream.hpp
|
||||
*
|
||||
* Default streams are not supported as they limit flexibility. All operations are always
|
||||
* carried out in non-default streams in the CUDA backend. The stream classes sacrifice
|
||||
* the ability to support default streams in exchange for better error detection. That is,
|
||||
* a default constructed stream represents no stream and any attempt to use it will throw an
|
||||
* exception.
|
||||
*/
|
||||
|
||||
/** @brief non-copyable smart CUDA stream
|
||||
*
|
||||
* UniqueStream is a smart non-sharable wrapper for CUDA stream handle which ensures that
|
||||
* the handle is destroyed after use. Unless explicitly specified by a constructor argument,
|
||||
* the stream object does not represent any stream by default.
|
||||
*/
|
||||
class UniqueStream {
|
||||
public:
|
||||
UniqueStream() noexcept : stream{ 0 } { }
|
||||
UniqueStream(UniqueStream&) = delete;
|
||||
UniqueStream(UniqueStream&& other) noexcept {
|
||||
stream = other.stream;
|
||||
other.stream = 0;
|
||||
}
|
||||
|
||||
/** creates a non-default stream if `create` is true; otherwise, no stream is created */
|
||||
UniqueStream(bool create) : stream{ 0 } {
|
||||
if (create) {
|
||||
/* we create non-blocking streams to avoid interruptions from users of the default stream */
|
||||
CUDA4DNN_CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||
}
|
||||
}
|
||||
|
||||
~UniqueStream() {
|
||||
try {
|
||||
/* cudaStreamDestroy does not throw if a valid stream is passed unless a previous
|
||||
* asynchronous operation errored.
|
||||
*/
|
||||
if (stream != 0)
|
||||
CUDA4DNN_CHECK_CUDA(cudaStreamDestroy(stream));
|
||||
} catch (const CUDAException& ex) {
|
||||
std::ostringstream os;
|
||||
os << "Asynchronous exception caught during CUDA stream destruction.\n";
|
||||
os << ex.what();
|
||||
os << "Exception will be ignored.\n";
|
||||
CV_LOG_WARNING(0, os.str().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
UniqueStream& operator=(const UniqueStream&) = delete;
|
||||
UniqueStream& operator=(UniqueStream&& other) noexcept {
|
||||
CV_Assert(other);
|
||||
if (&other != this) {
|
||||
UniqueStream(std::move(*this)); /* destroy current stream */
|
||||
stream = other.stream;
|
||||
other.stream = 0;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
/** returns the raw CUDA stream handle */
|
||||
cudaStream_t get() const noexcept {
|
||||
CV_Assert(stream);
|
||||
return stream;
|
||||
}
|
||||
|
||||
/** blocks the calling thread until all pending operations in the stream finish */
|
||||
void synchronize() const {
|
||||
CV_Assert(stream);
|
||||
CUDA4DNN_CHECK_CUDA(cudaStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
/** returns true if there are pending operations in the stream */
|
||||
bool busy() const {
|
||||
CV_Assert(stream);
|
||||
|
||||
auto status = cudaStreamQuery(stream);
|
||||
if (status == cudaErrorNotReady)
|
||||
return true;
|
||||
CUDA4DNN_CHECK_CUDA(status);
|
||||
return false;
|
||||
}
|
||||
|
||||
/** returns true if the stream is valid */
|
||||
explicit operator bool() const noexcept { return static_cast<bool>(stream); }
|
||||
|
||||
private:
|
||||
cudaStream_t stream;
|
||||
};
|
||||
|
||||
/** @brief sharable smart CUDA stream
|
||||
*
|
||||
* Stream is a smart sharable wrapper for CUDA stream handle which ensures that
|
||||
* the handle is destroyed after use. Unless explicitly specified in the constructor,
|
||||
* the stream object represents no stream.
|
||||
*/
|
||||
class Stream {
|
||||
public:
|
||||
Stream() { }
|
||||
Stream(const Stream&) = default;
|
||||
Stream(Stream&&) = default;
|
||||
|
||||
/** if \p create is `true`, a new stream will be created; otherwise, no stream is created */
|
||||
Stream(bool create) {
|
||||
if (create)
|
||||
stream = std::make_shared<UniqueStream>(create);
|
||||
}
|
||||
|
||||
Stream& operator=(const Stream&) = default;
|
||||
Stream& operator=(Stream&&) = default;
|
||||
|
||||
/** blocks the caller thread until all operations in the stream are complete */
|
||||
void synchronize() const {
|
||||
CV_Assert(stream);
|
||||
stream->synchronize();
|
||||
}
|
||||
|
||||
/** returns true if there are operations pending in the stream */
|
||||
bool busy() const {
|
||||
CV_Assert(stream);
|
||||
return stream->busy();
|
||||
}
|
||||
|
||||
/** returns true if the object refers to a valid stream */
|
||||
explicit operator bool() const noexcept {
|
||||
if (!stream)
|
||||
return false;
|
||||
return stream->operator bool();
|
||||
}
|
||||
|
||||
cudaStream_t get() const noexcept {
|
||||
CV_Assert(stream);
|
||||
return stream->get();
|
||||
}
|
||||
|
||||
private:
|
||||
std::shared_ptr<UniqueStream> stream;
|
||||
};
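/* Editorial usage sketch (not part of the upstream header): Stream is a shared
 * handle; copies refer to the same underlying cudaStream_t. A default-constructed
 * Stream holds no stream and must not be passed to functions that require one.
 *
 *     Stream stream(true);        // creates a new non-blocking stream
 *     Stream alias = stream;      // shares ownership of the same stream
 *     if (alias.busy())
 *         alias.synchronize();    // waits on the shared underlying stream
 */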
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::csl */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_STREAM_HPP */
|
1203
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/tensor.hpp
vendored
Normal file
File diff suppressed because it is too large
477
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp
vendored
Normal file
@ -0,0 +1,477 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP
|
||||
|
||||
#include "stream.hpp"
|
||||
#include "tensor.hpp"
|
||||
#include "pointer.hpp"
|
||||
#include "cublas.hpp"
|
||||
#include "cudnn.hpp"
|
||||
#include "workspace.hpp"
|
||||
|
||||
#include "cudnn/convolution.hpp"
|
||||
#include "cudnn/pooling.hpp"
|
||||
#include "cudnn/lrn.hpp"
|
||||
#include "cudnn/softmax.hpp"
|
||||
#include "cudnn/transform.hpp"
|
||||
#include "cudnn/transpose_convolution.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
#include <array>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
|
||||
|
||||
namespace tensor_ops {
|
||||
|
||||
/** @brief copies data between tensors
|
||||
*
|
||||
* Pre-conditions:
|
||||
* - \p dest and \p src must have the same shape
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class T> inline
|
||||
void copy(const Stream& stream, TensorSpan<T> dest, TensorView<T> src) {
|
||||
CV_Assert(is_shape_same(dest, src));
|
||||
if (dest.get() != src.get())
|
||||
memcpy(dest.get(), src.get(), dest.size(), stream);
|
||||
}
|
||||
|
||||
namespace detail {
|
||||
template <class T>
|
||||
void assertGEMMCompatiblity(const TensorSpan<T>& result, bool transa, const TensorView<T>& A, bool transb, const TensorView<T>& B) {
|
||||
/* check dimension requirements for matrix multiplication */
|
||||
if (!transa && !transb) {
|
||||
CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2));
|
||||
CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-2));
|
||||
CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1));
|
||||
} else if (!transa && transb) {
|
||||
CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2));
|
||||
CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-1));
|
||||
CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1));
|
||||
} else if (transa && !transb) {
|
||||
CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2));
|
||||
CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-2));
|
||||
CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1));
|
||||
} else {
|
||||
CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2));
|
||||
CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-1));
|
||||
CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** @brief performs generalized matrix-multiplication
|
||||
*
|
||||
* Pre-conditions:
|
||||
* - \p A and \p B must meet the mathematical requirements for matrix multiplication
|
||||
* - \p result must be large enough to hold the result
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class T> inline
|
||||
void gemm(const cublas::Handle& handle, T beta, TensorSpan<T> result, T alpha, bool transa, TensorView<T> A, bool transb, TensorView<T> B) {
|
||||
/* matrix operations can be performed only on tensors with rank two or below */
|
||||
CV_Assert(get_effective_rank(A) <= 2);
|
||||
CV_Assert(get_effective_rank(B) <= 2);
|
||||
CV_Assert(get_effective_rank(result) <= 2);
|
||||
|
||||
const auto result_nr = result.get_axis_size(-2);
|
||||
const auto result_nc = result.get_axis_size(-1);
|
||||
const auto common_dim = A.get_axis_size(transa ? -2 : -1);
|
||||
const auto A_nc = A.get_axis_size(-1);
|
||||
const auto B_nc = B.get_axis_size(-1);
|
||||
|
||||
detail::assertGEMMCompatiblity(result, transa, A, transb, B);
|
||||
|
||||
/* tensors are stored in row-major but cublas::gemm operates on column-major matrices
|
||||
* a row-major matrix when read as column-major matrix gives the transpose of the intended matrix
|
||||
*
|
||||
* Required: C = AB
|
||||
* what cuBLAS sees: C^T = A^TB^T = (BA)^T
|
||||
*
|
||||
* By reversing operands, we effectively perform:
|
||||
* C^T = B^TA^T = (AB)^T
|
||||
*
|
||||
* which gives C = AB
|
||||
*/
|
||||
cublas::gemm<T>(handle,
|
||||
transb, transa,
|
||||
result_nc, result_nr, common_dim,
|
||||
alpha, B.get(), B_nc,
|
||||
A.get(), A_nc,
|
||||
beta, result.get(), result_nc);
|
||||
}
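/* Editorial worked example (not part of the upstream header): for row-major
 * C(2x3) = A(2x4) * B(4x3) with transa = transb = false, the code above ends up
 * issuing the call below, so cuBLAS computes C^T = B^T * A^T = (A*B)^T and the
 * column-major result, read back as row-major, is exactly C.
 *
 *     // result_nr = 2, result_nc = 3, common_dim = 4, A_nc = 4, B_nc = 3
 *     cublas::gemm<T>(handle, false, false,
 *                     3, 2, 4,                  // dimensions as seen by cuBLAS
 *                     alpha, B.get(), 3,        // leading dimension B_nc
 *                            A.get(), 4,        // leading dimension A_nc
 *                     beta,  result.get(), 3);  // leading dimension result_nc
 */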
|
||||
|
||||
/** @brief performs generalized matrix-multiplication for a strided batch of matrices
|
||||
*
|
||||
* Pre-conditions:
|
||||
* - A, B and C must be rank three tensors with dimensions (batch, rows, cols)
|
||||
* - the last two axes of \p A and \p B must meet the mathematical requirements for matrix multiplication
|
||||
* - \p result must be large enough to hold the result and the matrices must not overlap in memory
|
||||
* - the batch dimension must be the same in \p A, \p B and \p result
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class T> inline
|
||||
void gemmStridedBatched(const cublas::Handle& handle, T beta, TensorSpan<T> result, T alpha, bool transa, TensorView<T> A, bool transb, TensorView<T> B) {
|
||||
CV_Assert(A.rank() == 3);
|
||||
CV_Assert(B.rank() == 3);
|
||||
CV_Assert(result.rank() == 3);
|
||||
|
||||
const auto batch_size = result.get_axis_size(0);
|
||||
CV_Assert(batch_size == A.get_axis_size(0));
|
||||
CV_Assert(batch_size == B.get_axis_size(0));
|
||||
|
||||
detail::assertGEMMCompatiblity(result, transa, A, transb, B);
|
||||
|
||||
const auto result_nr = result.get_axis_size(-2);
|
||||
const auto result_nc = result.get_axis_size(-1);
|
||||
const auto common_dim = A.get_axis_size(transa ? -2 : -1);
|
||||
const auto A_nc = A.get_axis_size(-1);
|
||||
const auto B_nc = B.get_axis_size(-1);
|
||||
|
||||
std::size_t strideA = (A.size() / batch_size),
|
||||
strideB = (B.size() / batch_size),
|
||||
strideC = (result.size() / batch_size);
|
||||
|
||||
cublas::gemmStridedBatched<T>(handle,
|
||||
transb, transa,
|
||||
result_nc, result_nr, common_dim,
|
||||
alpha, B.get(), B_nc, strideB,
|
||||
A.get(), A_nc, strideA,
|
||||
beta, result.get(), result_nc, strideC,
|
||||
batch_size);
|
||||
}
|
||||
|
||||
/** @brief applies softmax (or log softmax when \p log is true) along the given channel axis
|
||||
*
|
||||
* Pre-conditions:
|
||||
* - \p output and \p input must have the same shape
|
||||
*
|
||||
* Exception Guarantee: Basic
|
||||
*/
|
||||
template <class T> inline
|
||||
void softmax(const cudnn::Handle& handle, TensorSpan<T> output, TensorView<T> input, int channel_axis, bool log) {
|
||||
CV_Assert(is_shape_same(output, input));
|
||||
|
||||
channel_axis = clamp_axis(channel_axis, input.rank());
|
||||
|
||||
std::size_t outer_size = input.size_range(0, channel_axis);
|
||||
auto channel_size = input.get_axis_size(channel_axis);
|
||||
std::size_t inner_size = input.size_range(channel_axis + 1, input.rank());
|
||||
|
||||
std::array<std::size_t, 4> shape = { outer_size, channel_size, 1, inner_size };
|
||||
|
||||
using cudnn::TensorDescriptor;
|
||||
auto inputDesc = TensorDescriptor<T>(shape);
|
||||
auto outputDesc = TensorDescriptor<T>(shape);
|
||||
cudnn::softmax(handle, outputDesc, output.get(), inputDesc, input.get(), log);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
class Convolution {
|
||||
using TensorDescriptor = cudnn::TensorDescriptor<T>;
|
||||
using FilterDescriptor = cudnn::FilterDescriptor<T>;
|
||||
using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>;
|
||||
using ConvolutionAlgorithm = cudnn::ConvolutionAlgorithm<T>;
|
||||
using ActivationDescriptor = cudnn::ActivationDescriptor;
|
||||
|
||||
public:
|
||||
using ActivationType = ActivationDescriptor::ActivationType;
|
||||
|
||||
struct params_type {
|
||||
/* convolution */
|
||||
std::vector<std::size_t> input_shape;
|
||||
std::vector<std::size_t> filter_shape;
|
||||
std::vector<std::size_t> padding;
|
||||
std::vector<std::size_t> stride;
|
||||
std::vector<std::size_t> dilation;
|
||||
std::size_t groups;
|
||||
|
||||
/* bias and activation (only RELU supported) */
|
||||
std::vector<std::size_t> bias_shape;
|
||||
ActivationType activation_type; /* MUST BE identity if there is no bias and ReLU if there is bias */
|
||||
bool eltwise;
|
||||
};
|
||||
|
||||
Convolution() = default;
|
||||
Convolution(const Convolution&) = delete;
|
||||
Convolution(Convolution&&) = default;
|
||||
Convolution(cudnn::Handle handle, const params_type& params) {
|
||||
cudnnHandle = std::move(handle);
|
||||
|
||||
inputTensorDesc = TensorDescriptor(params.input_shape);
|
||||
filterDesc = FilterDescriptor(params.filter_shape);
|
||||
convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups);
|
||||
|
||||
std::vector<int> output_dims;
|
||||
getConvolutionForwardOutputDim(convDesc, filterDesc, inputTensorDesc, output_dims);
|
||||
outputTensorDesc = TensorDescriptor(output_dims);
|
||||
|
||||
algo = ConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, inputTensorDesc, outputTensorDesc);
|
||||
|
||||
if (!params.bias_shape.empty()) {
|
||||
CV_Assert(params.activation_type == ActivationType::RELU);
|
||||
biasTensorDesc = TensorDescriptor(params.bias_shape);
|
||||
if (params.eltwise)
|
||||
eltwiseTensorDesc = TensorDescriptor(output_dims);
|
||||
activationDesc = ActivationDescriptor(params.activation_type, 0.0);
|
||||
} else {
|
||||
CV_Assert(params.activation_type == ActivationType::IDENTITY);
|
||||
}
|
||||
}
|
||||
|
||||
Convolution& operator=(const Convolution&) = delete;
|
||||
Convolution& operator=(Convolution&&) = default;
|
||||
|
||||
std::size_t get_workspace_size() const noexcept {
|
||||
return algo.get_workspace_size();
|
||||
}
|
||||
|
||||
void convolve(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, WorkspaceInstance scratchpad) {
|
||||
cudnn::convolve<T>(
|
||||
cudnnHandle,
|
||||
convDesc, algo, scratchpad,
|
||||
filterDesc, filters.get(),
|
||||
inputTensorDesc, input.get(),
|
||||
1.0, 0.0, outputTensorDesc, output.get()
|
||||
);
|
||||
}
|
||||
|
||||
void convolve_with_bias_activation(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, TensorView<T> bias, WorkspaceInstance scratchpad) {
|
||||
cudnn::convolve_with_bias_activation<T>(
|
||||
cudnnHandle,
|
||||
1.0, convDesc, algo, scratchpad,
|
||||
filterDesc, filters.get(),
|
||||
inputTensorDesc, input.get(),
|
||||
biasTensorDesc, bias.get(),
|
||||
activationDesc,
|
||||
outputTensorDesc, output.get()
|
||||
);
|
||||
}
|
||||
|
||||
void convolve_with_bias_eltwise_activation(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, TensorView<T> bias, TensorView<T> eltwise, WorkspaceInstance scratchpad) {
|
||||
cudnn::convolve_with_bias_eltwise_activation<T>(
|
||||
cudnnHandle,
|
||||
1.0, convDesc, algo, scratchpad,
|
||||
filterDesc, filters.get(),
|
||||
inputTensorDesc, input.get(),
|
||||
biasTensorDesc, bias.get(),
|
||||
1.0, eltwiseTensorDesc, eltwise.get(),
|
||||
activationDesc,
|
||||
outputTensorDesc, output.get()
|
||||
);
|
||||
}
|
||||
|
||||
private:
|
||||
cudnn::Handle cudnnHandle;
|
||||
TensorDescriptor inputTensorDesc, outputTensorDesc;
|
||||
FilterDescriptor filterDesc;
|
||||
ConvolutionDescriptor convDesc;
|
||||
ConvolutionAlgorithm algo;
|
||||
TensorDescriptor biasTensorDesc;
|
||||
TensorDescriptor eltwiseTensorDesc;
|
||||
ActivationDescriptor activationDesc;
|
||||
};
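/* Editorial usage sketch (not part of the upstream header): a plain (bias-free)
 * convolution. All shapes are illustrative NCHW values; `cudnnHandle`, the
 * input/filter/output tensors and `workspace` are assumed to be set up by the caller.
 *
 *     Convolution<float>::params_type params;
 *     params.input_shape  = {1, 3, 224, 224};
 *     params.filter_shape = {16, 3, 3, 3};
 *     params.padding  = {1, 1};
 *     params.stride   = {1, 1};
 *     params.dilation = {1, 1};
 *     params.groups   = 1;
 *     params.activation_type = Convolution<float>::ActivationType::IDENTITY;  // no bias => identity
 *     params.eltwise = false;
 *
 *     Convolution<float> conv(cudnnHandle, params);
 *     workspace.require(conv.get_workspace_size());
 *     WorkspaceAllocator allocator(workspace);
 *     conv.convolve(output, input, filters, allocator.get_instance());
 */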
|
||||
|
||||
template <class T>
|
||||
class TransposeConvolution {
|
||||
using TensorDescriptor = cudnn::TensorDescriptor<T>;
|
||||
using FilterDescriptor = cudnn::FilterDescriptor<T>;
|
||||
using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>;
|
||||
using TransposeConvolutionAlgorithm = cudnn::TransposeConvolutionAlgorithm<T>;
|
||||
|
||||
public:
|
||||
struct params_type {
|
||||
std::vector<std::size_t> input_shape;
|
||||
std::vector<std::size_t> output_shape;
|
||||
|
||||
std::vector<std::size_t> filter_shape;
|
||||
|
||||
std::vector<std::size_t> padding;
|
||||
std::vector<std::size_t> stride;
|
||||
std::vector<std::size_t> dilation;
|
||||
|
||||
std::size_t groups;
|
||||
};
|
||||
|
||||
TransposeConvolution() = default;
|
||||
TransposeConvolution(const TransposeConvolution&) = delete;
|
||||
TransposeConvolution(TransposeConvolution&&) = default;
|
||||
TransposeConvolution(cudnn::Handle handle, const params_type& params) {
|
||||
cudnnHandle = std::move(handle);
|
||||
|
||||
filterDesc = FilterDescriptor(params.filter_shape);
|
||||
convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups);
|
||||
|
||||
/* input_shape is the output shape for convolution
|
||||
* output_shape is the input shape for convolution
|
||||
*/
|
||||
convInputTensorDesc = TensorDescriptor(params.output_shape);
|
||||
|
||||
std::vector<int> conv_output_dims;
|
||||
getConvolutionForwardOutputDim(convDesc, filterDesc, convInputTensorDesc, conv_output_dims);
|
||||
|
||||
/* the convolution output must be identical to what cuDNN expects */
|
||||
CV_Assert(std::equal(std::begin(conv_output_dims), std::end(conv_output_dims), std::begin(params.input_shape)));
|
||||
|
||||
convOutputTensorDesc = TensorDescriptor(params.input_shape);
|
||||
|
||||
algo = TransposeConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, convOutputTensorDesc, convInputTensorDesc);
|
||||
}
|
||||
|
||||
TransposeConvolution& operator=(const TransposeConvolution&) = delete;
|
||||
TransposeConvolution& operator=(TransposeConvolution&&) = default;
|
||||
|
||||
std::size_t get_workspace_size() const noexcept {
|
||||
return algo.get_workspace_size();
|
||||
}
|
||||
|
||||
void transpose_convolve(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, WorkspaceInstance scratchpad) {
|
||||
cudnn::transpose_convolve<T>(
|
||||
cudnnHandle,
|
||||
convDesc, algo, scratchpad,
|
||||
filterDesc, filters.get(),
|
||||
convOutputTensorDesc, input.get(),
|
||||
1.0, 0.0, convInputTensorDesc, output.get()
|
||||
);
|
||||
}
|
||||
|
||||
private:
|
||||
cudnn::Handle cudnnHandle;
|
||||
TensorDescriptor convInputTensorDesc, convOutputTensorDesc;
|
||||
FilterDescriptor filterDesc;
|
||||
ConvolutionDescriptor convDesc;
|
||||
TransposeConvolutionAlgorithm algo;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
class Pooling {
|
||||
using TensorDescriptor = cudnn::TensorDescriptor<T>;
|
||||
using PoolingDescriptor = cudnn::PoolingDescriptor;
|
||||
|
||||
public:
|
||||
using PoolingType = PoolingDescriptor::PoolingType;
|
||||
|
||||
struct params_type {
|
||||
std::vector<std::size_t> input_shape;
|
||||
std::vector<std::size_t> output_shape;
|
||||
|
||||
std::vector<std::size_t> window_size;
|
||||
std::vector<std::size_t> padding;
|
||||
std::vector<std::size_t> stride;
|
||||
|
||||
PoolingType type;
|
||||
};
|
||||
|
||||
Pooling() = default;
|
||||
Pooling(const Pooling&) = delete;
|
||||
Pooling(Pooling&&) = default;
|
||||
Pooling(cudnn::Handle handle, const params_type& params) {
|
||||
cudnnHandle = std::move(handle);
|
||||
|
||||
inputTensorDesc = TensorDescriptor(params.input_shape);
|
||||
poolingDesc = PoolingDescriptor(params.window_size, params.padding, params.stride, params.type);
|
||||
|
||||
//std::vector<int> output_dim;
|
||||
//getPoolingForwardOutputDim(poolingDesc, inputTensorDesc, output_dim);
|
||||
outputTensorDesc = TensorDescriptor(params.output_shape);
|
||||
}
|
||||
|
||||
Pooling& operator=(const Pooling&) = delete;
|
||||
Pooling& operator=(Pooling&&) = default;
|
||||
|
||||
void pool(TensorView<T> input, TensorSpan<T> output) {
|
||||
cudnn::pool<T>(
|
||||
cudnnHandle,
|
||||
poolingDesc,
|
||||
inputTensorDesc, input.get(),
|
||||
1.0, 0.0, outputTensorDesc, output.get()
|
||||
);
|
||||
}
|
||||
|
||||
private:
|
||||
cudnn::Handle cudnnHandle;
|
||||
TensorDescriptor inputTensorDesc, outputTensorDesc;
|
||||
PoolingDescriptor poolingDesc;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
class LRN {
|
||||
using LRNDescriptor = cudnn::LRNDescriptor;
|
||||
using TensorDescriptor = cudnn::TensorDescriptor<T>;
|
||||
|
||||
public:
|
||||
using LRNType = LRNDescriptor::LRNType;
|
||||
|
||||
LRN() = default;
|
||||
LRN(const LRN&) = delete;
|
||||
LRN(LRN&&) = default;
|
||||
LRN(cudnn::Handle handle, std::size_t local_size, T alpha, T beta, T k, LRNType type) {
|
||||
cudnnHandle = std::move(handle);
|
||||
lrnDesc = LRNDescriptor(local_size, alpha, beta, k, type);
|
||||
}
|
||||
|
||||
LRN& operator=(const LRN&) = delete;
|
||||
LRN& operator=(LRN&&) = default;
|
||||
|
||||
void normalize(TensorView<T> input, TensorSpan<T> output, WorkspaceInstance workspace) {
|
||||
cudnn::LRNForward<T>(
|
||||
cudnnHandle,
|
||||
lrnDesc,
|
||||
TensorDescriptor(input.shape_as_vector()), input.get(),
|
||||
1.0, 0.0, TensorDescriptor(output.shape_as_vector()), output.get(),
|
||||
workspace
|
||||
);
|
||||
}
|
||||
|
||||
private:
|
||||
cudnn::Handle cudnnHandle;
|
||||
LRNDescriptor lrnDesc;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
class TensorTransform {
|
||||
using TensorTransformDescriptor = cudnn::TensorTransformDescriptor;
|
||||
using TensorDescriptor = cudnn::TensorDescriptor<T>;
|
||||
|
||||
public:
|
||||
TensorTransform() = default;
|
||||
TensorTransform(const TensorTransform&) = delete;
|
||||
TensorTransform(TensorTransform&&) = default;
|
||||
|
||||
template <class SequenceContainer>
|
||||
TensorTransform(cudnn::Handle handle, const SequenceContainer& paddingLeft, const SequenceContainer& paddingRight) {
|
||||
cudnnHandle = std::move(handle);
|
||||
transDesc = TensorTransformDescriptor(paddingLeft, paddingRight);
|
||||
}
|
||||
|
||||
TensorTransform& operator=(const TensorTransform&) = delete;
|
||||
TensorTransform& operator=(TensorTransform&&) = default;
|
||||
|
||||
void transform(TensorView<T> input, TensorSpan<T> output) {
|
||||
cudnn::transform<T>(
|
||||
cudnnHandle,
|
||||
transDesc,
|
||||
TensorDescriptor(input.shape_as_vector()), input.get(),
|
||||
TensorDescriptor(output.shape_as_vector()), output.get()
|
||||
);
|
||||
}
|
||||
|
||||
private:
|
||||
cudnn::Handle cudnnHandle;
|
||||
TensorTransformDescriptor transDesc;
|
||||
};
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::csl */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP */
|
166
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/csl/workspace.hpp
vendored
Normal file
@ -0,0 +1,166 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_WORKSPACE_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_WORKSPACE_HPP
|
||||
|
||||
#include "pointer.hpp"
|
||||
#include "span.hpp"
|
||||
#include "tensor.hpp"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <iterator>
#include <numeric>     /* std::accumulate */
#include <functional>  /* std::multiplies */
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
|
||||
|
||||
/** @brief maintains a single block of reusable device memory
|
||||
*
|
||||
* Each Workspace object is intended to be used by a single entity at a time but by
|
||||
* different entities at different times. It maintains a single reusable block of memory which
|
||||
* is sufficient for the largest consumer.
|
||||
*/
|
||||
class Workspace {
|
||||
public:
|
||||
|
||||
/** @brief reserve \p bytes of memory */
|
||||
void require(std::size_t bytes) {
|
||||
if (bytes > ptr.size())
|
||||
ptr.reset(bytes);
|
||||
}
|
||||
|
||||
/** @brief number of bytes reserved by the largest consumer */
|
||||
std::size_t size() const noexcept {
|
||||
return ptr.size();
|
||||
}
|
||||
|
||||
/** @brief returns the pointer to the workspace memory */
|
||||
DevicePtr<unsigned char> get() {
|
||||
return ptr.get();
|
||||
}
|
||||
|
||||
private:
|
||||
ManagedPtr<unsigned char> ptr;
|
||||
};
|
||||
|
||||
/** used to compute total workspace size from several workspace requests */
|
||||
class WorkspaceBuilder {
|
||||
public:
|
||||
WorkspaceBuilder() noexcept : max_size_in_bytes{ 0 } { }
|
||||
|
||||
/** request memory for \p count number of elements of the type \tparam T */
|
||||
template <class T = std::int8_t>
|
||||
void require(std::size_t count) noexcept {
|
||||
auto blocks256 = (count * sizeof(T) + 255) / 256;
|
||||
max_size_in_bytes += blocks256 * 256;
|
||||
}
|
||||
|
||||
/** returns the total workspace memory that is required */
|
||||
std::size_t required_workspace_size() const noexcept { return max_size_in_bytes; }
|
||||
|
||||
private:
|
||||
std::size_t max_size_in_bytes;
|
||||
};
|
||||
|
||||
/** general memory block from a workspace which can be passed on to the requester */
|
||||
class WorkspaceInstance {
|
||||
public:
|
||||
|
||||
/** returns a device pointer to the workspace memory */
|
||||
template <class T = void>
|
||||
DevicePtr<T> get() const noexcept {
|
||||
return static_cast<DevicePtr<T>>(ptr);
|
||||
}
|
||||
|
||||
/** returns the size of the workspace memory in bytes */
|
||||
std::size_t size_in_bytes() const noexcept {
|
||||
return size_in_bytes_;
|
||||
}
|
||||
|
||||
/** creates a Span<T> of \p count elements from the workspace memory */
|
||||
template <class T>
|
||||
Span<T> get_span(std::size_t count = 0) const {
|
||||
if (count == 0)
|
||||
count = size_in_bytes_ / sizeof(T);
|
||||
|
||||
if (count * sizeof(T) > size_in_bytes_)
|
||||
CV_Error(Error::StsNoMem, "memory not sufficient");
|
||||
|
||||
return Span<T>(static_cast<DevicePtr<T>>(ptr), count);
|
||||
}
|
||||
|
||||
/** creates a TensorSpan<T> of the given shape from the workspace memory */
|
||||
template <class T, class ForwardItr>
|
||||
TensorSpan<T> get_tensor_span(ForwardItr shape_begin, ForwardItr shape_end) const {
|
||||
using ItrValueType = typename std::iterator_traits<ForwardItr>::value_type;
|
||||
auto required_size = std::accumulate(shape_begin, shape_end, 1, std::multiplies<ItrValueType>());
|
||||
if (required_size * sizeof(T) > size_in_bytes_)
|
||||
CV_Error(Error::StsNoMem, "memory not sufficient");
|
||||
return TensorSpan<T>(static_cast<DevicePtr<T>>(ptr), shape_begin, shape_end);
|
||||
}
|
||||
|
||||
private:
|
||||
DevicePtr<void> ptr;
|
||||
std::size_t size_in_bytes_;
|
||||
|
||||
friend class WorkspaceAllocator;
|
||||
WorkspaceInstance(DevicePtr<void> ptr_, std::size_t size_in_bytes__)
|
||||
: ptr{ ptr_ }, size_in_bytes_{ size_in_bytes__ } { }
|
||||
};
|
||||
|
||||
/** used to split a single workspace into constituents */
|
||||
class WorkspaceAllocator {
|
||||
public:
|
||||
WorkspaceAllocator() = default;
|
||||
WorkspaceAllocator(Workspace& workspace) noexcept
|
||||
: current{ workspace.get() }, bytes_remaining { workspace.size() }
|
||||
{
|
||||
CV_Assert(is_aligned<void>(current, 256));
|
||||
CV_Assert(bytes_remaining % 256 == 0);
|
||||
}
|
||||
|
||||
/** allocates a Span<T> of \p count elements from the workspace memory */
|
||||
template <class T>
|
||||
Span<T> get_span(std::size_t count = 0) {
|
||||
return accquire<T>(count);
|
||||
}
|
||||
|
||||
/** allocates a TensorSpan<T> of the given shape from the workspace memory */
|
||||
template <class T, class ForwardItr>
|
||||
TensorSpan<T> get_tensor_span(ForwardItr start, ForwardItr end) {
|
||||
using ItrValueType = typename std::iterator_traits<ForwardItr>::value_type;
|
||||
auto required_size = std::accumulate(start, end, 1, std::multiplies<ItrValueType>());
|
||||
return TensorSpan<T>(accquire<T>(required_size).data(), start, end);
|
||||
}
|
||||
|
||||
/** allocates a WorkspaceInstance of size \p bytes from the workspace memory */
|
||||
WorkspaceInstance get_instance(std::size_t bytes = 0) {
|
||||
auto span = accquire(bytes);
|
||||
return WorkspaceInstance(DevicePtr<void>(span.data()), span.size());
|
||||
}
|
||||
|
||||
private:
|
||||
template <class T = std::int8_t>
|
||||
Span<T> accquire(std::size_t count = 0) {
|
||||
auto ptr = current;
|
||||
|
||||
if (count == 0)
|
||||
count = bytes_remaining / sizeof(T);
|
||||
|
||||
auto blocks256 = (count * sizeof(T) + 255) / 256;
|
||||
if (bytes_remaining < blocks256 * 256)
|
||||
CV_Error(Error::StsNoMem, "out of workspace memory");
|
||||
|
||||
bytes_remaining -= blocks256 * 256;
|
||||
current = static_cast<DevicePtr<std::int8_t>>(current) + blocks256 * 256;
|
||||
return Span<T>(static_cast<DevicePtr<T>>(ptr), count);
|
||||
}
|
||||
|
||||
DevicePtr<void> current;
|
||||
std::size_t bytes_remaining;
|
||||
};
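/* Editorial usage sketch (not part of the upstream header): the intended flow is
 * WorkspaceBuilder (size accounting) -> Workspace (one reusable allocation) ->
 * WorkspaceAllocator (carving the block into spans). The element counts are
 * illustrative only.
 *
 *     WorkspaceBuilder builder;
 *     builder.require<float>(1024);    // scratch for one consumer
 *     builder.require<int>(256);       // scratch for another
 *
 *     Workspace workspace;             // typically kept alive and reused
 *     workspace.require(builder.required_workspace_size());
 *
 *     WorkspaceAllocator allocator(workspace);
 *     auto floats = allocator.get_span<float>(1024);
 *     auto ints   = allocator.get_span<int>(256);
 */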
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::csl */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_WORKSPACE_HPP */
|
31
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/cxx_utils/is_iterator.hpp
vendored
Normal file
@ -0,0 +1,31 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_IS_ITERATOR_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_IS_ITERATOR_HPP
|
||||
|
||||
#include <iterator>
|
||||
#include <type_traits>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace cxx_utils {
|
||||
|
||||
namespace detail {
|
||||
template <class T, class Tag, class = void>
|
||||
struct is_iterator_helper : std::false_type {};
|
||||
|
||||
template <class T, class Tag>
|
||||
struct is_iterator_helper<T, Tag,
|
||||
typename std::enable_if<std::is_base_of<Tag, typename std::iterator_traits<T>::iterator_category>::value, void>::type
|
||||
> : std::true_type {};
|
||||
}
|
||||
|
||||
template <class T>
|
||||
using is_iterator = typename detail::is_iterator_helper<T, std::input_iterator_tag>;
|
||||
|
||||
template <class T>
|
||||
using is_forward_iterator = typename detail::is_iterator_helper<T, std::forward_iterator_tag>;
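/* Editorial usage sketch (not part of the upstream header): these traits are meant
 * for SFINAE/static_assert checks on shape iterators, e.g.
 *
 *     static_assert(is_iterator<const int*>::value, "");
 *     static_assert(is_forward_iterator<std::vector<int>::iterator>::value, "");
 */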
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::csl::cxx_utils */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_IS_ITERATOR_HPP */
|
110
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/cxx_utils/resizable_static_array.hpp
vendored
Normal file
@ -0,0 +1,110 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_RESIZABLE_STATIC_ARRAY_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_RESIZABLE_STATIC_ARRAY_HPP
|
||||
|
||||
#include <cstddef>
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
#include <algorithm>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace cxx_utils {
|
||||
|
||||
template <class T, std::size_t maxN>
|
||||
class resizable_static_array {
|
||||
using container_type = std::array<T, maxN>;
|
||||
|
||||
public:
|
||||
using value_type = typename container_type::value_type;
|
||||
using size_type = typename container_type::size_type;
|
||||
using difference_type = typename container_type::difference_type;
|
||||
using reference = typename container_type::reference;
|
||||
using const_reference = typename container_type::const_reference;
|
||||
using pointer = typename container_type::pointer;
|
||||
using const_pointer = typename container_type::const_pointer;
|
||||
using iterator = typename container_type::iterator;
|
||||
using const_iterator = typename container_type::const_iterator;
|
||||
using reverse_iterator = typename container_type::reverse_iterator;
|
||||
using const_reverse_iterator = typename container_type::const_reverse_iterator;
|
||||
|
||||
resizable_static_array() noexcept : size_{ 0 } { }
|
||||
explicit resizable_static_array(size_type sz) noexcept : size_{ sz } { }
|
||||
|
||||
bool empty() const noexcept { return size_ == 0; }
|
||||
size_type size() const noexcept { return size_; }
|
||||
size_type capacity() const noexcept { return maxN; }
|
||||
|
||||
void resize(size_type sz) noexcept {
|
||||
assert(sz <= capacity());
|
||||
size_ = sz;
|
||||
}
|
||||
|
||||
void clear() noexcept { size_ = 0; }
|
||||
|
||||
template <class ForwardItr>
|
||||
void assign(ForwardItr first, ForwardItr last) {
|
||||
resize(std::distance(first, last));
|
||||
std::copy(first, last, begin());
|
||||
}
|
||||
|
||||
iterator begin() noexcept { return std::begin(arr); }
|
||||
iterator end() noexcept { return std::begin(arr) + size(); }
|
||||
|
||||
const_iterator begin() const noexcept { return arr.cbegin(); }
|
||||
const_iterator end() const noexcept { return arr.cbegin() + size(); }
|
||||
|
||||
const_iterator cbegin() const noexcept { return arr.cbegin(); }
|
||||
const_iterator cend() const noexcept { return arr.cbegin() + size(); }
|
||||
|
||||
reverse_iterator rbegin() noexcept { return reverse_iterator(end()); }
|
||||
reverse_iterator rend() noexcept { return reverse_iterator(begin()); }
|
||||
|
||||
const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(cend()); }
|
||||
const_reverse_iterator rend() const noexcept { return const_reverse_iterator(cbegin()); }
|
||||
|
||||
const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(cend()); }
|
||||
const_reverse_iterator crend() const noexcept { return const_reverse_iterator(cbegin()); }
|
||||
|
||||
reference operator[](size_type pos) {
|
||||
assert(pos < size());
|
||||
return arr[pos];
|
||||
}
|
||||
|
||||
const_reference operator[](size_type pos) const {
|
||||
assert(pos < size());
|
||||
return arr[pos];
|
||||
}
|
||||
|
||||
iterator insert(iterator pos, const T& value) {
|
||||
resize(size() + 1);
|
||||
std::move_backward(pos, end() - 1, end());
|
||||
*pos = value;
|
||||
return pos;
|
||||
}
|
||||
|
||||
iterator insert(iterator pos, T&& value) {
|
||||
resize(size() + 1);
|
||||
std::move_backward(pos, end() - 1, end());
|
||||
*pos = std::move(value);
|
||||
return pos;
|
||||
}
|
||||
|
||||
iterator erase(iterator pos) {
|
||||
std::move(pos + 1, end(), pos);
|
||||
resize(size() - 1);
|
||||
return pos;
|
||||
}
|
||||
|
||||
pointer data() noexcept { return arr.data(); }
|
||||
const_pointer data() const noexcept { return arr.data(); }
|
||||
|
||||
private:
|
||||
std::size_t size_;
|
||||
container_type arr;
|
||||
};
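/* Editorial usage sketch (not part of the upstream header): a fixed-capacity array
 * with a runtime size (used e.g. for tensor shapes); resizing never allocates and
 * is bounds-checked by assert only. The values below are illustrative.
 *
 *     resizable_static_array<int, 8> shape;
 *     std::array<int, 4> dims = {1, 3, 224, 224};
 *     shape.assign(dims.begin(), dims.end());    // size() == 4, capacity() == 8
 *     shape[0] = 2;                              // in-place element access
 */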
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::csl::cxx_utils */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_RESIZABLE_STATIC_ARRAY_HPP */
|
86
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/init.hpp
vendored
Normal file
@ -0,0 +1,86 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_INIT_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_INIT_HPP
|
||||
|
||||
#include "csl/error.hpp"
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cudnn.h>
|
||||
|
||||
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/utils/logger.hpp>  /* CV_LOG_WARNING */
|
||||
#include <sstream>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn {
|
||||
|
||||
void checkVersions()
|
||||
{
|
||||
// https://docs.nvidia.com/deeplearning/cudnn/developer-guide/index.html#programming-model
|
||||
// cuDNN API Compatibility
|
||||
// Beginning in cuDNN 7, the binary compatibility of a patch and minor releases is maintained as follows:
|
||||
// Any patch release x.y.z is forward or backward-compatible with applications built against another cuDNN patch release x.y.w (meaning, of the same major and minor version number, but having w!=z).
|
||||
// cuDNN minor releases beginning with cuDNN 7 are binary backward-compatible with applications built against the same or earlier patch release (meaning, an application built against cuDNN 7.x is binary compatible with cuDNN library 7.y, where y>=x).
|
||||
// Applications compiled with a cuDNN version 7.y are not guaranteed to work with 7.x release when y > x.
|
||||
auto cudnn_bversion = cudnnGetVersion();
|
||||
auto cudnn_major_bversion = cudnn_bversion / 1000, cudnn_minor_bversion = cudnn_bversion % 1000 / 100;
|
||||
if (cudnn_major_bversion != CUDNN_MAJOR || cudnn_minor_bversion < CUDNN_MINOR)
|
||||
{
|
||||
std::ostringstream oss;
|
||||
oss << "cuDNN reports version " << cudnn_major_bversion << "." << cudnn_minor_bversion << " which is not compatible with the version " << CUDNN_MAJOR << "." << CUDNN_MINOR << " with which OpenCV was built";
|
||||
CV_LOG_WARNING(NULL, oss.str().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
int getDeviceCount()
|
||||
{
|
||||
return cuda::getCudaEnabledDeviceCount();
|
||||
}
|
||||
|
||||
int getDevice()
|
||||
{
|
||||
int device_id = -1;
|
||||
CUDA4DNN_CHECK_CUDA(cudaGetDevice(&device_id));
|
||||
return device_id;
|
||||
}
|
||||
|
||||
bool isDeviceCompatible()
|
||||
{
|
||||
int device_id = getDevice();
|
||||
if (device_id < 0)
|
||||
return false;
|
||||
|
||||
int major = 0, minor = 0;
|
||||
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_id));
|
||||
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_id));
|
||||
|
||||
if (cv::cuda::TargetArchs::hasEqualOrLessPtx(major, minor))
|
||||
return true;
|
||||
|
||||
for (int i = minor; i >= 0; i--)
|
||||
if (cv::cuda::TargetArchs::hasBin(major, i))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool doesDeviceSupportFP16()
|
||||
{
|
||||
int device_id = getDevice();
|
||||
if (device_id < 0)
|
||||
return false;
|
||||
|
||||
int major = 0, minor = 0;
|
||||
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_id));
|
||||
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_id));
|
||||
|
||||
int version = major * 10 + minor;
|
||||
if (version < 53)
|
||||
return false;
|
||||
return true;
|
||||
}
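/* Editorial usage sketch (not part of the upstream header): one plausible way to
 * gate the CUDA backend with the helpers above; the FP16 target mentioned in the
 * comment is only for orientation.
 *
 *     if (getDeviceCount() > 0 && isDeviceCompatible())
 *     {
 *         checkVersions();                        // warn on cuDNN version mismatch
 *         bool fp16_ok = doesDeviceSupportFP16(); // required for an FP16 target
 *         // ... select the CUDA backend/target accordingly
 *     }
 */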
|
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_INIT_HPP */
|
40
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/activation_eltwise.hpp
vendored
Normal file
@ -0,0 +1,40 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATION_ELTWISE_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATION_ELTWISE_HPP
|
||||
|
||||
#include "../csl/stream.hpp"
|
||||
#include "../csl/span.hpp"
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
/* inplace_output = activation(inplace_output) + eltwise */
|
||||
|
||||
template <class T>
|
||||
void relu_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, csl::View<T> eltwise, T slope);
|
||||
|
||||
template <class T>
|
||||
void clipped_relu_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, csl::View<T> eltwise, T floor, T ceiling);
|
||||
|
||||
template <class T>
|
||||
void tanh_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, csl::View<T> eltwise);
|
||||
|
||||
template <class T>
|
||||
void swish_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, csl::View<T> eltwise);
|
||||
|
||||
template <class T>
|
||||
void mish_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, csl::View<T> eltwise);
|
||||
|
||||
template <class T>
|
||||
void sigmoid_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, csl::View<T> eltwise);
|
||||
|
||||
template <class T>
|
||||
void power_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, csl::View<T> eltwise, T exp, T scale, T shift);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATION_ELTWISE_HPP */
|
53
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/activations.hpp
vendored
Normal file
@ -0,0 +1,53 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATIONS_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATIONS_HPP
|
||||
|
||||
#include "../csl/stream.hpp"
|
||||
#include "../csl/span.hpp"
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
template <class T>
|
||||
void relu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T slope);
|
||||
|
||||
template <class T>
|
||||
void clipped_relu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T floor, T ceiling);
|
||||
|
||||
template <class T>
|
||||
void axiswise_relu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, std::size_t inner_size, csl::View<T> slope);
|
||||
|
||||
template <class T>
|
||||
void tanh(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
|
||||
|
||||
template <class T>
|
||||
void swish(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
|
||||
|
||||
template <class T>
|
||||
void mish(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
|
||||
|
||||
template <class T>
|
||||
void sigmoid(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
|
||||
|
||||
template <class T>
|
||||
void elu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
|
||||
|
||||
template <class T>
|
||||
void abs(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
|
||||
|
||||
template <class T>
|
||||
void bnll(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
|
||||
|
||||
template <class T>
|
||||
void power(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T exp, T scale, T shift);
|
||||
|
||||
template <class T>
|
||||
void exp(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T normScale, T normShift);
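/* Editorial usage sketch (not part of the upstream header): the element-wise kernels
 * above compute output[i] = f(input[i]) on the given stream. With `output` a
 * csl::Span<float> and `input` a csl::View<float> of equal size (assumed):
 *
 *     kernels::relu<float>(stream, output, input, 0.0f);                 // plain ReLU
 *     kernels::clipped_relu<float>(stream, output, input, 0.0f, 6.0f);   // ReLU6
 */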
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATIONS_HPP */
|
38
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/bias_activation.hpp
vendored
Normal file
@ -0,0 +1,38 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_HPP
|
||||
|
||||
#include "../csl/stream.hpp"
|
||||
#include "../csl/span.hpp"
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
||||
|
||||
template <class T>
|
||||
void biasN_relu_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T slope);
|
||||
|
||||
template <class T>
|
||||
void biasN_clipped_relu_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T floor, T ceiling);
|
||||
|
||||
template <class T>
|
||||
void biasN_tanh_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);
|
||||
|
||||
template <class T>
|
||||
void biasN_swish_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);
|
||||
|
||||
template <class T>
|
||||
void biasN_mish_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);
|
||||
|
||||
template <class T>
|
||||
void biasN_sigmoid_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);
|
||||
|
||||
template <class T>
|
||||
void biasN_power_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T exp, T scale, T shift);
|
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_HPP */
|
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/bias_activation_eltwise.hpp (vendored, new file, 42 lines)
@@ -0,0 +1,42 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_ELTWISE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_ELTWISE_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

/* inplace_output = activation(inplace_output + bias) + eltwise
 * broadcasting on `bias` is allowed but not on `eltwise`
 */

template <class T>
void biasN_relu_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise, T slope);

template <class T>
void biasN_clipped_relu_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise, T floor, T ceiling);

template <class T>
void biasN_tanh_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise);

template <class T>
void biasN_sigmoid_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise);

template <class T>
void biasN_swish_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise);

template <class T>
void biasN_mish_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise);

template <class T>
void biasN_power_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise, T exp, T scale, T shift);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_ELTWISE_HPP */
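A minimal CPU sketch of what the first of these fused kernels computes, to make the comment above concrete. The NCHW channel indexing and the helper name are assumptions; `inner_size` is taken to be the number of elements per channel slice, which is what lets `bias` broadcast while `eltwise` must match element for element:

#include <cstddef>

// Illustrative reference for biasN_relu_eltwise_sum_2_inplace:
// inplace_output[i] = relu(inplace_output[i] + bias[channel(i)]) + eltwise[i]
void biasN_relu_eltwise_sum_2_inplace_ref(float* inplace_output, const float* eltwise,
                                          const float* bias, std::size_t total,
                                          std::size_t channels, std::size_t inner_size, float slope)
{
    for (std::size_t i = 0; i < total; i++) {
        const std::size_t c = (i / inner_size) % channels;   // bias index: broadcast over inner_size (assumed NCHW layout)
        float v = inplace_output[i] + bias[c];
        v = v >= 0.f ? v : slope * v;                         // ReLU with negative slope
        inplace_output[i] = v + eltwise[i];                   // residual is added after the activation
    }
}

The sibling header that follows fuses the other ordering, activation(inplace_output + bias + eltwise), i.e. the residual is added before the activation is applied.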
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/bias_eltwise_activation.hpp (vendored, new file, 45 lines)
@@ -0,0 +1,45 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ELTWISE_ACTIVATION_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ELTWISE_ACTIVATION_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

/* inplace_output = activation(inplace_output + bias + eltwise)
 * broadcasting on `bias` is allowed but not on `eltwise`
 */

template <class T>
void biasN_eltwise_sum_2_identity_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise);

template <class T>
void biasN_eltwise_sum_2_relu_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise, T slope);

template <class T>
void biasN_eltwise_sum_2_clipped_relu_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise, T floor, T ceiling);

template <class T>
void biasN_eltwise_sum_2_tanh_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise);

template <class T>
void biasN_eltwise_sum_2_swish_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise);

template <class T>
void biasN_eltwise_sum_2_mish_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise);

template <class T>
void biasN_eltwise_sum_2_sigmoid_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise);

template <class T>
void biasN_eltwise_sum_2_power_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, csl::View<T> eltwise, T exp, T scale, T shift);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ELTWISE_ACTIVATION_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/concat.hpp (vendored, new file, 27 lines)
@@ -0,0 +1,27 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CONCAT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CONCAT_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>
#include <vector>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void concat(
    const csl::Stream& stream,
    csl::TensorSpan<T> output, std::size_t output_axis_offset,
    csl::TensorView<T> input, std::size_t axis);

template <class T>
void concat_with_offsets(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, std::vector<std::size_t> axis_offsets);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CONCAT_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/crop_and_resize.hpp (vendored, new file, 19 lines)
@@ -0,0 +1,19 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CROP_AND_RESIZE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CROP_AND_RESIZE_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"
#include "../csl/span.hpp"

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void crop_and_resize(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, csl::View<T> boxes);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CROP_AND_RESIZE_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/detection_output.hpp (vendored, new file, 42 lines)
@@ -0,0 +1,42 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_DETECTION_OUTPUT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_DETECTION_OUTPUT_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"
#include "../csl/tensor.hpp"

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void decode_bboxes(const csl::Stream& stream, csl::Span<T> output, csl::View<T> locations, csl::View<T> priors,
                   std::size_t num_loc_classes, bool share_location, std::size_t background_label_id,
                   bool transpose_location, bool variance_encoded_in_target,
                   bool corner_true_or_center_false, bool normalized_bbox,
                   bool clip_box, float clip_width, float clip_height);

template <class T>
void findTopK(const csl::Stream& stream, csl::TensorSpan<int> indices, csl::TensorSpan<int> count, csl::TensorView<T> scores, std::size_t background_label_id, float threshold);

template <class T>
void box_collect(const csl::Stream& stream, csl::TensorSpan<T> collected_bboxes, csl::TensorView<T> decoded_bboxes, csl::TensorView<int> indices, csl::TensorView<int> count, bool share_location, std::size_t background_label_id);

template <class T>
void blockwise_class_nms(const csl::Stream& stream, csl::TensorSpan<int> indices, csl::TensorSpan<int> count, csl::TensorView<T> collected_bboxes,
                         bool normalized_bbox, std::size_t background_label_id, float nms_threshold);

template <class T>
void nms_collect(const csl::Stream& stream, csl::TensorSpan<int> kept_indices, csl::TensorSpan<int> kept_count,
                 csl::TensorView<int> indices, csl::TensorView<int> count, csl::TensorView<T> scores, float, std::size_t background_label_id);

template <class T>
void consolidate_detections(const csl::Stream& stream, csl::TensorSpan<T> output,
                            csl::TensorView<int> kept_indices, csl::TensorView<int> kept_count,
                            csl::TensorView<T> decoded_bboxes, csl::TensorView<T> scores, bool share_location, csl::DevicePtr<int> num_detections);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_DETECTION_OUTPUT_HPP */
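The declarations above split SSD-style post-processing into decode, top-K selection, per-class NMS and final collection stages. As a reminder of what the NMS stage computes, here is a sequential greedy-NMS sketch over corner-form boxes; it is illustrative only, the CUDA kernel processes the pre-collected boxes blockwise:

#include <algorithm>
#include <vector>

struct BoxRef { float xmin, ymin, xmax, ymax; };

static float iou_ref(const BoxRef& a, const BoxRef& b)
{
    const float iw = std::max(0.f, std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin));
    const float ih = std::max(0.f, std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin));
    const float inter = iw * ih;
    const float areaA = (a.xmax - a.xmin) * (a.ymax - a.ymin);
    const float areaB = (b.xmax - b.xmin) * (b.ymax - b.ymin);
    return inter > 0.f ? inter / (areaA + areaB - inter) : 0.f;
}

// `indices` must already be sorted by descending score (the kind of ordering findTopK provides).
static std::vector<int> greedy_nms_ref(const std::vector<BoxRef>& boxes,
                                       const std::vector<int>& indices, float nms_threshold)
{
    std::vector<int> kept;
    for (int idx : indices) {
        bool suppressed = false;
        for (int k : kept)
            if (iou_ref(boxes[idx], boxes[k]) > nms_threshold) { suppressed = true; break; }
        if (!suppressed)
            kept.push_back(idx);
    }
    return kept;
}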
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/eltwise_activation.hpp (vendored, new file, 40 lines)
@@ -0,0 +1,40 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_ACTIVATION_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_ACTIVATION_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

/* output = activation(x + y) */

template <class T>
void eltwise_sum_2_relu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y, T slope);

template <class T>
void eltwise_sum_2_clipped_relu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y, T floor, T ceiling);

template <class T>
void eltwise_sum_2_tanh(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);

template <class T>
void eltwise_sum_2_swish(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);

template <class T>
void eltwise_sum_2_mish(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);

template <class T>
void eltwise_sum_2_sigmoid(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);

template <class T>
void eltwise_sum_2_power(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y, T exp, T scale, T shift);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_ACTIVATION_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp (vendored, new file, 35 lines)
@@ -0,0 +1,35 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void eltwise_max_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);

template <class T>
void eltwise_min_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);

template <class T>
void eltwise_sum_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);

template <class T>
void eltwise_sum_coeff_2(const csl::Stream& stream, csl::TensorSpan<T> output, T coeff_x, csl::TensorView<T> x, T coeff_y, csl::TensorView<T> y);

template <class T>
void eltwise_prod_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);

template <class T>
void eltwise_div_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/fill_copy.hpp (vendored, new file, 21 lines)
@@ -0,0 +1,21 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_COPY_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_COPY_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void fill(const csl::Stream& stream, csl::Span<T> output, T value);

template <class T>
void copy(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_COPY_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/fp_conversion.hpp (vendored, new file, 18 lines)
@@ -0,0 +1,18 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FP_CONVERSION_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FP_CONVERSION_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

void fp32_to_fp16(const csl::Stream& stream, csl::Span<half> output, csl::View<float> input);
void fp16_to_fp32(const csl::Stream& stream, csl::Span<float> output, csl::View<half> input);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FP_CONVERSION_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/grid_nms.hpp (vendored, new file, 21 lines)
@@ -0,0 +1,21 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_GRID_NMS_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_GRID_NMS_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"
#include "../csl/tensor.hpp"

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

std::size_t getGridNMSWorkspaceSizePerBatchItem(std::size_t num_classes, std::size_t classwise_topK);

template <class T>
void grid_nms(const csl::Stream& stream, csl::Span<unsigned int> workspace, csl::TensorSpan<int> indices, csl::TensorSpan<int> count, csl::TensorView<T> bboxes, int background_class_id, bool normalized_bbox, float nms_threshold);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_GRID_NMS_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/max_unpooling.hpp (vendored, new file, 32 lines)
@@ -0,0 +1,32 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MAX_UNPOOLING_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MAX_UNPOOLING_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>
#include <vector>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void max_pooling_with_indices(
    const csl::Stream& stream,
    csl::TensorSpan<T> output, csl::TensorSpan<T> indices, csl::TensorView<T> input,
    const std::vector<std::size_t>& kernel_size, const std::vector<std::size_t>& strides,
    const std::vector<std::size_t>& padding_left);

template <class T>
void max_unpooling(
    const csl::Stream& stream,
    csl::TensorSpan<T> output, csl::TensorView<T> input, csl::TensorView<T> indices,
    const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
    const std::vector<std::size_t>& padding_left);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MAX_UNPOOLING_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/mvn.hpp (vendored, new file, 31 lines)
@@ -0,0 +1,31 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MVN_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MVN_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void reduce_mean(const csl::Stream& stream, csl::Span<float> means, csl::View<T> input, std::size_t inner_size);

template <class T>
void reduce_mean_sqr_sum(const csl::Stream& stream, csl::Span<float> means, csl::Span<float> sum_sqrs, csl::View<T> input, std::size_t inner_size);

void compute_normalization_scale(const csl::Stream& stream, csl::Span<float> scale, csl::View<float> means, csl::View<float> sum_sqrs, std::size_t inner_size, float eps);

template <class T>
void normalize_mean(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, csl::View<float> means, std::size_t inner_size);

template <class T>
void normalize_mean_variance(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, csl::View<float> means, csl::View<float> scale, std::size_t inner_size);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MVN_HPP */
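Taken together, these stages perform mean-variance normalization over each `inner_size` slice. A sequential float-only sketch of the whole pipeline follows; the slice layout and the exact form of the scale computation are assumptions based on the declarations above:

#include <cmath>
#include <cstddef>

// output[i] = (input[i] - mean_of_slice) / sqrt(variance_of_slice + eps)
void mvn_ref(float* output, const float* input, std::size_t outer_size, std::size_t inner_size, float eps)
{
    for (std::size_t o = 0; o < outer_size; o++) {
        const float* x = input + o * inner_size;
        float mean = 0.f, sqr_sum = 0.f;
        for (std::size_t i = 0; i < inner_size; i++) { mean += x[i]; sqr_sum += x[i] * x[i]; }  // reduce_mean / reduce_mean_sqr_sum
        mean /= inner_size;
        sqr_sum /= inner_size;
        const float variance = sqr_sum - mean * mean;          // E[x^2] - E[x]^2
        const float scale = 1.f / std::sqrt(variance + eps);    // compute_normalization_scale (assumed form)
        float* y = output + o * inner_size;
        for (std::size_t i = 0; i < inner_size; i++)
            y[i] = (x[i] - mean) * scale;                        // normalize_mean_variance
    }
}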
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/normalize.hpp (vendored, new file, 24 lines)
@@ -0,0 +1,24 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_NORMALIZE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_NORMALIZE_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void normalize(
    const csl::Stream& stream,
    csl::Span<T> output, csl::View<T> input,
    std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, std::size_t norm, T epsilon,
    csl::Span<T> workspace);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_NORMALIZE_HPP */
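`normalize` performs an L_p normalization, reducing over `mid_size` for every (outer, inner) position, with `norm` selecting p (typically 1 or 2) and `workspace` holding the per-position reductions. A single-column L2 sketch; the epsilon placement here is an assumption, not a statement about the kernel:

#include <cmath>
#include <cstddef>

// Normalize one (outer, inner) column of length mid_size in place, p == 2.
void l2_normalize_column_ref(float* x, std::size_t mid_size, std::size_t stride, float epsilon)
{
    float sum = 0.f;
    for (std::size_t m = 0; m < mid_size; m++)
        sum += x[m * stride] * x[m * stride];
    const float inv_norm = 1.f / (std::sqrt(sum) + epsilon);   // epsilon placement is an assumption
    for (std::size_t m = 0; m < mid_size; m++)
        x[m * stride] *= inv_norm;
}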
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/padding.hpp (vendored, new file, 25 lines)
@@ -0,0 +1,25 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PADDING_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PADDING_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>
#include <vector>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void copy_with_reflection101(
    const csl::Stream& stream,
    csl::TensorSpan<T> output, csl::TensorView<T> input,
    std::vector<std::pair<std::size_t, std::size_t>> ranges);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PADDING_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/permute.hpp (vendored, new file, 24 lines)
@@ -0,0 +1,24 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PERMUTE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PERMUTE_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>
#include <vector>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void permute(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, std::vector<std::size_t> order);

template <class T>
void transpose(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, std::size_t in_width, std::size_t out_width);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PERMUTE_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/prior_box.hpp (vendored, new file, 28 lines)
@@ -0,0 +1,28 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PRIOR_BOX_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PRIOR_BOX_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void generate_prior_boxes(
    const csl::Stream& stream,
    csl::Span<T> output,
    csl::View<float> boxWidth, csl::View<float> boxHeight, csl::View<float> offsetX, csl::View<float> offsetY, float stepX, float stepY,
    std::vector<float> variance,
    std::size_t numPriors,
    std::size_t layerWidth, std::size_t layerHeight,
    std::size_t imageWidth, std::size_t imageHeight,
    bool normalize, bool clip);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PRIOR_BOX_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/region.hpp (vendored, new file, 25 lines)
@@ -0,0 +1,25 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_REGION_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_REGION_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void region(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, csl::View<T> bias,
            T object_prob_cutoff, T class_prob_cutoff,
            std::size_t boxes_per_cell, std::size_t box_size,
            std::size_t rows, std::size_t cols, T scale_x_y,
            std::size_t height_norm, std::size_t width_norm,
            bool if_true_sigmoid_else_softmax, bool new_coords);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_REGION_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/resize.hpp (vendored, new file, 21 lines)
@@ -0,0 +1,21 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_RESIZE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_RESIZE_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void resize_nn(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, float scale_y, float scale_x, bool round, bool half_pixel_centers);

template <class T>
void resize_bilinear(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, float scale_y, float scale_x, bool half_pixel_centers);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_RESIZE_HPP */
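The `half_pixel_centers` flag selects the coordinate-mapping convention. A sketch of how a destination index maps back to a real-valued source coordinate; treating `scale` as input size over output size is an assumption about the caller, not a statement about these kernel parameters:

// Map a destination pixel index to a source coordinate for resizing.
// With half_pixel_centers, pixel centers sit at x + 0.5 (the "half_pixel" convention);
// without it, the mapping is a plain scaling.
float map_dst_to_src_ref(int dst, float scale /* assumed: in_size / out_size */, bool half_pixel_centers)
{
    return half_pixel_centers ? (dst + 0.5f) * scale - 0.5f
                              : dst * scale;
}

The bilinear kernel then interpolates between the two nearest source samples using the fractional part of that coordinate, while resize_nn picks a single sample, presumably rounding or truncating depending on the `round` flag.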
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/roi_pooling.hpp (vendored, new file, 19 lines)
@@ -0,0 +1,19 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ROI_POOLING_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ROI_POOLING_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"
#include "../csl/span.hpp"

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void roi_pooling(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, csl::View<T> rois, float spatial_scale);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ROI_POOLING_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/scale_shift.hpp (vendored, new file, 39 lines)
@@ -0,0 +1,39 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SCALE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SCALE_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void biasN(const csl::Stream& stream,
           csl::TensorSpan<T> output,
           csl::TensorView<T> input, std::size_t inner_size,
           csl::TensorView<T> bias);

template <class T>
void scaleN(const csl::Stream& stream,
            csl::TensorSpan<T> output,
            csl::TensorView<T> input, std::size_t inner_size,
            csl::TensorView<T> weights);

template <class T>
void scale1_with_bias1(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T alpha, T beta);

template <class T>
void scaleN_with_biasN(
    const csl::Stream& stream,
    csl::TensorSpan<T> output,
    csl::TensorView<T> input, std::size_t inner_size,
    csl::TensorView<T> weights, csl::TensorView<T> bias);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SCALE_HPP */
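These are the per-channel affine kernels that batch-norm and scale layers lower to. Assuming the same `inner_size` convention used elsewhere in this directory (H*W for an NCHW tensor), `scaleN_with_biasN` reduces to the following sketch; the indexing is an inference from that convention, not the vendored implementation:

#include <cstddef>

// output[i] = input[i] * weights[c] + bias[c], where c is the channel element i belongs to
void scaleN_with_biasN_ref(float* output, const float* input, std::size_t total,
                           std::size_t channels, std::size_t inner_size,
                           const float* weights, const float* bias)
{
    for (std::size_t i = 0; i < total; i++) {
        const std::size_t c = (i / inner_size) % channels;  // broadcast weights/bias over inner_size
        output[i] = input[i] * weights[c] + bias[c];
    }
}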
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/shortcut.hpp (vendored, new file, 18 lines)
@@ -0,0 +1,18 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SHORTCUT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SHORTCUT_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void input_shortcut(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, csl::TensorView<T> from);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SHORTCUT_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/kernels/slice.hpp (vendored, new file, 22 lines)
@@ -0,0 +1,22 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SLICE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SLICE_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void slice(const csl::Stream& stream,
           csl::TensorSpan<T> output, csl::TensorView<T> input,
           std::vector<std::size_t> offsets);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SLICE_HPP */
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/activation.hpp (vendored, new file, 376 lines)
@@ -0,0 +1,376 @@
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ACTIVATION_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ACTIVATION_HPP
|
||||
|
||||
#include "../../op_cuda.hpp"
|
||||
|
||||
#include "../csl/stream.hpp"
|
||||
#include "../csl/tensor.hpp"
|
||||
|
||||
#include "../kernels/activations.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <utility>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn {
|
||||
|
||||
template <class T>
|
||||
class ReLUOp final : public CUDABackendNode {
|
||||
public:
|
||||
using wrapper_type = GetCUDABackendWrapperType<T>;
|
||||
|
||||
ReLUOp(csl::Stream stream_, T slope_)
|
||||
: stream(std::move(stream_)), slope{ slope_ } { }
|
||||
|
||||
void forward(
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs,
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
|
||||
csl::Workspace& workspace) override
|
||||
{
|
||||
for (int i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
|
||||
auto input = input_wrapper->getView();
|
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
|
||||
auto output = output_wrapper->getSpan();
|
||||
|
||||
kernels::relu<T>(stream, output, input, slope);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
csl::Stream stream;
|
||||
const T slope;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
class ClippedReLUOp final : public CUDABackendNode {
|
||||
public:
|
||||
using wrapper_type = GetCUDABackendWrapperType<T>;
|
||||
|
||||
ClippedReLUOp(csl::Stream stream_, T min_, T max_)
|
||||
: stream(std::move(stream_)), min{ min_ }, max{ max_ } { }
|
||||
|
||||
void forward(
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs,
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
|
||||
csl::Workspace& workspace) override
|
||||
{
|
||||
for (int i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
|
||||
auto input = input_wrapper->getView();
|
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
|
||||
auto output = output_wrapper->getSpan();
|
||||
|
||||
kernels::clipped_relu<T>(stream, output, input, min, max);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
csl::Stream stream;
|
||||
const T min, max;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
class ChannelwiseReLUOp final : public CUDABackendNode {
|
||||
public:
|
||||
using wrapper_type = GetCUDABackendWrapperType<T>;
|
||||
|
||||
ChannelwiseReLUOp(csl::Stream stream_, const Mat& slope)
|
||||
: stream(std::move(stream_))
|
||||
{
|
||||
CV_Assert(!slope.empty());
|
||||
slopeTensor = csl::makeTensorHeader<T>(slope);
|
||||
csl::copyMatToTensor<T>(slope, slopeTensor, stream);
|
||||
}
|
||||
|
||||
void forward(
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs,
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
|
||||
csl::Workspace& workspace) override
|
||||
{
|
||||
for (int i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
|
||||
auto input = input_wrapper->getView();
|
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
|
||||
auto output = output_wrapper->getSpan();
|
||||
|
||||
CV_Assert(input.get_axis_size(1) == slopeTensor.size());
|
||||
std::size_t inner_size = input.size_range(2, input.rank());
|
||||
kernels::axiswise_relu<T>(stream, output, input, inner_size, slopeTensor);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
csl::Stream stream;
|
||||
csl::Tensor<T> slopeTensor;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
class TanHOp final : public CUDABackendNode {
|
||||
public:
|
||||
using wrapper_type = GetCUDABackendWrapperType<T>;
|
||||
|
||||
TanHOp(csl::Stream stream_) : stream(std::move(stream_)) { }
|
||||
|
||||
void forward(
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs,
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
|
||||
csl::Workspace& workspace) override
|
||||
{
|
||||
for (int i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
|
||||
auto input = input_wrapper->getView();
|
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
|
||||
auto output = output_wrapper->getSpan();
|
||||
|
||||
kernels::tanh<T>(stream, output, input);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
csl::Stream stream;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
class SwishOp final : public CUDABackendNode {
|
||||
public:
|
||||
using wrapper_type = GetCUDABackendWrapperType<T>;
|
||||
|
||||
SwishOp(csl::Stream stream_) : stream(std::move(stream_)) { }
|
||||
|
||||
void forward(
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs,
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
|
||||
csl::Workspace& workspace) override
|
||||
{
|
||||
for (int i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
|
||||
auto input = input_wrapper->getView();
|
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
|
||||
auto output = output_wrapper->getSpan();
|
||||
|
||||
kernels::swish<T>(stream, output, input);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
csl::Stream stream;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
class MishOp final : public CUDABackendNode {
|
||||
public:
|
||||
using wrapper_type = GetCUDABackendWrapperType<T>;
|
||||
|
||||
MishOp(csl::Stream stream_) : stream(std::move(stream_)) { }
|
||||
|
||||
void forward(
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs,
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
|
||||
csl::Workspace& workspace) override
|
||||
{
|
||||
for (int i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
|
||||
auto input = input_wrapper->getView();
|
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
|
||||
auto output = output_wrapper->getSpan();
|
||||
|
||||
kernels::mish<T>(stream, output, input);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
csl::Stream stream;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
class SigmoidOp final : public CUDABackendNode {
|
||||
public:
|
||||
using wrapper_type = GetCUDABackendWrapperType<T>;
|
||||
|
||||
SigmoidOp(csl::Stream stream_) : stream(std::move(stream_)) { }
|
||||
|
||||
void forward(
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs,
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
|
||||
csl::Workspace& workspace) override
|
||||
{
|
||||
for (int i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
|
||||
auto input = input_wrapper->getView();
|
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
|
||||
auto output = output_wrapper->getSpan();
|
||||
|
||||
kernels::sigmoid<T>(stream, output, input);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
csl::Stream stream;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
class ELUOp final : public CUDABackendNode {
|
||||
public:
|
||||
using wrapper_type = GetCUDABackendWrapperType<T>;
|
||||
|
||||
ELUOp(csl::Stream stream_) : stream(std::move(stream_)) { }
|
||||
|
||||
void forward(
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs,
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
|
||||
csl::Workspace& workspace) override
|
||||
{
|
||||
for (int i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
|
||||
auto input = input_wrapper->getView();
|
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
|
||||
auto output = output_wrapper->getSpan();
|
||||
|
||||
kernels::elu<T>(stream, output, input);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
csl::Stream stream;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
class AbsValOp final : public CUDABackendNode {
|
||||
public:
|
||||
using wrapper_type = GetCUDABackendWrapperType<T>;
|
||||
|
||||
AbsValOp(csl::Stream stream_) : stream(std::move(stream_)) { }
|
||||
|
||||
void forward(
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs,
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
|
||||
csl::Workspace& workspace) override
|
||||
{
|
||||
for (int i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
|
||||
auto input = input_wrapper->getView();
|
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
|
||||
auto output = output_wrapper->getSpan();
|
||||
|
||||
kernels::abs<T>(stream, output, input);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
csl::Stream stream;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
class BNLLOp final : public CUDABackendNode {
|
||||
public:
|
||||
using wrapper_type = GetCUDABackendWrapperType<T>;
|
||||
|
||||
BNLLOp(csl::Stream stream_) : stream(std::move(stream_)) { }
|
||||
|
||||
void forward(
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs,
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
|
||||
csl::Workspace& workspace) override
|
||||
{
|
||||
for (int i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
|
||||
auto input = input_wrapper->getView();
|
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
|
||||
auto output = output_wrapper->getSpan();
|
||||
|
||||
kernels::bnll<T>(stream, output, input);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
csl::Stream stream;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
class PowerOp final : public CUDABackendNode {
|
||||
public:
|
||||
using wrapper_type = GetCUDABackendWrapperType<T>;
|
||||
|
||||
PowerOp(csl::Stream stream_, T exp_, T scale_, T shift_)
|
||||
: stream(std::move(stream_)), exp{ exp_ }, scale{ scale_ }, shift{ shift_ } { }
|
||||
|
||||
void forward(
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs,
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
|
||||
csl::Workspace& workspace) override
|
||||
{
|
||||
for (int i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
|
||||
auto input = input_wrapper->getView();
|
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
|
||||
auto output = output_wrapper->getSpan();
|
||||
|
||||
kernels::power<T>(stream, output, input, exp, scale, shift);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
csl::Stream stream;
|
||||
const T exp, scale, shift;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
class ExpOp final : public CUDABackendNode {
|
||||
public:
|
||||
using wrapper_type = GetCUDABackendWrapperType<T>;
|
||||
|
||||
ExpOp(csl::Stream stream_, T nScale_, T nShift_)
|
||||
: stream(std::move(stream_)), normScale{ nScale_ }, normShift{ nShift_ } { }
|
||||
|
||||
void forward(
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs,
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
|
||||
csl::Workspace& workspace) override
|
||||
{
|
||||
for (int i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
|
||||
auto input = input_wrapper->getView();
|
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
|
||||
auto output = output_wrapper->getSpan();
|
||||
|
||||
kernels::exp<T>(stream, output, input, normScale, normShift);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
csl::Stream stream;
|
||||
const T normScale, normShift;
|
||||
};
|
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ACTIVATION_HPP */
|
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/batch_norm.hpp (vendored, new file, 58 lines)
@@ -0,0 +1,58 @@
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_BATCH_NORM_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_BATCH_NORM_HPP
|
||||
|
||||
#include "../../op_cuda.hpp"
|
||||
|
||||
#include "../csl/stream.hpp"
|
||||
#include "../csl/tensor.hpp"
|
||||
|
||||
#include "../kernels/scale_shift.hpp"
|
||||
|
||||
#include <utility>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn {
|
||||
|
||||
template <class T>
|
||||
class BatchNormOp final : public CUDABackendNode {
|
||||
public:
|
||||
using wrapper_type = GetCUDABackendWrapperType<T>;
|
||||
|
||||
BatchNormOp(csl::Stream stream_, const cv::Mat& weights, const cv::Mat& bias)
|
||||
: stream(std::move(stream_))
|
||||
{
|
||||
biasTensor = csl::makeTensorHeader<T>(bias);
|
||||
csl::copyMatToTensor<T>(bias, biasTensor, stream);
|
||||
|
||||
weightsTensor = csl::makeTensorHeader<T>(weights);
|
||||
csl::copyMatToTensor<T>(weights, weightsTensor, stream);
|
||||
}
|
||||
|
||||
void forward(
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs,
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
|
||||
csl::Workspace& workspace) override
|
||||
{
|
||||
CV_Assert(inputs.size() == 1 && outputs.size() == 1);
|
||||
|
||||
auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
|
||||
auto input = input_wrapper->getView();
|
||||
|
||||
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
|
||||
auto output = output_wrapper->getSpan();
|
||||
|
||||
std::size_t inner_size = input.size_range(2, input.rank());
|
||||
kernels::scaleN_with_biasN<T>(stream, output, input, inner_size, weightsTensor, biasTensor);
|
||||
}
|
||||
|
||||
private:
|
||||
csl::Stream stream;
|
||||
csl::Tensor<T> weightsTensor, biasTensor;
|
||||
};
|
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_BATCH_NORM_HPP */
|
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/concat.hpp (vendored, new file, 90 lines)
@@ -0,0 +1,90 @@
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONCAT_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONCAT_HPP
|
||||
|
||||
#include "../../op_cuda.hpp"
|
||||
|
||||
#include "../csl/stream.hpp"
|
||||
#include "../csl/pointer.hpp"
|
||||
|
||||
#include "../kernels/fill_copy.hpp"
|
||||
#include "../kernels/concat.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn {
|
||||
|
||||
template <class T>
|
||||
class ConcatOp final : public CUDABackendNode {
|
||||
public:
|
||||
using wrapper_type = GetCUDABackendWrapperType<T>;
|
||||
|
||||
ConcatOp(csl::Stream stream_, std::size_t concat_axis, bool zero_padding)
|
||||
: stream(std::move(stream_)), concat_axis{ concat_axis }, zero_padding{ zero_padding }
|
||||
{
|
||||
}
|
||||
|
||||
void forward(
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs,
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
|
||||
csl::Workspace& workspace) override
|
||||
{
|
||||
CV_Assert(outputs.size() == 1);
|
||||
|
||||
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
|
||||
auto output = output_wrapper->getSpan();
|
||||
|
||||
if(zero_padding)
|
||||
{
|
||||
auto output_shape = output_wrapper->getShape();
|
||||
|
||||
kernels::fill<T>(stream, output, 0.0);
|
||||
|
||||
std::size_t output_concat_axis_offset = 0;
|
||||
for (int i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
|
||||
auto input = input_wrapper->getView();
|
||||
auto input_shape = input_wrapper->getShape();
|
||||
|
||||
std::vector<std::size_t> offsets(input_shape.size());
|
||||
for (int j = 0; j < offsets.size(); j++)
|
||||
offsets[j] = (output_shape[j] - input_shape[j]) / 2;
|
||||
offsets[concat_axis] = output_concat_axis_offset;
|
||||
|
||||
kernels::concat_with_offsets(stream, output, input, offsets);
|
||||
|
||||
output_concat_axis_offset += input.get_axis_size(concat_axis);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::size_t output_axis_offset = 0;
|
||||
for (int i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
|
||||
auto input = input_wrapper->getView();
|
||||
|
||||
kernels::concat(stream, output, output_axis_offset, input, concat_axis);
|
||||
|
||||
output_axis_offset += input.get_axis_size(concat_axis);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
csl::Stream stream;
|
||||
std::size_t concat_axis;
|
||||
bool zero_padding;
|
||||
};
|
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONCAT_HPP */
|
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/const.hpp (vendored, new file, 51 lines)
@@ -0,0 +1,51 @@
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONST_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONST_HPP
|
||||
|
||||
#include "../../op_cuda.hpp"
|
||||
|
||||
#include "../csl/stream.hpp"
|
||||
#include "../csl/tensor.hpp"
|
||||
#include "../csl/tensor_ops.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <utility>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn {
|
||||
|
||||
template <class T>
|
||||
class ConstOp final : public CUDABackendNode {
|
||||
public:
|
||||
using wrapper_type = GetCUDABackendWrapperType<T>;
|
||||
|
||||
ConstOp(csl::Stream stream_, const cv::Mat& data)
|
||||
: stream(std::move(stream_))
|
||||
{
|
||||
constTensor = csl::makeTensorHeader<T>(data);
|
||||
csl::copyMatToTensor<T>(data, constTensor, stream);
|
||||
}
|
||||
|
||||
void forward(
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs,
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
|
||||
csl::Workspace& workspace) override
|
||||
{
|
||||
CV_Assert(outputs.size() == 1 && inputs.size() == 0);
|
||||
|
||||
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
|
||||
auto output = output_wrapper->getSpan();
|
||||
csl::tensor_ops::copy<T>(stream, output, constTensor);
|
||||
}
|
||||
|
||||
private:
|
||||
csl::Stream stream;
|
||||
csl::Tensor<T> constTensor;
|
||||
};
|
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONST_HPP */
|
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/convolution.hpp (vendored, new file, 608 lines; diff shown only partially below)
@@ -0,0 +1,608 @@
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP
|
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP
|
||||
|
||||
#include "../../op_cuda.hpp"
|
||||
|
||||
#include "../csl/cudnn.hpp"
|
||||
#include "../csl/stream.hpp"
|
||||
#include "../csl/tensor.hpp"
|
||||
#include "../csl/tensor_ops.hpp"
|
||||
|
||||
#include "../kernels/scale_shift.hpp"
|
||||
#include "../kernels/activations.hpp"
|
||||
#include "../kernels/activation_eltwise.hpp"
|
||||
#include "../kernels/bias_activation.hpp"
|
||||
#include "../kernels/bias_eltwise_activation.hpp"
|
||||
#include "../kernels/bias_activation_eltwise.hpp"
|
||||
#include "../kernels/activation_eltwise.hpp"
|
||||
#include "../kernels/eltwise_activation.hpp"
|
||||
#include "../kernels/eltwise_ops.hpp"
|
||||
|
||||
#include <opencv2/core.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <algorithm>
|
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn {
|
||||
|
||||
struct ConvolutionConfiguration {
|
||||
/* the size of the following vectors must be equal to the kernel size */
|
||||
std::vector<std::size_t> kernel_size;
|
||||
std::vector<std::size_t> dilations, strides;
|
||||
|
||||
enum class PaddingMode {
|
||||
MANUAL, /* uses explicit padding values provided in `pads_begin` and `pads_end` */
|
||||
VALID, /* no padding is added */
|
||||
SAME /* TensorFlow logic is used for same padding */
|
||||
};
|
||||
|
||||
/* explicit paddings are used if and only if padMode is set to manual */
|
||||
PaddingMode padMode;
|
||||
std::vector<std::size_t> pads_begin, pads_end;
|
||||
|
||||
/* full shape inclusive of channel and batch axis */
|
||||
std::vector<std::size_t> input_shape;
|
||||
std::vector<std::size_t> output_shape;
|
||||
|
||||
/* group count for grouped convolution */
|
||||
std::size_t groups;
|
||||
|
||||
enum class FusionMode {
|
||||
NONE,
|
||||
ACTIVATION, /* act(conv) */
|
||||
ELTWISE_SUM, /* eltwise + conv */ /* eltwise tensor is passed as second input to forward */
|
||||
ELTWISE_SUM_THEN_ACTIVATION, /* act(conv + eltwise) */
|
||||
ACTIVATION_THEN_ELTWISE_SUM, /* act(conv) + eltwise */
|
||||
};
|
||||
|
||||
FusionMode fusion_mode;
|
||||
|
||||
enum class ActivationType {
|
||||
IDENTITY,
|
||||
RELU, /* uses value provided in `relu_negative_slope` */
|
||||
CLIPPED_RELU, /* uses values provided in `crelu_floor` and `crelu_ceil` */
|
||||
POWER,
|
||||
TANH,
|
||||
SIGMOID,
|
||||
SWISH,
|
||||
MISH
|
||||
};
|
||||
|
||||
ActivationType activation_type;
|
||||
float relu_negative_slope, crelu_floor, crelu_ceil;
|
||||
float power_exp, power_scale, power_shift;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
class ConvolutionOp final : public CUDABackendNode {
|
||||
public:
|
||||
using wrapper_type = GetCUDABackendWrapperType<T>;
|
||||
|
||||
ConvolutionOp(csl::Stream stream_, csl::cudnn::Handle handle_, const ConvolutionConfiguration& config, const Mat& filters, const Mat& bias)
|
||||
: stream(std::move(stream_)), cudnnHandle(std::move(handle_))
|
||||
{
|
||||
const auto& kernel_size = config.kernel_size;
|
||||
const auto& dilations = config.dilations;
|
||||
const auto& strides = config.strides;
|
||||
|
||||
const auto convolution_order = kernel_size.size();
|
||||
CV_Assert(convolution_order == dilations.size());
|
||||
CV_Assert(convolution_order == strides.size());
|
||||
|
||||
const auto& input_shape = config.input_shape;
|
||||
const auto& output_shape = config.output_shape;
|
||||
CV_Assert(input_shape.size() == output_shape.size());
|
||||
CV_Assert(input_shape.size() == convolution_order + 2);
|
||||
|
||||
const auto groups = config.groups;
|
||||
|
||||
CV_Assert (1 <= convolution_order && convolution_order <= 3);
|
||||
|
||||
const auto rank = input_shape.size();
|
||||
const auto output_feature_maps = output_shape[1];
|
||||
const auto input_feature_maps = input_shape[1];
|
||||
const auto input_feature_maps_per_group = input_feature_maps / groups;
|
||||
CV_Assert(input_feature_maps % groups == 0);
|
||||
|
||||
filtersTensor = csl::makeTensorHeader<T>(filters);
|
||||
csl::copyMatToTensor<T>(filters, filtersTensor, stream);
|
||||
|
||||
if (!bias.empty())
|
||||
{
|
||||
biasTensor = csl::makeTensorHeader<T>(bias);
|
||||
csl::copyMatToTensor<T>(bias, biasTensor, stream);
|
||||
}
|
||||
|
||||
/* left and right are misleading as the padding is applicable for any number of dimensions
|
||||
* but we use those identifiers to avoid confusion with `pads_begin` and `pads_end`
|
||||
*
|
||||
* `common_padding` contains the amount of padding that has to be added to both sides
|
||||
* `padding_left` and `padding_right` contains the amount of padding that needs to be added
|
||||
* to a particular side in addition to the common padding
|
||||
*/
|
||||
std::vector<std::size_t> common_padding(rank, 0);
|
||||
std::vector<std::size_t> padding_left(rank, 0), padding_right(rank, 0);
|
||||
if (config.padMode == ConvolutionConfiguration::PaddingMode::MANUAL)
|
||||
{
|
||||
const auto& pads_begin = config.pads_begin;
|
||||
const auto& pads_end = config.pads_end;
|
||||
|
||||
CV_Assert(convolution_order == pads_begin.size());
|
||||
CV_Assert(convolution_order == pads_end.size());
|
||||
|
||||
for (int i = 2; i < common_padding.size(); i++)
|
||||
{
|
||||
common_padding[i] = std::min(pads_begin[i - 2], pads_end[i - 2]);
|
||||
padding_left[i] = pads_begin[i - 2] - common_padding[i];
|
||||
padding_right[i] = pads_end[i - 2] - common_padding[i];
|
||||
}
|
||||
}
|
||||
else if (config.padMode == ConvolutionConfiguration::PaddingMode::VALID)
|
||||
{
|
||||
/* nothing to do as the paddings are already preset to zero */
|
||||
}
|
||||
else if (config.padMode == ConvolutionConfiguration::PaddingMode::SAME)
|
||||
{
|
||||
/* TensorFlow Logic:
|
||||
* total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i]
|
||||
*
|
||||
* if total padding is odd, the extra is added towards the end
|
||||
*/
|
||||
for (int i = 2; i < rank; i++)
|
||||
{
|
||||
const auto j = i - 2; /* filter index */
|
||||
const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
|
||||
const auto required_total_padding =
|
||||
std::max<std::int64_t>(0, (output_shape[i] - 1) * strides[j] + effective_kernel_size - input_shape[i]);
|
||||
|
||||
common_padding[i] = required_total_padding / 2;
|
||||
padding_left[i] = 0;
|
||||
padding_right[i] = required_total_padding % 2;
|
||||
}
|
||||
}
|
||||
|
||||
/* in some scenarios, the extra padding at the end may not change the output at all */
|
||||
for (int i = 2; i < rank; i++) {
|
||||
const auto j = i - 2; /* filter idx */
|
||||
const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i];
|
||||
const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
|
||||
std::int64_t rem = (input_shape[i] + total_padding - effective_kernel_size) % strides[j];
|
||||
|
||||
/* the output shape doesn't change if we decrease the total padding by at most `rem`
|
||||
* provided that we decrease from the right
|
||||
*/
|
||||
if (rem && padding_right[i] > 0)
|
||||
padding_right[i] = std::max<std::int64_t>(0, padding_right[i] - rem);
|
||||
}
|
||||
|
||||
auto is_not_zero = [](std::size_t i) { return i != 0; };
|
||||
if(std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) ||
|
||||
std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero))
|
||||
{
|
||||
/* csl::Convolution supports symmetric padding only; hence, we deal with asymmetric padding by
|
||||
* copying the input to a bigger tensor and padding the ends manually
|
||||
*/
|
||||
transformed_shape = input_shape;
|
||||
for (int i = 0; i < rank; i++)
|
||||
transformed_shape[i] += padding_left[i] + padding_right[i];
|
||||
|
||||
inputTransformer = csl::TensorTransform<T>(cudnnHandle, padding_left, padding_right);
|
||||
}
|
||||
|
||||
typename csl::Convolution<T>::params_type params;
|
||||
if (transformed_shape.empty())
|
||||
{
|
||||
params.input_shape.assign(std::begin(input_shape), std::end(input_shape));
|
||||
}
|
||||
else
|
||||
{
|
||||
/* the convolution operation will be seeing the transformed input */
|
||||
params.input_shape.assign(std::begin(transformed_shape), std::end(transformed_shape));
|
||||
}
|
||||
|
||||
auto& fshape = params.filter_shape;
|
||||
fshape.resize(rank);
|
||||
fshape[0] = output_feature_maps;
|
||||
fshape[1] = input_feature_maps_per_group;
|
||||
std::copy(std::begin(kernel_size), std::end(kernel_size), std::begin(fshape) + 2);
|
||||
CV_Assert(fshape.size() == kernel_size.size() + 2);
|
||||
|
||||
params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding));
|
||||
params.stride = strides;
|
||||
params.dilation = dilations;
|
||||
params.groups = config.groups;
|
||||
|
||||
fusion_mode = config.fusion_mode;
|
||||
activation = config.activation_type;
|
||||
relu_negative_slope = config.relu_negative_slope;
|
||||
crelu_floor = config.crelu_floor;
|
||||
crelu_ceil = config.crelu_ceil;
|
||||
power_exp = config.power_exp;
|
||||
power_scale = config.power_scale;
|
||||
power_shift = config.power_shift;
|
||||
|
||||
/* we normally use cuDNN for convolution and perform bias, activation and eltwise ops ourselves
|
||||
* hence, the activation for cuDNN is IDENTITY by default
|
||||
*/
|
||||
fusion_location = InternalFusionLocation::NATIVE; /* i.e. we perform bias, act and eltwise */
|
||||
params.eltwise = false;
|
||||
params.activation_type = csl::Convolution<T>::ActivationType::IDENTITY;
|
||||
|
||||
/* cuDNN can fuse the operations with convolution in some cases; try if it's possible */
|
||||
if (!biasTensor.empty() && 0 &&
|
||||
biasTensor.size() == output_feature_maps && /* cuDNN requirement */
|
||||
activation == ConvolutionConfiguration::ActivationType::RELU && /* cuDNN requirement */
|
||||
relu_negative_slope == 0.0 && /* cuDNN requirement */
|
||||
(fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION || /* act(conv + bias) */
|
||||
fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION) /* act(conv + bias + eltwise) */
|
||||
)
|
||||
{
|
||||
bool do_not_fuse = false;
|
||||
if(std::is_same<T, half>::value)
|
||||
{
|
||||
/* performance degrades if fused with tensor core based convolutions in most cases */
|
||||
int device;
|
||||
CUDA4DNN_CHECK_CUDA(cudaGetDevice(&device));
|
||||
|
||||
int cc_major;
|
||||
CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device));
|
||||
|
||||
if (cc_major >= 7)
|
||||
do_not_fuse = true;
|
||||
}
|
||||
|
||||
if (!do_not_fuse)
|
||||
{
|
||||
fusion_location = InternalFusionLocation::CUDNN;
|
||||
auto bias_shape = std::vector<std::size_t>(rank, 1);
|
||||
bias_shape[1] = output_feature_maps;
|
||||
params.bias_shape = bias_shape;
|
||||
|
||||
if (config.fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION)
|
||||
params.eltwise = true;
|
||||
|
||||
params.activation_type = csl::Convolution<T>::ActivationType::RELU;
|
||||
}
|
||||
}
|
||||
|
||||
convoluter = csl::Convolution<T>(cudnnHandle, params);
|
||||
|
||||
csl::WorkspaceBuilder builder;
|
||||
if (!transformed_shape.empty())
|
||||
{
|
||||
auto& shape = transformed_shape;
|
||||
auto sz = std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<std::size_t>());
|
||||
builder.require<T>(sz);
|
||||
}
|
||||
builder.require(convoluter.get_workspace_size());
|
||||
scratch_mem_in_bytes = builder.required_workspace_size();
|
||||
}
|
||||
|
||||
void forward(
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs,
|
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
|
||||
csl::Workspace& workspace) override
|
||||
{
|
||||
/* input[0] = conv input, input[1] = bias (from fused eltwise layer) */
|
||||
CV_Assert(inputs.size() == 1 || inputs.size() == 2);
|
||||
CV_Assert(outputs.size() == 1);
|
||||
|
||||
csl::WorkspaceAllocator allocator(workspace);
|
||||
|
||||
auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
|
||||
auto input = input_wrapper->getView();
|
||||
|
||||
if (!transformed_shape.empty())
|
||||
{
|
||||
auto& shape = transformed_shape;
|
||||
auto transformed_input = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
|
||||
inputTransformer.transform(input, transformed_input);
|
||||
input = transformed_input;
|
||||
}
|
||||
|
||||
auto conv_scratchpad = allocator.get_instance();
|
||||
|
||||
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
|
||||
auto output = output_wrapper->getSpan();
|
||||
|
||||
if (fusion_location == InternalFusionLocation::CUDNN)
|
||||
{
|
||||
try
|
||||
{
|
||||
if (fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION)
|
||||
convoluter.convolve_with_bias_activation(output, input, filtersTensor, biasTensor, conv_scratchpad);
|
||||
else if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION)
|
||||
{
|
||||
auto eltwise_wrapper = inputs[1].dynamicCast<wrapper_type>();
|
||||
auto eltwise = eltwise_wrapper->getView();
|
||||
CV_Assert(is_shape_same(eltwise, output));
|
||||
|
||||
convoluter.convolve_with_bias_eltwise_activation(output, input, filtersTensor, biasTensor, eltwise, conv_scratchpad);
|
||||
}
|
||||
}
|
||||
catch(const csl::cudnn::cuDNNException& ex)
|
||||
{
|
||||
if (ex.getCUDNNStatus() == CUDNN_STATUS_NOT_SUPPORTED)
|
||||
{
|
||||
/* drop cuDNN fusion and use the native fusion path */
|
||||
fusion_location = InternalFusionLocation::NATIVE;
|
||||
}
|
||||
else
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
if (fusion_location == InternalFusionLocation::NATIVE)
|
||||
{
|
||||
convoluter.convolve(output, input, filtersTensor, conv_scratchpad);
|
||||
|
||||
if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM ||
|
||||
fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION ||
|
||||
fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM)
|
||||
{
|
||||
CV_Assert(inputs.size() == 2);
|
||||
}
|
||||
|
||||
if (!biasTensor.empty() && inputs.size() == 2)
|
||||
{
|
||||
/* bias and eltwise */
|
||||
CV_Assert(fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM ||
|
||||
fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION ||
|
||||
fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM);
|
||||
|
||||
auto eltwise_wrapper = inputs[1].dynamicCast<wrapper_type>();
|
||||
auto eltwise = eltwise_wrapper->getView();
|
||||
CV_Assert(is_shape_same(eltwise, output));
|
||||
|
||||
std::size_t inner_size = output.size_range(2, output.rank());
|
||||
|
||||
if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM)
|
||||
{
|
||||
kernels::biasN_eltwise_sum_2_identity_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
|
||||
}
|
||||
else if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION)
|
||||
{
|
||||
/* activation(conv + bias + eltwise) */
|
||||
switch (activation)
|
||||
{
|
||||
case ConvolutionConfiguration::ActivationType::IDENTITY:
|
||||
kernels::biasN_eltwise_sum_2_identity_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::RELU:
|
||||
kernels::biasN_eltwise_sum_2_relu_inplace<T>(stream, output, inner_size, biasTensor, eltwise, relu_negative_slope);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
|
||||
kernels::biasN_eltwise_sum_2_clipped_relu_inplace<T>(stream, output, inner_size, biasTensor, eltwise, crelu_floor, crelu_ceil);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::POWER:
|
||||
kernels::biasN_eltwise_sum_2_power_inplace<T>(stream, output, inner_size, biasTensor, eltwise, power_exp, power_scale, power_shift);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::TANH:
|
||||
kernels::biasN_eltwise_sum_2_tanh_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::SIGMOID:
|
||||
kernels::biasN_eltwise_sum_2_sigmoid_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::SWISH:
|
||||
kernels::biasN_eltwise_sum_2_swish_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::MISH:
|
||||
kernels::biasN_eltwise_sum_2_mish_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM)
|
||||
{
|
||||
/* activation(conv + bias) + eltwise */
|
||||
switch (activation)
|
||||
{
|
||||
case ConvolutionConfiguration::ActivationType::IDENTITY:
|
||||
kernels::biasN_eltwise_sum_2_identity_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::RELU:
|
||||
kernels::biasN_relu_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise, relu_negative_slope);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
|
||||
kernels::biasN_clipped_relu_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise, crelu_floor, crelu_ceil);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::POWER:
|
||||
kernels::biasN_power_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise, power_exp, power_scale, power_shift);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::TANH:
|
||||
kernels::biasN_tanh_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::SIGMOID:
|
||||
kernels::biasN_sigmoid_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::SWISH:
|
||||
kernels::biasN_swish_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::MISH:
|
||||
kernels::biasN_mish_eltwise_sum_2_inplace<T>(stream, output, inner_size, biasTensor, eltwise);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (!biasTensor.empty() && inputs.size() == 1)
|
||||
{
|
||||
/* bias but no eltwise */
|
||||
CV_Assert(fusion_mode == ConvolutionConfiguration::FusionMode::NONE ||
|
||||
fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION);
|
||||
|
||||
std::size_t inner_size = output.size_range(2, output.rank());
|
||||
switch(activation)
|
||||
{
|
||||
case ConvolutionConfiguration::ActivationType::IDENTITY:
|
||||
kernels::biasN<T>(stream, output, output, inner_size, biasTensor);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::RELU:
|
||||
kernels::biasN_relu_inplace<T>(stream, output, inner_size, biasTensor, relu_negative_slope);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
|
||||
kernels::biasN_clipped_relu_inplace<T>(stream, output, inner_size, biasTensor, crelu_floor, crelu_ceil);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::POWER:
|
||||
kernels::biasN_power_inplace<T>(stream, output, inner_size, biasTensor, power_exp, power_scale, power_shift);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::TANH:
|
||||
kernels::biasN_tanh_inplace<T>(stream, output, inner_size, biasTensor);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::SIGMOID:
|
||||
kernels::biasN_sigmoid_inplace<T>(stream, output, inner_size, biasTensor);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::SWISH:
|
||||
kernels::biasN_swish_inplace<T>(stream, output, inner_size, biasTensor);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::MISH:
|
||||
kernels::biasN_mish_inplace<T>(stream, output, inner_size, biasTensor);
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (biasTensor.empty() && inputs.size() == 2)
|
||||
{
|
||||
/* no bias but eltwise */
|
||||
CV_Assert(fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM ||
|
||||
fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION ||
|
||||
fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM);
|
||||
|
||||
auto eltwise_wrapper = inputs[1].dynamicCast<wrapper_type>();
|
||||
auto eltwise = eltwise_wrapper->getView();
|
||||
CV_Assert(is_shape_same(eltwise, output));
|
||||
|
||||
/* we pass `eltwise` as `bias` (with `inner_size` as one) to bias-activation kernels */
|
||||
|
||||
if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM)
|
||||
{
|
||||
kernels::eltwise_sum_2<T>(stream, output, output, eltwise);
|
||||
}
|
||||
else if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION)
|
||||
{
|
||||
switch (activation)
|
||||
{
|
||||
case ConvolutionConfiguration::ActivationType::IDENTITY:
|
||||
kernels::eltwise_sum_2<T>(stream, output, output, eltwise);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::RELU:
|
||||
kernels::eltwise_sum_2_relu<T>(stream, output, output, eltwise, relu_negative_slope);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
|
||||
kernels::eltwise_sum_2_clipped_relu<T>(stream, output, output, eltwise, crelu_floor, crelu_ceil);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::POWER:
|
||||
kernels::eltwise_sum_2_power<T>(stream, output, output, eltwise, power_exp, power_scale, power_shift);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::TANH:
|
||||
kernels::eltwise_sum_2_tanh<T>(stream, output, output, eltwise);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::SIGMOID:
|
||||
kernels::eltwise_sum_2_sigmoid<T>(stream, output, output, eltwise);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::SWISH:
|
||||
kernels::eltwise_sum_2_swish<T>(stream, output, output, eltwise);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::MISH:
|
||||
kernels::eltwise_sum_2_mish<T>(stream, output, output, eltwise);
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM)
|
||||
{
|
||||
switch (activation)
|
||||
{
|
||||
case ConvolutionConfiguration::ActivationType::IDENTITY:
|
||||
kernels::eltwise_sum_2<T>(stream, output, output, eltwise);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::RELU:
|
||||
kernels::relu_eltwise_sum_2_inplace<T>(stream, output, eltwise, relu_negative_slope);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
|
||||
kernels::clipped_relu_eltwise_sum_2_inplace<T>(stream, output, eltwise, crelu_floor, crelu_ceil);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::POWER:
|
||||
kernels::power_eltwise_sum_2_inplace<T>(stream, output, eltwise, power_exp, power_scale, power_shift);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::TANH:
|
||||
kernels::tanh_eltwise_sum_2_inplace<T>(stream, output, eltwise);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::SIGMOID:
|
||||
kernels::sigmoid_eltwise_sum_2_inplace<T>(stream, output, eltwise);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::SWISH:
|
||||
kernels::swish_eltwise_sum_2_inplace<T>(stream, output, eltwise);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::MISH:
|
||||
kernels::mish_eltwise_sum_2_inplace<T>(stream, output, eltwise);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(biasTensor.empty() && inputs.size() == 1)
|
||||
{
|
||||
/* no bias and no eltwise */
|
||||
CV_Assert(fusion_mode == ConvolutionConfiguration::FusionMode::NONE ||
|
||||
fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION);
|
||||
|
||||
switch(activation)
|
||||
{
|
||||
case ConvolutionConfiguration::ActivationType::IDENTITY:
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::RELU:
|
||||
kernels::relu<T>(stream, output, output, relu_negative_slope);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
|
||||
kernels::clipped_relu<T>(stream, output, output, crelu_floor, crelu_ceil);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::POWER:
|
||||
kernels::power<T>(stream, output, output, power_exp, power_scale, power_shift);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::TANH:
|
||||
kernels::tanh<T>(stream, output, output);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::SIGMOID:
|
||||
kernels::sigmoid<T>(stream, output, output);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::SWISH:
|
||||
kernels::swish<T>(stream, output, output);
|
||||
break;
|
||||
case ConvolutionConfiguration::ActivationType::MISH:
|
||||
kernels::mish<T>(stream, output, output);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }
|
||||
|
||||
private:
|
||||
csl::Stream stream;
|
||||
csl::cudnn::Handle cudnnHandle;
|
||||
csl::Tensor<T> filtersTensor, biasTensor;
|
||||
csl::Convolution<T> convoluter;
|
||||
|
||||
std::vector<std::size_t> transformed_shape;
|
||||
csl::TensorTransform<T> inputTransformer;
|
||||
|
||||
std::size_t scratch_mem_in_bytes;
|
||||
|
||||
ConvolutionConfiguration::FusionMode fusion_mode;
|
||||
ConvolutionConfiguration::ActivationType activation;
|
||||
float relu_negative_slope, crelu_floor, crelu_ceil;
|
||||
float power_exp, power_scale, power_shift;
|
||||
|
||||
enum class InternalFusionLocation {
|
||||
CUDNN,
|
||||
NATIVE
|
||||
} fusion_location;
|
||||
};
|
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */
|
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP */
|
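The SAME-padding branch above implements the formula quoted in its comment: total_padding = max(0, (out - 1) * stride + effective_kernel - in), with effective_kernel = dilation * (kernel - 1) + 1 and any odd remainder pushed to the trailing side. A minimal standalone sketch of that arithmetic for one spatial axis is given below; `same_padding` and `AxisPadding` are hypothetical names introduced only for this illustration and are not part of the OpenCV sources.

#include <algorithm>
#include <cstdint>

// TensorFlow-style SAME padding for one spatial axis, as described in the comment above.
struct AxisPadding { std::int64_t begin, end; };

AxisPadding same_padding(std::int64_t in, std::int64_t out, std::int64_t kernel,
                         std::int64_t stride, std::int64_t dilation)
{
    const auto effective_kernel = dilation * (kernel - 1) + 1;
    const auto total = std::max<std::int64_t>(0, (out - 1) * stride + effective_kernel - in);
    return { total / 2, total / 2 + total % 2 }; // odd total: the extra element goes to the end
}

For example, with in = 5, out = 3, kernel = 3, stride = 2 and dilation = 1, the effective kernel is 3 and the total padding is max(0, 2*2 + 3 - 5) = 2, split as begin = 1, end = 1; an odd total such as 3 would instead split as begin = 1, end = 2.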
51
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/crop_and_resize.hpp
vendored
Normal file
@ -0,0 +1,51 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CROP_AND_RESIZE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CROP_AND_RESIZE_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include "../kernels/crop_and_resize.hpp"

#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    template <class T>
    class CropAndResizeOp final : public CUDABackendNode {
    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        CropAndResizeOp(csl::Stream stream_) : stream(std::move(stream_)) { }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(inputs.size() == 2 && outputs.size() == 1);

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            auto box_wrapper = inputs[1].dynamicCast<wrapper_type>();
            auto boxes = box_wrapper->getView();

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            kernels::crop_and_resize(stream, output, input, static_cast<csl::View<T>>(boxes));
        }

    private:
        csl::Stream stream;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CROP_AND_RESIZE_HPP */
282
3rdparty/opencv-4.5.4/modules/dnn/src/cuda4dnn/primitives/detection_output.hpp
vendored
Normal file
@ -0,0 +1,282 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_DETECTION_OUTPUT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_DETECTION_OUTPUT_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include "../kernels/fill_copy.hpp"
#include "../kernels/permute.hpp"
#include "../kernels/detection_output.hpp"
#include "../kernels/grid_nms.hpp"

#include <cstddef>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    struct DetectionOutputConfiguration {
        std::size_t batch_size;

        enum class CodeType {
            CORNER,
            CENTER_SIZE
        };
        CodeType code_type;

        bool share_location;
        std::size_t num_priors;
        std::size_t num_classes;
        std::size_t background_class_id;

        bool transpose_location;
        bool variance_encoded_in_target;
        bool normalized_bbox;
        bool clip_box;

        std::size_t classwise_topK;
        float confidence_threshold;
        float nms_threshold;

        int keepTopK;
    };

    template <class T>
    class DetectionOutputOp final : public CUDABackendNode {
    private:
        /* We have block level NMS kernel where each block handles one class of one batch item.
         * If the number of classes and batch size together is very low, the blockwise NMS kernel
         * won't able to fully saturate the GPU with work.
         *
         * We also have a grid level NMS kernel where multiple blocks handle each class of every batch item.
         * This performs better in the worst case and utilizes resources better when block level kernel isn't
         * able to saturate the GPU with enough work. However, this is not efficient in the average case where
         * the block level kernel is able to saturate the GPU. It does better when the blockwise NMS barely
         * saturates the GPU.
         *
         * `GRID_NMS_CUTOFF` is the cutoff for `num_classes * batch_size` above which we will switch from grid
         * level NMS to block level NMS.
         */
        static constexpr int GRID_NMS_CUTOFF = 32;

    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        DetectionOutputOp(csl::Stream stream_, const DetectionOutputConfiguration& config)
            : stream(std::move(stream_))
        {
            corner_true_or_center_false = (config.code_type == DetectionOutputConfiguration::CodeType::CORNER);

            share_location = config.share_location;
            num_priors = config.num_priors;
            num_classes = config.num_classes;
            background_class_id = config.background_class_id;

            transpose_location = config.transpose_location;
            variance_encoded_in_target = config.variance_encoded_in_target;
            normalized_bbox = config.normalized_bbox;
            clip_box = config.clip_box;

            classwise_topK = config.classwise_topK;
            confidence_threshold = config.confidence_threshold;
            nms_threshold = config.nms_threshold;

            keepTopK = config.keepTopK;
            CV_Assert(keepTopK > 0);

            if (classwise_topK == -1)
            {
                classwise_topK = num_priors;
                if (keepTopK > 0 && keepTopK < num_priors)
                    classwise_topK = keepTopK;
            }

            auto batch_size = config.batch_size;
            auto num_loc_classes = (share_location ? 1 : num_classes);

            csl::WorkspaceBuilder builder;
            builder.require<T>(batch_size * num_priors * num_loc_classes * 4); /* decoded boxes */
            builder.require<T>(batch_size * num_classes * num_priors); /* transposed scores */
            builder.require<int>(batch_size * num_classes * classwise_topK); /* indices */
            builder.require<int>(batch_size * num_classes); /* classwise topK count */
            builder.require<T>(batch_size * num_classes * classwise_topK * 4); /* topK decoded boxes */

            if (batch_size * num_classes <= GRID_NMS_CUTOFF)
            {
                auto workspace_per_batch_item = kernels::getGridNMSWorkspaceSizePerBatchItem(num_classes, classwise_topK);
                builder.require(batch_size * workspace_per_batch_item);
            }

            builder.require<int>(batch_size * keepTopK); /* final kept indices */
            builder.require<int>(batch_size); /* kept indices count */
            builder.require<int>(1); /* total number of detections */

            scratch_mem_in_bytes = builder.required_workspace_size();
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            /* locations, scores and priors make the first three inputs in order */
            /* the 4th input is used to obtain the shape for clipping */
            CV_Assert((inputs.size() == 3 || inputs.size() == 4) && outputs.size() == 1);

            // locations: [batch_size, num_priors, num_loc_classes, 4]
            auto locations_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto locations = locations_wrapper->getView();

            // scores: [batch_size, num_priors, num_classes]
            auto scores_wrapper = inputs[1].dynamicCast<wrapper_type>();
            auto scores = scores_wrapper->getView();
            scores.unsqueeze();
            scores.reshape(-1, num_priors, num_classes);

            // priors: [1, 2, num_priors, 4]
            auto priors_wrapper = inputs[2].dynamicCast<wrapper_type>();
            auto priors = priors_wrapper->getView();

            // output: [1, 1, batch_size * keepTopK, 7]
            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            auto batch_size = locations.get_axis_size(0);
            auto num_loc_classes = (share_location ? 1 : num_classes);
            while(locations.rank() < 4)
                locations.unsqueeze();
            locations.reshape(batch_size, num_priors, num_loc_classes, 4);

            float clip_width = 0.0, clip_height = 0.0;
            if (clip_box)
            {
                if (normalized_bbox)
                {
                    clip_width = clip_height = 1.0f;
                }
                else
                {
                    auto image_wrapper = inputs[3].dynamicCast<wrapper_type>();
                    auto image_shape = image_wrapper->getShape();

                    CV_Assert(image_shape.size() == 4);
                    clip_width = image_shape[3] - 1;
                    clip_height = image_shape[2] - 1;
                }
            }

            csl::WorkspaceAllocator allocator(workspace);

            // decoded_boxes: [batch_size, num_priors, num_loc_classes, 4]
            csl::TensorSpan<T> decoded_boxes;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_priors, num_loc_classes, 4};
                decoded_boxes = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
                CV_Assert(is_shape_same(decoded_boxes, locations));
            }

            kernels::decode_bboxes<T>(stream, decoded_boxes, locations, priors,
                num_loc_classes, share_location, background_class_id,
                transpose_location, variance_encoded_in_target,
                corner_true_or_center_false, normalized_bbox,
                clip_box, clip_width, clip_height);

            // scores_permuted: [batch_size, num_classes, num_priors]
            csl::TensorSpan<T> scores_permuted;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes, num_priors};
                scores_permuted = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
            }

            kernels::permute<T>(stream, scores_permuted, scores, {0, 2, 1});

            // indices: [batch_size, num_classes, classwise_topK]
            csl::TensorSpan<int> indices;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes, classwise_topK};
                indices = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

            // count: [batch_size, num_classes]
            csl::TensorSpan<int> count;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes};
                count = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

            kernels::findTopK<T>(stream, indices, count, scores_permuted, background_class_id, confidence_threshold);

            // collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
            csl::TensorSpan<T> collected_bboxes;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes, classwise_topK, 4};
                collected_bboxes = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
            }

            kernels::box_collect<T>(stream, collected_bboxes, decoded_boxes, indices, count, share_location, background_class_id);

            if (batch_size * num_classes <= GRID_NMS_CUTOFF)
            {
                auto workspace_per_batch_item = kernels::getGridNMSWorkspaceSizePerBatchItem(num_classes, classwise_topK);
                auto workspace = allocator.get_span<unsigned int>(batch_size * workspace_per_batch_item / sizeof(unsigned int));
                kernels::grid_nms<T>(stream, workspace, indices, count, collected_bboxes, background_class_id, normalized_bbox, nms_threshold);
            }
            else
            {
                kernels::blockwise_class_nms<T>(stream, indices, count, collected_bboxes, normalized_bbox, background_class_id, nms_threshold);
            }

            // kept_indices: [batch_size, keepTopK]
            csl::TensorSpan<int> kept_indices;
            {
                auto shape = std::vector<std::size_t>{batch_size, static_cast<std::size_t>(keepTopK)};
                kept_indices = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

            // kept_count: [batch_size]
            csl::TensorSpan<int> kept_count;
            {
                auto shape = std::vector<std::size_t>{batch_size};
                kept_count = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

            kernels::nms_collect<T>(stream, kept_indices, kept_count, indices, count, scores_permuted, confidence_threshold, background_class_id);

            auto num_detections = allocator.get_span<int>(1);
            kernels::fill<int>(stream, num_detections, 0);
            kernels::fill<T>(stream, output, 0.0);
            kernels::consolidate_detections<T>(stream, output, kept_indices, kept_count, decoded_boxes, scores_permuted, share_location, num_detections.data());
        }

        std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }

    private:
        csl::Stream stream;
        std::size_t scratch_mem_in_bytes;

        bool share_location;
        std::size_t num_priors;
        std::size_t num_classes;
        std::size_t background_class_id;

        bool transpose_location;
        bool variance_encoded_in_target;
        bool corner_true_or_center_false;
        bool normalized_bbox;
        bool clip_box;

        std::size_t classwise_topK;
        float confidence_threshold;
        float nms_threshold;

        int keepTopK;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_DETECTION_OUTPUT_HPP */
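The DetectionOutputOp constructor sizes its scratch workspace by summing one buffer per intermediate stage (decoded boxes, transposed scores, top-K indices and counts, kept indices, and the detection counter). A rough back-of-the-envelope version of that budget is sketched below; `detection_output_workspace_estimate` is a hypothetical helper, it ignores alignment, csl's internal padding and the optional grid-NMS buffer, so it is an estimate of the requirements listed above rather than the exact value returned by `WorkspaceBuilder`.

#include <cstddef>

std::size_t detection_output_workspace_estimate(std::size_t batch_size, std::size_t num_classes,
                                                std::size_t num_priors, std::size_t classwise_topK,
                                                std::size_t keepTopK, bool share_location,
                                                std::size_t elem_size /* sizeof(T) */)
{
    const std::size_t num_loc_classes = share_location ? 1 : num_classes;
    std::size_t bytes = 0;
    bytes += batch_size * num_priors * num_loc_classes * 4 * elem_size;   // decoded boxes
    bytes += batch_size * num_classes * num_priors * elem_size;           // transposed scores
    bytes += batch_size * num_classes * classwise_topK * sizeof(int);     // per-class topK indices
    bytes += batch_size * num_classes * sizeof(int);                      // classwise topK count
    bytes += batch_size * num_classes * classwise_topK * 4 * elem_size;   // topK decoded boxes
    bytes += batch_size * keepTopK * sizeof(int);                         // final kept indices
    bytes += batch_size * sizeof(int);                                    // kept indices count
    bytes += sizeof(int);                                                 // total detection counter
    return bytes;
}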
Some files were not shown because too many files have changed in this diff