feat: switch backend to PaddleOCR-NCNN, switch project to CMake

1. Migrated the entire project backend to the PaddleOCR-NCNN algorithm; it has passed basic compatibility tests
2. The project is now organized with CMake; to better accommodate third-party libraries going forward, a QMake project is no longer provided
3. Reorganized the rights/license declaration files and the code layout to minimize infringement risk

Log: switch backend to PaddleOCR-NCNN, switch project to CMake
Change-Id: I4d5d2c5d37505a4a24b389b1a4c5d12f17bfa38c
wangzhengyang
2022-05-10 09:54:44 +08:00
parent ecdd171c6f
commit 718c41634f
10018 changed files with 3593797 additions and 186748 deletions


@@ -0,0 +1,200 @@
if(WINRT)
ocv_module_disable(dnn)
endif()
if(NOT HAVE_PROTOBUF)
ocv_module_disable(opencv_dnn)
endif()
set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass")
ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV)
ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX)
ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js)
ocv_option(OPENCV_DNN_OPENCL "Build with OpenCL support" HAVE_OPENCL AND NOT APPLE)
if(HAVE_TENGINE)
add_definitions(-DHAVE_TENGINE=1)
endif()
if(OPENCV_DNN_OPENCL AND HAVE_OPENCL)
add_definitions(-DCV_OCL4DNN=1)
endif()
ocv_option(OPENCV_DNN_CUDA "Build with CUDA support"
HAVE_CUDA
AND HAVE_CUBLAS
AND HAVE_CUDNN
)
if(OPENCV_DNN_CUDA)
if(HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
add_definitions(-DCV_CUDA4DNN=1)
else()
if(NOT HAVE_CUDA)
message(SEND_ERROR "DNN: CUDA backend requires CUDA Toolkit. Please resolve dependency or disable OPENCV_DNN_CUDA=OFF")
elseif(NOT HAVE_CUBLAS)
message(SEND_ERROR "DNN: CUDA backend requires cuBLAS. Please resolve dependency or disable OPENCV_DNN_CUDA=OFF")
elseif(NOT HAVE_CUDNN)
message(SEND_ERROR "DNN: CUDA backend requires cuDNN. Please resolve dependency or disable OPENCV_DNN_CUDA=OFF")
endif()
endif()
endif()
ocv_cmake_hook_append(INIT_MODULE_SOURCES_opencv_dnn "${CMAKE_CURRENT_LIST_DIR}/cmake/hooks/INIT_MODULE_SOURCES_opencv_dnn.cmake")
if(MSVC)
add_definitions( -D_CRT_SECURE_NO_WARNINGS=1 )
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4244 /wd4267 /wd4018 /wd4355 /wd4800 /wd4251 /wd4996 /wd4146
/wd4305 /wd4127 /wd4100 /wd4512 /wd4125 /wd4389 /wd4510 /wd4610
/wd4702 /wd4456 /wd4457 /wd4065 /wd4310 /wd4661 /wd4506
)
else()
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-deprecated -Wmissing-prototypes -Wmissing-declarations -Wshadow
-Wunused-parameter -Wsign-compare
)
endif()
if(HAVE_CUDA)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
endif()
if(NOT HAVE_CXX11)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-undef) # LANG_CXX11 from protobuf files
endif()
if(APPLE_FRAMEWORK)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshorten-64-to-32)
endif()
if(ANDROID)
add_definitions(-DDISABLE_POSIX_MEMALIGN -DTH_DISABLE_HEAP_TRACKING)
endif()
if(NOT BUILD_PROTOBUF)
add_definitions(-DOPENCV_DNN_EXTERNAL_PROTOBUF=1)
endif()
add_definitions(-DHAVE_PROTOBUF=1)
#suppress warnings in autogenerated caffe.pb.* files
ocv_warnings_disable(CMAKE_CXX_FLAGS
/wd4125 /wd4267 /wd4127 /wd4244 /wd4512 /wd4702
/wd4456 /wd4510 /wd4610 /wd4800
/wd4701 /wd4703 # potentially uninitialized local/pointer variable 'value' used
/wd4505 # unreferenced local function has been removed
/wd4458 # declaration of 'x' hides class member. GCC still works, MSVC bug is here: https://developercommunity.visualstudio.com/content/problem/219311/c-c4458-declaration-hides-class-member-warning-iss.html
-wd858 -wd2196
-Winvalid-offsetof # Apple Clang (attr_value.pb.cc)
)
set(include_dirs "")
set(libs "")
if(PROTOBUF_UPDATE_FILES)
file(GLOB proto_files "${CMAKE_CURRENT_LIST_DIR}/src/tensorflow/*.proto" "${CMAKE_CURRENT_LIST_DIR}/src/caffe/opencv-caffe.proto" "${CMAKE_CURRENT_LIST_DIR}/src/onnx/opencv-onnx.proto")
set(PROTOBUF_GENERATE_CPP_APPEND_PATH ON) # required for tensorflow
protobuf_generate_cpp(fw_srcs fw_hdrs ${proto_files})
else()
file(GLOB fw_srcs "${CMAKE_CURRENT_LIST_DIR}/misc/tensorflow/*.cc" "${CMAKE_CURRENT_LIST_DIR}/misc/caffe/opencv-caffe.pb.cc" "${CMAKE_CURRENT_LIST_DIR}/misc/onnx/opencv-onnx.pb.cc")
file(GLOB fw_hdrs "${CMAKE_CURRENT_LIST_DIR}/misc/tensorflow/*.h" "${CMAKE_CURRENT_LIST_DIR}/misc/caffe/opencv-caffe.pb.h" "${CMAKE_CURRENT_LIST_DIR}/misc/onnx/opencv-onnx.pb.h")
set(fw_inc "${CMAKE_CURRENT_LIST_DIR}/misc/caffe" "${CMAKE_CURRENT_LIST_DIR}/misc/tensorflow" "${CMAKE_CURRENT_LIST_DIR}/misc/onnx")
endif()
list(APPEND include_dirs ${fw_inc})
list(APPEND libs ${Protobuf_LIBRARIES})
if(NOT BUILD_PROTOBUF)
list(APPEND include_dirs ${Protobuf_INCLUDE_DIRS})
endif()
set(sources_options "")
list(APPEND libs ${LAPACK_LIBRARIES})
if(OPENCV_DNN_OPENCL AND HAVE_OPENCL)
list(APPEND include_dirs ${OPENCL_INCLUDE_DIRS})
else()
set(sources_options EXCLUDE_OPENCL)
endif()
if(OPENCV_DNN_CUDA AND HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
list(APPEND include_dirs ${CUDA_TOOLKIT_INCLUDE} ${CUDNN_INCLUDE_DIRS})
set(CC_LIST ${CUDA_ARCH_BIN})
separate_arguments(CC_LIST)
foreach(cc ${CC_LIST})
if(cc VERSION_LESS 3.0)
message(FATAL_ERROR "CUDA backend for DNN module requires CC 3.0 or higher. Please remove unsupported architectures from CUDA_ARCH_BIN option or disable OPENCV_DNN_CUDA=OFF.")
endif()
endforeach()
unset(CC_LIST)
else()
set(sources_options ${sources_options} EXCLUDE_CUDA)
endif()
if(HAVE_TENGINE)
list(APPEND include_dirs ${TENGINE_INCLUDE_DIRS})
list(APPEND libs -Wl,--whole-archive ${TENGINE_LIBRARIES} -Wl,--no-whole-archive)
endif()
ocv_module_include_directories(${include_dirs})
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
ocv_append_source_files_cxx_compiler_options(fw_srcs "-Wno-suggest-override") # GCC
ocv_append_source_files_cxx_compiler_options(fw_srcs "-Wno-array-bounds") # GCC 9.3.0 (Ubuntu 20.04)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
ocv_append_source_files_cxx_compiler_options(fw_srcs "-Wno-inconsistent-missing-override") # Clang
endif()
set(dnn_runtime_libs "")
if(INF_ENGINE_TARGET)
set(use_nn_builder OFF)
if(TARGET inference_engine_nn_builder OR # custom imported target
TARGET IE::inference_engine_nn_builder OR # default imported target via InferenceEngineConfig.cmake
INF_ENGINE_RELEASE VERSION_LESS "2020000000") # compatibility with older versions on IE
set(use_nn_builder ON)
endif()
ocv_option(OPENCV_DNN_IE_NN_BUILDER_2019 "Build with Inference Engine NN Builder API support" ${use_nn_builder}) # future: NOT HAVE_NGRAPH
if(OPENCV_DNN_IE_NN_BUILDER_2019)
message(STATUS "DNN: Enabling Inference Engine NN Builder API support")
add_definitions(-DHAVE_DNN_IE_NN_BUILDER_2019=1)
endif()
list(APPEND dnn_runtime_libs ${INF_ENGINE_TARGET})
endif()
if(HAVE_NGRAPH)
message(STATUS "DNN: Enabling Inference Engine nGraph API support")
add_definitions(-DHAVE_DNN_NGRAPH)
list(APPEND dnn_runtime_libs ngraph::ngraph)
endif()
ocv_glob_module_sources(${sources_options} SOURCES ${fw_srcs})
ocv_create_module(${libs} ${dnn_runtime_libs})
ocv_add_samples()
ocv_add_accuracy_tests(${dnn_runtime_libs})
set(perf_path "${CMAKE_CURRENT_LIST_DIR}/perf")
file(GLOB_RECURSE perf_srcs "${perf_path}/*.cpp")
file(GLOB_RECURSE perf_hdrs "${perf_path}/*.hpp" "${perf_path}/*.h")
ocv_add_perf_tests(${INF_ENGINE_TARGET}
FILES test_common "${CMAKE_CURRENT_LIST_DIR}/test/test_common.hpp" "${CMAKE_CURRENT_LIST_DIR}/test/test_common.impl.hpp"
FILES Src ${perf_srcs}
FILES Include ${perf_hdrs}
)
ocv_option(OPENCV_DNN_PERF_CAFFE "Add performance tests of Caffe framework" OFF)
ocv_option(OPENCV_DNN_PERF_CLCAFFE "Add performance tests of clCaffe framework" OFF)
if(BUILD_PERF_TESTS)
if (OPENCV_DNN_PERF_CAFFE
OR ${the_module}_PERF_CAFFE # compatibility for deprecated option
)
find_package(Caffe QUIET)
if (Caffe_FOUND)
add_definitions(-DHAVE_CAFFE=1)
ocv_target_link_libraries(opencv_perf_dnn caffe)
endif()
elseif(OPENCV_DNN_PERF_CLCAFFE
OR ${the_module}_PERF_CAFFE # compatibility for deprecated option
)
find_package(Caffe QUIET)
if (Caffe_FOUND)
add_definitions(-DHAVE_CLCAFFE=1)
ocv_target_link_libraries(opencv_perf_dnn caffe)
endif()
endif()
endif()


@@ -0,0 +1,11 @@
if(NOT (OPENCV_DNN_OPENCL AND HAVE_OPENCL))
message(STATUS "opencv_dnn: filter out ocl4dnn source code")
ocv_list_filterout(OPENCV_MODULE_${the_module}_SOURCES "/ocl4dnn/")
ocv_list_filterout(OPENCV_MODULE_${the_module}_HEADERS "/ocl4dnn/")
endif()
if(NOT (OPENCV_DNN_CUDA AND HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN))
message(STATUS "opencv_dnn: filter out cuda4dnn source code")
ocv_list_filterout(OPENCV_MODULE_${the_module}_SOURCES "/cuda4dnn/")
ocv_list_filterout(OPENCV_MODULE_${the_module}_HEADERS "/cuda4dnn/")
endif()


@@ -0,0 +1,78 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_DNN_HPP
#define OPENCV_DNN_HPP
// This is an umbrella header to include into your project.
// We are free to change the header layout in the dnn subfolder, so please include
// this header for future compatibility
/** @defgroup dnn Deep Neural Network module
@{
This module contains:
- an API for creating new layers, the building bricks of neural networks;
- a set of the most useful built-in Layers;
- an API to construct and modify comprehensive neural networks from layers;
- functionality for loading serialized network models from different frameworks.
The functionality of this module is designed only for forward pass computations (i.e. network testing).
Network training is in principle not supported.
@}
*/
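/** A minimal usage sketch (added for illustration; the model and image file names are
 *  placeholder assumptions, not part of this header):
 *  @code
 *  #include <opencv2/dnn.hpp>
 *  #include <opencv2/imgcodecs.hpp>
 *
 *  cv::dnn::Net net = cv::dnn::readNet("model.onnx");            // load a serialized model
 *  cv::Mat blob = cv::dnn::blobFromImage(cv::imread("input.jpg"),
 *                                        1.0 / 255.0, cv::Size(224, 224));
 *  net.setInput(blob);
 *  cv::Mat prob = net.forward();                                  // forward pass only
 *  @endcode
 */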
/** @example samples/dnn/classification.cpp
Check @ref tutorial_dnn_googlenet "the corresponding tutorial" for more details
*/
/** @example samples/dnn/colorization.cpp
*/
/** @example samples/dnn/object_detection.cpp
Check @ref tutorial_dnn_yolo "the corresponding tutorial" for more details
*/
/** @example samples/dnn/openpose.cpp
*/
/** @example samples/dnn/segmentation.cpp
*/
/** @example samples/dnn/text_detection.cpp
*/
#include <opencv2/dnn/dnn.hpp>
#endif /* OPENCV_DNN_HPP */


@@ -0,0 +1,832 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_DNN_DNN_ALL_LAYERS_HPP
#define OPENCV_DNN_DNN_ALL_LAYERS_HPP
#include <opencv2/dnn.hpp>
namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN
//! @addtogroup dnn
//! @{
/** @defgroup dnnLayerList Partial List of Implemented Layers
@{
This subsection of the dnn module contains information about the built-in layers and their descriptions.
The classes listed here provide the C++ API for creating instances of built-in layers.
In addition to this way of instantiating layers, there is a more general factory API (see @ref dnnLayerFactory) that allows creating layers dynamically (by name) and registering new ones.
You can use both APIs, but the factory API is less convenient for native C++ programming and is mainly designed for use inside importers (see @ref readNetFromCaffe(), @ref readNetFromTorch(), @ref readNetFromTensorflow()).
Built-in layers partially reproduce the functionality of the corresponding Caffe and Torch7 layers.
In particular, the following layers and the Caffe importer were tested to reproduce <a href="http://caffe.berkeleyvision.org/tutorial/layers.html">Caffe</a> functionality:
- Convolution
- Deconvolution
- Pooling
- InnerProduct
- TanH, ReLU, Sigmoid, BNLL, Power, AbsVal
- Softmax
- Reshape, Flatten, Slice, Split
- LRN
- MVN
- Dropout (since it does nothing on the forward pass)
*/
class CV_EXPORTS BlankLayer : public Layer
{
public:
static Ptr<Layer> create(const LayerParams &params);
};
/**
* Constant layer produces the same data blob at every forward pass.
*/
class CV_EXPORTS ConstLayer : public Layer
{
public:
static Ptr<Layer> create(const LayerParams &params);
};
//! LSTM recurrent layer
class CV_EXPORTS LSTMLayer : public Layer
{
public:
/** Creates instance of LSTM layer */
static Ptr<LSTMLayer> create(const LayerParams& params);
/** @deprecated Use LayerParams::blobs instead.
@brief Set trained weights for LSTM layer.
LSTM behavior at each step is defined by the current input, previous output, previous cell state and learned weights.
Let @f$x_t@f$ be the current input, @f$h_t@f$ the current output, and @f$c_t@f$ the current state.
Then the current output and current cell state are computed as follows:
@f{eqnarray*}{
h_t &= o_t \odot tanh(c_t), \\
c_t &= f_t \odot c_{t-1} + i_t \odot g_t, \\
@f}
where @f$\odot@f$ is the per-element multiplication and @f$i_t, f_t, o_t, g_t@f$ are internal gates computed using learned weights.
Gates are computed as follows:
@f{eqnarray*}{
i_t &= sigmoid&(W_{xi} x_t + W_{hi} h_{t-1} + b_i), \\
f_t &= sigmoid&(W_{xf} x_t + W_{hf} h_{t-1} + b_f), \\
o_t &= sigmoid&(W_{xo} x_t + W_{ho} h_{t-1} + b_o), \\
g_t &= tanh &(W_{xg} x_t + W_{hg} h_{t-1} + b_g), \\
@f}
where @f$W_{x?}@f$, @f$W_{h?}@f$ and @f$b_{?}@f$ are learned weights represented as matrices:
@f$W_{x?} \in R^{N_h \times N_x}@f$, @f$W_{h?} \in R^{N_h \times N_h}@f$, @f$b_? \in R^{N_h}@f$.
For simplicity and performance purposes we use @f$ W_x = [W_{xi}; W_{xf}; W_{xo}, W_{xg}] @f$
(i.e. @f$W_x@f$ is vertical concatenation of @f$ W_{x?} @f$), @f$ W_x \in R^{4N_h \times N_x} @f$.
The same for @f$ W_h = [W_{hi}; W_{hf}; W_{ho}, W_{hg}], W_h \in R^{4N_h \times N_h} @f$
and for @f$ b = [b_i; b_f, b_o, b_g]@f$, @f$b \in R^{4N_h} @f$.
@param Wh is the matrix defining how the previous output is transformed to the internal gates (i.e. @f$ W_h @f$ in the notation above)
@param Wx is the matrix defining how the current input is transformed to the internal gates (i.e. @f$ W_x @f$ in the notation above)
@param b is the bias vector (i.e. @f$ b @f$ in the notation above)
*/
CV_DEPRECATED virtual void setWeights(const Mat &Wh, const Mat &Wx, const Mat &b) = 0;
/** @brief Specifies shape of output blob which will be [[`T`], `N`] + @p outTailShape.
* @details If this parameter is empty or unset then @p outTailShape = [`Wh`.size(0)] will be used,
* where `Wh` is parameter from setWeights().
*/
virtual void setOutShape(const MatShape &outTailShape = MatShape()) = 0;
/** @deprecated Use flag `use_timestamp_dim` in LayerParams.
* @brief Specifies whether to interpret the first dimension of the input blob as the timestamp dimension or as the sample dimension.
*
* If the flag is set to true, the shape of the input blob will be interpreted as [`T`, `N`, `[data dims]`], where `T` is the number of timestamps and `N` is the number of independent streams.
* In this case each forward() call will iterate through `T` timestamps and update the layer's state `T` times.
*
* If the flag is set to false, the shape of the input blob will be interpreted as [`N`, `[data dims]`].
* In this case each forward() call will make one iteration and produce one timestamp with shape [`N`, `[out dims]`].
*/
CV_DEPRECATED virtual void setUseTimstampsDim(bool use = true) = 0;
/** @deprecated Use flag `produce_cell_output` in LayerParams.
* @brief If this flag is set to true then the layer will produce @f$ c_t @f$ as a second output.
* @details Shape of the second output is the same as first output.
*/
CV_DEPRECATED virtual void setProduceCellOutput(bool produce = false) = 0;
/* In the common case it uses a single input with @f$x_t@f$ values to compute the output(s) @f$h_t@f$ (and @f$c_t@f$).
* @param input should contain packed values @f$x_t@f$
* @param output contains computed outputs: @f$h_t@f$ (and @f$c_t@f$ if setProduceCellOutput() flag was set to true).
*
* If setUseTimstampsDim() is set to true then @p input[0] should have at least two dimensions with the following shape: [`T`, `N`, `[data dims]`],
* where `T` specifies number of timestamps, `N` is number of independent streams (i.e. @f$ x_{t_0 + t}^{stream} @f$ is stored inside @p input[0][t, stream, ...]).
*
* If setUseTimstampsDim() is set to false then @p input[0] should contain a single timestamp, and its shape should have the form [`N`, `[data dims]`] with at least one dimension.
* (i.e. @f$ x_{t}^{stream} @f$ is stored inside @p input[0][stream, ...]).
*/
int inputNameToIndex(String inputName) CV_OVERRIDE;
int outputNameToIndex(const String& outputName) CV_OVERRIDE;
};
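/** Illustrative sketch of the weight layout described in setWeights() above (not part of
 *  the original header; N_x and N_h are assumed input and hidden sizes):
 *  @code
 *  const int N_x = 64, N_h = 128;
 *  cv::Mat Wx(4 * N_h, N_x, CV_32F);   // stacked W_{xi}; W_{xf}; W_{xo}; W_{xg}
 *  cv::Mat Wh(4 * N_h, N_h, CV_32F);   // stacked W_{hi}; W_{hf}; W_{ho}; W_{hg}
 *  cv::Mat b (4 * N_h, 1,   CV_32F);   // stacked b_i; b_f; b_o; b_g
 *  cv::Ptr<cv::dnn::LSTMLayer> lstm = cv::dnn::LSTMLayer::create(cv::dnn::LayerParams());
 *  lstm->setWeights(Wh, Wx, b);        // deprecated path; LayerParams::blobs is preferred
 *  @endcode
 */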
/** @brief One-layer GRU recurrent layer
*
* Accepts input sequence and computes the final hidden state for each element in the batch.
*
* - input[0] containing the features of the input sequence.
* input[0] should have shape [`T`, `N`, `data_dims`] where `T` is sequence length, `N` is batch size, `data_dims` is input size
* - output would have shape [`T`, `N`, `D` * `hidden_size`] where `D = 2` if layer is bidirectional otherwise `D = 1`
*
* Depends on the following attributes:
* - hidden_size - Number of neurons in the hidden layer
* - direction - RNN could be bidirectional or forward
*
* The final hidden state @f$ h_t @f$ is computed by the following formulas:
*
@f{eqnarray*}{
r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{(t-1)}+ b_{hn})) \\
h_t = (1 - z_t) \odot n_t + z_t \odot h_{(t-1)} \\
@f}
* Where @f$x_t@f$ is current input, @f$h_{(t-1)}@f$ is previous or initial hidden state.
*
* @f$W_{x?}@f$, @f$W_{h?}@f$ and @f$b_{?}@f$ are learned weights represented as matrices:
* @f$W_{x?} \in R^{N_h \times N_x}@f$, @f$W_{h?} \in R^{N_h \times N_h}@f$, @f$b_? \in R^{N_h}@f$.
*
* @f$\odot@f$ is per-element multiply operation.
*/
class CV_EXPORTS GRULayer : public Layer
{
public:
/** Creates instance of GRU layer */
static Ptr<GRULayer> create(const LayerParams& params);
};
/** @brief Classical recurrent layer
Accepts two inputs @f$x_t@f$ and @f$h_{t-1}@f$ and computes two outputs @f$o_t@f$ and @f$h_t@f$.
- input: should contain packed input @f$x_t@f$.
- output: should contain output @f$o_t@f$ (and @f$h_t@f$ if setProduceHiddenOutput() is set to true).
input[0] should have shape [`T`, `N`, `data_dims`] where `T` and `N` are the number of timestamps and the number of independent samples of @f$x_t@f$, respectively.
output[0] will have shape [`T`, `N`, @f$N_o@f$], where @f$N_o@f$ is the number of rows in the @f$ W_{xo} @f$ matrix.
If setProduceHiddenOutput() is set to true then @p output[1] will contain a Mat with shape [`T`, `N`, @f$N_h@f$], where @f$N_h@f$ is the number of rows in the @f$ W_{hh} @f$ matrix.
*/
class CV_EXPORTS RNNLayer : public Layer
{
public:
/** Creates instance of RNNLayer */
static Ptr<RNNLayer> create(const LayerParams& params);
/** Sets up the learned weights.
Recurrent-layer behavior at each step is defined by the current input @f$ x_t @f$, the previous state @f$ h_{t-1} @f$ and the learned weights as follows:
@f{eqnarray*}{
h_t &= tanh&(W_{hh} h_{t-1} + W_{xh} x_t + b_h), \\
o_t &= tanh&(W_{ho} h_t + b_o),
@f}
@param Wxh is @f$ W_{xh} @f$ matrix
@param bh is @f$ b_{h} @f$ vector
@param Whh is @f$ W_{hh} @f$ matrix
@param Who is @f$ W_{xo} @f$ matrix
@param bo is @f$ b_{o} @f$ vector
*/
virtual void setWeights(const Mat &Wxh, const Mat &bh, const Mat &Whh, const Mat &Who, const Mat &bo) = 0;
/** @brief If this flag is set to true then layer will produce @f$ h_t @f$ as second output.
* @details Shape of the second output is the same as first output.
*/
virtual void setProduceHiddenOutput(bool produce = false) = 0;
};
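/** Illustrative sketch of the shapes implied by the formulas above (not part of the
 *  original header; N_x, N_h and N_o are assumed input, hidden and output sizes):
 *  @code
 *  const int N_x = 64, N_h = 128, N_o = 32;
 *  cv::Mat Wxh(N_h, N_x, CV_32F), bh(N_h, 1, CV_32F);   // input  -> hidden
 *  cv::Mat Whh(N_h, N_h, CV_32F);                        // hidden -> hidden
 *  cv::Mat Who(N_o, N_h, CV_32F), bo(N_o, 1, CV_32F);    // hidden -> output
 *  cv::Ptr<cv::dnn::RNNLayer> rnn = cv::dnn::RNNLayer::create(cv::dnn::LayerParams());
 *  rnn->setWeights(Wxh, bh, Whh, Who, bo);
 *  @endcode
 */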
class CV_EXPORTS BaseConvolutionLayer : public Layer
{
public:
CV_DEPRECATED_EXTERNAL Size kernel, stride, pad, dilation, adjustPad;
std::vector<size_t> adjust_pads;
std::vector<size_t> kernel_size, strides, dilations;
std::vector<size_t> pads_begin, pads_end;
String padMode;
int numOutput;
};
class CV_EXPORTS ConvolutionLayer : public BaseConvolutionLayer
{
public:
static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
};
class CV_EXPORTS ConvolutionLayerInt8 : public BaseConvolutionLayer
{
public:
int input_zp, output_zp;
float output_sc;
static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
};
class CV_EXPORTS DeconvolutionLayer : public BaseConvolutionLayer
{
public:
static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
};
class CV_EXPORTS LRNLayer : public Layer
{
public:
int type;
int size;
float alpha, beta, bias;
bool normBySize;
static Ptr<LRNLayer> create(const LayerParams& params);
};
class CV_EXPORTS PoolingLayer : public Layer
{
public:
int type;
std::vector<size_t> kernel_size, strides;
std::vector<size_t> pads_begin, pads_end;
bool globalPooling; //!< Flag is true if at least one of the axes is global pooled.
std::vector<bool> isGlobalPooling;
bool computeMaxIdx;
String padMode;
bool ceilMode;
// If true, for average pooling with padding divide every output region
// by the whole kernel area. Otherwise exclude zero-padded values and divide
// by the number of real values.
bool avePoolPaddedArea;
// ROIPooling parameters.
Size pooledSize;
float spatialScale;
// PSROIPooling parameters.
int psRoiOutChannels;
static Ptr<PoolingLayer> create(const LayerParams& params);
};
class CV_EXPORTS PoolingLayerInt8 : public PoolingLayer
{
public:
int input_zp, output_zp;
static Ptr<PoolingLayerInt8> create(const LayerParams& params);
};
class CV_EXPORTS SoftmaxLayer : public Layer
{
public:
bool logSoftMax;
static Ptr<SoftmaxLayer> create(const LayerParams& params);
};
class CV_EXPORTS SoftmaxLayerInt8 : public SoftmaxLayer
{
public:
float output_sc;
int output_zp;
static Ptr<SoftmaxLayerInt8> create(const LayerParams& params);
};
class CV_EXPORTS InnerProductLayer : public Layer
{
public:
int axis;
static Ptr<InnerProductLayer> create(const LayerParams& params);
};
class CV_EXPORTS InnerProductLayerInt8 : public InnerProductLayer
{
public:
int output_zp;
static Ptr<InnerProductLayerInt8> create(const LayerParams& params);
};
class CV_EXPORTS MVNLayer : public Layer
{
public:
float eps;
bool normVariance, acrossChannels;
static Ptr<MVNLayer> create(const LayerParams& params);
};
/* Reshaping */
class CV_EXPORTS ReshapeLayer : public Layer
{
public:
MatShape newShapeDesc;
Range newShapeRange;
static Ptr<ReshapeLayer> create(const LayerParams& params);
};
class CV_EXPORTS FlattenLayer : public Layer
{
public:
static Ptr<FlattenLayer> create(const LayerParams &params);
};
class CV_EXPORTS QuantizeLayer : public Layer
{
public:
float scale;
int zeropoint;
static Ptr<QuantizeLayer> create(const LayerParams &params);
};
class CV_EXPORTS DequantizeLayer : public Layer
{
public:
float scale;
int zeropoint;
static Ptr<DequantizeLayer> create(const LayerParams &params);
};
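/** The scale / zeropoint members above follow the usual affine quantization convention;
 *  the worked example below is an assumption added for clarity, not taken from this header:
 *  @code
 *  float scale = 0.02f;                  // quantization step
 *  int   zeropoint = -5;                 // quantized value that represents 0.0f
 *  float x = 0.40f;
 *  int8_t q    = cv::saturate_cast<int8_t>(cvRound(x / scale) + zeropoint);  // q == 15
 *  float  xHat = scale * (q - zeropoint);                                    // xHat ~ 0.40
 *  @endcode
 */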
class CV_EXPORTS RequantizeLayer : public Layer
{
public:
float scale, shift;
static Ptr<RequantizeLayer> create(const LayerParams &params);
};
class CV_EXPORTS ConcatLayer : public Layer
{
public:
int axis;
/**
* @brief Add zero padding in case of concatenation of blobs with different
* spatial sizes.
*
* Details: https://github.com/torch/nn/blob/master/doc/containers.md#depthconcat
*/
bool padding;
int paddingValue;
static Ptr<ConcatLayer> create(const LayerParams &params);
};
class CV_EXPORTS SplitLayer : public Layer
{
public:
int outputsCount; //!< Number of copies that will be produced (is ignored when negative).
static Ptr<SplitLayer> create(const LayerParams &params);
};
/**
* Slice layer has several modes:
* 1. Caffe mode
* @param[in] axis Axis of split operation
* @param[in] slice_point Array of split points
*
* The number of output blobs equals the number of split points plus one. The
* first blob is a slice of the input from 0 to @p slice_point[0] - 1 along @p axis,
* the second output blob is a slice of the input from @p slice_point[0] to
* @p slice_point[1] - 1 along @p axis, and the last output blob is a slice of the
* input from @p slice_point[-1] up to the end of the @p axis size.
*
* 2. TensorFlow mode
* @param begin Vector of start indices
* @param size Vector of sizes
*
* A more convenient numpy-like slice. The one and only output blob
* is the slice `input[begin[0]:begin[0]+size[0], begin[1]:begin[1]+size[1], ...]`
*
* 3. Torch mode
* @param axis Axis of split operation
*
* Split the input blob into equal parts along @p axis.
*/
class CV_EXPORTS SliceLayer : public Layer
{
public:
/**
* @brief Vector of slice ranges.
*
* The first dimension equals the number of output blobs.
* Each inner vector holds slice ranges for the leading dimensions of the input.
*/
std::vector<std::vector<Range> > sliceRanges;
std::vector<std::vector<int> > sliceSteps;
int axis;
int num_split;
static Ptr<SliceLayer> create(const LayerParams &params);
};
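/** Illustrative sketch (an assumption, not from this header) of configuring the
 *  TensorFlow-style mode described above through the public sliceRanges member:
 *  @code
 *  cv::dnn::LayerParams lp;
 *  lp.type = "Slice";
 *  cv::Ptr<cv::dnn::SliceLayer> slice = cv::dnn::SliceLayer::create(lp);
 *  slice->sliceRanges.resize(1);                        // one output blob
 *  slice->sliceRanges[0].push_back(cv::Range(2, 6));    // dim 0: begin = 2, size = 4
 *  slice->sliceRanges[0].push_back(cv::Range(0, 3));    // dim 1: begin = 0, size = 3
 *  @endcode
 */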
class CV_EXPORTS PermuteLayer : public Layer
{
public:
static Ptr<PermuteLayer> create(const LayerParams& params);
};
/**
* Permute channels of 4-dimensional input blob.
* @param group Number of groups to split input channels and pick in turns
* into output blob.
*
* \f[ groupSize = \frac{number\ of\ channels}{group} \f]
* \f[ output(n, c, h, w) = input(n, groupSize \times (c \% group) + \lfloor \frac{c}{group} \rfloor, h, w) \f]
* Read more at https://arxiv.org/pdf/1707.01083.pdf
*/
class CV_EXPORTS ShuffleChannelLayer : public Layer
{
public:
static Ptr<Layer> create(const LayerParams& params);
int group;
};
/**
* @brief Adds extra values for specific axes.
* @param paddings Vector of paddings in format
* @code
* [ pad_before, pad_after, // [0]th dimension
* pad_before, pad_after, // [1]st dimension
* ...
* pad_before, pad_after ] // [n]th dimension
* @endcode
* that represents number of padded values at every dimension
* starting from the first one. The rest of dimensions won't
* be padded.
* @param value Value to be padded. Defaults to zero.
* @param type Padding type: 'constant', 'reflect'
* @param input_dims Torch's parameter. If @p input_dims is not equal to the
* actual input dimensionality then the `[0]th` dimension
* is considered a batch dimension and @p paddings are shifted
* by one dimension. Defaults to `-1`, which means padding
* corresponds directly to @p paddings.
*/
class CV_EXPORTS PaddingLayer : public Layer
{
public:
static Ptr<PaddingLayer> create(const LayerParams& params);
};
/* Activations */
class CV_EXPORTS ActivationLayer : public Layer
{
public:
virtual void forwardSlice(const float* src, float* dst, int len,
size_t outPlaneSize, int cn0, int cn1) const {};
virtual void forwardSlice(const int* src, const int* lut, int* dst, int len,
size_t outPlaneSize, int cn0, int cn1) const {};
virtual void forwardSlice(const int8_t* src, const int8_t* lut, int8_t* dst, int len,
size_t outPlaneSize, int cn0, int cn1) const {};
};
class CV_EXPORTS ReLULayer : public ActivationLayer
{
public:
float negativeSlope;
static Ptr<ReLULayer> create(const LayerParams &params);
};
class CV_EXPORTS ReLU6Layer : public ActivationLayer
{
public:
float minValue, maxValue;
static Ptr<ReLU6Layer> create(const LayerParams &params);
};
class CV_EXPORTS ChannelsPReLULayer : public ActivationLayer
{
public:
static Ptr<Layer> create(const LayerParams& params);
};
class CV_EXPORTS ELULayer : public ActivationLayer
{
public:
static Ptr<ELULayer> create(const LayerParams &params);
};
class CV_EXPORTS TanHLayer : public ActivationLayer
{
public:
static Ptr<TanHLayer> create(const LayerParams &params);
};
class CV_EXPORTS SwishLayer : public ActivationLayer
{
public:
static Ptr<SwishLayer> create(const LayerParams &params);
};
class CV_EXPORTS MishLayer : public ActivationLayer
{
public:
static Ptr<MishLayer> create(const LayerParams &params);
};
class CV_EXPORTS SigmoidLayer : public ActivationLayer
{
public:
static Ptr<SigmoidLayer> create(const LayerParams &params);
};
class CV_EXPORTS BNLLLayer : public ActivationLayer
{
public:
static Ptr<BNLLLayer> create(const LayerParams &params);
};
class CV_EXPORTS AbsLayer : public ActivationLayer
{
public:
static Ptr<AbsLayer> create(const LayerParams &params);
};
class CV_EXPORTS PowerLayer : public ActivationLayer
{
public:
float power, scale, shift;
static Ptr<PowerLayer> create(const LayerParams &params);
};
class CV_EXPORTS ExpLayer : public ActivationLayer
{
public:
float base, scale, shift;
static Ptr<ExpLayer> create(const LayerParams &params);
};
class CV_EXPORTS ActivationLayerInt8 : public ActivationLayer
{
public:
static Ptr<ActivationLayerInt8> create(const LayerParams &params);
};
/* Layers used in semantic segmentation */
class CV_EXPORTS CropLayer : public Layer
{
public:
static Ptr<Layer> create(const LayerParams &params);
};
/** @brief Element wise operation on inputs
Extra optional parameters:
- "operation" as string. Values are "sum" (default), "prod", "max", "div", "min"
- "coeff" as float array. Specify weights of inputs for SUM operation
- "output_channels_mode" as string. Values are "same" (default, all input must have the same layout), "input_0", "input_0_truncate", "max_input_channels"
*/
class CV_EXPORTS EltwiseLayer : public Layer
{
public:
static Ptr<EltwiseLayer> create(const LayerParams &params);
};
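/** Illustrative sketch (an assumption, not from this header) of setting the optional
 *  parameters listed above through LayerParams:
 *  @code
 *  cv::dnn::LayerParams lp;
 *  lp.type = "Eltwise";
 *  lp.set("operation", cv::String("sum"));
 *  double coeffs[] = {0.5, 0.5};                        // weights of the two inputs
 *  lp.set("coeff", cv::dnn::DictValue::arrayReal(coeffs, 2));
 *  cv::Ptr<cv::dnn::EltwiseLayer> elt = cv::dnn::EltwiseLayer::create(lp);
 *  @endcode
 */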
class CV_EXPORTS EltwiseLayerInt8 : public Layer
{
public:
static Ptr<EltwiseLayerInt8> create(const LayerParams &params);
};
class CV_EXPORTS BatchNormLayer : public ActivationLayer
{
public:
bool hasWeights, hasBias;
float epsilon;
static Ptr<BatchNormLayer> create(const LayerParams &params);
};
class CV_EXPORTS BatchNormLayerInt8 : public BatchNormLayer
{
public:
float input_sc, output_sc;
int input_zp, output_zp;
static Ptr<BatchNormLayerInt8> create(const LayerParams &params);
};
class CV_EXPORTS MaxUnpoolLayer : public Layer
{
public:
Size poolKernel;
Size poolPad;
Size poolStride;
static Ptr<MaxUnpoolLayer> create(const LayerParams &params);
};
class CV_EXPORTS ScaleLayer : public Layer
{
public:
bool hasBias;
int axis;
static Ptr<ScaleLayer> create(const LayerParams& params);
};
class CV_EXPORTS ScaleLayerInt8 : public ScaleLayer
{
public:
float output_sc;
int output_zp;
static Ptr<ScaleLayerInt8> create(const LayerParams &params);
};
class CV_EXPORTS ShiftLayer : public Layer
{
public:
static Ptr<Layer> create(const LayerParams& params);
};
class CV_EXPORTS ShiftLayerInt8 : public Layer
{
public:
static Ptr<Layer> create(const LayerParams& params);
};
class CV_EXPORTS DataAugmentationLayer : public Layer
{
public:
static Ptr<DataAugmentationLayer> create(const LayerParams& params);
};
class CV_EXPORTS CorrelationLayer : public Layer
{
public:
static Ptr<CorrelationLayer> create(const LayerParams& params);
};
class CV_EXPORTS AccumLayer : public Layer
{
public:
static Ptr<AccumLayer> create(const LayerParams& params);
};
class CV_EXPORTS FlowWarpLayer : public Layer
{
public:
static Ptr<FlowWarpLayer> create(const LayerParams& params);
};
class CV_EXPORTS PriorBoxLayer : public Layer
{
public:
static Ptr<PriorBoxLayer> create(const LayerParams& params);
};
class CV_EXPORTS ReorgLayer : public Layer
{
public:
static Ptr<ReorgLayer> create(const LayerParams& params);
};
class CV_EXPORTS RegionLayer : public Layer
{
public:
float nmsThreshold;
static Ptr<RegionLayer> create(const LayerParams& params);
};
/**
* @brief Detection output layer.
*
* The layer size is: @f$ (1 \times 1 \times N \times 7) @f$
* where N is the [keep_top_k] parameter multiplied by the batch size. Each row is:
* [image_id, label, confidence, xmin, ymin, xmax, ymax]
* where image_id is the index of the image input in the batch.
*/
class CV_EXPORTS DetectionOutputLayer : public Layer
{
public:
static Ptr<DetectionOutputLayer> create(const LayerParams& params);
};
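/** Illustrative sketch (an assumption, not from this header) of reading the
 *  @f$ 1 \times 1 \times N \times 7 @f$ output described above; `net` is assumed to be a
 *  cv::dnn::Net built from a detection model:
 *  @code
 *  cv::Mat out = net.forward();
 *  cv::Mat det(out.size[2], 7, CV_32F, out.ptr<float>());
 *  for (int i = 0; i < det.rows; i++)
 *  {
 *      int   label      = (int)det.at<float>(i, 1);
 *      float confidence = det.at<float>(i, 2);
 *      // det.at<float>(i, 3) .. det.at<float>(i, 6) hold xmin, ymin, xmax, ymax
 *  }
 *  @endcode
 */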
/**
* @brief \f$ L_p \f$ - normalization layer.
* @param p Normalization factor. The most common values are `p = 1` for \f$ L_1 \f$
* normalization and `p = 2` for \f$ L_2 \f$ normalization; a custom value is also possible.
* @param eps Parameter \f$ \epsilon \f$ to prevent a division by zero.
* @param across_spatial If true, normalize the input across all non-batch dimensions.
* Otherwise normalize every channel separately.
*
* Across spatial:
* @f[
* norm = \sqrt[p]{\epsilon + \sum_{x, y, c} |src(x, y, c)|^p } \\
* dst(x, y, c) = \frac{ src(x, y, c) }{norm}
* @f]
*
* Channel wise normalization:
* @f[
* norm(c) = \sqrt[p]{\epsilon + \sum_{x, y} |src(x, y, c)|^p } \\
* dst(x, y, c) = \frac{ src(x, y, c) }{norm(c)}
* @f]
*
* Where `x, y` - spatial coordinates, `c` - channel.
*
* Every sample in the batch is normalized separately. Optionally,
* the output is scaled by the trained parameters.
*/
class CV_EXPORTS NormalizeBBoxLayer : public Layer
{
public:
float pnorm, epsilon;
CV_DEPRECATED_EXTERNAL bool acrossSpatial;
static Ptr<NormalizeBBoxLayer> create(const LayerParams& params);
};
/**
* @brief Resize input 4-dimensional blob by nearest neighbor or bilinear strategy.
*
* Layer is used to support TensorFlow's resize_nearest_neighbor and resize_bilinear ops.
*/
class CV_EXPORTS ResizeLayer : public Layer
{
public:
static Ptr<ResizeLayer> create(const LayerParams& params);
};
/**
* @brief Bilinear resize layer from https://github.com/cdmh/deeplab-public-ver2
*
* It differs from @ref ResizeLayer in output shape and resize scales computations.
*/
class CV_EXPORTS InterpLayer : public Layer
{
public:
static Ptr<Layer> create(const LayerParams& params);
};
class CV_EXPORTS ProposalLayer : public Layer
{
public:
static Ptr<ProposalLayer> create(const LayerParams& params);
};
class CV_EXPORTS CropAndResizeLayer : public Layer
{
public:
static Ptr<Layer> create(const LayerParams& params);
};
class CV_EXPORTS CumSumLayer : public Layer
{
public:
int exclusive;
int reverse;
static Ptr<CumSumLayer> create(const LayerParams& params);
};
//! @}
//! @}
CV__DNN_INLINE_NS_END
}
}
#endif


@@ -0,0 +1,160 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include <opencv2/core.hpp>
#include <map>
#include <ostream>
#include <opencv2/dnn/dnn.hpp>
#ifndef OPENCV_DNN_DNN_DICT_HPP
#define OPENCV_DNN_DNN_DICT_HPP
namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN
//! @addtogroup dnn
//! @{
/** @brief This struct stores a scalar value (or array) of one of the following types: double, cv::String or int64.
* @todo Maybe int64 is useless because double type exactly stores at least 2^52 integers.
*/
struct CV_EXPORTS_W DictValue
{
DictValue(const DictValue &r);
DictValue(bool i) : type(Param::INT), pi(new AutoBuffer<int64,1>) { (*pi)[0] = i ? 1 : 0; } //!< Constructs integer scalar
DictValue(int64 i = 0) : type(Param::INT), pi(new AutoBuffer<int64,1>) { (*pi)[0] = i; } //!< Constructs integer scalar
CV_WRAP DictValue(int i) : type(Param::INT), pi(new AutoBuffer<int64,1>) { (*pi)[0] = i; } //!< Constructs integer scalar
DictValue(unsigned p) : type(Param::INT), pi(new AutoBuffer<int64,1>) { (*pi)[0] = p; } //!< Constructs integer scalar
CV_WRAP DictValue(double p) : type(Param::REAL), pd(new AutoBuffer<double,1>) { (*pd)[0] = p; } //!< Constructs floating point scalar
CV_WRAP DictValue(const String &s) : type(Param::STRING), ps(new AutoBuffer<String,1>) { (*ps)[0] = s; } //!< Constructs string scalar
DictValue(const char *s) : type(Param::STRING), ps(new AutoBuffer<String,1>) { (*ps)[0] = s; } //!< @overload
template<typename TypeIter>
static DictValue arrayInt(TypeIter begin, int size); //!< Constructs integer array
template<typename TypeIter>
static DictValue arrayReal(TypeIter begin, int size); //!< Constructs floating point array
template<typename TypeIter>
static DictValue arrayString(TypeIter begin, int size); //!< Constructs array of strings
template<typename T>
T get(int idx = -1) const; //!< Tries to convert the array element with the specified index to the requested type and returns it.
int size() const;
CV_WRAP bool isInt() const;
CV_WRAP bool isString() const;
CV_WRAP bool isReal() const;
CV_WRAP int getIntValue(int idx = -1) const;
CV_WRAP double getRealValue(int idx = -1) const;
CV_WRAP String getStringValue(int idx = -1) const;
DictValue &operator=(const DictValue &r);
friend std::ostream &operator<<(std::ostream &stream, const DictValue &dictv);
~DictValue();
private:
Param type;
union
{
AutoBuffer<int64, 1> *pi;
AutoBuffer<double, 1> *pd;
AutoBuffer<String, 1> *ps;
void *pv;
};
DictValue(Param _type, void *_p) : type(_type), pv(_p) {}
void release();
};
/** @brief This class implements a name-value dictionary whose values are instances of DictValue. */
class CV_EXPORTS Dict
{
typedef std::map<String, DictValue> _Dict;
_Dict dict;
public:
//! Checks for the presence of the @p key in the dictionary.
bool has(const String &key) const;
//! If the @p key is in the dictionary, returns a pointer to its value; otherwise returns NULL.
DictValue *ptr(const String &key);
/** @overload */
const DictValue *ptr(const String &key) const;
//! If the @p key is in the dictionary, returns its value; otherwise an error is generated.
const DictValue &get(const String &key) const;
/** @overload */
template <typename T>
T get(const String &key) const;
//! If the @p key is in the dictionary, returns its value; otherwise returns @p defaultValue.
template <typename T>
T get(const String &key, const T &defaultValue) const;
//! Sets a new @p value for the @p key, or adds a new key-value pair to the dictionary.
template<typename T>
const T &set(const String &key, const T &value);
//! Erase @p key from the dictionary.
void erase(const String &key);
friend std::ostream &operator<<(std::ostream &stream, const Dict &dict);
std::map<String, DictValue>::const_iterator begin() const;
std::map<String, DictValue>::const_iterator end() const;
};
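/** Minimal usage sketch (illustrative, not part of the original header):
 *  @code
 *  cv::dnn::Dict d;
 *  d.set("kernel_size", 3);                                  // scalar int
 *  d.set("eps", 1e-5);                                       // scalar double
 *  double coeffs[] = {1.0, 0.5};
 *  d.set("coeff", cv::dnn::DictValue::arrayReal(coeffs, 2)); // array value
 *  int    k   = d.get<int>("kernel_size");                   // 3
 *  double mom = d.get("momentum", 0.9);                      // key absent -> default value
 *  bool   has = d.has("coeff");                              // true
 *  @endcode
 */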
//! @}
CV__DNN_INLINE_NS_END
}
}
#endif

File diff suppressed because it is too large


@@ -0,0 +1,412 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_DNN_DNN_INL_HPP
#define OPENCV_DNN_DNN_INL_HPP
#include <opencv2/dnn.hpp>
namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN
template<typename TypeIter>
DictValue DictValue::arrayInt(TypeIter begin, int size)
{
DictValue res(Param::INT, new AutoBuffer<int64, 1>(size));
for (int j = 0; j < size; begin++, j++)
(*res.pi)[j] = *begin;
return res;
}
template<typename TypeIter>
DictValue DictValue::arrayReal(TypeIter begin, int size)
{
DictValue res(Param::REAL, new AutoBuffer<double, 1>(size));
for (int j = 0; j < size; begin++, j++)
(*res.pd)[j] = *begin;
return res;
}
template<typename TypeIter>
DictValue DictValue::arrayString(TypeIter begin, int size)
{
DictValue res(Param::STRING, new AutoBuffer<String, 1>(size));
for (int j = 0; j < size; begin++, j++)
(*res.ps)[j] = *begin;
return res;
}
template<>
inline DictValue DictValue::get<DictValue>(int idx) const
{
CV_Assert(idx == -1);
return *this;
}
template<>
inline int64 DictValue::get<int64>(int idx) const
{
CV_Assert((idx == -1 && size() == 1) || (idx >= 0 && idx < size()));
idx = (idx == -1) ? 0 : idx;
if (type == Param::INT)
{
return (*pi)[idx];
}
else if (type == Param::REAL)
{
double doubleValue = (*pd)[idx];
double fracpart, intpart;
fracpart = std::modf(doubleValue, &intpart);
CV_Assert(fracpart == 0.0);
return (int64)doubleValue;
}
else if (type == Param::STRING)
{
return std::atoi((*ps)[idx].c_str());
}
else
{
CV_Assert(isInt() || isReal() || isString());
return 0;
}
}
template<>
inline int DictValue::get<int>(int idx) const
{
return (int)get<int64>(idx);
}
inline int DictValue::getIntValue(int idx) const
{
return (int)get<int64>(idx);
}
template<>
inline unsigned DictValue::get<unsigned>(int idx) const
{
return (unsigned)get<int64>(idx);
}
template<>
inline bool DictValue::get<bool>(int idx) const
{
return (get<int64>(idx) != 0);
}
template<>
inline double DictValue::get<double>(int idx) const
{
CV_Assert((idx == -1 && size() == 1) || (idx >= 0 && idx < size()));
idx = (idx == -1) ? 0 : idx;
if (type == Param::REAL)
{
return (*pd)[idx];
}
else if (type == Param::INT)
{
return (double)(*pi)[idx];
}
else if (type == Param::STRING)
{
return std::atof((*ps)[idx].c_str());
}
else
{
CV_Assert(isReal() || isInt() || isString());
return 0;
}
}
inline double DictValue::getRealValue(int idx) const
{
return get<double>(idx);
}
template<>
inline float DictValue::get<float>(int idx) const
{
return (float)get<double>(idx);
}
template<>
inline String DictValue::get<String>(int idx) const
{
CV_Assert(isString());
CV_Assert((idx == -1 && ps->size() == 1) || (idx >= 0 && idx < (int)ps->size()));
return (*ps)[(idx == -1) ? 0 : idx];
}
inline String DictValue::getStringValue(int idx) const
{
return get<String>(idx);
}
inline void DictValue::release()
{
switch (type)
{
case Param::INT:
delete pi;
break;
case Param::STRING:
delete ps;
break;
case Param::REAL:
delete pd;
break;
case Param::BOOLEAN:
case Param::MAT:
case Param::MAT_VECTOR:
case Param::ALGORITHM:
case Param::FLOAT:
case Param::UNSIGNED_INT:
case Param::UINT64:
case Param::UCHAR:
case Param::SCALAR:
break; // unhandled
}
}
inline DictValue::~DictValue()
{
release();
}
inline DictValue & DictValue::operator=(const DictValue &r)
{
if (&r == this)
return *this;
if (r.type == Param::INT)
{
AutoBuffer<int64, 1> *tmp = new AutoBuffer<int64, 1>(*r.pi);
release();
pi = tmp;
}
else if (r.type == Param::STRING)
{
AutoBuffer<String, 1> *tmp = new AutoBuffer<String, 1>(*r.ps);
release();
ps = tmp;
}
else if (r.type == Param::REAL)
{
AutoBuffer<double, 1> *tmp = new AutoBuffer<double, 1>(*r.pd);
release();
pd = tmp;
}
type = r.type;
return *this;
}
inline DictValue::DictValue(const DictValue &r)
: pv(NULL)
{
type = r.type;
if (r.type == Param::INT)
pi = new AutoBuffer<int64, 1>(*r.pi);
else if (r.type == Param::STRING)
ps = new AutoBuffer<String, 1>(*r.ps);
else if (r.type == Param::REAL)
pd = new AutoBuffer<double, 1>(*r.pd);
}
inline bool DictValue::isString() const
{
return (type == Param::STRING);
}
inline bool DictValue::isInt() const
{
return (type == Param::INT);
}
inline bool DictValue::isReal() const
{
return (type == Param::REAL || type == Param::INT);
}
inline int DictValue::size() const
{
switch (type)
{
case Param::INT:
return (int)pi->size();
case Param::STRING:
return (int)ps->size();
case Param::REAL:
return (int)pd->size();
case Param::BOOLEAN:
case Param::MAT:
case Param::MAT_VECTOR:
case Param::ALGORITHM:
case Param::FLOAT:
case Param::UNSIGNED_INT:
case Param::UINT64:
case Param::UCHAR:
case Param::SCALAR:
break; // unhandled
}
CV_Error_(Error::StsInternal, ("Unhandled type (%d)", static_cast<int>(type)));
}
inline std::ostream &operator<<(std::ostream &stream, const DictValue &dictv)
{
int i;
if (dictv.isInt())
{
for (i = 0; i < dictv.size() - 1; i++)
stream << dictv.get<int64>(i) << ", ";
stream << dictv.get<int64>(i);
}
else if (dictv.isReal())
{
for (i = 0; i < dictv.size() - 1; i++)
stream << dictv.get<double>(i) << ", ";
stream << dictv.get<double>(i);
}
else if (dictv.isString())
{
for (i = 0; i < dictv.size() - 1; i++)
stream << "\"" << dictv.get<String>(i) << "\", ";
stream << dictv.get<String>(i);
}
return stream;
}
/////////////////////////////////////////////////////////////////
inline bool Dict::has(const String &key) const
{
return dict.count(key) != 0;
}
inline DictValue *Dict::ptr(const String &key)
{
_Dict::iterator i = dict.find(key);
return (i == dict.end()) ? NULL : &i->second;
}
inline const DictValue *Dict::ptr(const String &key) const
{
_Dict::const_iterator i = dict.find(key);
return (i == dict.end()) ? NULL : &i->second;
}
inline const DictValue &Dict::get(const String &key) const
{
_Dict::const_iterator i = dict.find(key);
if (i == dict.end())
CV_Error(Error::StsObjectNotFound, "Required argument \"" + key + "\" not found into dictionary");
return i->second;
}
template <typename T>
inline T Dict::get(const String &key) const
{
return this->get(key).get<T>();
}
template <typename T>
inline T Dict::get(const String &key, const T &defaultValue) const
{
_Dict::const_iterator i = dict.find(key);
if (i != dict.end())
return i->second.get<T>();
else
return defaultValue;
}
template<typename T>
inline const T &Dict::set(const String &key, const T &value)
{
_Dict::iterator i = dict.find(key);
if (i != dict.end())
i->second = DictValue(value);
else
dict.insert(std::make_pair(key, DictValue(value)));
return value;
}
inline void Dict::erase(const String &key)
{
dict.erase(key);
}
inline std::ostream &operator<<(std::ostream &stream, const Dict &dict)
{
Dict::_Dict::const_iterator it;
for (it = dict.dict.begin(); it != dict.dict.end(); it++)
stream << it->first << " : " << it->second << "\n";
return stream;
}
inline std::map<String, DictValue>::const_iterator Dict::begin() const
{
return dict.begin();
}
inline std::map<String, DictValue>::const_iterator Dict::end() const
{
return dict.end();
}
CV__DNN_INLINE_NS_END
}
}
#endif


@@ -0,0 +1,78 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
#ifndef OPENCV_DNN_LAYER_DETAILS_HPP
#define OPENCV_DNN_LAYER_DETAILS_HPP
#include <opencv2/dnn/layer.hpp>
namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN
/** @brief Registers a layer constructor at runtime.
* @param type string containing the type name of the layer.
* @param constructorFunc pointer to a function of type LayerRegister::Constructor, which creates the layer.
* @details This macro must be placed inside function code.
*/
#define CV_DNN_REGISTER_LAYER_FUNC(type, constructorFunc) \
cv::dnn::LayerFactory::registerLayer(#type, constructorFunc);
/** @brief Registers a layer class at runtime.
* @param type string containing the type name of the layer.
* @param class C++ class derived from Layer.
* @details This macro must be placed inside function code.
*/
#define CV_DNN_REGISTER_LAYER_CLASS(type, class) \
cv::dnn::LayerFactory::registerLayer(#type, cv::dnn::details::_layerDynamicRegisterer<class>);
/** @brief Registers a layer constructor at module load time.
* @param type string containing the type name of the layer.
* @param constructorFunc pointer to a function of type LayerRegister::Constructor, which creates the layer.
* @details This macro must be placed outside function code.
*/
#define CV_DNN_REGISTER_LAYER_FUNC_STATIC(type, constructorFunc) \
static cv::dnn::details::_LayerStaticRegisterer __LayerStaticRegisterer_##type(#type, constructorFunc);
/** @brief Registers a layer class at module load time.
* @param type string containing the type name of the layer.
* @param class C++ class derived from Layer.
* @details This macro must be placed outside function code.
*/
#define CV_DNN_REGISTER_LAYER_CLASS_STATIC(type, class) \
Ptr<Layer> __LayerStaticRegisterer_func_##type(LayerParams &params) \
{ return Ptr<Layer>(new class(params)); } \
static cv::dnn::details::_LayerStaticRegisterer __LayerStaticRegisterer_##type(#type, __LayerStaticRegisterer_func_##type);
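/** Illustrative sketch (not part of the original header). MyLayer and MyOtherLayer are
 *  hypothetical classes derived from cv::dnn::Layer; the runtime macro expects a static
 *  create(LayerParams&) method, while the *_STATIC variant constructs the class directly
 *  from LayerParams:
 *  @code
 *  void initCustomLayers()
 *  {
 *      CV_DNN_REGISTER_LAYER_CLASS(MyType, MyLayer)          // runtime registration
 *  }
 *  // at file scope, registered automatically at module load time:
 *  CV_DNN_REGISTER_LAYER_CLASS_STATIC(MyOtherType, MyOtherLayer)
 *  @endcode
 */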
namespace details {
template<typename LayerClass>
Ptr<Layer> _layerDynamicRegisterer(LayerParams &params)
{
return Ptr<Layer>(LayerClass::create(params));
}
// allows a created layer to be registered automatically at module load time
class _LayerStaticRegisterer
{
String type;
public:
_LayerStaticRegisterer(const String &layerType, LayerFactory::Constructor layerConstructor)
{
this->type = layerType;
LayerFactory::registerLayer(layerType, layerConstructor);
}
~_LayerStaticRegisterer()
{
LayerFactory::unregisterLayer(type);
}
};
} // namespace
CV__DNN_INLINE_NS_END
}} // namespace
#endif


@@ -0,0 +1,85 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_DNN_LAYER_HPP
#define OPENCV_DNN_LAYER_HPP
#include <opencv2/dnn.hpp>
namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN
//! @addtogroup dnn
//! @{
//!
//! @defgroup dnnLayerFactory Utilities for New Layers Registration
//! @{
/** @brief %Layer factory allows creating instances of registered layers. */
class CV_EXPORTS LayerFactory
{
public:
//! Each Layer class must provide this function to the factory
typedef Ptr<Layer>(*Constructor)(LayerParams &params);
//! Registers the layer class with typename @p type and specified @p constructor. Thread-safe.
static void registerLayer(const String &type, Constructor constructor);
//! Unregisters registered layer with specified type name. Thread-safe.
static void unregisterLayer(const String &type);
    /** @brief Creates an instance of a registered layer.
     *  @param type type name of the layer to create.
* @param params parameters which will be used for layer initialization.
* @note Thread-safe.
*/
static Ptr<Layer> createLayerInstance(const String &type, LayerParams& params);
private:
LayerFactory();
};
//! @}
//! @}
CV__DNN_INLINE_NS_END
}
}
#endif
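
An illustrative sketch (assumption, not part of this header) of creating a built-in layer through the factory; "ReLU" is one of the type names registered by the dnn module itself:

#include <opencv2/dnn.hpp>

using namespace cv;
using namespace cv::dnn;

int main()
{
    LayerParams lp;
    lp.type = "ReLU";      // registered type name
    lp.name = "my_relu";
    Ptr<Layer> layer = LayerFactory::createLayerInstance(lp.type, lp);
    CV_Assert(!layer.empty());
    return 0;
}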

View File

@@ -0,0 +1,29 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_LAYER_REG_HPP
#define OPENCV_DNN_LAYER_REG_HPP
#include <opencv2/dnn.hpp>
namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN
//! @addtogroup dnn
//! @{
typedef std::map<std::string, std::vector<LayerFactory::Constructor> > LayerFactory_Impl;
//! Returns the registry of layer types known to the DNN module.
//!
//! @note To access the factory in a thread-safe way, see the getLayerFactoryMutex() function.
LayerFactory_Impl& getLayerFactoryImpl();
//! Get the mutex guarding @ref LayerFactory_Impl, see getLayerFactoryImpl() function.
Mutex& getLayerFactoryMutex();
//! @}
CV__DNN_INLINE_NS_END
}
}
#endif
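
A short sketch of the intended locking pattern when walking the registry (assumption: the header above is available as <opencv2/dnn/layer_reg.private.hpp>):

#include <opencv2/dnn/layer_reg.private.hpp>
#include <opencv2/core/utility.hpp>   // cv::AutoLock
#include <cstdio>

void dumpRegisteredLayerTypes()
{
    cv::AutoLock lock(cv::dnn::getLayerFactoryMutex());   // guard the registry
    const cv::dnn::LayerFactory_Impl& factory = cv::dnn::getLayerFactoryImpl();
    for (const auto& entry : factory)
        std::printf("%s: %d constructor(s)\n", entry.first.c_str(), (int)entry.second.size());
}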

View File

@@ -0,0 +1,259 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_DNN_DNN_SHAPE_UTILS_HPP
#define OPENCV_DNN_DNN_SHAPE_UTILS_HPP
#include <opencv2/dnn/dnn.hpp>
#include <opencv2/core/types_c.h> // CV_MAX_DIM
#include <iostream>
#include <ostream>
#include <sstream>
namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN
//Slicing
struct _Range : public cv::Range
{
_Range(const Range &r) : cv::Range(r) {}
_Range(int start_, int size_ = 1) : cv::Range(start_, start_ + size_) {}
};
static inline Mat slice(const Mat &m, const _Range &r0)
{
Range ranges[CV_MAX_DIM];
for (int i = 1; i < m.dims; i++)
ranges[i] = Range::all();
ranges[0] = r0;
return m(&ranges[0]);
}
static inline Mat slice(const Mat &m, const _Range &r0, const _Range &r1)
{
CV_Assert(m.dims >= 2);
Range ranges[CV_MAX_DIM];
for (int i = 2; i < m.dims; i++)
ranges[i] = Range::all();
ranges[0] = r0;
ranges[1] = r1;
return m(&ranges[0]);
}
static inline Mat slice(const Mat &m, const _Range &r0, const _Range &r1, const _Range &r2)
{
CV_Assert(m.dims >= 3);
Range ranges[CV_MAX_DIM];
for (int i = 3; i < m.dims; i++)
ranges[i] = Range::all();
ranges[0] = r0;
ranges[1] = r1;
ranges[2] = r2;
return m(&ranges[0]);
}
static inline Mat slice(const Mat &m, const _Range &r0, const _Range &r1, const _Range &r2, const _Range &r3)
{
CV_Assert(m.dims >= 4);
Range ranges[CV_MAX_DIM];
for (int i = 4; i < m.dims; i++)
ranges[i] = Range::all();
ranges[0] = r0;
ranges[1] = r1;
ranges[2] = r2;
ranges[3] = r3;
return m(&ranges[0]);
}
static inline Mat getPlane(const Mat &m, int n, int cn)
{
CV_Assert(m.dims > 2);
int sz[CV_MAX_DIM];
for(int i = 2; i < m.dims; i++)
{
sz[i-2] = m.size.p[i];
}
return Mat(m.dims - 2, sz, m.type(), (void*)m.ptr<float>(n, cn));
}
static inline MatShape shape(const int* dims, const int n)
{
MatShape shape;
shape.assign(dims, dims + n);
return shape;
}
static inline MatShape shape(const Mat& mat)
{
return shape(mat.size.p, mat.dims);
}
static inline MatShape shape(const MatSize& sz)
{
return shape(sz.p, sz.dims());
}
static inline MatShape shape(const UMat& mat)
{
return shape(mat.size.p, mat.dims);
}
#if 0 // issues with MatExpr wrapped into InputArray
static inline
MatShape shape(InputArray input)
{
int sz[CV_MAX_DIM];
int ndims = input.sizend(sz);
return shape(sz, ndims);
}
#endif
namespace {inline bool is_neg(int i) { return i < 0; }}
static inline MatShape shape(int a0, int a1=-1, int a2=-1, int a3=-1)
{
int dims[] = {a0, a1, a2, a3};
MatShape s = shape(dims, 4);
s.erase(std::remove_if(s.begin(), s.end(), is_neg), s.end());
return s;
}
static inline int total(const MatShape& shape, int start = -1, int end = -1)
{
if (start == -1) start = 0;
if (end == -1) end = (int)shape.size();
if (shape.empty())
return 0;
int elems = 1;
CV_Assert(start <= (int)shape.size() && end <= (int)shape.size() &&
start <= end);
for(int i = start; i < end; i++)
{
elems *= shape[i];
}
return elems;
}
static inline MatShape concat(const MatShape& a, const MatShape& b)
{
MatShape c = a;
c.insert(c.end(), b.begin(), b.end());
return c;
}
static inline std::string toString(const MatShape& shape, const String& name = "")
{
std::ostringstream ss;
if (!name.empty())
ss << name << ' ';
ss << '[';
for(size_t i = 0, n = shape.size(); i < n; ++i)
ss << ' ' << shape[i];
ss << " ]";
return ss.str();
}
static inline void print(const MatShape& shape, const String& name = "")
{
std::cout << toString(shape, name) << std::endl;
}
static inline std::ostream& operator<<(std::ostream &out, const MatShape& shape)
{
out << toString(shape);
return out;
}
/// @brief Converts axis from `[-dims; dims)` (similar to Python's slice notation) to `[0; dims)` range.
static inline
int normalize_axis(int axis, int dims)
{
CV_Check(axis, axis >= -dims && axis < dims, "");
axis = (axis < 0) ? (dims + axis) : axis;
CV_DbgCheck(axis, axis >= 0 && axis < dims, "");
return axis;
}
static inline
int normalize_axis(int axis, const MatShape& shape)
{
return normalize_axis(axis, (int)shape.size());
}
static inline
Range normalize_axis_range(const Range& r, int axisSize)
{
if (r == Range::all())
return Range(0, axisSize);
CV_CheckGE(r.start, 0, "");
Range clamped(r.start,
r.end > 0 ? std::min(r.end, axisSize) : axisSize + r.end + 1);
CV_DbgCheckGE(clamped.start, 0, "");
CV_CheckLT(clamped.start, clamped.end, "");
CV_CheckLE(clamped.end, axisSize, "");
return clamped;
}
static inline
bool isAllOnes(const MatShape &inputShape, int startPos, int endPos)
{
CV_Assert(!inputShape.empty());
CV_CheckGE((int) inputShape.size(), startPos, "");
CV_CheckGE(startPos, 0, "");
CV_CheckLE(startPos, endPos, "");
CV_CheckLE((size_t)endPos, inputShape.size(), "");
for (size_t i = startPos; i < endPos; i++)
{
if (inputShape[i] != 1)
return false;
}
return true;
}
CV__DNN_INLINE_NS_END
}
}
#endif
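
A minimal sketch (not part of this commit) exercising the shape helpers above on a 4-D NCHW blob:

#include <opencv2/dnn/shape_utils.hpp>
#include <iostream>

using namespace cv;
using namespace cv::dnn;

int main()
{
    Mat blob(shape(1, 3, 4, 5), CV_32F, Scalar(0));     // zero-filled NCHW tensor
    MatShape s = shape(blob);                            // {1, 3, 4, 5}
    CV_Assert(total(s) == 60 && total(s, 2) == 20);      // all elements / H*W only
    Mat channel = slice(blob, _Range(0), _Range(1));     // batch 0, channel 1, full H and W
    CV_Assert(shape(channel) == shape(1, 1, 4, 5));
    CV_Assert(normalize_axis(-1, s) == 3);               // last axis mapped into [0; dims)
    std::cout << toString(s, "blob") << std::endl;
    return 0;
}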

View File

@@ -0,0 +1,24 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_UTILS_DEBUG_UTILS_HPP
#define OPENCV_DNN_UTILS_DEBUG_UTILS_HPP
#include "../dnn.hpp"
namespace cv { namespace dnn {
CV__DNN_INLINE_NS_BEGIN
/**
* @brief Skip model import after diagnostic run in readNet() functions.
* @param[in] skip Indicates whether to skip the import.
*
* This is an internal OpenCV function not intended for users.
*/
CV_EXPORTS void skipModelImport(bool skip);
CV__DNN_INLINE_NS_END
}} // namespace
#endif // OPENCV_DNN_UTILS_DEBUG_UTILS_HPP

View File

@@ -0,0 +1,76 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2018-2019, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#ifndef OPENCV_DNN_UTILS_INF_ENGINE_HPP
#define OPENCV_DNN_UTILS_INF_ENGINE_HPP
#include "../dnn.hpp"
namespace cv { namespace dnn {
CV__DNN_INLINE_NS_BEGIN
/* Values for 'OPENCV_DNN_BACKEND_INFERENCE_ENGINE_TYPE' parameter */
#define CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API "NN_BUILDER"
#define CV_DNN_BACKEND_INFERENCE_ENGINE_NGRAPH "NGRAPH"
/** @brief Returns Inference Engine internal backend API.
*
* See values of `CV_DNN_BACKEND_INFERENCE_ENGINE_*` macros.
*
* Default value is controlled through `OPENCV_DNN_BACKEND_INFERENCE_ENGINE_TYPE` runtime parameter (environment variable).
*/
CV_EXPORTS_W cv::String getInferenceEngineBackendType();
/** @brief Specify Inference Engine internal backend API.
*
* See values of `CV_DNN_BACKEND_INFERENCE_ENGINE_*` macros.
*
* @returns previous value of internal backend API
*/
CV_EXPORTS_W cv::String setInferenceEngineBackendType(const cv::String& newBackendType);
/** @brief Release a Myriad device (bound by OpenCV).
 *
 * A single Myriad device cannot be shared across multiple processes that use
 * Inference Engine's Myriad plugin.
*/
CV_EXPORTS_W void resetMyriadDevice();
/* Values for 'OPENCV_DNN_IE_VPU_TYPE' parameter */
#define CV_DNN_INFERENCE_ENGINE_VPU_TYPE_UNSPECIFIED ""
/// Intel(R) Movidius(TM) Neural Compute Stick, NCS (USB 03e7:2150), Myriad2 (https://software.intel.com/en-us/movidius-ncs)
#define CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_2 "Myriad2"
/// Intel(R) Neural Compute Stick 2, NCS2 (USB 03e7:2485), MyriadX (https://software.intel.com/ru-ru/neural-compute-stick)
#define CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X "MyriadX"
#define CV_DNN_INFERENCE_ENGINE_CPU_TYPE_ARM_COMPUTE "ARM_COMPUTE"
#define CV_DNN_INFERENCE_ENGINE_CPU_TYPE_X86 "X86"
/** @brief Returns Inference Engine VPU type.
*
* See values of `CV_DNN_INFERENCE_ENGINE_VPU_TYPE_*` macros.
*/
CV_EXPORTS_W cv::String getInferenceEngineVPUType();
/** @brief Returns Inference Engine CPU type.
*
 * Specifies which OpenVINO CPU plugin is used: x86 or ARM.
*/
CV_EXPORTS_W cv::String getInferenceEngineCPUType();
/** @brief Release an HDDL plugin.
*/
CV_EXPORTS_W void releaseHDDLPlugin();
CV__DNN_INLINE_NS_END
}} // namespace
#endif // OPENCV_DNN_UTILS_INF_ENGINE_HPP
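
A hedged sketch of the runtime switches above; it assumes OpenCV was built with the Inference Engine (OpenVINO) backend, otherwise these calls raise an error:

#include <opencv2/dnn/utils/inference_engine.hpp>
#include <iostream>

int main()
{
    // Prefer the nGraph API and report the previously active one.
    cv::String previous = cv::dnn::setInferenceEngineBackendType(
        CV_DNN_BACKEND_INFERENCE_ENGINE_NGRAPH);
    std::cout << "previous backend: " << previous << std::endl;
    std::cout << "VPU type: '" << cv::dnn::getInferenceEngineVPUType() << "'" << std::endl;
    cv::dnn::resetMyriadDevice();   // free the Myriad stick for other processes
    return 0;
}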

View File

@@ -0,0 +1,21 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_VERSION_HPP
#define OPENCV_DNN_VERSION_HPP
/// Use with major OpenCV version only.
#define OPENCV_DNN_API_VERSION 20211004
#if !defined CV_DOXYGEN && !defined CV_STATIC_ANALYSIS && !defined CV_DNN_DONT_ADD_INLINE_NS
#define CV__DNN_INLINE_NS __CV_CAT(dnn4_v, OPENCV_DNN_API_VERSION)
#define CV__DNN_INLINE_NS_BEGIN namespace CV__DNN_INLINE_NS {
#define CV__DNN_INLINE_NS_END }
namespace cv { namespace dnn { namespace CV__DNN_INLINE_NS { } using namespace CV__DNN_INLINE_NS; }}
#else
#define CV__DNN_INLINE_NS_BEGIN
#define CV__DNN_INLINE_NS_END
#endif
#endif // OPENCV_DNN_VERSION_HPP
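
Illustrative only (assumes the default build, where CV_DNN_DONT_ADD_INLINE_NS is not defined): the macro above places every dnn symbol into a versioned inline namespace, so the two spellings below name the same type.

#include <opencv2/dnn.hpp>
#include <type_traits>

static_assert(std::is_same<cv::dnn::Net, cv::dnn::dnn4_v20211004::Net>::value,
              "dnn symbols live in the versioned inline namespace");

int main() { return 0; }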

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,196 @@
# This script is used to estimate the accuracy of different face detection models.
# The COCO evaluation tool is used to compute accuracy metrics (Average Precision).
# The script works with different face detection datasets.
import os
import json
from fnmatch import fnmatch
from math import pi
import cv2 as cv
import argparse
import sys
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
parser = argparse.ArgumentParser(
description='Evaluate OpenCV face detection algorithms '
'using COCO evaluation tool, http://cocodataset.org/#detections-eval')
parser.add_argument('--proto', help='Path to .prototxt of Caffe model or .pbtxt of TensorFlow graph')
parser.add_argument('--model', help='Path to .caffemodel trained in Caffe or .pb from TensorFlow')
parser.add_argument('--cascade', help='Optional path to trained Haar cascade as '
'an additional model for evaluation')
parser.add_argument('--ann', help='Path to text file with ground truth annotations')
parser.add_argument('--pics', help='Path to images root directory')
parser.add_argument('--fddb', help='Evaluate FDDB dataset, http://vis-www.cs.umass.edu/fddb/', action='store_true')
parser.add_argument('--wider', help='Evaluate WIDER FACE dataset, http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/', action='store_true')
args = parser.parse_args()
dataset = {}
dataset['images'] = []
dataset['categories'] = [{ 'id': 0, 'name': 'face' }]
dataset['annotations'] = []
def ellipse2Rect(params):
rad_x = params[0]
rad_y = params[1]
angle = params[2] * 180.0 / pi
center_x = params[3]
center_y = params[4]
pts = cv.ellipse2Poly((int(center_x), int(center_y)), (int(rad_x), int(rad_y)),
int(angle), 0, 360, 10)
rect = cv.boundingRect(pts)
left = rect[0]
top = rect[1]
right = rect[0] + rect[2]
bottom = rect[1] + rect[3]
return left, top, right, bottom
def addImage(imagePath):
assert('images' in dataset)
imageId = len(dataset['images'])
dataset['images'].append({
'id': int(imageId),
'file_name': imagePath
})
return imageId
def addBBox(imageId, left, top, width, height):
assert('annotations' in dataset)
dataset['annotations'].append({
'id': len(dataset['annotations']),
'image_id': int(imageId),
'category_id': 0, # Face
'bbox': [int(left), int(top), int(width), int(height)],
'iscrowd': 0,
'area': float(width * height)
})
def addDetection(detections, imageId, left, top, width, height, score):
detections.append({
'image_id': int(imageId),
'category_id': 0, # Face
'bbox': [int(left), int(top), int(width), int(height)],
'score': float(score)
})
def fddb_dataset(annotations, images):
for d in os.listdir(annotations):
if fnmatch(d, 'FDDB-fold-*-ellipseList.txt'):
with open(os.path.join(annotations, d), 'rt') as f:
lines = [line.rstrip('\n') for line in f]
lineId = 0
while lineId < len(lines):
# Image
imgPath = lines[lineId]
lineId += 1
imageId = addImage(os.path.join(images, imgPath) + '.jpg')
img = cv.imread(os.path.join(images, imgPath) + '.jpg')
# Faces
numFaces = int(lines[lineId])
lineId += 1
for i in range(numFaces):
params = [float(v) for v in lines[lineId].split()]
lineId += 1
left, top, right, bottom = ellipse2Rect(params)
addBBox(imageId, left, top, width=right - left + 1,
height=bottom - top + 1)
def wider_dataset(annotations, images):
with open(annotations, 'rt') as f:
lines = [line.rstrip('\n') for line in f]
lineId = 0
while lineId < len(lines):
# Image
imgPath = lines[lineId]
lineId += 1
imageId = addImage(os.path.join(images, imgPath))
# Faces
numFaces = int(lines[lineId])
lineId += 1
for i in range(numFaces):
params = [int(v) for v in lines[lineId].split()]
lineId += 1
left, top, width, height = params[0], params[1], params[2], params[3]
addBBox(imageId, left, top, width, height)
def evaluate():
cocoGt = COCO('annotations.json')
cocoDt = cocoGt.loadRes('detections.json')
cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()
### Convert to COCO annotations format #########################################
assert(args.fddb or args.wider)
if args.fddb:
fddb_dataset(args.ann, args.pics)
elif args.wider:
wider_dataset(args.ann, args.pics)
with open('annotations.json', 'wt') as f:
json.dump(dataset, f)
### Obtain detections ##########################################################
detections = []
if args.proto and args.model:
net = cv.dnn.readNet(args.proto, args.model)
def detect(img, imageId):
imgWidth = img.shape[1]
imgHeight = img.shape[0]
net.setInput(cv.dnn.blobFromImage(img, 1.0, (300, 300), (104., 177., 123.), False, False))
out = net.forward()
for i in range(out.shape[2]):
confidence = out[0, 0, i, 2]
left = int(out[0, 0, i, 3] * img.shape[1])
top = int(out[0, 0, i, 4] * img.shape[0])
right = int(out[0, 0, i, 5] * img.shape[1])
bottom = int(out[0, 0, i, 6] * img.shape[0])
x = max(0, min(left, img.shape[1] - 1))
y = max(0, min(top, img.shape[0] - 1))
w = max(0, min(right - x + 1, img.shape[1] - x))
h = max(0, min(bottom - y + 1, img.shape[0] - y))
addDetection(detections, imageId, x, y, w, h, score=confidence)
elif args.cascade:
cascade = cv.CascadeClassifier(args.cascade)
def detect(img, imageId):
srcImgGray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
faces = cascade.detectMultiScale(srcImgGray)
for rect in faces:
left, top, width, height = rect[0], rect[1], rect[2], rect[3]
addDetection(detections, imageId, left, top, width, height, score=1.0)
for i in range(len(dataset['images'])):
sys.stdout.write('\r%d / %d' % (i + 1, len(dataset['images'])))
sys.stdout.flush()
img = cv.imread(dataset['images'][i]['file_name'])
imageId = int(dataset['images'][i]['id'])
detect(img, imageId)
with open('detections.json', 'wt') as f:
json.dump(detections, f)
evaluate()
def rm(f):
if os.path.exists(f):
os.remove(f)
rm('annotations.json')
rm('detections.json')

View File

@@ -0,0 +1 @@
misc/java/src/cpp/dnn_converters.hpp

View File

@@ -0,0 +1,63 @@
{
"type_dict": {
"MatShape": {
"j_type": "MatOfInt",
"jn_type": "long",
"jni_type": "jlong",
"jni_var": "MatShape %(n)s",
"suffix": "J",
"v_type": "Mat",
"j_import": "org.opencv.core.MatOfInt"
},
"vector_MatShape": {
"j_type": "List<MatOfInt>",
"jn_type": "List<MatOfInt>",
"jni_type": "jobject",
"jni_var": "std::vector< MatShape > %(n)s",
"suffix": "Ljava_util_List",
"v_type": "vector_MatShape",
"j_import": "org.opencv.core.MatOfInt"
},
"vector_size_t": {
"j_type": "MatOfDouble",
"jn_type": "long",
"jni_type": "jlong",
"jni_var": "std::vector<size_t> %(n)s",
"suffix": "J",
"v_type": "Mat",
"j_import": "org.opencv.core.MatOfDouble"
},
"vector_Ptr_Layer": {
"j_type": "List<Layer>",
"jn_type": "List<Layer>",
"jni_type": "jobject",
"jni_var": "std::vector< Ptr<cv::dnn::Layer> > %(n)s",
"suffix": "Ljava_util_List",
"v_type": "vector_Layer",
"j_import": "org.opencv.dnn.Layer"
},
"vector_Target": {
"j_type": "List<Integer>",
"jn_type": "List<Integer>",
"jni_type": "jobject",
"jni_var": "std::vector< cv::dnn::Target > %(n)s",
"suffix": "Ljava_util_List",
"v_type": "vector_Target"
},
"LayerId": {
"j_type": "DictValue",
"jn_type": "long",
"jn_args": [
[
"__int64",
".getNativeObjAddr()"
]
],
"jni_name": "(*(*(Ptr<cv::dnn::DictValue>*)%(n)s_nativeObj))",
"jni_type": "jlong",
"suffix": "J",
"j_import": "org.opencv.dnn.DictValue"
}
}
}

View File

@@ -0,0 +1,102 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
// Author: abratchik
#include "dnn_converters.hpp"
#define LOG_TAG "org.opencv.dnn"
void Mat_to_MatShape(cv::Mat& mat, MatShape& matshape)
{
matshape.clear();
CHECK_MAT(mat.type()==CV_32SC1 && mat.cols==1);
matshape = (MatShape) mat;
}
void MatShape_to_Mat(MatShape& matshape, cv::Mat& mat)
{
mat = cv::Mat(matshape, true);
}
std::vector<MatShape> List_to_vector_MatShape(JNIEnv* env, jobject list)
{
static jclass juArrayList = ARRAYLIST(env);
jmethodID m_size = LIST_SIZE(env, juArrayList);
jmethodID m_get = LIST_GET(env, juArrayList);
static jclass jMatOfInt = MATOFINT(env);
jint len = env->CallIntMethod(list, m_size);
std::vector<MatShape> result;
result.reserve(len);
for (jint i=0; i<len; i++)
{
jobject element = static_cast<jobject>(env->CallObjectMethod(list, m_get, i));
cv::Mat& mat = *((cv::Mat*) GETNATIVEOBJ(env, jMatOfInt, element) );
MatShape matshape = (MatShape) mat;
result.push_back(matshape);
env->DeleteLocalRef(element);
}
return result;
}
jobject vector_Ptr_Layer_to_List(JNIEnv* env, std::vector<cv::Ptr<cv::dnn::Layer> >& vs)
{
static jclass juArrayList = ARRAYLIST(env);
static jmethodID m_create = CONSTRUCTOR(env, juArrayList);
jmethodID m_add = LIST_ADD(env, juArrayList);
static jclass jLayerClass = LAYER(env);
static jmethodID m_create_layer = LAYER_CONSTRUCTOR(env, jLayerClass);
jobject result = env->NewObject(juArrayList, m_create, vs.size());
for (std::vector< cv::Ptr<cv::dnn::Layer> >::iterator it = vs.begin(); it != vs.end(); ++it) {
jobject element = env->NewObject(jLayerClass, m_create_layer, (*it).get());
env->CallBooleanMethod(result, m_add, element);
env->DeleteLocalRef(element);
}
return result;
}
jobject vector_Target_to_List(JNIEnv* env, std::vector<cv::dnn::Target>& vs)
{
static jclass juArrayList = ARRAYLIST(env);
static jmethodID m_create = CONSTRUCTOR(env, juArrayList);
jmethodID m_add = LIST_ADD(env, juArrayList);
static jclass jInteger = env->FindClass("java/lang/Integer");
static jmethodID m_create_Integer = env->GetMethodID(jInteger, "<init>", "(I)V");
jobject result = env->NewObject(juArrayList, m_create, vs.size());
for (size_t i = 0; i < vs.size(); ++i)
{
jobject element = env->NewObject(jInteger, m_create_Integer, vs[i]);
env->CallBooleanMethod(result, m_add, element);
env->DeleteLocalRef(element);
}
return result;
}
std::vector<cv::Ptr<cv::dnn::Layer> > List_to_vector_Ptr_Layer(JNIEnv* env, jobject list)
{
static jclass juArrayList = ARRAYLIST(env);
jmethodID m_size = LIST_SIZE(env, juArrayList);
jmethodID m_get = LIST_GET(env, juArrayList);
static jclass jLayerClass = LAYER(env);
jint len = env->CallIntMethod(list, m_size);
std::vector< cv::Ptr<cv::dnn::Layer> > result;
result.reserve(len);
for (jint i=0; i<len; i++)
{
jobject element = static_cast<jobject>(env->CallObjectMethod(list, m_get, i));
cv::Ptr<cv::dnn::Layer>* layer_ptr = (cv::Ptr<cv::dnn::Layer>*) GETNATIVEOBJ(env, jLayerClass, element) ;
cv::Ptr<cv::dnn::Layer> layer = *(layer_ptr);
result.push_back(layer);
env->DeleteLocalRef(element);
}
return result;
}

View File

@@ -0,0 +1,33 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
// Author: abratchik
#ifndef DNN_CONVERTERS_HPP
#define DNN_CONVERTERS_HPP
#include <jni.h>
#include "opencv_java.hpp"
#include "opencv2/core.hpp"
#include "opencv2/dnn/dnn.hpp"
#define LAYER(ENV) static_cast<jclass>(ENV->NewGlobalRef(ENV->FindClass("org/opencv/dnn/Layer")))
#define LAYER_CONSTRUCTOR(ENV, CLS) ENV->GetMethodID(CLS, "<init>", "(J)V")
using namespace cv::dnn;
void Mat_to_MatShape(cv::Mat& mat, MatShape& matshape);
void MatShape_to_Mat(MatShape& matshape, cv::Mat& mat);
std::vector<MatShape> List_to_vector_MatShape(JNIEnv* env, jobject list);
jobject vector_Ptr_Layer_to_List(JNIEnv* env, std::vector<cv::Ptr<cv::dnn::Layer> >& vs);
std::vector<cv::Ptr<cv::dnn::Layer> > List_to_vector_Ptr_Layer(JNIEnv* env, jobject list);
jobject vector_Target_to_List(JNIEnv* env, std::vector<cv::dnn::Target>& vs);
#endif /* DNN_CONVERTERS_HPP */

View File

@@ -0,0 +1,119 @@
package org.opencv.test.dnn;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.opencv.core.Core;
import org.opencv.core.Mat;
import org.opencv.core.MatOfInt;
import org.opencv.core.MatOfFloat;
import org.opencv.core.MatOfByte;
import org.opencv.core.Scalar;
import org.opencv.core.Size;
import org.opencv.dnn.DictValue;
import org.opencv.dnn.Dnn;
import org.opencv.dnn.Layer;
import org.opencv.dnn.Net;
import org.opencv.imgcodecs.Imgcodecs;
import org.opencv.imgproc.Imgproc;
import org.opencv.test.OpenCVTestCase;
/*
* regression test for #12324,
* testing various java.util.List invocations,
* which use the LIST_GET macro
*/
public class DnnListRegressionTest extends OpenCVTestCase {
private final static String ENV_OPENCV_DNN_TEST_DATA_PATH = "OPENCV_DNN_TEST_DATA_PATH";
private final static String ENV_OPENCV_TEST_DATA_PATH = "OPENCV_TEST_DATA_PATH";
String modelFileName = "";
String sourceImageFile = "";
Net net;
@Override
protected void setUp() throws Exception {
super.setUp();
String envDnnTestDataPath = System.getenv(ENV_OPENCV_DNN_TEST_DATA_PATH);
if(envDnnTestDataPath == null){
isTestCaseEnabled = false;
return;
}
File dnnTestDataPath = new File(envDnnTestDataPath);
modelFileName = new File(dnnTestDataPath, "dnn/tensorflow_inception_graph.pb").toString();
String envTestDataPath = System.getenv(ENV_OPENCV_TEST_DATA_PATH);
if(envTestDataPath == null) throw new Exception(ENV_OPENCV_TEST_DATA_PATH + " has to be defined!");
File testDataPath = new File(envTestDataPath);
File f = new File(testDataPath, "dnn/grace_hopper_227.png");
sourceImageFile = f.toString();
if(!f.exists()) throw new Exception("Test image is missing: " + sourceImageFile);
net = Dnn.readNetFromTensorflow(modelFileName);
Mat image = Imgcodecs.imread(sourceImageFile);
assertNotNull("Loading image from file failed!", image);
Mat inputBlob = Dnn.blobFromImage(image, 1.0, new Size(224, 224), new Scalar(0), true, true);
assertNotNull("Converting image to blob failed!", inputBlob);
net.setInput(inputBlob, "input");
}
public void testSetInputsNames() {
List<String> inputs = new ArrayList();
inputs.add("input");
try {
net.setInputsNames(inputs);
} catch(Exception e) {
fail("Net setInputsNames failed: " + e.getMessage());
}
}
public void testForward() {
List<Mat> outs = new ArrayList();
List<String> outNames = new ArrayList();
outNames.add("softmax2");
try {
net.forward(outs,outNames);
} catch(Exception e) {
fail("Net forward failed: " + e.getMessage());
}
}
public void testGetMemoryConsumption() {
int layerId = 1;
List<MatOfInt> netInputShapes = new ArrayList();
netInputShapes.add(new MatOfInt(1, 3, 224, 224));
long[] weights=null;
long[] blobs=null;
try {
net.getMemoryConsumption(layerId, netInputShapes, weights, blobs);
} catch(Exception e) {
fail("Net getMemoryConsumption failed: " + e.getMessage());
}
}
public void testGetFLOPS() {
int layerId = 1;
List<MatOfInt> netInputShapes = new ArrayList();
netInputShapes.add(new MatOfInt(1, 3, 224, 224));
try {
net.getFLOPS(layerId, netInputShapes);
} catch(Exception e) {
fail("Net getFLOPS failed: " + e.getMessage());
}
}
}

View File

@@ -0,0 +1,149 @@
package org.opencv.test.dnn;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.opencv.core.Core;
import org.opencv.core.Mat;
import org.opencv.core.MatOfFloat;
import org.opencv.core.MatOfByte;
import org.opencv.core.Scalar;
import org.opencv.core.Size;
import org.opencv.dnn.DictValue;
import org.opencv.dnn.Dnn;
import org.opencv.dnn.Layer;
import org.opencv.dnn.Net;
import org.opencv.imgcodecs.Imgcodecs;
import org.opencv.imgproc.Imgproc;
import org.opencv.test.OpenCVTestCase;
public class DnnTensorFlowTest extends OpenCVTestCase {
private final static String ENV_OPENCV_DNN_TEST_DATA_PATH = "OPENCV_DNN_TEST_DATA_PATH";
private final static String ENV_OPENCV_TEST_DATA_PATH = "OPENCV_TEST_DATA_PATH";
String modelFileName = "";
String sourceImageFile = "";
Net net;
private static void normAssert(Mat ref, Mat test) {
final double l1 = 1e-5;
final double lInf = 1e-4;
double normL1 = Core.norm(ref, test, Core.NORM_L1) / ref.total();
double normLInf = Core.norm(ref, test, Core.NORM_INF) / ref.total();
assertTrue(normL1 < l1);
assertTrue(normLInf < lInf);
}
@Override
protected void setUp() throws Exception {
super.setUp();
String envDnnTestDataPath = System.getenv(ENV_OPENCV_DNN_TEST_DATA_PATH);
if(envDnnTestDataPath == null){
isTestCaseEnabled = false;
return;
}
File dnnTestDataPath = new File(envDnnTestDataPath);
modelFileName = new File(dnnTestDataPath, "dnn/tensorflow_inception_graph.pb").toString();
String envTestDataPath = System.getenv(ENV_OPENCV_TEST_DATA_PATH);
if(envTestDataPath == null) throw new Exception(ENV_OPENCV_TEST_DATA_PATH + " has to be defined!");
File testDataPath = new File(envTestDataPath);
File f = new File(testDataPath, "dnn/grace_hopper_227.png");
sourceImageFile = f.toString();
if(!f.exists()) throw new Exception("Test image is missing: " + sourceImageFile);
net = Dnn.readNetFromTensorflow(modelFileName);
}
public void testGetLayerTypes() {
List<String> layertypes = new ArrayList();
net.getLayerTypes(layertypes);
assertFalse("No layer types returned!", layertypes.isEmpty());
}
public void testGetLayer() {
List<String> layernames = net.getLayerNames();
assertFalse("Test net returned no layers!", layernames.isEmpty());
String testLayerName = layernames.get(0);
DictValue layerId = new DictValue(testLayerName);
assertEquals("DictValue did not return the string, which was used in constructor!", testLayerName, layerId.getStringValue());
Layer layer = net.getLayer(layerId);
assertEquals("Layer name does not match the expected value!", testLayerName, layer.get_name());
}
public void checkInceptionNet(Net net)
{
Mat image = Imgcodecs.imread(sourceImageFile);
assertNotNull("Loading image from file failed!", image);
Mat inputBlob = Dnn.blobFromImage(image, 1.0, new Size(224, 224), new Scalar(0), true, true);
assertNotNull("Converting image to blob failed!", inputBlob);
net.setInput(inputBlob, "input");
Mat result = new Mat();
try {
net.setPreferableBackend(Dnn.DNN_BACKEND_OPENCV);
result = net.forward("softmax2");
}
catch (Exception e) {
fail("DNN forward failed: " + e.getMessage());
}
assertNotNull("Net returned no result!", result);
result = result.reshape(1, 1);
Core.MinMaxLocResult minmax = Core.minMaxLoc(result);
assertEquals("Wrong prediction", (int)minmax.maxLoc.x, 866);
Mat top5RefScores = new MatOfFloat(new float[] {
0.63032645f, 0.2561979f, 0.032181446f, 0.015721032f, 0.014785315f
}).reshape(1, 1);
Core.sort(result, result, Core.SORT_DESCENDING);
normAssert(result.colRange(0, 5), top5RefScores);
}
public void testTestNetForward() {
checkInceptionNet(net);
}
public void testReadFromBuffer() {
File modelFile = new File(modelFileName);
byte[] modelBuffer = new byte[ (int)modelFile.length() ];
try {
FileInputStream fis = new FileInputStream(modelFile);
fis.read(modelBuffer);
fis.close();
} catch (IOException e) {
fail("Failed to read a model: " + e.getMessage());
}
net = Dnn.readNetFromTensorflow(new MatOfByte(modelBuffer));
checkInceptionNet(net);
}
public void testGetAvailableTargets() {
List<Integer> targets = Dnn.getAvailableTargets(Dnn.DNN_BACKEND_OPENCV);
assertTrue(targets.contains(Dnn.DNN_TARGET_CPU));
}
}

View File

@@ -0,0 +1,46 @@
{
"func_arg_fix" : {
"Dnn": {
"(Net*)readNetFromCaffe:(NSString*)prototxt caffeModel:(NSString*)caffeModel" : { "readNetFromCaffe" : {"name" : "readNetFromCaffeFile"} },
"(Net*)readNetFromCaffe:(ByteVector*)bufferProto bufferModel:(ByteVector*)bufferModel" : { "readNetFromCaffe" : {"name" : "readNetFromCaffeBuffer"} },
"(Net*)readNetFromDarknet:(NSString*)cfgFile darknetModel:(NSString*)darknetModel" : { "readNetFromDarknet" : {"name" : "readNetFromDarknetFile"} },
"(Net*)readNetFromDarknet:(ByteVector*)bufferCfg bufferModel:(ByteVector*)bufferModel" : { "readNetFromDarknet" : {"name" : "readNetFromDarknetBuffer"} },
"(Net*)readNetFromONNX:(NSString*)onnxFile" : { "readNetFromONNX" : {"name" : "readNetFromONNXFile"} },
"(Net*)readNetFromONNX:(ByteVector*)buffer" : { "readNetFromONNX" : {"name" : "readNetFromONNXBuffer"} },
"(Net*)readNetFromTensorflow:(NSString*)model config:(NSString*)config" : { "readNetFromTensorflow" : {"name" : "readNetFromTensorflowFile"} },
"(Net*)readNetFromTensorflow:(ByteVector*)bufferModel bufferConfig:(ByteVector*)bufferConfig" : { "readNetFromTensorflow" : {"name" : "readNetFromTensorflowBuffer"} }
},
"Net": {
"(void)forward:(NSMutableArray<Mat*>*)outputBlobs outputName:(NSString*)outputName" : { "forward" : {"name" : "forwardOutputBlobs"} },
"(void)forward:(NSMutableArray<Mat*>*)outputBlobs outBlobNames:(NSArray<NSString*>*)outBlobNames" : { "forward" : {"name" : "forwardOutputBlobs"} },
"(void)forwardAndRetrieve:(NSMutableArray<NSMutableArray<Mat*>*>*)outputBlobs outBlobNames:(NSArray<NSString*>*)outBlobNames" : { "forward" : {"swift_name" : "forwardAndRetrieve"} },
"(long)getFLOPS:(IntVector*)netInputShape" : { "getFLOPS" : {"name" : "getFLOPSWithNetInputShape"} },
"(long)getFLOPS:(NSArray<IntVector*>*)netInputShapes" : { "getFLOPS" : {"name" : "getFLOPSWithNetInputShapes"} },
"(long)getFLOPS:(int)layerId netInputShape:(IntVector*)netInputShape" : { "getFLOPS" : {"name" : "getFLOPSWithLayerId"} },
"(long)getFLOPS:(int)layerId netInputShapes:(NSArray<IntVector*>*)netInputShapes" : { "getFLOPS" : {"name" : "getFLOPSWithLayerId"} },
"(void)getLayersShapes:(IntVector*)netInputShape layersIds:(IntVector*)layersIds inLayersShapes:(NSMutableArray<NSMutableArray<IntVector*>*>*)inLayersShapes outLayersShapes:(NSMutableArray<NSMutableArray<IntVector*>*>*)outLayersShapes" : { "getLayersShapes" : {"name" : "getLayersShapesWithNetInputShape"} },
"(void)getLayersShapes:(NSArray<IntVector*>*)netInputShapes layersIds:(IntVector*)layersIds inLayersShapes:(NSMutableArray<NSMutableArray<IntVector*>*>*)inLayersShapes outLayersShapes:(NSMutableArray<NSMutableArray<IntVector*>*>*)outLayersShapes" : { "getLayersShapes" : {"name" : "getLayersShapesWithNetInputShapes"} }
}
},
"type_dict": {
"MatShape": {
"objc_type": "IntVector*",
"to_cpp": "%(n)s.nativeRef",
"from_cpp": "[IntVector fromNative:%(n)s]",
"cast_to": "std::vector<int>"
},
"vector_MatShape": {
"objc_type": "IntVector*",
"v_type": "IntVector"
},
"vector_vector_MatShape": {
"objc_type": "IntVector*",
"v_v_type": "IntVector"
},
"LayerId": {
"objc_type": "DictValue*",
"to_cpp": "*(cv::dnn::DictValue*)(%(n)s.nativePtr)",
"from_cpp": "[DictValue fromNative:%(n)s]"
}
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,219 @@
#ifdef HAVE_OPENCV_DNN
typedef dnn::DictValue LayerId;
typedef std::vector<dnn::MatShape> vector_MatShape;
typedef std::vector<std::vector<dnn::MatShape> > vector_vector_MatShape;
template<>
bool pyopencv_to(PyObject *o, dnn::DictValue &dv, const ArgInfo& info)
{
CV_UNUSED(info);
if (!o || o == Py_None)
return true; //Current state will be used
else if (PyLong_Check(o))
{
dv = dnn::DictValue((int64)PyLong_AsLongLong(o));
return true;
}
else if (PyInt_Check(o))
{
dv = dnn::DictValue((int64)PyInt_AS_LONG(o));
return true;
}
else if (PyFloat_Check(o))
{
dv = dnn::DictValue(PyFloat_AsDouble(o));
return true;
}
else
{
std::string str;
if (getUnicodeString(o, str))
{
dv = dnn::DictValue(str);
return true;
}
}
return false;
}
template<typename T>
PyObject* pyopencv_from(const dnn::DictValue &dv)
{
if (dv.size() > 1)
{
std::vector<T> vec(dv.size());
for (int i = 0; i < dv.size(); ++i)
vec[i] = dv.get<T>(i);
return pyopencv_from_generic_vec(vec);
}
else
return pyopencv_from(dv.get<T>());
}
template<>
PyObject* pyopencv_from(const dnn::DictValue &dv)
{
if (dv.isInt()) return pyopencv_from<int>(dv);
if (dv.isReal()) return pyopencv_from<float>(dv);
if (dv.isString()) return pyopencv_from<String>(dv);
CV_Error(Error::StsNotImplemented, "Unknown value type");
return NULL;
}
template<>
PyObject* pyopencv_from(const dnn::LayerParams& lp)
{
PyObject* dict = PyDict_New();
for (std::map<String, dnn::DictValue>::const_iterator it = lp.begin(); it != lp.end(); ++it)
{
CV_Assert(!PyDict_SetItemString(dict, it->first.c_str(), pyopencv_from(it->second)));
}
return dict;
}
template<>
PyObject* pyopencv_from(const std::vector<dnn::Target> &t)
{
return pyopencv_from(std::vector<int>(t.begin(), t.end()));
}
class pycvLayer CV_FINAL : public dnn::Layer
{
public:
pycvLayer(const dnn::LayerParams &params, PyObject* pyLayer) : Layer(params)
{
PyGILState_STATE gstate;
gstate = PyGILState_Ensure();
PyObject* args = PyTuple_New(2);
CV_Assert(!PyTuple_SetItem(args, 0, pyopencv_from(params)));
CV_Assert(!PyTuple_SetItem(args, 1, pyopencv_from(params.blobs)));
o = PyObject_CallObject(pyLayer, args);
Py_DECREF(args);
PyGILState_Release(gstate);
if (!o)
CV_Error(Error::StsError, "Failed to create an instance of custom layer");
}
static void registerLayer(const std::string& type, PyObject* o)
{
std::map<std::string, std::vector<PyObject*> >::iterator it = pyLayers.find(type);
if (it != pyLayers.end())
it->second.push_back(o);
else
pyLayers[type] = std::vector<PyObject*>(1, o);
}
static void unregisterLayer(const std::string& type)
{
std::map<std::string, std::vector<PyObject*> >::iterator it = pyLayers.find(type);
if (it != pyLayers.end())
{
if (it->second.size() > 1)
it->second.pop_back();
else
pyLayers.erase(it);
}
}
static Ptr<dnn::Layer> create(dnn::LayerParams &params)
{
std::map<std::string, std::vector<PyObject*> >::iterator it = pyLayers.find(params.type);
if (it == pyLayers.end())
CV_Error(Error::StsNotImplemented, "Layer with a type \"" + params.type +
"\" is not implemented");
CV_Assert(!it->second.empty());
return Ptr<dnn::Layer>(new pycvLayer(params, it->second.back()));
}
virtual bool getMemoryShapes(const std::vector<std::vector<int> > &inputs,
const int,
std::vector<std::vector<int> > &outputs,
std::vector<std::vector<int> > &) const CV_OVERRIDE
{
PyGILState_STATE gstate;
gstate = PyGILState_Ensure();
PyObject* args = PyList_New(inputs.size());
for(size_t i = 0; i < inputs.size(); ++i)
PyList_SetItem(args, i, pyopencv_from_generic_vec(inputs[i]));
PyObject* res = PyObject_CallMethodObjArgs(o, PyString_FromString("getMemoryShapes"), args, NULL);
Py_DECREF(args);
PyGILState_Release(gstate);
if (!res)
CV_Error(Error::StsNotImplemented, "Failed to call \"getMemoryShapes\" method");
CV_Assert(pyopencv_to_generic_vec(res, outputs, ArgInfo("", 0)));
return false;
}
virtual void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays) CV_OVERRIDE
{
PyGILState_STATE gstate;
gstate = PyGILState_Ensure();
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
PyObject* args = pyopencv_from(inputs);
PyObject* res = PyObject_CallMethodObjArgs(o, PyString_FromString("forward"), args, NULL);
Py_DECREF(args);
if (!res)
CV_Error(Error::StsNotImplemented, "Failed to call \"forward\" method");
std::vector<Mat> pyOutputs;
CV_Assert(pyopencv_to(res, pyOutputs, ArgInfo("", 0)));
Py_DECREF(res);
PyGILState_Release(gstate);
CV_Assert(pyOutputs.size() == outputs.size());
for (size_t i = 0; i < outputs.size(); ++i)
{
CV_Assert(pyOutputs[i].size == outputs[i].size);
CV_Assert(pyOutputs[i].type() == outputs[i].type());
pyOutputs[i].copyTo(outputs[i]);
}
}
private:
    // Maps layer types to Python classes.
static std::map<std::string, std::vector<PyObject*> > pyLayers;
PyObject* o; // Instance of implemented python layer.
};
std::map<std::string, std::vector<PyObject*> > pycvLayer::pyLayers;
static PyObject *pyopencv_cv_dnn_registerLayer(PyObject*, PyObject *args, PyObject *kw)
{
const char *keywords[] = { "type", "class", NULL };
char* layerType;
PyObject *classInstance;
if (!PyArg_ParseTupleAndKeywords(args, kw, "sO", (char**)keywords, &layerType, &classInstance))
return NULL;
if (!PyCallable_Check(classInstance)) {
PyErr_SetString(PyExc_TypeError, "class must be callable");
return NULL;
}
pycvLayer::registerLayer(layerType, classInstance);
dnn::LayerFactory::registerLayer(layerType, pycvLayer::create);
Py_RETURN_NONE;
}
static PyObject *pyopencv_cv_dnn_unregisterLayer(PyObject*, PyObject *args, PyObject *kw)
{
const char *keywords[] = { "type", NULL };
char* layerType;
if (!PyArg_ParseTupleAndKeywords(args, kw, "s", (char**)keywords, &layerType))
return NULL;
pycvLayer::unregisterLayer(layerType);
dnn::LayerFactory::unregisterLayer(layerType);
Py_RETURN_NONE;
}
#endif // HAVE_OPENCV_DNN

View File

@@ -0,0 +1,415 @@
#!/usr/bin/env python
import os
import cv2 as cv
import numpy as np
from tests_common import NewOpenCVTests, unittest
def normAssert(test, a, b, msg=None, lInf=1e-5):
test.assertLess(np.max(np.abs(a - b)), lInf, msg)
def inter_area(box1, box2):
x_min, x_max = max(box1[0], box2[0]), min(box1[2], box2[2])
y_min, y_max = max(box1[1], box2[1]), min(box1[3], box2[3])
return (x_max - x_min) * (y_max - y_min)
def area(box):
return (box[2] - box[0]) * (box[3] - box[1])
def box2str(box):
left, top = box[0], box[1]
width, height = box[2] - left, box[3] - top
return '[%f x %f from (%f, %f)]' % (width, height, left, top)
def normAssertDetections(test, refClassIds, refScores, refBoxes, testClassIds, testScores, testBoxes,
confThreshold=0.0, scores_diff=1e-5, boxes_iou_diff=1e-4):
matchedRefBoxes = [False] * len(refBoxes)
errMsg = ''
for i in range(len(testBoxes)):
testScore = testScores[i]
if testScore < confThreshold:
continue
testClassId, testBox = testClassIds[i], testBoxes[i]
matched = False
for j in range(len(refBoxes)):
if (not matchedRefBoxes[j]) and testClassId == refClassIds[j] and \
abs(testScore - refScores[j]) < scores_diff:
interArea = inter_area(testBox, refBoxes[j])
iou = interArea / (area(testBox) + area(refBoxes[j]) - interArea)
if abs(iou - 1.0) < boxes_iou_diff:
matched = True
matchedRefBoxes[j] = True
if not matched:
errMsg += '\nUnmatched prediction: class %d score %f box %s' % (testClassId, testScore, box2str(testBox))
for i in range(len(refBoxes)):
if (not matchedRefBoxes[i]) and refScores[i] > confThreshold:
errMsg += '\nUnmatched reference: class %d score %f box %s' % (refClassIds[i], refScores[i], box2str(refBoxes[i]))
if errMsg:
test.fail(errMsg)
def printParams(backend, target):
backendNames = {
cv.dnn.DNN_BACKEND_OPENCV: 'OCV',
cv.dnn.DNN_BACKEND_INFERENCE_ENGINE: 'DLIE'
}
targetNames = {
cv.dnn.DNN_TARGET_CPU: 'CPU',
cv.dnn.DNN_TARGET_OPENCL: 'OCL',
cv.dnn.DNN_TARGET_OPENCL_FP16: 'OCL_FP16',
cv.dnn.DNN_TARGET_MYRIAD: 'MYRIAD'
}
print('%s/%s' % (backendNames[backend], targetNames[target]))
def getDefaultThreshold(target):
if target == cv.dnn.DNN_TARGET_OPENCL_FP16 or target == cv.dnn.DNN_TARGET_MYRIAD:
return 4e-3
else:
return 1e-5
testdata_required = bool(os.environ.get('OPENCV_DNN_TEST_REQUIRE_TESTDATA', False))
g_dnnBackendsAndTargets = None
class dnn_test(NewOpenCVTests):
def setUp(self):
super(dnn_test, self).setUp()
global g_dnnBackendsAndTargets
if g_dnnBackendsAndTargets is None:
g_dnnBackendsAndTargets = self.initBackendsAndTargets()
self.dnnBackendsAndTargets = g_dnnBackendsAndTargets
def initBackendsAndTargets(self):
self.dnnBackendsAndTargets = [
[cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_CPU],
]
if self.checkIETarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_CPU):
self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_CPU])
if self.checkIETarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_MYRIAD):
self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_MYRIAD])
if cv.ocl.haveOpenCL() and cv.ocl.useOpenCL():
self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_OPENCL])
self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_OPENCL_FP16])
if cv.ocl_Device.getDefault().isIntel():
if self.checkIETarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL):
self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL])
if self.checkIETarget(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL_FP16):
self.dnnBackendsAndTargets.append([cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_TARGET_OPENCL_FP16])
return self.dnnBackendsAndTargets
def find_dnn_file(self, filename, required=True):
if not required:
required = testdata_required
return self.find_file(filename, [os.environ.get('OPENCV_DNN_TEST_DATA_PATH', os.getcwd()),
os.environ['OPENCV_TEST_DATA_PATH']],
required=required)
def checkIETarget(self, backend, target):
proto = self.find_dnn_file('dnn/layers/layer_convolution.prototxt')
model = self.find_dnn_file('dnn/layers/layer_convolution.caffemodel')
net = cv.dnn.readNet(proto, model)
net.setPreferableBackend(backend)
net.setPreferableTarget(target)
inp = np.random.standard_normal([1, 2, 10, 11]).astype(np.float32)
try:
net.setInput(inp)
net.forward()
except BaseException as e:
return False
return True
def test_getAvailableTargets(self):
targets = cv.dnn.getAvailableTargets(cv.dnn.DNN_BACKEND_OPENCV)
self.assertTrue(cv.dnn.DNN_TARGET_CPU in targets)
def test_blobFromImage(self):
np.random.seed(324)
width = 6
height = 7
scale = 1.0/127.5
mean = (10, 20, 30)
# Test arguments names.
img = np.random.randint(0, 255, [4, 5, 3]).astype(np.uint8)
blob = cv.dnn.blobFromImage(img, scale, (width, height), mean, True, False)
blob_args = cv.dnn.blobFromImage(img, scalefactor=scale, size=(width, height),
mean=mean, swapRB=True, crop=False)
normAssert(self, blob, blob_args)
# Test values.
target = cv.resize(img, (width, height), interpolation=cv.INTER_LINEAR)
target = target.astype(np.float32)
target = target[:,:,[2, 1, 0]] # BGR2RGB
target[:,:,0] -= mean[0]
target[:,:,1] -= mean[1]
target[:,:,2] -= mean[2]
target *= scale
target = target.transpose(2, 0, 1).reshape(1, 3, height, width) # to NCHW
normAssert(self, blob, target)
def test_model(self):
img_path = self.find_dnn_file("dnn/street.png")
weights = self.find_dnn_file("dnn/MobileNetSSD_deploy.caffemodel", required=False)
config = self.find_dnn_file("dnn/MobileNetSSD_deploy.prototxt", required=False)
if weights is None or config is None:
raise unittest.SkipTest("Missing DNN test files (dnn/MobileNetSSD_deploy.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
frame = cv.imread(img_path)
model = cv.dnn_DetectionModel(weights, config)
model.setInputParams(size=(300, 300), mean=(127.5, 127.5, 127.5), scale=1.0/127.5)
iouDiff = 0.05
confThreshold = 0.0001
nmsThreshold = 0
scoreDiff = 1e-3
classIds, confidences, boxes = model.detect(frame, confThreshold, nmsThreshold)
refClassIds = (7, 15)
refConfidences = (0.9998, 0.8793)
refBoxes = ((328, 238, 85, 102), (101, 188, 34, 138))
normAssertDetections(self, refClassIds, refConfidences, refBoxes,
classIds, confidences, boxes,confThreshold, scoreDiff, iouDiff)
for box in boxes:
cv.rectangle(frame, box, (0, 255, 0))
cv.rectangle(frame, np.array(box), (0, 255, 0))
cv.rectangle(frame, tuple(box), (0, 255, 0))
cv.rectangle(frame, list(box), (0, 255, 0))
def test_classification_model(self):
img_path = self.find_dnn_file("dnn/googlenet_0.png")
weights = self.find_dnn_file("dnn/squeezenet_v1.1.caffemodel", required=False)
config = self.find_dnn_file("dnn/squeezenet_v1.1.prototxt")
ref = np.load(self.find_dnn_file("dnn/squeezenet_v1.1_prob.npy"))
if weights is None or config is None:
raise unittest.SkipTest("Missing DNN test files (dnn/squeezenet_v1.1.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
frame = cv.imread(img_path)
model = cv.dnn_ClassificationModel(config, weights)
model.setInputSize(227, 227)
model.setInputCrop(True)
out = model.predict(frame)
normAssert(self, out, ref)
def test_textdetection_model(self):
img_path = self.find_dnn_file("dnn/text_det_test1.png")
weights = self.find_dnn_file("dnn/onnx/models/DB_TD500_resnet50.onnx", required=False)
if weights is None:
raise unittest.SkipTest("Missing DNN test files (onnx/models/DB_TD500_resnet50.onnx). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
frame = cv.imread(img_path)
scale = 1.0 / 255.0
size = (736, 736)
mean = (122.67891434, 116.66876762, 104.00698793)
model = cv.dnn_TextDetectionModel_DB(weights)
model.setInputParams(scale, size, mean)
out, _ = model.detect(frame)
self.assertTrue(type(out) == tuple, msg='actual type {}'.format(str(type(out))))
self.assertTrue(np.array(out).shape == (2, 4, 2))
def test_face_detection(self):
proto = self.find_dnn_file('dnn/opencv_face_detector.prototxt')
model = self.find_dnn_file('dnn/opencv_face_detector.caffemodel', required=False)
if proto is None or model is None:
raise unittest.SkipTest("Missing DNN test files (dnn/opencv_face_detector.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
img = self.get_sample('gpu/lbpcascade/er.png')
blob = cv.dnn.blobFromImage(img, mean=(104, 177, 123), swapRB=False, crop=False)
ref = [[0, 1, 0.99520785, 0.80997437, 0.16379407, 0.87996572, 0.26685631],
[0, 1, 0.9934696, 0.2831718, 0.50738752, 0.345781, 0.5985168],
[0, 1, 0.99096733, 0.13629119, 0.24892329, 0.19756334, 0.3310290],
[0, 1, 0.98977017, 0.23901358, 0.09084064, 0.29902688, 0.1769477],
[0, 1, 0.97203469, 0.67965847, 0.06876482, 0.73999709, 0.1513494],
[0, 1, 0.95097077, 0.51901293, 0.45863652, 0.5777427, 0.5347801]]
print('\n')
for backend, target in self.dnnBackendsAndTargets:
printParams(backend, target)
net = cv.dnn.readNet(proto, model)
net.setPreferableBackend(backend)
net.setPreferableTarget(target)
net.setInput(blob)
out = net.forward().reshape(-1, 7)
scoresDiff = 4e-3 if target in [cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD] else 1e-5
iouDiff = 2e-2 if target in [cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD] else 1e-4
ref = np.array(ref, np.float32)
refClassIds, testClassIds = ref[:, 1], out[:, 1]
refScores, testScores = ref[:, 2], out[:, 2]
refBoxes, testBoxes = ref[:, 3:], out[:, 3:]
normAssertDetections(self, refClassIds, refScores, refBoxes, testClassIds,
testScores, testBoxes, 0.5, scoresDiff, iouDiff)
def test_async(self):
timeout = 10*1000*10**6 # in nanoseconds (10 sec)
proto = self.find_dnn_file('dnn/layers/layer_convolution.prototxt')
model = self.find_dnn_file('dnn/layers/layer_convolution.caffemodel')
if proto is None or model is None:
raise unittest.SkipTest("Missing DNN test files (dnn/layers/layer_convolution.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
print('\n')
for backend, target in self.dnnBackendsAndTargets:
if backend != cv.dnn.DNN_BACKEND_INFERENCE_ENGINE:
continue
printParams(backend, target)
netSync = cv.dnn.readNet(proto, model)
netSync.setPreferableBackend(backend)
netSync.setPreferableTarget(target)
netAsync = cv.dnn.readNet(proto, model)
netAsync.setPreferableBackend(backend)
netAsync.setPreferableTarget(target)
# Generate inputs
numInputs = 10
inputs = []
for _ in range(numInputs):
inputs.append(np.random.standard_normal([2, 6, 75, 113]).astype(np.float32))
# Run synchronously
refs = []
for i in range(numInputs):
netSync.setInput(inputs[i])
refs.append(netSync.forward())
# Run asynchronously. To make test more robust, process inputs in the reversed order.
outs = []
for i in reversed(range(numInputs)):
netAsync.setInput(inputs[i])
outs.insert(0, netAsync.forwardAsync())
for i in reversed(range(numInputs)):
ret, result = outs[i].get(timeoutNs=float(timeout))
self.assertTrue(ret)
normAssert(self, refs[i], result, 'Index: %d' % i, 1e-10)
def test_nms(self):
confs = (1, 1)
rects = ((0, 0, 0.4, 0.4), (0, 0, 0.2, 0.4)) # 0.5 overlap
self.assertTrue(all(cv.dnn.NMSBoxes(rects, confs, 0, 0.6).ravel() == (0, 1)))
def test_custom_layer(self):
class CropLayer(object):
def __init__(self, params, blobs):
self.xstart = 0
self.xend = 0
self.ystart = 0
self.yend = 0
            # Our layer receives two inputs. We need to crop the first input blob
            # to match the shape of the second one (keeping batch size and number of channels).
def getMemoryShapes(self, inputs):
inputShape, targetShape = inputs[0], inputs[1]
batchSize, numChannels = inputShape[0], inputShape[1]
height, width = targetShape[2], targetShape[3]
self.ystart = (inputShape[2] - targetShape[2]) // 2
self.xstart = (inputShape[3] - targetShape[3]) // 2
self.yend = self.ystart + height
self.xend = self.xstart + width
return [[batchSize, numChannels, height, width]]
def forward(self, inputs):
return [inputs[0][:,:,self.ystart:self.yend,self.xstart:self.xend]]
cv.dnn_registerLayer('CropCaffe', CropLayer)
proto = '''
name: "TestCrop"
input: "input"
input_shape
{
dim: 1
dim: 2
dim: 5
dim: 5
}
input: "roi"
input_shape
{
dim: 1
dim: 2
dim: 3
dim: 3
}
layer {
name: "Crop"
type: "CropCaffe"
bottom: "input"
bottom: "roi"
top: "Crop"
}'''
net = cv.dnn.readNetFromCaffe(bytearray(proto.encode()))
for backend, target in self.dnnBackendsAndTargets:
if backend != cv.dnn.DNN_BACKEND_OPENCV:
continue
printParams(backend, target)
net.setPreferableBackend(backend)
net.setPreferableTarget(target)
src_shape = [1, 2, 5, 5]
dst_shape = [1, 2, 3, 3]
inp = np.arange(0, np.prod(src_shape), dtype=np.float32).reshape(src_shape)
roi = np.empty(dst_shape, dtype=np.float32)
net.setInput(inp, "input")
net.setInput(roi, "roi")
out = net.forward()
ref = inp[:, :, 1:4, 1:4]
normAssert(self, out, ref)
cv.dnn_unregisterLayer('CropCaffe')
    # Check that the dnn module can work with a 3D tensor as network input.
def test_input_3d(self):
model = self.find_dnn_file('dnn/onnx/models/hidden_lstm.onnx')
input_file = self.find_dnn_file('dnn/onnx/data/input_hidden_lstm.npy')
output_file = self.find_dnn_file('dnn/onnx/data/output_hidden_lstm.npy')
if model is None:
raise unittest.SkipTest("Missing DNN test files (dnn/onnx/models/hidden_lstm.onnx). "
"Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
if input_file is None or output_file is None:
raise unittest.SkipTest("Missing DNN test files (dnn/onnx/data/{input/output}_hidden_lstm.npy). "
"Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
input = np.load(input_file)
        # We have to expand the shape of the input tensor because the Python bindings cut 3D tensors to 2D.
        # This should be fixed in the future, see: https://github.com/opencv/opencv/issues/19091
        # Please remove `expand_dims` once that is fixed.
input = np.expand_dims(input, axis=3)
gold_output = np.load(output_file)
for backend, target in self.dnnBackendsAndTargets:
printParams(backend, target)
net = cv.dnn.readNet(model)
net.setPreferableBackend(backend)
net.setPreferableTarget(target)
net.setInput(input)
real_output = net.forward()
normAssert(self, real_output, gold_output, "", getDefaultThreshold(target))
if __name__ == '__main__':
NewOpenCVTests.bootstrap()

View File

@@ -0,0 +1,365 @@
from __future__ import print_function
import sys
import argparse
import cv2 as cv
import tensorflow as tf
import numpy as np
import struct
if sys.version_info > (3,):
long = int
from tensorflow.python.tools import optimize_for_inference_lib
from tensorflow.tools.graph_transforms import TransformGraph
from tensorflow.core.framework.node_def_pb2 import NodeDef
from google.protobuf import text_format
parser = argparse.ArgumentParser(description="Use this script to create TensorFlow graph "
"with weights from OpenCV's face detection network. "
"Only backbone part of SSD model is converted this way. "
"Look for .pbtxt configuration file at "
"https://github.com/opencv/opencv_extra/tree/master/testdata/dnn/opencv_face_detector.pbtxt")
parser.add_argument('--model', help='Path to .caffemodel weights', required=True)
parser.add_argument('--proto', help='Path to .prototxt Caffe model definition', required=True)
parser.add_argument('--pb', help='Path to output .pb TensorFlow model', required=True)
parser.add_argument('--pbtxt', help='Path to output .pbxt TensorFlow graph', required=True)
parser.add_argument('--quantize', help='Quantize weights to uint8', action='store_true')
parser.add_argument('--fp16', help='Convert weights to half precision floats', action='store_true')
args = parser.parse_args()
assert(not args.quantize or not args.fp16)
dtype = tf.float16 if args.fp16 else tf.float32
################################################################################
cvNet = cv.dnn.readNetFromCaffe(args.proto, args.model)
def dnnLayer(name):
return cvNet.getLayer(long(cvNet.getLayerId(name)))
def scale(x, name):
with tf.variable_scope(name):
layer = dnnLayer(name)
w = tf.Variable(layer.blobs[0].flatten(), dtype=dtype, name='mul')
if len(layer.blobs) > 1:
b = tf.Variable(layer.blobs[1].flatten(), dtype=dtype, name='add')
return tf.nn.bias_add(tf.multiply(x, w), b)
else:
return tf.multiply(x, w, name)
def conv(x, name, stride=1, pad='SAME', dilation=1, activ=None):
with tf.variable_scope(name):
layer = dnnLayer(name)
w = tf.Variable(layer.blobs[0].transpose(2, 3, 1, 0), dtype=dtype, name='weights')
if dilation == 1:
conv = tf.nn.conv2d(x, filter=w, strides=(1, stride, stride, 1), padding=pad)
else:
assert(stride == 1)
conv = tf.nn.atrous_conv2d(x, w, rate=dilation, padding=pad)
if len(layer.blobs) > 1:
b = tf.Variable(layer.blobs[1].flatten(), dtype=dtype, name='bias')
conv = tf.nn.bias_add(conv, b)
return activ(conv) if activ else conv
def batch_norm(x, name):
with tf.variable_scope(name):
# Unfortunately, TensorFlow's batch normalization layer doesn't work with fp16 input.
# Here we do a cast to fp32 but remove it in the frozen graph.
if x.dtype != tf.float32:
x = tf.cast(x, tf.float32)
layer = dnnLayer(name)
assert(len(layer.blobs) >= 3)
mean = layer.blobs[0].flatten()
std = layer.blobs[1].flatten()
scale = layer.blobs[2].flatten()
eps = 1e-5
hasBias = len(layer.blobs) > 3
hasWeights = scale.shape != (1,)
if not hasWeights and not hasBias:
mean /= scale[0]
std /= scale[0]
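# Added note (hedged): in Caffe's BatchNorm layer the third blob usually holds a single
# moving-average scale factor, so the stored mean/std are divided by it above;
# when that blob has more than one element it is treated as the learned gamma instead (below).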
mean = tf.Variable(mean, dtype=tf.float32, name='mean')
std = tf.Variable(std, dtype=tf.float32, name='std')
gamma = tf.Variable(scale if hasWeights else np.ones(mean.shape), dtype=tf.float32, name='gamma')
beta = tf.Variable(layer.blobs[3].flatten() if hasBias else np.zeros(mean.shape), dtype=tf.float32, name='beta')
bn = tf.nn.fused_batch_norm(x, gamma, beta, mean, std, eps,
is_training=False)[0]
if bn.dtype != dtype:
bn = tf.cast(bn, dtype)
return bn
def l2norm(x, name):
with tf.variable_scope(name):
layer = dnnLayer(name)
w = tf.Variable(layer.blobs[0].flatten(), dtype=dtype, name='mul')
return tf.nn.l2_normalize(x, 3, epsilon=1e-10) * w
### Graph definition ###########################################################
inp = tf.placeholder(dtype, [1, 300, 300, 3], 'data')
data_bn = batch_norm(inp, 'data_bn')
data_scale = scale(data_bn, 'data_scale')
# Instead of tf.pad we use tf.space_to_batch_nd layers, which override the convolution's padding strategy with explicit numbers.
# data_scale = tf.pad(data_scale, [[0, 0], [3, 3], [3, 3], [0, 0]])
data_scale = tf.space_to_batch_nd(data_scale, [1, 1], [[3, 3], [3, 3]], name='Pad')
conv1_h = conv(data_scale, stride=2, pad='VALID', name='conv1_h')
conv1_bn_h = batch_norm(conv1_h, 'conv1_bn_h')
conv1_scale_h = scale(conv1_bn_h, 'conv1_scale_h')
conv1_relu = tf.nn.relu(conv1_scale_h)
conv1_pool = tf.layers.max_pooling2d(conv1_relu, pool_size=(3, 3), strides=(2, 2),
padding='SAME', name='conv1_pool')
layer_64_1_conv1_h = conv(conv1_pool, 'layer_64_1_conv1_h')
layer_64_1_bn2_h = batch_norm(layer_64_1_conv1_h, 'layer_64_1_bn2_h')
layer_64_1_scale2_h = scale(layer_64_1_bn2_h, 'layer_64_1_scale2_h')
layer_64_1_relu2 = tf.nn.relu(layer_64_1_scale2_h)
layer_64_1_conv2_h = conv(layer_64_1_relu2, 'layer_64_1_conv2_h')
layer_64_1_sum = layer_64_1_conv2_h + conv1_pool
layer_128_1_bn1_h = batch_norm(layer_64_1_sum, 'layer_128_1_bn1_h')
layer_128_1_scale1_h = scale(layer_128_1_bn1_h, 'layer_128_1_scale1_h')
layer_128_1_relu1 = tf.nn.relu(layer_128_1_scale1_h)
layer_128_1_conv1_h = conv(layer_128_1_relu1, stride=2, name='layer_128_1_conv1_h')
layer_128_1_bn2 = batch_norm(layer_128_1_conv1_h, 'layer_128_1_bn2')
layer_128_1_scale2 = scale(layer_128_1_bn2, 'layer_128_1_scale2')
layer_128_1_relu2 = tf.nn.relu(layer_128_1_scale2)
layer_128_1_conv2 = conv(layer_128_1_relu2, 'layer_128_1_conv2')
layer_128_1_conv_expand_h = conv(layer_128_1_relu1, stride=2, name='layer_128_1_conv_expand_h')
layer_128_1_sum = layer_128_1_conv2 + layer_128_1_conv_expand_h
layer_256_1_bn1 = batch_norm(layer_128_1_sum, 'layer_256_1_bn1')
layer_256_1_scale1 = scale(layer_256_1_bn1, 'layer_256_1_scale1')
layer_256_1_relu1 = tf.nn.relu(layer_256_1_scale1)
# layer_256_1_conv1 = tf.pad(layer_256_1_relu1, [[0, 0], [1, 1], [1, 1], [0, 0]])
layer_256_1_conv1 = tf.space_to_batch_nd(layer_256_1_relu1, [1, 1], [[1, 1], [1, 1]], name='Pad_1')
layer_256_1_conv1 = conv(layer_256_1_conv1, stride=2, pad='VALID', name='layer_256_1_conv1')
layer_256_1_bn2 = batch_norm(layer_256_1_conv1, 'layer_256_1_bn2')
layer_256_1_scale2 = scale(layer_256_1_bn2, 'layer_256_1_scale2')
layer_256_1_relu2 = tf.nn.relu(layer_256_1_scale2)
layer_256_1_conv2 = conv(layer_256_1_relu2, 'layer_256_1_conv2')
layer_256_1_conv_expand = conv(layer_256_1_relu1, stride=2, name='layer_256_1_conv_expand')
layer_256_1_sum = layer_256_1_conv2 + layer_256_1_conv_expand
layer_512_1_bn1 = batch_norm(layer_256_1_sum, 'layer_512_1_bn1')
layer_512_1_scale1 = scale(layer_512_1_bn1, 'layer_512_1_scale1')
layer_512_1_relu1 = tf.nn.relu(layer_512_1_scale1)
layer_512_1_conv1_h = conv(layer_512_1_relu1, 'layer_512_1_conv1_h')
layer_512_1_bn2_h = batch_norm(layer_512_1_conv1_h, 'layer_512_1_bn2_h')
layer_512_1_scale2_h = scale(layer_512_1_bn2_h, 'layer_512_1_scale2_h')
layer_512_1_relu2 = tf.nn.relu(layer_512_1_scale2_h)
layer_512_1_conv2_h = conv(layer_512_1_relu2, dilation=2, name='layer_512_1_conv2_h')
layer_512_1_conv_expand_h = conv(layer_512_1_relu1, 'layer_512_1_conv_expand_h')
layer_512_1_sum = layer_512_1_conv2_h + layer_512_1_conv_expand_h
last_bn_h = batch_norm(layer_512_1_sum, 'last_bn_h')
last_scale_h = scale(last_bn_h, 'last_scale_h')
fc7 = tf.nn.relu(last_scale_h, name='last_relu')
conv6_1_h = conv(fc7, 'conv6_1_h', activ=tf.nn.relu)
conv6_2_h = conv(conv6_1_h, stride=2, name='conv6_2_h', activ=tf.nn.relu)
conv7_1_h = conv(conv6_2_h, 'conv7_1_h', activ=tf.nn.relu)
# conv7_2_h = tf.pad(conv7_1_h, [[0, 0], [1, 1], [1, 1], [0, 0]])
conv7_2_h = tf.space_to_batch_nd(conv7_1_h, [1, 1], [[1, 1], [1, 1]], name='Pad_2')
conv7_2_h = conv(conv7_2_h, stride=2, pad='VALID', name='conv7_2_h', activ=tf.nn.relu)
conv8_1_h = conv(conv7_2_h, pad='SAME', name='conv8_1_h', activ=tf.nn.relu)
conv8_2_h = conv(conv8_1_h, pad='VALID', name='conv8_2_h', activ=tf.nn.relu)
conv9_1_h = conv(conv8_2_h, 'conv9_1_h', activ=tf.nn.relu)
conv9_2_h = conv(conv9_1_h, pad='VALID', name='conv9_2_h', activ=tf.nn.relu)
conv4_3_norm = l2norm(layer_256_1_relu1, 'conv4_3_norm')
### Locations and confidences ##################################################
locations = []
confidences = []
flattenLayersNames = [] # Collect the names of all reshape layers that should be replaced with flattens.
for top, suffix in zip([locations, confidences], ['_mbox_loc', '_mbox_conf']):
for bottom, name in zip([conv4_3_norm, fc7, conv6_2_h, conv7_2_h, conv8_2_h, conv9_2_h],
['conv4_3_norm', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2']):
name += suffix
flat = tf.layers.flatten(conv(bottom, name))
flattenLayersNames.append(flat.name[:flat.name.find(':')])
top.append(flat)
mbox_loc = tf.concat(locations, axis=-1, name='mbox_loc')
mbox_conf = tf.concat(confidences, axis=-1, name='mbox_conf')
total = int(np.prod(mbox_conf.shape[1:]))
mbox_conf_reshape = tf.reshape(mbox_conf, [-1, 2], name='mbox_conf_reshape')
mbox_conf_softmax = tf.nn.softmax(mbox_conf_reshape, name='mbox_conf_softmax')
mbox_conf_flatten = tf.reshape(mbox_conf_softmax, [-1, total], name='mbox_conf_flatten')
flattenLayersNames.append('mbox_conf_flatten')
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
### Check correctness ######################################################
out_nodes = ['mbox_loc', 'mbox_conf_flatten']
inp_nodes = [inp.name[:inp.name.find(':')]]
np.random.seed(2701)
inputData = np.random.standard_normal([1, 3, 300, 300]).astype(np.float32)
cvNet.setInput(inputData)
cvNet.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
outDNN = cvNet.forward(out_nodes)
outTF = sess.run([mbox_loc, mbox_conf_flatten], feed_dict={inp: inputData.transpose(0, 2, 3, 1)})
print('Max diff @ locations: %e' % np.max(np.abs(outDNN[0] - outTF[0])))
print('Max diff @ confidence: %e' % np.max(np.abs(outDNN[1] - outTF[1])))
# Save a graph
graph_def = sess.graph.as_graph_def()
# Freeze the graph: replace variables with constants.
graph_def = tf.graph_util.convert_variables_to_constants(sess, graph_def, out_nodes)
# Optimize graph. Removes training-only ops, unused nodes.
graph_def = optimize_for_inference_lib.optimize_for_inference(graph_def, inp_nodes, out_nodes, dtype.as_datatype_enum)
# Fuse constant operations.
transforms = ["fold_constants(ignore_errors=True)"]
if args.quantize:
transforms += ["quantize_weights(minimum_size=0)"]
transforms += ["sort_by_execution_order"]
graph_def = TransformGraph(graph_def, inp_nodes, out_nodes, transforms)
# By default, float16 weights are stored in the tensor's repeated field called
# `half_val`. It has type int32, with leading zeros for the unused bytes.
# That field is varint-encoded: each byte carries 7 bits of the value and the
# high bit marks whether another byte follows. As a result, a float16 value may
# occupy 1, 2 or 3 bytes depending on its value. To improve compression,
# we move all `half_val` values into `tensor_content`, using exactly 2 bytes per value.
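# For illustration (an added, assumed example): on a little-endian machine
# struct.pack('H' * 2, 0x3C00, 0x4000) yields b'\x00<\x00@', i.e. the float16
# values 1.0 and 2.0 stored in exactly 2 bytes each.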
for node in graph_def.node:
if 'value' in node.attr:
halfs = node.attr["value"].tensor.half_val
if not node.attr["value"].tensor.tensor_content and halfs:
node.attr["value"].tensor.tensor_content = struct.pack('H' * len(halfs), *halfs)
node.attr["value"].tensor.ClearField('half_val')
# Serialize
with tf.gfile.FastGFile(args.pb, 'wb') as f:
f.write(graph_def.SerializeToString())
################################################################################
# Write a text graph representation
################################################################################
def tensorMsg(values):
msg = 'tensor { dtype: DT_FLOAT tensor_shape { dim { size: %d } }' % len(values)
for value in values:
msg += 'float_val: %f ' % value
return msg + '}'
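# For example (added, illustrative): tensorMsg([2, 3]) returns
# 'tensor { dtype: DT_FLOAT tensor_shape { dim { size: 2 } }float_val: 2.000000 float_val: 3.000000 }'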
# Remove Const nodes and unused attributes.
for i in reversed(range(len(graph_def.node))):
if graph_def.node[i].op in ['Const', 'Dequantize']:
del graph_def.node[i]
for attr in ['T', 'data_format', 'Tshape', 'N', 'Tidx', 'Tdim',
'use_cudnn_on_gpu', 'Index', 'Tperm', 'is_training',
'Tpaddings', 'Tblock_shape', 'Tcrops']:
if attr in graph_def.node[i].attr:
del graph_def.node[i].attr[attr]
# Append prior box generators
min_sizes = [30, 60, 111, 162, 213, 264]
max_sizes = [60, 111, 162, 213, 264, 315]
steps = [8, 16, 32, 64, 100, 300]
aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
layers = [conv4_3_norm, fc7, conv6_2_h, conv7_2_h, conv8_2_h, conv9_2_h]
for i in range(6):
priorBox = NodeDef()
priorBox.name = 'PriorBox_%d' % i
priorBox.op = 'PriorBox'
priorBox.input.append(layers[i].name[:layers[i].name.find(':')])
priorBox.input.append(inp_nodes[0]) # data
text_format.Merge('i: %d' % min_sizes[i], priorBox.attr["min_size"])
text_format.Merge('i: %d' % max_sizes[i], priorBox.attr["max_size"])
text_format.Merge('b: true', priorBox.attr["flip"])
text_format.Merge('b: false', priorBox.attr["clip"])
text_format.Merge(tensorMsg(aspect_ratios[i]), priorBox.attr["aspect_ratio"])
text_format.Merge(tensorMsg([0.1, 0.1, 0.2, 0.2]), priorBox.attr["variance"])
text_format.Merge('f: %f' % steps[i], priorBox.attr["step"])
text_format.Merge('f: 0.5', priorBox.attr["offset"])
graph_def.node.extend([priorBox])
# Concatenate prior boxes
concat = NodeDef()
concat.name = 'mbox_priorbox'
concat.op = 'ConcatV2'
for i in range(6):
concat.input.append('PriorBox_%d' % i)
concat.input.append('mbox_loc/axis')
graph_def.node.extend([concat])
# DetectionOutput layer
detectionOut = NodeDef()
detectionOut.name = 'detection_out'
detectionOut.op = 'DetectionOutput'
detectionOut.input.append('mbox_loc')
detectionOut.input.append('mbox_conf_flatten')
detectionOut.input.append('mbox_priorbox')
text_format.Merge('i: 2', detectionOut.attr['num_classes'])
text_format.Merge('b: true', detectionOut.attr['share_location'])
text_format.Merge('i: 0', detectionOut.attr['background_label_id'])
text_format.Merge('f: 0.45', detectionOut.attr['nms_threshold'])
text_format.Merge('i: 400', detectionOut.attr['top_k'])
text_format.Merge('s: "CENTER_SIZE"', detectionOut.attr['code_type'])
text_format.Merge('i: 200', detectionOut.attr['keep_top_k'])
text_format.Merge('f: 0.01', detectionOut.attr['confidence_threshold'])
graph_def.node.extend([detectionOut])
# Replace the L2Normalization subgraph with a single node.
for i in reversed(range(len(graph_def.node))):
if graph_def.node[i].name in ['conv4_3_norm/l2_normalize/Square',
'conv4_3_norm/l2_normalize/Sum',
'conv4_3_norm/l2_normalize/Maximum',
'conv4_3_norm/l2_normalize/Rsqrt']:
del graph_def.node[i]
for node in graph_def.node:
if node.name == 'conv4_3_norm/l2_normalize':
node.op = 'L2Normalize'
node.input.pop()
node.input.pop()
node.input.append(layer_256_1_relu1.name)
node.input.append('conv4_3_norm/l2_normalize/Sum/reduction_indices')
break
softmaxShape = NodeDef()
softmaxShape.name = 'reshape_before_softmax'
softmaxShape.op = 'Const'
text_format.Merge(
'tensor {'
' dtype: DT_INT32'
' tensor_shape { dim { size: 3 } }'
' int_val: 0'
' int_val: -1'
' int_val: 2'
'}', softmaxShape.attr["value"])
graph_def.node.extend([softmaxShape])
for node in graph_def.node:
if node.name == 'mbox_conf_reshape':
node.input[1] = softmaxShape.name
elif node.name == 'mbox_conf_softmax':
text_format.Merge('i: 2', node.attr['axis'])
elif node.name in flattenLayersNames:
node.op = 'Flatten'
inpName = node.input[0]
node.input.pop()
node.input.pop()
node.input.append(inpName)
tf.train.write_graph(graph_def, "", args.pbtxt, as_text=True)
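As a quick sanity check of the exported files, a minimal loading sketch follows. It assumes OpenCV is built with the dnn module; the file names and preprocessing values are placeholders, not values taken from this change.
import cv2 as cv
import numpy as np

# Load the frozen graph together with the text graph written by the script above
# (placeholder names; use whatever --pb / --pbtxt were set to).
net = cv.dnn.readNetFromTensorflow('opencv_face_detector_uint8.pb', 'opencv_face_detector.pbtxt')
frame = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a real BGR image
blob = cv.dnn.blobFromImage(frame, 1.0, (300, 300), (104.0, 177.0, 123.0))
net.setInput(blob)
detections = net.forward()  # DetectionOutput produces a [1, 1, N, 7] array of detections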

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,968 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: graph.proto
#ifndef PROTOBUF_graph_2eproto__INCLUDED
#define PROTOBUF_graph_2eproto__INCLUDED
#include <string>
#include <google/protobuf/stubs/common.h>
#if GOOGLE_PROTOBUF_VERSION < 3005000
#error This file was generated by a newer version of protoc which is
#error incompatible with your Protocol Buffer headers. Please update
#error your headers.
#endif
#if 3005001 < GOOGLE_PROTOBUF_MIN_PROTOC_VERSION
#error This file was generated by an older version of protoc which is
#error incompatible with your Protocol Buffer headers. Please
#error regenerate this file with a newer version of protoc.
#endif
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/arena.h>
#include <google/protobuf/arenastring.h>
#include <google/protobuf/generated_message_table_driven.h>
#include <google/protobuf/generated_message_util.h>
#include <google/protobuf/metadata.h>
#include <google/protobuf/message.h>
#include <google/protobuf/repeated_field.h> // IWYU pragma: export
#include <google/protobuf/extension_set.h> // IWYU pragma: export
#include <google/protobuf/map.h> // IWYU pragma: export
#include <google/protobuf/map_entry.h>
#include <google/protobuf/map_field_inl.h>
#include <google/protobuf/unknown_field_set.h>
#include "attr_value.pb.h"
#include "function.pb.h"
#include "versions.pb.h"
// @@protoc_insertion_point(includes)
namespace protobuf_graph_2eproto {
// Internal implementation detail -- do not use these members.
struct TableStruct {
static const ::google::protobuf::internal::ParseTableField entries[];
static const ::google::protobuf::internal::AuxillaryParseTableField aux[];
static const ::google::protobuf::internal::ParseTable schema[3];
static const ::google::protobuf::internal::FieldMetadata field_metadata[];
static const ::google::protobuf::internal::SerializationTable serialization_table[];
static const ::google::protobuf::uint32 offsets[];
};
void AddDescriptors();
void InitDefaultsGraphDefImpl();
void InitDefaultsGraphDef();
void InitDefaultsNodeDef_AttrEntry_DoNotUseImpl();
void InitDefaultsNodeDef_AttrEntry_DoNotUse();
void InitDefaultsNodeDefImpl();
void InitDefaultsNodeDef();
inline void InitDefaults() {
InitDefaultsGraphDef();
InitDefaultsNodeDef_AttrEntry_DoNotUse();
InitDefaultsNodeDef();
}
} // namespace protobuf_graph_2eproto
namespace opencv_tensorflow {
class GraphDef;
class GraphDefDefaultTypeInternal;
extern GraphDefDefaultTypeInternal _GraphDef_default_instance_;
class NodeDef;
class NodeDefDefaultTypeInternal;
extern NodeDefDefaultTypeInternal _NodeDef_default_instance_;
class NodeDef_AttrEntry_DoNotUse;
class NodeDef_AttrEntry_DoNotUseDefaultTypeInternal;
extern NodeDef_AttrEntry_DoNotUseDefaultTypeInternal _NodeDef_AttrEntry_DoNotUse_default_instance_;
} // namespace opencv_tensorflow
namespace opencv_tensorflow {
// ===================================================================
class GraphDef : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:opencv_tensorflow.GraphDef) */ {
public:
GraphDef();
virtual ~GraphDef();
GraphDef(const GraphDef& from);
inline GraphDef& operator=(const GraphDef& from) {
CopyFrom(from);
return *this;
}
#if LANG_CXX11
GraphDef(GraphDef&& from) noexcept
: GraphDef() {
*this = ::std::move(from);
}
inline GraphDef& operator=(GraphDef&& from) noexcept {
if (GetArenaNoVirtual() == from.GetArenaNoVirtual()) {
if (this != &from) InternalSwap(&from);
} else {
CopyFrom(from);
}
return *this;
}
#endif
inline ::google::protobuf::Arena* GetArena() const PROTOBUF_FINAL {
return GetArenaNoVirtual();
}
inline void* GetMaybeArenaPointer() const PROTOBUF_FINAL {
return MaybeArenaPtr();
}
static const ::google::protobuf::Descriptor* descriptor();
static const GraphDef& default_instance();
static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY
static inline const GraphDef* internal_default_instance() {
return reinterpret_cast<const GraphDef*>(
&_GraphDef_default_instance_);
}
static PROTOBUF_CONSTEXPR int const kIndexInFileMessages =
0;
void UnsafeArenaSwap(GraphDef* other);
void Swap(GraphDef* other);
friend void swap(GraphDef& a, GraphDef& b) {
a.Swap(&b);
}
// implements Message ----------------------------------------------
inline GraphDef* New() const PROTOBUF_FINAL { return New(NULL); }
GraphDef* New(::google::protobuf::Arena* arena) const PROTOBUF_FINAL;
void CopyFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void MergeFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void CopyFrom(const GraphDef& from);
void MergeFrom(const GraphDef& from);
void Clear() PROTOBUF_FINAL;
bool IsInitialized() const PROTOBUF_FINAL;
size_t ByteSizeLong() const PROTOBUF_FINAL;
bool MergePartialFromCodedStream(
::google::protobuf::io::CodedInputStream* input) PROTOBUF_FINAL;
void SerializeWithCachedSizes(
::google::protobuf::io::CodedOutputStream* output) const PROTOBUF_FINAL;
::google::protobuf::uint8* InternalSerializeWithCachedSizesToArray(
bool deterministic, ::google::protobuf::uint8* target) const PROTOBUF_FINAL;
int GetCachedSize() const PROTOBUF_FINAL { return _cached_size_; }
private:
void SharedCtor();
void SharedDtor();
void SetCachedSize(int size) const PROTOBUF_FINAL;
void InternalSwap(GraphDef* other);
protected:
explicit GraphDef(::google::protobuf::Arena* arena);
private:
static void ArenaDtor(void* object);
inline void RegisterArenaDtor(::google::protobuf::Arena* arena);
private:
inline ::google::protobuf::Arena* GetArenaNoVirtual() const {
return _internal_metadata_.arena();
}
inline void* MaybeArenaPtr() const {
return _internal_metadata_.raw_arena_ptr();
}
public:
::google::protobuf::Metadata GetMetadata() const PROTOBUF_FINAL;
// nested types ----------------------------------------------------
// accessors -------------------------------------------------------
// repeated .opencv_tensorflow.NodeDef node = 1;
int node_size() const;
void clear_node();
static const int kNodeFieldNumber = 1;
const ::opencv_tensorflow::NodeDef& node(int index) const;
::opencv_tensorflow::NodeDef* mutable_node(int index);
::opencv_tensorflow::NodeDef* add_node();
::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::NodeDef >*
mutable_node();
const ::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::NodeDef >&
node() const;
// .opencv_tensorflow.FunctionDefLibrary library = 2;
bool has_library() const;
void clear_library();
static const int kLibraryFieldNumber = 2;
private:
void _slow_mutable_library();
public:
const ::opencv_tensorflow::FunctionDefLibrary& library() const;
::opencv_tensorflow::FunctionDefLibrary* release_library();
::opencv_tensorflow::FunctionDefLibrary* mutable_library();
void set_allocated_library(::opencv_tensorflow::FunctionDefLibrary* library);
void unsafe_arena_set_allocated_library(
::opencv_tensorflow::FunctionDefLibrary* library);
::opencv_tensorflow::FunctionDefLibrary* unsafe_arena_release_library();
// .opencv_tensorflow.VersionDef versions = 4;
bool has_versions() const;
void clear_versions();
static const int kVersionsFieldNumber = 4;
private:
void _slow_mutable_versions();
public:
const ::opencv_tensorflow::VersionDef& versions() const;
::opencv_tensorflow::VersionDef* release_versions();
::opencv_tensorflow::VersionDef* mutable_versions();
void set_allocated_versions(::opencv_tensorflow::VersionDef* versions);
void unsafe_arena_set_allocated_versions(
::opencv_tensorflow::VersionDef* versions);
::opencv_tensorflow::VersionDef* unsafe_arena_release_versions();
// int32 version = 3 [deprecated = true];
GOOGLE_PROTOBUF_DEPRECATED_ATTR void clear_version();
GOOGLE_PROTOBUF_DEPRECATED_ATTR static const int kVersionFieldNumber = 3;
GOOGLE_PROTOBUF_DEPRECATED_ATTR ::google::protobuf::int32 version() const;
GOOGLE_PROTOBUF_DEPRECATED_ATTR void set_version(::google::protobuf::int32 value);
// @@protoc_insertion_point(class_scope:opencv_tensorflow.GraphDef)
private:
::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_;
template <typename T> friend class ::google::protobuf::Arena::InternalHelper;
typedef void InternalArenaConstructable_;
typedef void DestructorSkippable_;
::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::NodeDef > node_;
::opencv_tensorflow::FunctionDefLibrary* library_;
::opencv_tensorflow::VersionDef* versions_;
::google::protobuf::int32 version_;
mutable int _cached_size_;
friend struct ::protobuf_graph_2eproto::TableStruct;
friend void ::protobuf_graph_2eproto::InitDefaultsGraphDefImpl();
};
// -------------------------------------------------------------------
class NodeDef_AttrEntry_DoNotUse : public ::google::protobuf::internal::MapEntry<NodeDef_AttrEntry_DoNotUse,
::std::string, ::opencv_tensorflow::AttrValue,
::google::protobuf::internal::WireFormatLite::TYPE_STRING,
::google::protobuf::internal::WireFormatLite::TYPE_MESSAGE,
0 > {
public:
typedef ::google::protobuf::internal::MapEntry<NodeDef_AttrEntry_DoNotUse,
::std::string, ::opencv_tensorflow::AttrValue,
::google::protobuf::internal::WireFormatLite::TYPE_STRING,
::google::protobuf::internal::WireFormatLite::TYPE_MESSAGE,
0 > SuperType;
NodeDef_AttrEntry_DoNotUse();
NodeDef_AttrEntry_DoNotUse(::google::protobuf::Arena* arena);
void MergeFrom(const NodeDef_AttrEntry_DoNotUse& other);
static const NodeDef_AttrEntry_DoNotUse* internal_default_instance() { return reinterpret_cast<const NodeDef_AttrEntry_DoNotUse*>(&_NodeDef_AttrEntry_DoNotUse_default_instance_); }
void MergeFrom(const ::google::protobuf::Message& other) PROTOBUF_FINAL;
::google::protobuf::Metadata GetMetadata() const;
};
// -------------------------------------------------------------------
class NodeDef : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:opencv_tensorflow.NodeDef) */ {
public:
NodeDef();
virtual ~NodeDef();
NodeDef(const NodeDef& from);
inline NodeDef& operator=(const NodeDef& from) {
CopyFrom(from);
return *this;
}
#if LANG_CXX11
NodeDef(NodeDef&& from) noexcept
: NodeDef() {
*this = ::std::move(from);
}
inline NodeDef& operator=(NodeDef&& from) noexcept {
if (GetArenaNoVirtual() == from.GetArenaNoVirtual()) {
if (this != &from) InternalSwap(&from);
} else {
CopyFrom(from);
}
return *this;
}
#endif
inline ::google::protobuf::Arena* GetArena() const PROTOBUF_FINAL {
return GetArenaNoVirtual();
}
inline void* GetMaybeArenaPointer() const PROTOBUF_FINAL {
return MaybeArenaPtr();
}
static const ::google::protobuf::Descriptor* descriptor();
static const NodeDef& default_instance();
static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY
static inline const NodeDef* internal_default_instance() {
return reinterpret_cast<const NodeDef*>(
&_NodeDef_default_instance_);
}
static PROTOBUF_CONSTEXPR int const kIndexInFileMessages =
2;
void UnsafeArenaSwap(NodeDef* other);
void Swap(NodeDef* other);
friend void swap(NodeDef& a, NodeDef& b) {
a.Swap(&b);
}
// implements Message ----------------------------------------------
inline NodeDef* New() const PROTOBUF_FINAL { return New(NULL); }
NodeDef* New(::google::protobuf::Arena* arena) const PROTOBUF_FINAL;
void CopyFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void MergeFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void CopyFrom(const NodeDef& from);
void MergeFrom(const NodeDef& from);
void Clear() PROTOBUF_FINAL;
bool IsInitialized() const PROTOBUF_FINAL;
size_t ByteSizeLong() const PROTOBUF_FINAL;
bool MergePartialFromCodedStream(
::google::protobuf::io::CodedInputStream* input) PROTOBUF_FINAL;
void SerializeWithCachedSizes(
::google::protobuf::io::CodedOutputStream* output) const PROTOBUF_FINAL;
::google::protobuf::uint8* InternalSerializeWithCachedSizesToArray(
bool deterministic, ::google::protobuf::uint8* target) const PROTOBUF_FINAL;
int GetCachedSize() const PROTOBUF_FINAL { return _cached_size_; }
private:
void SharedCtor();
void SharedDtor();
void SetCachedSize(int size) const PROTOBUF_FINAL;
void InternalSwap(NodeDef* other);
protected:
explicit NodeDef(::google::protobuf::Arena* arena);
private:
static void ArenaDtor(void* object);
inline void RegisterArenaDtor(::google::protobuf::Arena* arena);
private:
inline ::google::protobuf::Arena* GetArenaNoVirtual() const {
return _internal_metadata_.arena();
}
inline void* MaybeArenaPtr() const {
return _internal_metadata_.raw_arena_ptr();
}
public:
::google::protobuf::Metadata GetMetadata() const PROTOBUF_FINAL;
// nested types ----------------------------------------------------
// accessors -------------------------------------------------------
// repeated string input = 3;
int input_size() const;
void clear_input();
static const int kInputFieldNumber = 3;
const ::std::string& input(int index) const;
::std::string* mutable_input(int index);
void set_input(int index, const ::std::string& value);
#if LANG_CXX11
void set_input(int index, ::std::string&& value);
#endif
void set_input(int index, const char* value);
void set_input(int index, const char* value, size_t size);
::std::string* add_input();
void add_input(const ::std::string& value);
#if LANG_CXX11
void add_input(::std::string&& value);
#endif
void add_input(const char* value);
void add_input(const char* value, size_t size);
const ::google::protobuf::RepeatedPtrField< ::std::string>& input() const;
::google::protobuf::RepeatedPtrField< ::std::string>* mutable_input();
// map<string, .opencv_tensorflow.AttrValue> attr = 5;
int attr_size() const;
void clear_attr();
static const int kAttrFieldNumber = 5;
const ::google::protobuf::Map< ::std::string, ::opencv_tensorflow::AttrValue >&
attr() const;
::google::protobuf::Map< ::std::string, ::opencv_tensorflow::AttrValue >*
mutable_attr();
// string name = 1;
void clear_name();
static const int kNameFieldNumber = 1;
const ::std::string& name() const;
void set_name(const ::std::string& value);
#if LANG_CXX11
void set_name(::std::string&& value);
#endif
void set_name(const char* value);
void set_name(const char* value, size_t size);
::std::string* mutable_name();
::std::string* release_name();
void set_allocated_name(::std::string* name);
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
::std::string* unsafe_arena_release_name();
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
void unsafe_arena_set_allocated_name(
::std::string* name);
// string op = 2;
void clear_op();
static const int kOpFieldNumber = 2;
const ::std::string& op() const;
void set_op(const ::std::string& value);
#if LANG_CXX11
void set_op(::std::string&& value);
#endif
void set_op(const char* value);
void set_op(const char* value, size_t size);
::std::string* mutable_op();
::std::string* release_op();
void set_allocated_op(::std::string* op);
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
::std::string* unsafe_arena_release_op();
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
void unsafe_arena_set_allocated_op(
::std::string* op);
// string device = 4;
void clear_device();
static const int kDeviceFieldNumber = 4;
const ::std::string& device() const;
void set_device(const ::std::string& value);
#if LANG_CXX11
void set_device(::std::string&& value);
#endif
void set_device(const char* value);
void set_device(const char* value, size_t size);
::std::string* mutable_device();
::std::string* release_device();
void set_allocated_device(::std::string* device);
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
::std::string* unsafe_arena_release_device();
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
void unsafe_arena_set_allocated_device(
::std::string* device);
// @@protoc_insertion_point(class_scope:opencv_tensorflow.NodeDef)
private:
::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_;
template <typename T> friend class ::google::protobuf::Arena::InternalHelper;
typedef void InternalArenaConstructable_;
typedef void DestructorSkippable_;
::google::protobuf::RepeatedPtrField< ::std::string> input_;
::google::protobuf::internal::MapField<
NodeDef_AttrEntry_DoNotUse,
::std::string, ::opencv_tensorflow::AttrValue,
::google::protobuf::internal::WireFormatLite::TYPE_STRING,
::google::protobuf::internal::WireFormatLite::TYPE_MESSAGE,
0 > attr_;
::google::protobuf::internal::ArenaStringPtr name_;
::google::protobuf::internal::ArenaStringPtr op_;
::google::protobuf::internal::ArenaStringPtr device_;
mutable int _cached_size_;
friend struct ::protobuf_graph_2eproto::TableStruct;
friend void ::protobuf_graph_2eproto::InitDefaultsNodeDefImpl();
};
// ===================================================================
// ===================================================================
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif // __GNUC__
// GraphDef
// repeated .opencv_tensorflow.NodeDef node = 1;
inline int GraphDef::node_size() const {
return node_.size();
}
inline void GraphDef::clear_node() {
node_.Clear();
}
inline const ::opencv_tensorflow::NodeDef& GraphDef::node(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.GraphDef.node)
return node_.Get(index);
}
inline ::opencv_tensorflow::NodeDef* GraphDef::mutable_node(int index) {
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.GraphDef.node)
return node_.Mutable(index);
}
inline ::opencv_tensorflow::NodeDef* GraphDef::add_node() {
// @@protoc_insertion_point(field_add:opencv_tensorflow.GraphDef.node)
return node_.Add();
}
inline ::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::NodeDef >*
GraphDef::mutable_node() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.GraphDef.node)
return &node_;
}
inline const ::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::NodeDef >&
GraphDef::node() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.GraphDef.node)
return node_;
}
// .opencv_tensorflow.VersionDef versions = 4;
inline bool GraphDef::has_versions() const {
return this != internal_default_instance() && versions_ != NULL;
}
inline const ::opencv_tensorflow::VersionDef& GraphDef::versions() const {
const ::opencv_tensorflow::VersionDef* p = versions_;
// @@protoc_insertion_point(field_get:opencv_tensorflow.GraphDef.versions)
return p != NULL ? *p : *reinterpret_cast<const ::opencv_tensorflow::VersionDef*>(
&::opencv_tensorflow::_VersionDef_default_instance_);
}
inline ::opencv_tensorflow::VersionDef* GraphDef::release_versions() {
// @@protoc_insertion_point(field_release:opencv_tensorflow.GraphDef.versions)
::opencv_tensorflow::VersionDef* temp = versions_;
if (GetArenaNoVirtual() != NULL) {
temp = ::google::protobuf::internal::DuplicateIfNonNull(temp, NULL);
}
versions_ = NULL;
return temp;
}
inline ::opencv_tensorflow::VersionDef* GraphDef::unsafe_arena_release_versions() {
// @@protoc_insertion_point(field_unsafe_arena_release:opencv_tensorflow.GraphDef.versions)
::opencv_tensorflow::VersionDef* temp = versions_;
versions_ = NULL;
return temp;
}
inline ::opencv_tensorflow::VersionDef* GraphDef::mutable_versions() {
if (versions_ == NULL) {
_slow_mutable_versions();
}
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.GraphDef.versions)
return versions_;
}
inline void GraphDef::set_allocated_versions(::opencv_tensorflow::VersionDef* versions) {
::google::protobuf::Arena* message_arena = GetArenaNoVirtual();
if (message_arena == NULL) {
delete reinterpret_cast< ::google::protobuf::MessageLite*>(versions_);
}
if (versions) {
::google::protobuf::Arena* submessage_arena =
reinterpret_cast< ::google::protobuf::MessageLite*>(versions)->GetArena();
if (message_arena != submessage_arena) {
versions = ::google::protobuf::internal::GetOwnedMessage(
message_arena, versions, submessage_arena);
}
} else {
}
versions_ = versions;
// @@protoc_insertion_point(field_set_allocated:opencv_tensorflow.GraphDef.versions)
}
// int32 version = 3 [deprecated = true];
inline void GraphDef::clear_version() {
version_ = 0;
}
inline ::google::protobuf::int32 GraphDef::version() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.GraphDef.version)
return version_;
}
inline void GraphDef::set_version(::google::protobuf::int32 value) {
version_ = value;
// @@protoc_insertion_point(field_set:opencv_tensorflow.GraphDef.version)
}
// .opencv_tensorflow.FunctionDefLibrary library = 2;
inline bool GraphDef::has_library() const {
return this != internal_default_instance() && library_ != NULL;
}
inline const ::opencv_tensorflow::FunctionDefLibrary& GraphDef::library() const {
const ::opencv_tensorflow::FunctionDefLibrary* p = library_;
// @@protoc_insertion_point(field_get:opencv_tensorflow.GraphDef.library)
return p != NULL ? *p : *reinterpret_cast<const ::opencv_tensorflow::FunctionDefLibrary*>(
&::opencv_tensorflow::_FunctionDefLibrary_default_instance_);
}
inline ::opencv_tensorflow::FunctionDefLibrary* GraphDef::release_library() {
// @@protoc_insertion_point(field_release:opencv_tensorflow.GraphDef.library)
::opencv_tensorflow::FunctionDefLibrary* temp = library_;
if (GetArenaNoVirtual() != NULL) {
temp = ::google::protobuf::internal::DuplicateIfNonNull(temp, NULL);
}
library_ = NULL;
return temp;
}
inline ::opencv_tensorflow::FunctionDefLibrary* GraphDef::unsafe_arena_release_library() {
// @@protoc_insertion_point(field_unsafe_arena_release:opencv_tensorflow.GraphDef.library)
::opencv_tensorflow::FunctionDefLibrary* temp = library_;
library_ = NULL;
return temp;
}
inline ::opencv_tensorflow::FunctionDefLibrary* GraphDef::mutable_library() {
if (library_ == NULL) {
_slow_mutable_library();
}
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.GraphDef.library)
return library_;
}
inline void GraphDef::set_allocated_library(::opencv_tensorflow::FunctionDefLibrary* library) {
::google::protobuf::Arena* message_arena = GetArenaNoVirtual();
if (message_arena == NULL) {
delete reinterpret_cast< ::google::protobuf::MessageLite*>(library_);
}
if (library) {
::google::protobuf::Arena* submessage_arena =
reinterpret_cast< ::google::protobuf::MessageLite*>(library)->GetArena();
if (message_arena != submessage_arena) {
library = ::google::protobuf::internal::GetOwnedMessage(
message_arena, library, submessage_arena);
}
} else {
}
library_ = library;
// @@protoc_insertion_point(field_set_allocated:opencv_tensorflow.GraphDef.library)
}
// -------------------------------------------------------------------
// -------------------------------------------------------------------
// NodeDef
// string name = 1;
inline void NodeDef::clear_name() {
name_.ClearToEmpty(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline const ::std::string& NodeDef::name() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.NodeDef.name)
return name_.Get();
}
inline void NodeDef::set_name(const ::std::string& value) {
name_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value, GetArenaNoVirtual());
// @@protoc_insertion_point(field_set:opencv_tensorflow.NodeDef.name)
}
#if LANG_CXX11
inline void NodeDef::set_name(::std::string&& value) {
name_.Set(
&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_rvalue:opencv_tensorflow.NodeDef.name)
}
#endif
inline void NodeDef::set_name(const char* value) {
GOOGLE_DCHECK(value != NULL);
name_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_char:opencv_tensorflow.NodeDef.name)
}
inline void NodeDef::set_name(const char* value,
size_t size) {
name_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(
reinterpret_cast<const char*>(value), size), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_pointer:opencv_tensorflow.NodeDef.name)
}
inline ::std::string* NodeDef::mutable_name() {
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.NodeDef.name)
return name_.Mutable(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline ::std::string* NodeDef::release_name() {
// @@protoc_insertion_point(field_release:opencv_tensorflow.NodeDef.name)
return name_.Release(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline void NodeDef::set_allocated_name(::std::string* name) {
if (name != NULL) {
} else {
}
name_.SetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), name,
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_allocated:opencv_tensorflow.NodeDef.name)
}
inline ::std::string* NodeDef::unsafe_arena_release_name() {
// @@protoc_insertion_point(field_unsafe_arena_release:opencv_tensorflow.NodeDef.name)
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
return name_.UnsafeArenaRelease(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
GetArenaNoVirtual());
}
inline void NodeDef::unsafe_arena_set_allocated_name(
::std::string* name) {
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
if (name != NULL) {
} else {
}
name_.UnsafeArenaSetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
name, GetArenaNoVirtual());
// @@protoc_insertion_point(field_unsafe_arena_set_allocated:opencv_tensorflow.NodeDef.name)
}
// string op = 2;
inline void NodeDef::clear_op() {
op_.ClearToEmpty(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline const ::std::string& NodeDef::op() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.NodeDef.op)
return op_.Get();
}
inline void NodeDef::set_op(const ::std::string& value) {
op_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value, GetArenaNoVirtual());
// @@protoc_insertion_point(field_set:opencv_tensorflow.NodeDef.op)
}
#if LANG_CXX11
inline void NodeDef::set_op(::std::string&& value) {
op_.Set(
&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_rvalue:opencv_tensorflow.NodeDef.op)
}
#endif
inline void NodeDef::set_op(const char* value) {
GOOGLE_DCHECK(value != NULL);
op_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_char:opencv_tensorflow.NodeDef.op)
}
inline void NodeDef::set_op(const char* value,
size_t size) {
op_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(
reinterpret_cast<const char*>(value), size), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_pointer:opencv_tensorflow.NodeDef.op)
}
inline ::std::string* NodeDef::mutable_op() {
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.NodeDef.op)
return op_.Mutable(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline ::std::string* NodeDef::release_op() {
// @@protoc_insertion_point(field_release:opencv_tensorflow.NodeDef.op)
return op_.Release(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline void NodeDef::set_allocated_op(::std::string* op) {
if (op != NULL) {
} else {
}
op_.SetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), op,
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_allocated:opencv_tensorflow.NodeDef.op)
}
inline ::std::string* NodeDef::unsafe_arena_release_op() {
// @@protoc_insertion_point(field_unsafe_arena_release:opencv_tensorflow.NodeDef.op)
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
return op_.UnsafeArenaRelease(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
GetArenaNoVirtual());
}
inline void NodeDef::unsafe_arena_set_allocated_op(
::std::string* op) {
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
if (op != NULL) {
} else {
}
op_.UnsafeArenaSetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
op, GetArenaNoVirtual());
// @@protoc_insertion_point(field_unsafe_arena_set_allocated:opencv_tensorflow.NodeDef.op)
}
// repeated string input = 3;
inline int NodeDef::input_size() const {
return input_.size();
}
inline void NodeDef::clear_input() {
input_.Clear();
}
inline const ::std::string& NodeDef::input(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.NodeDef.input)
return input_.Get(index);
}
inline ::std::string* NodeDef::mutable_input(int index) {
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.NodeDef.input)
return input_.Mutable(index);
}
inline void NodeDef::set_input(int index, const ::std::string& value) {
// @@protoc_insertion_point(field_set:opencv_tensorflow.NodeDef.input)
input_.Mutable(index)->assign(value);
}
#if LANG_CXX11
inline void NodeDef::set_input(int index, ::std::string&& value) {
// @@protoc_insertion_point(field_set:opencv_tensorflow.NodeDef.input)
input_.Mutable(index)->assign(std::move(value));
}
#endif
inline void NodeDef::set_input(int index, const char* value) {
GOOGLE_DCHECK(value != NULL);
input_.Mutable(index)->assign(value);
// @@protoc_insertion_point(field_set_char:opencv_tensorflow.NodeDef.input)
}
inline void NodeDef::set_input(int index, const char* value, size_t size) {
input_.Mutable(index)->assign(
reinterpret_cast<const char*>(value), size);
// @@protoc_insertion_point(field_set_pointer:opencv_tensorflow.NodeDef.input)
}
inline ::std::string* NodeDef::add_input() {
// @@protoc_insertion_point(field_add_mutable:opencv_tensorflow.NodeDef.input)
return input_.Add();
}
inline void NodeDef::add_input(const ::std::string& value) {
input_.Add()->assign(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.NodeDef.input)
}
#if LANG_CXX11
inline void NodeDef::add_input(::std::string&& value) {
input_.Add(std::move(value));
// @@protoc_insertion_point(field_add:opencv_tensorflow.NodeDef.input)
}
#endif
inline void NodeDef::add_input(const char* value) {
GOOGLE_DCHECK(value != NULL);
input_.Add()->assign(value);
// @@protoc_insertion_point(field_add_char:opencv_tensorflow.NodeDef.input)
}
inline void NodeDef::add_input(const char* value, size_t size) {
input_.Add()->assign(reinterpret_cast<const char*>(value), size);
// @@protoc_insertion_point(field_add_pointer:opencv_tensorflow.NodeDef.input)
}
inline const ::google::protobuf::RepeatedPtrField< ::std::string>&
NodeDef::input() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.NodeDef.input)
return input_;
}
inline ::google::protobuf::RepeatedPtrField< ::std::string>*
NodeDef::mutable_input() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.NodeDef.input)
return &input_;
}
// string device = 4;
inline void NodeDef::clear_device() {
device_.ClearToEmpty(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline const ::std::string& NodeDef::device() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.NodeDef.device)
return device_.Get();
}
inline void NodeDef::set_device(const ::std::string& value) {
device_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value, GetArenaNoVirtual());
// @@protoc_insertion_point(field_set:opencv_tensorflow.NodeDef.device)
}
#if LANG_CXX11
inline void NodeDef::set_device(::std::string&& value) {
device_.Set(
&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_rvalue:opencv_tensorflow.NodeDef.device)
}
#endif
inline void NodeDef::set_device(const char* value) {
GOOGLE_DCHECK(value != NULL);
device_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_char:opencv_tensorflow.NodeDef.device)
}
inline void NodeDef::set_device(const char* value,
size_t size) {
device_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(
reinterpret_cast<const char*>(value), size), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_pointer:opencv_tensorflow.NodeDef.device)
}
inline ::std::string* NodeDef::mutable_device() {
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.NodeDef.device)
return device_.Mutable(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline ::std::string* NodeDef::release_device() {
// @@protoc_insertion_point(field_release:opencv_tensorflow.NodeDef.device)
return device_.Release(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline void NodeDef::set_allocated_device(::std::string* device) {
if (device != NULL) {
} else {
}
device_.SetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), device,
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_allocated:opencv_tensorflow.NodeDef.device)
}
inline ::std::string* NodeDef::unsafe_arena_release_device() {
// @@protoc_insertion_point(field_unsafe_arena_release:opencv_tensorflow.NodeDef.device)
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
return device_.UnsafeArenaRelease(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
GetArenaNoVirtual());
}
inline void NodeDef::unsafe_arena_set_allocated_device(
::std::string* device) {
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
if (device != NULL) {
} else {
}
device_.UnsafeArenaSetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
device, GetArenaNoVirtual());
// @@protoc_insertion_point(field_unsafe_arena_set_allocated:opencv_tensorflow.NodeDef.device)
}
// map<string, .opencv_tensorflow.AttrValue> attr = 5;
inline int NodeDef::attr_size() const {
return attr_.size();
}
inline const ::google::protobuf::Map< ::std::string, ::opencv_tensorflow::AttrValue >&
NodeDef::attr() const {
// @@protoc_insertion_point(field_map:opencv_tensorflow.NodeDef.attr)
return attr_.GetMap();
}
inline ::google::protobuf::Map< ::std::string, ::opencv_tensorflow::AttrValue >*
NodeDef::mutable_attr() {
// @@protoc_insertion_point(field_mutable_map:opencv_tensorflow.NodeDef.attr)
return attr_.MutableMap();
}
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif // __GNUC__
// -------------------------------------------------------------------
// -------------------------------------------------------------------
// @@protoc_insertion_point(namespace_scope)
} // namespace opencv_tensorflow
// @@protoc_insertion_point(global_scope)
#endif // PROTOBUF_graph_2eproto__INCLUDED

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,844 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: tensor.proto
#ifndef PROTOBUF_tensor_2eproto__INCLUDED
#define PROTOBUF_tensor_2eproto__INCLUDED
#include <string>
#include <google/protobuf/stubs/common.h>
#if GOOGLE_PROTOBUF_VERSION < 3005000
#error This file was generated by a newer version of protoc which is
#error incompatible with your Protocol Buffer headers. Please update
#error your headers.
#endif
#if 3005001 < GOOGLE_PROTOBUF_MIN_PROTOC_VERSION
#error This file was generated by an older version of protoc which is
#error incompatible with your Protocol Buffer headers. Please
#error regenerate this file with a newer version of protoc.
#endif
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/arena.h>
#include <google/protobuf/arenastring.h>
#include <google/protobuf/generated_message_table_driven.h>
#include <google/protobuf/generated_message_util.h>
#include <google/protobuf/metadata.h>
#include <google/protobuf/message.h>
#include <google/protobuf/repeated_field.h> // IWYU pragma: export
#include <google/protobuf/extension_set.h> // IWYU pragma: export
#include <google/protobuf/unknown_field_set.h>
#include "tensor_shape.pb.h"
#include "types.pb.h"
// @@protoc_insertion_point(includes)
namespace protobuf_tensor_2eproto {
// Internal implementation detail -- do not use these members.
struct TableStruct {
static const ::google::protobuf::internal::ParseTableField entries[];
static const ::google::protobuf::internal::AuxillaryParseTableField aux[];
static const ::google::protobuf::internal::ParseTable schema[1];
static const ::google::protobuf::internal::FieldMetadata field_metadata[];
static const ::google::protobuf::internal::SerializationTable serialization_table[];
static const ::google::protobuf::uint32 offsets[];
};
void AddDescriptors();
void InitDefaultsTensorProtoImpl();
void InitDefaultsTensorProto();
inline void InitDefaults() {
InitDefaultsTensorProto();
}
} // namespace protobuf_tensor_2eproto
namespace opencv_tensorflow {
class TensorProto;
class TensorProtoDefaultTypeInternal;
extern TensorProtoDefaultTypeInternal _TensorProto_default_instance_;
} // namespace opencv_tensorflow
namespace opencv_tensorflow {
// ===================================================================
class TensorProto : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:opencv_tensorflow.TensorProto) */ {
public:
TensorProto();
virtual ~TensorProto();
TensorProto(const TensorProto& from);
inline TensorProto& operator=(const TensorProto& from) {
CopyFrom(from);
return *this;
}
#if LANG_CXX11
TensorProto(TensorProto&& from) noexcept
: TensorProto() {
*this = ::std::move(from);
}
inline TensorProto& operator=(TensorProto&& from) noexcept {
if (GetArenaNoVirtual() == from.GetArenaNoVirtual()) {
if (this != &from) InternalSwap(&from);
} else {
CopyFrom(from);
}
return *this;
}
#endif
inline ::google::protobuf::Arena* GetArena() const PROTOBUF_FINAL {
return GetArenaNoVirtual();
}
inline void* GetMaybeArenaPointer() const PROTOBUF_FINAL {
return MaybeArenaPtr();
}
static const ::google::protobuf::Descriptor* descriptor();
static const TensorProto& default_instance();
static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY
static inline const TensorProto* internal_default_instance() {
return reinterpret_cast<const TensorProto*>(
&_TensorProto_default_instance_);
}
static PROTOBUF_CONSTEXPR int const kIndexInFileMessages =
0;
void UnsafeArenaSwap(TensorProto* other);
void Swap(TensorProto* other);
friend void swap(TensorProto& a, TensorProto& b) {
a.Swap(&b);
}
// implements Message ----------------------------------------------
inline TensorProto* New() const PROTOBUF_FINAL { return New(NULL); }
TensorProto* New(::google::protobuf::Arena* arena) const PROTOBUF_FINAL;
void CopyFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void MergeFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void CopyFrom(const TensorProto& from);
void MergeFrom(const TensorProto& from);
void Clear() PROTOBUF_FINAL;
bool IsInitialized() const PROTOBUF_FINAL;
size_t ByteSizeLong() const PROTOBUF_FINAL;
bool MergePartialFromCodedStream(
::google::protobuf::io::CodedInputStream* input) PROTOBUF_FINAL;
void SerializeWithCachedSizes(
::google::protobuf::io::CodedOutputStream* output) const PROTOBUF_FINAL;
::google::protobuf::uint8* InternalSerializeWithCachedSizesToArray(
bool deterministic, ::google::protobuf::uint8* target) const PROTOBUF_FINAL;
int GetCachedSize() const PROTOBUF_FINAL { return _cached_size_; }
private:
void SharedCtor();
void SharedDtor();
void SetCachedSize(int size) const PROTOBUF_FINAL;
void InternalSwap(TensorProto* other);
protected:
explicit TensorProto(::google::protobuf::Arena* arena);
private:
static void ArenaDtor(void* object);
inline void RegisterArenaDtor(::google::protobuf::Arena* arena);
private:
inline ::google::protobuf::Arena* GetArenaNoVirtual() const {
return _internal_metadata_.arena();
}
inline void* MaybeArenaPtr() const {
return _internal_metadata_.raw_arena_ptr();
}
public:
::google::protobuf::Metadata GetMetadata() const PROTOBUF_FINAL;
// nested types ----------------------------------------------------
// accessors -------------------------------------------------------
// repeated float float_val = 5 [packed = true];
int float_val_size() const;
void clear_float_val();
static const int kFloatValFieldNumber = 5;
float float_val(int index) const;
void set_float_val(int index, float value);
void add_float_val(float value);
const ::google::protobuf::RepeatedField< float >&
float_val() const;
::google::protobuf::RepeatedField< float >*
mutable_float_val();
// repeated double double_val = 6 [packed = true];
int double_val_size() const;
void clear_double_val();
static const int kDoubleValFieldNumber = 6;
double double_val(int index) const;
void set_double_val(int index, double value);
void add_double_val(double value);
const ::google::protobuf::RepeatedField< double >&
double_val() const;
::google::protobuf::RepeatedField< double >*
mutable_double_val();
// repeated int32 int_val = 7 [packed = true];
int int_val_size() const;
void clear_int_val();
static const int kIntValFieldNumber = 7;
::google::protobuf::int32 int_val(int index) const;
void set_int_val(int index, ::google::protobuf::int32 value);
void add_int_val(::google::protobuf::int32 value);
const ::google::protobuf::RepeatedField< ::google::protobuf::int32 >&
int_val() const;
::google::protobuf::RepeatedField< ::google::protobuf::int32 >*
mutable_int_val();
// repeated bytes string_val = 8;
int string_val_size() const;
void clear_string_val();
static const int kStringValFieldNumber = 8;
const ::std::string& string_val(int index) const;
::std::string* mutable_string_val(int index);
void set_string_val(int index, const ::std::string& value);
#if LANG_CXX11
void set_string_val(int index, ::std::string&& value);
#endif
void set_string_val(int index, const char* value);
void set_string_val(int index, const void* value, size_t size);
::std::string* add_string_val();
void add_string_val(const ::std::string& value);
#if LANG_CXX11
void add_string_val(::std::string&& value);
#endif
void add_string_val(const char* value);
void add_string_val(const void* value, size_t size);
const ::google::protobuf::RepeatedPtrField< ::std::string>& string_val() const;
::google::protobuf::RepeatedPtrField< ::std::string>* mutable_string_val();
// repeated float scomplex_val = 9 [packed = true];
int scomplex_val_size() const;
void clear_scomplex_val();
static const int kScomplexValFieldNumber = 9;
float scomplex_val(int index) const;
void set_scomplex_val(int index, float value);
void add_scomplex_val(float value);
const ::google::protobuf::RepeatedField< float >&
scomplex_val() const;
::google::protobuf::RepeatedField< float >*
mutable_scomplex_val();
// repeated int64 int64_val = 10 [packed = true];
int int64_val_size() const;
void clear_int64_val();
static const int kInt64ValFieldNumber = 10;
::google::protobuf::int64 int64_val(int index) const;
void set_int64_val(int index, ::google::protobuf::int64 value);
void add_int64_val(::google::protobuf::int64 value);
const ::google::protobuf::RepeatedField< ::google::protobuf::int64 >&
int64_val() const;
::google::protobuf::RepeatedField< ::google::protobuf::int64 >*
mutable_int64_val();
// repeated bool bool_val = 11 [packed = true];
int bool_val_size() const;
void clear_bool_val();
static const int kBoolValFieldNumber = 11;
bool bool_val(int index) const;
void set_bool_val(int index, bool value);
void add_bool_val(bool value);
const ::google::protobuf::RepeatedField< bool >&
bool_val() const;
::google::protobuf::RepeatedField< bool >*
mutable_bool_val();
// repeated double dcomplex_val = 12 [packed = true];
int dcomplex_val_size() const;
void clear_dcomplex_val();
static const int kDcomplexValFieldNumber = 12;
double dcomplex_val(int index) const;
void set_dcomplex_val(int index, double value);
void add_dcomplex_val(double value);
const ::google::protobuf::RepeatedField< double >&
dcomplex_val() const;
::google::protobuf::RepeatedField< double >*
mutable_dcomplex_val();
// repeated int32 half_val = 13 [packed = true];
int half_val_size() const;
void clear_half_val();
static const int kHalfValFieldNumber = 13;
::google::protobuf::int32 half_val(int index) const;
void set_half_val(int index, ::google::protobuf::int32 value);
void add_half_val(::google::protobuf::int32 value);
const ::google::protobuf::RepeatedField< ::google::protobuf::int32 >&
half_val() const;
::google::protobuf::RepeatedField< ::google::protobuf::int32 >*
mutable_half_val();
// bytes tensor_content = 4;
void clear_tensor_content();
static const int kTensorContentFieldNumber = 4;
const ::std::string& tensor_content() const;
void set_tensor_content(const ::std::string& value);
#if LANG_CXX11
void set_tensor_content(::std::string&& value);
#endif
void set_tensor_content(const char* value);
void set_tensor_content(const void* value, size_t size);
::std::string* mutable_tensor_content();
::std::string* release_tensor_content();
void set_allocated_tensor_content(::std::string* tensor_content);
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
::std::string* unsafe_arena_release_tensor_content();
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
void unsafe_arena_set_allocated_tensor_content(
::std::string* tensor_content);
// .opencv_tensorflow.TensorShapeProto tensor_shape = 2;
bool has_tensor_shape() const;
void clear_tensor_shape();
static const int kTensorShapeFieldNumber = 2;
private:
void _slow_mutable_tensor_shape();
public:
const ::opencv_tensorflow::TensorShapeProto& tensor_shape() const;
::opencv_tensorflow::TensorShapeProto* release_tensor_shape();
::opencv_tensorflow::TensorShapeProto* mutable_tensor_shape();
void set_allocated_tensor_shape(::opencv_tensorflow::TensorShapeProto* tensor_shape);
void unsafe_arena_set_allocated_tensor_shape(
::opencv_tensorflow::TensorShapeProto* tensor_shape);
::opencv_tensorflow::TensorShapeProto* unsafe_arena_release_tensor_shape();
// .opencv_tensorflow.DataType dtype = 1;
void clear_dtype();
static const int kDtypeFieldNumber = 1;
::opencv_tensorflow::DataType dtype() const;
void set_dtype(::opencv_tensorflow::DataType value);
// int32 version_number = 3;
void clear_version_number();
static const int kVersionNumberFieldNumber = 3;
::google::protobuf::int32 version_number() const;
void set_version_number(::google::protobuf::int32 value);
// @@protoc_insertion_point(class_scope:opencv_tensorflow.TensorProto)
private:
::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_;
template <typename T> friend class ::google::protobuf::Arena::InternalHelper;
typedef void InternalArenaConstructable_;
typedef void DestructorSkippable_;
::google::protobuf::RepeatedField< float > float_val_;
mutable int _float_val_cached_byte_size_;
::google::protobuf::RepeatedField< double > double_val_;
mutable int _double_val_cached_byte_size_;
::google::protobuf::RepeatedField< ::google::protobuf::int32 > int_val_;
mutable int _int_val_cached_byte_size_;
::google::protobuf::RepeatedPtrField< ::std::string> string_val_;
::google::protobuf::RepeatedField< float > scomplex_val_;
mutable int _scomplex_val_cached_byte_size_;
::google::protobuf::RepeatedField< ::google::protobuf::int64 > int64_val_;
mutable int _int64_val_cached_byte_size_;
::google::protobuf::RepeatedField< bool > bool_val_;
mutable int _bool_val_cached_byte_size_;
::google::protobuf::RepeatedField< double > dcomplex_val_;
mutable int _dcomplex_val_cached_byte_size_;
::google::protobuf::RepeatedField< ::google::protobuf::int32 > half_val_;
mutable int _half_val_cached_byte_size_;
::google::protobuf::internal::ArenaStringPtr tensor_content_;
::opencv_tensorflow::TensorShapeProto* tensor_shape_;
int dtype_;
::google::protobuf::int32 version_number_;
mutable int _cached_size_;
friend struct ::protobuf_tensor_2eproto::TableStruct;
friend void ::protobuf_tensor_2eproto::InitDefaultsTensorProtoImpl();
};
// ===================================================================
// ===================================================================
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif // __GNUC__
// TensorProto
// .opencv_tensorflow.DataType dtype = 1;
inline void TensorProto::clear_dtype() {
dtype_ = 0;
}
inline ::opencv_tensorflow::DataType TensorProto::dtype() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.dtype)
return static_cast< ::opencv_tensorflow::DataType >(dtype_);
}
inline void TensorProto::set_dtype(::opencv_tensorflow::DataType value) {
dtype_ = value;
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.dtype)
}
// .opencv_tensorflow.TensorShapeProto tensor_shape = 2;
inline bool TensorProto::has_tensor_shape() const {
return this != internal_default_instance() && tensor_shape_ != NULL;
}
inline const ::opencv_tensorflow::TensorShapeProto& TensorProto::tensor_shape() const {
const ::opencv_tensorflow::TensorShapeProto* p = tensor_shape_;
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.tensor_shape)
return p != NULL ? *p : *reinterpret_cast<const ::opencv_tensorflow::TensorShapeProto*>(
&::opencv_tensorflow::_TensorShapeProto_default_instance_);
}
inline ::opencv_tensorflow::TensorShapeProto* TensorProto::release_tensor_shape() {
// @@protoc_insertion_point(field_release:opencv_tensorflow.TensorProto.tensor_shape)
::opencv_tensorflow::TensorShapeProto* temp = tensor_shape_;
if (GetArenaNoVirtual() != NULL) {
temp = ::google::protobuf::internal::DuplicateIfNonNull(temp, NULL);
}
tensor_shape_ = NULL;
return temp;
}
inline ::opencv_tensorflow::TensorShapeProto* TensorProto::unsafe_arena_release_tensor_shape() {
// @@protoc_insertion_point(field_unsafe_arena_release:opencv_tensorflow.TensorProto.tensor_shape)
::opencv_tensorflow::TensorShapeProto* temp = tensor_shape_;
tensor_shape_ = NULL;
return temp;
}
inline ::opencv_tensorflow::TensorShapeProto* TensorProto::mutable_tensor_shape() {
if (tensor_shape_ == NULL) {
_slow_mutable_tensor_shape();
}
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.TensorProto.tensor_shape)
return tensor_shape_;
}
inline void TensorProto::set_allocated_tensor_shape(::opencv_tensorflow::TensorShapeProto* tensor_shape) {
::google::protobuf::Arena* message_arena = GetArenaNoVirtual();
if (message_arena == NULL) {
delete reinterpret_cast< ::google::protobuf::MessageLite*>(tensor_shape_);
}
if (tensor_shape) {
::google::protobuf::Arena* submessage_arena =
reinterpret_cast< ::google::protobuf::MessageLite*>(tensor_shape)->GetArena();
if (message_arena != submessage_arena) {
tensor_shape = ::google::protobuf::internal::GetOwnedMessage(
message_arena, tensor_shape, submessage_arena);
}
} else {
}
tensor_shape_ = tensor_shape;
// @@protoc_insertion_point(field_set_allocated:opencv_tensorflow.TensorProto.tensor_shape)
}
// int32 version_number = 3;
inline void TensorProto::clear_version_number() {
version_number_ = 0;
}
inline ::google::protobuf::int32 TensorProto::version_number() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.version_number)
return version_number_;
}
inline void TensorProto::set_version_number(::google::protobuf::int32 value) {
version_number_ = value;
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.version_number)
}
// bytes tensor_content = 4;
inline void TensorProto::clear_tensor_content() {
tensor_content_.ClearToEmpty(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline const ::std::string& TensorProto::tensor_content() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.tensor_content)
return tensor_content_.Get();
}
inline void TensorProto::set_tensor_content(const ::std::string& value) {
tensor_content_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value, GetArenaNoVirtual());
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.tensor_content)
}
#if LANG_CXX11
inline void TensorProto::set_tensor_content(::std::string&& value) {
tensor_content_.Set(
&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_rvalue:opencv_tensorflow.TensorProto.tensor_content)
}
#endif
inline void TensorProto::set_tensor_content(const char* value) {
GOOGLE_DCHECK(value != NULL);
tensor_content_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_char:opencv_tensorflow.TensorProto.tensor_content)
}
inline void TensorProto::set_tensor_content(const void* value,
size_t size) {
tensor_content_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(
reinterpret_cast<const char*>(value), size), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_pointer:opencv_tensorflow.TensorProto.tensor_content)
}
inline ::std::string* TensorProto::mutable_tensor_content() {
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.TensorProto.tensor_content)
return tensor_content_.Mutable(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline ::std::string* TensorProto::release_tensor_content() {
// @@protoc_insertion_point(field_release:opencv_tensorflow.TensorProto.tensor_content)
return tensor_content_.Release(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline void TensorProto::set_allocated_tensor_content(::std::string* tensor_content) {
if (tensor_content != NULL) {
} else {
}
tensor_content_.SetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), tensor_content,
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_allocated:opencv_tensorflow.TensorProto.tensor_content)
}
inline ::std::string* TensorProto::unsafe_arena_release_tensor_content() {
// @@protoc_insertion_point(field_unsafe_arena_release:opencv_tensorflow.TensorProto.tensor_content)
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
return tensor_content_.UnsafeArenaRelease(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
GetArenaNoVirtual());
}
inline void TensorProto::unsafe_arena_set_allocated_tensor_content(
::std::string* tensor_content) {
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
if (tensor_content != NULL) {
} else {
}
tensor_content_.UnsafeArenaSetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
tensor_content, GetArenaNoVirtual());
// @@protoc_insertion_point(field_unsafe_arena_set_allocated:opencv_tensorflow.TensorProto.tensor_content)
}
// repeated int32 half_val = 13 [packed = true];
inline int TensorProto::half_val_size() const {
return half_val_.size();
}
inline void TensorProto::clear_half_val() {
half_val_.Clear();
}
inline ::google::protobuf::int32 TensorProto::half_val(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.half_val)
return half_val_.Get(index);
}
inline void TensorProto::set_half_val(int index, ::google::protobuf::int32 value) {
half_val_.Set(index, value);
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.half_val)
}
inline void TensorProto::add_half_val(::google::protobuf::int32 value) {
half_val_.Add(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.half_val)
}
inline const ::google::protobuf::RepeatedField< ::google::protobuf::int32 >&
TensorProto::half_val() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorProto.half_val)
return half_val_;
}
inline ::google::protobuf::RepeatedField< ::google::protobuf::int32 >*
TensorProto::mutable_half_val() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorProto.half_val)
return &half_val_;
}
// repeated float float_val = 5 [packed = true];
inline int TensorProto::float_val_size() const {
return float_val_.size();
}
inline void TensorProto::clear_float_val() {
float_val_.Clear();
}
inline float TensorProto::float_val(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.float_val)
return float_val_.Get(index);
}
inline void TensorProto::set_float_val(int index, float value) {
float_val_.Set(index, value);
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.float_val)
}
inline void TensorProto::add_float_val(float value) {
float_val_.Add(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.float_val)
}
inline const ::google::protobuf::RepeatedField< float >&
TensorProto::float_val() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorProto.float_val)
return float_val_;
}
inline ::google::protobuf::RepeatedField< float >*
TensorProto::mutable_float_val() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorProto.float_val)
return &float_val_;
}
// repeated double double_val = 6 [packed = true];
inline int TensorProto::double_val_size() const {
return double_val_.size();
}
inline void TensorProto::clear_double_val() {
double_val_.Clear();
}
inline double TensorProto::double_val(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.double_val)
return double_val_.Get(index);
}
inline void TensorProto::set_double_val(int index, double value) {
double_val_.Set(index, value);
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.double_val)
}
inline void TensorProto::add_double_val(double value) {
double_val_.Add(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.double_val)
}
inline const ::google::protobuf::RepeatedField< double >&
TensorProto::double_val() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorProto.double_val)
return double_val_;
}
inline ::google::protobuf::RepeatedField< double >*
TensorProto::mutable_double_val() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorProto.double_val)
return &double_val_;
}
// repeated int32 int_val = 7 [packed = true];
inline int TensorProto::int_val_size() const {
return int_val_.size();
}
inline void TensorProto::clear_int_val() {
int_val_.Clear();
}
inline ::google::protobuf::int32 TensorProto::int_val(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.int_val)
return int_val_.Get(index);
}
inline void TensorProto::set_int_val(int index, ::google::protobuf::int32 value) {
int_val_.Set(index, value);
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.int_val)
}
inline void TensorProto::add_int_val(::google::protobuf::int32 value) {
int_val_.Add(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.int_val)
}
inline const ::google::protobuf::RepeatedField< ::google::protobuf::int32 >&
TensorProto::int_val() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorProto.int_val)
return int_val_;
}
inline ::google::protobuf::RepeatedField< ::google::protobuf::int32 >*
TensorProto::mutable_int_val() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorProto.int_val)
return &int_val_;
}
// repeated bytes string_val = 8;
inline int TensorProto::string_val_size() const {
return string_val_.size();
}
inline void TensorProto::clear_string_val() {
string_val_.Clear();
}
inline const ::std::string& TensorProto::string_val(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.string_val)
return string_val_.Get(index);
}
inline ::std::string* TensorProto::mutable_string_val(int index) {
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.TensorProto.string_val)
return string_val_.Mutable(index);
}
inline void TensorProto::set_string_val(int index, const ::std::string& value) {
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.string_val)
string_val_.Mutable(index)->assign(value);
}
#if LANG_CXX11
inline void TensorProto::set_string_val(int index, ::std::string&& value) {
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.string_val)
string_val_.Mutable(index)->assign(std::move(value));
}
#endif
inline void TensorProto::set_string_val(int index, const char* value) {
GOOGLE_DCHECK(value != NULL);
string_val_.Mutable(index)->assign(value);
// @@protoc_insertion_point(field_set_char:opencv_tensorflow.TensorProto.string_val)
}
inline void TensorProto::set_string_val(int index, const void* value, size_t size) {
string_val_.Mutable(index)->assign(
reinterpret_cast<const char*>(value), size);
// @@protoc_insertion_point(field_set_pointer:opencv_tensorflow.TensorProto.string_val)
}
inline ::std::string* TensorProto::add_string_val() {
// @@protoc_insertion_point(field_add_mutable:opencv_tensorflow.TensorProto.string_val)
return string_val_.Add();
}
inline void TensorProto::add_string_val(const ::std::string& value) {
string_val_.Add()->assign(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.string_val)
}
#if LANG_CXX11
inline void TensorProto::add_string_val(::std::string&& value) {
string_val_.Add(std::move(value));
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.string_val)
}
#endif
inline void TensorProto::add_string_val(const char* value) {
GOOGLE_DCHECK(value != NULL);
string_val_.Add()->assign(value);
// @@protoc_insertion_point(field_add_char:opencv_tensorflow.TensorProto.string_val)
}
inline void TensorProto::add_string_val(const void* value, size_t size) {
string_val_.Add()->assign(reinterpret_cast<const char*>(value), size);
// @@protoc_insertion_point(field_add_pointer:opencv_tensorflow.TensorProto.string_val)
}
inline const ::google::protobuf::RepeatedPtrField< ::std::string>&
TensorProto::string_val() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorProto.string_val)
return string_val_;
}
inline ::google::protobuf::RepeatedPtrField< ::std::string>*
TensorProto::mutable_string_val() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorProto.string_val)
return &string_val_;
}
// repeated float scomplex_val = 9 [packed = true];
inline int TensorProto::scomplex_val_size() const {
return scomplex_val_.size();
}
inline void TensorProto::clear_scomplex_val() {
scomplex_val_.Clear();
}
inline float TensorProto::scomplex_val(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.scomplex_val)
return scomplex_val_.Get(index);
}
inline void TensorProto::set_scomplex_val(int index, float value) {
scomplex_val_.Set(index, value);
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.scomplex_val)
}
inline void TensorProto::add_scomplex_val(float value) {
scomplex_val_.Add(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.scomplex_val)
}
inline const ::google::protobuf::RepeatedField< float >&
TensorProto::scomplex_val() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorProto.scomplex_val)
return scomplex_val_;
}
inline ::google::protobuf::RepeatedField< float >*
TensorProto::mutable_scomplex_val() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorProto.scomplex_val)
return &scomplex_val_;
}
// repeated int64 int64_val = 10 [packed = true];
inline int TensorProto::int64_val_size() const {
return int64_val_.size();
}
inline void TensorProto::clear_int64_val() {
int64_val_.Clear();
}
inline ::google::protobuf::int64 TensorProto::int64_val(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.int64_val)
return int64_val_.Get(index);
}
inline void TensorProto::set_int64_val(int index, ::google::protobuf::int64 value) {
int64_val_.Set(index, value);
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.int64_val)
}
inline void TensorProto::add_int64_val(::google::protobuf::int64 value) {
int64_val_.Add(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.int64_val)
}
inline const ::google::protobuf::RepeatedField< ::google::protobuf::int64 >&
TensorProto::int64_val() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorProto.int64_val)
return int64_val_;
}
inline ::google::protobuf::RepeatedField< ::google::protobuf::int64 >*
TensorProto::mutable_int64_val() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorProto.int64_val)
return &int64_val_;
}
// repeated bool bool_val = 11 [packed = true];
inline int TensorProto::bool_val_size() const {
return bool_val_.size();
}
inline void TensorProto::clear_bool_val() {
bool_val_.Clear();
}
inline bool TensorProto::bool_val(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.bool_val)
return bool_val_.Get(index);
}
inline void TensorProto::set_bool_val(int index, bool value) {
bool_val_.Set(index, value);
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.bool_val)
}
inline void TensorProto::add_bool_val(bool value) {
bool_val_.Add(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.bool_val)
}
inline const ::google::protobuf::RepeatedField< bool >&
TensorProto::bool_val() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorProto.bool_val)
return bool_val_;
}
inline ::google::protobuf::RepeatedField< bool >*
TensorProto::mutable_bool_val() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorProto.bool_val)
return &bool_val_;
}
// repeated double dcomplex_val = 12 [packed = true];
inline int TensorProto::dcomplex_val_size() const {
return dcomplex_val_.size();
}
inline void TensorProto::clear_dcomplex_val() {
dcomplex_val_.Clear();
}
inline double TensorProto::dcomplex_val(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorProto.dcomplex_val)
return dcomplex_val_.Get(index);
}
inline void TensorProto::set_dcomplex_val(int index, double value) {
dcomplex_val_.Set(index, value);
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorProto.dcomplex_val)
}
inline void TensorProto::add_dcomplex_val(double value) {
dcomplex_val_.Add(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorProto.dcomplex_val)
}
inline const ::google::protobuf::RepeatedField< double >&
TensorProto::dcomplex_val() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorProto.dcomplex_val)
return dcomplex_val_;
}
inline ::google::protobuf::RepeatedField< double >*
TensorProto::mutable_dcomplex_val() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorProto.dcomplex_val)
return &dcomplex_val_;
}
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif // __GNUC__
// @@protoc_insertion_point(namespace_scope)
} // namespace opencv_tensorflow
// @@protoc_insertion_point(global_scope)
#endif // PROTOBUF_tensor_2eproto__INCLUDED


@@ -0,0 +1,783 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: tensor_shape.proto
#include "tensor_shape.pb.h"
#include <algorithm>
#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/port.h>
#include <google/protobuf/stubs/once.h>
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/wire_format_lite_inl.h>
#include <google/protobuf/descriptor.h>
#include <google/protobuf/generated_message_reflection.h>
#include <google/protobuf/reflection_ops.h>
#include <google/protobuf/wire_format.h>
// This is a temporary google only hack
#ifdef GOOGLE_PROTOBUF_ENFORCE_UNIQUENESS
#include "third_party/protobuf/version.h"
#endif
// @@protoc_insertion_point(includes)
namespace opencv_tensorflow {
class TensorShapeProto_DimDefaultTypeInternal {
public:
::google::protobuf::internal::ExplicitlyConstructed<TensorShapeProto_Dim>
_instance;
} _TensorShapeProto_Dim_default_instance_;
class TensorShapeProtoDefaultTypeInternal {
public:
::google::protobuf::internal::ExplicitlyConstructed<TensorShapeProto>
_instance;
} _TensorShapeProto_default_instance_;
} // namespace opencv_tensorflow
namespace protobuf_tensor_5fshape_2eproto {
void InitDefaultsTensorShapeProto_DimImpl() {
GOOGLE_PROTOBUF_VERIFY_VERSION;
#ifdef GOOGLE_PROTOBUF_ENFORCE_UNIQUENESS
::google::protobuf::internal::InitProtobufDefaultsForceUnique();
#else
::google::protobuf::internal::InitProtobufDefaults();
#endif // GOOGLE_PROTOBUF_ENFORCE_UNIQUENESS
{
void* ptr = &::opencv_tensorflow::_TensorShapeProto_Dim_default_instance_;
new (ptr) ::opencv_tensorflow::TensorShapeProto_Dim();
::google::protobuf::internal::OnShutdownDestroyMessage(ptr);
}
::opencv_tensorflow::TensorShapeProto_Dim::InitAsDefaultInstance();
}
void InitDefaultsTensorShapeProto_Dim() {
static GOOGLE_PROTOBUF_DECLARE_ONCE(once);
::google::protobuf::GoogleOnceInit(&once, &InitDefaultsTensorShapeProto_DimImpl);
}
void InitDefaultsTensorShapeProtoImpl() {
GOOGLE_PROTOBUF_VERIFY_VERSION;
#ifdef GOOGLE_PROTOBUF_ENFORCE_UNIQUENESS
::google::protobuf::internal::InitProtobufDefaultsForceUnique();
#else
::google::protobuf::internal::InitProtobufDefaults();
#endif // GOOGLE_PROTOBUF_ENFORCE_UNIQUENESS
protobuf_tensor_5fshape_2eproto::InitDefaultsTensorShapeProto_Dim();
{
void* ptr = &::opencv_tensorflow::_TensorShapeProto_default_instance_;
new (ptr) ::opencv_tensorflow::TensorShapeProto();
::google::protobuf::internal::OnShutdownDestroyMessage(ptr);
}
::opencv_tensorflow::TensorShapeProto::InitAsDefaultInstance();
}
void InitDefaultsTensorShapeProto() {
static GOOGLE_PROTOBUF_DECLARE_ONCE(once);
::google::protobuf::GoogleOnceInit(&once, &InitDefaultsTensorShapeProtoImpl);
}
::google::protobuf::Metadata file_level_metadata[2];
const ::google::protobuf::uint32 TableStruct::offsets[] GOOGLE_PROTOBUF_ATTRIBUTE_SECTION_VARIABLE(protodesc_cold) = {
~0u, // no _has_bits_
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::TensorShapeProto_Dim, _internal_metadata_),
~0u, // no _extensions_
~0u, // no _oneof_case_
~0u, // no _weak_field_map_
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::TensorShapeProto_Dim, size_),
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::TensorShapeProto_Dim, name_),
~0u, // no _has_bits_
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::TensorShapeProto, _internal_metadata_),
~0u, // no _extensions_
~0u, // no _oneof_case_
~0u, // no _weak_field_map_
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::TensorShapeProto, dim_),
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::TensorShapeProto, unknown_rank_),
};
static const ::google::protobuf::internal::MigrationSchema schemas[] GOOGLE_PROTOBUF_ATTRIBUTE_SECTION_VARIABLE(protodesc_cold) = {
{ 0, -1, sizeof(::opencv_tensorflow::TensorShapeProto_Dim)},
{ 7, -1, sizeof(::opencv_tensorflow::TensorShapeProto)},
};
static ::google::protobuf::Message const * const file_default_instances[] = {
reinterpret_cast<const ::google::protobuf::Message*>(&::opencv_tensorflow::_TensorShapeProto_Dim_default_instance_),
reinterpret_cast<const ::google::protobuf::Message*>(&::opencv_tensorflow::_TensorShapeProto_default_instance_),
};
void protobuf_AssignDescriptors() {
AddDescriptors();
::google::protobuf::MessageFactory* factory = NULL;
AssignDescriptors(
"tensor_shape.proto", schemas, file_default_instances, TableStruct::offsets, factory,
file_level_metadata, NULL, NULL);
}
void protobuf_AssignDescriptorsOnce() {
static GOOGLE_PROTOBUF_DECLARE_ONCE(once);
::google::protobuf::GoogleOnceInit(&once, &protobuf_AssignDescriptors);
}
void protobuf_RegisterTypes(const ::std::string&) GOOGLE_PROTOBUF_ATTRIBUTE_COLD;
void protobuf_RegisterTypes(const ::std::string&) {
protobuf_AssignDescriptorsOnce();
::google::protobuf::internal::RegisterAllTypes(file_level_metadata, 2);
}
void AddDescriptorsImpl() {
InitDefaults();
static const char descriptor[] GOOGLE_PROTOBUF_ATTRIBUTE_SECTION_VARIABLE(protodesc_cold) = {
"\n\022tensor_shape.proto\022\021opencv_tensorflow\""
"\201\001\n\020TensorShapeProto\0224\n\003dim\030\002 \003(\0132\'.open"
"cv_tensorflow.TensorShapeProto.Dim\022\024\n\014un"
"known_rank\030\003 \001(\010\032!\n\003Dim\022\014\n\004size\030\001 \001(\003\022\014\n"
"\004name\030\002 \001(\tB2\n\030org.tensorflow.frameworkB"
"\021TensorShapeProtosP\001\370\001\001b\006proto3"
};
::google::protobuf::DescriptorPool::InternalAddGeneratedFile(
descriptor, 231);
::google::protobuf::MessageFactory::InternalRegisterGeneratedFile(
"tensor_shape.proto", &protobuf_RegisterTypes);
}
void AddDescriptors() {
static GOOGLE_PROTOBUF_DECLARE_ONCE(once);
::google::protobuf::GoogleOnceInit(&once, &AddDescriptorsImpl);
}
// Force AddDescriptors() to be called at dynamic initialization time.
struct StaticDescriptorInitializer {
StaticDescriptorInitializer() {
AddDescriptors();
}
} static_descriptor_initializer;
} // namespace protobuf_tensor_5fshape_2eproto
namespace opencv_tensorflow {
// ===================================================================
void TensorShapeProto_Dim::InitAsDefaultInstance() {
}
#if !defined(_MSC_VER) || _MSC_VER >= 1900
const int TensorShapeProto_Dim::kSizeFieldNumber;
const int TensorShapeProto_Dim::kNameFieldNumber;
#endif // !defined(_MSC_VER) || _MSC_VER >= 1900
TensorShapeProto_Dim::TensorShapeProto_Dim()
: ::google::protobuf::Message(), _internal_metadata_(NULL) {
if (GOOGLE_PREDICT_TRUE(this != internal_default_instance())) {
::protobuf_tensor_5fshape_2eproto::InitDefaultsTensorShapeProto_Dim();
}
SharedCtor();
// @@protoc_insertion_point(constructor:opencv_tensorflow.TensorShapeProto.Dim)
}
TensorShapeProto_Dim::TensorShapeProto_Dim(::google::protobuf::Arena* arena)
: ::google::protobuf::Message(),
_internal_metadata_(arena) {
::protobuf_tensor_5fshape_2eproto::InitDefaultsTensorShapeProto_Dim();
SharedCtor();
RegisterArenaDtor(arena);
// @@protoc_insertion_point(arena_constructor:opencv_tensorflow.TensorShapeProto.Dim)
}
TensorShapeProto_Dim::TensorShapeProto_Dim(const TensorShapeProto_Dim& from)
: ::google::protobuf::Message(),
_internal_metadata_(NULL),
_cached_size_(0) {
_internal_metadata_.MergeFrom(from._internal_metadata_);
name_.UnsafeSetDefault(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
if (from.name().size() > 0) {
name_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), from.name(),
GetArenaNoVirtual());
}
size_ = from.size_;
// @@protoc_insertion_point(copy_constructor:opencv_tensorflow.TensorShapeProto.Dim)
}
void TensorShapeProto_Dim::SharedCtor() {
name_.UnsafeSetDefault(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
size_ = GOOGLE_LONGLONG(0);
_cached_size_ = 0;
}
TensorShapeProto_Dim::~TensorShapeProto_Dim() {
// @@protoc_insertion_point(destructor:opencv_tensorflow.TensorShapeProto.Dim)
SharedDtor();
}
void TensorShapeProto_Dim::SharedDtor() {
GOOGLE_DCHECK(GetArenaNoVirtual() == NULL);
name_.DestroyNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
}
void TensorShapeProto_Dim::ArenaDtor(void* object) {
TensorShapeProto_Dim* _this = reinterpret_cast< TensorShapeProto_Dim* >(object);
(void)_this;
}
void TensorShapeProto_Dim::RegisterArenaDtor(::google::protobuf::Arena* arena) {
}
void TensorShapeProto_Dim::SetCachedSize(int size) const {
GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN();
_cached_size_ = size;
GOOGLE_SAFE_CONCURRENT_WRITES_END();
}
const ::google::protobuf::Descriptor* TensorShapeProto_Dim::descriptor() {
::protobuf_tensor_5fshape_2eproto::protobuf_AssignDescriptorsOnce();
return ::protobuf_tensor_5fshape_2eproto::file_level_metadata[kIndexInFileMessages].descriptor;
}
const TensorShapeProto_Dim& TensorShapeProto_Dim::default_instance() {
::protobuf_tensor_5fshape_2eproto::InitDefaultsTensorShapeProto_Dim();
return *internal_default_instance();
}
TensorShapeProto_Dim* TensorShapeProto_Dim::New(::google::protobuf::Arena* arena) const {
return ::google::protobuf::Arena::CreateMessage<TensorShapeProto_Dim>(arena);
}
void TensorShapeProto_Dim::Clear() {
// @@protoc_insertion_point(message_clear_start:opencv_tensorflow.TensorShapeProto.Dim)
::google::protobuf::uint32 cached_has_bits = 0;
// Prevent compiler warnings about cached_has_bits being unused
(void) cached_has_bits;
name_.ClearToEmpty(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
size_ = GOOGLE_LONGLONG(0);
_internal_metadata_.Clear();
}
bool TensorShapeProto_Dim::MergePartialFromCodedStream(
::google::protobuf::io::CodedInputStream* input) {
#define DO_(EXPRESSION) if (!GOOGLE_PREDICT_TRUE(EXPRESSION)) goto failure
::google::protobuf::uint32 tag;
// @@protoc_insertion_point(parse_start:opencv_tensorflow.TensorShapeProto.Dim)
for (;;) {
::std::pair< ::google::protobuf::uint32, bool> p = input->ReadTagWithCutoffNoLastTag(127u);
tag = p.first;
if (!p.second) goto handle_unusual;
switch (::google::protobuf::internal::WireFormatLite::GetTagFieldNumber(tag)) {
// int64 size = 1;
case 1: {
if (static_cast< ::google::protobuf::uint8>(tag) ==
static_cast< ::google::protobuf::uint8>(8u /* 8 & 0xFF */)) {
DO_((::google::protobuf::internal::WireFormatLite::ReadPrimitive<
::google::protobuf::int64, ::google::protobuf::internal::WireFormatLite::TYPE_INT64>(
input, &size_)));
} else {
goto handle_unusual;
}
break;
}
// string name = 2;
case 2: {
if (static_cast< ::google::protobuf::uint8>(tag) ==
static_cast< ::google::protobuf::uint8>(18u /* 18 & 0xFF */)) {
DO_(::google::protobuf::internal::WireFormatLite::ReadString(
input, this->mutable_name()));
DO_(::google::protobuf::internal::WireFormatLite::VerifyUtf8String(
this->name().data(), static_cast<int>(this->name().length()),
::google::protobuf::internal::WireFormatLite::PARSE,
"opencv_tensorflow.TensorShapeProto.Dim.name"));
} else {
goto handle_unusual;
}
break;
}
default: {
handle_unusual:
if (tag == 0) {
goto success;
}
DO_(::google::protobuf::internal::WireFormat::SkipField(
input, tag, _internal_metadata_.mutable_unknown_fields()));
break;
}
}
}
success:
// @@protoc_insertion_point(parse_success:opencv_tensorflow.TensorShapeProto.Dim)
return true;
failure:
// @@protoc_insertion_point(parse_failure:opencv_tensorflow.TensorShapeProto.Dim)
return false;
#undef DO_
}
void TensorShapeProto_Dim::SerializeWithCachedSizes(
::google::protobuf::io::CodedOutputStream* output) const {
// @@protoc_insertion_point(serialize_start:opencv_tensorflow.TensorShapeProto.Dim)
::google::protobuf::uint32 cached_has_bits = 0;
(void) cached_has_bits;
// int64 size = 1;
if (this->size() != 0) {
::google::protobuf::internal::WireFormatLite::WriteInt64(1, this->size(), output);
}
// string name = 2;
if (this->name().size() > 0) {
::google::protobuf::internal::WireFormatLite::VerifyUtf8String(
this->name().data(), static_cast<int>(this->name().length()),
::google::protobuf::internal::WireFormatLite::SERIALIZE,
"opencv_tensorflow.TensorShapeProto.Dim.name");
::google::protobuf::internal::WireFormatLite::WriteStringMaybeAliased(
2, this->name(), output);
}
if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) {
::google::protobuf::internal::WireFormat::SerializeUnknownFields(
(::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()), output);
}
// @@protoc_insertion_point(serialize_end:opencv_tensorflow.TensorShapeProto.Dim)
}
::google::protobuf::uint8* TensorShapeProto_Dim::InternalSerializeWithCachedSizesToArray(
bool deterministic, ::google::protobuf::uint8* target) const {
(void)deterministic; // Unused
// @@protoc_insertion_point(serialize_to_array_start:opencv_tensorflow.TensorShapeProto.Dim)
::google::protobuf::uint32 cached_has_bits = 0;
(void) cached_has_bits;
// int64 size = 1;
if (this->size() != 0) {
target = ::google::protobuf::internal::WireFormatLite::WriteInt64ToArray(1, this->size(), target);
}
// string name = 2;
if (this->name().size() > 0) {
::google::protobuf::internal::WireFormatLite::VerifyUtf8String(
this->name().data(), static_cast<int>(this->name().length()),
::google::protobuf::internal::WireFormatLite::SERIALIZE,
"opencv_tensorflow.TensorShapeProto.Dim.name");
target =
::google::protobuf::internal::WireFormatLite::WriteStringToArray(
2, this->name(), target);
}
if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) {
target = ::google::protobuf::internal::WireFormat::SerializeUnknownFieldsToArray(
(::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()), target);
}
// @@protoc_insertion_point(serialize_to_array_end:opencv_tensorflow.TensorShapeProto.Dim)
return target;
}
size_t TensorShapeProto_Dim::ByteSizeLong() const {
// @@protoc_insertion_point(message_byte_size_start:opencv_tensorflow.TensorShapeProto.Dim)
size_t total_size = 0;
if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) {
total_size +=
::google::protobuf::internal::WireFormat::ComputeUnknownFieldsSize(
(::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()));
}
// string name = 2;
if (this->name().size() > 0) {
total_size += 1 +
::google::protobuf::internal::WireFormatLite::StringSize(
this->name());
}
// int64 size = 1;
if (this->size() != 0) {
total_size += 1 +
::google::protobuf::internal::WireFormatLite::Int64Size(
this->size());
}
int cached_size = ::google::protobuf::internal::ToCachedSize(total_size);
GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN();
_cached_size_ = cached_size;
GOOGLE_SAFE_CONCURRENT_WRITES_END();
return total_size;
}
void TensorShapeProto_Dim::MergeFrom(const ::google::protobuf::Message& from) {
// @@protoc_insertion_point(generalized_merge_from_start:opencv_tensorflow.TensorShapeProto.Dim)
GOOGLE_DCHECK_NE(&from, this);
const TensorShapeProto_Dim* source =
::google::protobuf::internal::DynamicCastToGenerated<const TensorShapeProto_Dim>(
&from);
if (source == NULL) {
// @@protoc_insertion_point(generalized_merge_from_cast_fail:opencv_tensorflow.TensorShapeProto.Dim)
::google::protobuf::internal::ReflectionOps::Merge(from, this);
} else {
// @@protoc_insertion_point(generalized_merge_from_cast_success:opencv_tensorflow.TensorShapeProto.Dim)
MergeFrom(*source);
}
}
void TensorShapeProto_Dim::MergeFrom(const TensorShapeProto_Dim& from) {
// @@protoc_insertion_point(class_specific_merge_from_start:opencv_tensorflow.TensorShapeProto.Dim)
GOOGLE_DCHECK_NE(&from, this);
_internal_metadata_.MergeFrom(from._internal_metadata_);
::google::protobuf::uint32 cached_has_bits = 0;
(void) cached_has_bits;
if (from.name().size() > 0) {
set_name(from.name());
}
if (from.size() != 0) {
set_size(from.size());
}
}
void TensorShapeProto_Dim::CopyFrom(const ::google::protobuf::Message& from) {
// @@protoc_insertion_point(generalized_copy_from_start:opencv_tensorflow.TensorShapeProto.Dim)
if (&from == this) return;
Clear();
MergeFrom(from);
}
void TensorShapeProto_Dim::CopyFrom(const TensorShapeProto_Dim& from) {
// @@protoc_insertion_point(class_specific_copy_from_start:opencv_tensorflow.TensorShapeProto.Dim)
if (&from == this) return;
Clear();
MergeFrom(from);
}
bool TensorShapeProto_Dim::IsInitialized() const {
return true;
}
void TensorShapeProto_Dim::Swap(TensorShapeProto_Dim* other) {
if (other == this) return;
if (GetArenaNoVirtual() == other->GetArenaNoVirtual()) {
InternalSwap(other);
} else {
TensorShapeProto_Dim* temp = New(GetArenaNoVirtual());
temp->MergeFrom(*other);
other->CopyFrom(*this);
InternalSwap(temp);
if (GetArenaNoVirtual() == NULL) {
delete temp;
}
}
}
void TensorShapeProto_Dim::UnsafeArenaSwap(TensorShapeProto_Dim* other) {
if (other == this) return;
GOOGLE_DCHECK(GetArenaNoVirtual() == other->GetArenaNoVirtual());
InternalSwap(other);
}
void TensorShapeProto_Dim::InternalSwap(TensorShapeProto_Dim* other) {
using std::swap;
name_.Swap(&other->name_);
swap(size_, other->size_);
_internal_metadata_.Swap(&other->_internal_metadata_);
swap(_cached_size_, other->_cached_size_);
}
::google::protobuf::Metadata TensorShapeProto_Dim::GetMetadata() const {
protobuf_tensor_5fshape_2eproto::protobuf_AssignDescriptorsOnce();
return ::protobuf_tensor_5fshape_2eproto::file_level_metadata[kIndexInFileMessages];
}
// ===================================================================
void TensorShapeProto::InitAsDefaultInstance() {
}
#if !defined(_MSC_VER) || _MSC_VER >= 1900
const int TensorShapeProto::kDimFieldNumber;
const int TensorShapeProto::kUnknownRankFieldNumber;
#endif // !defined(_MSC_VER) || _MSC_VER >= 1900
TensorShapeProto::TensorShapeProto()
: ::google::protobuf::Message(), _internal_metadata_(NULL) {
if (GOOGLE_PREDICT_TRUE(this != internal_default_instance())) {
::protobuf_tensor_5fshape_2eproto::InitDefaultsTensorShapeProto();
}
SharedCtor();
// @@protoc_insertion_point(constructor:opencv_tensorflow.TensorShapeProto)
}
TensorShapeProto::TensorShapeProto(::google::protobuf::Arena* arena)
: ::google::protobuf::Message(),
_internal_metadata_(arena),
dim_(arena) {
::protobuf_tensor_5fshape_2eproto::InitDefaultsTensorShapeProto();
SharedCtor();
RegisterArenaDtor(arena);
// @@protoc_insertion_point(arena_constructor:opencv_tensorflow.TensorShapeProto)
}
TensorShapeProto::TensorShapeProto(const TensorShapeProto& from)
: ::google::protobuf::Message(),
_internal_metadata_(NULL),
dim_(from.dim_),
_cached_size_(0) {
_internal_metadata_.MergeFrom(from._internal_metadata_);
unknown_rank_ = from.unknown_rank_;
// @@protoc_insertion_point(copy_constructor:opencv_tensorflow.TensorShapeProto)
}
void TensorShapeProto::SharedCtor() {
unknown_rank_ = false;
_cached_size_ = 0;
}
TensorShapeProto::~TensorShapeProto() {
// @@protoc_insertion_point(destructor:opencv_tensorflow.TensorShapeProto)
SharedDtor();
}
void TensorShapeProto::SharedDtor() {
GOOGLE_DCHECK(GetArenaNoVirtual() == NULL);
}
void TensorShapeProto::ArenaDtor(void* object) {
TensorShapeProto* _this = reinterpret_cast< TensorShapeProto* >(object);
(void)_this;
}
void TensorShapeProto::RegisterArenaDtor(::google::protobuf::Arena* arena) {
}
void TensorShapeProto::SetCachedSize(int size) const {
GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN();
_cached_size_ = size;
GOOGLE_SAFE_CONCURRENT_WRITES_END();
}
const ::google::protobuf::Descriptor* TensorShapeProto::descriptor() {
::protobuf_tensor_5fshape_2eproto::protobuf_AssignDescriptorsOnce();
return ::protobuf_tensor_5fshape_2eproto::file_level_metadata[kIndexInFileMessages].descriptor;
}
const TensorShapeProto& TensorShapeProto::default_instance() {
::protobuf_tensor_5fshape_2eproto::InitDefaultsTensorShapeProto();
return *internal_default_instance();
}
TensorShapeProto* TensorShapeProto::New(::google::protobuf::Arena* arena) const {
return ::google::protobuf::Arena::CreateMessage<TensorShapeProto>(arena);
}
void TensorShapeProto::Clear() {
// @@protoc_insertion_point(message_clear_start:opencv_tensorflow.TensorShapeProto)
::google::protobuf::uint32 cached_has_bits = 0;
// Prevent compiler warnings about cached_has_bits being unused
(void) cached_has_bits;
dim_.Clear();
unknown_rank_ = false;
_internal_metadata_.Clear();
}
bool TensorShapeProto::MergePartialFromCodedStream(
::google::protobuf::io::CodedInputStream* input) {
#define DO_(EXPRESSION) if (!GOOGLE_PREDICT_TRUE(EXPRESSION)) goto failure
::google::protobuf::uint32 tag;
// @@protoc_insertion_point(parse_start:opencv_tensorflow.TensorShapeProto)
for (;;) {
::std::pair< ::google::protobuf::uint32, bool> p = input->ReadTagWithCutoffNoLastTag(127u);
tag = p.first;
if (!p.second) goto handle_unusual;
switch (::google::protobuf::internal::WireFormatLite::GetTagFieldNumber(tag)) {
// repeated .opencv_tensorflow.TensorShapeProto.Dim dim = 2;
case 2: {
if (static_cast< ::google::protobuf::uint8>(tag) ==
static_cast< ::google::protobuf::uint8>(18u /* 18 & 0xFF */)) {
DO_(::google::protobuf::internal::WireFormatLite::ReadMessage(input, add_dim()));
} else {
goto handle_unusual;
}
break;
}
// bool unknown_rank = 3;
case 3: {
if (static_cast< ::google::protobuf::uint8>(tag) ==
static_cast< ::google::protobuf::uint8>(24u /* 24 & 0xFF */)) {
DO_((::google::protobuf::internal::WireFormatLite::ReadPrimitive<
bool, ::google::protobuf::internal::WireFormatLite::TYPE_BOOL>(
input, &unknown_rank_)));
} else {
goto handle_unusual;
}
break;
}
default: {
handle_unusual:
if (tag == 0) {
goto success;
}
DO_(::google::protobuf::internal::WireFormat::SkipField(
input, tag, _internal_metadata_.mutable_unknown_fields()));
break;
}
}
}
success:
// @@protoc_insertion_point(parse_success:opencv_tensorflow.TensorShapeProto)
return true;
failure:
// @@protoc_insertion_point(parse_failure:opencv_tensorflow.TensorShapeProto)
return false;
#undef DO_
}
void TensorShapeProto::SerializeWithCachedSizes(
::google::protobuf::io::CodedOutputStream* output) const {
// @@protoc_insertion_point(serialize_start:opencv_tensorflow.TensorShapeProto)
::google::protobuf::uint32 cached_has_bits = 0;
(void) cached_has_bits;
// repeated .opencv_tensorflow.TensorShapeProto.Dim dim = 2;
for (unsigned int i = 0,
n = static_cast<unsigned int>(this->dim_size()); i < n; i++) {
::google::protobuf::internal::WireFormatLite::WriteMessageMaybeToArray(
2, this->dim(static_cast<int>(i)), output);
}
// bool unknown_rank = 3;
if (this->unknown_rank() != 0) {
::google::protobuf::internal::WireFormatLite::WriteBool(3, this->unknown_rank(), output);
}
if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) {
::google::protobuf::internal::WireFormat::SerializeUnknownFields(
(::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()), output);
}
// @@protoc_insertion_point(serialize_end:opencv_tensorflow.TensorShapeProto)
}
::google::protobuf::uint8* TensorShapeProto::InternalSerializeWithCachedSizesToArray(
bool deterministic, ::google::protobuf::uint8* target) const {
(void)deterministic; // Unused
// @@protoc_insertion_point(serialize_to_array_start:opencv_tensorflow.TensorShapeProto)
::google::protobuf::uint32 cached_has_bits = 0;
(void) cached_has_bits;
// repeated .opencv_tensorflow.TensorShapeProto.Dim dim = 2;
for (unsigned int i = 0,
n = static_cast<unsigned int>(this->dim_size()); i < n; i++) {
target = ::google::protobuf::internal::WireFormatLite::
InternalWriteMessageToArray(
2, this->dim(static_cast<int>(i)), deterministic, target);
}
// bool unknown_rank = 3;
if (this->unknown_rank() != 0) {
target = ::google::protobuf::internal::WireFormatLite::WriteBoolToArray(3, this->unknown_rank(), target);
}
if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) {
target = ::google::protobuf::internal::WireFormat::SerializeUnknownFieldsToArray(
(::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()), target);
}
// @@protoc_insertion_point(serialize_to_array_end:opencv_tensorflow.TensorShapeProto)
return target;
}
size_t TensorShapeProto::ByteSizeLong() const {
// @@protoc_insertion_point(message_byte_size_start:opencv_tensorflow.TensorShapeProto)
size_t total_size = 0;
if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) {
total_size +=
::google::protobuf::internal::WireFormat::ComputeUnknownFieldsSize(
(::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()));
}
// repeated .opencv_tensorflow.TensorShapeProto.Dim dim = 2;
{
unsigned int count = static_cast<unsigned int>(this->dim_size());
total_size += 1UL * count;
for (unsigned int i = 0; i < count; i++) {
total_size +=
::google::protobuf::internal::WireFormatLite::MessageSize(
this->dim(static_cast<int>(i)));
}
}
// bool unknown_rank = 3;
if (this->unknown_rank() != 0) {
total_size += 1 + 1;
}
int cached_size = ::google::protobuf::internal::ToCachedSize(total_size);
GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN();
_cached_size_ = cached_size;
GOOGLE_SAFE_CONCURRENT_WRITES_END();
return total_size;
}
void TensorShapeProto::MergeFrom(const ::google::protobuf::Message& from) {
// @@protoc_insertion_point(generalized_merge_from_start:opencv_tensorflow.TensorShapeProto)
GOOGLE_DCHECK_NE(&from, this);
const TensorShapeProto* source =
::google::protobuf::internal::DynamicCastToGenerated<const TensorShapeProto>(
&from);
if (source == NULL) {
// @@protoc_insertion_point(generalized_merge_from_cast_fail:opencv_tensorflow.TensorShapeProto)
::google::protobuf::internal::ReflectionOps::Merge(from, this);
} else {
// @@protoc_insertion_point(generalized_merge_from_cast_success:opencv_tensorflow.TensorShapeProto)
MergeFrom(*source);
}
}
void TensorShapeProto::MergeFrom(const TensorShapeProto& from) {
// @@protoc_insertion_point(class_specific_merge_from_start:opencv_tensorflow.TensorShapeProto)
GOOGLE_DCHECK_NE(&from, this);
_internal_metadata_.MergeFrom(from._internal_metadata_);
::google::protobuf::uint32 cached_has_bits = 0;
(void) cached_has_bits;
dim_.MergeFrom(from.dim_);
if (from.unknown_rank() != 0) {
set_unknown_rank(from.unknown_rank());
}
}
void TensorShapeProto::CopyFrom(const ::google::protobuf::Message& from) {
// @@protoc_insertion_point(generalized_copy_from_start:opencv_tensorflow.TensorShapeProto)
if (&from == this) return;
Clear();
MergeFrom(from);
}
void TensorShapeProto::CopyFrom(const TensorShapeProto& from) {
// @@protoc_insertion_point(class_specific_copy_from_start:opencv_tensorflow.TensorShapeProto)
if (&from == this) return;
Clear();
MergeFrom(from);
}
bool TensorShapeProto::IsInitialized() const {
return true;
}
void TensorShapeProto::Swap(TensorShapeProto* other) {
if (other == this) return;
if (GetArenaNoVirtual() == other->GetArenaNoVirtual()) {
InternalSwap(other);
} else {
TensorShapeProto* temp = New(GetArenaNoVirtual());
temp->MergeFrom(*other);
other->CopyFrom(*this);
InternalSwap(temp);
if (GetArenaNoVirtual() == NULL) {
delete temp;
}
}
}
void TensorShapeProto::UnsafeArenaSwap(TensorShapeProto* other) {
if (other == this) return;
GOOGLE_DCHECK(GetArenaNoVirtual() == other->GetArenaNoVirtual());
InternalSwap(other);
}
void TensorShapeProto::InternalSwap(TensorShapeProto* other) {
using std::swap;
dim_.InternalSwap(&other->dim_);
swap(unknown_rank_, other->unknown_rank_);
_internal_metadata_.Swap(&other->_internal_metadata_);
swap(_cached_size_, other->_cached_size_);
}
::google::protobuf::Metadata TensorShapeProto::GetMetadata() const {
protobuf_tensor_5fshape_2eproto::protobuf_AssignDescriptorsOnce();
return ::protobuf_tensor_5fshape_2eproto::file_level_metadata[kIndexInFileMessages];
}
// @@protoc_insertion_point(namespace_scope)
} // namespace opencv_tensorflow
// @@protoc_insertion_point(global_scope)


@@ -0,0 +1,491 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: tensor_shape.proto
#ifndef PROTOBUF_tensor_5fshape_2eproto__INCLUDED
#define PROTOBUF_tensor_5fshape_2eproto__INCLUDED
#include <string>
#include <google/protobuf/stubs/common.h>
#if GOOGLE_PROTOBUF_VERSION < 3005000
#error This file was generated by a newer version of protoc which is
#error incompatible with your Protocol Buffer headers. Please update
#error your headers.
#endif
#if 3005001 < GOOGLE_PROTOBUF_MIN_PROTOC_VERSION
#error This file was generated by an older version of protoc which is
#error incompatible with your Protocol Buffer headers. Please
#error regenerate this file with a newer version of protoc.
#endif
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/arena.h>
#include <google/protobuf/arenastring.h>
#include <google/protobuf/generated_message_table_driven.h>
#include <google/protobuf/generated_message_util.h>
#include <google/protobuf/metadata.h>
#include <google/protobuf/message.h>
#include <google/protobuf/repeated_field.h> // IWYU pragma: export
#include <google/protobuf/extension_set.h> // IWYU pragma: export
#include <google/protobuf/unknown_field_set.h>
// @@protoc_insertion_point(includes)
namespace protobuf_tensor_5fshape_2eproto {
// Internal implementation detail -- do not use these members.
struct TableStruct {
static const ::google::protobuf::internal::ParseTableField entries[];
static const ::google::protobuf::internal::AuxillaryParseTableField aux[];
static const ::google::protobuf::internal::ParseTable schema[2];
static const ::google::protobuf::internal::FieldMetadata field_metadata[];
static const ::google::protobuf::internal::SerializationTable serialization_table[];
static const ::google::protobuf::uint32 offsets[];
};
void AddDescriptors();
void InitDefaultsTensorShapeProto_DimImpl();
void InitDefaultsTensorShapeProto_Dim();
void InitDefaultsTensorShapeProtoImpl();
void InitDefaultsTensorShapeProto();
inline void InitDefaults() {
InitDefaultsTensorShapeProto_Dim();
InitDefaultsTensorShapeProto();
}
} // namespace protobuf_tensor_5fshape_2eproto
namespace opencv_tensorflow {
class TensorShapeProto;
class TensorShapeProtoDefaultTypeInternal;
extern TensorShapeProtoDefaultTypeInternal _TensorShapeProto_default_instance_;
class TensorShapeProto_Dim;
class TensorShapeProto_DimDefaultTypeInternal;
extern TensorShapeProto_DimDefaultTypeInternal _TensorShapeProto_Dim_default_instance_;
} // namespace opencv_tensorflow
namespace opencv_tensorflow {
// ===================================================================
class TensorShapeProto_Dim : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:opencv_tensorflow.TensorShapeProto.Dim) */ {
public:
TensorShapeProto_Dim();
virtual ~TensorShapeProto_Dim();
TensorShapeProto_Dim(const TensorShapeProto_Dim& from);
inline TensorShapeProto_Dim& operator=(const TensorShapeProto_Dim& from) {
CopyFrom(from);
return *this;
}
#if LANG_CXX11
TensorShapeProto_Dim(TensorShapeProto_Dim&& from) noexcept
: TensorShapeProto_Dim() {
*this = ::std::move(from);
}
inline TensorShapeProto_Dim& operator=(TensorShapeProto_Dim&& from) noexcept {
if (GetArenaNoVirtual() == from.GetArenaNoVirtual()) {
if (this != &from) InternalSwap(&from);
} else {
CopyFrom(from);
}
return *this;
}
#endif
inline ::google::protobuf::Arena* GetArena() const PROTOBUF_FINAL {
return GetArenaNoVirtual();
}
inline void* GetMaybeArenaPointer() const PROTOBUF_FINAL {
return MaybeArenaPtr();
}
static const ::google::protobuf::Descriptor* descriptor();
static const TensorShapeProto_Dim& default_instance();
static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY
static inline const TensorShapeProto_Dim* internal_default_instance() {
return reinterpret_cast<const TensorShapeProto_Dim*>(
&_TensorShapeProto_Dim_default_instance_);
}
static PROTOBUF_CONSTEXPR int const kIndexInFileMessages =
0;
void UnsafeArenaSwap(TensorShapeProto_Dim* other);
void Swap(TensorShapeProto_Dim* other);
friend void swap(TensorShapeProto_Dim& a, TensorShapeProto_Dim& b) {
a.Swap(&b);
}
// implements Message ----------------------------------------------
inline TensorShapeProto_Dim* New() const PROTOBUF_FINAL { return New(NULL); }
TensorShapeProto_Dim* New(::google::protobuf::Arena* arena) const PROTOBUF_FINAL;
void CopyFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void MergeFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void CopyFrom(const TensorShapeProto_Dim& from);
void MergeFrom(const TensorShapeProto_Dim& from);
void Clear() PROTOBUF_FINAL;
bool IsInitialized() const PROTOBUF_FINAL;
size_t ByteSizeLong() const PROTOBUF_FINAL;
bool MergePartialFromCodedStream(
::google::protobuf::io::CodedInputStream* input) PROTOBUF_FINAL;
void SerializeWithCachedSizes(
::google::protobuf::io::CodedOutputStream* output) const PROTOBUF_FINAL;
::google::protobuf::uint8* InternalSerializeWithCachedSizesToArray(
bool deterministic, ::google::protobuf::uint8* target) const PROTOBUF_FINAL;
int GetCachedSize() const PROTOBUF_FINAL { return _cached_size_; }
private:
void SharedCtor();
void SharedDtor();
void SetCachedSize(int size) const PROTOBUF_FINAL;
void InternalSwap(TensorShapeProto_Dim* other);
protected:
explicit TensorShapeProto_Dim(::google::protobuf::Arena* arena);
private:
static void ArenaDtor(void* object);
inline void RegisterArenaDtor(::google::protobuf::Arena* arena);
private:
inline ::google::protobuf::Arena* GetArenaNoVirtual() const {
return _internal_metadata_.arena();
}
inline void* MaybeArenaPtr() const {
return _internal_metadata_.raw_arena_ptr();
}
public:
::google::protobuf::Metadata GetMetadata() const PROTOBUF_FINAL;
// nested types ----------------------------------------------------
// accessors -------------------------------------------------------
// string name = 2;
void clear_name();
static const int kNameFieldNumber = 2;
const ::std::string& name() const;
void set_name(const ::std::string& value);
#if LANG_CXX11
void set_name(::std::string&& value);
#endif
void set_name(const char* value);
void set_name(const char* value, size_t size);
::std::string* mutable_name();
::std::string* release_name();
void set_allocated_name(::std::string* name);
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
::std::string* unsafe_arena_release_name();
PROTOBUF_RUNTIME_DEPRECATED("The unsafe_arena_ accessors for"
" string fields are deprecated and will be removed in a"
" future release.")
void unsafe_arena_set_allocated_name(
::std::string* name);
// int64 size = 1;
void clear_size();
static const int kSizeFieldNumber = 1;
::google::protobuf::int64 size() const;
void set_size(::google::protobuf::int64 value);
// @@protoc_insertion_point(class_scope:opencv_tensorflow.TensorShapeProto.Dim)
private:
::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_;
template <typename T> friend class ::google::protobuf::Arena::InternalHelper;
typedef void InternalArenaConstructable_;
typedef void DestructorSkippable_;
::google::protobuf::internal::ArenaStringPtr name_;
::google::protobuf::int64 size_;
mutable int _cached_size_;
friend struct ::protobuf_tensor_5fshape_2eproto::TableStruct;
friend void ::protobuf_tensor_5fshape_2eproto::InitDefaultsTensorShapeProto_DimImpl();
};
// -------------------------------------------------------------------
class TensorShapeProto : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:opencv_tensorflow.TensorShapeProto) */ {
public:
TensorShapeProto();
virtual ~TensorShapeProto();
TensorShapeProto(const TensorShapeProto& from);
inline TensorShapeProto& operator=(const TensorShapeProto& from) {
CopyFrom(from);
return *this;
}
#if LANG_CXX11
TensorShapeProto(TensorShapeProto&& from) noexcept
: TensorShapeProto() {
*this = ::std::move(from);
}
inline TensorShapeProto& operator=(TensorShapeProto&& from) noexcept {
if (GetArenaNoVirtual() == from.GetArenaNoVirtual()) {
if (this != &from) InternalSwap(&from);
} else {
CopyFrom(from);
}
return *this;
}
#endif
inline ::google::protobuf::Arena* GetArena() const PROTOBUF_FINAL {
return GetArenaNoVirtual();
}
inline void* GetMaybeArenaPointer() const PROTOBUF_FINAL {
return MaybeArenaPtr();
}
static const ::google::protobuf::Descriptor* descriptor();
static const TensorShapeProto& default_instance();
static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY
static inline const TensorShapeProto* internal_default_instance() {
return reinterpret_cast<const TensorShapeProto*>(
&_TensorShapeProto_default_instance_);
}
static PROTOBUF_CONSTEXPR int const kIndexInFileMessages =
1;
void UnsafeArenaSwap(TensorShapeProto* other);
void Swap(TensorShapeProto* other);
friend void swap(TensorShapeProto& a, TensorShapeProto& b) {
a.Swap(&b);
}
// implements Message ----------------------------------------------
inline TensorShapeProto* New() const PROTOBUF_FINAL { return New(NULL); }
TensorShapeProto* New(::google::protobuf::Arena* arena) const PROTOBUF_FINAL;
void CopyFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void MergeFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void CopyFrom(const TensorShapeProto& from);
void MergeFrom(const TensorShapeProto& from);
void Clear() PROTOBUF_FINAL;
bool IsInitialized() const PROTOBUF_FINAL;
size_t ByteSizeLong() const PROTOBUF_FINAL;
bool MergePartialFromCodedStream(
::google::protobuf::io::CodedInputStream* input) PROTOBUF_FINAL;
void SerializeWithCachedSizes(
::google::protobuf::io::CodedOutputStream* output) const PROTOBUF_FINAL;
::google::protobuf::uint8* InternalSerializeWithCachedSizesToArray(
bool deterministic, ::google::protobuf::uint8* target) const PROTOBUF_FINAL;
int GetCachedSize() const PROTOBUF_FINAL { return _cached_size_; }
private:
void SharedCtor();
void SharedDtor();
void SetCachedSize(int size) const PROTOBUF_FINAL;
void InternalSwap(TensorShapeProto* other);
protected:
explicit TensorShapeProto(::google::protobuf::Arena* arena);
private:
static void ArenaDtor(void* object);
inline void RegisterArenaDtor(::google::protobuf::Arena* arena);
private:
inline ::google::protobuf::Arena* GetArenaNoVirtual() const {
return _internal_metadata_.arena();
}
inline void* MaybeArenaPtr() const {
return _internal_metadata_.raw_arena_ptr();
}
public:
::google::protobuf::Metadata GetMetadata() const PROTOBUF_FINAL;
// nested types ----------------------------------------------------
typedef TensorShapeProto_Dim Dim;
// accessors -------------------------------------------------------
// repeated .opencv_tensorflow.TensorShapeProto.Dim dim = 2;
int dim_size() const;
void clear_dim();
static const int kDimFieldNumber = 2;
const ::opencv_tensorflow::TensorShapeProto_Dim& dim(int index) const;
::opencv_tensorflow::TensorShapeProto_Dim* mutable_dim(int index);
::opencv_tensorflow::TensorShapeProto_Dim* add_dim();
::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::TensorShapeProto_Dim >*
mutable_dim();
const ::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::TensorShapeProto_Dim >&
dim() const;
// bool unknown_rank = 3;
void clear_unknown_rank();
static const int kUnknownRankFieldNumber = 3;
bool unknown_rank() const;
void set_unknown_rank(bool value);
// @@protoc_insertion_point(class_scope:opencv_tensorflow.TensorShapeProto)
private:
::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_;
template <typename T> friend class ::google::protobuf::Arena::InternalHelper;
typedef void InternalArenaConstructable_;
typedef void DestructorSkippable_;
::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::TensorShapeProto_Dim > dim_;
bool unknown_rank_;
mutable int _cached_size_;
friend struct ::protobuf_tensor_5fshape_2eproto::TableStruct;
friend void ::protobuf_tensor_5fshape_2eproto::InitDefaultsTensorShapeProtoImpl();
};
// ===================================================================
// ===================================================================
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif // __GNUC__
// TensorShapeProto_Dim
// int64 size = 1;
inline void TensorShapeProto_Dim::clear_size() {
size_ = GOOGLE_LONGLONG(0);
}
inline ::google::protobuf::int64 TensorShapeProto_Dim::size() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorShapeProto.Dim.size)
return size_;
}
inline void TensorShapeProto_Dim::set_size(::google::protobuf::int64 value) {
size_ = value;
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorShapeProto.Dim.size)
}
// string name = 2;
inline void TensorShapeProto_Dim::clear_name() {
name_.ClearToEmpty(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline const ::std::string& TensorShapeProto_Dim::name() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorShapeProto.Dim.name)
return name_.Get();
}
inline void TensorShapeProto_Dim::set_name(const ::std::string& value) {
name_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value, GetArenaNoVirtual());
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorShapeProto.Dim.name)
}
#if LANG_CXX11
inline void TensorShapeProto_Dim::set_name(::std::string&& value) {
name_.Set(
&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_rvalue:opencv_tensorflow.TensorShapeProto.Dim.name)
}
#endif
inline void TensorShapeProto_Dim::set_name(const char* value) {
GOOGLE_DCHECK(value != NULL);
name_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value),
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_char:opencv_tensorflow.TensorShapeProto.Dim.name)
}
inline void TensorShapeProto_Dim::set_name(const char* value,
size_t size) {
name_.Set(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(
reinterpret_cast<const char*>(value), size), GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_pointer:opencv_tensorflow.TensorShapeProto.Dim.name)
}
inline ::std::string* TensorShapeProto_Dim::mutable_name() {
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.TensorShapeProto.Dim.name)
return name_.Mutable(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline ::std::string* TensorShapeProto_Dim::release_name() {
// @@protoc_insertion_point(field_release:opencv_tensorflow.TensorShapeProto.Dim.name)
return name_.Release(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), GetArenaNoVirtual());
}
inline void TensorShapeProto_Dim::set_allocated_name(::std::string* name) {
if (name != NULL) {
} else {
}
name_.SetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), name,
GetArenaNoVirtual());
// @@protoc_insertion_point(field_set_allocated:opencv_tensorflow.TensorShapeProto.Dim.name)
}
inline ::std::string* TensorShapeProto_Dim::unsafe_arena_release_name() {
// @@protoc_insertion_point(field_unsafe_arena_release:opencv_tensorflow.TensorShapeProto.Dim.name)
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
return name_.UnsafeArenaRelease(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
GetArenaNoVirtual());
}
inline void TensorShapeProto_Dim::unsafe_arena_set_allocated_name(
::std::string* name) {
GOOGLE_DCHECK(GetArenaNoVirtual() != NULL);
if (name != NULL) {
} else {
}
name_.UnsafeArenaSetAllocated(&::google::protobuf::internal::GetEmptyStringAlreadyInited(),
name, GetArenaNoVirtual());
// @@protoc_insertion_point(field_unsafe_arena_set_allocated:opencv_tensorflow.TensorShapeProto.Dim.name)
}
// -------------------------------------------------------------------
// TensorShapeProto
// repeated .opencv_tensorflow.TensorShapeProto.Dim dim = 2;
inline int TensorShapeProto::dim_size() const {
return dim_.size();
}
inline void TensorShapeProto::clear_dim() {
dim_.Clear();
}
inline const ::opencv_tensorflow::TensorShapeProto_Dim& TensorShapeProto::dim(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorShapeProto.dim)
return dim_.Get(index);
}
inline ::opencv_tensorflow::TensorShapeProto_Dim* TensorShapeProto::mutable_dim(int index) {
// @@protoc_insertion_point(field_mutable:opencv_tensorflow.TensorShapeProto.dim)
return dim_.Mutable(index);
}
inline ::opencv_tensorflow::TensorShapeProto_Dim* TensorShapeProto::add_dim() {
// @@protoc_insertion_point(field_add:opencv_tensorflow.TensorShapeProto.dim)
return dim_.Add();
}
inline ::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::TensorShapeProto_Dim >*
TensorShapeProto::mutable_dim() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.TensorShapeProto.dim)
return &dim_;
}
inline const ::google::protobuf::RepeatedPtrField< ::opencv_tensorflow::TensorShapeProto_Dim >&
TensorShapeProto::dim() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.TensorShapeProto.dim)
return dim_;
}
// bool unknown_rank = 3;
inline void TensorShapeProto::clear_unknown_rank() {
unknown_rank_ = false;
}
inline bool TensorShapeProto::unknown_rank() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.TensorShapeProto.unknown_rank)
return unknown_rank_;
}
inline void TensorShapeProto::set_unknown_rank(bool value) {
unknown_rank_ = value;
// @@protoc_insertion_point(field_set:opencv_tensorflow.TensorShapeProto.unknown_rank)
}
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif // __GNUC__
// -------------------------------------------------------------------
// @@protoc_insertion_point(namespace_scope)
} // namespace opencv_tensorflow
// @@protoc_insertion_point(global_scope)
#endif // PROTOBUF_tensor_5fshape_2eproto__INCLUDED
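A minimal usage sketch for the generated TensorShapeProto API declared above (a hypothetical standalone program, not part of this commit; it only assumes the header is included as "tensor_shape.pb.h" as in the matching .pb.cc file):

// Illustrative only: builds a 2-D shape with the accessors declared above.
#include "tensor_shape.pb.h"
#include <iostream>

int main() {
    opencv_tensorflow::TensorShapeProto shape;
    // Each Dim carries an int64 size and an optional string name.
    opencv_tensorflow::TensorShapeProto_Dim* rows = shape.add_dim();
    rows->set_size(224);
    rows->set_name("height");
    opencv_tensorflow::TensorShapeProto_Dim* cols = shape.add_dim();
    cols->set_size(224);
    cols->set_name("width");
    shape.set_unknown_rank(false);
    std::cout << "dims: " << shape.dim_size()
              << ", first size: " << shape.dim(0).size() << std::endl;
    return 0;
}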

View File

@@ -0,0 +1,144 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: types.proto
#include "types.pb.h"
#include <algorithm>
#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/port.h>
#include <google/protobuf/stubs/once.h>
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/wire_format_lite_inl.h>
#include <google/protobuf/descriptor.h>
#include <google/protobuf/generated_message_reflection.h>
#include <google/protobuf/reflection_ops.h>
#include <google/protobuf/wire_format.h>
// This is a temporary google only hack
#ifdef GOOGLE_PROTOBUF_ENFORCE_UNIQUENESS
#include "third_party/protobuf/version.h"
#endif
// @@protoc_insertion_point(includes)
namespace opencv_tensorflow {
} // namespace opencv_tensorflow
namespace protobuf_types_2eproto {
const ::google::protobuf::EnumDescriptor* file_level_enum_descriptors[1];
const ::google::protobuf::uint32 TableStruct::offsets[1] = {};
static const ::google::protobuf::internal::MigrationSchema* schemas = NULL;
static const ::google::protobuf::Message* const* file_default_instances = NULL;
void protobuf_AssignDescriptors() {
AddDescriptors();
::google::protobuf::MessageFactory* factory = NULL;
AssignDescriptors(
"types.proto", schemas, file_default_instances, TableStruct::offsets, factory,
NULL, file_level_enum_descriptors, NULL);
}
void protobuf_AssignDescriptorsOnce() {
static GOOGLE_PROTOBUF_DECLARE_ONCE(once);
::google::protobuf::GoogleOnceInit(&once, &protobuf_AssignDescriptors);
}
void protobuf_RegisterTypes(const ::std::string&) GOOGLE_PROTOBUF_ATTRIBUTE_COLD;
void protobuf_RegisterTypes(const ::std::string&) {
protobuf_AssignDescriptorsOnce();
}
void AddDescriptorsImpl() {
InitDefaults();
static const char descriptor[] GOOGLE_PROTOBUF_ATTRIBUTE_SECTION_VARIABLE(protodesc_cold) = {
"\n\013types.proto\022\021opencv_tensorflow*\234\005\n\010Dat"
"aType\022\016\n\nDT_INVALID\020\000\022\014\n\010DT_FLOAT\020\001\022\r\n\tD"
"T_DOUBLE\020\002\022\014\n\010DT_INT32\020\003\022\014\n\010DT_UINT8\020\004\022\014"
"\n\010DT_INT16\020\005\022\013\n\007DT_INT8\020\006\022\r\n\tDT_STRING\020\007"
"\022\020\n\014DT_COMPLEX64\020\010\022\014\n\010DT_INT64\020\t\022\013\n\007DT_B"
"OOL\020\n\022\014\n\010DT_QINT8\020\013\022\r\n\tDT_QUINT8\020\014\022\r\n\tDT"
"_QINT32\020\r\022\017\n\013DT_BFLOAT16\020\016\022\r\n\tDT_QINT16\020"
"\017\022\016\n\nDT_QUINT16\020\020\022\r\n\tDT_UINT16\020\021\022\021\n\rDT_C"
"OMPLEX128\020\022\022\013\n\007DT_HALF\020\023\022\020\n\014DT_FLOAT_REF"
"\020e\022\021\n\rDT_DOUBLE_REF\020f\022\020\n\014DT_INT32_REF\020g\022"
"\020\n\014DT_UINT8_REF\020h\022\020\n\014DT_INT16_REF\020i\022\017\n\013D"
"T_INT8_REF\020j\022\021\n\rDT_STRING_REF\020k\022\024\n\020DT_CO"
"MPLEX64_REF\020l\022\020\n\014DT_INT64_REF\020m\022\017\n\013DT_BO"
"OL_REF\020n\022\020\n\014DT_QINT8_REF\020o\022\021\n\rDT_QUINT8_"
"REF\020p\022\021\n\rDT_QINT32_REF\020q\022\023\n\017DT_BFLOAT16_"
"REF\020r\022\021\n\rDT_QINT16_REF\020s\022\022\n\016DT_QUINT16_R"
"EF\020t\022\021\n\rDT_UINT16_REF\020u\022\025\n\021DT_COMPLEX128"
"_REF\020v\022\017\n\013DT_HALF_REF\020wB,\n\030org.tensorflo"
"w.frameworkB\013TypesProtosP\001\370\001\001b\006proto3"
};
::google::protobuf::DescriptorPool::InternalAddGeneratedFile(
descriptor, 757);
::google::protobuf::MessageFactory::InternalRegisterGeneratedFile(
"types.proto", &protobuf_RegisterTypes);
}
void AddDescriptors() {
static GOOGLE_PROTOBUF_DECLARE_ONCE(once);
::google::protobuf::GoogleOnceInit(&once, &AddDescriptorsImpl);
}
// Force AddDescriptors() to be called at dynamic initialization time.
struct StaticDescriptorInitializer {
StaticDescriptorInitializer() {
AddDescriptors();
}
} static_descriptor_initializer;
} // namespace protobuf_types_2eproto
namespace opencv_tensorflow {
const ::google::protobuf::EnumDescriptor* DataType_descriptor() {
protobuf_types_2eproto::protobuf_AssignDescriptorsOnce();
return protobuf_types_2eproto::file_level_enum_descriptors[0];
}
bool DataType_IsValid(int value) {
switch (value) {
case 0:
case 1:
case 2:
case 3:
case 4:
case 5:
case 6:
case 7:
case 8:
case 9:
case 10:
case 11:
case 12:
case 13:
case 14:
case 15:
case 16:
case 17:
case 18:
case 19:
case 101:
case 102:
case 103:
case 104:
case 105:
case 106:
case 107:
case 108:
case 109:
case 110:
case 111:
case 112:
case 113:
case 114:
case 115:
case 116:
case 117:
case 118:
case 119:
return true;
default:
return false;
}
}
// @@protoc_insertion_point(namespace_scope)
} // namespace opencv_tensorflow
// @@protoc_insertion_point(global_scope)

View File

@@ -0,0 +1,143 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: types.proto
#ifndef PROTOBUF_types_2eproto__INCLUDED
#define PROTOBUF_types_2eproto__INCLUDED
#include <string>
#include <google/protobuf/stubs/common.h>
#if GOOGLE_PROTOBUF_VERSION < 3005000
#error This file was generated by a newer version of protoc which is
#error incompatible with your Protocol Buffer headers. Please update
#error your headers.
#endif
#if 3005001 < GOOGLE_PROTOBUF_MIN_PROTOC_VERSION
#error This file was generated by an older version of protoc which is
#error incompatible with your Protocol Buffer headers. Please
#error regenerate this file with a newer version of protoc.
#endif
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/arena.h>
#include <google/protobuf/arenastring.h>
#include <google/protobuf/generated_message_table_driven.h>
#include <google/protobuf/generated_message_util.h>
#include <google/protobuf/metadata.h>
#include <google/protobuf/repeated_field.h> // IWYU pragma: export
#include <google/protobuf/extension_set.h> // IWYU pragma: export
#include <google/protobuf/generated_enum_reflection.h>
// @@protoc_insertion_point(includes)
namespace protobuf_types_2eproto {
// Internal implementation detail -- do not use these members.
struct TableStruct {
static const ::google::protobuf::internal::ParseTableField entries[];
static const ::google::protobuf::internal::AuxillaryParseTableField aux[];
static const ::google::protobuf::internal::ParseTable schema[1];
static const ::google::protobuf::internal::FieldMetadata field_metadata[];
static const ::google::protobuf::internal::SerializationTable serialization_table[];
static const ::google::protobuf::uint32 offsets[];
};
void AddDescriptors();
inline void InitDefaults() {
}
} // namespace protobuf_types_2eproto
namespace opencv_tensorflow {
} // namespace opencv_tensorflow
namespace opencv_tensorflow {
enum DataType {
DT_INVALID = 0,
DT_FLOAT = 1,
DT_DOUBLE = 2,
DT_INT32 = 3,
DT_UINT8 = 4,
DT_INT16 = 5,
DT_INT8 = 6,
DT_STRING = 7,
DT_COMPLEX64 = 8,
DT_INT64 = 9,
DT_BOOL = 10,
DT_QINT8 = 11,
DT_QUINT8 = 12,
DT_QINT32 = 13,
DT_BFLOAT16 = 14,
DT_QINT16 = 15,
DT_QUINT16 = 16,
DT_UINT16 = 17,
DT_COMPLEX128 = 18,
DT_HALF = 19,
DT_FLOAT_REF = 101,
DT_DOUBLE_REF = 102,
DT_INT32_REF = 103,
DT_UINT8_REF = 104,
DT_INT16_REF = 105,
DT_INT8_REF = 106,
DT_STRING_REF = 107,
DT_COMPLEX64_REF = 108,
DT_INT64_REF = 109,
DT_BOOL_REF = 110,
DT_QINT8_REF = 111,
DT_QUINT8_REF = 112,
DT_QINT32_REF = 113,
DT_BFLOAT16_REF = 114,
DT_QINT16_REF = 115,
DT_QUINT16_REF = 116,
DT_UINT16_REF = 117,
DT_COMPLEX128_REF = 118,
DT_HALF_REF = 119,
DataType_INT_MIN_SENTINEL_DO_NOT_USE_ = ::google::protobuf::kint32min,
DataType_INT_MAX_SENTINEL_DO_NOT_USE_ = ::google::protobuf::kint32max
};
bool DataType_IsValid(int value);
const DataType DataType_MIN = DT_INVALID;
const DataType DataType_MAX = DT_HALF_REF;
const int DataType_ARRAYSIZE = DataType_MAX + 1;
const ::google::protobuf::EnumDescriptor* DataType_descriptor();
inline const ::std::string& DataType_Name(DataType value) {
return ::google::protobuf::internal::NameOfEnum(
DataType_descriptor(), value);
}
inline bool DataType_Parse(
const ::std::string& name, DataType* value) {
return ::google::protobuf::internal::ParseNamedEnum<DataType>(
DataType_descriptor(), name, value);
}
// ===================================================================
// ===================================================================
// ===================================================================
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif // __GNUC__
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif // __GNUC__
// @@protoc_insertion_point(namespace_scope)
} // namespace opencv_tensorflow
namespace google {
namespace protobuf {
template <> struct is_proto_enum< ::opencv_tensorflow::DataType> : ::google::protobuf::internal::true_type {};
template <>
inline const EnumDescriptor* GetEnumDescriptor< ::opencv_tensorflow::DataType>() {
return ::opencv_tensorflow::DataType_descriptor();
}
} // namespace protobuf
} // namespace google
// @@protoc_insertion_point(global_scope)
#endif // PROTOBUF_types_2eproto__INCLUDED
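A minimal sketch of the DataType helpers declared above (hypothetical example code, not part of this commit; it assumes the header is included as "types.pb.h" as the generated .pb.cc does):

// Illustrative only: enum name lookup, validation and parsing.
#include "types.pb.h"
#include <iostream>

int main() {
    opencv_tensorflow::DataType dt = opencv_tensorflow::DT_FLOAT;
    std::cout << opencv_tensorflow::DataType_Name(dt) << std::endl;      // prints "DT_FLOAT"
    std::cout << opencv_tensorflow::DataType_IsValid(42) << std::endl;   // prints 0: 42 is not a defined value
    opencv_tensorflow::DataType parsed;
    if (opencv_tensorflow::DataType_Parse("DT_INT32", &parsed)) {
        std::cout << static_cast<int>(parsed) << std::endl;              // prints 3
    }
    return 0;
}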

View File

@@ -0,0 +1,492 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: versions.proto
#include "versions.pb.h"
#include <algorithm>
#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/port.h>
#include <google/protobuf/stubs/once.h>
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/wire_format_lite_inl.h>
#include <google/protobuf/descriptor.h>
#include <google/protobuf/generated_message_reflection.h>
#include <google/protobuf/reflection_ops.h>
#include <google/protobuf/wire_format.h>
// This is a temporary google only hack
#ifdef GOOGLE_PROTOBUF_ENFORCE_UNIQUENESS
#include "third_party/protobuf/version.h"
#endif
// @@protoc_insertion_point(includes)
namespace opencv_tensorflow {
class VersionDefDefaultTypeInternal {
public:
::google::protobuf::internal::ExplicitlyConstructed<VersionDef>
_instance;
} _VersionDef_default_instance_;
} // namespace opencv_tensorflow
namespace protobuf_versions_2eproto {
void InitDefaultsVersionDefImpl() {
GOOGLE_PROTOBUF_VERIFY_VERSION;
#ifdef GOOGLE_PROTOBUF_ENFORCE_UNIQUENESS
::google::protobuf::internal::InitProtobufDefaultsForceUnique();
#else
::google::protobuf::internal::InitProtobufDefaults();
#endif // GOOGLE_PROTOBUF_ENFORCE_UNIQUENESS
{
void* ptr = &::opencv_tensorflow::_VersionDef_default_instance_;
new (ptr) ::opencv_tensorflow::VersionDef();
::google::protobuf::internal::OnShutdownDestroyMessage(ptr);
}
::opencv_tensorflow::VersionDef::InitAsDefaultInstance();
}
void InitDefaultsVersionDef() {
static GOOGLE_PROTOBUF_DECLARE_ONCE(once);
::google::protobuf::GoogleOnceInit(&once, &InitDefaultsVersionDefImpl);
}
::google::protobuf::Metadata file_level_metadata[1];
const ::google::protobuf::uint32 TableStruct::offsets[] GOOGLE_PROTOBUF_ATTRIBUTE_SECTION_VARIABLE(protodesc_cold) = {
~0u, // no _has_bits_
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::VersionDef, _internal_metadata_),
~0u, // no _extensions_
~0u, // no _oneof_case_
~0u, // no _weak_field_map_
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::VersionDef, producer_),
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::VersionDef, min_consumer_),
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(::opencv_tensorflow::VersionDef, bad_consumers_),
};
static const ::google::protobuf::internal::MigrationSchema schemas[] GOOGLE_PROTOBUF_ATTRIBUTE_SECTION_VARIABLE(protodesc_cold) = {
{ 0, -1, sizeof(::opencv_tensorflow::VersionDef)},
};
static ::google::protobuf::Message const * const file_default_instances[] = {
reinterpret_cast<const ::google::protobuf::Message*>(&::opencv_tensorflow::_VersionDef_default_instance_),
};
void protobuf_AssignDescriptors() {
AddDescriptors();
::google::protobuf::MessageFactory* factory = NULL;
AssignDescriptors(
"versions.proto", schemas, file_default_instances, TableStruct::offsets, factory,
file_level_metadata, NULL, NULL);
}
void protobuf_AssignDescriptorsOnce() {
static GOOGLE_PROTOBUF_DECLARE_ONCE(once);
::google::protobuf::GoogleOnceInit(&once, &protobuf_AssignDescriptors);
}
void protobuf_RegisterTypes(const ::std::string&) GOOGLE_PROTOBUF_ATTRIBUTE_COLD;
void protobuf_RegisterTypes(const ::std::string&) {
protobuf_AssignDescriptorsOnce();
::google::protobuf::internal::RegisterAllTypes(file_level_metadata, 1);
}
void AddDescriptorsImpl() {
InitDefaults();
static const char descriptor[] GOOGLE_PROTOBUF_ATTRIBUTE_SECTION_VARIABLE(protodesc_cold) = {
"\n\016versions.proto\022\021opencv_tensorflow\"K\n\nV"
"ersionDef\022\020\n\010producer\030\001 \001(\005\022\024\n\014min_consu"
"mer\030\002 \001(\005\022\025\n\rbad_consumers\030\003 \003(\005B/\n\030org."
"tensorflow.frameworkB\016VersionsProtosP\001\370\001"
"\001b\006proto3"
};
::google::protobuf::DescriptorPool::InternalAddGeneratedFile(
descriptor, 169);
::google::protobuf::MessageFactory::InternalRegisterGeneratedFile(
"versions.proto", &protobuf_RegisterTypes);
}
void AddDescriptors() {
static GOOGLE_PROTOBUF_DECLARE_ONCE(once);
::google::protobuf::GoogleOnceInit(&once, &AddDescriptorsImpl);
}
// Force AddDescriptors() to be called at dynamic initialization time.
struct StaticDescriptorInitializer {
StaticDescriptorInitializer() {
AddDescriptors();
}
} static_descriptor_initializer;
} // namespace protobuf_versions_2eproto
namespace opencv_tensorflow {
// ===================================================================
void VersionDef::InitAsDefaultInstance() {
}
#if !defined(_MSC_VER) || _MSC_VER >= 1900
const int VersionDef::kProducerFieldNumber;
const int VersionDef::kMinConsumerFieldNumber;
const int VersionDef::kBadConsumersFieldNumber;
#endif // !defined(_MSC_VER) || _MSC_VER >= 1900
VersionDef::VersionDef()
: ::google::protobuf::Message(), _internal_metadata_(NULL) {
if (GOOGLE_PREDICT_TRUE(this != internal_default_instance())) {
::protobuf_versions_2eproto::InitDefaultsVersionDef();
}
SharedCtor();
// @@protoc_insertion_point(constructor:opencv_tensorflow.VersionDef)
}
VersionDef::VersionDef(::google::protobuf::Arena* arena)
: ::google::protobuf::Message(),
_internal_metadata_(arena),
bad_consumers_(arena) {
::protobuf_versions_2eproto::InitDefaultsVersionDef();
SharedCtor();
RegisterArenaDtor(arena);
// @@protoc_insertion_point(arena_constructor:opencv_tensorflow.VersionDef)
}
VersionDef::VersionDef(const VersionDef& from)
: ::google::protobuf::Message(),
_internal_metadata_(NULL),
bad_consumers_(from.bad_consumers_),
_cached_size_(0) {
_internal_metadata_.MergeFrom(from._internal_metadata_);
::memcpy(&producer_, &from.producer_,
static_cast<size_t>(reinterpret_cast<char*>(&min_consumer_) -
reinterpret_cast<char*>(&producer_)) + sizeof(min_consumer_));
// @@protoc_insertion_point(copy_constructor:opencv_tensorflow.VersionDef)
}
void VersionDef::SharedCtor() {
::memset(&producer_, 0, static_cast<size_t>(
reinterpret_cast<char*>(&min_consumer_) -
reinterpret_cast<char*>(&producer_)) + sizeof(min_consumer_));
_cached_size_ = 0;
}
VersionDef::~VersionDef() {
// @@protoc_insertion_point(destructor:opencv_tensorflow.VersionDef)
SharedDtor();
}
void VersionDef::SharedDtor() {
GOOGLE_DCHECK(GetArenaNoVirtual() == NULL);
}
void VersionDef::ArenaDtor(void* object) {
VersionDef* _this = reinterpret_cast< VersionDef* >(object);
(void)_this;
}
void VersionDef::RegisterArenaDtor(::google::protobuf::Arena* arena) {
}
void VersionDef::SetCachedSize(int size) const {
GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN();
_cached_size_ = size;
GOOGLE_SAFE_CONCURRENT_WRITES_END();
}
const ::google::protobuf::Descriptor* VersionDef::descriptor() {
::protobuf_versions_2eproto::protobuf_AssignDescriptorsOnce();
return ::protobuf_versions_2eproto::file_level_metadata[kIndexInFileMessages].descriptor;
}
const VersionDef& VersionDef::default_instance() {
::protobuf_versions_2eproto::InitDefaultsVersionDef();
return *internal_default_instance();
}
VersionDef* VersionDef::New(::google::protobuf::Arena* arena) const {
return ::google::protobuf::Arena::CreateMessage<VersionDef>(arena);
}
void VersionDef::Clear() {
// @@protoc_insertion_point(message_clear_start:opencv_tensorflow.VersionDef)
::google::protobuf::uint32 cached_has_bits = 0;
// Prevent compiler warnings about cached_has_bits being unused
(void) cached_has_bits;
bad_consumers_.Clear();
::memset(&producer_, 0, static_cast<size_t>(
reinterpret_cast<char*>(&min_consumer_) -
reinterpret_cast<char*>(&producer_)) + sizeof(min_consumer_));
_internal_metadata_.Clear();
}
bool VersionDef::MergePartialFromCodedStream(
::google::protobuf::io::CodedInputStream* input) {
#define DO_(EXPRESSION) if (!GOOGLE_PREDICT_TRUE(EXPRESSION)) goto failure
::google::protobuf::uint32 tag;
// @@protoc_insertion_point(parse_start:opencv_tensorflow.VersionDef)
for (;;) {
::std::pair< ::google::protobuf::uint32, bool> p = input->ReadTagWithCutoffNoLastTag(127u);
tag = p.first;
if (!p.second) goto handle_unusual;
switch (::google::protobuf::internal::WireFormatLite::GetTagFieldNumber(tag)) {
// int32 producer = 1;
case 1: {
if (static_cast< ::google::protobuf::uint8>(tag) ==
static_cast< ::google::protobuf::uint8>(8u /* 8 & 0xFF */)) {
DO_((::google::protobuf::internal::WireFormatLite::ReadPrimitive<
::google::protobuf::int32, ::google::protobuf::internal::WireFormatLite::TYPE_INT32>(
input, &producer_)));
} else {
goto handle_unusual;
}
break;
}
// int32 min_consumer = 2;
case 2: {
if (static_cast< ::google::protobuf::uint8>(tag) ==
static_cast< ::google::protobuf::uint8>(16u /* 16 & 0xFF */)) {
DO_((::google::protobuf::internal::WireFormatLite::ReadPrimitive<
::google::protobuf::int32, ::google::protobuf::internal::WireFormatLite::TYPE_INT32>(
input, &min_consumer_)));
} else {
goto handle_unusual;
}
break;
}
// repeated int32 bad_consumers = 3;
case 3: {
if (static_cast< ::google::protobuf::uint8>(tag) ==
static_cast< ::google::protobuf::uint8>(26u /* 26 & 0xFF */)) {
DO_((::google::protobuf::internal::WireFormatLite::ReadPackedPrimitive<
::google::protobuf::int32, ::google::protobuf::internal::WireFormatLite::TYPE_INT32>(
input, this->mutable_bad_consumers())));
} else if (
static_cast< ::google::protobuf::uint8>(tag) ==
static_cast< ::google::protobuf::uint8>(24u /* 24 & 0xFF */)) {
DO_((::google::protobuf::internal::WireFormatLite::ReadRepeatedPrimitiveNoInline<
::google::protobuf::int32, ::google::protobuf::internal::WireFormatLite::TYPE_INT32>(
1, 26u, input, this->mutable_bad_consumers())));
} else {
goto handle_unusual;
}
break;
}
default: {
handle_unusual:
if (tag == 0) {
goto success;
}
DO_(::google::protobuf::internal::WireFormat::SkipField(
input, tag, _internal_metadata_.mutable_unknown_fields()));
break;
}
}
}
success:
// @@protoc_insertion_point(parse_success:opencv_tensorflow.VersionDef)
return true;
failure:
// @@protoc_insertion_point(parse_failure:opencv_tensorflow.VersionDef)
return false;
#undef DO_
}
void VersionDef::SerializeWithCachedSizes(
::google::protobuf::io::CodedOutputStream* output) const {
// @@protoc_insertion_point(serialize_start:opencv_tensorflow.VersionDef)
::google::protobuf::uint32 cached_has_bits = 0;
(void) cached_has_bits;
// int32 producer = 1;
if (this->producer() != 0) {
::google::protobuf::internal::WireFormatLite::WriteInt32(1, this->producer(), output);
}
// int32 min_consumer = 2;
if (this->min_consumer() != 0) {
::google::protobuf::internal::WireFormatLite::WriteInt32(2, this->min_consumer(), output);
}
// repeated int32 bad_consumers = 3;
if (this->bad_consumers_size() > 0) {
::google::protobuf::internal::WireFormatLite::WriteTag(3, ::google::protobuf::internal::WireFormatLite::WIRETYPE_LENGTH_DELIMITED, output);
output->WriteVarint32(static_cast< ::google::protobuf::uint32>(
_bad_consumers_cached_byte_size_));
}
for (int i = 0, n = this->bad_consumers_size(); i < n; i++) {
::google::protobuf::internal::WireFormatLite::WriteInt32NoTag(
this->bad_consumers(i), output);
}
if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) {
::google::protobuf::internal::WireFormat::SerializeUnknownFields(
(::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()), output);
}
// @@protoc_insertion_point(serialize_end:opencv_tensorflow.VersionDef)
}
::google::protobuf::uint8* VersionDef::InternalSerializeWithCachedSizesToArray(
bool deterministic, ::google::protobuf::uint8* target) const {
(void)deterministic; // Unused
// @@protoc_insertion_point(serialize_to_array_start:opencv_tensorflow.VersionDef)
::google::protobuf::uint32 cached_has_bits = 0;
(void) cached_has_bits;
// int32 producer = 1;
if (this->producer() != 0) {
target = ::google::protobuf::internal::WireFormatLite::WriteInt32ToArray(1, this->producer(), target);
}
// int32 min_consumer = 2;
if (this->min_consumer() != 0) {
target = ::google::protobuf::internal::WireFormatLite::WriteInt32ToArray(2, this->min_consumer(), target);
}
// repeated int32 bad_consumers = 3;
if (this->bad_consumers_size() > 0) {
target = ::google::protobuf::internal::WireFormatLite::WriteTagToArray(
3,
::google::protobuf::internal::WireFormatLite::WIRETYPE_LENGTH_DELIMITED,
target);
target = ::google::protobuf::io::CodedOutputStream::WriteVarint32ToArray(
static_cast< ::google::protobuf::int32>(
_bad_consumers_cached_byte_size_), target);
target = ::google::protobuf::internal::WireFormatLite::
WriteInt32NoTagToArray(this->bad_consumers_, target);
}
if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) {
target = ::google::protobuf::internal::WireFormat::SerializeUnknownFieldsToArray(
(::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()), target);
}
// @@protoc_insertion_point(serialize_to_array_end:opencv_tensorflow.VersionDef)
return target;
}
size_t VersionDef::ByteSizeLong() const {
// @@protoc_insertion_point(message_byte_size_start:opencv_tensorflow.VersionDef)
size_t total_size = 0;
if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) {
total_size +=
::google::protobuf::internal::WireFormat::ComputeUnknownFieldsSize(
(::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()));
}
// repeated int32 bad_consumers = 3;
{
size_t data_size = ::google::protobuf::internal::WireFormatLite::
Int32Size(this->bad_consumers_);
if (data_size > 0) {
total_size += 1 +
::google::protobuf::internal::WireFormatLite::Int32Size(
static_cast< ::google::protobuf::int32>(data_size));
}
int cached_size = ::google::protobuf::internal::ToCachedSize(data_size);
GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN();
_bad_consumers_cached_byte_size_ = cached_size;
GOOGLE_SAFE_CONCURRENT_WRITES_END();
total_size += data_size;
}
// int32 producer = 1;
if (this->producer() != 0) {
total_size += 1 +
::google::protobuf::internal::WireFormatLite::Int32Size(
this->producer());
}
// int32 min_consumer = 2;
if (this->min_consumer() != 0) {
total_size += 1 +
::google::protobuf::internal::WireFormatLite::Int32Size(
this->min_consumer());
}
int cached_size = ::google::protobuf::internal::ToCachedSize(total_size);
GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN();
_cached_size_ = cached_size;
GOOGLE_SAFE_CONCURRENT_WRITES_END();
return total_size;
}
void VersionDef::MergeFrom(const ::google::protobuf::Message& from) {
// @@protoc_insertion_point(generalized_merge_from_start:opencv_tensorflow.VersionDef)
GOOGLE_DCHECK_NE(&from, this);
const VersionDef* source =
::google::protobuf::internal::DynamicCastToGenerated<const VersionDef>(
&from);
if (source == NULL) {
// @@protoc_insertion_point(generalized_merge_from_cast_fail:opencv_tensorflow.VersionDef)
::google::protobuf::internal::ReflectionOps::Merge(from, this);
} else {
// @@protoc_insertion_point(generalized_merge_from_cast_success:opencv_tensorflow.VersionDef)
MergeFrom(*source);
}
}
void VersionDef::MergeFrom(const VersionDef& from) {
// @@protoc_insertion_point(class_specific_merge_from_start:opencv_tensorflow.VersionDef)
GOOGLE_DCHECK_NE(&from, this);
_internal_metadata_.MergeFrom(from._internal_metadata_);
::google::protobuf::uint32 cached_has_bits = 0;
(void) cached_has_bits;
bad_consumers_.MergeFrom(from.bad_consumers_);
if (from.producer() != 0) {
set_producer(from.producer());
}
if (from.min_consumer() != 0) {
set_min_consumer(from.min_consumer());
}
}
void VersionDef::CopyFrom(const ::google::protobuf::Message& from) {
// @@protoc_insertion_point(generalized_copy_from_start:opencv_tensorflow.VersionDef)
if (&from == this) return;
Clear();
MergeFrom(from);
}
void VersionDef::CopyFrom(const VersionDef& from) {
// @@protoc_insertion_point(class_specific_copy_from_start:opencv_tensorflow.VersionDef)
if (&from == this) return;
Clear();
MergeFrom(from);
}
bool VersionDef::IsInitialized() const {
return true;
}
void VersionDef::Swap(VersionDef* other) {
if (other == this) return;
if (GetArenaNoVirtual() == other->GetArenaNoVirtual()) {
InternalSwap(other);
} else {
VersionDef* temp = New(GetArenaNoVirtual());
temp->MergeFrom(*other);
other->CopyFrom(*this);
InternalSwap(temp);
if (GetArenaNoVirtual() == NULL) {
delete temp;
}
}
}
void VersionDef::UnsafeArenaSwap(VersionDef* other) {
if (other == this) return;
GOOGLE_DCHECK(GetArenaNoVirtual() == other->GetArenaNoVirtual());
InternalSwap(other);
}
void VersionDef::InternalSwap(VersionDef* other) {
using std::swap;
bad_consumers_.InternalSwap(&other->bad_consumers_);
swap(producer_, other->producer_);
swap(min_consumer_, other->min_consumer_);
_internal_metadata_.Swap(&other->_internal_metadata_);
swap(_cached_size_, other->_cached_size_);
}
::google::protobuf::Metadata VersionDef::GetMetadata() const {
protobuf_versions_2eproto::protobuf_AssignDescriptorsOnce();
return ::protobuf_versions_2eproto::file_level_metadata[kIndexInFileMessages];
}
// @@protoc_insertion_point(namespace_scope)
} // namespace opencv_tensorflow
// @@protoc_insertion_point(global_scope)

View File

@@ -0,0 +1,272 @@
// Generated by the protocol buffer compiler. DO NOT EDIT!
// source: versions.proto
#ifndef PROTOBUF_versions_2eproto__INCLUDED
#define PROTOBUF_versions_2eproto__INCLUDED
#include <string>
#include <google/protobuf/stubs/common.h>
#if GOOGLE_PROTOBUF_VERSION < 3005000
#error This file was generated by a newer version of protoc which is
#error incompatible with your Protocol Buffer headers. Please update
#error your headers.
#endif
#if 3005001 < GOOGLE_PROTOBUF_MIN_PROTOC_VERSION
#error This file was generated by an older version of protoc which is
#error incompatible with your Protocol Buffer headers. Please
#error regenerate this file with a newer version of protoc.
#endif
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/arena.h>
#include <google/protobuf/arenastring.h>
#include <google/protobuf/generated_message_table_driven.h>
#include <google/protobuf/generated_message_util.h>
#include <google/protobuf/metadata.h>
#include <google/protobuf/message.h>
#include <google/protobuf/repeated_field.h> // IWYU pragma: export
#include <google/protobuf/extension_set.h> // IWYU pragma: export
#include <google/protobuf/unknown_field_set.h>
// @@protoc_insertion_point(includes)
namespace protobuf_versions_2eproto {
// Internal implementation detail -- do not use these members.
struct TableStruct {
static const ::google::protobuf::internal::ParseTableField entries[];
static const ::google::protobuf::internal::AuxillaryParseTableField aux[];
static const ::google::protobuf::internal::ParseTable schema[1];
static const ::google::protobuf::internal::FieldMetadata field_metadata[];
static const ::google::protobuf::internal::SerializationTable serialization_table[];
static const ::google::protobuf::uint32 offsets[];
};
void AddDescriptors();
void InitDefaultsVersionDefImpl();
void InitDefaultsVersionDef();
inline void InitDefaults() {
InitDefaultsVersionDef();
}
} // namespace protobuf_versions_2eproto
namespace opencv_tensorflow {
class VersionDef;
class VersionDefDefaultTypeInternal;
extern VersionDefDefaultTypeInternal _VersionDef_default_instance_;
} // namespace opencv_tensorflow
namespace opencv_tensorflow {
// ===================================================================
class VersionDef : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:opencv_tensorflow.VersionDef) */ {
public:
VersionDef();
virtual ~VersionDef();
VersionDef(const VersionDef& from);
inline VersionDef& operator=(const VersionDef& from) {
CopyFrom(from);
return *this;
}
#if LANG_CXX11
VersionDef(VersionDef&& from) noexcept
: VersionDef() {
*this = ::std::move(from);
}
inline VersionDef& operator=(VersionDef&& from) noexcept {
if (GetArenaNoVirtual() == from.GetArenaNoVirtual()) {
if (this != &from) InternalSwap(&from);
} else {
CopyFrom(from);
}
return *this;
}
#endif
inline ::google::protobuf::Arena* GetArena() const PROTOBUF_FINAL {
return GetArenaNoVirtual();
}
inline void* GetMaybeArenaPointer() const PROTOBUF_FINAL {
return MaybeArenaPtr();
}
static const ::google::protobuf::Descriptor* descriptor();
static const VersionDef& default_instance();
static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY
static inline const VersionDef* internal_default_instance() {
return reinterpret_cast<const VersionDef*>(
&_VersionDef_default_instance_);
}
static PROTOBUF_CONSTEXPR int const kIndexInFileMessages =
0;
void UnsafeArenaSwap(VersionDef* other);
void Swap(VersionDef* other);
friend void swap(VersionDef& a, VersionDef& b) {
a.Swap(&b);
}
// implements Message ----------------------------------------------
inline VersionDef* New() const PROTOBUF_FINAL { return New(NULL); }
VersionDef* New(::google::protobuf::Arena* arena) const PROTOBUF_FINAL;
void CopyFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void MergeFrom(const ::google::protobuf::Message& from) PROTOBUF_FINAL;
void CopyFrom(const VersionDef& from);
void MergeFrom(const VersionDef& from);
void Clear() PROTOBUF_FINAL;
bool IsInitialized() const PROTOBUF_FINAL;
size_t ByteSizeLong() const PROTOBUF_FINAL;
bool MergePartialFromCodedStream(
::google::protobuf::io::CodedInputStream* input) PROTOBUF_FINAL;
void SerializeWithCachedSizes(
::google::protobuf::io::CodedOutputStream* output) const PROTOBUF_FINAL;
::google::protobuf::uint8* InternalSerializeWithCachedSizesToArray(
bool deterministic, ::google::protobuf::uint8* target) const PROTOBUF_FINAL;
int GetCachedSize() const PROTOBUF_FINAL { return _cached_size_; }
private:
void SharedCtor();
void SharedDtor();
void SetCachedSize(int size) const PROTOBUF_FINAL;
void InternalSwap(VersionDef* other);
protected:
explicit VersionDef(::google::protobuf::Arena* arena);
private:
static void ArenaDtor(void* object);
inline void RegisterArenaDtor(::google::protobuf::Arena* arena);
private:
inline ::google::protobuf::Arena* GetArenaNoVirtual() const {
return _internal_metadata_.arena();
}
inline void* MaybeArenaPtr() const {
return _internal_metadata_.raw_arena_ptr();
}
public:
::google::protobuf::Metadata GetMetadata() const PROTOBUF_FINAL;
// nested types ----------------------------------------------------
// accessors -------------------------------------------------------
// repeated int32 bad_consumers = 3;
int bad_consumers_size() const;
void clear_bad_consumers();
static const int kBadConsumersFieldNumber = 3;
::google::protobuf::int32 bad_consumers(int index) const;
void set_bad_consumers(int index, ::google::protobuf::int32 value);
void add_bad_consumers(::google::protobuf::int32 value);
const ::google::protobuf::RepeatedField< ::google::protobuf::int32 >&
bad_consumers() const;
::google::protobuf::RepeatedField< ::google::protobuf::int32 >*
mutable_bad_consumers();
// int32 producer = 1;
void clear_producer();
static const int kProducerFieldNumber = 1;
::google::protobuf::int32 producer() const;
void set_producer(::google::protobuf::int32 value);
// int32 min_consumer = 2;
void clear_min_consumer();
static const int kMinConsumerFieldNumber = 2;
::google::protobuf::int32 min_consumer() const;
void set_min_consumer(::google::protobuf::int32 value);
// @@protoc_insertion_point(class_scope:opencv_tensorflow.VersionDef)
private:
::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_;
template <typename T> friend class ::google::protobuf::Arena::InternalHelper;
typedef void InternalArenaConstructable_;
typedef void DestructorSkippable_;
::google::protobuf::RepeatedField< ::google::protobuf::int32 > bad_consumers_;
mutable int _bad_consumers_cached_byte_size_;
::google::protobuf::int32 producer_;
::google::protobuf::int32 min_consumer_;
mutable int _cached_size_;
friend struct ::protobuf_versions_2eproto::TableStruct;
friend void ::protobuf_versions_2eproto::InitDefaultsVersionDefImpl();
};
// ===================================================================
// ===================================================================
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif // __GNUC__
// VersionDef
// int32 producer = 1;
inline void VersionDef::clear_producer() {
producer_ = 0;
}
inline ::google::protobuf::int32 VersionDef::producer() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.VersionDef.producer)
return producer_;
}
inline void VersionDef::set_producer(::google::protobuf::int32 value) {
producer_ = value;
// @@protoc_insertion_point(field_set:opencv_tensorflow.VersionDef.producer)
}
// int32 min_consumer = 2;
inline void VersionDef::clear_min_consumer() {
min_consumer_ = 0;
}
inline ::google::protobuf::int32 VersionDef::min_consumer() const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.VersionDef.min_consumer)
return min_consumer_;
}
inline void VersionDef::set_min_consumer(::google::protobuf::int32 value) {
min_consumer_ = value;
// @@protoc_insertion_point(field_set:opencv_tensorflow.VersionDef.min_consumer)
}
// repeated int32 bad_consumers = 3;
inline int VersionDef::bad_consumers_size() const {
return bad_consumers_.size();
}
inline void VersionDef::clear_bad_consumers() {
bad_consumers_.Clear();
}
inline ::google::protobuf::int32 VersionDef::bad_consumers(int index) const {
// @@protoc_insertion_point(field_get:opencv_tensorflow.VersionDef.bad_consumers)
return bad_consumers_.Get(index);
}
inline void VersionDef::set_bad_consumers(int index, ::google::protobuf::int32 value) {
bad_consumers_.Set(index, value);
// @@protoc_insertion_point(field_set:opencv_tensorflow.VersionDef.bad_consumers)
}
inline void VersionDef::add_bad_consumers(::google::protobuf::int32 value) {
bad_consumers_.Add(value);
// @@protoc_insertion_point(field_add:opencv_tensorflow.VersionDef.bad_consumers)
}
inline const ::google::protobuf::RepeatedField< ::google::protobuf::int32 >&
VersionDef::bad_consumers() const {
// @@protoc_insertion_point(field_list:opencv_tensorflow.VersionDef.bad_consumers)
return bad_consumers_;
}
inline ::google::protobuf::RepeatedField< ::google::protobuf::int32 >*
VersionDef::mutable_bad_consumers() {
// @@protoc_insertion_point(field_mutable_list:opencv_tensorflow.VersionDef.bad_consumers)
return &bad_consumers_;
}
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif // __GNUC__
// @@protoc_insertion_point(namespace_scope)
} // namespace opencv_tensorflow
// @@protoc_insertion_point(global_scope)
#endif // PROTOBUF_versions_2eproto__INCLUDED
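A minimal sketch of the VersionDef message declared above (hypothetical example code, not part of this commit; it assumes the header is included as "versions.pb.h" and uses only the accessors shown plus the standard protobuf Message serialization calls):

// Illustrative only: fill, serialize and re-parse a VersionDef.
#include "versions.pb.h"
#include <iostream>
#include <string>

int main() {
    opencv_tensorflow::VersionDef v;
    v.set_producer(27);          // graph written by producer version 27
    v.set_min_consumer(0);       // any consumer may read it
    v.add_bad_consumers(12);     // except consumer version 12
    std::string bytes;
    v.SerializeToString(&bytes); // standard ::google::protobuf::Message API
    opencv_tensorflow::VersionDef copy;
    copy.ParseFromString(bytes);
    std::cout << copy.producer() << " " << copy.bad_consumers(0) << std::endl;  // "27 12"
    return 0;
}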

View File

@@ -0,0 +1,111 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
// It is recommended to run this performance test via
// ./bin/opencv_perf_dnn 2> /dev/null | grep "PERFSTAT" -A 3
// because the full output also includes Caffe's logs.
//
// Note: Be sure that the intended version of Caffe was linked.
// Note: These tests affect Halide performance. Comment them out if you
// want to run the Halide ones.
//
// How to build Intel-Caffe with MKLDNN backend
// ============================================
// mkdir build && cd build
// cmake -DCMAKE_BUILD_TYPE=Release \
// -DUSE_MKLDNN_AS_DEFAULT_ENGINE=ON \
// -DUSE_MKL2017_AS_DEFAULT_ENGINE=OFF \
// -DCPU_ONLY=ON \
// -DCMAKE_INSTALL_PREFIX=/usr/local .. && make -j8
// sudo make install
//
// In case of problems with cublas_v2.h at include/caffe/util/device_alternate.hpp: add line
// #define CPU_ONLY
// before the first line
// #ifdef CPU_ONLY // CPU-only Caffe.
#if defined(HAVE_CAFFE) || defined(HAVE_CLCAFFE)
#include "perf_precomp.hpp"
#include <iostream>
#include <caffe/caffe.hpp>
namespace opencv_test {
static caffe::Net<float>* initNet(std::string proto, std::string weights)
{
proto = findDataFile(proto);
weights = findDataFile(weights, false);
#ifdef HAVE_CLCAFFE
caffe::Caffe::set_mode(caffe::Caffe::GPU);
caffe::Caffe::SetDevice(0);
caffe::Net<float>* net =
new caffe::Net<float>(proto, caffe::TEST, caffe::Caffe::GetDefaultDevice());
#else
caffe::Caffe::set_mode(caffe::Caffe::CPU);
caffe::Net<float>* net = new caffe::Net<float>(proto, caffe::TEST);
#endif
net->CopyTrainedLayersFrom(weights);
caffe::Blob<float>* input = net->input_blobs()[0];
CV_Assert(input->num() == 1);
CV_Assert(input->channels() == 3);
Mat inputMat(input->height(), input->width(), CV_32FC3, (char*)input->cpu_data());
randu(inputMat, 0.0f, 1.0f);
net->Forward();
return net;
}
PERF_TEST(AlexNet_caffe, CaffePerfTest)
{
caffe::Net<float>* net = initNet("dnn/bvlc_alexnet.prototxt",
"dnn/bvlc_alexnet.caffemodel");
TEST_CYCLE() net->Forward();
SANITY_CHECK_NOTHING();
}
PERF_TEST(GoogLeNet_caffe, CaffePerfTest)
{
caffe::Net<float>* net = initNet("dnn/bvlc_googlenet.prototxt",
"dnn/bvlc_googlenet.caffemodel");
TEST_CYCLE() net->Forward();
SANITY_CHECK_NOTHING();
}
PERF_TEST(ResNet50_caffe, CaffePerfTest)
{
caffe::Net<float>* net = initNet("dnn/ResNet-50-deploy.prototxt",
"dnn/ResNet-50-model.caffemodel");
TEST_CYCLE() net->Forward();
SANITY_CHECK_NOTHING();
}
PERF_TEST(SqueezeNet_v1_1_caffe, CaffePerfTest)
{
caffe::Net<float>* net = initNet("dnn/squeezenet_v1.1.prototxt",
"dnn/squeezenet_v1.1.caffemodel");
TEST_CYCLE() net->Forward();
SANITY_CHECK_NOTHING();
}
PERF_TEST(MobileNet_SSD, CaffePerfTest)
{
caffe::Net<float>* net = initNet("dnn/MobileNetSSD_deploy.prototxt",
"dnn/MobileNetSSD_deploy.caffemodel");
TEST_CYCLE() net->Forward();
SANITY_CHECK_NOTHING();
}
} // namespace
#endif // HAVE_CAFFE

View File

@@ -0,0 +1,6 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "perf_precomp.hpp"
#include "../test/test_common.impl.hpp" // shared with accuracy tests

View File

@@ -0,0 +1,894 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "perf_precomp.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace opencv_test {
// Flops_Kernel_Input_OutCN_Group_Stride_Pad_Dilation_PadAdjust_PadMode_Bias
struct TestSize_ {
int width, height;
operator Size() const { return Size(width, height); }
};
struct ConvParam_t {
struct TestSize_ kernel;
struct BlobShape { int dims[4]; } shapeIn;
int outCN;
int groups;
struct TestSize_ stride;
struct TestSize_ dilation;
struct TestSize_ pad;
struct TestSize_ padAdjust;
const char* padMode;
bool hasBias;
double declared_flops;
};
// Details: #12142
// Last update: 2021-09
static const ConvParam_t testConvolutionConfigs[] = {
/* GFLOPS 3.398 x 20 = 67.956 */ {{7, 7}, {{1, 128, 46, 46}}, 128, 1, {1, 1}, {1, 1}, {3, 3}, {0, 0}, "", true, 3397788160.},
/* GFLOPS 16.987 x 3 = 50.962 */ {{5, 5}, {{1, 1152, 16, 16}}, 1152, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 16987226112.},
/* GFLOPS 23.122 x 2 = 46.244 */ {{5, 5}, {{1, 672, 32, 32}}, 672, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 23121788928.},
/* GFLOPS 9.987 x 3 = 29.960 */ {{3, 3}, {{1, 256, 92, 92}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9986707456.},
/* GFLOPS 1.595 x 16 = 25.524 */ {{3, 3}, {{1, 256, 26, 26}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595230208.},
/* GFLOPS 4.566 x 5 = 22.828 */ {{7, 7}, {{1, 172, 46, 46}}, 128, 1, {1, 1}, {1, 1}, {3, 3}, {0, 0}, "", true, 4565684736.},
/* GFLOPS 1.596 x 14 = 22.338 */ {{3, 3}, {{1, 128, 52, 52}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595576320.},
/* GFLOPS 1.595 x 12 = 19.141 */ {{3, 3}, {{1, 512, 13, 13}}, 1024, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595057152.},
/* GFLOPS 6.814 x 2 = 13.629 */ {{3, 3}, {{1, 512, 38, 38}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 6814386176.},
/* GFLOPS 6.637 x 2 = 13.274 */ {{3, 3}, {{1, 256, 75, 75}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 6636960000.},
/* GFLOPS 11.797 x 1 = 11.797 */ {{5, 5}, {{1, 240, 64, 64}}, 240, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 11797463040.},
/* GFLOPS 11.797 x 1 = 11.797 */ {{5, 5}, {{1, 480, 32, 32}}, 480, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 11796971520.},
/* GFLOPS 10.701 x 1 = 10.701 */ {{3, 3}, {{1, 512, 38, 38}}, 804, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 10700715792.},
/* GFLOPS 10.087 x 1 = 10.087 */ {{3, 3}, {{1, 576, 38, 50}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 10086963200.},
/* GFLOPS 9.993 x 1 = 9.993 */ {{3, 3}, {{1, 64, 368, 368}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9993207808.},
/* GFLOPS 9.989 x 1 = 9.989 */ {{3, 3}, {{1, 128, 184, 184}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9988874240.},
/* GFLOPS 9.986 x 1 = 9.986 */ {{3, 3}, {{1, 512, 46, 46}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9985624064.},
/* GFLOPS 1.704 x 5 = 8.518 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 512, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1703596544.},
/* GFLOPS 1.704 x 5 = 8.518 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1703596544.},
/* GFLOPS 4.247 x 2 = 8.494 */ {{3, 3}, {{1, 480, 32, 32}}, 480, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 4247224320.},
/* GFLOPS 8.025 x 1 = 8.025 */ {{3, 3}, {{1, 1024, 19, 19}}, 1206, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 8025101478.},
/* GFLOPS 0.798 x 9 = 7.180 */ {{3, 3}, {{1, 128, 52, 52}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 797788160.},
/* GFLOPS 0.798 x 9 = 7.179 */ {{3, 3}, {{1, 256, 26, 26}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 797615104.},
/* GFLOPS 6.641 x 1 = 6.641 */ {{3, 3}, {{1, 64, 300, 300}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 6641280000.},
/* GFLOPS 6.641 x 1 = 6.641 */ {{3, 3}, {{1, 64, 150, 200}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 6641280000.},
/* GFLOPS 6.638 x 1 = 6.638 */ {{3, 3}, {{1, 128, 150, 150}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 6638400000.},
/* GFLOPS 6.118 x 1 = 6.118 */ {{3, 3}, {{1, 144, 128, 128}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 6117654528.},
/* GFLOPS 6.116 x 1 = 6.116 */ {{3, 3}, {{1, 1152, 16, 16}}, 1152, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 6115590144.},
/* GFLOPS 5.780 x 1 = 5.780 */ {{5, 5}, {{1, 672, 32, 32}}, 672, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 5780447232.},
/* GFLOPS 1.704 x 3 = 5.111 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1703596544.},
/* GFLOPS 4.997 x 1 = 4.997 */ {{3, 3}, {{1, 64, 184, 184}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 4996603904.},
/* GFLOPS 4.994 x 1 = 4.994 */ {{3, 3}, {{1, 128, 92, 92}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 4994437120.},
/* GFLOPS 4.993 x 1 = 4.993 */ {{3, 3}, {{1, 256, 46, 46}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 4993353728.},
/* GFLOPS 4.993 x 1 = 4.993 */ {{3, 3}, {{1, 512, 46, 46}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 4992812032.},
/* GFLOPS 1.659 x 3 = 4.977 */ {{3, 3}, {{1, 960, 10, 10}}, 960, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1658976000.},
/* GFLOPS 2.156 x 2 = 4.312 */ {{3, 3}, {{1, 576, 19, 19}}, 576, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 2156088384.},
/* GFLOPS 4.247 x 1 = 4.247 */ {{5, 5}, {{1, 144, 128, 128}}, 144, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 4247322624.},
/* GFLOPS 0.798 x 5 = 3.988 */ {{3, 3}, {{1, 512, 13, 13}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 797528576.},
/* GFLOPS 0.958 x 4 = 3.833 */ {{3, 3}, {{1, 384, 19, 19}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 958307712.},
/* GFLOPS 0.624 x 6 = 3.746 */ {{3, 3}, {{1, 128, 46, 46}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 624304640.},
/* GFLOPS 3.408 x 1 = 3.408 */ {{3, 3}, {{1, 256, 38, 38}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 3407562752.},
/* GFLOPS 3.407 x 1 = 3.407 */ {{3, 3}, {{1, 512, 19, 19}}, 1024, 1, {1, 1}, {6, 6}, {6, 6}, {0, 0}, "", true, 3407193088.},
/* GFLOPS 0.177 x 19 = 3.370 */ {{1, 1}, {{1, 512, 26, 26}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 177382400.},
/* GFLOPS 0.302 x 11 = 3.325 */ {{3, 3}, {{1, 64, 64, 64}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 302252032.},
/* GFLOPS 3.321 x 1 = 3.321 */ {{3, 3}, {{1, 64, 150, 150}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 3320640000.},
/* GFLOPS 0.830 x 4 = 3.321 */ {{3, 3}, {{1, 64, 75, 100}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 830160000.},
/* GFLOPS 3.319 x 1 = 3.319 */ {{3, 3}, {{1, 128, 75, 75}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 3319200000.},
/* GFLOPS 1.598 x 2 = 3.195 */ {{3, 3}, {{1, 32, 416, 416}}, 64, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 1597652992.},
/* GFLOPS 1.598 x 2 = 3.195 */ {{3, 3}, {{1, 32, 208, 208}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1597652992.},
/* GFLOPS 1.596 x 2 = 3.193 */ {{3, 3}, {{1, 64, 208, 208}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 1596268544.},
/* GFLOPS 1.596 x 2 = 3.193 */ {{3, 3}, {{1, 64, 104, 104}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1596268544.},
/* GFLOPS 1.596 x 2 = 3.191 */ {{3, 3}, {{1, 128, 104, 104}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595576320.},
/* GFLOPS 1.595 x 2 = 3.190 */ {{3, 3}, {{1, 256, 52, 52}}, 512, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595230208.},
/* GFLOPS 1.595 x 2 = 3.190 */ {{3, 3}, {{1, 512, 26, 26}}, 1024, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595057152.},
/* GFLOPS 0.178 x 16 = 2.841 */ {{1, 1}, {{1, 256, 52, 52}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 177555456.},
/* GFLOPS 2.719 x 1 = 2.719 */ {{3, 3}, {{1, 96, 256, 256}}, 96, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 2719481856.},
/* GFLOPS 0.177 x 15 = 2.659 */ {{1, 1}, {{1, 1024, 13, 13}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 177295872.},
/* GFLOPS 1.245 x 2 = 2.490 */ {{3, 3}, {{1, 96, 75, 100}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1244880000.},
/* GFLOPS 0.798 x 3 = 2.394 */ {{3, 3}, {{1, 64, 104, 104}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 798134272.},
/* GFLOPS 0.472 x 5 = 2.360 */ {{3, 3}, {{1, 256, 20, 20}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 471961600.},
/* GFLOPS 2.255 x 1 = 2.255 */ {{3, 3}, {{1, 128, 80, 100}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2255285760.},
/* GFLOPS 2.153 x 1 = 2.153 */ {{3, 3}, {{1, 128, 78, 98}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2152611840.},
/* GFLOPS 2.100 x 1 = 2.100 */ {{3, 3}, {{1, 144, 75, 75}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 2100330000.},
/* GFLOPS 2.052 x 1 = 2.052 */ {{3, 3}, {{1, 128, 76, 96}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2052298240.},
/* GFLOPS 1.022 x 2 = 2.044 */ {{3, 3}, {{1, 576, 19, 19}}, 273, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1021896057.},
/* GFLOPS 1.995 x 1 = 1.995 */ {{9, 9}, {{1, 3, 320, 400}}, 32, 1, {1, 1}, {1, 1}, {4, 4}, {0, 0}, "", true, 1994752000.},
/* GFLOPS 1.954 x 1 = 1.954 */ {{3, 3}, {{1, 128, 74, 94}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1954344960.},
/* GFLOPS 0.958 x 2 = 1.917 */ {{3, 3}, {{1, 192, 38, 38}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 958446336.},
/* GFLOPS 1.888 x 1 = 1.888 */ {{3, 3}, {{1, 1024, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1887539200.},
/* GFLOPS 1.888 x 1 = 1.888 */ {{3, 3}, {{1, 1024, 10, 10}}, 1024, 1024, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1887539200.},
/* GFLOPS 1.859 x 1 = 1.859 */ {{3, 3}, {{1, 128, 72, 92}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1858752000.},
/* GFLOPS 1.766 x 1 = 1.766 */ {{3, 3}, {{1, 128, 70, 90}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1765519360.},
/* GFLOPS 1.704 x 1 = 1.704 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 256, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1703781376.},
/* GFLOPS 1.704 x 1 = 1.704 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1703781376.},
/* GFLOPS 1.675 x 1 = 1.675 */ {{3, 3}, {{1, 128, 68, 88}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1674647040.},
/* GFLOPS 1.660 x 1 = 1.660 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 128, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1659600000.},
/* GFLOPS 1.660 x 1 = 1.660 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1659600000.},
/* GFLOPS 1.586 x 1 = 1.586 */ {{3, 3}, {{1, 128, 66, 86}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1586135040.},
/* GFLOPS 1.500 x 1 = 1.500 */ {{3, 3}, {{1, 128, 64, 84}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1499983360.},
/* GFLOPS 1.416 x 1 = 1.416 */ {{3, 3}, {{1, 128, 62, 82}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1416192000.},
/* GFLOPS 0.472 x 3 = 1.416 */ {{3, 3}, {{1, 128, 40, 40}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 472064000.},
/* GFLOPS 0.472 x 3 = 1.416 */ {{3, 3}, {{1, 512, 10, 10}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 471910400.},
/* GFLOPS 0.280 x 5 = 1.402 */ {{1, 1}, {{1, 576, 38, 50}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 280409600.},
/* GFLOPS 0.701 x 2 = 1.401 */ {{3, 3}, {{1, 128, 38, 50}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 700720000.},
/* GFLOPS 0.231 x 6 = 1.388 */ {{3, 3}, {{1, 128, 56, 56}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231311360.},
/* GFLOPS 0.231 x 6 = 1.388 */ {{3, 3}, {{1, 256, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231261184.},
/* GFLOPS 0.210 x 6 = 1.262 */ {{1, 1}, {{1, 576, 38, 50}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 210307200.},
/* GFLOPS 0.420 x 3 = 1.261 */ {{3, 3}, {{1, 96, 38, 50}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 420492800.},
/* GFLOPS 1.261 x 1 = 1.261 */ {{3, 3}, {{1, 192, 38, 50}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1261113600.},
/* GFLOPS 1.258 x 1 = 1.258 */ {{3, 3}, {{1, 1280, 10, 10}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1258038600.},
/* GFLOPS 1.248 x 1 = 1.248 */ {{3, 3}, {{1, 256, 46, 46}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1248338432.},
/* GFLOPS 1.245 x 1 = 1.245 */ {{3, 3}, {{1, 64, 75, 75}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1245240000.},
/* GFLOPS 1.210 x 1 = 1.210 */ {{3, 3}, {{1, 32, 256, 256}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1210056704.},
/* GFLOPS 1.196 x 1 = 1.196 */ {{3, 3}, {{1, 384, 26, 26}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1196336128.},
/* GFLOPS 1.195 x 1 = 1.195 */ {{9, 9}, {{1, 32, 240, 320}}, 3, 1, {1, 1}, {1, 1}, {4, 4}, {0, 0}, "", true, 1194624000.},
/* GFLOPS 1.182 x 1 = 1.182 */ {{3, 3}, {{1, 32, 320, 400}}, 64, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 1181696000.},
/* GFLOPS 1.181 x 1 = 1.181 */ {{3, 3}, {{1, 64, 160, 200}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 1180672000.},
/* GFLOPS 0.561 x 2 = 1.121 */ {{3, 3}, {{1, 128, 38, 50}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 560576000.},
/* GFLOPS 1.112 x 1 = 1.112 */ {{3, 3}, {{1, 512, 10, 10}}, 1206, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1111570200.},
/* GFLOPS 0.357 x 3 = 1.072 */ {{1, 1}, {{1, 64, 208, 208}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 357187584.},
/* GFLOPS 1.062 x 1 = 1.062 */ {{3, 3}, {{1, 240, 64, 64}}, 240, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1061928960.},
/* GFLOPS 0.076 x 14 = 1.058 */ {{3, 3}, {{1, 64, 32, 32}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 75563008.},
/* GFLOPS 1.051 x 1 = 1.051 */ {{3, 3}, {{1, 160, 38, 50}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1050988800.},
/* GFLOPS 0.210 x 5 = 1.051 */ {{1, 1}, {{1, 256, 20, 20}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 210124800.},
/* GFLOPS 0.210 x 5 = 1.049 */ {{1, 1}, {{1, 1024, 20, 20}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 209817600.},
/* GFLOPS 1.006 x 1 = 1.006 */ {{3, 3}, {{1, 1024, 10, 10}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1006441800.},
/* GFLOPS 0.246 x 4 = 0.985 */ {{1, 1}, {{1, 256, 75, 100}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 246240000.},
/* GFLOPS 0.189 x 5 = 0.947 */ {{1, 1}, {{1, 512, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 189452800.},
/* GFLOPS 0.189 x 5 = 0.947 */ {{1, 1}, {{1, 512, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 189452800.},
/* GFLOPS 0.472 x 2 = 0.945 */ {{3, 3}, {{1, 64, 80, 80}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 472268800.},
/* GFLOPS 0.934 x 1 = 0.934 */ {{3, 3}, {{1, 96, 150, 150}}, 96, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 933660000.},
/* GFLOPS 0.231 x 4 = 0.925 */ {{3, 3}, {{1, 128, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231311360.},
/* GFLOPS 0.896 x 1 = 0.896 */ {{5, 5}, {{1, 96, 27, 27}}, 256, 2, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 895981824.},
/* GFLOPS 0.089 x 10 = 0.890 */ {{1, 1}, {{1, 128, 52, 52}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 88950784.},
/* GFLOPS 0.089 x 10 = 0.888 */ {{1, 1}, {{1, 256, 26, 26}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 88777728.},
/* GFLOPS 0.876 x 1 = 0.876 */ {{3, 3}, {{1, 160, 38, 50}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 875824000.},
/* GFLOPS 0.850 x 1 = 0.850 */ {{7, 7}, {{1, 3, 600, 800}}, 24, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 849600000.},
/* GFLOPS 0.841 x 1 = 0.841 */ {{3, 3}, {{1, 128, 38, 50}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 840864000.},
/* GFLOPS 0.415 x 2 = 0.831 */ {{3, 3}, {{1, 32, 150, 150}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 415440000.},
/* GFLOPS 0.757 x 1 = 0.757 */ {{1, 1}, {{1, 1024, 19, 19}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 757441536.},
/* GFLOPS 0.712 x 1 = 0.712 */ {{1, 1}, {{1, 128, 208, 208}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 711606272.},
/* GFLOPS 0.178 x 4 = 0.712 */ {{1, 1}, {{1, 128, 104, 104}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 177901568.},
/* GFLOPS 0.354 x 2 = 0.707 */ {{1, 1}, {{1, 256, 52, 52}}, 255, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 353723760.},
/* GFLOPS 0.351 x 2 = 0.701 */ {{1, 1}, {{1, 576, 38, 50}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 350512000.},
/* GFLOPS 0.701 x 1 = 0.701 */ {{3, 3}, {{1, 128, 75, 100}}, 160, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 700720000.},
/* GFLOPS 0.694 x 1 = 0.694 */ {{3, 3}, {{1, 64, 56, 56}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 694235136.},
/* GFLOPS 0.694 x 1 = 0.694 */ {{3, 3}, {{1, 64, 56, 56}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 694235136.},
/* GFLOPS 0.231 x 3 = 0.694 */ {{3, 3}, {{1, 64, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231411712.},
/* GFLOPS 0.058 x 12 = 0.694 */ {{3, 3}, {{1, 128, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 57827840.},
/* GFLOPS 0.231 x 3 = 0.694 */ {{3, 3}, {{1, 512, 7, 7}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231236096.},
/* GFLOPS 0.160 x 4 = 0.639 */ {{3, 3}, {{1, 64, 38, 38}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 159833472.},
/* GFLOPS 0.211 x 3 = 0.634 */ {{1, 1}, {{1, 64, 80, 80}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 211353600.},
/* GFLOPS 0.211 x 3 = 0.632 */ {{1, 1}, {{1, 128, 40, 40}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 210534400.},
/* GFLOPS 0.210 x 3 = 0.630 */ {{1, 1}, {{1, 512, 40, 40}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 209920000.},
/* GFLOPS 0.210 x 3 = 0.630 */ {{1, 1}, {{1, 512, 10, 10}}, 2048, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 209920000.},
/* GFLOPS 0.103 x 6 = 0.618 */ {{1, 1}, {{1, 256, 14, 14}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102961152.},
/* GFLOPS 0.615 x 1 = 0.615 */ {{1, 1}, {{1, 320, 75, 100}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 615360000.},
/* GFLOPS 0.305 x 2 = 0.609 */ {{3, 3}, {{1, 3, 416, 416}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 304578560.},
/* GFLOPS 0.597 x 1 = 0.597 */ {{3, 3}, {{1, 576, 19, 19}}, 576, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 597254400.},
/* GFLOPS 0.278 x 2 = 0.557 */ {{1, 1}, {{1, 128, 46, 46}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 278431744.},
/* GFLOPS 0.185 x 3 = 0.554 */ {{1, 1}, {{1, 192, 75, 100}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 184800000.},
/* GFLOPS 0.553 x 1 = 0.553 */ {{3, 3}, {{1, 64, 75, 100}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 553440000.},
/* GFLOPS 0.539 x 1 = 0.539 */ {{3, 3}, {{1, 144, 75, 75}}, 144, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 539178048.},
/* GFLOPS 0.103 x 5 = 0.514 */ {{1, 1}, {{1, 1024, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102810624.},
/* GFLOPS 0.491 x 1 = 0.491 */ {{1, 1}, {{1, 576, 38, 50}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 490716800.},
/* GFLOPS 0.483 x 1 = 0.483 */ {{7, 7}, {{1, 3, 320, 320}}, 64, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", false, 483328000.},
/* GFLOPS 0.240 x 2 = 0.479 */ {{3, 3}, {{1, 96, 38, 38}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 239680896.},
/* GFLOPS 0.477 x 1 = 0.477 */ {{3, 3}, {{1, 3, 368, 368}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 476692480.},
/* GFLOPS 0.237 x 2 = 0.474 */ {{7, 7}, {{1, 3, 224, 224}}, 64, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", true, 236830720.},
/* GFLOPS 0.472 x 1 = 0.472 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 512, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 471910400.},
/* GFLOPS 0.472 x 1 = 0.472 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 471910400.},
/* GFLOPS 0.155 x 3 = 0.464 */ {{1, 1}, {{1, 112, 32, 32}}, 672, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 154828800.},
/* GFLOPS 0.114 x 4 = 0.454 */ {{1, 1}, {{1, 192, 16, 16}}, 1152, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 113541120.},
/* GFLOPS 0.449 x 1 = 0.449 */ {{3, 3}, {{1, 384, 13, 13}}, 384, 2, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 448626048.},
/* GFLOPS 0.089 x 5 = 0.443 */ {{1, 1}, {{1, 512, 13, 13}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 88691200.},
/* GFLOPS 0.428 x 1 = 0.428 */ {{1, 1}, {{1, 64, 64, 64}}, 810, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 427991040.},
/* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 128, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 426037760.},
/* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 426037760.},
/* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 128, 38, 38}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 426037760.},
/* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 256, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 425945344.},
/* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 425945344.},
/* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 256, 19, 19}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 425945344.},
/* GFLOPS 0.421 x 1 = 0.421 */ {{1, 1}, {{1, 576, 38, 50}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 420614400.},
/* GFLOPS 0.420 x 1 = 0.420 */ {{1, 1}, {{1, 256, 40, 40}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 420249600.},
/* GFLOPS 0.210 x 2 = 0.420 */ {{1, 1}, {{1, 256, 80, 80}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 210124800.},
/* GFLOPS 0.420 x 1 = 0.420 */ {{1, 1}, {{1, 512, 20, 20}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 419840000.},
/* GFLOPS 0.420 x 1 = 0.420 */ {{1, 1}, {{1, 1024, 10, 10}}, 2048, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 419635200.},
/* GFLOPS 0.210 x 2 = 0.420 */ {{1, 1}, {{1, 2048, 10, 10}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 209766400.},
/* GFLOPS 0.415 x 1 = 0.415 */ {{3, 3}, {{1, 32, 150, 150}}, 32, 32, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 415440000.},
/* GFLOPS 0.415 x 1 = 0.415 */ {{3, 3}, {{1, 64, 150, 150}}, 64, 64, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 415080000.},
/* GFLOPS 0.415 x 1 = 0.415 */ {{3, 3}, {{1, 64, 150, 150}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 415080000.},
/* GFLOPS 0.104 x 4 = 0.414 */ {{1, 1}, {{1, 64, 56, 56}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 103563264.},
/* GFLOPS 0.103 x 4 = 0.413 */ {{1, 1}, {{1, 128, 28, 28}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 103161856.},
/* GFLOPS 0.399 x 1 = 0.399 */ {{3, 3}, {{1, 32, 208, 208}}, 64, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 399413248.},
/* GFLOPS 0.200 x 2 = 0.399 */ {{3, 3}, {{1, 32, 104, 104}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 199706624.},
/* GFLOPS 0.200 x 2 = 0.399 */ {{3, 3}, {{1, 64, 52, 52}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 199533568.},
/* GFLOPS 0.399 x 1 = 0.399 */ {{3, 3}, {{1, 128, 52, 52}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 398894080.},
/* GFLOPS 0.199 x 2 = 0.399 */ {{3, 3}, {{1, 128, 26, 26}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 199447040.},
/* GFLOPS 0.399 x 1 = 0.399 */ {{3, 3}, {{1, 256, 26, 26}}, 512, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 398807552.},
/* GFLOPS 0.399 x 1 = 0.399 */ {{3, 3}, {{1, 256, 13, 13}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 398807552.},
/* GFLOPS 0.376 x 1 = 0.376 */ {{1, 1}, {{1, 24, 300, 400}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 376320000.},
/* GFLOPS 0.179 x 2 = 0.357 */ {{1, 1}, {{1, 64, 208, 208}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 178593792.},
/* GFLOPS 0.089 x 4 = 0.357 */ {{1, 1}, {{1, 64, 104, 104}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 89296896.},
/* GFLOPS 0.356 x 1 = 0.356 */ {{1, 1}, {{1, 128, 104, 104}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 355803136.},
/* GFLOPS 0.355 x 1 = 0.355 */ {{1, 1}, {{1, 256, 52, 52}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 355110912.},
/* GFLOPS 0.355 x 1 = 0.355 */ {{1, 1}, {{1, 512, 26, 26}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 354764800.},
/* GFLOPS 0.355 x 1 = 0.355 */ {{1, 1}, {{1, 1024, 13, 13}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 354591744.},
/* GFLOPS 0.355 x 1 = 0.355 */ {{1, 1}, {{1, 2048, 13, 13}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 354505216.},
/* GFLOPS 0.177 x 2 = 0.353 */ {{1, 1}, {{1, 512, 26, 26}}, 255, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 176689500.},
/* GFLOPS 0.070 x 5 = 0.348 */ {{1, 1}, {{1, 128, 46, 46}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 69607936.},
/* GFLOPS 0.347 x 1 = 0.347 */ {{3, 3}, {{1, 128, 28, 28}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 346967040.},
/* GFLOPS 0.347 x 1 = 0.347 */ {{3, 3}, {{1, 128, 28, 28}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 346967040.},
/* GFLOPS 0.014 x 24 = 0.347 */ {{3, 3}, {{1, 128, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 14456960.},
/* GFLOPS 0.113 x 3 = 0.340 */ {{1, 1}, {{1, 1152, 16, 16}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 113295360.},
/* GFLOPS 0.053 x 6 = 0.320 */ {{1, 1}, {{1, 576, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 53277824.},
/* GFLOPS 0.319 x 1 = 0.319 */ {{3, 3}, {{1, 192, 19, 19}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 319482112.},
/* GFLOPS 0.317 x 1 = 0.317 */ {{3, 3}, {{1, 3, 300, 300}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 316800000.},
/* GFLOPS 0.315 x 1 = 0.315 */ {{3, 3}, {{1, 96, 75, 100}}, 96, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 315369600.},
/* GFLOPS 0.103 x 3 = 0.309 */ {{1, 1}, {{1, 512, 7, 7}}, 2048, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102860800.},
/* GFLOPS 0.103 x 3 = 0.309 */ {{1, 1}, {{1, 512, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102860800.},
/* GFLOPS 0.154 x 2 = 0.309 */ {{1, 1}, {{1, 672, 32, 32}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 154255360.},
/* GFLOPS 0.308 x 1 = 0.308 */ {{1, 1}, {{1, 320, 75, 100}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 307680000.},
/* GFLOPS 0.034 x 9 = 0.304 */ {{1, 1}, {{1, 64, 64, 64}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 33816576.},
/* GFLOPS 0.299 x 1 = 0.299 */ {{3, 3}, {{1, 256, 13, 13}}, 384, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 299105664.},
/* GFLOPS 0.299 x 1 = 0.299 */ {{3, 3}, {{1, 384, 13, 13}}, 256, 2, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 299084032.},
/* GFLOPS 0.017 x 17 = 0.290 */ {{1, 1}, {{1, 32, 32, 64}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 17039360.},
/* GFLOPS 0.017 x 16 = 0.269 */ {{1, 1}, {{1, 128, 32, 64}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 16842752.},
/* GFLOPS 0.133 x 2 = 0.266 */ {{3, 3}, {{1, 128, 19, 19}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 133136800.},
/* GFLOPS 0.266 x 1 = 0.266 */ {{1, 1}, {{1, 384, 52, 52}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 266160128.},
/* GFLOPS 0.266 x 1 = 0.266 */ {{1, 1}, {{1, 768, 26, 26}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 265987072.},
/* GFLOPS 0.038 x 7 = 0.265 */ {{3, 3}, {{1, 16, 64, 128}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 37879808.},
/* GFLOPS 0.019 x 14 = 0.264 */ {{3, 3}, {{1, 64, 16, 16}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 18890752.},
/* GFLOPS 0.262 x 1 = 0.262 */ {{1, 1}, {{1, 2560, 20, 20}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 262195200.},
/* GFLOPS 0.126 x 2 = 0.252 */ {{3, 3}, {{1, 512, 5, 5}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 125812050.},
/* GFLOPS 0.248 x 1 = 0.248 */ {{1, 1}, {{1, 64, 150, 200}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 247680000.},
/* GFLOPS 0.040 x 6 = 0.240 */ {{1, 1}, {{1, 576, 19, 19}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 39958368.},
/* GFLOPS 0.080 x 3 = 0.240 */ {{3, 3}, {{1, 96, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 79893632.},
/* GFLOPS 0.240 x 1 = 0.240 */ {{3, 3}, {{1, 192, 38, 38}}, 192, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 239611584.},
/* GFLOPS 0.240 x 1 = 0.240 */ {{3, 3}, {{1, 192, 19, 19}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 239611584.},
/* GFLOPS 0.079 x 3 = 0.237 */ {{1, 1}, {{1, 80, 32, 32}}, 480, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 79134720.},
/* GFLOPS 0.237 x 1 = 0.237 */ {{7, 7}, {{1, 3, 224, 224}}, 64, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", false, 236830720.},
/* GFLOPS 0.237 x 1 = 0.237 */ {{7, 7}, {{1, 3, 224, 224}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 236830720.},
/* GFLOPS 0.118 x 2 = 0.236 */ {{3, 3}, {{1, 32, 80, 80}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 118169600.},
/* GFLOPS 0.236 x 1 = 0.236 */ {{3, 3}, {{1, 256, 19, 19}}, 512, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 235980800.},
/* GFLOPS 0.116 x 2 = 0.231 */ {{1, 1}, {{1, 24, 128, 128}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 115605504.},
/* GFLOPS 0.111 x 2 = 0.221 */ {{3, 3}, {{1, 192, 10, 10}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 110624000.},
/* GFLOPS 0.213 x 1 = 0.213 */ {{3, 3}, {{1, 128, 38, 38}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 213018880.},
/* GFLOPS 0.213 x 1 = 0.213 */ {{3, 3}, {{1, 128, 19, 19}}, 256, 1, {1, 1}, {2, 2}, {2, 2}, {0, 0}, "", false, 213018880.},
/* GFLOPS 0.107 x 2 = 0.213 */ {{3, 3}, {{1, 128, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 106509440.},
/* GFLOPS 0.213 x 1 = 0.213 */ {{3, 3}, {{1, 256, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 212972672.},
/* GFLOPS 0.213 x 1 = 0.213 */ {{3, 3}, {{1, 512, 38, 38}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 212949568.},
/* GFLOPS 0.212 x 1 = 0.212 */ {{7, 7}, {{1, 3, 300, 300}}, 32, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", true, 212400000.},
/* GFLOPS 0.211 x 1 = 0.211 */ {{11, 11}, {{1, 3, 227, 227}}, 96, 1, {4, 4}, {1, 1}, {0, 0}, {0, 0}, "", true, 211120800.},
/* GFLOPS 0.210 x 1 = 0.210 */ {{3, 3}, {{1, 64, 38, 50}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 210307200.},
/* GFLOPS 0.210 x 1 = 0.210 */ {{1, 1}, {{1, 1024, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 209817600.},
/* GFLOPS 0.210 x 1 = 0.210 */ {{1, 1}, {{1, 1024, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 209817600.},
/* GFLOPS 0.104 x 2 = 0.208 */ {{3, 3}, {{1, 32, 75, 75}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 103860000.},
/* GFLOPS 0.208 x 1 = 0.208 */ {{1, 1}, {{1, 16, 256, 256}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 207618048.},
/* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 256, 56, 56}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 205922304.},
/* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 256, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 205922304.},
/* GFLOPS 0.103 x 2 = 0.206 */ {{1, 1}, {{1, 256, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102961152.},
/* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 512, 28, 28}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 205721600.},
/* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 512, 28, 28}}, 1024, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 205721600.},
/* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 1024, 14, 14}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 205621248.},
/* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 1024, 14, 14}}, 2048, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 205621248.},
/* GFLOPS 0.103 x 2 = 0.206 */ {{1, 1}, {{1, 2048, 7, 7}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102785536.},
/* GFLOPS 0.201 x 1 = 0.201 */ {{1, 1}, {{1, 512, 14, 14}}, 1000, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 200900000.},
/* GFLOPS 0.200 x 1 = 0.200 */ {{3, 3}, {{1, 160, 19, 19}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 199687872.},
/* GFLOPS 0.190 x 1 = 0.190 */ {{1, 1}, {{1, 256, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 189637632.},
/* GFLOPS 0.190 x 1 = 0.190 */ {{1, 1}, {{1, 256, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 189637632.},
/* GFLOPS 0.047 x 4 = 0.190 */ {{1, 1}, {{1, 256, 38, 38}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 47409408.},
/* GFLOPS 0.189 x 1 = 0.189 */ {{1, 1}, {{1, 1024, 19, 19}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 189360384.},
/* GFLOPS 0.038 x 5 = 0.189 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 37814272.},
/* GFLOPS 0.189 x 1 = 0.189 */ {{1, 1}, {{1, 1152, 16, 16}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 188825600.},
/* GFLOPS 0.185 x 1 = 0.185 */ {{1, 1}, {{1, 128, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 185040000.},
/* GFLOPS 0.185 x 1 = 0.185 */ {{1, 1}, {{1, 128, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 185040000.},
/* GFLOPS 0.181 x 1 = 0.181 */ {{3, 3}, {{1, 160, 14, 14}}, 320, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 180696320.},
/* GFLOPS 0.181 x 1 = 0.181 */ {{3, 3}, {{1, 160, 14, 14}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 180696320.},
/* GFLOPS 0.090 x 2 = 0.181 */ {{3, 3}, {{1, 224, 10, 10}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 90339200.},
/* GFLOPS 0.180 x 1 = 0.180 */ {{1, 1}, {{1, 224, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 180232192.},
/* GFLOPS 0.088 x 2 = 0.177 */ {{1, 1}, {{1, 1024, 13, 13}}, 255, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 88301655.},
/* GFLOPS 0.174 x 1 = 0.174 */ {{3, 3}, {{1, 96, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 173508608.},
/* GFLOPS 0.174 x 1 = 0.174 */ {{3, 3}, {{1, 96, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 173508608.},
/* GFLOPS 0.166 x 1 = 0.166 */ {{3, 3}, {{1, 160, 19, 19}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 166406560.},
/* GFLOPS 0.080 x 2 = 0.160 */ {{1, 1}, {{1, 576, 19, 19}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 79916736.},
/* GFLOPS 0.160 x 1 = 0.160 */ {{3, 3}, {{1, 128, 19, 19}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 159764160.},
/* GFLOPS 0.160 x 1 = 0.160 */ {{3, 3}, {{1, 1024, 19, 19}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 159703512.},
/* GFLOPS 0.159 x 1 = 0.159 */ {{7, 7}, {{1, 3, 300, 300}}, 24, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 159300000.},
/* GFLOPS 0.080 x 2 = 0.159 */ {{1, 1}, {{1, 40, 64, 64}}, 240, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 79626240.},
/* GFLOPS 0.079 x 2 = 0.157 */ {{1, 1}, {{1, 480, 32, 32}}, 80, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 78725120.},
/* GFLOPS 0.155 x 1 = 0.155 */ {{1, 1}, {{1, 192, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 154542080.},
/* GFLOPS 0.146 x 1 = 0.146 */ {{3, 3}, {{1, 144, 14, 14}}, 288, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 146369664.},
/* GFLOPS 0.146 x 1 = 0.146 */ {{3, 3}, {{1, 144, 14, 14}}, 288, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 146369664.},
/* GFLOPS 0.072 x 2 = 0.144 */ {{1, 1}, {{1, 1024, 10, 10}}, 352, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 72124800.},
/* GFLOPS 0.140 x 1 = 0.140 */ {{1, 1}, {{1, 576, 38, 50}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 140204800.},
/* GFLOPS 0.139 x 1 = 0.139 */ {{3, 3}, {{1, 256, 5, 5}}, 1206, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 138961350.},
/* GFLOPS 0.017 x 8 = 0.138 */ {{1, 1}, {{1, 16, 64, 128}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 17301504.},
/* GFLOPS 0.067 x 2 = 0.133 */ {{1, 1}, {{1, 576, 19, 19}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 66597280.},
/* GFLOPS 0.133 x 1 = 0.133 */ {{3, 3}, {{1, 128, 38, 38}}, 160, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 133136800.},
/* GFLOPS 0.044 x 3 = 0.133 */ {{1, 1}, {{1, 512, 13, 13}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 44345600.},
/* GFLOPS 0.129 x 1 = 0.129 */ {{1, 1}, {{1, 160, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 128851968.},
/* GFLOPS 0.128 x 1 = 0.128 */ {{3, 3}, {{1, 64, 24, 24}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 127512576.},
/* GFLOPS 0.120 x 1 = 0.120 */ {{5, 5}, {{1, 32, 28, 28}}, 96, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 120497664.},
/* GFLOPS 0.120 x 1 = 0.120 */ {{5, 5}, {{1, 32, 28, 28}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 120497664.},
/* GFLOPS 0.040 x 3 = 0.120 */ {{1, 1}, {{1, 96, 19, 19}}, 576, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 40131648.},
/* GFLOPS 0.118 x 1 = 0.118 */ {{1, 1}, {{1, 320, 38, 38}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 118477312.},
/* GFLOPS 0.017 x 7 = 0.118 */ {{1, 1}, {{1, 64, 64, 128}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 16908288.},
/* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 64, 80, 80}}, 64, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 118067200.},
/* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 64, 40, 40}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 118067200.},
/* GFLOPS 0.039 x 3 = 0.118 */ {{1, 1}, {{1, 1024, 10, 10}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 39340800.},
/* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 128, 40, 40}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 118016000.},
/* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 128, 20, 20}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 118016000.},
/* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 256, 20, 20}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 117990400.},
/* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 256, 19, 19}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 117990400.},
/* GFLOPS 0.058 x 2 = 0.116 */ {{3, 3}, {{1, 16, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 58003456.},
/* GFLOPS 0.058 x 2 = 0.116 */ {{3, 3}, {{1, 32, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 57903104.},
/* GFLOPS 0.058 x 2 = 0.116 */ {{3, 3}, {{1, 64, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 57852928.},
/* GFLOPS 0.116 x 1 = 0.116 */ {{3, 3}, {{1, 128, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 115655680.},
/* GFLOPS 0.116 x 1 = 0.116 */ {{3, 3}, {{1, 128, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 115655680.},
/* GFLOPS 0.115 x 1 = 0.115 */ {{3, 3}, {{1, 3, 512, 512}}, 32, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 115343360.},
/* GFLOPS 0.114 x 1 = 0.114 */ {{1, 1}, {{1, 144, 128, 128}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 113639424.},
/* GFLOPS 0.112 x 1 = 0.112 */ {{1, 1}, {{1, 1024, 10, 10}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 111875400.},
/* GFLOPS 0.110 x 1 = 0.110 */ {{1, 1}, {{1, 480, 32, 32}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 110215168.},
/* GFLOPS 0.107 x 1 = 0.107 */ {{1, 1}, {{1, 64, 32, 32}}, 810, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 106997760.},
/* GFLOPS 0.036 x 3 = 0.107 */ {{1, 1}, {{1, 192, 38, 38}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 35580160.},
/* GFLOPS 0.107 x 1 = 0.107 */ {{3, 3}, {{1, 32, 75, 75}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 106648064.},
/* GFLOPS 0.107 x 1 = 0.107 */ {{3, 3}, {{1, 64, 38, 38}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 106555648.},
/* GFLOPS 0.105 x 1 = 0.105 */ {{1, 1}, {{1, 256, 40, 40}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 105062400.},
/* GFLOPS 0.105 x 1 = 0.105 */ {{1, 1}, {{1, 512, 20, 20}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 104960000.},
/* GFLOPS 0.105 x 1 = 0.105 */ {{1, 1}, {{1, 512, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 104960000.},
/* GFLOPS 0.105 x 1 = 0.105 */ {{1, 1}, {{1, 512, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 104960000.},
/* GFLOPS 0.105 x 1 = 0.105 */ {{1, 1}, {{1, 1024, 10, 10}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 104908800.},
/* GFLOPS 0.103 x 1 = 0.103 */ {{1, 1}, {{1, 128, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 103161856.},
/* GFLOPS 0.051 x 2 = 0.103 */ {{1, 1}, {{1, 256, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 51480576.},
/* GFLOPS 0.051 x 2 = 0.103 */ {{1, 1}, {{1, 256, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 51480576.},
/* GFLOPS 0.008 x 12 = 0.101 */ {{1, 1}, {{1, 64, 32, 32}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 8454144.},
/* GFLOPS 0.101 x 1 = 0.101 */ {{1, 1}, {{1, 512, 19, 19}}, 273, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 101016825.},
/* GFLOPS 0.096 x 1 = 0.096 */ {{1, 1}, {{1, 480, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 96438272.},
/* GFLOPS 0.095 x 1 = 0.095 */ {{1, 1}, {{1, 128, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 95003648.},
/* GFLOPS 0.095 x 1 = 0.095 */ {{1, 1}, {{1, 128, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 95003648.},
/* GFLOPS 0.095 x 1 = 0.095 */ {{1, 1}, {{1, 256, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 94818816.},
/* GFLOPS 0.095 x 1 = 0.095 */ {{1, 1}, {{1, 256, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 94818816.},
/* GFLOPS 0.094 x 1 = 0.094 */ {{1, 1}, {{1, 32, 150, 150}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 93600000.},
/* GFLOPS 0.094 x 1 = 0.094 */ {{1, 1}, {{1, 32, 150, 150}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 93600000.},
/* GFLOPS 0.093 x 1 = 0.093 */ {{1, 1}, {{1, 512, 38, 50}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 93480000.},
/* GFLOPS 0.093 x 1 = 0.093 */ {{1, 1}, {{1, 576, 19, 19}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 93236192.},
/* GFLOPS 0.093 x 1 = 0.093 */ {{1, 1}, {{1, 64, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 92880000.},
/* GFLOPS 0.093 x 1 = 0.093 */ {{1, 1}, {{1, 64, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 92880000.},
/* GFLOPS 0.031 x 3 = 0.092 */ {{1, 1}, {{1, 160, 10, 10}}, 960, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 30816000.},
/* GFLOPS 0.092 x 1 = 0.092 */ {{1, 1}, {{1, 192, 75, 100}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 92400000.},
/* GFLOPS 0.090 x 1 = 0.090 */ {{1, 1}, {{1, 448, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 90015744.},
/* GFLOPS 0.045 x 2 = 0.090 */ {{3, 3}, {{1, 576, 19, 19}}, 12, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 44918508.},
/* GFLOPS 0.044 x 2 = 0.089 */ {{1, 1}, {{1, 256, 26, 26}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 44388864.},
/* GFLOPS 0.089 x 1 = 0.089 */ {{3, 3}, {{1, 112, 14, 14}}, 224, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 88554368.},
/* GFLOPS 0.089 x 1 = 0.089 */ {{3, 3}, {{1, 112, 14, 14}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 88554368.},
/* GFLOPS 0.088 x 1 = 0.088 */ {{1, 1}, {{1, 256, 26, 26}}, 255, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 88430940.},
/* GFLOPS 0.021 x 4 = 0.084 */ {{5, 1}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {1, 1}, {2, 0}, {0, 0}, "", false, 21037056.},
/* GFLOPS 0.021 x 4 = 0.084 */ {{1, 5}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {1, 1}, {0, 2}, {0, 0}, "", true, 21037056.},
/* GFLOPS 0.084 x 1 = 0.084 */ {{1, 1}, {{1, 416, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 83593216.},
/* GFLOPS 0.082 x 1 = 0.082 */ {{1, 1}, {{1, 320, 10, 10}}, 1280, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 82048000.},
/* GFLOPS 0.040 x 2 = 0.080 */ {{1, 1}, {{1, 576, 19, 19}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39958368.},
/* GFLOPS 0.040 x 2 = 0.079 */ {{1, 1}, {{1, 24, 75, 75}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39690000.},
/* GFLOPS 0.040 x 2 = 0.079 */ {{3, 3}, {{1, 3, 300, 300}}, 32, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39600000.},
/* GFLOPS 0.079 x 1 = 0.079 */ {{1, 1}, {{1, 240, 64, 64}}, 40, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 78807040.},
/* GFLOPS 0.079 x 1 = 0.079 */ {{1, 1}, {{1, 384, 40, 40}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 78745600.},
/* GFLOPS 0.077 x 1 = 0.077 */ {{1, 1}, {{1, 96, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 77471744.},
/* GFLOPS 0.077 x 1 = 0.077 */ {{3, 3}, {{1, 192, 10, 10}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 77436800.},
/* GFLOPS 0.077 x 1 = 0.077 */ {{1, 1}, {{1, 384, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 77170688.},
/* GFLOPS 0.076 x 1 = 0.076 */ {{3, 3}, {{1, 3, 416, 416}}, 32, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 76144640.},
/* GFLOPS 0.076 x 1 = 0.076 */ {{1, 1}, {{1, 96, 128, 128}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 75890688.},
/* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {8, 8}, {8, 8}, {0, 0}, "", true, 37814272.},
/* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {4, 4}, {4, 4}, {0, 0}, "", true, 37814272.},
/* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {2, 2}, {2, 2}, {0, 0}, "", true, 37814272.},
/* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {16, 16}, {16, 16}, {0, 0}, "", true, 37814272.},
/* GFLOPS 0.018 x 4 = 0.072 */ {{1, 1}, {{1, 64, 19, 19}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 17882496.},
/* GFLOPS 0.071 x 1 = 0.071 */ {{1, 1}, {{1, 16, 150, 150}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 71280000.},
/* GFLOPS 0.071 x 1 = 0.071 */ {{1, 1}, {{1, 352, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 70748160.},
/* GFLOPS 0.071 x 1 = 0.071 */ {{1, 1}, {{1, 24, 150, 150}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 70560000.},
/* GFLOPS 0.070 x 1 = 0.070 */ {{3, 3}, {{1, 96, 14, 14}}, 208, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 70487872.},
/* GFLOPS 0.069 x 1 = 0.069 */ {{3, 3}, {{1, 96, 14, 14}}, 204, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 69132336.},
/* GFLOPS 0.068 x 1 = 0.068 */ {{1, 1}, {{1, 32, 256, 256}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 68157440.},
/* GFLOPS 0.005 x 14 = 0.066 */ {{3, 3}, {{1, 64, 8, 8}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 4722688.},
/* GFLOPS 0.066 x 1 = 0.066 */ {{1, 1}, {{1, 672, 16, 16}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 66109440.},
/* GFLOPS 0.066 x 1 = 0.066 */ {{1, 1}, {{1, 1280, 10, 10}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 65561600.},
/* GFLOPS 0.033 x 2 = 0.065 */ {{3, 3}, {{1, 48, 14, 14}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 32551680.},
/* GFLOPS 0.065 x 1 = 0.065 */ {{3, 3}, {{1, 192, 7, 7}}, 384, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 65046912.},
/* GFLOPS 0.065 x 1 = 0.065 */ {{3, 3}, {{1, 192, 7, 7}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 65046912.},
/* GFLOPS 0.065 x 1 = 0.065 */ {{3, 3}, {{1, 160, 10, 10}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 64534400.},
/* GFLOPS 0.064 x 1 = 0.064 */ {{1, 1}, {{1, 320, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 64325632.},
/* GFLOPS 0.032 x 2 = 0.064 */ {{3, 3}, {{1, 96, 12, 12}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 31868928.},
/* GFLOPS 0.061 x 1 = 0.061 */ {{1, 1}, {{1, 960, 10, 10}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 61472000.},
/* GFLOPS 0.031 x 2 = 0.061 */ {{1, 1}, {{1, 960, 10, 10}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 30736000.},
/* GFLOPS 0.061 x 1 = 0.061 */ {{1, 1}, {{1, 512, 46, 46}}, 28, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 60729200.},
/* GFLOPS 0.060 x 1 = 0.060 */ {{3, 3}, {{1, 96, 38, 38}}, 96, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 59920224.},
/* GFLOPS 0.059 x 1 = 0.059 */ {{1, 1}, {{1, 320, 38, 38}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 59238656.},
/* GFLOPS 0.059 x 1 = 0.059 */ {{3, 3}, {{1, 128, 19, 19}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 59008000.},
/* GFLOPS 0.059 x 1 = 0.059 */ {{3, 3}, {{1, 256, 10, 10}}, 512, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 58995200.},
/* GFLOPS 0.059 x 1 = 0.059 */ {{3, 3}, {{1, 256, 10, 10}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 58995200.},
/* GFLOPS 0.059 x 1 = 0.059 */ {{3, 3}, {{1, 256, 10, 10}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 58995200.},
/* GFLOPS 0.058 x 1 = 0.058 */ {{1, 1}, {{1, 288, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 57903104.},
/* GFLOPS 0.004 x 16 = 0.058 */ {{3, 3}, {{1, 128, 7, 7}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 3614240.},
/* GFLOPS 0.055 x 1 = 0.055 */ {{3, 3}, {{1, 1280, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 55298400.},
/* GFLOPS 0.018 x 3 = 0.054 */ {{1, 1}, {{1, 32, 38, 38}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 18021120.},
/* GFLOPS 0.018 x 3 = 0.053 */ {{1, 1}, {{1, 384, 19, 19}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 17766976.},
/* GFLOPS 0.053 x 1 = 0.053 */ {{3, 3}, {{1, 128, 38, 38}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 53254720.},
/* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 528, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 53036032.},
/* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 528, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 53036032.},
/* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 64, 80, 80}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 52838400.},
/* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 64, 40, 40}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 52838400.},
/* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 128, 80, 80}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 52633600.},
/* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 128, 20, 20}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 52633600.},
/* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 256, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 52531200.},
/* GFLOPS 0.052 x 1 = 0.052 */ {{1, 1}, {{1, 1024, 10, 10}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 52454400.},
/* GFLOPS 0.052 x 1 = 0.052 */ {{1, 1}, {{1, 1024, 10, 10}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 52454400.},
/* GFLOPS 0.052 x 1 = 0.052 */ {{1, 1}, {{1, 1024, 10, 10}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 52454400.},
/* GFLOPS 0.026 x 2 = 0.052 */ {{1, 1}, {{1, 1024, 10, 10}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 26227200.},
/* GFLOPS 0.052 x 1 = 0.052 */ {{1, 1}, {{1, 64, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 51781632.},
/* GFLOPS 0.051 x 1 = 0.051 */ {{1, 1}, {{1, 256, 56, 56}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 51480576.},
/* GFLOPS 0.051 x 1 = 0.051 */ {{1, 1}, {{1, 256, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 51480576.},
/* GFLOPS 0.051 x 1 = 0.051 */ {{1, 1}, {{1, 512, 28, 28}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 51430400.},
/* GFLOPS 0.026 x 2 = 0.051 */ {{1, 1}, {{1, 512, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 25715200.},
/* GFLOPS 0.026 x 2 = 0.051 */ {{1, 1}, {{1, 512, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 25715200.},
/* GFLOPS 0.013 x 4 = 0.051 */ {{1, 1}, {{1, 512, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 12857600.},
/* GFLOPS 0.051 x 1 = 0.051 */ {{1, 1}, {{1, 1024, 14, 14}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 51405312.},
/* GFLOPS 0.050 x 1 = 0.050 */ {{1, 1}, {{1, 992, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 49799680.},
/* GFLOPS 0.048 x 1 = 0.048 */ {{1, 1}, {{1, 960, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 48194048.},
/* GFLOPS 0.047 x 1 = 0.047 */ {{1, 1}, {{1, 256, 19, 19}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 47409408.},
/* GFLOPS 0.047 x 1 = 0.047 */ {{1, 1}, {{1, 144, 64, 64}}, 40, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 47349760.},
/* GFLOPS 0.047 x 1 = 0.047 */ {{1, 1}, {{1, 512, 38, 50}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 46740000.},
/* GFLOPS 0.047 x 1 = 0.047 */ {{1, 1}, {{1, 928, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 46588416.},
/* GFLOPS 0.046 x 1 = 0.046 */ {{1, 1}, {{1, 64, 75, 75}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 46440000.},
/* GFLOPS 0.023 x 2 = 0.045 */ {{3, 3}, {{1, 256, 3, 3}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 22648626.},
/* GFLOPS 0.045 x 1 = 0.045 */ {{3, 3}, {{1, 160, 7, 7}}, 320, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 45174080.},
/* GFLOPS 0.045 x 1 = 0.045 */ {{3, 3}, {{1, 160, 7, 7}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 45174080.},
/* GFLOPS 0.045 x 1 = 0.045 */ {{1, 1}, {{1, 224, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 45058048.},
/* GFLOPS 0.023 x 2 = 0.045 */ {{1, 1}, {{1, 512, 14, 14}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 22500800.},
/* GFLOPS 0.045 x 1 = 0.045 */ {{1, 1}, {{1, 896, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 44982784.},
/* GFLOPS 0.045 x 1 = 0.045 */ {{3, 3}, {{1, 3, 227, 227}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", true, 44946880.},
/* GFLOPS 0.044 x 1 = 0.044 */ {{3, 3}, {{1, 128, 19, 19}}, 192, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 44256000.},
/* GFLOPS 0.044 x 1 = 0.044 */ {{3, 3}, {{1, 1024, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 44239200.},
/* GFLOPS 0.044 x 1 = 0.044 */ {{1, 1}, {{1, 512, 13, 13}}, 255, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 44172375.},
/* GFLOPS 0.043 x 1 = 0.043 */ {{7, 7}, {{1, 3, 96, 96}}, 64, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", true, 43499520.},
/* GFLOPS 0.043 x 1 = 0.043 */ {{1, 1}, {{1, 864, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 43377152.},
/* GFLOPS 0.042 x 1 = 0.042 */ {{1, 1}, {{1, 832, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 41771520.},
/* GFLOPS 0.040 x 1 = 0.040 */ {{5, 5}, {{1, 32, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 40165888.},
/* GFLOPS 0.040 x 1 = 0.040 */ {{5, 5}, {{1, 32, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 40165888.},
/* GFLOPS 0.040 x 1 = 0.040 */ {{1, 1}, {{1, 800, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 40165888.},
/* GFLOPS 0.040 x 1 = 0.040 */ {{3, 3}, {{1, 64, 19, 19}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 39958368.},
/* GFLOPS 0.040 x 1 = 0.040 */ {{3, 3}, {{1, 256, 19, 19}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 39932376.},
/* GFLOPS 0.040 x 1 = 0.040 */ {{3, 3}, {{1, 3, 300, 300}}, 32, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 39600000.},
/* GFLOPS 0.039 x 1 = 0.039 */ {{1, 1}, {{1, 240, 32, 32}}, 80, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39403520.},
/* GFLOPS 0.039 x 1 = 0.039 */ {{1, 1}, {{1, 144, 75, 75}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39015000.},
/* GFLOPS 0.039 x 1 = 0.039 */ {{1, 1}, {{1, 192, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 38635520.},
/* GFLOPS 0.039 x 1 = 0.039 */ {{1, 1}, {{1, 768, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 38560256.},
/* GFLOPS 0.037 x 1 = 0.037 */ {{1, 1}, {{1, 736, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 36954624.},
/* GFLOPS 0.036 x 1 = 0.036 */ {{1, 1}, {{1, 480, 14, 14}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 36164352.},
/* GFLOPS 0.036 x 1 = 0.036 */ {{1, 1}, {{1, 480, 14, 14}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 36164352.},
/* GFLOPS 0.018 x 2 = 0.036 */ {{1, 1}, {{1, 192, 38, 38}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 17790080.},
/* GFLOPS 0.035 x 1 = 0.035 */ {{1, 1}, {{1, 704, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 35348992.},
/* GFLOPS 0.035 x 1 = 0.035 */ {{1, 1}, {{1, 512, 46, 46}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 34702400.},
/* GFLOPS 0.034 x 1 = 0.034 */ {{1, 1}, {{1, 672, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 33743360.},
/* GFLOPS 0.034 x 1 = 0.034 */ {{1, 1}, {{1, 128, 32, 64}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 33685504.},
/* GFLOPS 0.034 x 1 = 0.034 */ {{2, 2}, {{1, 64, 64, 128}}, 32, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 33619968.},
/* GFLOPS 0.033 x 1 = 0.033 */ {{3, 3}, {{1, 256, 3, 3}}, 804, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 33350724.},
/* GFLOPS 0.033 x 1 = 0.033 */ {{1, 1}, {{1, 528, 14, 14}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 33147520.},
/* GFLOPS 0.033 x 1 = 0.033 */ {{1, 1}, {{1, 528, 14, 14}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 33147520.},
/* GFLOPS 0.033 x 1 = 0.033 */ {{1, 1}, {{1, 1024, 10, 10}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 32784000.},
/* GFLOPS 0.032 x 1 = 0.032 */ {{1, 1}, {{1, 160, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 32212992.},
/* GFLOPS 0.032 x 1 = 0.032 */ {{1, 1}, {{1, 512, 14, 14}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 32144000.},
/* GFLOPS 0.032 x 1 = 0.032 */ {{1, 1}, {{1, 640, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 32137728.},
/* GFLOPS 0.032 x 1 = 0.032 */ {{1, 1}, {{1, 508, 14, 14}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 31893120.},
/* GFLOPS 0.011 x 3 = 0.032 */ {{1, 1}, {{1, 320, 16, 16}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 10502144.},
/* GFLOPS 0.031 x 1 = 0.031 */ {{1, 1}, {{1, 832, 7, 7}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 31328640.},
/* GFLOPS 0.031 x 1 = 0.031 */ {{1, 1}, {{1, 832, 7, 7}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 31328640.},
/* GFLOPS 0.031 x 1 = 0.031 */ {{1, 1}, {{1, 608, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 30532096.},
/* GFLOPS 0.015 x 2 = 0.030 */ {{1, 1}, {{1, 128, 46, 46}}, 28, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 15226736.},
/* GFLOPS 0.015 x 2 = 0.030 */ {{5, 5}, {{1, 24, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 15065344.},
/* GFLOPS 0.015 x 2 = 0.030 */ {{5, 5}, {{1, 24, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 15065344.},
/* GFLOPS 0.015 x 2 = 0.030 */ {{5, 5}, {{1, 48, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 15059072.},
/* GFLOPS 0.029 x 1 = 0.029 */ {{3, 3}, {{1, 256, 10, 10}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 29497600.},
/* GFLOPS 0.015 x 2 = 0.029 */ {{1, 1}, {{1, 112, 32, 32}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 14745600.},
/* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 192, 28, 28}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 28976640.},
/* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 192, 28, 28}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 28976640.},
/* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 512, 14, 14}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 28929600.},
/* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 512, 14, 14}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 28929600.},
/* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 576, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 28926464.},
/* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 544, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 27320832.},
/* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 64, 16, 16}}, 810, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 26749440.},
/* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 384, 19, 19}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 26650464.},
/* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 576, 19, 19}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 26638912.},
/* GFLOPS 0.027 x 1 = 0.027 */ {{3, 3}, {{1, 128, 38, 38}}, 8, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 26627360.},
/* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 528, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 26518016.},
/* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 528, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 26518016.},
/* GFLOPS 0.009 x 3 = 0.026 */ {{1, 1}, {{1, 128, 46, 46}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 8700992.},
/* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 96, 75, 75}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 26055000.},
/* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 64, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 25890816.},
/* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 64, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 25890816.},
/* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 64, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 25890816.},
/* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 1024, 10, 10}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 25817400.},
/* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 128, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 25790464.},
/* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 256, 28, 28}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 25740288.},
/* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 256, 28, 28}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 25740288.},
/* GFLOPS 0.013 x 2 = 0.026 */ {{1, 1}, {{1, 256, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 12870144.},
/* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 512, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 25715200.},
/* GFLOPS 0.013 x 2 = 0.026 */ {{1, 1}, {{1, 512, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 12857600.},
/* GFLOPS 0.002 x 12 = 0.025 */ {{1, 1}, {{1, 64, 16, 16}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 2113536.},
/* GFLOPS 0.024 x 1 = 0.024 */ {{1, 1}, {{1, 480, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 24109568.},
/* GFLOPS 0.024 x 1 = 0.024 */ {{1, 1}, {{1, 128, 38, 38}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 23750912.},
/* GFLOPS 0.024 x 1 = 0.024 */ {{1, 1}, {{1, 256, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 23704704.},
/* GFLOPS 0.023 x 1 = 0.023 */ {{3, 3}, {{1, 3, 256, 512}}, 13, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 23429120.},
/* GFLOPS 0.023 x 1 = 0.023 */ {{1, 1}, {{1, 32, 150, 150}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 23400000.},
/* GFLOPS 0.023 x 1 = 0.023 */ {{1, 1}, {{1, 512, 19, 19}}, 63, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 23311575.},
/* GFLOPS 0.023 x 1 = 0.023 */ {{1, 1}, {{1, 448, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 22503936.},
/* GFLOPS 0.023 x 1 = 0.023 */ {{1, 1}, {{1, 512, 14, 14}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 22500800.},
/* GFLOPS 0.022 x 1 = 0.022 */ {{1, 1}, {{1, 508, 14, 14}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 22325184.},
/* GFLOPS 0.022 x 1 = 0.022 */ {{3, 3}, {{1, 512, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 22120800.},
/* GFLOPS 0.021 x 1 = 0.021 */ {{3, 3}, {{1, 128, 12, 12}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 21242880.},
/* GFLOPS 0.021 x 1 = 0.021 */ {{1, 1}, {{1, 40, 64, 64}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 21233664.},
/* GFLOPS 0.021 x 1 = 0.021 */ {{1, 1}, {{1, 416, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 20898304.},
/* GFLOPS 0.021 x 1 = 0.021 */ {{1, 1}, {{1, 832, 7, 7}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 20885760.},
/* GFLOPS 0.021 x 1 = 0.021 */ {{1, 1}, {{1, 832, 7, 7}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 20885760.},
/* GFLOPS 0.010 x 2 = 0.021 */ {{1, 1}, {{1, 832, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 10442880.},
/* GFLOPS 0.010 x 2 = 0.021 */ {{1, 1}, {{1, 832, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 10442880.},
/* GFLOPS 0.010 x 2 = 0.020 */ {{3, 3}, {{1, 256, 2, 2}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 10066056.},
/* GFLOPS 0.020 x 1 = 0.020 */ {{5, 5}, {{1, 16, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 20095488.},
/* GFLOPS 0.020 x 1 = 0.020 */ {{5, 5}, {{1, 16, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 20095488.},
/* GFLOPS 0.020 x 1 = 0.020 */ {{5, 5}, {{1, 32, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 20082944.},
/* GFLOPS 0.020 x 1 = 0.020 */ {{5, 5}, {{1, 32, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 20082944.},
/* GFLOPS 0.020 x 1 = 0.020 */ {{3, 3}, {{1, 256, 19, 19}}, 12, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 19966188.},
/* GFLOPS 0.019 x 1 = 0.019 */ {{1, 1}, {{1, 192, 28, 28}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 19317760.},
/* GFLOPS 0.019 x 1 = 0.019 */ {{1, 1}, {{1, 192, 28, 28}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 19317760.},
/* GFLOPS 0.019 x 1 = 0.019 */ {{1, 1}, {{1, 384, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 19292672.},
/* GFLOPS 0.019 x 1 = 0.019 */ {{1, 1}, {{1, 64, 64, 64}}, 36, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 19021824.},
/* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 576, 10, 10}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 18448000.},
/* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 480, 14, 14}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 18082176.},
/* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 480, 14, 14}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 18082176.},
/* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 192, 38, 38}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 17790080.},
/* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 352, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 17687040.},
/* GFLOPS 0.017 x 1 = 0.017 */ {{2, 2}, {{1, 16, 128, 256}}, 16, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 16908288.},
/* GFLOPS 0.016 x 1 = 0.016 */ {{1, 1}, {{1, 320, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 16081408.},
/* GFLOPS 0.016 x 1 = 0.016 */ {{1, 1}, {{1, 832, 7, 7}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 15664320.},
/* GFLOPS 0.016 x 1 = 0.016 */ {{1, 1}, {{1, 832, 7, 7}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 15664320.},
/* GFLOPS 0.015 x 1 = 0.015 */ {{5, 5}, {{1, 48, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 15059072.},
/* GFLOPS 0.015 x 1 = 0.015 */ {{5, 5}, {{1, 32, 12, 12}}, 64, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 14754816.},
/* GFLOPS 0.015 x 1 = 0.015 */ {{3, 3}, {{1, 128, 10, 10}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 14752000.},
/* GFLOPS 0.014 x 1 = 0.014 */ {{1, 1}, {{1, 288, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 14475776.},
/* GFLOPS 0.014 x 1 = 0.014 */ {{1, 1}, {{1, 512, 5, 5}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 13991250.},
/* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 144, 38, 38}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 13354112.},
/* GFLOPS 0.007 x 2 = 0.013 */ {{1, 1}, {{1, 16, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6623232.},
/* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 512, 10, 10}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 13120000.},
/* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 832, 7, 7}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 13053600.},
/* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 832, 7, 7}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 13053600.},
/* GFLOPS 0.007 x 2 = 0.013 */ {{1, 1}, {{1, 32, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6522880.},
/* GFLOPS 0.001 x 11 = 0.013 */ {{3, 3}, {{1, 64, 4, 4}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1180672.},
/* GFLOPS 0.006 x 2 = 0.013 */ {{1, 1}, {{1, 64, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6472704.},
/* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 128, 56, 56}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 12895232.},
/* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 256, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 12870144.},
/* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 256, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 12870144.},
/* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 508, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 12757248.},
/* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 992, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 12449920.},
/* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 480, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 12054784.},
/* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 480, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 12054784.},
/* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 960, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 12048512.},
/* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 32, 75, 75}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 12014080.},
/* GFLOPS 0.012 x 1 = 0.012 */ {{3, 3}, {{1, 96, 6, 6}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 11950848.},
/* GFLOPS 0.006 x 2 = 0.012 */ {{3, 3}, {{1, 96, 3, 3}}, 384, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 5975424.},
/* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 320, 12, 12}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 11814912.},
/* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 640, 6, 6}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 11805696.},
/* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 928, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 11647104.},
/* GFLOPS 0.011 x 1 = 0.011 */ {{1, 1}, {{1, 896, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 11245696.},
/* GFLOPS 0.011 x 1 = 0.011 */ {{1, 1}, {{1, 256, 13, 13}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 11097216.},
/* GFLOPS 0.011 x 1 = 0.011 */ {{3, 3}, {{1, 256, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 11061600.},
/* GFLOPS 0.006 x 2 = 0.011 */ {{3, 3}, {{1, 512, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 5530200.},
/* GFLOPS 0.011 x 1 = 0.011 */ {{1, 1}, {{1, 864, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 10844288.},
/* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 832, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 10442880.},
/* GFLOPS 0.010 x 1 = 0.010 */ {{5, 5}, {{1, 32, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 10041472.},
/* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 800, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 10041472.},
/* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 192, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 9658880.},
/* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 192, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 9658880.},
/* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 384, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 9646336.},
/* GFLOPS 0.005 x 2 = 0.010 */ {{1, 1}, {{1, 512, 14, 14}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4821600.},
/* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 768, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 9640064.},
/* GFLOPS 0.010 x 1 = 0.010 */ {{3, 3}, {{1, 4, 128, 256}}, 4, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9568256.},
/* GFLOPS 0.005 x 2 = 0.009 */ {{1, 1}, {{1, 4, 128, 256}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 4718592.},
/* GFLOPS 0.009 x 1 = 0.009 */ {{1, 1}, {{1, 736, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 9238656.},
/* GFLOPS 0.009 x 1 = 0.009 */ {{1, 1}, {{1, 192, 19, 19}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 8895040.},
/* GFLOPS 0.009 x 1 = 0.009 */ {{1, 1}, {{1, 704, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 8837248.},
/* GFLOPS 0.008 x 1 = 0.008 */ {{1, 1}, {{1, 672, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 8435840.},
/* GFLOPS 0.008 x 1 = 0.008 */ {{1, 1}, {{1, 128, 32, 64}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 8421376.},
/* GFLOPS 0.008 x 1 = 0.008 */ {{1, 1}, {{1, 640, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 8034432.},
/* GFLOPS 0.004 x 2 = 0.008 */ {{1, 1}, {{1, 832, 7, 7}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 3916080.},
/* GFLOPS 0.008 x 1 = 0.008 */ {{1, 1}, {{1, 608, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 7633024.},
/* GFLOPS 0.008 x 1 = 0.008 */ {{5, 5}, {{1, 16, 14, 14}}, 48, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 7535808.},
/* GFLOPS 0.008 x 1 = 0.008 */ {{5, 5}, {{1, 16, 14, 14}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 7535808.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 640, 6, 6}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7378560.},
/* GFLOPS 0.004 x 2 = 0.007 */ {{1, 1}, {{1, 48, 14, 14}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3650304.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 384, 14, 14}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7234752.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 576, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 7231616.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 256, 12, 12}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7091712.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 544, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 6830208.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 64, 8, 8}}, 810, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 6687360.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{3, 3}, {{1, 160, 6, 6}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 6637824.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 528, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6629504.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 528, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 6629504.},
/* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 256, 5, 5}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 6566400.},
/* GFLOPS 0.003 x 2 = 0.007 */ {{1, 1}, {{1, 512, 5, 5}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 3280000.},
/* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 64, 56, 56}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6472704.},
/* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 128, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6447616.},
/* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 512, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 6428800.},
/* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 512, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6428800.},
/* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 512, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 6428800.},
/* GFLOPS 0.001 x 12 = 0.006 */ {{1, 1}, {{1, 64, 8, 8}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 528384.},
/* GFLOPS 0.006 x 1 = 0.006 */ {{3, 3}, {{1, 256, 10, 10}}, 12, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 5530800.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 192, 12, 12}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 5322240.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{3, 3}, {{1, 128, 5, 5}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 5310720.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{3, 3}, {{1, 128, 5, 5}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 5310720.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{3, 3}, {{1, 128, 5, 5}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 5310720.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{3, 3}, {{1, 128, 5, 5}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 5310720.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 1024, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4917600.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 1024, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4917600.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 192, 28, 28}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4829440.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 192, 28, 28}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4829440.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 256, 14, 14}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4826304.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 512, 14, 14}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4821600.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 508, 14, 14}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4783968.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 64, 32, 32}}, 36, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 4755456.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 64, 24, 24}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4755456.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 256, 12, 12}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4727808.},
/* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 1024, 3, 3}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4720896.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 512, 19, 19}}, 12, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4440300.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 512, 19, 19}}, 12, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4440300.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 640, 6, 6}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4427136.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 16, 128, 256}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 4325376.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 64, 64, 128}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 4227072.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 832, 7, 7}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3916080.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{3, 3}, {{1, 256, 1, 1}}, 804, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 3705636.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 16, 12, 12}}, 32, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 3691008.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{3, 3}, {{1, 64, 10, 10}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 3689600.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 32, 6, 6}}, 64, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 3688704.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 32, 12, 12}}, 64, 1, {2, 2}, {1, 1}, {2, 2}, {0, 0}, "", true, 3688704.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 64, 6, 6}}, 128, 1, {2, 2}, {1, 1}, {2, 2}, {0, 0}, "", true, 3687552.},
/* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 192, 12, 12}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3548160.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 736, 3, 3}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3393792.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 256, 10, 10}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3283200.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 512, 5, 5}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3280000.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 512, 5, 5}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 3280000.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 512, 5, 5}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3228750.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 480, 14, 14}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3013696.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 480, 14, 14}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 3013696.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 320, 12, 12}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2953728.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 640, 6, 6}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2951424.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{3, 3}, {{1, 256, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 2765400.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{3, 3}, {{1, 128, 5, 5}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 2655360.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 832, 7, 7}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2610720.},
/* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 256, 3, 3}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2520882.},
/* GFLOPS 0.001 x 2 = 0.003 */ {{3, 3}, {{1, 128, 1, 1}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1258530.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 256, 12, 12}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2363904.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{3, 3}, {{1, 128, 3, 3}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 2360320.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{3, 3}, {{1, 128, 3, 3}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2360320.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{3, 3}, {{1, 128, 3, 3}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 2360320.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 528, 4, 4}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2164736.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 508, 4, 4}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2082816.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 1024, 1, 1}}, 1000, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2049000.},
/* GFLOPS 0.001 x 2 = 0.002 */ {{3, 3}, {{1, 256, 3, 3}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 995544.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 1024, 3, 3}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1770336.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 64, 4, 4}}, 810, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 1671840.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 32, 80, 80}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1664000.},
/* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 256, 5, 5}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1641600.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 640, 6, 6}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1475712.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 128, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1383000.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 64, 5, 5}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1328256.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 736, 3, 3}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1272672.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 64, 16, 16}}, 36, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 1188864.},
/* GFLOPS 0.000 x 9 = 0.001 */ {{1, 1}, {{1, 64, 4, 4}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 132096.},
/* GFLOPS 0.001 x 2 = 0.001 */ {{1, 1}, {{1, 256, 3, 3}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 590976.},
/* GFLOPS 0.001 x 2 = 0.001 */ {{1, 1}, {{1, 256, 3, 3}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 590976.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 128, 3, 3}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1180160.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 256, 2, 2}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1120392.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 192, 12, 12}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 887040.},
/* GFLOPS 0.000 x 2 = 0.001 */ {{3, 3}, {{1, 256, 2, 2}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 442464.},
/* GFLOPS 0.000 x 2 = 0.001 */ {{1, 1}, {{1, 32, 80, 80}}, 1, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 416000.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 128, 5, 5}}, 12, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 691500.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 256, 3, 3}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 663696.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 640, 2, 2}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 655872.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 512, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 615000.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 512, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 615000.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 128, 3, 3}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 592128.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 256, 3, 3}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 590976.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 128, 3, 3}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 590080.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 256, 3, 3}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 581742.},
/* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 256, 4, 4}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 525312.},
/* GFLOPS 0.000 x 4 = 0.000 */ {{1, 1}, {{1, 48, 1, 1}}, 1152, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 111744.},
/* GFLOPS 0.000 x 4 = 0.000 */ {{1, 1}, {{1, 1152, 1, 1}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 110640.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 5, 5}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 411200.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 128, 3, 3}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 331920.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 192, 5, 5}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 308000.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 64, 8, 8}}, 36, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 297216.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 2, 2}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 263168.},
/* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 131328.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 258552.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 1024, 1, 1}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 196704.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 128, 3, 3}}, 8, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 165960.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 3, 3}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 148032.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 3, 3}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 147584.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 2, 2}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 147584.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 2, 2}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 147584.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 2, 2}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 147584.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 736, 1, 1}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 141408.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 1, 1}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 140322.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 131328.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 131328.},
/* GFLOPS 0.000 x 3 = 0.000 */ {{1, 1}, {{1, 28, 1, 1}}, 672, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 38304.},
/* GFLOPS 0.000 x 3 = 0.000 */ {{1, 1}, {{1, 672, 1, 1}}, 28, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 37660.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 3, 3}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 110808.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 3, 3}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 110808.},
/* GFLOPS 0.000 x 2 = 0.000 */ {{3, 3}, {{1, 128, 1, 1}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 55320.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 64, 4, 4}}, 36, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 74304.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 2, 2}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 73792.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 256, 1, 1}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 73744.},
/* GFLOPS 0.000 x 3 = 0.000 */ {{1, 1}, {{1, 20, 1, 1}}, 480, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 19680.},
/* GFLOPS 0.000 x 3 = 0.000 */ {{1, 1}, {{1, 480, 1, 1}}, 20, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 19220.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 49248.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 49248.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 128, 1, 1}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 36880.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 1, 1}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 32382.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 128, 1, 1}}, 8, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 18440.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 64, 1, 1}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 16512.},
/* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 10, 1, 1}}, 240, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 5040.},
/* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 240, 1, 1}}, 10, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4810.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 1, 1}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6168.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 1, 1}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 6168.},
/* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 6, 1, 1}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1872.},
/* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 144, 1, 1}}, 6, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1734.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 4, 1, 1}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 864.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 96, 1, 1}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 772.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 8, 1, 1}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 544.},
/* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 32, 1, 1}}, 8, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 520.}
};
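// Maps a flat GTest parameter index onto testConvolutionConfigs above; CONV_LAST is derived from the table size, while all() only sweeps the first CONV_100 entries by default.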
struct ConvParamID
{
enum {
CONV_0 = 0,
CONV_100 = 100,
CONV_LAST = sizeof(testConvolutionConfigs) / sizeof(testConvolutionConfigs[0])
};
int val_;
ConvParamID(int val = 0) : val_(val) {}
operator int() const { return val_; }
static ::testing::internal::ParamGenerator<ConvParamID> all()
{
#if 0
enum { NUM = (int)CONV_LAST };
#else
enum { NUM = (int)CONV_100 };
#endif
ConvParamID v_[NUM]; for (int i = 0; i < NUM; ++i) { v_[i] = ConvParamID(i); } // reduce generated code size
return ::testing::ValuesIn(v_, v_ + NUM);
}
};
static inline void PrintTo(const ConvParamID& v, std::ostream* os)
{
CV_Assert((int)v >= 0); CV_Assert((int)v < ConvParamID::CONV_LAST);
const ConvParam_t& p = testConvolutionConfigs[(int)v];
*os << "GFLOPS=" << cv::format("%.3f", p.declared_flops * 1e-9)
<< ", K=" << (Size)p.kernel
<< ", IN={" << p.shapeIn.dims[0] << ", " << p.shapeIn.dims[1] << ", " << p.shapeIn.dims[2] << ", " << p.shapeIn.dims[3] << "}"
<< ", OCN=" << p.outCN;
if (p.groups > 1)
*os << ", G=" << p.groups;
if (((Size)p.stride).area() != 1)
*os << ", S=" << ((Size)p.stride);
if (((Size)p.dilation).area() != 1)
*os << ", D=" << ((Size)p.dilation);
if (!((Size)p.pad).empty())
*os << ", P=" << ((Size)p.pad);
if (!((Size)p.padAdjust).empty())
*os << ", PAdj=" << ((Size)p.padAdjust);
if (!((std::string)p.padMode).empty())
*os << ", PM=" << ((std::string)p.padMode);
if (p.hasBias)
*os << ", BIAS";
}
typedef tuple<ConvParamID, tuple<Backend, Target> > ConvTestParam_t;
typedef TestBaseWithParam<ConvTestParam_t> Conv;
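// Builds a single-layer Convolution network from the selected config, does one warmup forward pass, then times net.forward() inside TEST_CYCLE().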
PERF_TEST_P_(Conv, conv)
{
int test_id = (int)get<0>(GetParam());
ASSERT_GE(test_id, 0); ASSERT_LT(test_id, ConvParamID::CONV_LAST);
const ConvParam_t& params = testConvolutionConfigs[test_id];
double declared_flops = params.declared_flops;
Size kernel = params.kernel;
MatShape inputShape = MatShape(params.shapeIn.dims, params.shapeIn.dims + 4);
int outChannels = params.outCN;
int groups = params.groups;
Size stride = params.stride;
Size dilation = params.dilation;
Size pad = params.pad;
Size padAdjust = params.padAdjust;
std::string padMode(params.padMode);
bool hasBias = params.hasBias;
Backend backendId = get<0>(get<1>(GetParam()));
Target targetId = get<1>(get<1>(GetParam()));
int inChannels = inputShape[1];
Size inSize(inputShape[3], inputShape[2]);
int sz[] = {outChannels, inChannels / groups, kernel.height, kernel.width};
Mat weights(4, &sz[0], CV_32F);
randu(weights, -1.0f, 1.0f);
LayerParams lp;
lp.set("kernel_w", kernel.width);
lp.set("kernel_h", kernel.height);
lp.set("pad_w", pad.width);
lp.set("pad_h", pad.height);
if (padAdjust.width > 0 || padAdjust.height > 0)
{
lp.set("adj_w", padAdjust.width);
lp.set("adj_h", padAdjust.height);
}
if (!padMode.empty())
lp.set("pad_mode", padMode);
lp.set("stride_w", stride.width);
lp.set("stride_h", stride.height);
lp.set("dilation_w", dilation.width);
lp.set("dilation_h", dilation.height);
lp.set("num_output", outChannels);
lp.set("group", groups);
lp.set("bias_term", hasBias);
lp.type = "Convolution";
lp.name = "testLayer";
lp.blobs.push_back(weights);
if (hasBias)
{
Mat bias(1, outChannels, CV_32F);
randu(bias, -1.0f, 1.0f);
lp.blobs.push_back(bias);
}
int inpSz[] = {1, inChannels, inSize.height, inSize.width};
Mat input(4, &inpSz[0], CV_32F);
randu(input, -1.0f, 1.0f);
Net net;
net.addLayerToPrev(lp.name, lp.type, lp);
net.setInput(input);
net.setPreferableBackend(backendId);
net.setPreferableTarget(targetId);
// warmup
Mat output = net.forward();
MatShape netInputShape = shape(input);
size_t weightsMemory = 0, blobsMemory = 0;
net.getMemoryConsumption(netInputShape, weightsMemory, blobsMemory);
int64 flops = net.getFLOPS(netInputShape);
CV_Assert(flops > 0);
std::cout
<< "IN=" << divUp(input.total() * input.elemSize(), 1u<<10) << " Kb " << netInputShape
<< " OUT=" << divUp(output.total() * output.elemSize(), 1u<<10) << " Kb " << shape(output)
<< " Weights(parameters): " << divUp(weightsMemory, 1u<<10) << " Kb"
<< " MFLOPS=" << flops * 1e-6 << std::endl;
TEST_CYCLE()
{
Mat res = net.forward();
}
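// FLOPS reported by getFLOPS() must match the value declared in the config table (relative tolerance 1e-6).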
EXPECT_NEAR(flops, declared_flops, declared_flops * 1e-6);
SANITY_CHECK_NOTHING();
}
INSTANTIATE_TEST_CASE_P(/**/, Conv, Combine(
ConvParamID::all(),
dnnBackendsAndTargets(false, false) // defined in ../test/test_common.hpp
));
} // namespace

View File

@@ -0,0 +1,163 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "perf_precomp.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace opencv_test {
struct Conv1DParam_t {
int kernel;
struct BlobShape { int dims[3]; } shapeIn;
int outCN;
int groups;
int stride;
int dilation;
int pad[2];
const char* padMode;
bool hasBias;
double declared_flops;
};
// Details: #12142
static const Conv1DParam_t testConvolution1DConfigs[] = {
{3, {{1, 6, 10}}, 6, 1, 1, 1, {0, 0}, "VALID", true, 1776.},
{3, {{1, 2, 19}}, 2, 2, 2, 1, {1, 1}, "", true, 260.},
{3, {{1, 2, 25}}, 2, 2, 1, 1, {2, 2}, "SAME", false, 650.},
};
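// Flat index type over testConvolution1DConfigs; unlike the 2D table, every entry is instantiated.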
struct Conv1DParamID
{
enum {
CONV_0 = 0,
CONV_LAST = sizeof(testConvolution1DConfigs) / sizeof(testConvolution1DConfigs[0])
};
int val_;
Conv1DParamID(int val = 0) : val_(val) {}
operator int() const { return val_; }
static ::testing::internal::ParamGenerator<Conv1DParamID> all()
{
enum { NUM = (int)CONV_LAST };
Conv1DParamID v_[NUM]; for (int i = 0; i < NUM; ++i) { v_[i] = Conv1DParamID(i); } // reduce generated code size
return ::testing::ValuesIn(v_, v_ + NUM);
}
};
static inline void PrintTo(const Conv1DParamID& v, std::ostream* os)
{
CV_Assert((int)v >= 0); CV_Assert((int)v < Conv1DParamID::CONV_LAST);
const Conv1DParam_t& p = testConvolution1DConfigs[(int)v];
*os << "GFLOPS=" << cv::format("%.3f", p.declared_flops * 1e-9)
<< ", K=[" << p.kernel << "]"
<< ", IN={" << p.shapeIn.dims[0] << ", " << p.shapeIn.dims[1] << ", " << p.shapeIn.dims[2] << "}"
<< ", OCN=" << p.outCN;
if (p.groups > 1)
*os << ", G=" << p.groups;
if (p.stride != 1)
*os << ", S=" << p.stride;
if (p.dilation != 1)
*os << ", D=" << p.dilation;
if (p.pad[0] != 0 && p.pad[1] != 0)
*os << ", P=(" << p.pad[0] << ", " << p.pad[1] << ")";
if (!((std::string)p.padMode).empty())
*os << ", PM=" << ((std::string)p.padMode);
if (p.hasBias)
*os << ", BIAS";
}
typedef tuple<Conv1DParamID, tuple<Backend, Target> > Conv1DTestParam_t;
typedef TestBaseWithParam<Conv1DTestParam_t> Conv1D;
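// Same scheme as the 2D benchmark: a one-layer 1D convolution is built from LayerParams and timed; non-CPU targets are skipped.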
PERF_TEST_P_(Conv1D, conv1d)
{
int test_id = (int)get<0>(GetParam());
ASSERT_GE(test_id, 0); ASSERT_LT(test_id, Conv1DParamID::CONV_LAST);
const Conv1DParam_t& params = testConvolution1DConfigs[test_id];
double declared_flops = params.declared_flops;
DictValue kernel = DictValue::arrayInt(&params.kernel, 1);
DictValue stride = DictValue::arrayInt(&params.stride, 1);
DictValue pad = DictValue::arrayInt(&params.pad[0], 2);
DictValue dilation = DictValue::arrayInt(&params.dilation, 1);
MatShape inputShape = MatShape(params.shapeIn.dims, params.shapeIn.dims + 3);
int outChannels = params.outCN;
int groups = params.groups;
std::string padMode(params.padMode);
bool hasBias = params.hasBias;
Backend backendId = get<0>(get<1>(GetParam()));
Target targetId = get<1>(get<1>(GetParam()));
if (targetId != DNN_TARGET_CPU)
throw SkipTestException("Only CPU is supported");
int inChannels = inputShape[1];
int sz[] = {outChannels, inChannels / groups, params.kernel};
Mat weights(3, &sz[0], CV_32F);
randu(weights, -1.0f, 1.0f);
LayerParams lp;
lp.set("kernel_size", kernel);
lp.set("pad", pad);
if (!padMode.empty())
lp.set("pad_mode", padMode);
lp.set("stride", stride);
lp.set("dilation", dilation);
lp.set("num_output", outChannels);
lp.set("group", groups);
lp.set("bias_term", hasBias);
lp.type = "Convolution";
lp.name = "testLayer";
lp.blobs.push_back(weights);
if (hasBias)
{
Mat bias(1, outChannels, CV_32F);
randu(bias, -1.0f, 1.0f);
lp.blobs.push_back(bias);
}
int inpSz[] = {1, inChannels, inputShape[2]};
Mat input(3, &inpSz[0], CV_32F);
randu(input, -1.0f, 1.0f);
Net net;
net.addLayerToPrev(lp.name, lp.type, lp);
net.setInput(input);
net.setPreferableBackend(backendId);
net.setPreferableTarget(targetId);
// warmup
Mat output = net.forward();
MatShape netInputShape = shape(input);
size_t weightsMemory = 0, blobsMemory = 0;
net.getMemoryConsumption(netInputShape, weightsMemory, blobsMemory);
int64 flops = net.getFLOPS(netInputShape);
CV_Assert(flops > 0);
std::cout
<< "IN=" << divUp(input.total() * input.elemSize(), 1u<<10) << " Kb " << netInputShape
<< " OUT=" << divUp(output.total() * output.elemSize(), 1u<<10) << " Kb " << shape(output)
<< " Weights(parameters): " << divUp(weightsMemory, 1u<<10) << " Kb"
<< " MFLOPS=" << flops * 1e-6 << std::endl;
TEST_CYCLE()
{
Mat res = net.forward();
}
EXPECT_NEAR(flops, declared_flops, declared_flops * 1e-6);
SANITY_CHECK_NOTHING();
}
INSTANTIATE_TEST_CASE_P(/**/, Conv1D, Combine(
Conv1DParamID::all(),
dnnBackendsAndTargets(false, false) // defined in ../test/test_common.hpp
));
} // namespace

View File

@@ -0,0 +1,182 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "perf_precomp.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace opencv_test {
struct Conv3DParam_t {
int kernel[3];
struct BlobShape { int dims[5]; } shapeIn;
int outCN;
int groups;
int stride[3];
int dilation[3];
int pad[6];
const char* padMode;
bool hasBias;
double declared_flops;
};
// Details: #12142
static const Conv3DParam_t testConvolution3DConfigs[] = {
{{3, 3, 3}, {{1, 6, 10, 38, 50}}, 6, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}, "VALID", true, 26956800.},
{{3, 3, 3}, {{1, 2, 19, 19, 19}}, 2, 2, {2, 2, 2}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}, "", true, 218000.},
{{3, 3, 3}, {{1, 2, 25, 19, 19}}, 2, 2, {1, 2, 2}, {1, 1, 1}, {2, 2, 2, 2, 2, 2}, "SAME", false, 545000.},
{{3, 3, 3}, {{1, 11, 9, 150, 200}}, 11, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}, "VALID", true, 1342562760.},
{{3, 3, 3}, {{1, 10, 98, 10, 10}}, 10, 1, {1, 1, 1}, {1, 1, 1}, {1, 0, 1, 1, 0, 1}, "SAME", false, 53018000.},
{{5, 5, 5}, {{1, 6, 19, 19, 19}}, 6, 2, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}, "", false, 30395250.},
{{5, 5, 5}, {{1, 4, 50, 19, 19}}, 4, 1, {2, 2, 2}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}, "VALID", false, 5893888.},
{{5, 5, 5}, {{1, 3, 75, 75, 100}}, 3, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}, "SAME", true, 1267312500.},
{{5, 5, 5}, {{1, 2, 21, 75, 100}}, 2, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}, "", true, 116103744.},
{{5, 5, 5}, {{1, 4, 40, 75, 75}}, 4, 1, {2, 2, 2}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}, "", false, 93405312.},
{{7, 7, 7}, {{1, 6, 15, 19, 19}}, 6, 1, {2, 1, 1}, {1, 1, 1}, {3, 3, 3, 3, 3, 3}, "SAME", true, 71339376.},
{{7, 7, 7}, {{1, 2, 38, 38, 38}}, 2, 1, {1, 2, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}, "", false, 44990464.},
{{1, 1, 1}, {{1, 4, 9, 10, 10}}, 4, 1, {1, 1, 2}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}, "VALID", false, 16200.},
{{3, 1, 4}, {{1, 14, 5, 10, 10}}, 14, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}, "SAME", false, 2359000.},
{{1, 1, 1}, {{1, 8, 1, 10, 10}}, 8, 8, {1, 1, 1}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}, "", true, 58752.},
{{3, 4, 2}, {{1, 4, 8, 10, 10}}, 4, 4, {1, 2, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}, "", true, 166752.}
};
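// Flat index type over testConvolution3DConfigs; CONV_100 equals the table size here, so all 16 configurations are benchmarked.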
struct Conv3DParamID
{
enum {
CONV_0 = 0,
CONV_100 = 16,
CONV_LAST = sizeof(testConvolution3DConfigs) / sizeof(testConvolution3DConfigs[0])
};
int val_;
Conv3DParamID(int val = 0) : val_(val) {}
operator int() const { return val_; }
static ::testing::internal::ParamGenerator<Conv3DParamID> all()
{
#if 0
enum { NUM = (int)CONV_LAST };
#else
enum { NUM = (int)CONV_100 };
#endif
Conv3DParamID v_[NUM]; for (int i = 0; i < NUM; ++i) { v_[i] = Conv3DParamID(i); } // reduce generated code size
return ::testing::ValuesIn(v_, v_ + NUM);
}
};
static inline void PrintTo(const Conv3DParamID& v, std::ostream* os)
{
CV_Assert((int)v >= 0); CV_Assert((int)v < Conv3DParamID::CONV_LAST);
const Conv3DParam_t& p = testConvolution3DConfigs[(int)v];
*os << "GFLOPS=" << cv::format("%.3f", p.declared_flops * 1e-9)
<< ", K=[" << p.kernel[0] << " x " << p.kernel[1] << " x " << p.kernel[2] << "]"
<< ", IN={" << p.shapeIn.dims[0] << ", " << p.shapeIn.dims[1] << ", " << p.shapeIn.dims[2] << ", " << p.shapeIn.dims[3] << ", " << p.shapeIn.dims[4] << "}"
<< ", OCN=" << p.outCN;
if (p.groups > 1)
*os << ", G=" << p.groups;
if (p.stride[0] * p.stride[1] * p.stride[2] != 1)
*os << ", S=[" << p.stride[0] << " x " << p.stride[1] << " x " << p.stride[2] << "]";
if (p.dilation[0] * p.dilation[1] * p.dilation[2] != 1)
*os << ", D=[" << p.dilation[0] << " x " << p.dilation[1] << " x " << p.dilation[2] << "]";
if (p.pad[0] != 0 && p.pad[1] != 0 && p.pad[2] != 0 &&
p.pad[3] != 0 && p.pad[4] != 0 && p.pad[5] != 0)
*os << ", P=(" << p.pad[0] << ", " << p.pad[3] << ") x ("
<< p.pad[1] << ", " << p.pad[4] << ") x ("
<< p.pad[2] << ", " << p.pad[5] << ")";
if (!((std::string)p.padMode).empty())
*os << ", PM=" << ((std::string)p.padMode);
if (p.hasBias)
*os << ", BIAS";
}
typedef tuple<Conv3DParamID, tuple<Backend, Target> > Conv3DTestParam_t;
typedef TestBaseWithParam<Conv3DTestParam_t> Conv3D;
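// Builds a one-layer 3D convolution from the selected config; only the CPU target and the CUDA backend are exercised.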
PERF_TEST_P_(Conv3D, conv3d)
{
int test_id = (int)get<0>(GetParam());
ASSERT_GE(test_id, 0); ASSERT_LT(test_id, Conv3DParamID::CONV_LAST);
const Conv3DParam_t& params = testConvolution3DConfigs[test_id];
double declared_flops = params.declared_flops;
DictValue kernel = DictValue::arrayInt(&params.kernel[0], 3);
DictValue stride = DictValue::arrayInt(&params.stride[0], 3);
DictValue pad = DictValue::arrayInt(&params.pad[0], 6);
DictValue dilation = DictValue::arrayInt(&params.dilation[0], 3);
MatShape inputShape = MatShape(params.shapeIn.dims, params.shapeIn.dims + 5);
int outChannels = params.outCN;
int groups = params.groups;
std::string padMode(params.padMode);
bool hasBias = params.hasBias;
Backend backendId = get<0>(get<1>(GetParam()));
Target targetId = get<1>(get<1>(GetParam()));
if (targetId != DNN_TARGET_CPU && backendId != DNN_BACKEND_CUDA)
throw SkipTestException("Only CPU and CUDA is supported");
int inChannels = inputShape[1];
int sz[] = {outChannels, inChannels / groups, params.kernel[0], params.kernel[1], params.kernel[2]};
Mat weights(5, &sz[0], CV_32F);
randu(weights, -1.0f, 1.0f);
LayerParams lp;
lp.set("kernel_size", kernel);
lp.set("pad", pad);
if (!padMode.empty())
lp.set("pad_mode", padMode);
lp.set("stride", stride);
lp.set("dilation", dilation);
lp.set("num_output", outChannels);
lp.set("group", groups);
lp.set("bias_term", hasBias);
lp.type = "Convolution";
lp.name = "testLayer";
lp.blobs.push_back(weights);
if (hasBias)
{
Mat bias(1, outChannels, CV_32F);
randu(bias, -1.0f, 1.0f);
lp.blobs.push_back(bias);
}
int inpSz[] = {1, inChannels, inputShape[2], inputShape[3], inputShape[4]};
Mat input(5, &inpSz[0], CV_32F);
randu(input, -1.0f, 1.0f);
Net net;
net.addLayerToPrev(lp.name, lp.type, lp);
net.setInput(input);
net.setPreferableBackend(backendId);
net.setPreferableTarget(targetId);
Mat output = net.forward();
MatShape netInputShape = shape(input);
size_t weightsMemory = 0, blobsMemory = 0;
net.getMemoryConsumption(netInputShape, weightsMemory, blobsMemory);
int64 flops = net.getFLOPS(netInputShape);
CV_Assert(flops > 0);
std::cout
<< "IN=" << divUp(input.total() * input.elemSize(), 1u<<10) << " Kb " << netInputShape
<< " OUT=" << divUp(output.total() * output.elemSize(), 1u<<10) << " Kb " << shape(output)
<< " Weights(parameters): " << divUp(weightsMemory, 1u<<10) << " Kb"
<< " MFLOPS=" << flops * 1e-6 << std::endl;
TEST_CYCLE()
{
Mat res = net.forward();
}
EXPECT_NEAR(flops, declared_flops, declared_flops * 1e-6);
SANITY_CHECK_NOTHING();
}
INSTANTIATE_TEST_CASE_P(/**/, Conv3D, Combine(
Conv3DParamID::all(),
dnnBackendsAndTargets(false, false) // defined in ../test/test_common.hpp
));
} // namespace

View File

@@ -0,0 +1,95 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "perf_precomp.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace opencv_test {
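// Benchmarks the Slice layer: the input blob is filled with a deterministic pattern, cropped by the begin/end ranges, and net.forward() is timed.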
struct Layer_Slice : public TestBaseWithParam<tuple<Backend, Target> >
{
template<int DIMS>
void test_slice(const int* inputShape, const int* begin, const int* end)
{
int backendId = get<0>(GetParam());
int targetId = get<1>(GetParam());
Mat input(DIMS, inputShape, CV_32FC1, Scalar::all(0));
for (int i = 0; i < (int)input.total(); ++i)
input.ptr<float>()[i] = (float)(i & 4095);
std::vector<Range> range(DIMS);
for (int i = 0; i < DIMS; ++i)
range[i] = Range(begin[i], end[i]);
Net net;
LayerParams lp;
lp.type = "Slice";
lp.name = "testLayer";
lp.set("begin", DictValue::arrayInt<int*>((int*)&begin[0], DIMS));
lp.set("end", DictValue::arrayInt<int*>((int*)&end[0], DIMS));
net.addLayerToPrev(lp.name, lp.type, lp);
// warmup
{
net.setInput(input);
net.setPreferableBackend(backendId);
net.setPreferableTarget(targetId);
Mat out = net.forward();
EXPECT_GT(cv::norm(out, NORM_INF), 0);
#if 0
//normAssert(out, input(range));
cout << input(range).clone().reshape(1, 1) << endl;
cout << out.reshape(1, 1) << endl;
#endif
}
TEST_CYCLE()
{
Mat res = net.forward();
}
SANITY_CHECK_NOTHING();
}
};
PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_1)
{
const int inputShape[4] = {1, 64, 104, 104};
const int begin[] = {0, 32, 0, 0};
const int end[] = {1, 64, 104, 104};
test_slice<4>(inputShape, begin, end);
}
PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_2)
{
const int inputShape[4] = {1, 128, 52, 52};
const int begin[] = {0, 64, 0, 0};
const int end[] = {1, 128, 52, 52};
test_slice<4>(inputShape, begin, end);
}
PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_3)
{
const int inputShape[4] = {1, 256, 26, 26};
const int begin[] = {0, 128, 0, 0};
const int end[] = {1, 256, 26, 26};
test_slice<4>(inputShape, begin, end);
}
PERF_TEST_P_(Layer_Slice, FastNeuralStyle_eccv16)
{
const int inputShape[4] = {1, 128, 80, 100};
const int begin[] = {0, 0, 2, 2};
const int end[] = {1, 128, 76, 96};
test_slice<4>(inputShape, begin, end);
}
INSTANTIATE_TEST_CASE_P(/**/, Layer_Slice, dnnBackendsAndTargets(false, false));
} // namespace

View File

@@ -0,0 +1,16 @@
#include "perf_precomp.hpp"
static const char* extraTestDataPath =
#ifdef WINRT
NULL;
#else
getenv("OPENCV_DNN_TEST_DATA_PATH");
#endif
#if defined(HAVE_HPX)
#include <hpx/hpx_main.hpp>
#endif
CV_PERF_TEST_MAIN(dnn,
extraTestDataPath ? (void)cvtest::addDataSearchPath(extraTestDataPath) : (void)0
)

View File

@@ -0,0 +1,305 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "perf_precomp.hpp"
#include "opencv2/core/ocl.hpp"
#include "opencv2/dnn/shape_utils.hpp"
#include "../test/test_common.hpp"
namespace opencv_test {
class DNNTestNetwork : public ::perf::TestBaseWithParam< tuple<Backend, Target> >
{
public:
dnn::Backend backend;
dnn::Target target;
dnn::Net net;
DNNTestNetwork()
{
backend = (dnn::Backend)(int)get<0>(GetParam());
target = (dnn::Target)(int)get<1>(GetParam());
}
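// Loads the model, feeds a random blob of the given size, prints memory consumption and FLOPS statistics, then samples a full forward pass.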
void processNet(std::string weights, std::string proto, std::string halide_scheduler,
const Mat& input, const std::string& outputLayer = "")
{
randu(input, 0.0f, 1.0f);
weights = findDataFile(weights, false);
if (!proto.empty())
proto = findDataFile(proto);
if (backend == DNN_BACKEND_HALIDE)
{
if (halide_scheduler == "disabled")
throw cvtest::SkipTestException("Halide test is disabled");
if (!halide_scheduler.empty())
halide_scheduler = findDataFile(std::string("dnn/halide_scheduler_") + (target == DNN_TARGET_OPENCL ? "opencl_" : "") + halide_scheduler, true);
}
net = readNet(proto, weights);
net.setInput(blobFromImage(input, 1.0, Size(), Scalar(), false));
net.setPreferableBackend(backend);
net.setPreferableTarget(target);
if (backend == DNN_BACKEND_HALIDE)
{
net.setHalideScheduler(halide_scheduler);
}
MatShape netInputShape = shape(1, 3, input.rows, input.cols);
size_t weightsMemory = 0, blobsMemory = 0;
net.getMemoryConsumption(netInputShape, weightsMemory, blobsMemory);
int64 flops = net.getFLOPS(netInputShape);
CV_Assert(flops > 0);
net.forward(outputLayer); // warmup
std::cout << "Memory consumption:" << std::endl;
std::cout << " Weights(parameters): " << divUp(weightsMemory, 1u<<20) << " Mb" << std::endl;
std::cout << " Blobs: " << divUp(blobsMemory, 1u<<20) << " Mb" << std::endl;
std::cout << "Calculation complexity: " << flops * 1e-9 << " GFlops" << std::endl;
PERF_SAMPLE_BEGIN()
net.forward();
PERF_SAMPLE_END()
SANITY_CHECK_NOTHING();
}
};
PERF_TEST_P_(DNNTestNetwork, AlexNet)
{
processNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt",
"alexnet.yml", Mat(cv::Size(227, 227), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, GoogLeNet)
{
processNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt",
"", Mat(cv::Size(224, 224), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, ResNet_50)
{
processNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt",
"resnet_50.yml", Mat(cv::Size(224, 224), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, SqueezeNet_v1_1)
{
processNet("dnn/squeezenet_v1.1.caffemodel", "dnn/squeezenet_v1.1.prototxt",
"squeezenet_v1_1.yml", Mat(cv::Size(227, 227), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, Inception_5h)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) throw SkipTestException("");
processNet("dnn/tensorflow_inception_graph.pb", "",
"inception_5h.yml",
Mat(cv::Size(224, 224), CV_32FC3), "softmax2");
}
PERF_TEST_P_(DNNTestNetwork, ENet)
{
if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target != DNN_TARGET_CPU) ||
(backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException("");
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2021010000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
throw SkipTestException("");
#endif
processNet("dnn/Enet-model-best.net", "", "enet.yml",
Mat(cv::Size(512, 256), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, SSD)
{
processNet("dnn/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel", "dnn/ssd_vgg16.prototxt", "disabled",
Mat(cv::Size(300, 300), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, OpenFace)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018050000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_HDDL))
throw SkipTestException("");
#endif
processNet("dnn/openface_nn4.small2.v1.t7", "", "",
Mat(cv::Size(96, 96), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_Caffe)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", "ssd_mobilenet_v1_coco_2017_11_17.pbtxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/ssd_mobilenet_v2_coco_2018_03_29.pb", "ssd_mobilenet_v2_coco_2018_03_29.pbtxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, DenseNet_121)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", "",
Mat(cv::Size(224, 224), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, OpenPose_pose_mpi_faster_4_stages)
{
if (backend == DNN_BACKEND_HALIDE ||
(backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_HDDL)))
throw SkipTestException("");
// The same .caffemodel but modified .prototxt
// See https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/pose/poseParameters.cpp
processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi_faster_4_stages.prototxt", "",
Mat(cv::Size(368, 368), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, opencv_face_detector)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/opencv_face_detector.caffemodel", "dnn/opencv_face_detector.prototxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "ssd_inception_v2_coco_2017_11_17.pbtxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, YOLOv3)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000) // nGraph compilation failure
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
throw SkipTestException("Test is disabled in OpenVINO 2020.4");
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
throw SkipTestException("Test is disabled in OpenVINO 2020.4");
#endif
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2021010000) // nGraph compilation failure
if (target == DNN_TARGET_MYRIAD)
throw SkipTestException("");
#endif
Mat sample = imread(findDataFile("dnn/dog416.png"));
cvtColor(sample, sample, COLOR_BGR2RGB);
Mat inp;
sample.convertTo(inp, CV_32FC3, 1.0f / 255, 0);
processNet("dnn/yolov3.weights", "dnn/yolov3.cfg", "", inp);
}
PERF_TEST_P_(DNNTestNetwork, YOLOv4)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
if (target == DNN_TARGET_MYRIAD) // not enough resources
throw SkipTestException("");
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000) // nGraph compilation failure
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
throw SkipTestException("Test is disabled in OpenVINO 2020.4");
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
throw SkipTestException("Test is disabled in OpenVINO 2020.4");
#endif
Mat sample = imread(findDataFile("dnn/dog416.png"));
cvtColor(sample, sample, COLOR_BGR2RGB);
Mat inp;
sample.convertTo(inp, CV_32FC3, 1.0f / 255, 0);
processNet("dnn/yolov4.weights", "dnn/yolov4.cfg", "", inp);
}
PERF_TEST_P_(DNNTestNetwork, YOLOv4_tiny)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2021010000) // nGraph compilation failure
if (target == DNN_TARGET_MYRIAD)
throw SkipTestException("");
#endif
Mat sample = imread(findDataFile("dnn/dog416.png"));
cvtColor(sample, sample, COLOR_BGR2RGB);
Mat inp;
sample.convertTo(inp, CV_32FC3, 1.0f / 255, 0);
processNet("dnn/yolov4-tiny.weights", "dnn/yolov4-tiny.cfg", "", inp);
}
PERF_TEST_P_(DNNTestNetwork, EAST_text_detection)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/frozen_east_text_detection.pb", "", "", Mat(cv::Size(320, 320), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, FastNeuralStyle_eccv16)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", "", Mat(cv::Size(320, 240), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, Inception_v2_Faster_RCNN)
{
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019010000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
throw SkipTestException("Test is disabled in OpenVINO 2019R1");
#endif
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019020000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
throw SkipTestException("Test is disabled in OpenVINO 2019R2");
#endif
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2021010000)
if (target == DNN_TARGET_MYRIAD)
throw SkipTestException("Test is disabled in OpenVINO 2021.1+ / MYRIAD");
#endif
if (backend == DNN_BACKEND_HALIDE ||
(backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target != DNN_TARGET_CPU) ||
(backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
throw SkipTestException("");
processNet("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pb",
"dnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt", "",
Mat(cv::Size(800, 600), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, EfficientDet)
{
if (backend == DNN_BACKEND_HALIDE || target != DNN_TARGET_CPU)
throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/dog416.png"));
resize(sample, sample, Size(512, 512));
Mat inp;
sample.convertTo(inp, CV_32FC3, 1.0/255);
processNet("dnn/efficientdet-d0.pb", "dnn/efficientdet-d0.pbtxt", "", inp);
}
INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets());
} // namespace

View File

@@ -0,0 +1,14 @@
#ifndef __OPENCV_PERF_PRECOMP_HPP__
#define __OPENCV_PERF_PRECOMP_HPP__
#include <opencv2/ts.hpp>
#include <opencv2/dnn.hpp>
#include "../test/test_common.hpp"
namespace opencv_test {
using namespace perf;
using namespace cv::dnn;
} // namespace
#endif

View File

@@ -0,0 +1,593 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../precomp.hpp"
#ifdef HAVE_PROTOBUF
#include <iostream>
#include <fstream>
#include <sstream>
#include <algorithm>
#include <google/protobuf/message.h>
#include <google/protobuf/text_format.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include "caffe_io.hpp"
#endif
namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN
#ifdef HAVE_PROTOBUF
using ::google::protobuf::RepeatedField;
using ::google::protobuf::RepeatedPtrField;
using ::google::protobuf::Message;
using ::google::protobuf::Descriptor;
using ::google::protobuf::FieldDescriptor;
using ::google::protobuf::Reflection;
namespace
{
template<typename T>
static cv::String toString(const T &v)
{
std::ostringstream ss;
ss << v;
return ss.str();
}
static inline
MatShape parseBlobShape(const caffe::BlobShape& _input_shape)
{
MatShape shape;
for (int i = 0; i < _input_shape.dim_size(); i++)
{
shape.push_back((int)_input_shape.dim(i));
}
return shape;
}
class CaffeImporter
{
caffe::NetParameter net;
caffe::NetParameter netBinary;
public:
CaffeImporter(const char *prototxt, const char *caffeModel)
{
CV_TRACE_FUNCTION();
ReadNetParamsFromTextFileOrDie(prototxt, &net);
if (caffeModel && caffeModel[0])
ReadNetParamsFromBinaryFileOrDie(caffeModel, &netBinary);
}
CaffeImporter(const char *dataProto, size_t lenProto,
const char *dataModel, size_t lenModel)
{
CV_TRACE_FUNCTION();
ReadNetParamsFromTextBufferOrDie(dataProto, lenProto, &net);
if (dataModel != NULL && lenModel > 0)
ReadNetParamsFromBinaryBufferOrDie(dataModel, lenModel, &netBinary);
}
void extractCustomParams(const google::protobuf::UnknownFieldSet& unknownFields, cv::dnn::LayerParams &params)
{
const int numFields = unknownFields.field_count();
for (int i = 0; i < numFields; ++i)
{
const google::protobuf::UnknownField& field = unknownFields.field(i);
CV_Assert(field.type() == google::protobuf::UnknownField::TYPE_GROUP);
std::string fieldName = field.group().field(0).length_delimited();
std::string fieldValue = field.group().field(1).length_delimited();
params.set(fieldName, fieldValue);
}
}
void addParam(const Message &msg, const FieldDescriptor *field, cv::dnn::LayerParams &params)
{
const Reflection *refl = msg.GetReflection();
int type = field->cpp_type();
bool isRepeated = field->is_repeated();
const std::string &name = field->name();
#define SET_UP_FILED(getter, arrayConstr, gtype) \
if (isRepeated) { \
const RepeatedField<gtype> &v = refl->GetRepeatedField<gtype>(msg, field); \
params.set(name, DictValue::arrayConstr(v.begin(), (int)v.size())); \
} \
else { \
params.set(name, refl->getter(msg, field)); \
}
switch (type)
{
case FieldDescriptor::CPPTYPE_INT32:
SET_UP_FILED(GetInt32, arrayInt, ::google::protobuf::int32);
break;
case FieldDescriptor::CPPTYPE_UINT32:
SET_UP_FILED(GetUInt32, arrayInt, ::google::protobuf::uint32);
break;
case FieldDescriptor::CPPTYPE_INT64:
SET_UP_FILED(GetInt64, arrayInt, ::google::protobuf::int64);
break;
case FieldDescriptor::CPPTYPE_UINT64:
SET_UP_FILED(GetUInt64, arrayInt, ::google::protobuf::uint64);
break;
case FieldDescriptor::CPPTYPE_BOOL:
SET_UP_FILED(GetBool, arrayInt, bool);
break;
case FieldDescriptor::CPPTYPE_DOUBLE:
SET_UP_FILED(GetDouble, arrayReal, double);
break;
case FieldDescriptor::CPPTYPE_FLOAT:
SET_UP_FILED(GetFloat, arrayReal, float);
break;
case FieldDescriptor::CPPTYPE_STRING:
if (isRepeated) {
const RepeatedPtrField<std::string> &v = refl->GetRepeatedPtrField<std::string>(msg, field);
params.set(name, DictValue::arrayString(v.begin(), (int)v.size()));
}
else {
params.set(name, refl->GetString(msg, field));
}
break;
case FieldDescriptor::CPPTYPE_ENUM:
if (isRepeated) {
int size = refl->FieldSize(msg, field);
std::vector<cv::String> buf(size);
for (int i = 0; i < size; i++)
buf[i] = refl->GetRepeatedEnum(msg, field, i)->name();
params.set(name, DictValue::arrayString(buf.begin(), size));
}
else {
params.set(name, refl->GetEnum(msg, field)->name());
}
break;
default:
CV_Error(Error::StsError, "Unknown type \"" + String(field->type_name()) + "\" in prototxt");
break;
}
}
inline static bool ends_with_param(const std::string &str)
{
static const std::string _param("_param");
return (str.size() >= _param.size()) && str.compare(str.size() - _param.size(), _param.size(), _param) == 0;
}
void extractLayerParams(const Message &msg, cv::dnn::LayerParams &params, bool isInternal = false)
{
const Descriptor *msgDesc = msg.GetDescriptor();
const Reflection *msgRefl = msg.GetReflection();
for (int fieldId = 0; fieldId < msgDesc->field_count(); fieldId++)
{
const FieldDescriptor *fd = msgDesc->field(fieldId);
if (!isInternal && !ends_with_param(fd->name()))
continue;
const google::protobuf::UnknownFieldSet& unknownFields = msgRefl->GetUnknownFields(msg);
bool hasData = fd->is_required() ||
(fd->is_optional() && msgRefl->HasField(msg, fd)) ||
(fd->is_repeated() && msgRefl->FieldSize(msg, fd) > 0) ||
!unknownFields.empty();
if (!hasData)
continue;
extractCustomParams(unknownFields, params);
if (fd->cpp_type() == FieldDescriptor::CPPTYPE_MESSAGE)
{
if (fd->is_repeated()) //Extract only first item!
extractLayerParams(msgRefl->GetRepeatedMessage(msg, fd, 0), params, true);
else
extractLayerParams(msgRefl->GetMessage(msg, fd), params, true);
}
else
{
addParam(msg, fd, params);
}
}
}
void blobShapeFromProto(const caffe::BlobProto &pbBlob, MatShape& shape)
{
shape.clear();
if (pbBlob.has_num() || pbBlob.has_channels() || pbBlob.has_height() || pbBlob.has_width())
{
shape.push_back(pbBlob.num());
shape.push_back(pbBlob.channels());
shape.push_back(pbBlob.height());
shape.push_back(pbBlob.width());
}
else if (pbBlob.has_shape())
{
shape = parseBlobShape(pbBlob.shape());
}
else
shape.resize(1, 1); // Is a scalar.
}
void blobFromProto(const caffe::BlobProto &pbBlob, cv::Mat &dstBlob)
{
MatShape shape;
blobShapeFromProto(pbBlob, shape);
dstBlob.create((int)shape.size(), &shape[0], CV_32F);
if (pbBlob.data_size())
{
// Single precision floats.
CV_Assert(pbBlob.data_size() == (int)dstBlob.total());
CV_DbgAssert(pbBlob.GetDescriptor()->FindFieldByLowercaseName("data")->cpp_type() == FieldDescriptor::CPPTYPE_FLOAT);
Mat(dstBlob.dims, &dstBlob.size[0], CV_32F, (void*)pbBlob.data().data()).copyTo(dstBlob);
}
else
{
CV_Assert(pbBlob.has_raw_data());
const std::string& raw_data = pbBlob.raw_data();
if (pbBlob.raw_data_type() == caffe::FLOAT16)
{
// Half precision floats.
CV_Assert(raw_data.size() / 2 == (int)dstBlob.total());
Mat halfs((int)shape.size(), &shape[0], CV_16SC1, (void*)raw_data.c_str());
convertFp16(halfs, dstBlob);
}
else if (pbBlob.raw_data_type() == caffe::FLOAT)
{
CV_Assert(raw_data.size() / 4 == (int)dstBlob.total());
Mat((int)shape.size(), &shape[0], CV_32FC1, (void*)raw_data.c_str()).copyTo(dstBlob);
}
else
CV_Error(Error::StsNotImplemented, "Unexpected blob data type");
}
}
void extractBinaryLayerParams(const caffe::LayerParameter& layer, LayerParams& layerParams)
{
const std::string &name = layer.name();
int li;
for (li = 0; li != netBinary.layer_size(); li++)
{
const caffe::LayerParameter& binLayer = netBinary.layer(li);
// Break if the layer name is the same and the blobs are not cleared
if (binLayer.name() == name && binLayer.blobs_size() != 0)
break;
}
if (li == netBinary.layer_size())
return;
caffe::LayerParameter* binLayer = netBinary.mutable_layer(li);
const int numBlobs = binLayer->blobs_size();
std::vector<caffe::BlobProto*> blobs(numBlobs);
binLayer->mutable_blobs()->ExtractSubrange(0, numBlobs, blobs.data());
layerParams.blobs.resize(numBlobs);
for (int bi = 0; bi < numBlobs; bi++)
{
blobFromProto(*blobs[bi], layerParams.blobs[bi]);
delete blobs[bi];
}
}
struct BlobNote
{
BlobNote(const std::string &_name, int _layerId, int _outNum) :
name(_name), layerId(_layerId), outNum(_outNum) {}
std::string name;
int layerId, outNum;
};
std::vector<BlobNote> addedBlobs;
std::map<String, int> layerCounter;
void populateNet(Net dstNet)
{
CV_TRACE_FUNCTION();
int layersSize = net.layer_size();
layerCounter.clear();
addedBlobs.clear();
addedBlobs.reserve(layersSize + 1);
//setup input layer names
std::vector<String> netInputs(net.input_size());
std::vector<MatShape> inp_shapes;
{
int net_input_size = net.input_size();
for (int inNum = 0; inNum < net_input_size; inNum++)
{
addedBlobs.push_back(BlobNote(net.input(inNum), 0, inNum));
netInputs[inNum] = net.input(inNum);
}
if (net.input_dim_size() > 0) // deprecated in Caffe proto
{
int net_input_dim_size = net.input_dim_size();
CV_Check(net_input_dim_size, net_input_dim_size % 4 == 0, "");
CV_CheckEQ(net_input_dim_size, net_input_size * 4, "");
for (int inp_id = 0; inp_id < net_input_size; inp_id++)
{
int dim = inp_id * 4;
MatShape shape(4);
shape[0] = net.input_dim(dim);
shape[1] = net.input_dim(dim+1);
shape[2] = net.input_dim(dim+2);
shape[3] = net.input_dim(dim+3);
inp_shapes.push_back(shape);
}
}
else if (net.input_shape_size() > 0) // deprecated in Caffe proto
{
int net_input_shape_size = net.input_shape_size();
CV_CheckEQ(net_input_shape_size, net_input_size, "");
for (int inp_id = 0; inp_id < net_input_shape_size; inp_id++)
{
MatShape shape = parseBlobShape(net.input_shape(inp_id));
inp_shapes.push_back(shape);
}
}
else
{
for (int inp_id = 0; inp_id < net_input_size; inp_id++)
{
MatShape shape; // empty
inp_shapes.push_back(shape);
}
}
}
for (int li = 0; li < layersSize; li++)
{
const caffe::LayerParameter &layer = net.layer(li);
String name = layer.name();
String type = layer.type();
LayerParams layerParams;
extractLayerParams(layer, layerParams);
extractBinaryLayerParams(layer, layerParams);
int repetitions = layerCounter[name]++;
if (repetitions)
name += String("_") + toString(repetitions);
if (type == "Input")
{
for (int outNum = 0; outNum < layer.top_size(); outNum++)
{
addOutput(layer, 0, outNum);
addedBlobs.back().outNum = netInputs.size();
netInputs.push_back(addedBlobs.back().name);
}
if (layer.has_input_param())
{
const caffe::InputParameter &inputParameter = layer.input_param();
int input_shape_size = inputParameter.shape_size();
CV_CheckEQ(input_shape_size, layer.top_size(), "");
for (int inp_id = 0; inp_id < input_shape_size; inp_id++)
{
MatShape shape = parseBlobShape(inputParameter.shape(inp_id));
inp_shapes.push_back(shape);
}
}
continue;
}
else if (type == "BatchNorm")
{
if (!layerParams.get<bool>("use_global_stats", true))
{
CV_Assert_N(layer.bottom_size() == 1, layer.top_size() == 1);
LayerParams mvnParams;
mvnParams.set("eps", layerParams.get<float>("eps", 1e-5));
std::string mvnName = name + "/mvn";
int repetitions = layerCounter[mvnName]++;
if (repetitions)
mvnName += String("_") + toString(repetitions);
int mvnId = dstNet.addLayer(mvnName, "MVN", mvnParams);
addInput(layer.bottom(0), mvnId, 0, dstNet);
addOutput(layer, mvnId, 0);
net.mutable_layer(li)->set_bottom(0, layer.top(0));
layerParams.blobs[0].setTo(0); // mean
layerParams.blobs[1].setTo(1); // std
}
}
else if (type == "Axpy")
{
CV_Assert_N(layer.bottom_size() == 3, layer.top_size() == 1);
std::string scaleName = name + "/scale";
int repetitions = layerCounter[scaleName]++;
if (repetitions) {
scaleName += String("_") + toString(repetitions);
}
LayerParams scaleParams;
scaleParams.set("axis", 1);
scaleParams.set("has_bias", false);
int scaleId = dstNet.addLayer(scaleName, "Scale", scaleParams);
addInput(layer.bottom(2), scaleId, 0, dstNet);
addInput(layer.bottom(0), scaleId, 1, dstNet);
addOutput(layer, scaleId, 0);
net.mutable_layer(li)->set_bottom(0, layer.top(0));
net.mutable_layer(li)->mutable_bottom()->RemoveLast();
type = "Eltwise";
}
else if (type == "Resample")
{
CV_Assert(layer.bottom_size() == 1 || layer.bottom_size() == 2);
type = "Resize";
String interp = toLowerCase(layerParams.get<String>("type"));
layerParams.set("interpolation", interp == "linear" ? "bilinear" : interp);
if (layerParams.has("factor"))
{
float factor = layerParams.get<float>("factor");
CV_Assert(layer.bottom_size() != 2 || factor == 1.0);
layerParams.set("zoom_factor", factor);
if ((interp == "linear" && factor != 1.0) ||
(interp == "nearest" && factor < 1.0))
CV_Error(Error::StsNotImplemented, "Unsupported Resample mode");
}
}
else if ("Convolution" == type)
{
CV_Assert(layer.bottom_size() == layer.top_size());
for (int i = 0; i < layer.bottom_size(); i++)
{
int conv_id = dstNet.addLayer(layer.top(i), type, layerParams);
addInput(layer.bottom(i), conv_id, 0, dstNet);
addedBlobs.push_back(BlobNote(layer.top(i), conv_id, 0));
}
continue;
}
else if ("ConvolutionDepthwise" == type)
{
type = "Convolution";
}
int id = dstNet.addLayer(name, type, layerParams);
for (int inNum = 0; inNum < layer.bottom_size(); inNum++)
addInput(layer.bottom(inNum), id, inNum, dstNet);
for (int outNum = 0; outNum < layer.top_size(); outNum++)
addOutput(layer, id, outNum);
}
dstNet.setInputsNames(netInputs);
if (inp_shapes.size() > 0)
{
CV_CheckEQ(inp_shapes.size(), netInputs.size(), "");
for (int inp_id = 0; inp_id < inp_shapes.size(); inp_id++)
dstNet.setInputShape(netInputs[inp_id], inp_shapes[inp_id]);
}
addedBlobs.clear();
}
void addOutput(const caffe::LayerParameter &layer, int layerId, int outNum)
{
const std::string &name = layer.top(outNum);
bool haveDups = false;
for (int idx = (int)addedBlobs.size() - 1; idx >= 0; idx--)
{
if (addedBlobs[idx].name == name)
{
haveDups = true;
break;
}
}
if (haveDups)
{
bool isInplace = layer.bottom_size() > outNum && layer.bottom(outNum) == name;
if (!isInplace)
CV_Error(Error::StsBadArg, "Duplicate blobs produced by multiple sources");
}
addedBlobs.push_back(BlobNote(name, layerId, outNum));
}
void addInput(const std::string &name, int layerId, int inNum, Net &dstNet)
{
int idx;
for (idx = (int)addedBlobs.size() - 1; idx >= 0; idx--)
{
if (addedBlobs[idx].name == name)
break;
}
if (idx < 0)
{
CV_Error(Error::StsObjectNotFound, "Can't find output blob \"" + name + "\"");
return;
}
dstNet.connect(addedBlobs[idx].layerId, addedBlobs[idx].outNum, layerId, inNum);
}
};
}
Net readNetFromCaffe(const String &prototxt, const String &caffeModel /*= String()*/)
{
CaffeImporter caffeImporter(prototxt.c_str(), caffeModel.c_str());
Net net;
caffeImporter.populateNet(net);
return net;
}
Net readNetFromCaffe(const char *bufferProto, size_t lenProto,
const char *bufferModel, size_t lenModel)
{
CaffeImporter caffeImporter(bufferProto, lenProto, bufferModel, lenModel);
Net net;
caffeImporter.populateNet(net);
return net;
}
Net readNetFromCaffe(const std::vector<uchar>& bufferProto, const std::vector<uchar>& bufferModel)
{
const char* bufferProtoPtr = reinterpret_cast<const char*>(&bufferProto[0]);
const char* bufferModelPtr = bufferModel.empty() ? NULL :
reinterpret_cast<const char*>(&bufferModel[0]);
return readNetFromCaffe(bufferProtoPtr, bufferProto.size(),
bufferModelPtr, bufferModel.size());
}
#endif //HAVE_PROTOBUF
CV__DNN_INLINE_NS_END
}} // namespace
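For context, this is roughly how a client drives the importer defined above. A minimal sketch, assuming placeholder file names and an image that exists on disk:

#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>

int main()
{
    // Any Caffe deploy prototxt + weights pair works; these paths are placeholders.
    cv::dnn::Net net = cv::dnn::readNetFromCaffe("deploy.prototxt", "weights.caffemodel");
    cv::Mat img = cv::imread("input.jpg");
    // NCHW float32 blob, 1x3x224x224; the importer's Input handling expects this layout.
    cv::Mat blob = cv::dnn::blobFromImage(img, 1.0, cv::Size(224, 224), cv::Scalar(), /*swapRB=*/false);
    net.setInput(blob);
    cv::Mat out = net.forward();
    return 0;
}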

File diff suppressed because it is too large

View File

@@ -0,0 +1,129 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/*M///////////////////////////////////////////////////////////////////////////////////////
//COPYRIGHT
//
//All contributions by the University of California:
//Copyright (c) 2014, The Regents of the University of California (Regents)
//All rights reserved.
//
//All other contributions:
//Copyright (c) 2014, the respective contributors
//All rights reserved.
//
//Caffe uses a shared copyright model: each contributor holds copyright over
//their contributions to Caffe. The project versioning records all such
//contribution and copyright details. If a contributor wants to further mark
//their specific copyright on a particular contribution, they should indicate
//their copyright solely in the commit message of the change when it is
//committed.
//
//LICENSE
//
//Redistribution and use in source and binary forms, with or without
//modification, are permitted provided that the following conditions are met:
//
//1. Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
//THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
//ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
//WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
//DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
//ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
//(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
//LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
//ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
//(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
//SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//CONTRIBUTION AGREEMENT
//
//By contributing to the BVLC/caffe repository through pull-request, comment,
//or otherwise, the contributor releases their content to the
//license and copyright terms herein.
//
//M*/
#ifndef __OPENCV_DNN_CAFFE_IO_HPP__
#define __OPENCV_DNN_CAFFE_IO_HPP__
#ifdef HAVE_PROTOBUF
#if defined(__GNUC__) && __GNUC__ >= 5
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsuggest-override"
#endif
#include "opencv-caffe.pb.h"
#if defined(__GNUC__) && __GNUC__ >= 5
#pragma GCC diagnostic pop
#endif
namespace caffe { using namespace opencv_caffe; } // avoid massive renames from caffe proto package
namespace cv {
namespace dnn {
// Read parameters from a file into a NetParameter proto message.
void ReadNetParamsFromTextFileOrDie(const char* param_file,
caffe::NetParameter* param);
void ReadNetParamsFromBinaryFileOrDie(const char* param_file,
caffe::NetParameter* param);
// Read parameters from a memory buffer into a NetParameter proto message.
void ReadNetParamsFromBinaryBufferOrDie(const char* data, size_t len,
caffe::NetParameter* param);
void ReadNetParamsFromTextBufferOrDie(const char* data, size_t len,
caffe::NetParameter* param);
// Utility functions used internally by Caffe and TensorFlow loaders
bool ReadProtoFromTextFile(const char* filename, ::google::protobuf::Message* proto);
bool ReadProtoFromBinaryFile(const char* filename, ::google::protobuf::Message* proto);
bool ReadProtoFromTextBuffer(const char* data, size_t len, ::google::protobuf::Message* proto);
bool ReadProtoFromBinaryBuffer(const char* data, size_t len, ::google::protobuf::Message* proto);
}
}
#endif
#endif
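A minimal sketch of how these helpers are driven. They are internal to the dnn module, so the snippet below only compiles inside the module sources, and the prototxt path is a placeholder; parse failures are reported through cv::error via the glog emulation shown later in this commit:

#include "caffe_io.hpp"
#include <iostream>

// Dump layer names and types from a deploy prototxt.
static void dumpLayerNames(const char* prototxtPath)
{
    caffe::NetParameter param;
    cv::dnn::ReadNetParamsFromTextFileOrDie(prototxtPath, &param);
    for (int i = 0; i < param.layer_size(); ++i)
        std::cout << param.layer(i).name() << " : " << param.layer(i).type() << std::endl;
}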

View File

@@ -0,0 +1,80 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
#include "../precomp.hpp"
#ifdef HAVE_PROTOBUF
#include <fstream>
#include "caffe_io.hpp"
#endif
namespace cv { namespace dnn {
CV__DNN_INLINE_NS_BEGIN
#ifdef HAVE_PROTOBUF
void shrinkCaffeModel(const String& src, const String& dst, const std::vector<String>& layersTypes)
{
CV_TRACE_FUNCTION();
std::vector<String> types(layersTypes);
if (types.empty())
{
types.push_back("Convolution");
types.push_back("InnerProduct");
}
caffe::NetParameter net;
ReadNetParamsFromBinaryFileOrDie(src.c_str(), &net);
for (int i = 0; i < net.layer_size(); ++i)
{
caffe::LayerParameter* lp = net.mutable_layer(i);
if (std::find(types.begin(), types.end(), lp->type()) == types.end())
{
continue;
}
for (int j = 0; j < lp->blobs_size(); ++j)
{
caffe::BlobProto* blob = lp->mutable_blobs(j);
CV_Assert(blob->data_size() != 0); // float32 array.
Mat floats(1, blob->data_size(), CV_32FC1, (void*)blob->data().data());
Mat halfs(1, blob->data_size(), CV_16SC1);
convertFp16(floats, halfs); // Convert to float16.
blob->clear_data(); // Clear float32 data.
// Set float16 data.
blob->set_raw_data(halfs.data, halfs.total() * halfs.elemSize());
blob->set_raw_data_type(caffe::FLOAT16);
}
}
#if GOOGLE_PROTOBUF_VERSION < 3005000
size_t msgSize = saturate_cast<size_t>(net.ByteSize());
#else
size_t msgSize = net.ByteSizeLong();
#endif
std::vector<uint8_t> output(msgSize);
net.SerializeWithCachedSizesToArray(&output[0]);
std::ofstream ofs(dst.c_str(), std::ios::binary);
ofs.write((const char*)&output[0], msgSize);
ofs.close();
}
#else
void shrinkCaffeModel(const String& src, const String& dst, const std::vector<String>& types)
{
CV_Error(cv::Error::StsNotImplemented, "libprotobuf required to import data from Caffe models");
}
#endif // HAVE_PROTOBUF
CV__DNN_INLINE_NS_END
}} // namespace
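Typical usage of the public entry point defined above; a minimal sketch with placeholder paths. With an empty layer-type list, only Convolution and InnerProduct blobs are converted to FP16, as the defaults in the function show:

#include <opencv2/dnn.hpp>

int main()
{
    cv::dnn::shrinkCaffeModel("model.caffemodel", "model_fp16.caffemodel");
    return 0;
}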

View File

@@ -0,0 +1,106 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_DNN_CAFFE_GLOG_EMULATOR_HPP__
#define __OPENCV_DNN_CAFFE_GLOG_EMULATOR_HPP__
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <sstream>
#include <opencv2/core.hpp>
#define CHECK(cond) for(cv::dnn::GLogWrapper _logger(__FILE__, CV_Func, __LINE__, "CHECK", #cond, cond); _logger.exit(); _logger.check()) _logger.stream()
#define CHECK_EQ(a, b) for(cv::dnn::GLogWrapper _logger(__FILE__, CV_Func, __LINE__, "CHECK", #a"="#b, ((a) == (b))); _logger.exit(); _logger.check()) _logger.stream()
#define LOG(TYPE) for(cv::dnn::GLogWrapper _logger(__FILE__, CV_Func, __LINE__, #TYPE); _logger.exit(); _logger.check()) _logger.stream()
namespace cv
{
namespace dnn
{
class GLogWrapper
{
const char *file, *func, *type, *cond_str;
int line;
bool cond_status, exit_loop;
std::stringstream sstream;
public:
GLogWrapper(const char *_file, const char *_func, int _line,
const char *_type,
const char *_cond_str = NULL, bool _cond_status = true
) :
file(_file), func(_func), type(_type), cond_str(_cond_str),
line(_line), cond_status(_cond_status), exit_loop(true) {}
std::iostream &stream()
{
return sstream;
}
bool exit()
{
return exit_loop;
}
void check()
{
exit_loop = false;
if (cond_str && !cond_status)
{
cv::error(cv::Error::StsError, "FAILED: " + String(cond_str) + ". " + sstream.str(), func, file, line);
}
else if (!cond_str && strcmp(type, "CHECK"))
{
#ifndef NDEBUG
if (!std::strcmp(type, "INFO"))
std::cout << sstream.str() << std::endl;
else
std::cerr << sstream.str() << std::endl;
#endif
}
}
};
}
}
#endif
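These macros emulate just enough of glog for the Caffe I/O code: a failed CHECK turns into a cv::error call (hence a cv::Exception) instead of an abort, and LOG output is printed only in non-NDEBUG builds. A small usage sketch with hypothetical values:

// Inside dnn sources that include glog_emulator.hpp:
int expected = 4, got = 4;
CHECK(got > 0) << "count must be positive";   // on failure: cv::error(StsError, "FAILED: got > 0. ...")
CHECK_EQ(expected, got) << "size mismatch";   // same, with the stringified condition in the message
LOG(INFO) << "count = " << got;               // written to std::cout, debug builds only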

File diff suppressed because it is too large

View File

@@ -0,0 +1,121 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "functors.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, class ActivationOp, class EltwiseOp, std::size_t N>
__global__ void generic_op_eltwise_op_inplace_vec(Span<T> inplace_output, View<T> eltwise, const typename ActivationOp::Params act_params, const typename EltwiseOp::Params eltwise_params) {
using vector_type = get_vector_type_t<T, N>;
auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());
auto eltwise_vPtr = vector_type::get_pointer(eltwise.data());
ActivationOp activation_op(act_params);
EltwiseOp eltwise_op(eltwise_params);
for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
vector_type output_vec, eltwise_vec;
v_load(output_vec, inplace_output_vPtr[i]);
v_load(eltwise_vec, eltwise_vPtr[i]);
for(int j = 0; j < output_vec.size(); j++)
output_vec.data[j] = eltwise_op(activation_op(output_vec.data[j]), eltwise_vec.data[j]);
v_store(inplace_output_vPtr[i], output_vec);
}
}
}
template <class T, class ActivationOp, class EltwiseOp, std::size_t N> static
void launch_vectorized_generic_op_eltwise_op_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, const typename ActivationOp::Params& act_params, const typename EltwiseOp::Params& eltwise_params) {
CV_Assert(is_fully_aligned<T>(inplace_output, N));
CV_Assert(is_fully_aligned<T>(eltwise, N));
auto kernel = raw::generic_op_eltwise_op_inplace_vec<T, ActivationOp, EltwiseOp, N>;
auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
launch_kernel(kernel, policy, inplace_output, eltwise, act_params, eltwise_params);
}
template <class T, class ActivationOp, class EltwiseOp> static
void generic_op_eltwise_op_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, const typename ActivationOp::Params& act_params = {}, const typename EltwiseOp::Params& eltwise_params = {}) {
CV_Assert(inplace_output.size() == eltwise.size());
if (is_fully_aligned<T>(inplace_output, 4) && is_fully_aligned<T>(eltwise, 4)) {
launch_vectorized_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 4>(stream, inplace_output, eltwise, act_params, eltwise_params);
} else if (is_fully_aligned<T>(inplace_output, 2) && is_fully_aligned<T>(eltwise, 2)) {
launch_vectorized_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 2>(stream, inplace_output, eltwise, act_params, eltwise_params);
} else {
launch_vectorized_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 1>(stream, inplace_output, eltwise, act_params, eltwise_params);
}
}
template <class T>
void relu_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, T slope) {
generic_op_eltwise_op_inplace<T, ReLUFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise, {slope});
}
template <class T>
void clipped_relu_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, T floor, T ceiling) {
CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
generic_op_eltwise_op_inplace<T, ClippedReLUFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise, {floor, ceiling});
}
template <class T>
void tanh_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
generic_op_eltwise_op_inplace<T, TanHFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}
template <class T>
void swish_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
generic_op_eltwise_op_inplace<T, SwishFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}
template <class T>
void mish_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
generic_op_eltwise_op_inplace<T, MishFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}
template <class T>
void sigmoid_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
generic_op_eltwise_op_inplace<T, SigmoidFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}
template <class T>
void power_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, T exp, T scale, T shift) {
generic_op_eltwise_op_inplace<T, PowerFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise, {exp, scale, shift});
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>, __half);
template void clipped_relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
template void tanh_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void swish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void mish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void sigmoid_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void power_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half);
#endif
template void relu_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>, float);
template void clipped_relu_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>, float, float);
template void tanh_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void swish_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void mish_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void sigmoid_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void power_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>, float, float, float);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
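Ignoring vectorized loads and the launch machinery, the fused kernel above applies the activation to the in-place output and then combines it with the eltwise operand. A scalar C++ reference for the ReLU-plus-sum specialization (a sketch; both buffers are assumed to have equal size, as the kernel asserts):

#include <vector>
#include <cstddef>

// Scalar reference of relu_eltwise_sum_2_inplace:
// output[i] = leaky_relu(output[i], slope) + eltwise[i].
static void relu_sum_inplace_ref(std::vector<float>& output, const std::vector<float>& eltwise, float slope)
{
    for (std::size_t i = 0; i < output.size(); ++i)
    {
        float activated = output[i] > 0.f ? output[i] : slope * output[i];
        output[i] = activated + eltwise[i];
    }
}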

View File

@@ -0,0 +1,209 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "functors.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/kernels/scale_shift.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, class ActivationOp, std::size_t N>
__global__ void generic_op_vec(Span<T> output, View<T> input, const typename ActivationOp::Params params) {
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
ActivationOp activation_op(params);
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
vector_type vec;
v_load(vec, input_vPtr[i]);
for (int j = 0; j < vector_type::size(); j++)
vec.data[j] = activation_op(vec.data[j]);
v_store(output_vPtr[i], vec);
}
}
template <class T, std::size_t N>
__global__ void axiswise_relu_vec(Span<T> output, View<T> input, size_type inner_size, View<T> slope) {
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
const index_type c = (i / inner_size) % slope.size();
vector_type vec;
v_load(vec, input_vPtr[i]);
for (int j = 0; j < vector_type::size(); j++)
vec.data[j] = vec.data[j] > T(0) ? vec.data[j] : vec.data[j] * slope[c];
v_store(output_vPtr[i], vec);
}
}
} /* namespace raw */
template <class T, class ActivationOp, std::size_t N> static
void launch_vectorized_generic_op(const Stream& stream, Span<T> output, View<T> input, const typename ActivationOp::Params& params) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
auto kernel = raw::generic_op_vec<T, ActivationOp, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input, params);
}
template <class T, class ActivationOp> static
void generic_op(const Stream& stream, Span<T> output, View<T> input, const typename ActivationOp::Params& params = {}) {
CV_Assert(input.size() == output.size());
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
launch_vectorized_generic_op<T, ActivationOp, 4>(stream, output, input, params);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
launch_vectorized_generic_op<T, ActivationOp, 2>(stream, output, input, params);
} else {
launch_vectorized_generic_op<T, ActivationOp, 1>(stream, output, input, params);
}
}
template <class T>
void relu(const Stream& stream, Span<T> output, View<T> input, T slope) {
generic_op<T, ReLUFunctor<T>>(stream, output, input, {slope});
}
template <class T>
void clipped_relu(const Stream& stream, Span<T> output, View<T> input, T floor, T ceiling) {
CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
generic_op<T, ClippedReLUFunctor<T>>(stream, output, input, {floor, ceiling});
}
template <class T>
void tanh(const Stream& stream, Span<T> output, View<T> input) {
generic_op<T, TanHFunctor<T>>(stream, output, input);
}
template <class T>
void swish(const Stream& stream, Span<T> output, View<T> input) {
generic_op<T, SwishFunctor<T>>(stream, output, input);
}
template <class T>
void mish(const Stream& stream, Span<T> output, View<T> input) {
generic_op<T, MishFunctor<T>>(stream, output, input);
}
template <class T>
void sigmoid(const Stream& stream, Span<T> output, View<T> input) {
generic_op<T, SigmoidFunctor<T>>(stream, output, input);
}
template <class T>
void elu(const Stream& stream, Span<T> output, View<T> input) {
generic_op<T, ELUFunctor<T>>(stream, output, input);
}
template <class T>
void bnll(const Stream& stream, Span<T> output, View<T> input) {
generic_op<T, BNLLFunctor<T>>(stream, output, input);
}
template <class T>
void abs(const Stream& stream, Span<T> output, View<T> input) {
generic_op<T, AbsFunctor<T>>(stream, output, input);
}
template <class T>
void power(const Stream& stream, Span<T> output, View<T> input, T exp, T scale, T shift) {
CV_Assert(input.size() == output.size());
if (static_cast<float>(exp) == 1.0f) {
scale1_with_bias1(stream, output, input, scale, shift);
return;
}
generic_op<T, PowerFunctor<T>>(stream, output, input, {exp, scale, shift});
}
template <class T>
void exp(const Stream& stream, Span<T> output, View<T> input, T normScale, T normShift) {
generic_op<T, ExpFunctor<T>>(stream, output, input, {normScale, normShift});
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void relu<__half>(const Stream&, Span<__half>, View<__half>, __half);
template void clipped_relu<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
template void tanh<__half>(const Stream&, Span<__half>, View<__half>);
template void swish<__half>(const Stream&, Span<__half>, View<__half>);
template void mish<__half>(const Stream&, Span<__half>, View<__half>);
template void sigmoid<__half>(const Stream&, Span<__half>, View<__half>);
template void elu<__half>(const Stream&, Span<__half>, View<__half>);
template void abs<__half>(const Stream& stream, Span<__half> output, View<__half> input);
template void bnll<__half>(const Stream&, Span<__half>, View<__half>);
template void power<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half);
template void exp<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
#endif
template void relu<float>(const Stream&, Span<float>, View<float>, float);
template void clipped_relu<float>(const Stream&, Span<float>, View<float>, float, float);
template void tanh<float>(const Stream&, Span<float>, View<float>);
template void swish<float>(const Stream&, Span<float>, View<float>);
template void mish<float>(const Stream&, Span<float>, View<float>);
template void sigmoid<float>(const Stream&, Span<float>, View<float>);
template void elu<float>(const Stream&, Span<float>, View<float>);
template void abs<float>(const Stream& stream, Span<float> output, View<float> input);
template void bnll<float>(const Stream&, Span<float>, View<float>);
template void power<float>(const Stream&, Span<float>, View<float>, float, float, float);
template void exp<float>(const Stream&, Span<float>, View<float>, float, float);
template <class T, std::size_t N> static
void launch_vectorized_axiswise_relu(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> slope) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
CV_Assert(inner_size % N == 0);
auto kernel = raw::axiswise_relu_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input, inner_size / N, slope);
}
template <class T>
void axiswise_relu(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> slope) {
CV_Assert(input.size() == output.size());
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
launch_vectorized_axiswise_relu<T, 4>(stream, output, input, inner_size, slope);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
launch_vectorized_axiswise_relu<T, 2>(stream, output, input, inner_size, slope);
} else {
launch_vectorized_axiswise_relu<T, 1>(stream, output, input, inner_size, slope);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void axiswise_relu<__half>(const Stream&, Span<__half>, View<__half>, std::size_t, View<__half>);
#endif
template void axiswise_relu<float>(const Stream&, Span<float>, View<float>, std::size_t, View<float>);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
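The axis-wise ReLU picks its slope per channel via c = (i / inner_size) % slope.size(). A scalar C++ reference, assuming the tensor is flattened so that inner_size consecutive elements share one channel:

#include <vector>
#include <cstddef>

// Scalar reference of axiswise_relu: each channel c uses its own negative slope.
static void axiswise_relu_ref(std::vector<float>& out, const std::vector<float>& in,
                              std::size_t inner_size, const std::vector<float>& slope)
{
    for (std::size_t i = 0; i < in.size(); ++i)
    {
        std::size_t c = (i / inner_size) % slope.size();
        out[i] = in[i] > 0.f ? in[i] : in[i] * slope[c];
    }
}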

View File

@@ -0,0 +1,73 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_ARRAY_HPP
#define OPENCV_DNN_SRC_CUDA_ARRAY_HPP
#include <cuda_runtime.h>
#include "types.hpp"
#include <cstddef>
#include <type_traits>
#include <iterator>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
template <class T, std::size_t N>
struct array {
using value_type = T;
using size_type = device::size_type;
using difference_type = std::ptrdiff_t;
using reference = typename std::add_lvalue_reference<value_type>::type;
using const_reference = typename std::add_lvalue_reference<typename std::add_const<value_type>::type>::type;
using pointer = typename std::add_pointer<value_type>::type;
using const_pointer = typename std::add_pointer<typename std::add_const<value_type>::type>::type;
using iterator = pointer;
using const_iterator = const_pointer;
using reverse_iterator = std::reverse_iterator<iterator>;
using const_reverse_iterator = std::reverse_iterator<const_iterator>;
__host__ __device__ bool empty() const noexcept { return N == 0; }
__host__ __device__ size_type size() const noexcept { return N; }
__host__ __device__ iterator begin() noexcept { return ptr; }
__host__ __device__ iterator end() noexcept { return ptr + N; }
__host__ __device__ const_iterator begin() const noexcept { return ptr; }
__host__ __device__ const_iterator end() const noexcept { return ptr + N; }
__host__ __device__ const_iterator cbegin() const noexcept { return ptr; }
__host__ __device__ const_iterator cend() const noexcept { return ptr + N; }
__host__ __device__ reverse_iterator rbegin() noexcept { return ptr + N; }
__host__ __device__ reverse_iterator rend() noexcept { return ptr; }
__host__ __device__ const_reverse_iterator rbegin() const noexcept { return ptr + N; }
__host__ __device__ const_reverse_iterator rend() const noexcept { return ptr; }
__host__ __device__ const_reverse_iterator crbegin() const noexcept { return ptr + N; }
__host__ __device__ const_reverse_iterator crend() const noexcept { return ptr; }
template <class InputItr>
__host__ void assign(InputItr first, InputItr last) {
std::copy(first, last, std::begin(ptr));
}
__host__ __device__ reference operator[](int idx) { return ptr[idx]; }
__host__ __device__ const_reference operator[](int idx) const { return ptr[idx]; }
__host__ __device__ reference front() { return ptr[0]; }
__host__ __device__ const_reference front() const { return ptr[0]; }
__host__ __device__ reference back() { return ptr[N - 1]; }
__host__ __device__ const_reference back() const { return ptr[N - 1]; }
__host__ __device__ pointer data() noexcept { return ptr; }
__host__ __device__ const_pointer data() const noexcept { return ptr; }
T ptr[N];
};
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
#endif /* OPENCV_DNN_SRC_CUDA_ARRAY_HPP */
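This is a trivially copyable fixed-size array usable from both host and device code, meant to be handed to kernels by value. A rough orientation sketch of the intended usage; the kernel name and shape values are hypothetical and this is not a standalone program:

using cv::dnn::cuda4dnn::csl::device::array;

array<int, 4> shape;                            // e.g. an NCHW shape
int dims[4] = {1, 3, 224, 224};
shape.assign(std::begin(dims), std::end(dims)); // host-side fill
// some_kernel<<<grid, block>>>(shape, ...);    // hypothetical kernel taking the array by value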

View File

@@ -0,0 +1,38 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_ATOMICS_HPP
#define OPENCV_DNN_SRC_CUDA_ATOMICS_HPP
#include <cuda_runtime.h>
#include <cuda_fp16.h>
// The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
// This function was introduced in CUDA 10.
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicadd
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700 && CUDART_VERSION >= 10000)
// And half-precision floating-point operations are not supported by devices of compute capability strictly lower than 5.3
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
#elif __CUDA_ARCH__ < 530
#else
inline __device__ void atomicAdd(__half* address, __half val) {
unsigned int* address_as_ui = (unsigned int *)((char *)address - ((size_t)address & 2));
unsigned int old = *address_as_ui;
unsigned int assumed;
do {
assumed = old;
__half_raw hsum;
hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
__half tmpres = hsum + val;
hsum = __half_raw(tmpres);
old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
old = atomicCAS(address_as_ui, assumed, old);
} while (assumed != old);
}
#endif
#endif /* OPENCV_DNN_SRC_CUDA_ATOMICS_HPP */
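The __half overload above emulates a 16-bit atomic add with a 32-bit compare-and-swap: read the containing word, modify the relevant half, and retry until the CAS succeeds. A host-side C++ analog of the same retry pattern, using a 16-bit integer counter packed into a 32-bit word (purely illustrative):

#include <atomic>
#include <cstdint>

// Add to a 16-bit counter stored inside a 32-bit word, using only a 32-bit CAS.
static void add_u16_via_cas32(std::atomic<std::uint32_t>& word, bool high_half, std::uint16_t val)
{
    std::uint32_t old_word = word.load();
    std::uint32_t new_word;
    do {
        std::uint16_t half = high_half ? std::uint16_t(old_word >> 16) : std::uint16_t(old_word & 0xffff);
        half = std::uint16_t(half + val);   // the read-modify step (an fp16 add in the CUDA version)
        new_word = high_half ? ((old_word & 0x0000ffffu) | (std::uint32_t(half) << 16))
                             : ((old_word & 0xffff0000u) | half);
    } while (!word.compare_exchange_weak(old_word, new_word));  // retry if another thread raced us
}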

View File

@@ -0,0 +1,39 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_BBOX_UTILS_HPP
#define OPENCV_DNN_SRC_CUDA_BBOX_UTILS_HPP
#include "math.hpp"
#include <cuda_runtime.h>
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
struct BoundingBox
{
float xmin, ymin, xmax, ymax;
};
template <bool NORMALIZED_BBOX>
__device__ __forceinline__ float compute_bbox_size(BoundingBox bbox)
{
float width = bbox.xmax - bbox.xmin;
float height = bbox.ymax - bbox.ymin;
if (width < 0 || height < 0)
return 0.0;
if (!NORMALIZED_BBOX)
{
width += 1;
height += 1;
}
using csl::device::mul_ftz;
return mul_ftz(width, height);
}
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
#endif /* OPENCV_DNN_SRC_CUDA_BBOX_UTILS_HPP */
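A host-side reference of compute_bbox_size, keeping the two conventions visible: degenerate boxes have zero area, and non-normalized (pixel) coordinates treat the extent as inclusive, hence the +1 (a sketch, not part of the module):

struct BoxRef { float xmin, ymin, xmax, ymax; };

static float bbox_size_ref(const BoxRef& b, bool normalized)
{
    float width = b.xmax - b.xmin;
    float height = b.ymax - b.ymin;
    if (width < 0.f || height < 0.f)
        return 0.f;          // degenerate box
    if (!normalized)
    {
        width += 1.f;        // pixel coordinates: extent is inclusive
        height += 1.f;
    }
    return width * height;
}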

View File

@@ -0,0 +1,120 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "functors.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, class ActivationOp, std::size_t N>
__global__ void biasN_generic_op_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias, const typename ActivationOp::Params params) {
using vector_type = get_vector_type_t<T, N>;
auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());
ActivationOp activation_op(params);
for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
const index_type bias_idx = (i / inner_size) % bias.size();
vector_type vec;
v_load(vec, inplace_output_vPtr[i]);
for(int j = 0; j < vec.size(); j++)
vec.data[j] = activation_op(vec.data[j] + bias[bias_idx]);
v_store(inplace_output_vPtr[i], vec);
}
}
} /* namespace raw */
template <class T, class ActivationOp, std::size_t N> static
void launch_vectorized_biasN_generic_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, const typename ActivationOp::Params& params) {
CV_Assert(inplace_output.size() % inner_size == 0);
CV_Assert(is_fully_aligned<T>(inplace_output, N));
CV_Assert(inner_size % N == 0);
auto kernel = raw::biasN_generic_op_inplace_vec<T, ActivationOp, N>;
auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
launch_kernel(kernel, policy, inplace_output, inner_size / N, bias, params);
}
template <class T, class ActivationOp> static
void biasN_generic_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, const typename ActivationOp::Params& params = {}) {
if (is_fully_aligned<T>(inplace_output, 4) && inner_size % 4 == 0) {
launch_vectorized_biasN_generic_op_inplace<T, ActivationOp, 4>(stream, inplace_output, inner_size, bias, params);
} else if (is_fully_aligned<T>(inplace_output, 2) && inner_size % 2 == 0) {
launch_vectorized_biasN_generic_op_inplace<T, ActivationOp, 2>(stream, inplace_output, inner_size, bias, params);
} else {
launch_vectorized_biasN_generic_op_inplace<T, ActivationOp, 1>(stream, inplace_output, inner_size, bias, params);
}
}
template <class T>
void biasN_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T slope) {
biasN_generic_op_inplace<T, ReLUFunctor<T>>(stream, inplace_output, inner_size, bias, {slope});
}
template <class T>
void biasN_clipped_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T floor, T ceil) {
CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceil));
biasN_generic_op_inplace<T, ClippedReLUFunctor<T>>(stream, inplace_output, inner_size, bias, {floor, ceil});
}
template <class T>
void biasN_tanh_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
biasN_generic_op_inplace<T, TanHFunctor<T>>(stream, inplace_output, inner_size, bias);
}
template <class T>
void biasN_swish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
biasN_generic_op_inplace<T, SwishFunctor<T>>(stream, inplace_output, inner_size, bias);
}
template <class T>
void biasN_mish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
biasN_generic_op_inplace<T, MishFunctor<T>>(stream, inplace_output, inner_size, bias);
}
template <class T>
void biasN_sigmoid_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
biasN_generic_op_inplace<T, SigmoidFunctor<T>>(stream, inplace_output, inner_size, bias);
}
template <class T>
void biasN_power_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T power, T scale, T shift) {
biasN_generic_op_inplace<T, PowerFunctor<T>>(stream, inplace_output, inner_size, bias, {power, scale, shift});
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void biasN_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half);
template void biasN_clipped_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half, __half);
template void biasN_tanh_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
template void biasN_swish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
template void biasN_mish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
template void biasN_sigmoid_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
template void biasN_power_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half, __half, __half);
#endif
template void biasN_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, float);
template void biasN_clipped_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, float, float);
template void biasN_tanh_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
template void biasN_swish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
template void biasN_mish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
template void biasN_sigmoid_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
template void biasN_power_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, float, float, float);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
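Per element, this kernel adds the per-channel bias and then applies the activation in place. A scalar C++ reference for the ReLU case, where inner_size is the number of elements that share one bias value (e.g. H*W for an NCHW tensor):

#include <vector>
#include <cstddef>

// Scalar reference of biasN_relu_inplace: bias is indexed per channel.
static void biasN_relu_inplace_ref(std::vector<float>& output, std::size_t inner_size,
                                   const std::vector<float>& bias, float slope)
{
    for (std::size_t i = 0; i < output.size(); ++i)
    {
        float x = output[i] + bias[(i / inner_size) % bias.size()];
        output[i] = x > 0.f ? x : slope * x;
    }
}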

View File

@@ -0,0 +1,125 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "functors.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, class ActivationOp, class EltwiseOp, std::size_t N>
__global__ void biasN_generic_op_eltwise_op_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias, View<T> eltwise, const typename ActivationOp::Params act_params, const typename EltwiseOp::Params eltwise_params) {
using vector_type = get_vector_type_t<T, N>;
auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());
auto eltwise_vPtr = vector_type::get_pointer(eltwise.data());
ActivationOp activation_op(act_params);
EltwiseOp eltwise_op(eltwise_params);
for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
const index_type bias_idx = (i / inner_size) % bias.size();
vector_type output_vec, eltwise_vec;
v_load(output_vec, inplace_output_vPtr[i]);
v_load(eltwise_vec, eltwise_vPtr[i]);
for(int j = 0; j < output_vec.size(); j++)
output_vec.data[j] = eltwise_op(activation_op(output_vec.data[j] + bias[bias_idx]), eltwise_vec.data[j]);
v_store(inplace_output_vPtr[i], output_vec);
}
}
}
template <class T, class ActivationOp, class EltwiseOp, std::size_t N> static
void launch_vectorized_biasN_generic_op_eltwise_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, const typename ActivationOp::Params& act_params, const typename EltwiseOp::Params& eltwise_params) {
CV_Assert(is_fully_aligned<T>(inplace_output, N));
CV_Assert(is_fully_aligned<T>(eltwise, N));
CV_Assert(inner_size % N == 0);
auto kernel = raw::biasN_generic_op_eltwise_op_inplace_vec<T, ActivationOp, EltwiseOp, N>;
auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
launch_kernel(kernel, policy, inplace_output, inner_size / N, bias, eltwise, act_params, eltwise_params);
}
template <class T, class ActivationOp, class EltwiseOp> static
void biasN_generic_op_eltwise_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, const typename ActivationOp::Params& act_params = {}, const typename EltwiseOp::Params& eltwise_params = {}) {
CV_Assert(inplace_output.size() == eltwise.size());
if (is_fully_aligned<T>(inplace_output, 4) && is_fully_aligned<T>(eltwise, 4) && inner_size % 4 == 0) {
launch_vectorized_biasN_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 4>(stream, inplace_output, inner_size, bias, eltwise, act_params, eltwise_params);
} else if (is_fully_aligned<T>(inplace_output, 2) && is_fully_aligned<T>(eltwise, 2) && inner_size % 2 == 0) {
launch_vectorized_biasN_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 2>(stream, inplace_output, inner_size, bias, eltwise, act_params, eltwise_params);
} else {
launch_vectorized_biasN_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 1>(stream, inplace_output, inner_size, bias, eltwise, act_params, eltwise_params);
}
}
template <class T>
void biasN_relu_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T slope) {
biasN_generic_op_eltwise_op_inplace<T, ReLUFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {slope});
}
template <class T>
void biasN_clipped_relu_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T floor, T ceiling) {
CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
biasN_generic_op_eltwise_op_inplace<T, ClippedReLUFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {floor, ceiling});
}
template <class T>
void biasN_tanh_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
biasN_generic_op_eltwise_op_inplace<T, TanHFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}
template <class T>
void biasN_swish_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
biasN_generic_op_eltwise_op_inplace<T, SwishFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}
template <class T>
void biasN_mish_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
biasN_generic_op_eltwise_op_inplace<T, MishFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}
template <class T>
void biasN_sigmoid_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
biasN_generic_op_eltwise_op_inplace<T, SigmoidFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}
template <class T>
void biasN_power_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T exp, T scale, T shift) {
biasN_generic_op_eltwise_op_inplace<T, PowerFunctor<T>, SumFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {exp, scale, shift});
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void biasN_relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half);
template void biasN_clipped_relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half);
template void biasN_tanh_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_swish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_mish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_sigmoid_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_power_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half, __half);
#endif
template void biasN_relu_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float);
template void biasN_clipped_relu_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float, float);
template void biasN_tanh_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_swish_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_mish_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_sigmoid_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_power_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float, float, float);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,132 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "functors.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, class EltwiseOp, class ActivationOp, std::size_t N>
__global__ void biasN_eltwise_op_generic_op_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias, View<T> eltwise, const typename EltwiseOp::Params eltwise_params, const typename ActivationOp::Params act_params) {
using vector_type = get_vector_type_t<T, N>;
auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());
auto eltwise_vPtr = vector_type::get_pointer(eltwise.data());
EltwiseOp eltwise_op(eltwise_params);
ActivationOp activation_op(act_params);
for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
const index_type bias_idx = (i / inner_size) % bias.size();
vector_type output_vec, eltwise_vec;
v_load(output_vec, inplace_output_vPtr[i]);
v_load(eltwise_vec, eltwise_vPtr[i]);
for(int j = 0; j < output_vec.size(); j++)
output_vec.data[j] = activation_op(eltwise_op(output_vec.data[j] + bias[bias_idx], eltwise_vec.data[j]));
v_store(inplace_output_vPtr[i], output_vec);
}
}
}
template <class T, class EltwiseOp, class ActivationOp, std::size_t N> static
void launch_vectorized_biasN_eltwise_op_generic_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, const typename EltwiseOp::Params& eltwise_params, const typename ActivationOp::Params& act_params) {
CV_Assert(is_fully_aligned<T>(inplace_output, N));
CV_Assert(inplace_output.size() % bias.size() == 0);
CV_Assert(is_fully_aligned<T>(eltwise, N));
CV_Assert(inner_size % N == 0);
auto kernel = raw::biasN_eltwise_op_generic_op_inplace_vec<T, EltwiseOp, ActivationOp, N>;
auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
launch_kernel(kernel, policy, inplace_output, inner_size / N, bias, eltwise, eltwise_params, act_params);
}
template <class T, class EltwiseOp, class ActivationOp> static
void biasN_eltwise_op_generic_op_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, const typename EltwiseOp::Params& eltwise_params = {}, const typename ActivationOp::Params& act_params = {}) {
CV_Assert(inplace_output.size() == eltwise.size());
if (is_fully_aligned<T>(inplace_output, 4) && is_fully_aligned<T>(eltwise, 4) && inner_size % 4 == 0) {
launch_vectorized_biasN_eltwise_op_generic_op_inplace<T, EltwiseOp, ActivationOp, 4>(stream, inplace_output, inner_size, bias, eltwise, eltwise_params, act_params);
} else if (is_fully_aligned<T>(inplace_output, 2) && is_fully_aligned<T>(eltwise, 2) && inner_size % 2 == 0) {
launch_vectorized_biasN_eltwise_op_generic_op_inplace<T, EltwiseOp, ActivationOp, 2>(stream, inplace_output, inner_size, bias, eltwise, eltwise_params, act_params);
} else {
launch_vectorized_biasN_eltwise_op_generic_op_inplace<T, EltwiseOp, ActivationOp, 1>(stream, inplace_output, inner_size, bias, eltwise, eltwise_params, act_params);
}
}
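/* Note: this file mirrors the previous one but fuses the operations in the opposite order;
 * here every element becomes act(elt(x + bias, e)) instead of elt(act(x + bias), e).
 * A one-line host-side sketch of the per-element computation (illustrative only):
 *
 *   inout[i] = act(elt(inout[i] + bias[(i / inner_size) % bias.size()], eltwise[i]));
 */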
template <class T>
void biasN_eltwise_sum_2_identity_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, IdentityFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}
template <class T>
void biasN_eltwise_sum_2_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T slope) {
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, ReLUFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {}, {slope});
}
template <class T>
void biasN_eltwise_sum_2_clipped_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T floor, T ceiling) {
CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, ClippedReLUFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {}, {floor, ceiling});
}
template <class T>
void biasN_eltwise_sum_2_tanh_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, TanHFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}
template <class T>
void biasN_eltwise_sum_2_swish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, SwishFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}
template <class T>
void biasN_eltwise_sum_2_mish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, MishFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}
template <class T>
void biasN_eltwise_sum_2_sigmoid_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise) {
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, SigmoidFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise);
}
template <class T>
void biasN_eltwise_sum_2_power_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, View<T> eltwise, T exp, T scale, T shift) {
biasN_eltwise_op_generic_op_inplace<T, SumFunctor<T>, PowerFunctor<T>>(stream, inplace_output, inner_size, bias, eltwise, {}, {exp, scale, shift});
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void biasN_eltwise_sum_2_identity_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_eltwise_sum_2_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half);
template void biasN_eltwise_sum_2_clipped_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half);
template void biasN_eltwise_sum_2_tanh_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_eltwise_sum_2_swish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_eltwise_sum_2_mish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_eltwise_sum_2_sigmoid_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>);
template void biasN_eltwise_sum_2_power_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half, __half);
#endif
template void biasN_eltwise_sum_2_identity_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_eltwise_sum_2_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float);
template void biasN_eltwise_sum_2_clipped_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float, float);
template void biasN_eltwise_sum_2_tanh_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_eltwise_sum_2_swish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_eltwise_sum_2_mish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_eltwise_sum_2_sigmoid_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>);
template void biasN_eltwise_sum_2_power_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, View<float>, float, float, float);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,71 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_BLOCK_STRIDE_RANGE_HPP
#define OPENCV_DNN_SRC_CUDA_BLOCK_STRIDE_RANGE_HPP
#include "types.hpp"
#include "index_helpers.hpp"
#include <cuda_runtime.h>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
template <int dim, int BLOCK_SIZE = 0, class index_type = device::index_type, class size_type = device::size_type>
class block_stride_range_generic {
public:
__device__ block_stride_range_generic(index_type to_) : from(0), to(to_) { }
__device__ block_stride_range_generic(index_type from_, index_type to_) : from(from_), to(to_) { }
class iterator
{
public:
__device__ iterator(index_type pos_) : pos(pos_) {}
/* these iterators return the index when dereferenced; this allows us to loop
* through the indices using a range-based for loop
*/
__device__ index_type operator*() const { return pos; }
__device__ iterator& operator++() {
const index_type block_size = BLOCK_SIZE == 0 ? getBlockDim<dim>() : BLOCK_SIZE;
pos += block_size;
return *this;
}
__device__ bool operator!=(const iterator& other) const {
/* NOTE HACK
* 'pos' can move in large steps (see operator++)
* expansion of the range-based for loop uses != as the loop condition
* => operator!= must return false if 'pos' crosses the end
*/
return pos < other.pos;
}
private:
index_type pos;
};
__device__ iterator begin() const {
return iterator(from + getThreadIdx<dim>());
}
__device__ iterator end() const {
return iterator(to);
}
private:
index_type from, to;
};
using block_stride_range_x = block_stride_range_generic<0>;
using block_stride_range_y = block_stride_range_generic<1>;
using block_stride_range_z = block_stride_range_generic<2>;
template <size_type BLOCK_SIZE = 0>
using block_stride_range = block_stride_range_generic<0, BLOCK_SIZE>;
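/* Usage sketch (illustrative only; `data` and `out` are hypothetical device buffers and
 * `out` is assumed to be zero-initialized): every thread of a block visits the indices
 * threadIdx.x, threadIdx.x + blockDim.x, ... up to but excluding `n`.
 *
 *   __global__ void block_row_sum(const float* data, float* out, size_type n)
 *   {
 *       float partial = 0.f;
 *       for (auto i : block_stride_range<>(n))    // strides by blockDim.x
 *           partial += data[blockIdx.x * n + i];
 *       atomicAdd(&out[blockIdx.x], partial);     // combine the per-thread partial sums
 *   }
 */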
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
#endif /* OPENCV_DNN_SRC_CUDA_BLOCK_STRIDE_RANGE_HPP */

View File

@@ -0,0 +1,277 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "array.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "kernel_dispatcher.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/kernels/fill_copy.hpp"
#include <cstddef>
#include <vector>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t N>
__global__ void concat_vec(
Span<T> output, size_type output_axis_size, index_type output_axis_offset,
View<T> input, size_type input_axis_size, size_type concat_size)
{
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
/* we need to copy all the elements of input to some location in the output
* we copy blocks of size `total_concat_size` to some location in the output
*/
const auto total_concat_size = concat_size * input_axis_size;
for (auto in_idx : grid_stride_range(input.size() / vector_type::size())) {
const index_type idx = in_idx * vector_type::size();
const index_type concat_num = idx / total_concat_size;
const index_type concat_index = idx % total_concat_size;
const index_type top_index = concat_index +
(concat_num * output_axis_size + output_axis_offset) * concat_size;
const auto out_idx = top_index / vector_type::size();
vector_type vec;
v_load(vec, input_vPtr[in_idx]);
v_store(output_vPtr[out_idx], vec);
}
}
template <class T, std::size_t Rank>
__global__ void concat_with_offsets(
Span<T> output, array<size_type, Rank> out_strides, array<index_type, Rank> out_offset,
View<T> input, array<size_type, Rank> in_strides)
{
for (auto i : grid_stride_range(input.size())) {
index_type in_index = i / in_strides[0];
index_type out_index = out_offset[0] + in_index;
index_type oidx = out_index * out_strides[0];
for (int j = 1; j < Rank; j++) {
in_index = (i % in_strides[j - 1]) / in_strides[j];
out_index = out_offset[j] + in_index;
oidx += out_index * out_strides[j];
}
output[oidx] = input[i];
}
}
}
template <class T, std::size_t N> static
void launch_vectorized_concat(const Stream& stream,
Span<T> output, size_type output_axis_size, index_type output_axis_offset,
View<T> input, size_type input_axis_size, size_type concat_size)
{
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
/* more assertions are required to fully check for vectorization possibility; check concat() */
auto kernel = raw::concat_vec<T, N>;
auto policy = make_policy(kernel, input.size() / N, 0, stream);
launch_kernel(kernel, policy, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
}
template <class T>
void concat(
const Stream& stream,
TensorSpan<T> output, std::size_t output_axis_offset,
TensorView<T> input, std::size_t axis)
{
CV_Assert(output.rank() == input.rank());
CV_Assert(output_axis_offset < output.get_axis_size(axis));
/* if axes preceding the concat axis are all singleton, the concat blocks are contiguous
* in the output and we can copy each block directly
*/
if (output.size_range(0, axis) == 1)
{
auto stride = output.size_range(axis + 1, output.rank());
auto sliced_output = Span<T>(output.get() + output_axis_offset * stride, input.size());
kernels::copy<T>(stream, sliced_output, input);
return;
}
/* let's call the axis of interest the channel axis for the purposes of the following discussion
* even though it can be any axis
*
* for each batch item:
* we move all the channels of a batch item from the input (where, for that item, they are
* contiguous) to their corresponding contiguous place in the output
*
* for a valid vector operation:
* - the size of each copy block must be aligned
* - input must be aligned
* - all the destination locations in the output must be aligned
*/
std::size_t concat_size = output.size_range(axis + 1, output.rank());
std::size_t input_axis_size = input.get_axis_size(axis);
std::size_t output_axis_size = output.get_axis_size(axis);
std::size_t copy_block_size = concat_size * input_axis_size;
std::size_t copy_block_stride = concat_size * output_axis_size;
std::size_t starting_offset = output_axis_offset * concat_size;
/* in a nutshell, all this concat operation does is copy several blocks of size `copy_block_size`
* to the output starting from `starting_offset` with blocks in the output strided by `copy_block_stride`
*/
bool is_aligned_4 = copy_block_size % 4 == 0 && copy_block_stride % 4 == 0 && starting_offset % 4 == 0;
bool is_aligned_2 = copy_block_size % 2 == 0 && copy_block_stride % 2 == 0 && starting_offset % 2 == 0;
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && is_aligned_4) {
launch_vectorized_concat<T, 4>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && is_aligned_2) {
launch_vectorized_concat<T, 2>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
} else {
launch_vectorized_concat<T, 1>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
}
}
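/* Host-side reference of the strided block copy described above (a minimal sketch, not part
 * of the kernel; `num_blocks` corresponds to input.size() / copy_block_size):
 *
 *   #include <algorithm>
 *   #include <cstddef>
 *   void concat_copy_ref(float* out, const float* in, std::size_t num_blocks,
 *                        std::size_t copy_block_size, std::size_t copy_block_stride,
 *                        std::size_t starting_offset)
 *   {
 *       for (std::size_t b = 0; b < num_blocks; b++)
 *           std::copy_n(in + b * copy_block_size, copy_block_size,
 *                       out + starting_offset + b * copy_block_stride);
 *   }
 */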
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void concat<__half>(const Stream&, TensorSpan<__half>, std::size_t, TensorView<__half>, std::size_t);
#endif
template void concat<float>(const Stream&, TensorSpan<float>, std::size_t, TensorView<float>, std::size_t);
template <class T, std::size_t Rank> static
void launch_concat_with_offsets(
const Stream& stream,
Span<T> output, const std::vector<std::size_t>& outStride, const std::vector<std::size_t>& outOffset,
View<T> input, const std::vector<std::size_t>& inStride)
{
CV_Assert(outStride.size() == Rank);
CV_Assert(outOffset.size() == Rank);
CV_Assert(inStride.size() == Rank);
array<size_type, Rank> outStride_k, inStride_k;
outStride_k.assign(std::begin(outStride), std::end(outStride));
inStride_k.assign(std::begin(inStride), std::end(inStride));
array<index_type, Rank> outOffset_k;
outOffset_k.assign(std::begin(outOffset), std::end(outOffset));
auto kernel = raw::concat_with_offsets<T, Rank>;
auto policy = make_policy(kernel, input.size(), 0, stream);
launch_kernel(kernel, policy, output, outStride_k, outOffset_k, input, inStride_k);
}
GENERATE_KERNEL_DISPATCHER(concat_with_offsets_dispatcher, launch_concat_with_offsets);
template <class T>
void concat_with_offsets(
const Stream& stream,
TensorSpan<T> output, TensorView<T> input,
std::vector<std::size_t> offsets)
{
CV_Assert(output.rank() == input.rank());
CV_Assert(output.rank() == offsets.size());
/* squeezable axes at the beginning of both tensors can be eliminated
*
* Reasoning:
* ----------
* Suppose an item's indices in the input tensor is [i1, i2, ...]. The indices in the output
* tensor will be [i1 + off1, i2 + off2, ...]. The concat operation essentially copies items
* from the input tensor to new locations in the output tensor.
*
* If the size of the first axis of the input and output tensor is unity, the input and output
* indices for all the elements will be of the form [0, i2, ...] and [0, i2 + off2, ...]
* respectively. The first index does not contribute to the element's address calculation and
* hence does nothing apart from eating up a few cycles.
*/
while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
CV_Assert(offsets[0] == 0);
input.squeeze(0);
output.squeeze(0);
offsets.erase(std::begin(offsets));
CV_Assert(output.rank() == input.rank());
CV_Assert(output.rank() == offsets.size());
}
auto inShape = input.shape_as_vector();
auto outShape = output.shape_as_vector();
/* contiguous axes that undergo full copy can be combined into one axis
*
* Reasoning:
* ----------
* Suppose an item's indices in the input tensor is [i1, i2, i3, ...]. Let the first two axes not undergo any
* concatenation. The indices in the output tensor will be [i1, i2, i3 + off3, ...].
*
* Each axis in the contiguous axes sequence will add an offset of iN * strideN. In the above example,
* the two axes add a total offset of `i1 * stride1 + i2 * stride2`. We can merge the two axes into one axis with
* a size of `size1 * size2`. The new offset added will be `i12 * stride2` as the kernel iterates through `i12`.
* Note that `i12` is actually `(i1 * size2 + i2)` in the original tensor.
*/
for (int i = 0; i < inShape.size(); i++) {
/* check if axis `i` requires any slicing */
if (offsets[i] == 0 && inShape[i] == outShape[i]) {
/* loop invariant: `i` is the first axis in the contiguous unsliced axis sequence */
int j = i + 1; /* `j` is the axis which we will attempt to merge */
while (j < inShape.size() && offsets[j] == 0 && inShape[j] == outShape[j]) {
/* `j` axis is also copied fully; merge `i` and `j` */
auto new_size = inShape[i] * inShape[j];
inShape[i] = new_size;
outShape[i] = new_size;
offsets[i] = 0; /* redundant */
/* delete axis `j` */
inShape.erase(std::begin(inShape) + j);
outShape.erase(std::begin(outShape) + j);
offsets.erase(std::begin(offsets) + j);
/* optimizations should not break the invariants */
CV_Assert(inShape.size() == outShape.size());
CV_Assert(inShape.size() == offsets.size());
CV_Assert(inShape[i] == outShape[i]);
CV_Assert(offsets[i] == 0);
}
}
}
auto rank = inShape.size();
std::vector<std::size_t> inStride(rank), outStride(rank);
inStride.back() = 1;
outStride.back() = 1;
/* garbage, ..., garbage, 1 */
std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
/* dim[0], dim[1], ..., dim[-1], 1 */
std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<int>());
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<int>());
/* stride[0], stride[1], ..., stride[-2], 1 */
CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
concat_with_offsets_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, offsets, input, inStride);
}
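/* Worked example of the stride computation above: for an input shape of [2, 3, 4], the copy
 * yields [3, 4, 1] and the reverse partial_sum with multiplication yields [12, 4, 1], i.e.
 * stride[i] is the product of the dimensions following axis i.
 */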
template void concat_with_offsets(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
template void concat_with_offsets(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,171 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "math.hpp"
#include "types.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "memory.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include <opencv2/core.hpp>
#include <cuda_runtime.h>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t CHANNELS_PER_ITER>
__global__ void crop_and_resize(
Span<T> output, size_type out_height, size_type out_width,
View<T> input, size_type in_height, size_type in_width,
View<T> boxes,
size_type num_channels)
{
// input [1, num_channels, in_height, in_width]
// output [boxes, num_channels, out_height, out_width]
const auto in_image_size = in_height * in_width;
const auto out_image_size = out_height * out_width;
const auto out_box_size = num_channels * out_image_size;
/* we have to compute the output value for every combination of (box, c, y, x) in the output
*
* the computation involving (y, x) are identical for all non-spatial dimensions
* the computation and memory requests involving the box are identical for remaining three axes
*
* we process multiple channels every iteration to reuse the identical computation
* and memory requests involved with the box and spatial dimensions
*/
/*
* if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need
* (num_channels / CHANNELS_PER_ITER) iterations per (box, x, y)
*/
auto num_channel_iters_per_box_xy = num_channels / CHANNELS_PER_ITER;
/* we need `num_channel_iters_per_box_xy` iterations per (box, x, y) and there are
* `num_boxes` boxes and `out_image_size` combinations of (x, y)
*/
auto num_boxes = boxes.size() / 7; /* 7 values per box */
auto iters_per_box = num_channel_iters_per_box_xy * out_image_size;
auto iters_required = num_boxes * iters_per_box;
for (auto iter : grid_stride_range(iters_required)) {
const index_type box_no = iter / iters_per_box;
const index_type c_start = ((iter % iters_per_box) / out_image_size) * CHANNELS_PER_ITER;
/* note here that consecutive `iter` values will often have consecutive `x` values
* => stores into output will be coalesced across threads
*/
const index_type y = (iter % out_image_size) / out_width;
const index_type x = iter % out_width;
const index_type box_offset = box_no * 7;
const auto left = boxes[box_offset + 3],
top = boxes[box_offset + 4],
right = boxes[box_offset + 5],
bottom = boxes[box_offset + 6];
const auto box_width = right - left;
const auto box_height = bottom - top;
const auto o2i_fy = static_cast<T>(in_height - 1) / static_cast<T>(out_height - 1);
const auto o2i_fx = static_cast<T>(in_width - 1) / static_cast<T>(out_width - 1);
const auto height_scale = box_height * o2i_fy;
const auto width_scale = box_width * o2i_fx;
const auto in_y = top * static_cast<T>(in_height - 1) + static_cast<T>(y) * height_scale;
const auto in_x = left * static_cast<T>(in_width - 1) + static_cast<T>(x) * width_scale;
const auto in_y0 = static_cast<index_type>(in_y);
const auto in_x0 = static_cast<index_type>(in_x);
using device::min;
const auto in_x1 = min<index_type>(in_x0 + 1, in_width - 1);
const auto in_y1 = min<index_type>(in_y0 + 1, in_height - 1);
index_type in_offset_r0 = c_start * in_image_size + in_y0 * in_width;
index_type in_offset_r1 = c_start * in_image_size + in_y1 * in_width;
index_type out_idx = box_no * out_box_size + c_start * out_image_size + y * out_width + x;
#pragma unroll 1 /* disable unrolling */
for (int i = 0; i < CHANNELS_PER_ITER; i++) {
auto v_00 = load_ldg(input[in_offset_r0 + in_x0]),
v_01 = load_ldg(input[in_offset_r0 + in_x1]),
v_10 = load_ldg(input[in_offset_r1 + in_x0]),
v_11 = load_ldg(input[in_offset_r1 + in_x1]);
output[out_idx] =
v_00 +
T(in_y - T(in_y0)) * T(v_10 - v_00) +
T(in_x - T(in_x0)) * T(v_01 - v_00) +
T(in_y - T(in_y0)) * T(in_x - T(in_x0)) * T(v_11 - v_01 - v_10 + v_00);
in_offset_r0 += in_image_size;
in_offset_r1 += in_image_size;
out_idx += out_image_size;
}
}
}
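/* For reference, the expression above is ordinary bilinear interpolation: with
 * dy = in_y - in_y0 and dx = in_x - in_x0, it is algebraically equal to
 * (1 - dy) * (1 - dx) * v_00 + (1 - dy) * dx * v_01 + dy * (1 - dx) * v_10 + dy * dx * v_11.
 */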
}
template <class T, std::size_t CHANNELS_PER_ITER> static
void launch_multichannel_crop_and_resize(const Stream& stream,
Span<T> output, size_type out_height, size_type out_width,
View<T> input, size_type in_height, size_type in_width,
View<T> boxes, size_type num_channels)
{
auto kernel = raw::crop_and_resize<T, CHANNELS_PER_ITER>;
auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
}
template <class T>
void crop_and_resize(const Stream& stream, TensorSpan<T> output, TensorView<T> input, View<T> boxes) {
CV_Assert(input.get_axis_size(0) == 1); /* batch not supported */
CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
auto out_height = output.get_axis_size(-2);
auto out_width = output.get_axis_size(-1);
auto in_height = input.get_axis_size(-2);
auto in_width = input.get_axis_size(-1);
auto num_channels = input.get_axis_size(1);
if (num_channels % 64 == 0) {
launch_multichannel_crop_and_resize<T, 64>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
} else if (num_channels % 32 == 0) {
launch_multichannel_crop_and_resize<T, 32>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
} else if (num_channels % 16 == 0) {
launch_multichannel_crop_and_resize<T, 16>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
} else if (num_channels % 8 == 0) {
launch_multichannel_crop_and_resize<T, 8>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
} else if (num_channels % 4 == 0) {
launch_multichannel_crop_and_resize<T, 4>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
} else if (num_channels % 2 == 0) {
launch_multichannel_crop_and_resize<T, 2>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
} else {
launch_multichannel_crop_and_resize<T, 1>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void crop_and_resize<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, View<__half> boxes);
#endif
template void crop_and_resize<float>(const Stream&, TensorSpan<float>, TensorView<float>, View<float> boxes);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,897 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "math.hpp"
#include "bbox_utils.hpp"
#include "grid_stride_range.hpp"
#include "block_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"
#include "memory.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, bool SHARE_LOCATION, bool VARIANCE_ENCODED_IN_TARGET, bool CORNER_TRUE_CENTER_FALSE, bool CLIP_BBOX>
__global__ void decode_bbox(Span<T> decoded_bboxes, View<T> locations, View<T> priors,
bool transpose_location, bool normalized_bbox,
size_type num_loc_classes, index_type background_class_id,
float clip_width, float clip_height)
{
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
// locations: [batch_size, num_priors, num_loc_classes, 4]
// priors: [1, C, num_priors, 4]
// C = 2 if !VARIANCE_ENCODED_IN_TARGET; otherwise, 1
/* 4 bbox values + 4 variance values per prior */
constexpr int PRIOR_BOX_SIZE = VARIANCE_ENCODED_IN_TARGET ? 4 : 8;
const size_type num_priors = priors.size() / PRIOR_BOX_SIZE;
using vector_type = get_vector_type_t<T, 4>;
auto locations_vPtr = vector_type::get_pointer(locations.data());
auto priors_vPtr = vector_type::get_pointer(priors.data());
auto decoded_bboxes_vPtr = vector_type::get_pointer(decoded_bboxes.data());
const auto boxes_per_batch = num_priors * num_loc_classes;
for (auto idx : grid_stride_range(decoded_bboxes.size() / 4))
{
index_type p;
index_type c;
if (SHARE_LOCATION)
{
// locations are shared across all classes => num_loc_classes = 1
p = idx % boxes_per_batch;
c = 0;
}
else
{
p = (idx % boxes_per_batch) / num_loc_classes;
c = idx % num_loc_classes;
}
if (!SHARE_LOCATION && c == background_class_id)
continue;
BoundingBox bbox;
{
vector_type location;
v_load(location, locations_vPtr[idx]);
if (transpose_location)
{
bbox.ymin = location.data[0];
bbox.xmin = location.data[1];
bbox.ymax = location.data[2];
bbox.xmax = location.data[3];
}
else
{
bbox.xmin = location.data[0];
bbox.ymin = location.data[1];
bbox.xmax = location.data[2];
bbox.ymax = location.data[3];
}
}
if (!VARIANCE_ENCODED_IN_TARGET)
{
vector_type prior_variance;
v_load_ldg(prior_variance, priors_vPtr[num_priors + p]);
bbox.xmin *= static_cast<float>(prior_variance.data[0]);
bbox.ymin *= static_cast<float>(prior_variance.data[1]);
bbox.xmax *= static_cast<float>(prior_variance.data[2]);
bbox.ymax *= static_cast<float>(prior_variance.data[3]);
}
BoundingBox prior;
{
vector_type prior_box;
v_load_ldg(prior_box, priors_vPtr[p]);
prior.xmin = prior_box.data[0];
prior.ymin = prior_box.data[1];
prior.xmax = prior_box.data[2];
prior.ymax = prior_box.data[3];
}
BoundingBox decoded_bbox;
if (CORNER_TRUE_CENTER_FALSE)
{
decoded_bbox.xmin = prior.xmin + bbox.xmin;
decoded_bbox.ymin = prior.ymin + bbox.ymin;
decoded_bbox.xmax = prior.xmax + bbox.xmax;
decoded_bbox.ymax = prior.ymax + bbox.ymax;
}
else
{
auto prior_width = prior.xmax - prior.xmin;
auto prior_height = prior.ymax - prior.ymin;
if (!normalized_bbox)
{
prior_width += 1;
prior_height += 1;
}
auto prior_center_x = prior.xmin + prior_width * 0.5f;
auto prior_center_y = prior.ymin + prior_height * 0.5f;
auto decode_bbox_center_x = bbox.xmin * prior_width + prior_center_x;
auto decode_bbox_center_y = bbox.ymin * prior_height + prior_center_y;
using device::exp;
float decode_bbox_width = exp(bbox.xmax) * prior_width;
float decode_bbox_height = exp(bbox.ymax) * prior_height;
decoded_bbox.xmin = decode_bbox_center_x - decode_bbox_width * 0.5f;
decoded_bbox.ymin = decode_bbox_center_y - decode_bbox_height * 0.5f;
decoded_bbox.xmax = decode_bbox_center_x + decode_bbox_width * 0.5f;
decoded_bbox.ymax = decode_bbox_center_y + decode_bbox_height * 0.5f;
}
vector_type decoded_bbox_vec;
if (CLIP_BBOX)
{
decoded_bbox_vec.data[0] = clamp(decoded_bbox.xmin, 0.0f, clip_width);
decoded_bbox_vec.data[1] = clamp(decoded_bbox.ymin, 0.0f, clip_height);
decoded_bbox_vec.data[2] = clamp(decoded_bbox.xmax, 0.0f, clip_width);
decoded_bbox_vec.data[3] = clamp(decoded_bbox.ymax, 0.0f, clip_height);
}
else
{
decoded_bbox_vec.data[0] = decoded_bbox.xmin;
decoded_bbox_vec.data[1] = decoded_bbox.ymin;
decoded_bbox_vec.data[2] = decoded_bbox.xmax;
decoded_bbox_vec.data[3] = decoded_bbox.ymax;
}
v_store(decoded_bboxes_vPtr[idx], decoded_bbox_vec);
}
}
template <class T, int BINS, int BLOCK_SIZE>
__launch_bounds__(BLOCK_SIZE)
__global__ void findTopK(Span<int> indices_, Span<int> count_, View<T> scores_, float threshold, size_type classwise_topK, size_type num_classes, size_type num_priors, index_type background_class_id)
{
/* We need to sort boxes based on their confidence scores. The confidence scores fall in
* the range [0.0, 1.0]. We break the range into bins and perform count sort. This is an
* approximate algorithm.
*
* Each block handles a particular class of a particular batch item.
*/
const auto c = blockIdx.x;
const auto b = blockIdx.y;
if (c == background_class_id)
return;
// indices: [batch_size, num_classes, classwise_topK]
// count: [batch_size, num_classes]
// scores: [batch_size, num_classes, num_priors]
auto count = count_.data() + b * num_classes + c;
auto scores = scores_.data() + (b * num_classes + c) * num_priors;
auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
/* We do not require a large number of bins to find the top K confidence scores. We will use
* a reasonable number of bins which will fit in the shared memory.
*
* Note that smaller scores will have a smaller index, i.e. the `bins` are ordered in
* ascending order.
*/
__shared__ int bins[BINS];
#pragma unroll
for (int unroll = 0; unroll < BINS / BLOCK_SIZE; unroll++)
bins[unroll * BLOCK_SIZE + threadIdx.x] = 0;
__syncthreads();
for (auto i : block_stride_range<BLOCK_SIZE>(num_priors))
{
const float confidence = load_ldg(scores[i]);
if (confidence > threshold)
{
using device::fast_divide_ftz;
auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);
using device::clamp;
int bin_index = conf_scaled * BINS;
/* We store counts of confidence scores in the bins. Our ultimate goal is to store the indices
* of the `classwise_topK` confidence values in the `indices` array.
*
* We use a little trick to parallelize the process of filling up the `indices` array.
* We want every thread in the block to participate in the process. To do so, we want the
* bins array to be shifted by one place to the left. We will be computing the suffix sum
* of the bins array later. Details and reasons for doing so will be explained later.
*/
bin_index = clamp<int>(bin_index, 0, BINS - 1) - 1; // shift left by one
if (bin_index >= 0)
atomicAdd(&bins[bin_index], 1);
}
}
__syncthreads();
constexpr int WARP_SIZE = 32; /* must be equal to warpSize */
// FORWARD_COMPATIBILITY_TAG: WARP_SIZE_DEPENDENT_CODE
if (threadIdx.x < WARP_SIZE)
{
/* We can compute suffix sum of an array in groups of N numbers.
* Let N be 4 for this example.
*
* 1) Last 4 numbers
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
* group suffix sum: 42 33 23 12
*
* 2) Middle 4 numbers
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
* group suffix sum: | 26 21 15 8 |
*
* We add `42` (first element in the previous group) to each element to get:
*
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
* | 68 63 57 50 | 42 33 23 12
* 3) First 4 numbers
*
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
* group suffix sum: 10 9 7 4 |
*
* We add `68` (first element in the previous group) to each element to get:
*
* 1 2 3 4 | 5 6 7 8 | 9 10 11 12
* group suffix sum: 78 77 75 72 | 68 63 57 50 | 42 33 23 12
*
* What we are left with now is the suffix sum of the entire array.
*
* We use the aforementioned logic in the code below but work in groups of `warpSize`.
*/
/* We calculate suffix sums WARP_SIZE elements at a time starting from the right end.
* Hence, we will need BINS / WARP_SIZE number of iterations.
*
* Each iteration uses shuffle instructions to exchange data between threads. Shuffle
* instructions cannot be used in warp-divergent code. If the bins are a multiple of
* the warpSize, all the threads in the warp will participate.
*/
static_assert(BINS % WARP_SIZE == 0, "number of bins must be a multiple of warp size");
const int thread_id = threadIdx.x;
const int inverse_lane_id = WARP_SIZE - thread_id - 1;
int previous_group_first_element = 0;
for (int iter = BINS / WARP_SIZE - 1; iter >= 0; iter--)
{
const index_type idx = iter * WARP_SIZE + thread_id;
auto value = bins[idx];
for (int i = 1; i < WARP_SIZE; i *= 2)
{
auto n = __shfl_down_sync(0xFFFFFFFF, value, i);
if (inverse_lane_id >= i)
value += n;
}
value += previous_group_first_element;
bins[idx] = value;
previous_group_first_element = __shfl_sync(0xFFFFFFFF, value, 0);
}
}
if (threadIdx.x == 0)
*count = 0;
__syncthreads();
for (auto i : block_stride_range<BLOCK_SIZE>(num_priors))
{
const float confidence = load_ldg(scores[i]);
if (confidence > threshold)
{
using device::fast_divide_ftz;
auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);
int bin_index = conf_scaled * BINS;
bin_index = clamp<int>(bin_index, 0, BINS - 1);
/* This bounding box is eligible for selection unless its position falls outside the
* `classwise_topK`. To determine that, we compute the location where it would be stored.
*
* Suppose we had just 4 bins and say the following were the counts:
* BIN0 2
* BIN1 1
* BIN2 3
* BIN3 0 (last bin is always zero as we shift left by one while populating the bins)
*
* We will try our best to store the boxes in a sorted order in the `indices` array.
* This requires that the boxes in later bins (higher confidence scores) must be
* stored earlier.
*
* We compute the suffix sum of the array. This gives us:
* BIN0 6
* BIN1 4
* BIN2 3
* BIN3 0
*
* The bins now give us the location in the `indices` array from which the indices of the
* scores corresponding to that bin would be stored. We atomically increment the bin count
* every time we store a box corresponding to that bin. Therefore, the value in the bins
* gives the index in the `indices` array where the next box corresponding to that bin must
* be put.
*/
const index_type idx = atomicAdd(&bins[bin_index], 1);
if (idx < classwise_topK)
{
indices[idx] = i;
atomicAdd(&count[0], 1);
}
}
}
}
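/* Host-side sketch of the bin-based approximate top-K selection used above (illustrative
 * only; single-threaded and for a single class, with scores in [0, 1]):
 *
 *   #include <algorithm>
 *   #include <vector>
 *   std::vector<int> approx_top_k(const std::vector<float>& scores, float threshold, int K)
 *   {
 *       constexpr int BINS = 2048;
 *       auto bin_of = [&](float s) {
 *           int b = static_cast<int>((s - threshold) / (1.f - threshold) * BINS);
 *           return std::min(std::max(b, 0), BINS - 1);
 *       };
 *       std::vector<int> bins(BINS, 0);
 *       for (float s : scores)                   // count, shifted left by one bin
 *           if (s > threshold && bin_of(s) >= 1) bins[bin_of(s) - 1]++;
 *       for (int i = BINS - 2; i >= 0; i--)      // suffix sum
 *           bins[i] += bins[i + 1];
 *       std::vector<int> indices(std::max(K, 0), -1);
 *       int count = 0;
 *       for (int i = 0; i < static_cast<int>(scores.size()); i++)
 *           if (scores[i] > threshold) {
 *               const int pos = bins[bin_of(scores[i])]++;  // next free slot for this bin
 *               if (pos < K) { indices[pos] = i; count++; }
 *           }
 *       indices.resize(count);                   // approximately sorted by descending score
 *       return indices;
 *   }
 */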
template <class T>
__global__ void box_collect(Span<T> collected_bboxes_, View<T> decoded_bboxes_, View<int> indices_, View<int> count_, bool share_location, size_type num_priors, size_type num_classes, size_type classwise_topK, index_type background_class_id)
{
const index_type c = blockIdx.x;
if (c == background_class_id)
return;
const index_type b = blockIdx.y;
// collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
// indices: [batch_size, num_classes, classwise_topK]
// count: [batch_size, num_classes]
const auto num_loc_classes = share_location ? 1 : num_classes;
auto collected_bboxes = collected_bboxes_.data() + (b * num_classes + c) * classwise_topK * 4;
auto decoded_bboxes = decoded_bboxes_.data() + b * num_priors * num_loc_classes * 4;
auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
auto count = count_.data() + b * num_classes + c;
const auto boxes = load_ldg(&count[0]);
if (boxes == 0)
return;
using vector_type = get_vector_type_t<T, 4>;
auto decoded_bboxes_vPtr = vector_type::get_pointer(decoded_bboxes);
auto collected_bboxes_vPtr = vector_type::get_pointer(collected_bboxes);
for (auto i : block_stride_range<>(boxes))
{
const auto prior_id = indices[i];
const index_type idx = share_location ? prior_id : (prior_id * num_classes + c);
vector_type box;
v_load(box, decoded_bboxes_vPtr[idx]);
v_store(collected_bboxes_vPtr[i], box);
}
}
template <class T, bool NORMALIZED_BBOX>
__global__ void blockwise_class_nms(Span<int> indices_, Span<int> count_, View<T> collected_bboxes_, size_type num_classes, size_type classwise_topK, index_type background_class_id, float nms_threshold)
{
const index_type b = blockIdx.x / num_classes;
const index_type c = blockIdx.x % num_classes;
if (c == background_class_id)
return;
// indices: [batch_size, num_classes, classwise_topK]
// count: [batch_size, num_classes]
// collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
auto count = count_.data() + b * num_classes + c;
auto collected_bboxes = collected_bboxes_.data() + (b * num_classes + c) * classwise_topK * 4;
const auto boxes = count[0];
if (boxes == 0)
return;
using vector_type = get_vector_type_t<T, 4>;
auto collected_bboxes_vPtr = vector_type::get_pointer(collected_bboxes);
for (int i = 0; i < boxes; i++)
{
auto prior_id = indices[i];
if (prior_id != -1)
{
BoundingBox bbox1;
{
vector_type box;
v_load(box, collected_bboxes_vPtr[i]);
bbox1.xmin = box.data[0];
bbox1.ymin = box.data[1];
bbox1.xmax = box.data[2];
bbox1.ymax = box.data[3];
}
for (auto j : block_stride_range<>(i + 1, boxes))
{
prior_id = indices[j];
if (prior_id == -1)
continue;
BoundingBox bbox2;
{
vector_type box;
v_load_ldg(box, collected_bboxes_vPtr[j]);
bbox2.xmin = box.data[0];
bbox2.ymin = box.data[1];
bbox2.xmax = box.data[2];
bbox2.ymax = box.data[3];
}
using device::min;
using device::max;
BoundingBox intersect_bbox;
intersect_bbox.xmin = max(bbox1.xmin, bbox2.xmin);
intersect_bbox.ymin = max(bbox1.ymin, bbox2.ymin);
intersect_bbox.xmax = min(bbox1.xmax, bbox2.xmax);
intersect_bbox.ymax = min(bbox1.ymax, bbox2.ymax);
float intersect_size = compute_bbox_size<NORMALIZED_BBOX>(intersect_bbox);
float bbox1_size = compute_bbox_size<NORMALIZED_BBOX>(bbox1);
float bbox2_size = compute_bbox_size<NORMALIZED_BBOX>(bbox2);
using device::fast_divide_ftz;
float iou = fast_divide_ftz(intersect_size, bbox1_size + bbox2_size - intersect_size);
if (iou > nms_threshold)
indices[j] = -1;
}
}
__syncthreads();
}
if (threadIdx.x == 0)
count[0] = 0;
__syncthreads();
for (auto i : block_stride_range<>(boxes))
{
auto prior_id = indices[i];
if(prior_id != -1)
{
const index_type idx = atomicAdd(&count[0], 1);
indices[idx] = prior_id;
}
}
}
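/* Host-side reference of the greedy NMS performed above (a minimal sketch; assumes the boxes
 * are already sorted by descending score and that BoundingBox is a plain struct with float
 * xmin/ymin/xmax/ymax members, as used in this file):
 *
 *   #include <algorithm>
 *   #include <cstddef>
 *   #include <vector>
 *   static float iou_ref(const BoundingBox& a, const BoundingBox& b)   // normalized boxes
 *   {
 *       const float w = std::max(0.f, std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin));
 *       const float h = std::max(0.f, std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin));
 *       const float inter = w * h;
 *       const float area_a = (a.xmax - a.xmin) * (a.ymax - a.ymin);
 *       const float area_b = (b.xmax - b.xmin) * (b.ymax - b.ymin);
 *       return inter / (area_a + area_b - inter);
 *   }
 *   static std::vector<int> nms_ref(const std::vector<BoundingBox>& boxes, float nms_threshold)
 *   {
 *       std::vector<int> keep;
 *       std::vector<bool> suppressed(boxes.size(), false);
 *       for (std::size_t i = 0; i < boxes.size(); i++) {
 *           if (suppressed[i]) continue;
 *           keep.push_back(static_cast<int>(i));
 *           for (std::size_t j = i + 1; j < boxes.size(); j++)
 *               if (iou_ref(boxes[i], boxes[j]) > nms_threshold) suppressed[j] = true;
 *       }
 *       return keep;
 *   }
 */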
template <class T, std::size_t BINS, int BLOCK_SIZE>
__launch_bounds__(BLOCK_SIZE)
__global__ void nms_collect(
Span<int> kept_indices, Span<int> kept_count, View<int> indices_, View<int> count, View<T> scores_, float threshold,
size_type num_classes, size_type num_priors, size_type classwise_topK, size_type keepTopK, index_type background_class_id)
{
// sorting algorithm is documented in detail in findTopK kernel comments
// no explanations are provided here
// kept_indices: [batch_size, keepTopK]
// kept_count: [batch_size]
const auto b = blockIdx.x;
__shared__ int bins[BINS];
#pragma unroll
for (int unroll = 0; unroll < BINS / BLOCK_SIZE; unroll++)
bins[unroll * BLOCK_SIZE + threadIdx.x] = 0;
__syncthreads();
for (int c = 0; c < num_classes; c++)
{
if (c == background_class_id)
continue;
// indices: [batch_size, num_classes, classwise_topK]
// count: [batch_size, num_classes]
// scores: [batch_size, num_classes, num_priors]
const auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
const auto scores = scores_.data() + (b * num_classes + c) * num_priors;
auto boxes = count[b * num_classes + c];
for (auto i : block_stride_range<BLOCK_SIZE>(boxes))
{
auto prior_id = indices[i];
const float confidence = load_ldg(scores[prior_id]);
if (confidence > threshold)
{
using device::fast_divide_ftz;
auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);
using device::clamp;
int bin_index = conf_scaled * BINS;
bin_index = clamp<int>(bin_index, 0, BINS - 1) - 1; // shift left by one
if (bin_index >= 0)
atomicAdd(&bins[bin_index], 1);
}
}
}
__syncthreads();
constexpr int WARP_SIZE = 32; /* must be equal to warpSize */
// FORWARD_COMPATIBILITY_TAG: WARP_SIZE_DEPENDENT_CODE
if (threadIdx.x < WARP_SIZE)
{
static_assert(BINS % WARP_SIZE == 0, "number of bins must be a multiple of warp size");
const int thread_id = threadIdx.x;
const int inverse_lane_id = WARP_SIZE - thread_id - 1;
int previous_group_first_element = 0;
for (int iter = BINS / WARP_SIZE - 1; iter >= 0; iter--)
{
const index_type idx = iter * WARP_SIZE + thread_id;
auto value = bins[idx];
for (int i = 1; i < WARP_SIZE; i *= 2)
{
auto n = __shfl_down_sync(0xFFFFFFFF, value, i);
if (inverse_lane_id >= i)
value += n;
}
value += previous_group_first_element;
bins[idx] = value;
previous_group_first_element = __shfl_sync(0xFFFFFFFF, value, 0);
}
}
if (threadIdx.x == 0)
kept_count[b] = 0;
__syncthreads();
for (int c = 0; c < num_classes; c++)
{
if (c == background_class_id)
continue;
const auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
const auto scores = scores_.data() + (b * num_classes + c) * num_priors;
auto boxes = count[b * num_classes + c];
for (auto i : block_stride_range<BLOCK_SIZE>(boxes))
{
auto prior_id = indices[i];
const float confidence = load_ldg(scores[prior_id]);
if (confidence > threshold)
{
using device::fast_divide_ftz;
auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);
using device::clamp;
int bin_index = conf_scaled * BINS;
bin_index = clamp<int>(bin_index, 0, BINS - 1);
const index_type idx = atomicAdd(&bins[bin_index], 1);
if (idx < keepTopK)
{
kept_indices[b * keepTopK + idx] = c * num_priors + prior_id;
atomicAdd(&kept_count[b], 1);
}
}
}
}
}
template <class T>
__global__ void consolidate_detections(Span<T> output,
View<int> kept_indices, View<int> kept_count, View<T> decoded_bboxes, View<T> scores, bool share_location,
size_type batch_size, size_type num_classes, size_type num_priors, size_type keepTopK, DevicePtr<int> num_detections)
{
using vector_type = get_vector_type_t<T, 4>;
auto decoded_bboxes_vPtr = vector_type::get_pointer(decoded_bboxes.data());
// output: [1, 1, batch_size * keepTopK, 7]
// kept_indices: [batch_size, keepTopK]
// kept_count: [batch_size]
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
// scores: [batch_size, num_classes, num_priors]
for (int b = 0; b < batch_size; b++)
{
for (auto i : grid_stride_range(kept_count[b]))
{
auto score_id = kept_indices[b * keepTopK + i];
auto c = score_id / num_priors;
auto prior_id = score_id % num_priors;
const auto confidence = scores[b * num_classes * num_priors + score_id];
index_type bbox_id;
if (share_location)
{
// decoded_bboxes: [batch_size, num_priors, 1, 4]
bbox_id = b * num_priors + prior_id;
}
else
{
// decoded_bboxes: [batch_size, num_priors, num_classes, 4]
bbox_id = (b * num_priors + prior_id) * num_classes + c;
}
vector_type bbox;
v_load(bbox, decoded_bboxes_vPtr[bbox_id]);
auto output_id = atomicAdd(num_detections.get(), 1);
output[output_id * 7 + 0] = b;
output[output_id * 7 + 1] = c;
output[output_id * 7 + 2] = confidence;
output[output_id * 7 + 3] = bbox.data[0];
output[output_id * 7 + 4] = bbox.data[1];
output[output_id * 7 + 5] = bbox.data[2];
output[output_id * 7 + 6] = bbox.data[3];
}
}
}
}
template <class T, bool SHARE_LOCATION, bool VARIANCE_ENCODED_IN_TARGET, bool CORNER_TRUE_CENTER_FALSE, bool CLIP_BBOX> static
void launch_decode_boxes_kernel(const Stream& stream, Span<T> decoded_bboxes, View<T> locations, View<T> priors,
bool transpose_location, bool normalized_bbox,
size_type num_loc_classes, index_type background_class_id,
float clip_width, float clip_height)
{
auto kernel = raw::decode_bbox<T, SHARE_LOCATION, VARIANCE_ENCODED_IN_TARGET, CORNER_TRUE_CENTER_FALSE, CLIP_BBOX>;
auto policy = make_policy(kernel, decoded_bboxes.size() / 4, 0, stream);
launch_kernel(kernel, policy, decoded_bboxes, locations, priors, transpose_location, normalized_bbox, num_loc_classes, background_class_id, clip_width, clip_height);
}
template <class T, unsigned int current, class ...Args> static
typename std::enable_if<current == 0, void>
::type dispatch_decode_bboxes(int selector, Args&& ...args) {
if(selector == 0)
launch_decode_boxes_kernel<T, 0, 0, 0, 0>(std::forward<Args>(args)...);
}
template <class T, unsigned int current, class ...Args> static
typename std::enable_if<current != 0, void>
::type dispatch_decode_bboxes(int selector, Args&& ...args) {
if(selector == current)
launch_decode_boxes_kernel<T,
static_cast<bool>(current & 8),
static_cast<bool>(current & 4),
static_cast<bool>(current & 2),
static_cast<bool>(current & 1)>(std::forward<Args>(args)...);
else
dispatch_decode_bboxes<T, current - 1, Args...>(selector, std::forward<Args>(args)...);
}
template <class T>
void decode_bboxes(const Stream& stream, Span<T> output, View<T> locations, View<T> priors,
std::size_t num_loc_classes,
bool share_location, std::size_t background_class_id,
bool transpose_location, bool variance_encoded_in_target,
bool corner_true_or_center_false, bool normalized_bbox,
bool clip_box, float clip_width, float clip_height)
{
/* `config` packs the four boolean kernel template options into one number so that a bit of TMP code can
* run through all possible combinations and instantiate the correct template
*/
unsigned int config = (share_location << 3 | variance_encoded_in_target << 2 | corner_true_or_center_false << 1 | clip_box);
dispatch_decode_bboxes<T, 15>(config, stream, output, locations, priors, transpose_location, normalized_bbox, num_loc_classes, background_class_id, clip_width, clip_height);
}
template void decode_bboxes(const Stream&, Span<__half>, View<__half>, View<__half>, std::size_t, bool, std::size_t, bool, bool, bool, bool, bool, float, float);
template void decode_bboxes(const Stream&, Span<float>, View<float>, View<float>, std::size_t, bool, std::size_t, bool, bool, bool, bool, bool, float, float);
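/* Example of the bit packing above (illustrative): share_location = true with the other three
 * flags false gives config = (1 << 3) = 8, so dispatch_decode_bboxes instantiates
 * launch_decode_boxes_kernel<T, true, false, false, false>.
 */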
template <class T>
void findTopK(const Stream& stream, TensorSpan<int> indices, TensorSpan<int> count, TensorView<T> scores, std::size_t background_class_id, float threshold)
{
// indices: [batch_size, num_classes, classwise_topK]
// count: [batch_size, num_classes]
// scores: [batch_size, num_classes, num_priors]
const auto batch_size = indices.get_axis_size(0);
CV_Assert(count.get_axis_size(0) == batch_size);
CV_Assert(scores.get_axis_size(0) == batch_size);
const auto num_classes = indices.get_axis_size(1);
CV_Assert(count.get_axis_size(1) == num_classes);
CV_Assert(scores.get_axis_size(1) == num_classes);
const auto classwise_topK = indices.get_axis_size(2);
const auto num_priors = scores.get_axis_size(2);
/* each block processes one class from each batch */
constexpr auto BLOCK_SIZE = 256;
dim3 grid_size(num_classes, batch_size);
dim3 block_size(BLOCK_SIZE);
auto policy = execution_policy(grid_size, block_size, stream);
auto kernel = raw::findTopK<T, 2048, BLOCK_SIZE>;
launch_kernel(kernel, policy, indices, count, scores, threshold, classwise_topK, num_classes, num_priors, background_class_id);
}
template void findTopK(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<__half>, std::size_t, float);
template void findTopK(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<float>, std::size_t, float);
template <class T>
void box_collect(const Stream& stream, TensorSpan<T> collected_bboxes, TensorView<T> decoded_bboxes, TensorView<int> indices, TensorView<int> count, bool share_location, std::size_t background_class_id)
{
// collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
// indices: [batch_size, num_classes, classwise_topK]
// count: [batch_size, num_classes]
const auto batch_size = collected_bboxes.get_axis_size(0);
CV_Assert(decoded_bboxes.get_axis_size(0) == batch_size);
CV_Assert(indices.get_axis_size(0) == batch_size);
CV_Assert(count.get_axis_size(0) == batch_size);
const auto num_classes = collected_bboxes.get_axis_size(1);
CV_Assert(indices.get_axis_size(1) == num_classes);
CV_Assert(count.get_axis_size(1) == num_classes);
const auto classwise_topK = collected_bboxes.get_axis_size(2);
CV_Assert(indices.get_axis_size(2) == classwise_topK);
const auto num_priors = decoded_bboxes.get_axis_size(1);
CV_Assert(!share_location || decoded_bboxes.get_axis_size(2) == 1);
constexpr int BLOCK_SIZE = 256;
/* each block processes one class from each batch */
dim3 grid_size(num_classes, batch_size);
dim3 block_size(BLOCK_SIZE);
auto policy = execution_policy(grid_size, block_size, stream);
auto kernel = raw::box_collect<T>;
launch_kernel(kernel, policy, collected_bboxes, decoded_bboxes, indices, count, share_location, num_priors, num_classes, classwise_topK, background_class_id);
}
template void box_collect(const Stream&, TensorSpan<float>, TensorView<float>, TensorView<int>, TensorView<int>, bool, std::size_t);
template void box_collect(const Stream&, TensorSpan<__half>, TensorView<__half>, TensorView<int>, TensorView<int>, bool, std::size_t);
template <class T>
void blockwise_class_nms(const Stream& stream, TensorSpan<int> indices, TensorSpan<int> count, TensorView<T> collected_bboxes,
bool normalized_bbox, std::size_t background_class_id, float nms_threshold)
{
// indices: [batch_size, num_classes, classwise_topK]
// count: [batch_size, num_classes]
// collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
const auto batch_size = indices.get_axis_size(0);
CV_Assert(count.get_axis_size(0) == batch_size);
CV_Assert(collected_bboxes.get_axis_size(0) == batch_size);
const auto num_classes = indices.get_axis_size(1);
CV_Assert(count.get_axis_size(1) == num_classes);
CV_Assert(collected_bboxes.get_axis_size(1) == num_classes);
const auto classwise_topK = indices.get_axis_size(2);
CV_Assert(collected_bboxes.get_axis_size(2) == classwise_topK);
/* each block processes one class from each batch */
auto num_blocks = batch_size * num_classes;
auto num_threads = std::max<std::size_t>(std::min<std::size_t>(1024, classwise_topK), 32);
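/* editor's note: this clamps the per-class thread count to at least one warp (32) and at most one full block (1024) */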
dim3 grid_size(num_blocks);
dim3 block_size(num_threads);
auto policy = execution_policy(grid_size, block_size, stream);
if (normalized_bbox)
{
auto kernel = raw::blockwise_class_nms<T, true>;
launch_kernel(kernel, policy, indices, count, collected_bboxes, num_classes, classwise_topK, background_class_id, nms_threshold);
}
else
{
auto kernel = raw::blockwise_class_nms<T, false>;
launch_kernel(kernel, policy, indices, count, collected_bboxes, num_classes, classwise_topK, background_class_id, nms_threshold);
}
}
template void blockwise_class_nms(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<__half>, bool, std::size_t, float);
template void blockwise_class_nms(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<float>, bool, std::size_t, float);
template <class T>
void nms_collect(const Stream& stream, TensorSpan<int> kept_indices, TensorSpan<int> kept_count,
TensorView<int> indices, TensorView<int> count, TensorView<T> scores, float threshold, std::size_t background_class_id)
{
// kept_indices: [batch_size, keepTopK]
// kept_count: [batch_size]
// indices: [batch_size, num_classes, classwise_topK]
// count: [batch_size, num_classes]
// scores: [batch_size, num_classes, num_priors]
auto batch_size = kept_indices.get_axis_size(0);
CV_Assert(kept_count.get_axis_size(0) == batch_size);
CV_Assert(indices.get_axis_size(0) == batch_size);
CV_Assert(count.get_axis_size(0) == batch_size);
CV_Assert(scores.get_axis_size(0) == batch_size);
auto keepTopK = kept_indices.get_axis_size(1);
auto num_classes = indices.get_axis_size(1);
CV_Assert(count.get_axis_size(1) == num_classes);
CV_Assert(scores.get_axis_size(1) == num_classes);
auto classwise_topK = indices.get_axis_size(2);
auto num_priors = scores.get_axis_size(2);
auto num_blocks = batch_size;
constexpr int BLOCK_SIZE = 1024;
dim3 grid_size(num_blocks);
dim3 block_size(BLOCK_SIZE);
auto policy = execution_policy(grid_size, block_size, stream);
auto kernel = raw::nms_collect<T, 1024, BLOCK_SIZE>;
launch_kernel(kernel, policy, kept_indices, kept_count, indices, count, scores, threshold, num_classes, num_priors, classwise_topK, keepTopK, background_class_id);
}
template void nms_collect(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<int>, TensorView<int>, TensorView<__half>, float, std::size_t);
template void nms_collect(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<int>, TensorView<int>, TensorView<float>, float, std::size_t);
template <class T>
void consolidate_detections(const Stream& stream, TensorSpan<T> output,
TensorView<int> kept_indices, TensorView<int> kept_count,
TensorView<T> decoded_bboxes, TensorView<T> scores, bool share_location, DevicePtr<int> num_detections)
{
// output: [1, 1, batch_size * keepTopK, 7]
// kept_indices: [batch_size, keepTopK]
// kept_count: [batch_size]
// decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
// scores: [batch_size, num_classes, num_priors]
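// editor's note: the 7 values per detection typically follow the DetectionOutput convention [image_id, label, confidence, xmin, ymin, xmax, ymax]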
auto batch_size = kept_indices.get_axis_size(0);
CV_Assert(kept_count.get_axis_size(0) == batch_size);
CV_Assert(decoded_bboxes.get_axis_size(0) == batch_size);
CV_Assert(scores.get_axis_size(0) == batch_size);
auto keepTopK = kept_indices.get_axis_size(1);
auto num_classes = scores.get_axis_size(1);
auto num_priors = scores.get_axis_size(2);
CV_Assert(batch_size * keepTopK * 7 == output.size());
auto kernel = raw::consolidate_detections<T>;
auto policy = make_policy(kernel, keepTopK, 0, stream);
launch_kernel(kernel, policy, output, kept_indices, kept_count, decoded_bboxes, scores, share_location, batch_size, num_classes, num_priors, keepTopK, num_detections);
}
template void consolidate_detections(const Stream&, TensorSpan<__half>, TensorView<int>, TensorView<int>, TensorView<__half>, TensorView<__half>, bool, DevicePtr<int>);
template void consolidate_detections(const Stream&, TensorSpan<float>, TensorView<int>, TensorView<int>, TensorView<float>, TensorView<float>, bool, DevicePtr<int>);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,125 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "functors.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, class EltwiseOp, class ActivationOp, std::size_t N>
__global__ void eltwise_op_generic_op_vec(Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params eltwise_params, const typename ActivationOp::Params act_params) {
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto x_vPtr = vector_type::get_pointer(x.data());
auto y_vPtr = vector_type::get_pointer(y.data());
EltwiseOp eltwise_op(eltwise_params);
ActivationOp activation_op(act_params);
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
vector_type vec_x, vec_y;
v_load(vec_x, x_vPtr[i]);
v_load(vec_y, y_vPtr[i]);
for (int j = 0; j < vector_type::size(); j++)
vec_x.data[j] = activation_op(eltwise_op(vec_x.data[j], vec_y.data[j]));
v_store(output_vPtr[i], vec_x);
}
}
}
template <class T, class EltwiseOp, class ActivationOp, std::size_t N> static
void launch_vectorized_eltwise_op_generic_op(const Stream& stream, Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params& eltwise_params, const typename ActivationOp::Params& act_params) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(x, N));
CV_Assert(is_fully_aligned<T>(y, N));
auto kernel = raw::eltwise_op_generic_op_vec<T, EltwiseOp, ActivationOp, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, x, y, eltwise_params, act_params);
}
template <class T, class EltwiseOp, class ActivationOp> static
void eltwise_op_generic_op(const Stream& stream, Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params& eltwise_params = {}, const typename ActivationOp::Params& act_params = {}) {
CV_Assert(output.size() == x.size());
CV_Assert(output.size() == y.size());
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
launch_vectorized_eltwise_op_generic_op<T, EltwiseOp, ActivationOp, 4>(stream, output, x, y, eltwise_params, act_params);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
launch_vectorized_eltwise_op_generic_op<T, EltwiseOp, ActivationOp, 2>(stream, output, x, y, eltwise_params, act_params);
} else {
launch_vectorized_eltwise_op_generic_op<T, EltwiseOp, ActivationOp, 1>(stream, output, x, y, eltwise_params, act_params);
}
}
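/* editor's note: the wrappers below fuse a two-operand eltwise op with an activation in a single pass,
 * e.g. eltwise_sum_2_relu computes output[i] = ReLU(x[i] + y[i]) (leaky if slope != 0).
 */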
template <class T>
void eltwise_sum_2_relu(const Stream& stream, Span<T> output, View<T> x, View<T> y, T slope) {
eltwise_op_generic_op<T, SumFunctor<T>, ReLUFunctor<T>>(stream, output, x, y, {}, {slope});
}
template <class T>
void eltwise_sum_2_clipped_relu(const Stream& stream, Span<T> output, View<T> x, View<T> y, T floor, T ceiling) {
CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
eltwise_op_generic_op<T, SumFunctor<T>, ClippedReLUFunctor<T>>(stream, output, x, y, {}, {floor, ceiling});
}
template <class T>
void eltwise_sum_2_tanh(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
eltwise_op_generic_op<T, SumFunctor<T>, TanHFunctor<T>>(stream, output, x, y);
}
template <class T>
void eltwise_sum_2_swish(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
eltwise_op_generic_op<T, SumFunctor<T>, SwishFunctor<T>>(stream, output, x, y);
}
template <class T>
void eltwise_sum_2_mish(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
eltwise_op_generic_op<T, SumFunctor<T>, MishFunctor<T>>(stream, output, x, y);
}
template <class T>
void eltwise_sum_2_sigmoid(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
eltwise_op_generic_op<T, SumFunctor<T>, SigmoidFunctor<T>>(stream, output, x, y);
}
template <class T>
void eltwise_sum_2_power(const Stream& stream, Span<T> output, View<T> x, View<T> y, T exp, T scale, T shift) {
eltwise_op_generic_op<T, SumFunctor<T>, PowerFunctor<T>>(stream, output, x, y, {}, {exp, scale, shift});
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void eltwise_sum_2_relu<__half>(const Stream&, Span<__half>, View<__half>, View<__half>, __half);
template void eltwise_sum_2_clipped_relu<__half>(const Stream&, Span<__half>, View<__half>, View<__half>, __half, __half);
template void eltwise_sum_2_tanh<__half>(const Stream&, Span<__half>, View<__half>, View<__half>);
template void eltwise_sum_2_swish<__half>(const Stream&, Span<__half>, View<__half>, View<__half>);
template void eltwise_sum_2_mish<__half>(const Stream&, Span<__half>, View<__half>, View<__half>);
template void eltwise_sum_2_sigmoid<__half>(const Stream&, Span<__half>, View<__half>, View<__half>);
template void eltwise_sum_2_power<__half>(const Stream&, Span<__half>, View<__half>, View<__half>, __half, __half, __half);
#endif
template void eltwise_sum_2_relu<float>(const Stream&, Span<float>, View<float>, View<float>, float);
template void eltwise_sum_2_clipped_relu<float>(const Stream&, Span<float>, View<float>, View<float>, float, float);
template void eltwise_sum_2_tanh<float>(const Stream&, Span<float>, View<float>, View<float>);
template void eltwise_sum_2_swish<float>(const Stream&, Span<float>, View<float>, View<float>);
template void eltwise_sum_2_mish<float>(const Stream&, Span<float>, View<float>, View<float>);
template void eltwise_sum_2_sigmoid<float>(const Stream&, Span<float>, View<float>, View<float>);
template void eltwise_sum_2_power<float>(const Stream&, Span<float>, View<float>, View<float>, float, float, float);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,334 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "array.hpp"
#include "functors.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"
#include "kernel_dispatcher.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include <opencv2/core.hpp>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, class EltwiseOp, std::size_t N>
__global__ void eltwise_op_vec(Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params params) {
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto x_vPtr = vector_type::get_pointer(x.data());
auto y_vPtr = vector_type::get_pointer(y.data());
EltwiseOp eltwise_op(params);
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
vector_type vec_x, vec_y;
v_load(vec_x, x_vPtr[i]);
v_load(vec_y, y_vPtr[i]);
for (int j = 0; j < vector_type::size(); j++)
vec_x.data[j] = eltwise_op(vec_x.data[j], vec_y.data[j]);
v_store(output_vPtr[i], vec_x);
}
}
template <class T, class EltwiseOp, std::size_t Rank>
__global__ void eltwise_op_bcast(
Span<T> output, array<size_type, Rank> out_strides,
View<T> x, array<size_type, Rank> x_strides, array<bool, Rank> x_bcast,
View<T> y, array<size_type, Rank> y_strides, array<bool, Rank> y_bcast,
const typename EltwiseOp::Params params) {
EltwiseOp eltwise_op(params);
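/* editor's note: out_strides[j] holds the number of elements spanned by one step along axis j,
 * so (i % out_strides[j - 1]) / out_strides[j] recovers the index along axis j from the flat
 * output index i; axes marked as broadcast contribute nothing to the input offsets.
 */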
for (auto i : grid_stride_range(output.size())) {
index_type out_index = i / out_strides[0];
index_type x_index = x_bcast[0] ? 0 : out_index * x_strides[0];
index_type y_index = y_bcast[0] ? 0 : out_index * y_strides[0];
for (int j = 1; j < Rank; j++)
{
out_index = (i % out_strides[j - 1]) / out_strides[j];
if (!x_bcast[j])
x_index += out_index * x_strides[j];
if (!y_bcast[j])
y_index += out_index * y_strides[j];
}
output[i] = eltwise_op(x[x_index], y[y_index]);
}
}
}
template <class T, class EltwiseOp, std::size_t N> static
void launch_vectorized_eltwise_op(const Stream& stream, Span<T> output, View<T> x, View<T> y, const typename EltwiseOp::Params& params) {
CV_Assert(x.size() == y.size());
CV_Assert(x.size() == output.size());
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(x, N));
CV_Assert(is_fully_aligned<T>(y, N));
auto kernel = raw::eltwise_op_vec<T, EltwiseOp, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, x, y, params);
}
template <class T, class EltwiseOp, std::size_t Rank> static
void launch_eltwise_op_bcast(
const Stream& stream,
Span<T> output, const std::vector<std::size_t>& outStride,
View<T> x, const std::vector<std::size_t>& inStride1, const std::vector<int>& inBcast1,
View<T> y, const std::vector<std::size_t>& inStride2, const std::vector<int>& inBcast2,
const typename EltwiseOp::Params& params)
{
CV_Assert(outStride.size() == Rank);
CV_Assert(inStride1.size() == Rank);
CV_Assert(inStride2.size() == Rank);
CV_Assert(inBcast1.size() == Rank);
CV_Assert(inBcast2.size() == Rank);
array<size_type, Rank> outStride_k, inStride1_k, inStride2_k;
outStride_k.assign(std::begin(outStride), std::end(outStride));
inStride1_k.assign(std::begin(inStride1), std::end(inStride1));
inStride2_k.assign(std::begin(inStride2), std::end(inStride2));
array<bool, Rank> inBcast1_k, inBcast2_k;
inBcast1_k.assign(std::begin(inBcast1), std::end(inBcast1));
inBcast2_k.assign(std::begin(inBcast2), std::end(inBcast2));
auto kernel = raw::eltwise_op_bcast<T, EltwiseOp, Rank>;
auto policy = make_policy(kernel, output.size(), 0, stream);
launch_kernel(kernel, policy, output, outStride_k, x, inStride1_k, inBcast1_k, y, inStride2_k, inBcast2_k, params);
}
GENERATE_KERNEL_DISPATCHER_2TP(eltwise_op_bcast_dispatcher, launch_eltwise_op_bcast);
template <class T, class EltwiseOp> static
void eltwise_op(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y, const typename EltwiseOp::Params& params = {}) {
if (is_shape_same(output, x) && is_shape_same(output, y))
{
/* no broadcasting; use fast path */
CV_Assert(x.size() == y.size());
CV_Assert(x.size() == output.size());
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
launch_vectorized_eltwise_op<T, EltwiseOp, 4>(stream, output, x, y, params);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
launch_vectorized_eltwise_op<T, EltwiseOp, 2>(stream, output, x, y, params);
} else {
launch_vectorized_eltwise_op<T, EltwiseOp, 1>(stream, output, x, y, params);
}
}
else
{
CV_Assert(is_shape_compatible(output, x));
CV_Assert(is_shape_compatible(output, y));
/* matching singleton axes in both input tensors can be eliminated
*
* Reasoning:
* ----------
* Singleton axes do not contribute towards address calculation. They are redundant
* unless there is broadcasting. If both input tensors have singleton axis at a
* specified position, there is no broadcasting on that axis.
*
* Example:
* ---------
* x: [1, 256, 32, 32] -> [256, 32, 32]
* y: [1, 256, 1, 1] -> [256, 1, 1]
*/
for (int r = 0; r < output.rank(); r++)
{
while (x.get_axis_size(r) == 1 && y.get_axis_size(r) == 1) {
CV_Assert(output.get_axis_size(r) == 1);
x.squeeze(r);
y.squeeze(r);
output.squeeze(r);
}
}
auto inShape1 = x.shape_as_vector();
auto inShape2 = y.shape_as_vector();
auto outShape = output.shape_as_vector();
/* contiguous axes that do not broadcast can be merged into one axis
*
* Example:
* ---------
* x: [32, 8, 8] -> [32, 64]
* y: [1, 8, 8] -> [1, 64]
*/
for (int i = 0; i < inShape1.size(); i++) {
/* check if axis `i` requires any broadcasting */
if (inShape1[i] == inShape2[i]) {
/* loop invariant: `i` is the first axis in the contiguous axis sequence */
int j = i + 1; /* `j` is the axis which we will attempt to merge */
while (j < inShape1.size() && inShape1[j] == inShape2[j]) {
CV_Assert(outShape[j] == inShape1[j]);
/* `j` axis is also used fully; merge `i` and `j` */
auto new_size = inShape1[i] * inShape1[j];
inShape1[i] = new_size;
inShape2[i] = new_size;
/* delete axis `j` */
inShape1.erase(std::begin(inShape1) + j);
inShape2.erase(std::begin(inShape2) + j);
outShape.erase(std::begin(outShape) + j);
/* optimizations should not break the invariants */
CV_Assert(inShape1.size() == outShape.size());
CV_Assert(inShape2.size() == outShape.size());
CV_Assert(inShape1[i] == outShape[i]);
CV_Assert(inShape2[i] == outShape[i]);
}
}
}
/* contiguous broadcasting axes on the same tensor can be merged into one axis
*
* Example:
* ---------
* x: [256, 8, 8] -> [256, 64]
* y: [256, 1, 1] -> [256, 1]
*/
for (int i = 0; i < inShape1.size(); i++) {
/* check if axis `i` requires any broadcasting in tensor 1 */
if (inShape1[i] == 1 && inShape2[i] != 1) {
/* loop invariant: `i` is the first axis in the contiguous axis sequence */
int j = i + 1; /* `j` is the axis which we will attempt to merge */
while (j < inShape1.size() && inShape1[j] == 1 && inShape2[j] != 1) {
CV_Assert(outShape[j] == inShape2[j]);
/* `j` axis is also used fully; merge `i` and `j` */
inShape1[i] = 1;
inShape2[i] = inShape2[i] * inShape2[j];
outShape[i] = inShape2[i];
/* delete axis `j` */
inShape1.erase(std::begin(inShape1) + j);
inShape2.erase(std::begin(inShape2) + j);
outShape.erase(std::begin(outShape) + j);
/* optimizations should not break the invariants */
CV_Assert(inShape1.size() == outShape.size());
CV_Assert(inShape2.size() == outShape.size());
CV_Assert(inShape1[i] == 1);
CV_Assert(inShape2[i] == outShape[i]);
}
}
/* check if axis `i` requires any broadcasting in tensor 2 */
if (inShape1[i] != 1 && inShape2[i] == 1) {
/* loop invariant: `i` is the first axis in the contiguous axis sequence */
int j = i + 1; /* `j` is the axis which we will attempt to merge */
while (j < inShape1.size() && inShape1[j] != 1 && inShape2[j] == 1) {
CV_Assert(outShape[j] == inShape1[j]);
/* `j` axis is also used fully; merge `i` and `j` */
inShape1[i] = inShape1[i] * inShape1[j];
inShape2[i] = 1;
outShape[i] = inShape1[i];
/* delete axis `j` */
inShape1.erase(std::begin(inShape1) + j);
inShape2.erase(std::begin(inShape2) + j);
outShape.erase(std::begin(outShape) + j);
/* optimizations should not break the invariants */
CV_Assert(inShape1.size() == outShape.size());
CV_Assert(inShape2.size() == outShape.size());
CV_Assert(inShape1[i] == outShape[i]);
CV_Assert(inShape2[i] == 1);
}
}
}
auto rank = outShape.size();
std::vector<std::size_t> inStride1(rank), inStride2(rank), outStride(rank);
inStride1.back() = 1;
inStride2.back() = 1;
outStride.back() = 1;
/* garbage, ..., garbage, 1 */
std::copy(std::begin(inShape1) + 1, std::end(inShape1), std::begin(inStride1));
std::copy(std::begin(inShape2) + 1, std::end(inShape2), std::begin(inStride2));
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
/* dim[1], dim[2], ..., dim[-1], 1 */
std::partial_sum(inStride1.rbegin(), inStride1.rend(), inStride1.rbegin(), std::multiplies<std::size_t>());
std::partial_sum(inStride2.rbegin(), inStride2.rend(), inStride2.rbegin(), std::multiplies<std::size_t>());
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
/* stride[0], stride[1], ..., stride[-2], 1 */
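/* worked example (editor's illustration): outShape = [2, 3, 4]
 * after the copy:        outStride = [3, 4, 1]
 * after the partial_sum: outStride = [12, 4, 1]
 * i.e. outStride[j] is the number of elements spanned by one step along axis j
 */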
std::vector<int> inBcast1(rank), inBcast2(rank);
std::transform(std::begin(inShape1), std::end(inShape1), std::begin(inBcast1), [](std::size_t sz) { return sz == 1; });
std::transform(std::begin(inShape2), std::end(inShape2), std::begin(inBcast2), [](std::size_t sz) { return sz == 1; });
CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
eltwise_op_bcast_dispatcher<T, EltwiseOp, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, x, inStride1, inBcast1, y, inStride2, inBcast2, params);
}
}
template <class T>
void eltwise_max_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
eltwise_op<T, MaxFunctor<T>>(stream, output, x, y);
}
template <class T>
void eltwise_min_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
eltwise_op<T, MinFunctor<T>>(stream, output, x, y);
}
template <class T>
void eltwise_sum_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
eltwise_op<T, SumFunctor<T>>(stream, output, x, y);
}
template <class T>
void eltwise_sum_coeff_2(const Stream& stream, TensorSpan<T> output, T coeff_x, TensorView<T> x, T coeff_y, TensorView<T> y) {
eltwise_op<T, ScaledSumFunctor<T>>(stream, output, x, y, {coeff_x, coeff_y});
}
template <class T>
void eltwise_prod_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
eltwise_op<T, ProductFunctor<T>>(stream, output, x, y);
}
template <class T>
void eltwise_div_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
eltwise_op<T, DivFunctor<T>>(stream, output, x, y);
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void eltwise_div_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
template void eltwise_prod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
template void eltwise_sum_coeff_2(const Stream&, TensorSpan<__half>, __half, TensorView<__half>, __half, TensorView<__half>);
template void eltwise_sum_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
template void eltwise_max_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
template void eltwise_min_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
#endif
template void eltwise_div_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
template void eltwise_prod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
template void eltwise_sum_coeff_2(const Stream&, TensorSpan<float>, float, TensorView<float>, float, TensorView<float>);
template void eltwise_sum_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
template void eltwise_max_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
template void eltwise_min_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,81 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_EXECUTION_HPP
#define OPENCV_DNN_SRC_CUDA_EXECUTION_HPP
#include "../cuda4dnn/csl/error.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include <opencv2/core.hpp>
#include <cuda_runtime_api.h>
#include <cstddef>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
struct execution_policy {
execution_policy(dim3 grid_size, dim3 block_size)
: grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ 0 } { }
execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem)
: grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ nullptr } { }
execution_policy(dim3 grid_size, dim3 block_size, const Stream& strm)
: grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ strm.get() } { }
execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem, const Stream& strm)
: grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ strm.get() } { }
dim3 grid;
dim3 block;
std::size_t sharedMem;
cudaStream_t stream;
};
/* this overload shouldn't be necessary; we should always provide a bound on the number of threads */
/*
template <class Kernel> inline
execution_policy make_policy(Kernel kernel, std::size_t sharedMem = 0, const Stream& stream = 0) {
int grid_size, block_size;
CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem));
return execution_policy(grid_size, block_size, sharedMem, stream);
}*/
template <class Kernel> inline
execution_policy make_policy(Kernel kernel, std::size_t max_threads, std::size_t sharedMem = 0, const Stream& stream = 0) {
CV_Assert(max_threads > 0);
int grid_size = 0, block_size = 0;
CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem));
if (grid_size * block_size > max_threads) {
grid_size = (max_threads + block_size - 1) / block_size;
if (block_size > max_threads)
block_size = max_threads;
}
CV_Assert(grid_size >= 1 && block_size >= 1);
return execution_policy(grid_size, block_size, sharedMem, stream);
}
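/* typical usage (editor's sketch; `my_kernel` and `data` are hypothetical):
 *
 *   auto kernel = raw::my_kernel<float>;
 *   auto policy = make_policy(kernel, data.size(), 0, stream); // bound the thread count by the element count
 *   launch_kernel(kernel, policy, data, other_args...);
 */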
template <class Kernel, typename ...Args> inline
void launch_kernel(Kernel kernel, Args ...args) {
auto policy = make_policy(kernel);
kernel <<<policy.grid, policy.block>>> (args...);
}
template <class Kernel, typename ...Args> inline
void launch_kernel(Kernel kernel, dim3 grid, dim3 block, Args ...args) {
kernel <<<grid, block>>> (args...);
}
template <class Kernel, typename ...Args> inline
void launch_kernel(Kernel kernel, execution_policy policy, Args ...args) {
kernel <<<policy.grid, policy.block, policy.sharedMem, policy.stream>>> (args...);
}
}}}} /* namespace cv::dnn::cuda4dnn::csl */
#endif /* OPENCV_DNN_SRC_CUDA_EXECUTION_HPP */

View File

@@ -0,0 +1,98 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t N>
__global__ void fill_vec(Span<T> output, T value) {
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
vector_type vec;
for (int j = 0; j < vector_type::size(); j++)
vec.data[j] = value;
v_store(output_vPtr[i], vec);
}
}
template <class T, std::size_t N>
__global__ void copy_vec(Span<T> output, View<T> input) {
using vector_type = get_vector_type_t<T, N>;
auto input_vPtr = vector_type::get_pointer(input.data());
auto output_vPtr = vector_type::get_pointer(output.data());
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
vector_type vec;
v_load(vec, input_vPtr[i]);
v_store(output_vPtr[i], vec);
}
}
}
template <class T, std::size_t N> static
void launch_vectorized_fill(const Stream& stream, Span<T> output, T value) {
CV_Assert(is_fully_aligned<T>(output, N));
auto kernel = raw::fill_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, value);
}
template <class T>
void fill(const Stream& stream, Span<T> output, T value) {
if (is_fully_aligned<T>(output, 4)) {
launch_vectorized_fill<T, 4>(stream, output, value);
} else if (is_fully_aligned<T>(output, 2)) {
launch_vectorized_fill<T, 2>(stream, output, value);
} else {
launch_vectorized_fill<T, 1>(stream, output, value);
}
}
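/* editor's note: the 4/2/1 dispatch above selects the widest vectorized kernel whose alignment
 * requirements the buffers satisfy; the scalar (N = 1) path is always valid. The same pattern
 * is used by copy() below and by most other kernels in this backend.
 */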
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void fill(const Stream&, Span<__half>, __half);
#endif
template void fill(const Stream&, Span<float>, float);
template void fill(const Stream&, Span<int>, int);
template <class T, std::size_t N> static
void launch_vectorized_copy(const Stream& stream, Span<T> output, View<T> input) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
auto kernel = raw::copy_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input);
}
template <class T>
void copy(const Stream& stream, Span<T> output, View<T> input) {
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
launch_vectorized_copy<T, 4>(stream, output, input);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
launch_vectorized_copy<T, 2>(stream, output, input);
} else {
launch_vectorized_copy<T, 1>(stream, output, input);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void copy(const Stream&, Span<__half>, View<__half>);
#endif
template void copy(const Stream&, Span<float>, View<float>);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,102 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <std::size_t N>
__global__ void fp32_to_fp16(Span<__half> output, View<float> input) {
using output_vector_type = get_vector_type_t<__half, N>;
using input_vector_type = get_vector_type_t<float, N>;
auto output_vPtr = output_vector_type::get_pointer(output.data());
auto input_vPtr = input_vector_type::get_pointer(input.data());
for (auto i : grid_stride_range(output.size() / output_vector_type::size())) {
input_vector_type in_vec;
v_load(in_vec, input_vPtr[i]);
output_vector_type out_vec;
for (int j = 0; j < output_vector_type::size(); j++)
out_vec.data[j] = __float2half(in_vec.data[j]);
v_store(output_vPtr[i], out_vec);
}
}
template <std::size_t N>
__global__ void fp16_to_fp32(Span<float> output, View<__half> input) {
using output_vector_type = get_vector_type_t<float, N>;
using input_vector_type = get_vector_type_t<__half, N>;
auto output_vPtr = output_vector_type::get_pointer(output.data());
auto input_vPtr = input_vector_type::get_pointer(input.data());
for (auto i : grid_stride_range(output.size() / output_vector_type::size())) {
input_vector_type in_vec;
v_load(in_vec, input_vPtr[i]);
output_vector_type out_vec;
for (int j = 0; j < output_vector_type::size(); j++)
out_vec.data[j] = __half2float(in_vec.data[j]);
v_store(output_vPtr[i], out_vec);
}
}
}
template <std::size_t N> static
void launch_vectorized_fp32_to_fp16(const Stream& stream, Span<__half> output, View<float> input) {
CV_Assert(is_fully_aligned<__half>(output, N));
CV_Assert(is_fully_aligned<float>(input, N));
auto kernel = raw::fp32_to_fp16<N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input);
}
void fp32_to_fp16(const Stream& stream, Span<__half> output, View<float> input) {
if (is_fully_aligned<__half>(output, 4) && is_fully_aligned<float>(input, 4)) {
launch_vectorized_fp32_to_fp16<4>(stream, output, input);
} else if (is_fully_aligned<__half>(output, 2) && is_fully_aligned<float>(input, 2)) {
launch_vectorized_fp32_to_fp16<2>(stream, output, input);
} else {
launch_vectorized_fp32_to_fp16<1>(stream, output, input);
}
}
template <std::size_t N> static
void launch_vectorized_fp16_to_fp32(const Stream& stream, Span<float> output, View<__half> input) {
CV_Assert(is_fully_aligned<float>(output, N));
CV_Assert(is_fully_aligned<__half>(input, N));
auto kernel = raw::fp16_to_fp32<N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input);
}
void fp16_to_fp32(const Stream& stream, Span<float> output, View<__half> input) {
if (is_fully_aligned<float>(output, 4) && is_fully_aligned<__half>(input, 4)) {
launch_vectorized_fp16_to_fp32<4>(stream, output, input);
} else if (is_fully_aligned<float>(output, 2) && is_fully_aligned<__half>(input, 2)) {
launch_vectorized_fp16_to_fp32<2>(stream, output, input);
} else {
launch_vectorized_fp16_to_fp32<1>(stream, output, input);
}
}
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,334 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP
#define OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP
#include <cuda_runtime.h>
#include "math.hpp"
#include "../cuda4dnn/csl/nvcc_defs.hpp"
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template <class T>
struct IdentityFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE IdentityFunctor() { }
CUDA4DNN_DEVICE IdentityFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T value) {
return value;
};
};
template <class T>
struct ReLUFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() : slope(0) { }
CUDA4DNN_HOST_DEVICE Params(T slope_) : slope(slope_) { }
T slope;
};
CUDA4DNN_DEVICE ReLUFunctor() : ReLUFunctor(Params{}) { }
CUDA4DNN_DEVICE ReLUFunctor(const Params& params) : slope(params.slope) { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::log1pexp;
return value >= T(0) ? value : slope * value;
}
T slope;
};
template <class T>
struct ClippedReLUFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() : floor(0), ceiling(6) { }
CUDA4DNN_HOST_DEVICE Params(T floor_, T ceiling_) : floor(floor_), ceiling(ceiling_) { }
T floor, ceiling;
};
CUDA4DNN_DEVICE ClippedReLUFunctor() : ClippedReLUFunctor(Params{}) { }
CUDA4DNN_DEVICE ClippedReLUFunctor(const Params& params) : floor{params.floor}, ceiling{params.ceiling} { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::clamp;
return clamp(value, floor, ceiling);
}
T floor, ceiling;
};
template <class T>
struct TanHFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE TanHFunctor() { }
CUDA4DNN_DEVICE TanHFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::tanh;
return tanh(value);
}
};
template <class T>
struct SwishFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE SwishFunctor() { }
CUDA4DNN_DEVICE SwishFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T value) {
// f(x) = x * sigmoid(x)
using csl::device::fast_divide;
using csl::device::fast_exp;
return fast_divide(value, static_cast<T>(1) + fast_exp(-value));
}
};
template <class T>
struct MishFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE MishFunctor() { }
CUDA4DNN_DEVICE MishFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::tanh;
using csl::device::log1pexp;
return value * tanh(log1pexp(value));
}
};
template <>
struct MishFunctor<float> {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE MishFunctor() { }
CUDA4DNN_DEVICE MishFunctor(const Params& params) { }
CUDA4DNN_DEVICE float operator()(float value) {
// f(x) = x * tanh(log1pexp(x));
using csl::device::fast_divide;
using csl::device::fast_exp;
auto e = fast_exp(value);
auto n = e * e + 2 * e;
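/* derivation (editor's note): with e = exp(x) and n = e*e + 2*e, (1 + e)^2 = n + 1, so
 * tanh(log1pexp(x)) = ((1 + e)^2 - 1) / ((1 + e)^2 + 1) = n / (n + 2); hence
 * x * tanh(log1pexp(x)) = x * n / (n + 2) = x - 2x / (n + 2). The two returns below are
 * algebraically equivalent; the branch picks the numerically safer form for the given x.
 */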
if (value <= -0.6f)
return value * fast_divide(n, n + 2);
return value - 2 * fast_divide(value, n + 2);
}
};
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <>
struct MishFunctor<__half> {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE MishFunctor() { }
CUDA4DNN_DEVICE MishFunctor(const Params& params) { }
CUDA4DNN_DEVICE __half operator()(__half value) {
return MishFunctor<float>()(value);
}
};
#endif
template <class T>
struct SigmoidFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE SigmoidFunctor() { }
CUDA4DNN_DEVICE SigmoidFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::fast_sigmoid;
return fast_sigmoid(value);
}
};
template <class T>
struct ELUFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE ELUFunctor() { }
CUDA4DNN_DEVICE ELUFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::expm1;
return value >= T(0) ? value : expm1(value);
}
};
template <class T>
struct AbsFunctor {
struct Params { };
CUDA4DNN_DEVICE AbsFunctor() { }
CUDA4DNN_DEVICE AbsFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::abs;
return abs(value);
}
};
template <class T>
struct BNLLFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE BNLLFunctor() { }
CUDA4DNN_DEVICE BNLLFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::log1pexp;
return value > T(0) ? value + log1pexp(-value) : log1pexp(value);
}
};
template <class T>
struct PowerFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() : exp(1), scale(1), shift(0) { }
CUDA4DNN_HOST_DEVICE Params(T exp_, T scale_, T shift_) : exp(exp_), scale(scale_), shift(shift_) { }
T exp, scale, shift;
};
CUDA4DNN_DEVICE PowerFunctor() : PowerFunctor(Params{}) { }
CUDA4DNN_DEVICE PowerFunctor(const Params& params) : exp{params.exp}, scale{params.scale}, shift{params.shift} { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::pow;
return pow(shift + scale * value, exp);
}
T exp, scale, shift;
};
template <class T>
struct ExpFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() : normScale(1), normShift(0) { }
CUDA4DNN_HOST_DEVICE Params(T nScale_, T nShift_) : normScale(nScale_), normShift(nShift_) { }
T normScale, normShift;
};
CUDA4DNN_DEVICE ExpFunctor() : ExpFunctor(Params{}) { }
CUDA4DNN_DEVICE ExpFunctor(const Params& params) : normScale{params.normScale}, normShift{params.normShift} { }
CUDA4DNN_DEVICE T operator()(T value) {
using csl::device::fast_exp;
return fast_exp(normShift + normScale * value);
}
T normScale, normShift;
};
template <class T>
struct MaxFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE MaxFunctor() { }
CUDA4DNN_DEVICE MaxFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T x, T y) {
using csl::device::max;
return max(x, y);
}
};
template <class T>
struct MinFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE MinFunctor() { }
CUDA4DNN_DEVICE MinFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T x, T y) {
using csl::device::min;
return min(x, y);
}
};
template <class T>
struct SumFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE SumFunctor() { }
CUDA4DNN_DEVICE SumFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T x, T y) { return x + y; }
};
template <class T>
struct ScaledSumFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() : scale_x(1), scale_y(1) { }
CUDA4DNN_HOST_DEVICE Params(T scale_x_, T scale_y_) : scale_x(scale_x_), scale_y(scale_y_) { }
T scale_x, scale_y;
};
CUDA4DNN_DEVICE ScaledSumFunctor() : scale_x(1), scale_y(1) { }
CUDA4DNN_DEVICE ScaledSumFunctor(const Params& params) : scale_x{params.scale_x}, scale_y{params.scale_y} { }
CUDA4DNN_DEVICE T operator()(T x, T y) { return scale_x * x + scale_y * y; }
T scale_x, scale_y;
};
template <class T>
struct ProductFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE ProductFunctor() { }
CUDA4DNN_DEVICE ProductFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T x, T y) { return x * y; }
};
template <class T>
struct DivFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() { }
};
CUDA4DNN_DEVICE DivFunctor() { }
CUDA4DNN_DEVICE DivFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T x, T y) { return x / y; }
};
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
#endif /* OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP */

View File

@@ -0,0 +1,467 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "math.hpp"
#include "bbox_utils.hpp"
#include "grid_stride_range.hpp"
#include "block_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"
#include "memory.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, bool NORMALIZED_BBOX, int BLOCK_SIZE>
__launch_bounds__(BLOCK_SIZE)
__global__ void grid_nms(Span<unsigned int> mask_, Span<int> count_, View<T> bboxes_, size_type num_classes, index_type background_class_id, size_type topK, size_type topK_gs, float nms_threshold)
{
// topK_gs is topK rounded upwards to a multiple of GROUP_SIZE
// mask: [batch_size, num_classes, topK_gs, topK_gs / 32]
// bboxes: [batch_size, num_classes, topK, 4]
// count: [batch_size, num_classes]
const index_type c = blockIdx.y;
const index_type b = blockIdx.z;
if (c == background_class_id)
return;
auto mask = mask_.data() + (b * num_classes + c) * topK_gs * topK_gs / 32;
auto bboxes = bboxes_.data() + (b * num_classes + c) * topK * 4;
auto count = count_.data() + b * num_classes + c;
const auto boxes = *count;
if (boxes == 0)
return;
/* We divide the set of boxes into groups containing BLOCK_SIZE boxes */
const auto num_groups = (boxes + BLOCK_SIZE - 1) / BLOCK_SIZE;
/* We need to calculate IOUs for every pair of boxes. We can generalize and say that
* we need to compute IOUs of every group with every other group including itself.
*/
// Each block processes a pair of groups.
const index_type group_i = blockIdx.x % num_groups;
const index_type group_j = blockIdx.x / num_groups;
/* we use __syncthreads() later but note that the following condition will cause all threads
* in the block to exit; hence, no thread will execute a divergent __syncthreads()
*/
if (group_i >= num_groups || group_j >= num_groups)
return;
/* Note that IOU(A, B) = IOU(B, A). Hence, if we compute IOU(GROUP_A, GROUP_B), we do not need
* to compute IOU(GROUP_B, GROUP_A). We still have to compute IOU(GROUP_A, GROUP_A) though since
* each group has many boxes and we need IOUs amongst boxes within a group.
*
* We arbitrarily choose a scheme to exit: exit if group_i is greater than group_j. This way we only
* compute IOUs between groups once. While nearly half the blocks are wasted, it's ok since they exit
* early on and the working blocks are compute heavy.
*/
if (group_i > group_j)
return;
/* the following variables contain the absolute box number of the first box of their respective groups */
const auto group_i_offset = group_i * BLOCK_SIZE;
const auto group_j_offset = group_j * BLOCK_SIZE;
/* MAIN LOOP LOGIC:
* We compare a box `i` from group_i with all boxes in group_j in each iteration. The box `j` is fixed
* for each thread. The `j` exactly maps to the thread index. Hence, the `j` is a loop invariant. Each
* thread of the block computes the overlap between box `i` and its box `j`.
*
* for (int i = 0; i < BLOCK_SIZE; i++)
* {
* // i = box 1
* // j = threadIdx.x = box 2
* }
*/
/* The `j` box is fixed for each thread. All `i` boxes will be required for every thread.
* We store the `i` boxes in shared memory to allow global memory coalescing.
*/
using vector_type = get_vector_type_t<T, 4>;
__shared__ vector_type group_i_boxes[BLOCK_SIZE];
/* We will precompute the sizes of `i` boxes in the code where we load them. The size computation
* is distributed across the block. Otherwise, all threads will have to compute the size of the same
* box simultaneously in the main loop. The size is computed while the memory subsystem is busy
* servicing requests for box coordinates; the compute resources would otherwise be idle in this phase.
*/
/* we store the size as a float since the size can exceed fp16 limits for unnormalized boxes */
__shared__ float group_i_size[BLOCK_SIZE];
const auto bboxes_vPtr = vector_type::get_pointer(bboxes);
// load `i` boxes and precompute their sizes
{
int i = threadIdx.x;
if (group_i_offset + i < boxes)
{
vector_type box;
v_load(box, bboxes_vPtr[group_i_offset + i]);
v_store(group_i_boxes[i], box);
BoundingBox bbox;
bbox.xmin = box.data[0];
bbox.ymin = box.data[1];
bbox.xmax = box.data[2];
bbox.ymax = box.data[3];
group_i_size[i] = compute_bbox_size<NORMALIZED_BBOX>(bbox);
}
}
__syncthreads();
/* We compute overlap between boxes and check if the IOU exceeds the nms threshold.
* We store the result (exceeds or is below nms_threshold) in a two-dimensional matrix.
* (i, j) is set to one if the overlap between i and j is within the nms threshold.
* We pack 32 results into one 32-bit integer. The effective memory layout of the
* matrix hence is (BLOCK_SIZE, BLOCK_SIZE / 32).
*/
__shared__ unsigned int mask_shared[BLOCK_SIZE * BLOCK_SIZE / 32];
// load box `j` and precompute its size (fixed per thread)
BoundingBox bbox_j;
float bbox_j_size = 0;
if (group_j_offset + threadIdx.x < boxes)
{
vector_type box;
v_load(box, bboxes_vPtr[group_j_offset + threadIdx.x]);
bbox_j.xmin = box.data[0];
bbox_j.ymin = box.data[1];
bbox_j.xmax = box.data[2];
bbox_j.ymax = box.data[3];
bbox_j_size = compute_bbox_size<NORMALIZED_BBOX>(bbox_j);
}
/* Each thread computes a predicate which is broadcasted across the warp to obtain a 32-bit mask.
* The lane zero thread of each warp saves the mask. We store the offset to the mask array beforehand
* to save cycles in the compute-intensive main loop.
*/
auto mask_offset = threadIdx.x / 32;
/* The main loop is compute intensive and causes the kernel to be overall compute-bound. Hence,
* this loop has been highly tuned. Please profile and verify carefully before making changes.
*/
/* UNROLL_SIZE is the number of boxes that must be processed per iteration. We manually unroll
* the loop since the compiler cannot effectively unroll on its own, presumably due to the presence
* of instructions forcing warp synchronization.
*/
constexpr int UNROLL_SIZE = 4;
#pragma unroll 8
for (int s = 0; s < BLOCK_SIZE; s += UNROLL_SIZE)
{
bool do_not_reject_j[UNROLL_SIZE];
#pragma unroll
for (int k = 0; k < UNROLL_SIZE; k++)
{
int i = s + k;
/* The number of boxes need not be a multiple of BLOCK_SIZE.
* However, the shared memory allocated can hold BLOCK_SIZE boxes from
* each group. Accessing the undefined regions of shared memory is
* a valid memory operation as long as the memory has been allocated.
*
* The condition below is only required when one of the groups is not
* fully filled with valid boxes. Such situations are relatively rare; it's
* more common to see both groups completely filled.
*
* We comment out this condition to improve the performance of the common case.
* This leads to a net improvement.
*/
// if (group_i_offset + i < boxes && group_j_offset + threadIdx.x < boxes)
{
BoundingBox bbox_i;
float bbox_i_size;
{
vector_type box;
v_load(box, group_i_boxes[i]);
bbox_i.xmin = box.data[0];
bbox_i.ymin = box.data[1];
bbox_i.xmax = box.data[2];
bbox_i.ymax = box.data[3];
bbox_i_size = group_i_size[i];
}
using device::min;
using device::max;
BoundingBox intersect_bbox;
intersect_bbox.xmin = max(bbox_i.xmin, bbox_j.xmin);
intersect_bbox.ymin = max(bbox_i.ymin, bbox_j.ymin);
intersect_bbox.xmax = min(bbox_i.xmax, bbox_j.xmax);
intersect_bbox.ymax = min(bbox_i.ymax, bbox_j.ymax);
float intersect_size = compute_bbox_size<NORMALIZED_BBOX>(intersect_bbox);
using device::fast_divide_ftz;
float iou = fast_divide_ftz(intersect_size, bbox_i_size + bbox_j_size - intersect_size);
do_not_reject_j[k] = iou <= nms_threshold;
}
}
#pragma unroll
for (int k = 0; k < UNROLL_SIZE; k++)
{
// FORWARD_COMPATIBILITY_TAG: WARP_SIZE_DEPENDENT_CODE
auto predicate = __ballot_sync(0xFFFFFFFF, do_not_reject_j[k]);
if (threadIdx.x % 32 == 0)
mask_shared[mask_offset] = predicate;
/* The following operation should logically be inside the previous if branch. Note that `mask_offset`
* is only used by lane zero threads. Hence, there is no harm in executing it in other threads since
* the value is unused there.
*
* Keeping it inside prevents the compiler from treating it as a constexpr addition to the address in
* successive unrolled iterations. A register is used and instructions are emitted to multiply the
* addend by four to obtain the byte offset. Pulling it out of the branch makes the compiler do constexpr
* addition on the address in successive unrolled iterations.
*/
mask_offset += BLOCK_SIZE / 32;
}
}
__syncthreads();
/* The mask data is organized as a two-dimensional bit matrix of size topK_gs * topK_gs.
* (i, j) is set to true if the overlap between `i` and `j` is within the nms threshold.
* We pack 32 results into one 32-bit integer. So the effective memory layout is topK_gs * topK_gs / 32.
*/
/* Each box `i` was compared with BLOCK_SIZE `j` boxes. This amounts to BLOCK_SIZE / 32
* 32-bit integers per box `i`.
*/
using mask_vector_type = get_vector_type_t<unsigned int, BLOCK_SIZE / 32>;
const int i = threadIdx.x;
auto mask_shared_vPtr = mask_vector_type::get_pointer(DevicePtr<unsigned>(mask_shared));
mask_vector_type temp;
v_load(temp, mask_shared_vPtr[i]);
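/* editor's note: __ballot_sync packs lane k's predicate into bit k (LSB first), while the
 * collect kernel below walks the mask MSB first via __clz and (0x80000000 >> bit); the
 * bit-reversal makes box (group_j_offset + 0) map to the most significant bit.
 */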
for (int i = 0; i < mask_vector_type::size(); i++)
temp.data[i] = __brev(temp.data[i]);
auto mask_vPtr = mask_vector_type::get_pointer(mask);
v_store(mask_vPtr[((group_i_offset + i) * topK_gs + group_j_offset) / 32 / mask_vector_type::size()], temp);
}
template <int ITEMS_PER_THREAD, int BLOCK_SIZE>
__launch_bounds__(BLOCK_SIZE)
__global__ void grid_nms_collect(Span<int> indices_, Span<int> count_, View<unsigned int> mask_, size_type num_classes, index_type background_class_id, size_type topK, size_type topK_gs_by32)
{
const index_type c = blockIdx.x;
if (c == background_class_id)
return;
const index_type b = blockIdx.y;
// topK_gs is topK rounded upwards to a multiple of GROUP_SIZE
// indices: [batch_size, num_classes, topK]
// count: [batch_size, num_classes]
// mask: [batch_size, num_classes, topK_gs, topK_gs / 32]
auto indices = indices_.data() + (b * num_classes + c) * topK;
auto count = count_.data() + (b * num_classes + c);
auto mask = mask_.data() + (b * num_classes + c) * topK_gs_by32 * 32 * topK_gs_by32;
const auto boxes = *count;
if (boxes == 0)
return;
/* We have a fixed number of threads and an arbitrary number of boxes. We use an array of
* bits to store which boxes haven't been eliminated and which are still active. We organize
* the array of bits into a matrix of bits of the shape (num_rows, BLOCK_SIZE, 32) which
* is equivalent to (num_rows, BLOCK_SIZE) where the type is a 32-bit unsigned integer.
* `num_rows` is the minimum number of rows required to cover all the boxes.
*
* Each thread handles a specific column in the matrix. To improve performance, we process
* `ITEMS_PER_THREAD` elements per thread. This changes the shape to (num_rows,
* ROW_WIDTH) where ROW_WIDTH is BLOCK_SIZE * ITEMS_PER_THREAD.
*/
constexpr int ROW_WIDTH = BLOCK_SIZE * ITEMS_PER_THREAD;
const index_type num_32b_masks = static_cast<unsigned>(boxes + 31) / 32;
const index_type num_rows = static_cast<unsigned>(num_32b_masks + ROW_WIDTH - 1) / ROW_WIDTH;
extern __shared__ unsigned int active_boxes[]; // the matrix described earlier
#pragma unroll 1
for (auto idx : block_stride_range<BLOCK_SIZE>(num_32b_masks))
active_boxes[idx] = (idx == num_32b_masks - 1) ? __brev((1u << (boxes % 32)) - 1) : 0xFFFFFFFF;
__syncthreads();
using vector_type = get_vector_type_t<unsigned int, ITEMS_PER_THREAD>;
auto mask_vPtr = vector_type::get_pointer(mask);
auto shared_vPtr = vector_type::get_pointer(DevicePtr<unsigned>(active_boxes));
int index_temp;
int thread0_count = 0;
int thread_id = threadIdx.x;
for (int step = 0; step < num_32b_masks; step++)
{
auto current_active = active_boxes[step];
while (current_active)
{
const index_type bit = __clz(current_active);
const index_type i = step * 32 + bit;
const int mask_offset = static_cast<unsigned>(i * topK_gs_by32) / ITEMS_PER_THREAD;
/* We fetch the index from the memory and store it in a register. We will not use it until
* much later. This helps avoid a long scoreboard stall.
*/
if (thread_id == 0)
index_temp = indices[i];
__syncthreads();
if (threadIdx.x == 0)
active_boxes[step] = current_active ^ (0x80000000 >> bit);
__syncthreads();
#pragma unroll 1
for (int r = 0; r < num_rows; r++)
{
const int idx = r * BLOCK_SIZE + thread_id;
if ((step & ~(ITEMS_PER_THREAD - 1)) <= idx * ITEMS_PER_THREAD && idx * ITEMS_PER_THREAD < num_32b_masks)
{
auto active_boxes_vec = shared_vPtr[idx];
auto mask_vec = mask_vPtr[mask_offset + idx];
for (int i = 0; i < vector_type::size(); i++)
active_boxes_vec.data[i] &= mask_vec.data[i];
shared_vPtr[idx] = active_boxes_vec;
}
}
__syncthreads();
if (thread_id == 0)
{
indices[thread0_count] = index_temp;
thread0_count++;
}
current_active = active_boxes[step];
}
}
if (threadIdx.x == 0)
*count = thread0_count;
}
}
constexpr int GROUP_SIZE = 128;
static std::size_t getAlignedTopK(std::size_t topK)
{
auto remainder = topK % GROUP_SIZE;
if (remainder == 0)
return topK;
return topK + (GROUP_SIZE - remainder);
}
std::size_t getGridNMSWorkspaceSizePerBatchItem(std::size_t num_classes, std::size_t classwise_topK)
{
auto topK_gs = getAlignedTopK(classwise_topK);
return num_classes * topK_gs * topK_gs / 32 * sizeof(unsigned int);
}
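/* editor's illustration: with GROUP_SIZE = 128, num_classes = 21 and classwise_topK = 400,
 * topK_gs = 512 and the per-batch-item workspace is 21 * 512 * 512 / 32 * 4 bytes = 688128 bytes (~672 KiB).
 */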
template <class T>
void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<T> bboxes, int background_class_id, bool normalized_bbox, float nms_threshold)
{
// workspace: [batch_size, num_classes, topK_gs, topK_gs / 32]
// indices: [batch_size, num_classes, topK]
// count: [batch_size, num_classes]
// bboxes: [batch_size, num_classes, topK, 4] (only first count[b][c] boxes are read)
const auto batch_size = indices.get_axis_size(0);
CV_Assert(count.get_axis_size(0) == batch_size);
CV_Assert(bboxes.get_axis_size(0) == batch_size);
const auto num_classes = indices.get_axis_size(1);
CV_Assert(count.get_axis_size(1) == num_classes);
CV_Assert(bboxes.get_axis_size(1) == num_classes);
const auto topK = indices.get_axis_size(2);
CV_Assert(bboxes.get_axis_size(2) == topK);
CV_Assert(bboxes.get_axis_size(3) == 4);
const auto topK_gs = getAlignedTopK(topK);
CV_Assert(workspace.size() >= topK_gs * topK_gs / 32);
const auto boxes = topK;
const auto num_groups = (boxes + GROUP_SIZE - 1) / GROUP_SIZE;
{
// grid = (num_groups * num_groups, num_classes, batch_size)
// if the background class is the last class, we can reduce grid y dim by one
auto grid_num_classes = num_classes; //(background_class_id == num_classes - 1) ? num_classes - 1 : num_classes;
constexpr int BLOCK_SIZE = GROUP_SIZE;
dim3 grid_size(num_groups * num_groups, grid_num_classes, batch_size);
dim3 block_size(BLOCK_SIZE);
auto policy = execution_policy(grid_size, block_size, stream);
if (normalized_bbox)
{
auto kernel = raw::grid_nms<T, true, BLOCK_SIZE>;
launch_kernel(kernel, policy, workspace, count, bboxes, num_classes, background_class_id, topK, topK_gs, nms_threshold);
}
else
{
auto kernel = raw::grid_nms<T, false, BLOCK_SIZE>;
launch_kernel(kernel, policy, workspace, count, bboxes, num_classes, background_class_id, topK, topK_gs, nms_threshold);
}
}
{
// grid = (num_classes, batch_size)
// if the background class is the last class, we can reduce grid x dim by one
auto grid_num_classes = num_classes; //(background_class_id == num_classes - 1) ? num_classes - 1 : num_classes;
constexpr int BLOCK_SIZE = 64;
constexpr int ITEMS_PER_THREAD = 4;
auto kernel = raw::grid_nms_collect<ITEMS_PER_THREAD, BLOCK_SIZE>;
dim3 grid_size(grid_num_classes, batch_size);
auto sharedMem = topK_gs / 32 * 4;
auto policy = execution_policy(grid_size, BLOCK_SIZE, sharedMem, stream);
launch_kernel(kernel, policy, indices, count, workspace, num_classes, background_class_id, topK, topK_gs / 32);
}
}
std::size_t getGridNMSWorkspaceSizePerBatchItem(std::size_t num_classes, std::size_t classwise_topK);
template void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<__half> bboxes, int, bool normalized_bbox, float nms_threshold);
template void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<float> bboxes, int, bool normalized_bbox, float nms_threshold);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,68 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP
#define OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP
#include "types.hpp"
#include "index_helpers.hpp"
#include <cuda_runtime.h>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
template <int dim, class index_type = device::index_type, class size_type = device::size_type>
class grid_stride_range_generic {
public:
__device__ grid_stride_range_generic(index_type to_) : from(0), to(to_) { }
__device__ grid_stride_range_generic(index_type from_, index_type to_) : from(from_), to(to_) { }
class iterator
{
public:
__device__ iterator(index_type pos_) : pos(pos_) {}
/* these iterators return the index when dereferenced; this allows us to loop
* through the indices using a range-based for loop
*/
__device__ index_type operator*() const { return pos; }
__device__ iterator& operator++() {
pos += getGridDim<dim>() * static_cast<index_type>(getBlockDim<dim>());
return *this;
}
__device__ bool operator!=(const iterator& other) const {
/* NOTE HACK
* 'pos' can move in large steps (see operator++)
* expansion of the range-based for loop uses != as the loop condition
* => operator!= must return false if 'pos' crosses the end
*/
return pos < other.pos;
}
private:
index_type pos;
};
__device__ iterator begin() const {
return iterator(from + getBlockDim<dim>() * getBlockIdx<dim>() + getThreadIdx<dim>());
}
__device__ iterator end() const {
return iterator(to);
}
private:
index_type from, to;
};
using grid_stride_range_x = grid_stride_range_generic<0>;
using grid_stride_range_y = grid_stride_range_generic<1>;
using grid_stride_range_z = grid_stride_range_generic<2>;
using grid_stride_range = grid_stride_range_x;
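/* Illustrative usage (not part of the original header): a grid-stride loop over `n` elements
 * typically looks like
 *
 *     __global__ void scale(float* data, float alpha, device::size_type n) {
 *         for (auto i : grid_stride_range(n))   // starts at the global thread index,
 *             data[i] *= alpha;                 // advances by gridDim.x * blockDim.x
 *     }
 *
 * so a launch with fewer threads than `n` still covers every index exactly once.
 */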
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
#endif /* OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP */

View File

@@ -0,0 +1,41 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_INDEX_HELPERS_HPP
#define OPENCV_DNN_SRC_CUDA_INDEX_HELPERS_HPP
#include "types.hpp"
#include <cuda_runtime.h>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
namespace detail {
using dim3_member_type = decltype(dim3::x);
using uint3_member_type = decltype(uint3::x);
}
template <int> __device__ detail::dim3_member_type getGridDim();
template <> inline __device__ detail::dim3_member_type getGridDim<0>() { return gridDim.x; }
template <> inline __device__ detail::dim3_member_type getGridDim<1>() { return gridDim.y; }
template <> inline __device__ detail::dim3_member_type getGridDim<2>() { return gridDim.z; }
template <int> __device__ detail::dim3_member_type getBlockDim();
template <> inline __device__ detail::dim3_member_type getBlockDim<0>() { return blockDim.x; }
template <> inline __device__ detail::dim3_member_type getBlockDim<1>() { return blockDim.y; }
template <> inline __device__ detail::dim3_member_type getBlockDim<2>() { return blockDim.z; }
template <int> __device__ detail::uint3_member_type getBlockIdx();
template <> inline __device__ detail::uint3_member_type getBlockIdx<0>() { return blockIdx.x; }
template <> inline __device__ detail::uint3_member_type getBlockIdx<1>() { return blockIdx.y; }
template <> inline __device__ detail::uint3_member_type getBlockIdx<2>() { return blockIdx.z; }
template <int> __device__ detail::uint3_member_type getThreadIdx();
template <> inline __device__ detail::uint3_member_type getThreadIdx<0>() { return threadIdx.x; }
template <> inline __device__ detail::uint3_member_type getThreadIdx<1>() { return threadIdx.y; }
template <> inline __device__ detail::uint3_member_type getThreadIdx<2>() { return threadIdx.z; }
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
#endif /* OPENCV_DNN_SRC_CUDA_INDEX_HELPERS_HPP */

View File

@@ -0,0 +1,94 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP
#define OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP
#include <cstddef>
#include <type_traits>
/* The performance of many kernels is highly dependent on the tensor rank. Instead of having
* one kernel which can work with the maximally ranked tensors, we make one kernel for each supported
* tensor rank. This is to ensure that the requirements of the maximally ranked tensors do not take a
* toll on the performance of the operation for low ranked tensors. Hence, many kernels take the tensor
* rank as a template parameter.
*
* The kernel is a template and we have different instantiations for each rank. This causes the following pattern
* to arise frequently:
*
* if(rank == 3)
* kernel<T, 3>();
* else if(rank == 2)
* kernel<T, 2>();
* else
* kernel<T, 1>();
*
* The rank is a runtime variable. To facilitate creation of such structures, we use GENERATE_KERNEL_DISPATCHER.
* This macro creates a function which selects the correct kernel instantiation at runtime.
*
* Example:
*
* // function which setups the kernel and launches it
* template <class T, std::size_t Rank>
* void launch_some_kernel(...);
*
* // creates the dispatcher named "some_dispatcher" which invokes the correct instantiation of "launch_some_kernel"
* GENERATE_KERNEL_DISPATCHER(some_dispatcher, launch_some_kernel);
*
* // internal API function
* template <class T>
* void some(...) {
* // ...
* auto rank = input.rank();
* some_dispatcher<T, MIN_RANK, MAX_RANK>(rank, ...);
* }
*/
/*
* name name of the dispatcher function that is generated
* func template function that requires runtime selection
*
* T first template parameter to `func`
* start starting rank
* end ending rank (inclusive)
*
* Executes func<T, selector> based on the runtime `selector` argument, provided that `selector` lies
* within the range [start, end]. If it falls outside the range, no instantiation of `func` is executed.
*/
#define GENERATE_KERNEL_DISPATCHER(name,func); \
template <class T, std::size_t start, std::size_t end, class... Args> static \
typename std::enable_if<start == end, void> \
::type name(int selector, Args&& ...args) { \
if(selector == start) \
func<T, start>(std::forward<Args>(args)...); \
} \
\
template <class T, std::size_t start, std::size_t end, class... Args> static \
typename std::enable_if<start != end, void> \
::type name(int selector, Args&& ...args) { \
if(selector == start) \
func<T, start>(std::forward<Args>(args)...); \
else \
name<T, start + 1, end, Args...>(selector, std::forward<Args>(args)...); \
}
// Same as GENERATE_KERNEL_DISPATCHER but takes two class template parameters TP1 and TP2 instead of just T
#define GENERATE_KERNEL_DISPATCHER_2TP(name,func); \
template <class TP1, class TP2, std::size_t start, std::size_t end, class... Args> static \
typename std::enable_if<start == end, void> \
::type name(int selector, Args&& ...args) { \
if(selector == start) \
func<TP1, TP2, start>(std::forward<Args>(args)...); \
} \
\
template <class TP1, class TP2, std::size_t start, std::size_t end, class... Args> static \
typename std::enable_if<start != end, void> \
::type name(int selector, Args&& ...args) { \
if(selector == start) \
func<TP1, TP2, start>(std::forward<Args>(args)...); \
else \
name<TP1, TP2, start + 1, end, Args...>(selector, std::forward<Args>(args)...); \
}
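/* Illustrative usage (not in the original header), mirroring the example in the comment above:
 * given
 *
 *     template <class T, std::size_t Rank> void launch_some_kernel(int arg);
 *     GENERATE_KERNEL_DISPATCHER(some_dispatcher, launch_some_kernel);
 *
 * a call such as `some_dispatcher<float, 1, 4>(rank, arg)` recursively peels off one rank per
 * instantiation and finally invokes `launch_some_kernel<float, rank>(arg)` when `rank` lies in
 * [1, 4]; an out-of-range selector falls through without launching anything.
 */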
#endif /* OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP */

View File

@@ -0,0 +1,36 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_LIMITS_HPP
#define OPENCV_DNN_SRC_CUDA_LIMITS_HPP
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cfloat>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
template <class T>
struct numeric_limits;
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <>
struct numeric_limits<__half> {
__device__ static __half min() { return 0.0000610; }
__device__ static __half max() { return 65504.0; }
__device__ static __half lowest() { return -65504.0; }
};
#endif
template <>
struct numeric_limits<float> {
__device__ static float min() { return FLT_MIN; }
__device__ static float max() { return FLT_MAX; }
__device__ static float lowest() { return -FLT_MAX; }
};
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
#endif /* OPENCV_DNN_SRC_CUDA_LIMITS_HPP */

View File

@@ -0,0 +1,154 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_MATH_HPP
#define OPENCV_DNN_SRC_CUDA_MATH_HPP
#include <cuda_runtime.h>
#include <cuda_fp16.h>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
template <class T> __device__ T abs(T val) { return (val < T(0) ? -val : val); }
template <> inline __device__ float abs(float val) { return fabsf(val); }
template <> inline __device__ double abs(double val) { return fabs(val); }
template <class T> __device__ T exp(T val);
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half exp(__half val) { return hexp(val); }
#endif
template <> inline __device__ float exp(float val) { return expf(val); }
template <> inline __device__ double exp(double val) { return ::exp(val); }
template <class T> __device__ T expm1(T val);
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half expm1(__half val) { return hexp(val) - __half(1); }
#endif
template <> inline __device__ float expm1(float val) { return expm1f(val); }
template <> inline __device__ double expm1(double val) { return ::expm1(val); }
template <class T> __device__ T max(T x, T y) { return (x > y ? x : y); }
template <> inline __device__ float max(float x, float y) { return fmaxf(x, y); }
template <> inline __device__ double max(double x, double y) { return fmax(x, y); }
template <class T> __device__ T min(T x, T y) { return (x > y ? y : x); }
template <> inline __device__ float min(float x, float y) { return fminf(x, y); }
template <> inline __device__ double min(double x, double y) { return fmin(x, y); }
template <class T> __device__ T log1p(T val);
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half log1p(__half val) { return hlog(__half(1) + val); }
#endif
template <> inline __device__ float log1p(float val) { return log1pf(val); }
template <class T> __device__ T log1pexp(T val);
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half log1pexp(__half val) {
if (val <= __half(-4.0))
return exp(val);
else if (val <= __half(8.0))
return log1p(exp(val));
else if (val <= __half(8.7))
return val + exp(-val);
else
return val;
}
#endif
template <> inline __device__ float log1pexp(float val) {
if (val <= -20)
return expf(val);
else if (val <= 9.0)
return log1pf(expf(val));
else if (val <= 14.6)
return val + exp(-val);
else
return val;
}
template <> inline __device__ double log1pexp(double val) {
if (val <= -37)
return exp(val);
else if (val <= 18)
return log1p(exp(val));
else if (val <= 33.3)
return val + exp(-val);
else
return val;
}
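/* Explanatory note (not in the original): the piecewise thresholds above implement a numerically
 * stable softplus. For very negative v, log1p(exp(v)) ~= exp(v); for moderate v the direct
 * log1p(exp(v)) is accurate; for large v, log(1 + e^v) = v + log(1 + e^-v) ~= v + e^-v; and
 * beyond that e^-v underflows, so the result is simply v.
 */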
template <class T> __device__ T tanh(T val);
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half tanh(__half val) { return tanhf(val); }
#endif
template <> inline __device__ float tanh(float val) { return tanhf(val); }
template <> inline __device__ double tanh(double val) { return ::tanh(val); }
template <class T> __device__ T pow(T val, T exp);
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half pow(__half val, __half exp) { return powf(val, exp); }
#endif
template <> inline __device__ float pow(float val, float exp) { return powf(val, exp); }
template <> inline __device__ double pow(double val, double exp) { return ::pow(val, exp); }
template <class T> __device__ T sqrt(T val);
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half sqrt(__half val) { return hsqrt(val); }
#endif
template <> inline __device__ float sqrt(float val) { return sqrtf(val); }
template <> inline __device__ double sqrt(double val) { return ::sqrt(val); }
template <class T> __device__ T rsqrt(T val);
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half rsqrt(__half val) { return hrsqrt(val); }
#endif
template <> inline __device__ float rsqrt(float val) { return rsqrtf(val); }
template <> inline __device__ double rsqrt(double val) { return ::rsqrt(val); }
template <class T> __device__ T sigmoid(T val) { return T(1) / (T(1) + exp(-val)); }
template <class T> __device__ T clamp(T value, T lower, T upper) { return min(max(value, lower), upper); }
template <class T> __device__ long lround(T value);
template <> inline __device__ long lround(double value) { return ::lround(value); }
template <> inline __device__ long lround(float value) { return lroundf(value); }
template <class T> __device__ T round(T value);
template <> inline __device__ double round(double value) { return ::round(value); }
template <> inline __device__ float round(float value) { return roundf(value); }
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half round(__half value) { return hrint(value); }
#endif
template <class T> __device__ T ceil(T value);
template <> inline __device__ double ceil(double value) { return ::ceil(value); }
template <> inline __device__ float ceil(float value) { return ceilf(value); }
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half ceil(__half value) { return hceil(value); }
#endif
template <class T> __device__ T mul_ftz(T x, T y) { return x * y; }
template <> inline __device__ float mul_ftz(float x, float y) {
float result;
asm("mul.ftz.f32 %0, %1, %2;" : "=f"(result) : "f"(x), "f"(y));
return result;
}
template <class T> __device__ T fast_divide(T x, T y) { return x / y; }
template <> inline __device__ float fast_divide(float x, float y) { return __fdividef(x, y); }
template <class T> __device__ T fast_divide_ftz(T x, T y) { return fast_divide(x, y); }
template <> inline __device__ float fast_divide_ftz(float x, float y) {
float result;
asm("div.approx.ftz.f32 %0, %1, %2;" : "=f"(result) : "f"(x), "f"(y));
return result;
}
template <class T> __device__ T fast_exp(T value) { return exp(value); }
template <> inline __device__ float fast_exp(float value) { return __expf(value); }
template <class T> __device__ T fast_sigmoid(T value) { return sigmoid(value); }
template <> inline __device__ float fast_sigmoid(float value) { return __fdividef(1, 1 + __expf(-value)); }
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
#endif /* OPENCV_DNN_SRC_CUDA_MATH_HPP */

View File

@@ -0,0 +1,328 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "math.hpp"
#include "array.hpp"
#include "limits.hpp"
#include "types.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/kernels/fill_copy.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
#include <vector>
#include <type_traits>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t Order,
typename std::enable_if<Order == 1 || Order == 2 || Order == 3, bool>::type = true> /* Order has been hardcoded; see code */
__global__ void max_pooling_with_indices(
Span<T> output, Span<T> indices, View<T> input, size_type channels,
array<size_type, Order> out_spatial_dims, array<size_type, Order> in_spatial_dims,
array<size_type, Order> window_size, array<size_type, Order> strides, array<size_type, Order> padding_left)
{
/* every element in the output is mapped to a window in the input and each thread processes several windows */
for (auto idx : grid_stride_range(output.size())) {
size_type out_spatial_size = 1;
array<index_type, Order> window_idx;
for (int i = Order - 1; i >= 0; i--) {
window_idx[i] = (idx / out_spatial_size) % out_spatial_dims[i];
out_spatial_size *= out_spatial_dims[i];
}
const index_type n = idx / (out_spatial_size * channels);
const index_type c = (idx / out_spatial_size) % channels;
array<index_type, Order> start;
for(int i = 0; i < Order; i++)
start[i] = window_idx[i] * strides[i] - padding_left[i];
array<index_type, Order> end;
for (int i = 0; i < Order; i++) {
using device::min;
end[i] = min<index_type>(start[i] + window_size[i], in_spatial_dims[i]);
}
for (int i = 0; i < Order; i++) {
using device::max;
start[i] = max(start[i], 0);
}
T max_value = numeric_limits<T>::lowest();
index_type max_idx = -1;
size_type in_spatial_size = 1;
for (int i = 0; i < Order; i++)
in_spatial_size *= in_spatial_dims[i];
const auto outer_offset = (n * channels + c) * in_spatial_size;
if (Order == 1) {
array<index_type, Order> idx;
for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
index_type offset = 0;
index_type stride = 1;
for (int i = Order - 1; i >= 0; i--) {
offset += stride * idx[i];
stride *= in_spatial_dims[i];
}
if (input[outer_offset + offset] > max_value) {
max_idx = offset;
max_value = input[outer_offset + offset];
}
}
} else if (Order == 2) {
array<index_type, Order> idx;
for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
for (idx[1] = start[1]; idx[1] != end[1]; idx[1]++) {
index_type offset = 0;
index_type stride = 1;
for (int i = Order - 1; i >= 0; i--) {
offset += stride * idx[i];
stride *= in_spatial_dims[i];
}
if (input[outer_offset + offset] > max_value) {
max_idx = offset;
max_value = input[outer_offset + offset];
}
}
}
} else if(Order == 3) {
array<index_type, Order> idx;
for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
for (idx[1] = start[1]; idx[1] != end[1]; idx[1]++) {
for (idx[2] = start[2]; idx[2] != end[2]; idx[2]++) {
index_type offset = 0;
index_type stride = 1;
for (int i = Order - 1; i >= 0; i--) {
offset += stride * idx[i];
stride *= in_spatial_dims[i];
}
if (input[outer_offset + offset] > max_value) {
max_idx = offset;
max_value = input[outer_offset + offset];
}
}
}
}
}
output[idx] = max_value;
indices[idx] = max_idx;
}
}
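/* Note (not in the original): `max_idx` stored above is the flattened spatial offset within the
 * (n, c) slice, not a global element index; max_unpooling below adds the same
 * (n * channels + c) * spatial_size base offset before scattering, so the pooling and unpooling
 * kernels agree on the index convention.
 */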
template <class T, std::size_t Order>
__global__ void max_unpooling(
Span<T> output, View<T> input, View<T> indices, size_type channels,
array<size_type, Order> out_spatial_dims, array<size_type, Order> in_spatial_dims,
array<size_type, Order> window_size, array<size_type, Order> strides, array<size_type, Order> padding_left)
{
/* the output has already been zero filled */
/* Every input value represents a window in the output. The max unpooling operation
* copies the input value to exactly one location in the output window which is given
* by the indices tensor.
*/
for (auto idx : grid_stride_range(input.size())) {
size_type in_spatial_size = 1;
array<index_type, Order> window_idx;
for (int i = Order - 1; i >= 0; i--) {
window_idx[i] = (idx / in_spatial_size) % in_spatial_dims[i];
in_spatial_size *= in_spatial_dims[i];
}
const index_type n = idx / (in_spatial_size * channels);
const index_type c = (idx / in_spatial_size) % channels;
array<index_type, Order> start;
for (int i = 0; i < Order; i++) {
using device::min;
using device::max;
start[i] = max(0, min(window_idx[i] * strides[i] - padding_left[i], out_spatial_dims[i] - 1));
}
size_type out_spatial_size = 1;
for (int i = 0; i < Order; i++)
out_spatial_size *= out_spatial_dims[i];
index_type outer_offset = (n * channels + c) * out_spatial_size;
output[outer_offset + static_cast<index_type>(indices[idx])] = input[idx];
}
}
}
template <class T, std::size_t Order> static
void launch_max_pooling_kernel(
const Stream& stream,
Span<T> output, Span<T> indices, View<T> input, std::size_t channels,
const std::vector<std::size_t>& out_spatial_dims, const std::vector<std::size_t>& in_spatial_dims,
const std::vector<std::size_t>& window_size,
const std::vector<std::size_t>& strides, const std::vector<std::size_t>& padding_left)
{
CV_Assert(indices.size() == output.size());
CV_Assert(out_spatial_dims.size() == Order);
CV_Assert(in_spatial_dims.size() == Order);
CV_Assert(window_size.size() == Order);
CV_Assert(strides.size() == Order);
CV_Assert(padding_left.size() == Order);
array<size_type, Order> out_spatial_dims_k, in_spatial_dims_k;
out_spatial_dims_k.assign(std::begin(out_spatial_dims), std::end(out_spatial_dims));
in_spatial_dims_k.assign(std::begin(in_spatial_dims), std::end(in_spatial_dims));
array<size_type, Order> window_size_k, strides_k, padding_left_k;
window_size_k.assign(std::begin(window_size), std::end(window_size));
strides_k.assign(std::begin(strides), std::end(strides));
padding_left_k.assign(std::begin(padding_left), std::end(padding_left));
auto kernel = raw::max_pooling_with_indices<T, Order>;
auto policy = make_policy(kernel, output.size(), 0, stream);
launch_kernel(kernel, policy, output, indices, input, channels,
out_spatial_dims_k, in_spatial_dims_k, window_size_k, strides_k, padding_left_k);
}
template <class T>
void max_pooling_with_indices(
const Stream& stream,
TensorSpan<T> output, TensorSpan<T> indices, TensorView<T> input,
const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
const std::vector<std::size_t>& padding_left)
{
CV_Assert(is_shape_same(output, indices));
CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
auto order = window_size.size();
CV_Assert(strides.size() == order);
CV_Assert(padding_left.size() == order);
CV_Assert(output.rank() == order + 2);
CV_Assert(input.rank() == order + 2);
std::vector<std::size_t> out_spatial_dims(order), in_spatial_dims(order);
for (int i = 0; i < order; i++) {
in_spatial_dims[i] = input.get_axis_size(2 + i);
out_spatial_dims[i] = output.get_axis_size(2 + i);
}
CV_Assert(1 <= order && order <= 3);
std::size_t channels = input.get_axis_size(1);
if (order == 3) {
launch_max_pooling_kernel<T, 3>(stream, output, indices, input, channels,
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
} else if (order == 2) {
launch_max_pooling_kernel<T, 2>(stream, output, indices, input, channels,
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
} else if (order == 1) {
launch_max_pooling_kernel<T, 1>(stream, output, indices, input, channels,
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void max_pooling_with_indices(const Stream&,
TensorSpan<__half>, TensorSpan<__half>, TensorView<__half>,
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
const std::vector<std::size_t>&);
#endif
template void max_pooling_with_indices(const Stream&,
TensorSpan<float>, TensorSpan<float>, TensorView<float>,
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
const std::vector<std::size_t>&);
template <class T, std::size_t Order> static
void launch_max_unpooling_kernel(
const Stream& stream,
Span<T> output, View<T> input, View<T> indices, std::size_t channels,
const std::vector<std::size_t>& out_spatial_dims, const std::vector<std::size_t>& in_spatial_dims,
const std::vector<std::size_t>& window_size,
const std::vector<std::size_t>& strides, const std::vector<std::size_t>& padding_left)
{
CV_Assert(out_spatial_dims.size() == Order);
CV_Assert(in_spatial_dims.size() == Order);
CV_Assert(window_size.size() == Order);
CV_Assert(strides.size() == Order);
CV_Assert(padding_left.size() == Order);
CV_Assert(indices.size() == input.size());
array<size_type, Order> out_spatial_dims_k, in_spatial_dims_k;
out_spatial_dims_k.assign(std::begin(out_spatial_dims), std::end(out_spatial_dims));
in_spatial_dims_k.assign(std::begin(in_spatial_dims), std::end(in_spatial_dims));
array<size_type, Order> window_size_k, strides_k, padding_left_k;
window_size_k.assign(std::begin(window_size), std::end(window_size));
strides_k.assign(std::begin(strides), std::end(strides));
padding_left_k.assign(std::begin(padding_left), std::end(padding_left));
auto kernel = raw::max_unpooling<T, Order>;
auto policy = make_policy(kernel, input.size(), 0, stream);
launch_kernel(kernel, policy, output, input, indices, channels,
out_spatial_dims_k, in_spatial_dims_k, window_size_k, strides_k, padding_left_k);
}
template <class T>
void max_unpooling(
const Stream& stream,
TensorSpan<T> output, TensorView<T> input, TensorView<T> indices,
const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
const std::vector<std::size_t>& padding_left)
{
CV_Assert(is_shape_same(input, indices));
CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
auto order = window_size.size();
CV_Assert(strides.size() == order);
CV_Assert(padding_left.size() == order);
CV_Assert(output.rank() == order + 2);
CV_Assert(input.rank() == order + 2);
std::vector<std::size_t> out_spatial_dims(order), in_spatial_dims(order);
for (int i = 0; i < order; i++) {
in_spatial_dims[i] = input.get_axis_size(2 + i);
out_spatial_dims[i] = output.get_axis_size(2 + i);
}
kernels::fill<T>(stream, output, 0.0);
/* only max_unpooling2d and max_unpooling3d are supported */
CV_Assert(2 <= order && order <= 3);
std::size_t channels = input.get_axis_size(1);
if (order == 3) {
launch_max_unpooling_kernel<T, 3>(stream, output, input, indices, channels,
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
} else if (order == 2) {
launch_max_unpooling_kernel<T, 2>(stream, output, input, indices, channels,
out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void max_unpooling(const Stream&,
TensorSpan<__half>, TensorView<__half>, TensorView<__half>,
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
const std::vector<std::size_t>&);
#endif
template void max_unpooling(const Stream&,
TensorSpan<float>, TensorView<float>, TensorView<float>,
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
const std::vector<std::size_t>&);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,32 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_MEMORY_HPP
#define OPENCV_DNN_SRC_CUDA_MEMORY_HPP
#include <cuda_runtime.h>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
template <class T>
__device__ T load_ldg(const T& src) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
return __ldg(&src);
#else
return src;
#endif
}
template <class T>
__device__ T load_ldg(const T* src) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
return __ldg(src);
#else
return *src;
#endif
}
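/* Illustrative usage (not in the original header; `bias_ptr` is a hypothetical name): wrapping a
 * read-only global load, e.g. `T bias = load_ldg(bias_ptr[c]);`, routes the read through the
 * read-only data cache (__ldg) on sm_35+ devices and degrades to a plain load elsewhere; the
 * caller must guarantee the memory is not written for the lifetime of the kernel.
 */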
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
#endif /* OPENCV_DNN_SRC_CUDA_MEMORY_HPP */

View File

@@ -0,0 +1,145 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "math.hpp"
#include "types.hpp"
#include "atomics.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T>
__global__ void reduce_mean(Span<float> means, View<T> input, size_type inner_size) {
for (auto idx : grid_stride_range(input.size())) {
const index_type outer_idx = idx / inner_size;
atomicAdd(&means[outer_idx], static_cast<float>(input[idx]) / inner_size);
}
}
template <class T>
__global__ void reduce_mean_sqr_sum(Span<float> means, Span<float> sum_sqrs, View<T> input, size_type inner_size) {
for (auto idx : grid_stride_range(input.size())) {
const index_type outer_idx = idx / inner_size;
auto x = static_cast<float>(input[idx]);
atomicAdd(&means[outer_idx], x / inner_size);
atomicAdd(&sum_sqrs[outer_idx], x * x);
}
}
__global__ void compute_normalization_scale(Span<float> scale, View<float> means, View<float> sums_sqr, size_type inner_size, float eps) {
for (auto idx : grid_stride_range(scale.size())) {
auto mean = means[idx];
auto var = sums_sqr[idx] / inner_size - mean * mean;
using device::rsqrt;
scale[idx] = rsqrt(eps + var);
}
}
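/* Explanatory note (not in the original): the kernel above uses the identity
 * Var[x] = E[x^2] - (E[x])^2, so `sums_sqr[idx] / inner_size - mean * mean` is the biased
 * variance of the slice and `scale` becomes 1 / sqrt(var + eps), the factor applied later in
 * normalize_mean_variance.
 */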
template <class T>
__global__ void normalize_mean(Span<T> output, View<T> input, View<float> means, size_type inner_size) {
for (auto idx : grid_stride_range(output.size())) {
const index_type outer_idx = idx / inner_size;
output[idx] = static_cast<float>(input[idx]) - means[outer_idx];
}
}
template <class T>
__global__ void normalize_mean_variance(Span<T> output, View<T> input, View<float> means, View<float> scale, size_type inner_size) {
for (auto idx : grid_stride_range(output.size())) {
const index_type outer_idx = idx / inner_size;
output[idx] = (static_cast<float>(input[idx]) - means[outer_idx]) * scale[outer_idx];
}
}
}
template <class T>
void reduce_mean(const Stream& stream, Span<float> means, View<T> input, std::size_t inner_size)
{
CV_Assert(input.size() / inner_size == means.size());
auto kernel = raw::reduce_mean<T>;
auto policy = make_policy(kernel, input.size(), 0, stream);
launch_kernel(kernel, policy, means, input, inner_size);
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void reduce_mean(const Stream&, Span<float>, View<__half>, std::size_t);
#endif
template void reduce_mean(const Stream&, Span<float>, View<float>, std::size_t);
template <class T>
void reduce_mean_sqr_sum(const Stream& stream, Span<float> means, Span<float> sum_sqrs, View<T> input, std::size_t inner_size)
{
CV_Assert(input.size() / inner_size == means.size());
CV_Assert(input.size() / inner_size == sum_sqrs.size());
auto kernel = raw::reduce_mean_sqr_sum<T>;
auto policy = make_policy(kernel, input.size(), 0, stream);
launch_kernel(kernel, policy, means, sum_sqrs, input, inner_size);
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void reduce_mean_sqr_sum(const Stream&, Span<float>, Span<float>, View<__half>, std::size_t);
#endif
template void reduce_mean_sqr_sum(const Stream&, Span<float>, Span<float>, View<float>, std::size_t);
void compute_normalization_scale(const Stream& stream, Span<float> scale, View<float> means, View<float> sum_sqrs, std::size_t inner_size, float eps)
{
CV_Assert(scale.size() == means.size());
CV_Assert(scale.size() == sum_sqrs.size());
auto kernel = raw::compute_normalization_scale;
auto policy = make_policy(kernel, scale.size(), 0, stream);
launch_kernel(kernel, policy, scale, means, sum_sqrs, inner_size, eps);
}
template <class T>
void normalize_mean(const Stream& stream, Span<T> output, View<T> input, View<float> means, std::size_t inner_size)
{
CV_Assert(output.size() == input.size());
CV_Assert(input.size() / inner_size == means.size());
auto kernel = raw::normalize_mean<T>;
auto policy = make_policy(kernel, output.size(), 0, stream);
launch_kernel(kernel, policy, output, input, means, inner_size);
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void normalize_mean(const Stream&, Span<__half>, View<__half>, View<float>, std::size_t);
#endif
template void normalize_mean(const Stream&, Span<float>, View<float>, View<float>, std::size_t);
template <class T>
void normalize_mean_variance(const Stream& stream, Span<T> output, View<T> input, View<float> means, View<float> scale, std::size_t inner_size)
{
CV_Assert(input.size() == output.size());
CV_Assert(input.size() / inner_size == means.size());
CV_Assert(input.size() / inner_size == scale.size());
auto kernel = raw::normalize_mean_variance<T>;
auto policy = make_policy(kernel, output.size(), 0, stream);
launch_kernel(kernel, policy, output, input, means, scale, inner_size);
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void normalize_mean_variance(const Stream&, Span<__half>, View<__half>, View<float>, View<float>, std::size_t);
#endif
template void normalize_mean_variance(const Stream&, Span<float>, View<float>, View<float>, View<float>, std::size_t);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,123 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "array.hpp"
#include "math.hpp"
#include "types.hpp"
#include "atomics.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/kernels/fill_copy.hpp"
#include "../cuda4dnn/kernels/scale_shift.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T>
__global__ void reduce_sum_abs(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride) {
for (auto idx : grid_stride_range(input.size())) {
const index_type outer_idx = idx / outer_stride;
const index_type inner_idx = idx % mid_stride;
const index_type sum_idx = outer_idx * mid_stride + inner_idx;
atomicAdd(&output[sum_idx], device::abs(input[idx]));
}
}
template <class T>
__global__ void reciprocal(Span<T> output, T epsilon) {
for (auto idx : grid_stride_range(output.size()))
output[idx] = T(1) / (output[idx] + epsilon);
}
template <class T>
__global__ void reduce_sum_squared(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride) {
for (auto idx : grid_stride_range(input.size())) {
const index_type outer_idx = idx / outer_stride;
const index_type inner_idx = idx % mid_stride;
const index_type sum_idx = outer_idx * mid_stride + inner_idx;
atomicAdd(&output[sum_idx], input[idx] * input[idx]);
}
}
template <class T>
__global__ void rsqrt(Span<T> output, T epsilon) {
for (auto idx : grid_stride_range(output.size())) {
using device::sqrt;
output[idx] = T(1) / sqrt(output[idx] + epsilon);
}
}
template <class T>
__global__ void apply_norm(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride, View<T> sums) {
for (auto idx : grid_stride_range(output.size())) {
const index_type outer_idx = idx / outer_stride;
const index_type inner_idx = idx % mid_stride;
const index_type sum_idx = outer_idx * mid_stride + inner_idx;
output[idx] = input[idx] * sums[sum_idx];
}
}
}
template <class T>
void normalize(
const Stream& stream,
Span<T> output,
View<T> input, std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, std::size_t norm, T epsilon,
Span<T> workspace)
{
CV_Assert(output.size() == input.size());
CV_Assert(output.size() == outer_size * mid_size * inner_size);
CV_Assert(norm == 1 || norm == 2);
CV_Assert(workspace.size() >= outer_size * inner_size);
auto sums = Span<T>(workspace.data(), outer_size * inner_size);
fill<T>(stream, sums, 0.0);
if (norm == 1) {
auto reduce_kernel = raw::reduce_sum_abs<T>;
auto policy = make_policy(reduce_kernel, input.size(), 0, stream);
launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size);
auto reciprocal_kernel = raw::reciprocal<T>;
policy = make_policy(reciprocal_kernel, sums.size(), 0, stream);
launch_kernel(reciprocal_kernel, policy, sums, epsilon);
} else {
auto reduce_kernel = raw::reduce_sum_squared<T>;
auto policy = make_policy(reduce_kernel, input.size(), 0, stream);
launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size);
auto rsqrt_kernel = raw::rsqrt<T>;
policy = make_policy(rsqrt_kernel, sums.size(), 0, stream);
launch_kernel(rsqrt_kernel, policy, sums, epsilon);
}
auto scale_kernel = raw::apply_norm<T>;
auto policy = make_policy(scale_kernel, output.size(), 0, stream);
launch_kernel(scale_kernel, policy, output, input, mid_size * inner_size, inner_size, sums);
}
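/* Explanatory sketch (not part of the original file): for norm == 2 the three launches above
 * compute sums[o, i] = sum over m of input[o, m, i]^2, then sums = 1 / sqrt(sums + eps), and
 * finally output[o, m, i] = input[o, m, i] * sums[o, i], i.e. an L2 normalization over the
 * middle axis; norm == 1 does the same with absolute values and a plain reciprocal.
 */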
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void normalize(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t, std::size_t, std::size_t, __half, Span<__half>);
#endif
template void normalize(const Stream&, Span<float>, View<float>, std::size_t, std::size_t, std::size_t, std::size_t, float, Span<float>);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,201 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "array.hpp"
#include "math.hpp"
#include "types.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "kernel_dispatcher.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
#include <vector>
#include <utility>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t Rank>
__global__ void copy_with_reflection101(
Span<T> output, array<size_type, Rank> out_strides, array<index_type, Rank> start, array<index_type, Rank> end,
View<T> input, array<size_type, Rank> in_strides)
{
for (auto i : grid_stride_range(output.size())) {
/* compute output axis indices corresponding to element 'i' */
array<index_type, Rank> out_index;
out_index[0] = i / out_strides[0];
for (int j = 1; j < Rank; j++)
out_index[j] = (i % out_strides[j - 1]) / out_strides[j];
/* compute input axis indices corresponding to output axis indices */
array<index_type, Rank> in_index;
for (int j = 0; j < Rank; j++) {
/* if out_index < start, the point is in the left reflection region
* the reflected value's index is the absolute value of the difference
*
* otherwise, if the value is in the copy region, out_index - start gives the input index
*/
using device::abs;
in_index[j] = abs(out_index[j] - start[j]);
/* if out_index >= end, it's in the right reflection region */
if (out_index[j] >= end[j])
in_index[j] = (end[j] - start[j]) - (out_index[j] - end[j]) - 2;
}
/* compute input element number from input axis indices */
index_type iidx = 0;
for (int j = 0; j < Rank; j++)
iidx += in_index[j] * in_strides[j];
output[i] = input[iidx];
}
}
}
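/* Worked example (not in the original): with start = 2 and end = 7 along one axis, out_index
 * 0..1 reflects to in_index 2..1 via |out - start|, out_index 2..6 maps to 0..4, and out_index
 * 7..8 reflects to 3..2 via (end - start) - (out - end) - 2, which is exactly the
 * BORDER_REFLECT_101 pattern ("gfedcb|abcdefgh|gfedcba").
 */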
template <class T, std::size_t Rank> static
void launch_copy_with_reflection101(
const Stream& stream,
Span<T> output, const std::vector<std::size_t>& outStride,
View<T> input, const std::vector<std::size_t>& inStride,
const std::vector<std::pair<std::size_t, std::size_t>>& ranges)
{
CV_Assert(outStride.size() == Rank);
CV_Assert(inStride.size() == Rank);
CV_Assert(ranges.size() == Rank);
array<size_type, Rank> outStride_k, inStride_k;
outStride_k.assign(std::begin(outStride), std::end(outStride));
inStride_k.assign(std::begin(inStride), std::end(inStride));
array<index_type, Rank> start_k, end_k;
for (int i = 0; i < Rank; i++) {
start_k[i] = ranges[i].first;
end_k[i] = ranges[i].second;
}
auto kernel = raw::copy_with_reflection101<T, Rank>;
auto policy = make_policy(kernel, output.size(), 0, stream);
launch_kernel(kernel, policy, output, outStride_k, start_k, end_k, input, inStride_k);
}
GENERATE_KERNEL_DISPATCHER(copy_with_reflection101_dispatcher, launch_copy_with_reflection101);
template <class T>
void copy_with_reflection101(
const Stream& stream,
TensorSpan<T> output, TensorView<T> input,
std::vector<std::pair<std::size_t, std::size_t>> ranges)
{
CV_Assert(output.rank() == input.rank());
CV_Assert(output.rank() == ranges.size());
/* squeezable axes at the beginning of both tensors can be eliminated
*
* Reasoning:
* ----------
* Suppose an item's indices in the input tensor is [i1, i2, ...]. The indices in the
* output tensor will be [i1 + off1, i2 + off2, ...]. The rest of the elements in the output are padding.
* The padding operation essentially copies items from the input tensor to new locations in the output tensor
* and pads the remaining.
*
* If the size of the first axis of the input and output tensor is unity, the input and output indices
* for all the elements will be of the form [0, i2, ...] and [0, i2 + off2, ...] respectively. Note that
* there cannot be extra padding since the axes have unit size. The first index does not contribute to the
* element's address calculation and hence does nothing apart from eating up a few cycles.
*/
while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
CV_Assert(ranges[0].first == 0 && ranges[0].second == 1);
input.squeeze(0);
output.squeeze(0);
ranges.erase(std::begin(ranges));
CV_Assert(output.rank() == input.rank());
CV_Assert(output.rank() == ranges.size());
}
auto inShape = input.shape_as_vector();
auto outShape = output.shape_as_vector();
/* contiguous axes which do not have any padding can be combined into one axis
*
* Reasoning:
* ----------
* Suppose an item's indices in the input tensor is [i1, i2, i3, ...]. Let the first two axes not have any
* padding. The indices in the output tensor will be [i1, i2, i3 + off3, ...].
*
* Each axis in the contiguous unpadded axes sequence will add an offset of iN * strideN. In the above example,
* the two axes add a total offset of `i1 * stride1 + i2 * stride2`. We can merge the two axes into one axis with
* a size of `size1 * size2`. The new offset added will be `i12 * stride2` as the kernel iterates through `i12`.
* Note that `i12` is actually `(i1 * size2 + i2)` in the original tensor.
*/
for (int i = 0; i < inShape.size(); i++) {
/* check if axis `i` requires any padding */
if (ranges[i].first == 0 && ranges[i].second == inShape[i]) {
/* loop invariant: `i` is the first axis in the contiguous unpadded axis sequence */
CV_Assert(inShape[i] == outShape[i]);
/* we now iterate through the axes which follow and try to merge */
int j = i + 1; /* `j` is the axis which we will attempt to merge */
while (j < inShape.size() && ranges[j].first == 0 && ranges[j].second == inShape[j]) {
CV_Assert(inShape[j] == outShape[j]);
/* `j` is also unpadded; merge `i` and `j` */
auto new_size = inShape[i] * inShape[j];
inShape[i] = new_size;
outShape[i] = new_size;
ranges[i].second = new_size;
/* delete axis `j` */
inShape.erase(std::begin(inShape) + j);
outShape.erase(std::begin(outShape) + j);
ranges.erase(std::begin(ranges) + j);
/* optimizations should not break the invariants */
CV_Assert(inShape.size() == outShape.size());
CV_Assert(inShape.size() == ranges.size());
CV_Assert(inShape[i] == outShape[i]);
CV_Assert(ranges[i].first == 0 && ranges[i].second == inShape[i]);
}
}
}
auto rank = inShape.size();
std::vector<std::size_t> inStride(rank), outStride(rank);
inStride.back() = 1;
outStride.back() = 1;
/* garbage, ..., garbage, 1 */
std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
/* dim[0], dim[1], ..., dim[-1], 1 */
std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<int>());
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<int>());
/* stride[0], stride[1], ..., stride[-2], 1 */
CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
copy_with_reflection101_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, input, inStride, ranges);
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void copy_with_reflection101(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
#endif
template void copy_with_reflection101(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,288 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "array.hpp"
#include "types.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "kernel_dispatcher.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/kernels/fill_copy.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
#include <vector>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t Rank>
__global__ void permute(
array<index_type, Rank> axis_order,
Span<T> output, array<size_type, Rank> outStrides,
View<T> input, array<size_type, Rank> inStrides)
{
for (auto i : grid_stride_range(input.size())) {
index_type oldPosition = 0;
index_type newPosition = i;
for (int j = 0; j < Rank; j++)
{
auto order = axis_order[j];
oldPosition += (newPosition / outStrides[j]) * inStrides[order];
newPosition %= outStrides[j];
}
output[i] = input[oldPosition];
}
}
template <class T, int TILE_SIZE, int ROWS_PER_THREAD>
__global__ void transpose(Span<T> output, View<T> input, size_type in_width, size_type out_width)
{
__shared__ T tile[TILE_SIZE][TILE_SIZE + 1];
/* blockDim.y = TILE_SIZE / ROWS_PER_THREAD, blockDim.x = TILE_SIZE */
const index_type in_x = blockIdx.x * TILE_SIZE + threadIdx.x;
const index_type in_y_begin = blockIdx.y * TILE_SIZE + threadIdx.y;
/* Every valid input location has a corresponding output location and vice versa.
* Hence, if we do not load values into the shared memory for a given location, we
* also won't read them for storing in the output.
*/
for (int j = 0; j < TILE_SIZE; j += TILE_SIZE / ROWS_PER_THREAD)
{
const auto in_y_current = in_y_begin + j;
if (in_x < in_width && in_y_current < out_width)
tile[threadIdx.y + j][threadIdx.x] = input[in_y_current * in_width + in_x];
}
__syncthreads();
/* We interchange `threadIdx.x` and `threadIdx.y` so that consecutive output indices map to
* consecutive threads. This would allow writes across threads in a warp to be coalesced.
*/
const index_type out_x = blockIdx.y * TILE_SIZE + threadIdx.x;
const index_type out_y_begin = blockIdx.x * TILE_SIZE + threadIdx.y;
for (int j = 0; j < TILE_SIZE; j += TILE_SIZE / ROWS_PER_THREAD)
{
const auto out_y_current = out_y_begin + j;
if (out_x < out_width && out_y_current < in_width)
output[out_y_current * out_width + out_x] = tile[threadIdx.x][threadIdx.y + j];
}
}
}
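/* Explanatory note (not in the original): the extra "+ 1" column in the shared-memory tile
 * offsets consecutive rows by one bank, so the column-wise reads in the second loop do not all
 * hit the same shared-memory bank; each TILE_SIZE x TILE_SIZE tile is read coalesced from the
 * input and written coalesced to the output after the in-tile transpose.
 */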
template <class T>
void transpose(const Stream& stream, Span<T> output, View<T> input, std::size_t in_width, std::size_t out_width)
{
/* Each block processes a TILE_SIZE x TILE_SIZE piece */
constexpr int TILE_SIZE = 32;
/* Each thread processes ROWS_PER_THREAD rows. We do this to decrease the number of threads required
* in a block so that the cost of the block-wide synchronization is minimized.
*/
constexpr int ROWS_PER_THREAD = 4;
dim3 grid_size((in_width + TILE_SIZE - 1) / TILE_SIZE, (out_width + TILE_SIZE - 1) / TILE_SIZE);
dim3 block_size(TILE_SIZE, TILE_SIZE / ROWS_PER_THREAD);
auto policy = execution_policy(grid_size, block_size, stream);
auto kernel = raw::transpose<T, TILE_SIZE, ROWS_PER_THREAD>;
launch_kernel(kernel, policy, output, input, in_width, out_width);
}
template void transpose(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t);
template void transpose(const Stream&, Span<float>, View<float>, std::size_t, std::size_t);
template <class T, std::size_t Rank> static
void launch_permute_kernel(
const Stream& stream,
const std::vector<std::size_t>& order,
Span<T> output, const std::vector<std::size_t>& outStride,
View<T> input, const std::vector<std::size_t>& inStride)
{
CV_Assert(order.size() == Rank);
CV_Assert(outStride.size() == Rank);
CV_Assert(inStride.size() == Rank);
array<index_type, Rank> order_k;
order_k.assign(std::begin(order), std::end(order));
array<size_type, Rank> outStride_k, inStride_k;
outStride_k.assign(std::begin(outStride), std::end(outStride));
inStride_k.assign(std::begin(inStride), std::end(inStride));
auto kernel = raw::permute<T, Rank>;
auto policy = make_policy(kernel, input.size(), 0, stream);
launch_kernel(kernel, policy, order_k, output, outStride_k, input, inStride_k);
}
GENERATE_KERNEL_DISPATCHER(permute_dispatcher, launch_permute_kernel);
template <class T>
void permute(
const Stream& stream,
TensorSpan<T> output, TensorView<T> input,
std::vector<std::size_t> order)
{
CV_Assert(output.rank() == input.rank());
CV_Assert(input.rank() == order.size());
CV_Assert(input.size() == output.size());
auto rank = output.rank();
auto inShape = input.shape_as_vector();
auto outShape = output.shape_as_vector();
/* singleton axes do not contribute towards address calculation
*
* Reasoning:
* ----------
* Suppose an item's indices in the input tensor is [i1, i2, ...]. The indices in the
* output tensor will be some permutation of the input tensor indices. Let the output
* tensor indices be [o1, o2, ...]. The permutation operation essentially copies items
* from the input tensor to new locations in the output tensor as dictated by the indices.
*
* If the size of the nth axis (say i2) of the input is one, the input and output indices for
* all the elements will be of the form [i1, 0, ...] and [..., 0, ...] respectively.
* The index does not contribute to the element's address calculation and hence would give an
* identical result if it weren't there.
*/
for (int i = 0; i < rank; i++)
{
/* index `i` corresponds to the axis index in the output; order[i] has the corresponding axis index in the input */
while (i < rank && outShape[i] == 1)
{
int in_i = order[i];
CV_Assert(inShape[in_i] == 1);
/* delete axis `i` */
inShape.erase(std::begin(inShape) + in_i);
outShape.erase(std::begin(outShape) + i);
/* deletion of an axis reduces an axis in the input tensor which would cause the indices
* of the axes that come after the deleted axis to reduce by one
*/
order.erase(order.begin() + i);
for (auto& axis : order)
if (axis > in_i)
axis--;
rank--;
/* optimizations should not break the invariants */
CV_Assert(rank == order.size());
CV_Assert(inShape.size() == order.size());
CV_Assert(outShape.size() == order.size());
CV_Assert(input.size() == output.size());
}
}
/* contiguous axes whose relative ordering stays same before and after permutation can be merged into one axis
* example: in permute order 0 2 3 1, axes 2 and 3 can be grouped into a single axis
*
* Reasoning:
* ----------
* Suppose an item's indices in the input tensor is [i0, i1, i2, i3, ...]. Let the permutation order be [0, 3, 1, 2, ...].
* Note that i1 and i2 are adjacent axes in the same order in input as well as output. The indices in the output tensor
* will be [i0, i3, i1, i2, ...].
*
* Each axis in the contiguous axes sequence will add an offset of iN * strideN. In the above example,
* the two axes add a total offset of `i1 * (size2 * stride2) + i2 * stride2` which is `(i1 * size2 + i2) * stride2`,
* in both input and output. Note stride2 can be different in the input and output. We can merge the two axes into one axis
* with a size of `size1 * size2`. The new offset added will be `i12 * stride12` as the kernel iterates through `i12`. Note
* that `i12` is actually `(i1 * size2 + i2)` and `stride12` is `stride2`.
*/
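/* Concrete example (not in the original): for order = [0, 2, 3, 1] on an NCHW tensor, axes 2 and
 * 3 keep their relative order, so H and W collapse into a single axis of size H * W and the
 * kernel permutes a rank-3 tensor [N, C, HW] -> [N, HW, C]; if the batch axis has size 1 it is
 * squeezed away first, leaving a plain 2-D transpose handled by the fast path below.
 */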
for (int i = 0; i < rank; i++) {
/* the indices used in the loops such as `i` and `j` are axis indices in the output tensor */
/* the corresponding input axis indices are `order[i]` and `order[j]`*/
/* loop invariant: `i` is the first axis in the contiguous unpermuted axis sequence */
int j = i + 1; /* `j` is the axis which we will attempt to merge */
while (j < rank && (order[i] + 1) == order[j]) {
/* axis `i` and axis `j` do not change relative order */
auto in_i = order[i], in_j = order[j];
auto new_size = inShape[in_i] * inShape[in_j];
inShape[in_i] = new_size;
outShape[i] = new_size;
/* delete axis `j` */
inShape.erase(std::begin(inShape) + in_j);
outShape.erase(std::begin(outShape) + j);
/* deletion of an axis reduces an axis in the input tensor which would cause the indices
* of the axes that come after the deleted axis to reduce by one
*/
order.erase(order.begin() + j);
for (auto& axis : order)
if (axis > order[i])
axis--;
rank--;
/* optimizations should not break the invariants */
CV_Assert(rank == order.size());
CV_Assert(inShape.size() == order.size());
CV_Assert(outShape.size() == order.size());
CV_Assert(input.size() == output.size());
}
}
std::vector<std::size_t> inStride(rank), outStride(rank);
inStride.back() = 1;
outStride.back() = 1;
/* garbage, ..., garbage, 1 */
std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
/* dim[0], dim[1], ..., dim[-1], 1 */
std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>());
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
/* stride[0], stride[1], ..., stride[-2], 1 */
const bool is_in_order = [&order] {
for (int i = 0; i < order.size(); i++)
if (order[i] != i)
return false;
return true;
}();
if (is_in_order)
{
kernels::copy<T>(stream, output, input);
}
else if(rank == 2)
{
/* use the more efficient transpose kernel */
transpose<T>(stream, output, input, inShape[1], outShape[1]);
}
else
{
CV_Assert(3 <= rank && rank <= CSL_MAX_TENSOR_RANK);
permute_dispatcher<T, 3, CSL_MAX_TENSOR_RANK>(rank, stream, order, output, outStride, input, inStride);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void permute(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
#endif
template void permute(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */

View File

@@ -0,0 +1,176 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "array.hpp"
#include "math.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include <cstddef>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, bool Normalize>
__global__ void prior_box(
Span<T> output,
View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
size_type layerWidth, size_type layerHeight,
size_type imageWidth, size_type imageHeight)
{
/* each box consists of two pairs of coordinates and hence 4 values in total */
/* since the entire output (the first channel at least) consists of these boxes,
* we are guaranteed that the output is aligned to a boundary of 4 values
*/
using vector_type = get_vector_type_t<T, 4>;
auto output_vPtr = vector_type::get_pointer(output.data());
/* num_points contains the number of points in the feature map of interest
* each iteration of the stride loop selects a point and generates prior boxes for it
*/
size_type num_points = layerWidth * layerHeight;
for (auto idx : grid_stride_range(num_points)) {
const index_type x = idx % layerWidth,
y = idx / layerWidth;
index_type output_offset_v4 = idx * offsetX.size() * boxWidth.size();
for (int i = 0; i < boxWidth.size(); i++) {
for (int j = 0; j < offsetX.size(); j++) {
float center_x = (x + offsetX[j]) * stepX;
float center_y = (y + offsetY[j]) * stepY;
vector_type vec;
if(Normalize) {
vec.data[0] = (center_x - boxWidth[i] * 0.5f) / imageWidth;
vec.data[1] = (center_y - boxHeight[i] * 0.5f) / imageHeight;
vec.data[2] = (center_x + boxWidth[i] * 0.5f) / imageWidth;
vec.data[3] = (center_y + boxHeight[i] * 0.5f) / imageHeight;
} else {
vec.data[0] = center_x - boxWidth[i] * 0.5f;
vec.data[1] = center_y - boxHeight[i] * 0.5f;
vec.data[2] = center_x + boxWidth[i] * 0.5f - 1.0f;
vec.data[3] = center_y + boxHeight[i] * 0.5f - 1.0f;
}
v_store(output_vPtr[output_offset_v4], vec);
output_offset_v4++;
}
}
}
}
template <class T>
__global__ void prior_box_clip(Span<T> output) {
for (auto i : grid_stride_range(output.size())) {
using device::clamp;
output[i] = clamp<T>(output[i], 0.0, 1.0);
}
}
template <class T>
__global__ void prior_box_set_variance1(Span<T> output, float variance) {
using vector_type = get_vector_type_t<T, 4>;
auto output_vPtr = vector_type::get_pointer(output.data());
for (auto i : grid_stride_range(output.size() / 4)) {
vector_type vec;
for (int j = 0; j < 4; j++)
vec.data[j] = variance;
v_store(output_vPtr[i], vec);
}
}
template <class T>
__global__ void prior_box_set_variance4(Span<T> output, array<float, 4> variance) {
using vector_type = get_vector_type_t<T, 4>;
auto output_vPtr = vector_type::get_pointer(output.data());
for (auto i : grid_stride_range(output.size() / 4)) {
vector_type vec;
for(int j = 0; j < 4; j++)
vec.data[j] = variance[j];
v_store(output_vPtr[i], vec);
}
}
}
template <class T, bool Normalize> static
void launch_prior_box_kernel(
const Stream& stream,
Span<T> output, View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
std::size_t layerWidth, std::size_t layerHeight, std::size_t imageWidth, std::size_t imageHeight)
{
auto num_points = layerWidth * layerHeight;
auto kernel = raw::prior_box<T, Normalize>;
auto policy = make_policy(kernel, num_points, 0, stream);
launch_kernel(kernel, policy,
output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
layerWidth, layerHeight, imageWidth, imageHeight);
}
template <class T>
void generate_prior_boxes(
const Stream& stream,
Span<T> output,
View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
std::vector<float> variance,
std::size_t numPriors,
std::size_t layerWidth, std::size_t layerHeight,
std::size_t imageWidth, std::size_t imageHeight,
bool normalize, bool clip)
{
if (normalize) {
launch_prior_box_kernel<T, true>(
stream, output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
layerWidth, layerHeight, imageWidth, imageHeight
);
} else {
launch_prior_box_kernel<T, false>(
stream, output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
layerWidth, layerHeight, imageWidth, imageHeight
);
}
std::size_t channel_size = layerHeight * layerWidth * numPriors * 4;
CV_Assert(channel_size * 2 == output.size());
if (clip) {
auto output_span_c1 = Span<T>(output.data(), channel_size);
auto kernel = raw::prior_box_clip<T>;
auto policy = make_policy(kernel, output_span_c1.size(), 0, stream);
launch_kernel(kernel, policy, output_span_c1);
}
auto output_span_c2 = Span<T>(output.data() + channel_size, channel_size);
if (variance.size() == 1) {
auto kernel = raw::prior_box_set_variance1<T>;
auto policy = make_policy(kernel, output_span_c2.size() / 4, 0, stream);
launch_kernel(kernel, policy, output_span_c2, variance[0]);
} else {
array<float, 4> variance_k;
variance_k.assign(std::begin(variance), std::end(variance));
auto kernel = raw::prior_box_set_variance4<T>;
auto policy = make_policy(kernel, output_span_c2.size() / 4, 0, stream);
launch_kernel(kernel, policy, output_span_c2, variance_k);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void generate_prior_boxes(const Stream&, Span<__half>, View<float>, View<float>, View<float>, View<float>, float, float,
std::vector<float>, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool, bool);
#endif
template void generate_prior_boxes(const Stream&, Span<float>, View<float>, View<float>, View<float>, View<float>, float, float,
std::vector<float>, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool, bool);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
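As a quick cross-check of the box math in raw::prior_box, this host-side sketch computes one normalized prior for a single cell; all values are assumed example inputs, not data produced by the layer.

#include <array>
#include <iostream>

int main() {
    const float stepX = 16.f, stepY = 16.f;               // feature-map stride in image pixels (assumed)
    const float boxW = 64.f, boxH = 64.f;                 // one prior size (assumed)
    const float offX = 0.5f, offY = 0.5f;                 // cell-center offset (assumed)
    const int x = 3, y = 2;                               // feature-map location
    const float imgW = 512.f, imgH = 512.f;

    const float cx = (x + offX) * stepX;
    const float cy = (y + offY) * stepY;
    const std::array<float, 4> box {
        (cx - boxW * 0.5f) / imgW, (cy - boxH * 0.5f) / imgH,   // top-left, normalized
        (cx + boxW * 0.5f) / imgW, (cy + boxH * 0.5f) / imgH    // bottom-right, normalized
    };
    for (float v : box) std::cout << v << ' ';
    std::cout << '\n';
    return 0;
}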

View File

@@ -0,0 +1,216 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "math.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "limits.hpp"
#include "vector_traits.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T>
__global__ void region_box(
Span<T> output, View<T> input, View<T> bias,
size_type boxes_per_cell, size_type box_size,
size_type rows, size_type cols, T scale_x_y,
size_type height_norm, size_type width_norm,
T object_prob_cutoff, bool new_coords)
{
using vector2_type = get_vector_type_t<T, 2>;
auto bias_vPtr = vector2_type::get_pointer(bias.data());
for (auto box_index : grid_stride_range(output.size() / box_size)) {
const auto box_of_the_cell = box_index % boxes_per_cell; /* box number within a cell */
const auto box_offset = box_index * box_size;
const auto batch_inner_size = rows * cols * boxes_per_cell;
const auto row_inner_size = cols * boxes_per_cell;
const auto col_inner_size = boxes_per_cell;
const auto y = (box_index % batch_inner_size) / row_inner_size;
const auto x = (box_index % row_inner_size) / col_inner_size;
/* When new_coords is true, we shouldn't use logistic activation again */
T objectness_prob;
if (new_coords)
{
const auto tmp_x = (input[box_offset + 0] - static_cast<T>(0.5)) * scale_x_y + static_cast<T>(0.5);
const auto tmp_y = (input[box_offset + 1] - static_cast<T>(0.5)) * scale_x_y + static_cast<T>(0.5);
output[box_offset + 0] = fast_divide_ftz(static_cast<T>(x) + tmp_x, static_cast<T>(cols));
output[box_offset + 1] = fast_divide_ftz(static_cast<T>(y) + tmp_y, static_cast<T>(rows));
vector2_type bias_xy;
v_load(bias_xy, bias_vPtr[box_of_the_cell]);
output[box_offset + 2] = input[box_offset + 2] * input[box_offset + 2] *
static_cast<T>(4) * bias_xy.data[0] / static_cast<T>(width_norm);
output[box_offset + 3] = input[box_offset + 3] * input[box_offset + 3] *
static_cast<T>(4) * bias_xy.data[1] / static_cast<T>(height_norm);
objectness_prob = input[box_offset + 4];
}
else
{
const auto tmp_x = (fast_sigmoid(input[box_offset + 0]) - static_cast<T>(0.5)) * scale_x_y + static_cast<T>(0.5);
const auto tmp_y = (fast_sigmoid(input[box_offset + 1]) - static_cast<T>(0.5)) * scale_x_y + static_cast<T>(0.5);
output[box_offset + 0] = fast_divide_ftz(static_cast<T>(x) + tmp_x, static_cast<T>(cols));
output[box_offset + 1] = fast_divide_ftz(static_cast<T>(y) + tmp_y, static_cast<T>(rows));
vector2_type bias_xy;
v_load(bias_xy, bias_vPtr[box_of_the_cell]);
output[box_offset + 2] = fast_exp(input[box_offset + 2]) * bias_xy.data[0] / static_cast<T>(width_norm);
output[box_offset + 3] = fast_exp(input[box_offset + 3]) * bias_xy.data[1] / static_cast<T>(height_norm);
/* squash objectness score into a probability */
objectness_prob = fast_sigmoid(input[box_offset + 4]);
}
/* ignore prediction if the objectness probability is less than the cutoff */
if (objectness_prob < object_prob_cutoff)
objectness_prob = 0;
output[box_offset + 4] = objectness_prob;
}
}
template <class T>
__global__ void region_sigmoid_class_score(Span<T> output, View<T> input, T class_prob_cutoff,
size_type box_size, bool new_coords)
{
for (auto idx : grid_stride_range(output.size())) {
const index_type box_no = idx / box_size;
const index_type start_of_box = box_no * box_size;
const index_type box_offset = idx % box_size;
if (box_offset < 5) {
/* continue as we have already processed these in region_box */
continue;
}
auto objectness_prob = output[start_of_box + 4];
/* the class probabilities we currently have are conditional class probabilities
* given the object
*
* to obtain the actual class probability, we multiply the conditional probability
* with the object probability
*
* when new_coords is true, we shouldn't use logistic activation again.
*/
T actual_class_prob;
if (new_coords)
{
actual_class_prob = objectness_prob * input[idx];
}
else
{
actual_class_prob = objectness_prob * fast_sigmoid(input[idx]);
}
if (actual_class_prob <= class_prob_cutoff)
actual_class_prob = T(0);
output[idx] = actual_class_prob;
}
}
template <class T>
__global__ void region_softmax_class_score(Span<T> output, View<T> input, T class_prob_cutoff, size_type box_size) {
for (auto box_no : grid_stride_range(output.size() / box_size)) {
const index_type start_of_box = box_no * box_size;
const index_type start_idx = start_of_box + 5;
const index_type end_idx = start_of_box + box_size;
auto largest = numeric_limits<T>::lowest();
for (int idx = start_idx; idx < end_idx; idx++) {
using device::max;
largest = max(largest, input[idx]);
}
auto sum = T(0);
for (int idx = start_idx; idx < end_idx; idx++) {
using device::exp;
auto temp = exp(input[idx] - largest);
sum += temp;
output[idx] = temp;
}
for (int idx = start_idx; idx < end_idx; idx++) {
auto softmax_score = output[idx] / sum;
/* the class probabilities we currently have are conditional class probabilities
* given the object
*
* to obtain the actual class probability, we multiply the conditional probability
* with the object probability
*/
auto objectness_prob = output[start_of_box + 4];
auto actual_class_prob = objectness_prob * softmax_score;
if (actual_class_prob <= class_prob_cutoff)
actual_class_prob = T(0);
output[idx] = actual_class_prob;
}
}
}
}
template <class T>
void region(const Stream& stream, Span<T> output, View<T> input, View<T> bias,
T object_prob_cutoff, T class_prob_cutoff,
std::size_t boxes_per_cell, std::size_t box_size,
std::size_t rows, std::size_t cols, T scale_x_y,
std::size_t height_norm, std::size_t width_norm,
bool if_true_sigmoid_else_softmax, /* true = sigmoid, false = softmax */
bool new_coords)
{
CV_Assert(output.size() == input.size());
CV_Assert(output.size() % box_size == 0);
CV_Assert(is_fully_aligned(bias, 2));
auto box_kernel = raw::region_box<T>;
auto box_policy = make_policy(box_kernel, output.size() / box_size, 0, stream);
launch_kernel(box_kernel, box_policy,
output, input, bias, boxes_per_cell, box_size,
rows, cols, scale_x_y, height_norm, width_norm,
object_prob_cutoff, new_coords);
if (if_true_sigmoid_else_softmax) {
auto kernel_score = raw::region_sigmoid_class_score<T>;
auto policy_score = make_policy(kernel_score, output.size(), 0, stream);
launch_kernel(kernel_score, policy_score, output, input, class_prob_cutoff, box_size, new_coords);
} else {
auto kernel_score = raw::region_softmax_class_score<T>;
auto policy_score = make_policy(kernel_score, output.size(), 0, stream);
launch_kernel(kernel_score, policy_score, output, input, class_prob_cutoff, box_size);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void region(const Stream&, Span<__half>, View<__half>, View<__half>,
__half, __half, std::size_t, std::size_t, std::size_t, std::size_t, __half, std::size_t, std::size_t, bool, bool);
#endif
template void region(const Stream&, Span<float>, View<float>, View<float>,
float, float, std::size_t, std::size_t, std::size_t, std::size_t, float, std::size_t, std::size_t, bool, bool);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
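The decode performed by raw::region_box for the classic path (new_coords == false) can be reproduced on the host; the sketch below uses made-up logits and anchor sizes purely for illustration.

#include <cmath>
#include <iostream>

int main() {
    const float raw_box[5] = { 0.2f, -0.1f, 0.3f, 0.5f, 1.2f };  // tx, ty, tw, th, objectness logit (assumed)
    const float anchor_w = 3.f, anchor_h = 4.f;                  // bias pair for this box (assumed)
    const int col = 5, row = 7, cols = 13, rows = 13;            // grid location / grid size
    const float width_norm = 13.f, height_norm = 13.f;
    const float scale_x_y = 1.f, object_cutoff = 0.5f;

    auto sigmoid = [](float v) { return 1.f / (1.f + std::exp(-v)); };
    const float tx = (sigmoid(raw_box[0]) - 0.5f) * scale_x_y + 0.5f;
    const float ty = (sigmoid(raw_box[1]) - 0.5f) * scale_x_y + 0.5f;
    const float bx = (col + tx) / cols;                          // box center, normalized to the grid
    const float by = (row + ty) / rows;
    const float bw = std::exp(raw_box[2]) * anchor_w / width_norm;
    const float bh = std::exp(raw_box[3]) * anchor_h / height_norm;
    float objectness = sigmoid(raw_box[4]);
    if (objectness < object_cutoff) objectness = 0.f;            // same cutoff rule as the kernel
    std::cout << bx << ' ' << by << ' ' << bw << ' ' << bh << ' ' << objectness << '\n';
    return 0;
}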

View File

@@ -0,0 +1,245 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "math.hpp"
#include "types.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "memory.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include <cuda_runtime.h>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t CHANNELS_PER_ITER>
__global__ void resize_nn(
Span<T> output, size_type out_height, size_type out_width,
View<T> input, size_type in_height, size_type in_width,
float o2i_fy, float o2i_fx, bool round, bool half_pixel_centers)
{
auto in_image_size = in_height * in_width;
auto out_image_size = out_height * out_width;
/* think of the output and input as a collection of 2d images with the last axis
* representing the width and the last but one axis representing the height
*
* the remaining axis together form a collection of these images/channels
*/
auto num_effective_channels = output.size() / out_image_size;
/* we process multiple channels every iteration to reuse the identical computation
* involved with the spatial dimensions
*
* if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need
* (num_effective_channels / CHANNELS_PER_ITER) iterations per (x, y) location
*/
auto num_channel_iters_per_xy = (num_effective_channels / CHANNELS_PER_ITER);
/* we need `num_channel_iters_per_xy` iterations per (x, y) and there are `out_image_size`
* combinations of (x, y); hence, we'll need `num_channel_iters_per_xy * out_image_size`
* iterations in total to finish the resize operation
*/
auto iters_required = num_channel_iters_per_xy * out_image_size;
for (auto iter : grid_stride_range(iters_required)) {
const index_type c_start = (iter / out_image_size) * CHANNELS_PER_ITER;
/* note here that consecutive `iter` values will often have consecutive `x` values
* => stores into output will be coalesced across threads
*/
const index_type y = (iter % out_image_size) / out_width;
const index_type x = iter % out_width;
auto in_yf = half_pixel_centers ? (y + 0.5f) * o2i_fy : y * o2i_fy;
auto in_xf = half_pixel_centers ? (x + 0.5f) * o2i_fx : x * o2i_fx;
using device::lround;
index_type in_y = round ? lround(in_yf) : static_cast<index_type>(in_yf);
index_type in_x = round ? lround(in_xf) : static_cast<index_type>(in_xf);
using device::min;
in_y = min(in_y, in_height - 1);
in_x = min(in_x, in_width - 1);
index_type in_idx = c_start * in_image_size + in_y * in_width + in_x;
index_type out_idx = c_start * out_image_size + y * out_width + x;
for (int i = 0; i < CHANNELS_PER_ITER; i++) {
output[out_idx] = load_ldg(input[in_idx]);
in_idx += in_image_size;
out_idx += out_image_size;
}
}
}
template <class T, std::size_t CHANNELS_PER_ITER>
__global__ void resize_bilinear(
Span<T> output, size_type out_height, size_type out_width,
View<T> input, size_type in_height, size_type in_width,
float o2i_fy, float o2i_fx, bool half_pixel_centers)
{
auto in_image_size = in_height * in_width;
auto out_image_size = out_height * out_width;
/* think of the output and input as a collection of 2d images with the last axis
* representing the width and the last but one axis representing the height
*
* the remaining axis together form a collection of these images/channels
*/
auto num_effective_channels = output.size() / out_image_size;
/* we process multiple channels every iteration to reuse the identical computation
* involved with the spatial dimensions
*
* if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need
* (num_effective_channels / CHANNELS_PER_ITER) iterations per (x, y) location
*/
auto num_channel_iters_per_xy = (num_effective_channels / CHANNELS_PER_ITER);
/* we need `num_channel_iters_per_xy` iterations per (x, y) and there are `out_image_size`
* combinations of (x, y); hence, we'll need `num_channel_iters_per_xy * out_image_size`
* iterations in total to finish the resize operation
*/
auto iters_required = num_channel_iters_per_xy * out_image_size;
for (auto iter : grid_stride_range(iters_required)) {
const index_type c_start = (iter / out_image_size) * CHANNELS_PER_ITER;
const index_type c_end = c_start + CHANNELS_PER_ITER;
/* note here that consecutive `iter` values will often have consecutive `x` values
* => stores into output will be coalesced across threads
*/
const index_type y = (iter % out_image_size) / out_width;
const index_type x = iter % out_width;
using device::max;
auto in_x = half_pixel_centers ? max<float>((x + 0.5f) * o2i_fx - 0.5f, 0.0f) : x * o2i_fx;
auto in_y = half_pixel_centers ? max<float>((y + 0.5f) * o2i_fy - 0.5f, 0.0f) : y * o2i_fy;
auto in_x0 = static_cast<index_type>(in_x);
auto in_y0 = static_cast<index_type>(in_y);
using device::min;
auto in_x1 = min<index_type>(in_x0 + 1, in_width - 1);
auto in_y1 = min<index_type>(in_y0 + 1, in_height - 1);
index_type in_offset_r0 = c_start * in_image_size + in_y0 * in_width;
index_type in_offset_r1 = c_start * in_image_size + in_y1 * in_width;
index_type out_idx = c_start * out_image_size + y * out_width + x;
#pragma unroll 1 /* disable unrolling to reduce register pressure; not sure how but it works */
for (auto c = c_start; c < c_end; c++) {
auto v_00 = load_ldg(input[in_offset_r0 + in_x0]),
v_01 = load_ldg(input[in_offset_r0 + in_x1]),
v_10 = load_ldg(input[in_offset_r1 + in_x0]),
v_11 = load_ldg(input[in_offset_r1 + in_x1]);
output[out_idx] =
v_00 +
T(in_y - in_y0) * T(v_10 - v_00) +
T(in_x - in_x0) * T(v_01 - v_00) +
T(in_y - in_y0) * T(in_x - in_x0) * T(v_11 - v_01 - v_10 + v_00);
in_offset_r0 += in_image_size;
in_offset_r1 += in_image_size;
out_idx += out_image_size;
}
}
}
}
template <class T, std::size_t CHANNELS_PER_ITER> static
void launch_multichannel_resize_nn(const Stream& stream,
Span<T> output, size_type out_height, size_type out_width,
View<T> input, size_type in_height, size_type in_width,
float scale_y, float scale_x, bool round, bool half_pixel_centers)
{
auto kernel = raw::resize_nn<T, CHANNELS_PER_ITER>;
auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
}
template <class T>
void resize_nn(const Stream& stream, TensorSpan<T> output, TensorView<T> input, float scale_y, float scale_x, bool round, bool half_pixel_centers) {
auto out_height = output.get_axis_size(-2);
auto out_width = output.get_axis_size(-1);
auto in_height = input.get_axis_size(-2);
auto in_width = input.get_axis_size(-1);
auto num_effective_channels = input.size_range(0, 2);
auto num_iters = num_effective_channels * out_height * out_width;
if (num_effective_channels % 32 == 0 && num_iters > 655360) {
launch_multichannel_resize_nn<T, 32>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
} else if (num_effective_channels % 16 == 0 && num_iters > 327680) {
launch_multichannel_resize_nn<T, 16>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
} else if (num_effective_channels % 8 == 0 && num_iters > 163840) {
launch_multichannel_resize_nn<T, 8>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
} else if (num_effective_channels % 4 == 0 && num_iters > 81920) {
launch_multichannel_resize_nn<T, 4>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
} else if (num_effective_channels % 2 == 0) {
launch_multichannel_resize_nn<T, 2>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
} else {
launch_multichannel_resize_nn<T, 1>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void resize_nn<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, float, float, bool, bool);
#endif
    template void resize_nn<float>(const Stream&, TensorSpan<float>, TensorView<float>, float, float, bool, bool);
template <class T, std::size_t CHANNELS_PER_ITER> static
void launch_multichannel_resize_bilinear(const Stream& stream,
Span<T> output, size_type out_height, size_type out_width,
View<T> input, size_type in_height, size_type in_width,
float scale_y, float scale_x, bool half_pixel_centers)
{
auto kernel = raw::resize_bilinear<T, CHANNELS_PER_ITER>;
auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
}
template <class T>
void resize_bilinear(const Stream& stream, TensorSpan<T> output, TensorView<T> input, float scale_y, float scale_x, bool half_pixel_centers) {
auto out_height = output.get_axis_size(-2);
auto out_width = output.get_axis_size(-1);
auto in_height = input.get_axis_size(-2);
auto in_width = input.get_axis_size(-1);
auto num_effective_channels = input.size_range(0, 2);
auto num_iters = num_effective_channels * out_height * out_width;
if (num_effective_channels % 16 == 0 && num_iters > 163840) {
launch_multichannel_resize_bilinear<T, 16>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
} else if (num_effective_channels % 8 == 0 && num_iters > 81920) {
launch_multichannel_resize_bilinear<T, 8>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
} else if (num_effective_channels % 4 == 0 && num_iters > 40960) {
launch_multichannel_resize_bilinear<T, 4>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
} else if (num_effective_channels % 2 == 0) {
launch_multichannel_resize_bilinear<T, 2>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
} else {
launch_multichannel_resize_bilinear<T, 1>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void resize_bilinear<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, float, float, bool);
#endif
template void resize_bilinear<float>(const Stream&, TensorSpan<float>, TensorView<float>, float, float, bool);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
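For reference, the bilinear sample computed per output pixel in raw::resize_bilinear looks like this on the host; the 2x2 input and the scale are assumed example values.

#include <algorithm>
#include <iostream>

int main() {
    const float input[2][2] = { { 0.f, 1.f }, { 2.f, 3.f } };    // in_height = in_width = 2 (assumed)
    const int in_h = 2, in_w = 2;
    const int x = 1, y = 1;                                      // output pixel in a 4x4 output grid
    const float o2i = 2.f / 4.f;                                 // output-to-input scale
    const bool half_pixel_centers = true;

    const float in_x = half_pixel_centers ? std::max((x + 0.5f) * o2i - 0.5f, 0.f) : x * o2i;
    const float in_y = half_pixel_centers ? std::max((y + 0.5f) * o2i - 0.5f, 0.f) : y * o2i;
    const int x0 = static_cast<int>(in_x), y0 = static_cast<int>(in_y);
    const int x1 = std::min(x0 + 1, in_w - 1), y1 = std::min(y0 + 1, in_h - 1);
    const float v00 = input[y0][x0], v01 = input[y0][x1], v10 = input[y1][x0], v11 = input[y1][x1];
    const float out = v00
        + (in_y - y0) * (v10 - v00)
        + (in_x - x0) * (v01 - v00)
        + (in_y - y0) * (in_x - x0) * (v11 - v01 - v10 + v00);   // same expression as the kernel
    std::cout << out << '\n';
    return 0;
}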

View File

@@ -0,0 +1,181 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "math.hpp"
#include "limits.hpp"
#include "types.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "memory.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include <opencv2/core.hpp>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t CHANNELS_PER_ITER>
__global__ void roi_pooling(
Span<T> output, size_type pooled_height, size_type pooled_width,
View<T> input, size_type in_height, size_type in_width,
View<T> rois, size_type num_channels, float spatial_scale)
{
// input: [1, num_channels, in_height, in_width]
const auto in_image_size = in_height * in_width;
// rois: [num_rois, 5]
auto num_rois = rois.size() / 5;
// output: [num_rois, num_channels, pooled_height, pooled_width]
const auto out_spatial_size = pooled_height * pooled_width;
const auto out_roi_size = num_channels * out_spatial_size;
/* we have to compute the output value for every combination of (roi, c, y, x) in the output
*
* the computation involving (y, x) are identical for all non-spatial dimensions
* the computation and memory requests involving the roi are identical for remaining three axes
*
* we process multiple channels every iteration to reuse the identical computation
* and memory requests involved with the roi and spatial dimensions
*/
/*
* if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need
* (num_channels / CHANNELS_PER_ITER) iterations per (roi, x, y)
*/
auto num_channel_iters_per_roi_xy = num_channels / CHANNELS_PER_ITER;
/* we need `num_channel_iters_per_roi_xy` iterations per (roi, x, y) and there are
* `num_rois` rois and `out_spatial_size` combinations of (x, y)
*/
auto iters_per_roi = num_channel_iters_per_roi_xy * out_spatial_size;
auto iters_required = num_rois * iters_per_roi;
for (auto iter : grid_stride_range(iters_required))
{
const index_type roi_no = iter / iters_per_roi;
const index_type c_start = ((iter % iters_per_roi) / out_spatial_size) * CHANNELS_PER_ITER;
/* note here that consecutive `iter` values will often have consecutive `x` values
* => stores into output will be coalesced across threads
*/
const index_type y = (iter % out_spatial_size) / pooled_width;
const index_type x = iter % pooled_width;
const index_type roi_offset = roi_no * 5;
using device::round;
const index_type batch_id = rois[roi_offset + 0];
const index_type x_start_roi = round(static_cast<float>(rois[roi_offset + 1]) * spatial_scale);
const index_type y_start_roi = round(static_cast<float>(rois[roi_offset + 2]) * spatial_scale);
const index_type x_end_roi = round(static_cast<float>(rois[roi_offset + 3]) * spatial_scale);
const index_type y_end_roi = round(static_cast<float>(rois[roi_offset + 4]) * spatial_scale);
using device::max;
const auto roi_width = max<index_type>(x_end_roi - x_start_roi + 1, 1);
const auto roi_height = max<index_type>(y_end_roi - y_start_roi + 1, 1);
const auto roi_width_ratio = static_cast<float>(roi_width) / pooled_width;
const auto roi_height_ratio = static_cast<float>(roi_height) / pooled_height;
auto x_start = x_start_roi + static_cast<index_type>(x * roi_width_ratio);
auto y_start = y_start_roi + static_cast<index_type>(y * roi_height_ratio);
using device::ceil;
auto x_end = x_start_roi + static_cast<index_type>(ceil((x + 1) * roi_width_ratio));
auto y_end = y_start_roi + static_cast<index_type>(ceil((y + 1) * roi_height_ratio));
using device::max;
x_start = max<index_type>(x_start, 0);
y_start = max<index_type>(y_start, 0);
using device::min;
x_end = min<index_type>(x_end, in_width);
y_end = min<index_type>(y_end, in_height);
index_type in_offset = (batch_id * num_channels + c_start) * in_height * in_width;
index_type out_idx = roi_no * out_roi_size + c_start * out_spatial_size + y * pooled_width + x;
for (int i = 0; i < CHANNELS_PER_ITER; i++)
{
/* We have to set the output to zero if (x_start >= x_end) or (y_start >= y_end). If either
* condition is true, the loops below won't execute even a single iteration. Hence, by setting
* `max_val` to zero in this case, we can combine it with the `else` code.
*/
T max_val = (x_start >= x_end || y_start >= y_end) ? T(0) : device::numeric_limits<T>::lowest();
for (auto iy = y_start; iy < y_end; iy++)
{
const auto in_idx = in_offset + iy * in_width;
for (auto ix = x_start; ix < x_end; ix++)
{
max_val = max(max_val, load_ldg(input[in_idx + ix]));
}
}
output[out_idx] = max_val;
in_offset += in_image_size;
out_idx += out_spatial_size;
}
}
}
}
template <class T, std::size_t CHANNELS_PER_ITER> static
void launch_multichannel_roi_pooling(const Stream& stream,
Span<T> output, size_type pooled_height, size_type pooled_width,
View<T> input, size_type in_height, size_type in_width,
View<T> rois, size_type num_channels, float spatial_scale)
{
auto kernel = raw::roi_pooling<T, CHANNELS_PER_ITER>;
auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
launch_kernel(kernel, policy, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
}
template <class T>
void roi_pooling(const Stream& stream, TensorSpan<T> output, TensorView<T> input, View<T> rois, float spatial_scale)
{
CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
size_type num_channels = output.get_axis_size(1);
size_type pooled_height = output.get_axis_size(2);
size_type pooled_width = output.get_axis_size(3);
size_type in_height = input.get_axis_size(2);
size_type in_width = input.get_axis_size(3);
if (num_channels % 64 == 0) {
launch_multichannel_roi_pooling<T, 64>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
} else if (num_channels % 32 == 0) {
launch_multichannel_roi_pooling<T, 32>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
} else if (num_channels % 16 == 0) {
launch_multichannel_roi_pooling<T, 16>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
} else if (num_channels % 8 == 0) {
launch_multichannel_roi_pooling<T, 8>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
} else if (num_channels % 4 == 0) {
launch_multichannel_roi_pooling<T, 4>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
} else if (num_channels % 2 == 0) {
launch_multichannel_roi_pooling<T, 2>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
} else {
launch_multichannel_roi_pooling<T, 1>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void roi_pooling(const Stream& stream, TensorSpan<__half> output, TensorView<__half> input, View<__half> rois, float spatial_scale);
#endif
template void roi_pooling(const Stream& stream, TensorSpan<float> output, TensorView<float> input, View<float> rois, float spatial_scale);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
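The pooled value of a single bin in raw::roi_pooling can be reproduced on the host as follows; the feature map and ROI are assumed example values.

#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>

int main() {
    const int in_h = 4, in_w = 4, pooled_h = 2, pooled_w = 2;
    const float spatial_scale = 1.f;
    const float feature[4][4] = { {1,2,3,4}, {5,6,7,8}, {9,10,11,12}, {13,14,15,16} };
    const float roi[5] = { 0, 0, 0, 3, 3 };                      // batch id, x1, y1, x2, y2 (assumed)

    const int x_start_roi = static_cast<int>(std::round(roi[1] * spatial_scale));
    const int y_start_roi = static_cast<int>(std::round(roi[2] * spatial_scale));
    const int roi_w = std::max(static_cast<int>(std::round(roi[3] * spatial_scale)) - x_start_roi + 1, 1);
    const int roi_h = std::max(static_cast<int>(std::round(roi[4] * spatial_scale)) - y_start_roi + 1, 1);
    const int bin_x = 1, bin_y = 1;                              // which pooled bin to compute
    const int x_start = std::max(x_start_roi + static_cast<int>(bin_x * float(roi_w) / pooled_w), 0);
    const int y_start = std::max(y_start_roi + static_cast<int>(bin_y * float(roi_h) / pooled_h), 0);
    const int x_end = std::min(x_start_roi + static_cast<int>(std::ceil((bin_x + 1) * float(roi_w) / pooled_w)), in_w);
    const int y_end = std::min(y_start_roi + static_cast<int>(std::ceil((bin_y + 1) * float(roi_h) / pooled_h)), in_h);

    float max_val = (x_start >= x_end || y_start >= y_end) ? 0.f : std::numeric_limits<float>::lowest();
    for (int iy = y_start; iy < y_end; iy++)
        for (int ix = x_start; ix < x_end; ix++)
            max_val = std::max(max_val, feature[iy][ix]);        // max over the ROI sub-window
    std::cout << max_val << '\n';                                // 16 for this example
    return 0;
}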

View File

@@ -0,0 +1,235 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t N>
__global__ void biasN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> bias) {
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
inner_size /= vector_type::size();
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
const index_type bias_idx = (i / inner_size) % bias.size();
vector_type vec;
v_load(vec, input_vPtr[i]);
for(int j = 0; j < vec.size(); j++)
vec.data[j] = vec.data[j] + bias[bias_idx];
v_store(output_vPtr[i], vec);
}
}
template <class T, std::size_t N>
__global__ void scaleN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> weights)
{
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
inner_size /= vector_type::size();
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
const index_type scale_idx = (i / inner_size) % weights.size();
vector_type vec;
v_load(vec, input_vPtr[i]);
for (int j = 0; j < vec.size(); j++)
vec.data[j] = vec.data[j] * weights[scale_idx];
v_store(output_vPtr[i], vec);
}
}
template <class T, std::size_t N>
__global__ void scale1_with_bias1_vec(Span<T> output, View<T> input, T alpha, T beta)
{
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
vector_type vec;
v_load(vec, input_vPtr[i]);
for (int j = 0; j < vec.size(); j++)
vec.data[j] = alpha * vec.data[j] + beta;
v_store(output_vPtr[i], vec);
}
}
template <class T, std::size_t N>
__global__ void scaleN_with_biasN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> weights, View<T> bias)
{
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
inner_size /= vector_type::size();
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
const index_type scale_idx = (i / inner_size) % weights.size();
vector_type vec;
v_load(vec, input_vPtr[i]);
for (int j = 0; j < vec.size(); j++)
vec.data[j] = vec.data[j] * weights[scale_idx] + bias[scale_idx];
v_store(output_vPtr[i], vec);
}
}
}
template <class T, std::size_t N> static
void launch_biasN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> bias){
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
CV_Assert(inner_size % N == 0);
auto kernel = raw::biasN_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input, inner_size, bias);
}
template <class T>
void biasN(
const Stream& stream,
TensorSpan<T> output,
TensorView<T> input, std::size_t inner_size,
TensorView<T> bias)
{
CV_Assert(is_shape_same(input, output));
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
launch_biasN_vec_kernel<T, 4>(stream, output, input, inner_size, bias);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
launch_biasN_vec_kernel<T, 2>(stream, output, input, inner_size, bias);
} else {
launch_biasN_vec_kernel<T, 1>(stream, output, input, inner_size, bias);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void biasN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>);
#endif
template void biasN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>);
template <class T, std::size_t N> static
void launch_scaleN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> weights) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
CV_Assert(inner_size % N == 0);
auto kernel = raw::scaleN_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input, inner_size, weights);
}
template <class T>
void scaleN(
const Stream& stream,
TensorSpan<T> output,
TensorView<T> input, std::size_t inner_size,
TensorView<T> weights)
{
CV_Assert(is_shape_same(input, output));
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
launch_scaleN_vec_kernel<T, 4>(stream, output, input, inner_size, weights);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
launch_scaleN_vec_kernel<T, 2>(stream, output, input, inner_size, weights);
} else {
launch_scaleN_vec_kernel<T, 1>(stream, output, input, inner_size, weights);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void scaleN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>);
#endif
template void scaleN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>);
template <class T, std::size_t N> static
void launch_scale1_with_bias1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T alpha, T beta) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
auto kernel = raw::scale1_with_bias1_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input, alpha, beta);
}
template <class T>
void scale1_with_bias1(const Stream& stream, Span<T> output, View<T> input, T alpha, T beta) {
CV_Assert(output.size() == input.size());
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
launch_scale1_with_bias1_vec_kernel<T, 4>(stream, output, input, alpha, beta);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
launch_scale1_with_bias1_vec_kernel<T, 2>(stream, output, input, alpha, beta);
} else {
launch_scale1_with_bias1_vec_kernel<T, 1>(stream, output, input, alpha, beta);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void scale1_with_bias1<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
#endif
template void scale1_with_bias1<float>(const Stream&, Span<float>, View<float>, float, float);
template <class T, std::size_t N> static
void launch_scaleN_with_biasN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> weights, View<T> bias) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
CV_Assert(inner_size % N == 0);
auto kernel = raw::scaleN_with_biasN_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input, inner_size, weights, bias);
}
template <class T>
void scaleN_with_biasN(
const Stream& stream,
TensorSpan<T> output,
TensorView<T> input, std::size_t inner_size,
TensorView<T> weights, TensorView<T> bias)
{
CV_Assert(is_shape_same(input, output));
CV_Assert(weights.size() == bias.size());
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
launch_scaleN_with_biasN_vec_kernel<T, 4>(stream, output, input, inner_size, weights, bias);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
launch_scaleN_with_biasN_vec_kernel<T, 2>(stream, output, input, inner_size, weights, bias);
} else {
launch_scaleN_with_biasN_vec_kernel<T, 1>(stream, output, input, inner_size, weights, bias);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void scaleN_with_biasN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>, TensorView<__half>);
#endif
template void scaleN_with_biasN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>, TensorView<float>);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
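The broadcast rule shared by biasN, scaleN and scaleN_with_biasN is simply channel = (i / inner_size) % weights.size(); the host-side sketch below applies it to an assumed [2 x 2 x 3] tensor.

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    const std::size_t channels = 2, inner_size = 3;
    const std::vector<float> input { 0, 1, 2, 3, 4, 5,             // batch 0: two channels of three values
                                     6, 7, 8, 9, 10, 11 };         // batch 1
    const std::vector<float> weights { 2.f, 10.f };                // per-channel scale (assumed)
    const std::vector<float> bias    { 1.f, -1.f };                // per-channel bias (assumed)

    std::vector<float> output(input.size());
    for (std::size_t i = 0; i < input.size(); i++) {
        const std::size_t c = (i / inner_size) % channels;         // same indexing as scaleN_with_biasN_vec
        output[i] = input[i] * weights[c] + bias[c];
    }
    for (float v : output) std::cout << v << ' ';
    std::cout << '\n';
    return 0;
}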

View File

@@ -0,0 +1,111 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include <opencv2/core.hpp>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t N>
__global__ void input_shortcut_vec(
Span<T> output,
View<T> input, index_type c_input, /* `c_input` = number of channels in `input` */
View<T> from, index_type c_from, /* `c_from` = number of channels in `from` */
size_type channel_stride /* common for both `input` and `from` */)
{
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
auto input_vPtr = vector_type::get_pointer(input.data());
auto from_vPtr = vector_type::get_pointer(from.data());
auto batch_stride_input = c_input * channel_stride;
auto batch_stride_from = c_from * channel_stride;
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
const auto actual_idx = i * vector_type::size();
const auto b = actual_idx / batch_stride_input; /* `input` and `output` have the same shape */
const auto c = (actual_idx % batch_stride_input) / channel_stride;
const auto c_offset = actual_idx % channel_stride;
vector_type vec_input;
v_load(vec_input, input_vPtr[i]);
/* We can break down the shortcut operation into two steps:
* - copy `input` to `output`
* - add `from` to corresponding channels in `output`
*
* In this scheme, only some channels in the `output` differ from `input`. They differ in the channels
* which have a corresponding channel in `from`.
*/
if (c < c_from) {
const auto from_actual_idx = b * batch_stride_from + c * channel_stride + c_offset;
const auto from_vec_idx = from_actual_idx / vector_type::size();
vector_type vec_from;
v_load(vec_from, from_vPtr[from_vec_idx]);
for (int j = 0; j < vector_type::size(); j++)
vec_input.data[j] += vec_from.data[j];
}
v_store(output_vPtr[i], vec_input);
}
}
}
template <class T, std::size_t N>
void launch_vectorized_input_shortcut(const Stream& stream, Span<T> output, View<T> input, std::size_t c_input, View<T> from, std::size_t c_from, std::size_t channel_stride) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
CV_Assert(is_fully_aligned<T>(from, N));
CV_Assert(channel_stride % N == 0);
auto kernel = raw::input_shortcut_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input, c_input, from, c_from, channel_stride);
}
template <class T>
void input_shortcut(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, csl::TensorView<T> from) {
CV_Assert(is_shape_same(output, input));
CV_Assert(output.rank() == from.rank());
for (int i = 0; i < output.rank(); i++) {
if (i != 1) {
CV_Assert(from.get_axis_size(i) == output.get_axis_size(i));
}
}
auto channel_stride = output.size_range(2, output.rank()); /* same for `output`, `input` and `from` */
auto c_input = input.get_axis_size(1);
auto c_from = from.get_axis_size(1);
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && is_fully_aligned<T>(from, 4) && channel_stride % 4 == 0) {
launch_vectorized_input_shortcut<T, 4>(stream, output, input, c_input, from, c_from, channel_stride);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && is_fully_aligned<T>(from, 2) && channel_stride % 2 == 0) {
launch_vectorized_input_shortcut<T, 2>(stream, output, input, c_input, from, c_from, channel_stride);
} else {
launch_vectorized_input_shortcut<T, 1>(stream, output, input, c_input, from, c_from, channel_stride);
}
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void input_shortcut(const Stream&, TensorSpan<__half>, TensorView<__half>, TensorView<__half>);
#endif
template void input_shortcut(const Stream&, TensorSpan<float>, TensorView<float>, TensorView<float>);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
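The shortcut semantics described in the kernel comment (copy input, then add from only where a matching channel exists) can be stated in a few host-side lines; shapes and values below are assumed examples.

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    const std::size_t c_input = 3, c_from = 2, channel_stride = 2;  // spatial size per channel (assumed)
    const std::vector<float> input { 1, 1, 2, 2, 3, 3 };            // one batch, three channels
    const std::vector<float> from  { 10, 10, 20, 20 };              // one batch, two channels

    std::vector<float> output = input;                              // step 1: copy input to output
    for (std::size_t c = 0; c < c_from && c < c_input; c++)         // step 2: add only matching channels
        for (std::size_t k = 0; k < channel_stride; k++)
            output[c * channel_stride + k] += from[c * channel_stride + k];
    for (float v : output) std::cout << v << ' ';                   // 11 11 22 22 3 3
    std::cout << '\n';
    return 0;
}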

View File

@@ -0,0 +1,203 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include "array.hpp"
#include "types.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "kernel_dispatcher.hpp"
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/kernels/fill_copy.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
#include <vector>
#include <iostream>
#include <algorithm>
using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T, std::size_t Rank>
__global__ void slice(
Span<T> output, array<size_type, Rank> out_strides,
View<T> input, array<size_type, Rank> in_strides, array<index_type, Rank> in_offset)
{
for (auto i : grid_stride_range(output.size())) {
index_type out_index = i / out_strides[0];
index_type in_index = in_offset[0] + out_index;
index_type iidx = in_index * in_strides[0];
for (int j = 1; j < Rank; j++) {
out_index = (i % out_strides[j - 1]) / out_strides[j];
in_index = in_offset[j] + out_index;
iidx += in_index * in_strides[j];
}
output[i] = input[iidx];
}
}
}
template <class T, std::size_t Rank> static
void launch_slice(
const Stream& stream,
Span<T> output, const std::vector<std::size_t>& outStride,
View<T> input, const std::vector<std::size_t>& inStride, const std::vector<std::size_t>& inOffset)
{
CV_Assert(outStride.size() == Rank);
CV_Assert(inStride.size() == Rank);
CV_Assert(inOffset.size() == Rank);
array<size_type, Rank> outStride_k, inStride_k;
outStride_k.assign(std::begin(outStride), std::end(outStride));
inStride_k.assign(std::begin(inStride), std::end(inStride));
array<index_type, Rank> inOffset_k;
inOffset_k.assign(std::begin(inOffset), std::end(inOffset));
auto kernel = raw::slice<T, Rank>;
auto policy = make_policy(kernel, output.size(), 0, stream);
launch_kernel(kernel, policy, output, outStride_k, input, inStride_k, inOffset_k);
}
GENERATE_KERNEL_DISPATCHER(slice_dispatcher, launch_slice);
template <class T>
void slice(const Stream& stream,
TensorSpan<T> output, TensorView<T> input,
std::vector<std::size_t> offsets)
{
CV_Assert(output.rank() == input.rank());
CV_Assert(output.rank() == offsets.size());
/* copy directly if no slicing is required */
if (is_shape_same(output, input))
{
CV_Assert(std::all_of(std::begin(offsets), std::end(offsets), [] (std::size_t x) { return x == 0; }));
kernels::copy<T>(stream, output, input);
return;
}
/* squeezable axes at the beginning of both tensors can be eliminated
*
* Reasoning:
* ----------
* Suppose an item's indices in the output tensor is [o1, o2, ...]. The indices in the input
* tensor will be [o1 + off1, o2 + off2, ...]. The rest of the elements in the input are ignored.
*
* If the size of the first axis of the input and output tensor is unity, the input and output indices
     * for all the elements will be of the form [0, o2 + off2, ...] and [0, o2, ...] respectively. Note that
* there cannot be any ignored items since the axes have unit size. The first index does not contribute to the
     * element's address calculation and hence does nothing apart from eating up a few cycles.
*/
while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
CV_Assert(offsets[0] == 0);
input.squeeze(0);
output.squeeze(0);
offsets.erase(std::begin(offsets));
CV_Assert(output.rank() == input.rank());
CV_Assert(output.rank() == offsets.size());
}
auto inShape = input.shape_as_vector();
auto outShape = output.shape_as_vector();
/* contiguous axes which do not undergo slicing can be combined into one axis
*
* Reasoning:
* ----------
* Suppose an item's indices in the output tensor is [o1, o2, o3, ...]. Let the first two axes not undergo any
* slicing. The indices in the input tensor will be [o1, o2, o3 + off3, ...].
*
* Each axis in the contiguous unsliced axes sequence will add an offset of iN * strideN. In the above example,
* the two axes add a total offset of `o1 * stride1 + o2 * stride2`. We can merge the two axes into one axis with
     * a size of `size1 * size2`. The new offset added will be `o12 * stride2` as the kernel iterates through `o12`.
* Note that `o12` is actually `(o1 * size2 + o2)` in the original tensor.
*/
for (int i = 0; i < inShape.size(); i++) {
/* check if axis `i` requires any slicing */
if (offsets[i] == 0 && inShape[i] == outShape[i]) {
/* loop invariant: `i` is the first axis in the contiguous unsliced axis sequence */
int j = i + 1; /* `j` is the axis which we will attempt to merge */
while (j < inShape.size() && offsets[j] == 0 && inShape[j] == outShape[j]) {
/* `j` axis is also unsliced; merge `i` and `j` */
auto new_size = inShape[i] * inShape[j];
inShape[i] = new_size;
outShape[i] = new_size;
offsets[i] = 0; /* redundant */
/* delete axis `j` */
inShape.erase(std::begin(inShape) + j);
outShape.erase(std::begin(outShape) + j);
offsets.erase(std::begin(offsets) + j);
/* optimizations should not break the invariants */
CV_Assert(inShape.size() == outShape.size());
CV_Assert(inShape.size() == offsets.size());
CV_Assert(inShape[i] == outShape[i]);
CV_Assert(offsets[i] == 0);
}
}
}
auto rank = inShape.size();
/* We can do a copy if the reduced rank is two and only the first axis is sliced.
* The general requirement is that only one axis is sliced and all the axes that
     * precede the sliced axis are singleton. However, the reductions above will remove
* all the leading singleton axes and merge the trailing unsliced axes into one, or
* zero if there are no trailing unsliced axes. The latter is handled separately.
*/
if (rank == 2 && offsets[0] != 0 && offsets[1] == 0)
{
auto stride = inShape[1];
auto sliced_input = View<T>(input.get() + offsets[0] * stride, output.size());
kernels::copy<T>(stream, output, sliced_input);
return;
}
if (rank == 1)
{
auto sliced_input = View<T>(input.get() + offsets[0], output.size());
kernels::copy<T>(stream, output, sliced_input);
return;
}
std::vector<std::size_t> inStride(rank), outStride(rank);
inStride.back() = 1;
outStride.back() = 1;
/* garbage, ..., garbage, 1 */
std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
/* dim[0], dim[1], ..., dim[-1], 1 */
std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>());
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
/* stride[0], stride[1], ..., stride[-2], 1 */
CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
slice_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, input, inStride, offsets);
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void slice(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
#endif
template void slice(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
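The index arithmetic in raw::slice (decompose the flat output index with the output strides, add the per-axis offsets, recompose with the input strides) is shown host-side below with assumed strides and offsets.

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    const std::vector<std::size_t> out_stride { 4, 2, 1 };    // output shape 2x2x2 (assumed)
    const std::vector<std::size_t> in_stride  { 12, 4, 1 };   // input shape 2x3x4 (assumed)
    const std::vector<std::size_t> in_offset  { 0, 1, 2 };    // slice starts at [0, 1, 2] (assumed)

    const std::size_t i = 5;                                   // flat index into the output
    std::size_t out_index = i / out_stride[0];
    std::size_t iidx = (in_offset[0] + out_index) * in_stride[0];
    for (std::size_t j = 1; j < out_stride.size(); j++) {
        out_index = (i % out_stride[j - 1]) / out_stride[j];   // coordinate along axis j
        iidx += (in_offset[j] + out_index) * in_stride[j];
    }
    std::cout << iidx << '\n';                                 // 19: input flat index for output element 5
    return 0;
}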

View File

@@ -0,0 +1,27 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_TYPES_HPP
#define OPENCV_DNN_SRC_CUDA_TYPES_HPP
#include <cstdint>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
    /* For indices, we can use 32-bit or 64-bit variables. The GPU registers are 32 bits in size.
     * Hence, a 64-bit variable requires two registers and is significantly slower than the 32-bit version.
*
* If we do not need to handle huge tensors, we can use 32-bit indices and get better performance.
*/
#ifdef __CUDACC__
using size_type = int;
using index_type = int;
#else
using size_type = std::int32_t;
using index_type = std::int32_t;
#endif
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
#endif /* OPENCV_DNN_SRC_CUDA_TYPES_HPP */

View File

@@ -0,0 +1,120 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP
#define OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP
#include <cuda_runtime.h>
#include "types.hpp"
#include "memory.hpp"
#include "../cuda4dnn/csl/pointer.hpp"
#include <type_traits>
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
/** \file vector_traits.hpp
* \brief utility classes and functions for vectorized memory loads/stores
*
* Example:
* using vector_type = get_vector_type_t<float, 4>;
*
* auto input_vPtr = type::get_pointer(iptr); // iptr is of type DevicePtr<const float>
* auto output_vPtr = type::get_pointer(optr); // optr is of type DevicePtr<float>
*
* vector_type vec;
* v_load(vec, input_vPtr);
*
* for(int i = 0; i < vector_type::size(); i++)
* vec[i] = do_something(vec[i]);
*
* v_store(output_vPtr, vec);
*/
namespace detail {
template <size_type N> struct raw_type_ { };
template <> struct raw_type_<256> { typedef ulonglong4 type; };
template <> struct raw_type_<128> { typedef uint4 type; };
template <> struct raw_type_<64> { typedef uint2 type; };
template <> struct raw_type_<32> { typedef uint1 type; };
template <> struct raw_type_<16> { typedef uchar2 type; };
template <> struct raw_type_<8> { typedef uchar1 type; };
template <size_type N> struct raw_type {
using type = typename raw_type_<N>::type;
static_assert(sizeof(type) * 8 == N, "");
};
}
/* \tparam T type of element in the vector
* \tparam N "number of elements" of type T in the vector
*/
template <class T, size_type N>
union vector_type {
using value_type = T;
using raw_type = typename detail::raw_type<N * sizeof(T) * 8>::type;
__device__ vector_type() { }
__device__ static constexpr size_type size() { return N; }
raw_type raw;
T data[N];
template <class U> static __device__
typename std::enable_if<std::is_const<U>::value, const vector_type*>
::type get_pointer(csl::DevicePtr<U> ptr) {
return reinterpret_cast<const vector_type*>(ptr.get());
}
template <class U> static __device__
typename std::enable_if<!std::is_const<U>::value, vector_type*>
::type get_pointer(csl::DevicePtr<U> ptr) {
return reinterpret_cast<vector_type*>(ptr.get());
}
};
template <class V>
__device__ void v_load(V& dest, const V& src) {
dest.raw = src.raw;
}
template <class V>
__device__ void v_load(V& dest, const V* src) {
dest.raw = src->raw;
}
template <class V>
__device__ void v_load_ldg(V& dest, const V& src) {
dest.raw = load_ldg(src.raw);
}
template <class V>
__device__ void v_load_ldg(V& dest, const V* src) {
dest.raw = load_ldg(src->raw);
}
template <class V>
__device__ void v_store(V* dest, const V& src) {
dest->raw = src.raw;
}
template <class V>
__device__ void v_store(V& dest, const V& src) {
dest.raw = src.raw;
}
template <class T, size_type N>
struct get_vector_type {
typedef vector_type<T, N> type;
};
template <class T, size_type N>
using get_vector_type_t = typename get_vector_type<T, N>::type;
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
#endif /* OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP */
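Putting the documented pattern together, a complete (hypothetical) kernel that doubles every element could look like the sketch below; it assumes the same csl Span/View types, grid_stride_range helper and using-directives as the .cu files above, and the launch side would still need the usual is_fully_aligned checks before picking N.

// hypothetical kernel, shown only to illustrate the v_load / v_store pattern from this header
template <class T, std::size_t N>
__global__ void double_elements(Span<T> output, View<T> input)
{
    using vector_type = get_vector_type_t<T, N>;               // e.g. four floats moved as one 128-bit word
    auto output_vPtr = vector_type::get_pointer(output.data());
    auto input_vPtr = vector_type::get_pointer(input.data());
    for (auto i : grid_stride_range(output.size() / vector_type::size())) {
        vector_type vec;
        v_load(vec, input_vPtr[i]);                             /* one vectorized read */
        for (int j = 0; j < vector_type::size(); j++)
            vec.data[j] = vec.data[j] + vec.data[j];            /* element-wise work in registers */
        v_store(output_vPtr[i], vec);                           /* one vectorized write */
    }
}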

View File

@@ -0,0 +1,368 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP
#include "error.hpp"
#include "stream.hpp"
#include "pointer.hpp"
#include <opencv2/core.hpp>
#include <cublas_v2.h>
#include <cstddef>
#include <memory>
#include <utility>
#define CUDA4DNN_CHECK_CUBLAS(call) \
::cv::dnn::cuda4dnn::csl::cublas::detail::check((call), CV_Func, __FILE__, __LINE__)
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cublas {
/** @brief exception class for errors thrown by the cuBLAS API */
class cuBLASException : public CUDAException {
public:
using CUDAException::CUDAException;
};
namespace detail {
static void check(cublasStatus_t status, const char* func, const char* file, int line) {
auto cublasGetErrorString = [](cublasStatus_t err) {
switch (err) {
case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";
}
return "UNKNOWN_CUBLAS_ERROR";
};
if (status != CUBLAS_STATUS_SUCCESS)
throw cuBLASException(Error::GpuApiCallError, cublasGetErrorString(status), func, file, line);
}
}
/** non-copyable cuBLAS smart handle
*
 * UniqueHandle is a smart, non-sharable wrapper for a cuBLAS handle which ensures that the handle
* is destroyed after use. The handle must always be associated with a non-default stream. The stream
* must be specified during construction.
*
 * Refer to the stream API for more information on the choice of forcing non-default streams.
*/
class UniqueHandle {
public:
UniqueHandle() noexcept : handle{ nullptr } { }
UniqueHandle(UniqueHandle&) = delete;
UniqueHandle(UniqueHandle&& other) noexcept {
stream = std::move(other.stream);
handle = other.handle;
other.handle = nullptr;
}
/** creates a cuBLAS handle and associates it with the stream specified
*
* Exception Guarantee: Basic
*/
UniqueHandle(Stream strm) : stream(std::move(strm)) {
CV_Assert(stream);
CUDA4DNN_CHECK_CUBLAS(cublasCreate(&handle));
try {
CUDA4DNN_CHECK_CUBLAS(cublasSetStream(handle, stream.get()));
} catch (...) {
/* cublasDestroy won't throw if a valid handle is passed */
CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle));
throw;
}
}
~UniqueHandle() noexcept {
if (handle) {
/* cublasDestroy won't throw if a valid handle is passed */
CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle));
}
}
UniqueHandle& operator=(const UniqueHandle&) = delete;
UniqueHandle& operator=(UniqueHandle&& other) noexcept {
CV_Assert(other);
if (&other != this) {
UniqueHandle(std::move(*this)); /* destroy current handle */
stream = std::move(other.stream);
handle = other.handle;
other.handle = nullptr;
}
return *this;
}
/** returns the raw cuBLAS handle */
cublasHandle_t get() const noexcept {
CV_Assert(handle);
return handle;
}
/** returns true if the handle is valid */
explicit operator bool() const noexcept { return static_cast<bool>(handle); }
private:
Stream stream;
cublasHandle_t handle;
};
/** @brief sharable cuBLAS smart handle
*
 * Handle is a smart, sharable wrapper for a cuBLAS handle which ensures that the handle
* is destroyed after all references to the handle are destroyed. The handle must always
* be associated with a non-default stream. The stream must be specified during construction.
*
* @note Moving a Handle object to another invalidates the former
*/
class Handle {
public:
Handle() = default;
Handle(const Handle&) = default;
Handle(Handle&&) = default;
/** creates a cuBLAS handle and associates it with the stream specified
*
* Exception Guarantee: Basic
*/
Handle(Stream strm) : handle(std::make_shared<UniqueHandle>(std::move(strm))) { }
Handle& operator=(const Handle&) = default;
Handle& operator=(Handle&&) = default;
/** returns true if the handle is valid */
explicit operator bool() const noexcept { return static_cast<bool>(handle); }
/** returns the raw cuBLAS handle */
cublasHandle_t get() const noexcept {
CV_Assert(handle);
return handle->get();
}
private:
std::shared_ptr<UniqueHandle> handle;
};
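As a rough usage sketch (not part of this file; the Stream(bool) construction and the variable names are assumptions based on the surrounding CSL code), a handle is created once against a non-default stream and afterwards shared cheaply by copy:
/* sketch: bind a cuBLAS handle to a freshly created non-default stream */
csl::Stream stream(true);             /* assumed to create a new CUDA stream */
csl::cublas::Handle handle(stream);   /* cublasCreate + cublasSetStream under the hood */
CV_Assert(handle);                    /* handle.get() can now be passed to cuBLAS calls */
csl::cublas::Handle shared = handle;  /* shared ownership; destroyed with the last reference */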
/** @brief GEMM for column-major matrices
*
* \f$ C = \alpha AB + \beta C \f$
*
* @tparam T matrix element type (must be `half` or `float`)
*
* @param handle valid cuBLAS Handle
* @param transa use transposed matrix of A for computation
* @param transb use transposed matrix of B for computation
* @param rows_c number of rows in C
* @param cols_c number of columns in C
* @param common_dim common dimension of A (or trans A) and B (or trans B)
* @param alpha scale factor for AB
* @param[in] A pointer to column-major matrix A in device memory
* @param lda leading dimension of matrix A
* @param[in] B pointer to column-major matrix B in device memory
* @param ldb leading dimension of matrix B
* @param beta scale factor for C
* @param[in,out] C pointer to column-major matrix C in device memory
* @param ldc leading dimension of matrix C
*
* Exception Guarantee: Basic
*/
template <class T>
void gemm(const Handle& handle,
bool transa, bool transb,
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
T alpha, const DevicePtr<const T> A, std::size_t lda,
const DevicePtr<const T> B, std::size_t ldb,
T beta, const DevicePtr<T> C, std::size_t ldc);
template <> inline
void gemm<half>(const Handle& handle,
bool transa, bool transb,
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
half alpha, const DevicePtr<const half> A, std::size_t lda,
const DevicePtr<const half> B, std::size_t ldb,
half beta, const DevicePtr<half> C, std::size_t ldc)
{
CV_Assert(handle);
auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
int irows_c = static_cast<int>(rows_c),
icols_c = static_cast<int>(cols_c),
icommon_dim = static_cast<int>(common_dim),
ilda = static_cast<int>(lda),
ildb = static_cast<int>(ldb),
ildc = static_cast<int>(ldc);
CUDA4DNN_CHECK_CUBLAS(
cublasHgemm(
handle.get(),
opa, opb,
irows_c, icols_c, icommon_dim,
&alpha, A.get(), ilda,
B.get(), ildb,
&beta, C.get(), ildc
)
);
}
template <> inline
void gemm<float>(const Handle& handle,
bool transa, bool transb,
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
float alpha, const DevicePtr<const float> A, std::size_t lda,
const DevicePtr<const float> B, std::size_t ldb,
float beta, const DevicePtr<float> C, std::size_t ldc)
{
CV_Assert(handle);
auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
int irows_c = static_cast<int>(rows_c),
icols_c = static_cast<int>(cols_c),
icommon_dim = static_cast<int>(common_dim),
ilda = static_cast<int>(lda),
ildb = static_cast<int>(ldb),
ildc = static_cast<int>(ldc);
CUDA4DNN_CHECK_CUBLAS(
cublasSgemm(
handle.get(),
opa, opb,
irows_c, icols_c, icommon_dim,
&alpha, A.get(), ilda,
B.get(), ildb,
&beta, C.get(), ildc
)
);
}
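A hypothetical call site (matrix sizes and the DevicePtr variables A_ptr, B_ptr as DevicePtr&lt;const float&gt; and C_ptr as DevicePtr&lt;float&gt; are invented for illustration) computing C = AB for column-major matrices with no transposition, reusing the handle from the sketch above:
/* sketch: C (M x N) = A (M x K) * B (K x N), all column-major */
std::size_t M = 64, N = 32, K = 128;
csl::cublas::gemm<float>(handle,
    false, false,        /* transa, transb */
    M, N, K,             /* rows_c, cols_c, common_dim */
    1.0f, A_ptr, M,      /* alpha, A, lda = rows of A */
          B_ptr, K,      /* B, ldb = rows of B */
    0.0f, C_ptr, M);     /* beta, C, ldc = rows of C */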
/** @brief Strided batched GEMM for column-major matrices
*
* \f$ C_i = \alpha A_i B_i + \beta C_i \f$ for a stack of matrices A, B and C indexed by i
*
* @tparam T matrix element type (must be `half` or `float`)
*
* @param handle valid cuBLAS Handle
* @param transa use transposed matrix of A_i for computation
* @param transb use transposed matrix of B_i for computation
* @param rows_c number of rows in C_i
* @param cols_c number of columns in C_i
* @param common_dim common dimension of A_i (or trans A_i) and B_i (or trans B_i)
* @param alpha scale factor for A_i B_i
* @param[in] A pointer to stack of column-major matrices A in device memory
* @param lda leading dimension of matrix A_i
* @param strideA stride between matrices in A
* @param[in] B pointer to stack of column-major matrices B in device memory
* @param ldb leading dimension of matrix B_i
* @param strideB stride between matrices in B
* @param beta scale factor for C_i
* @param[in,out] C pointer to stack of column-major matrices C in device memory
* @param ldc leading dimension of matrix C_i
* @param strideC stride between matrices in C
* @param batchCount number of matrices in the batch
*
* Exception Guarantee: Basic
*/
template <class T>
void gemmStridedBatched(const Handle& handle,
bool transa, bool transb,
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
T alpha, const DevicePtr<const T> A, std::size_t lda, std::size_t strideA,
const DevicePtr<const T> B, std::size_t ldb, std::size_t strideB,
T beta, const DevicePtr<T> C, std::size_t ldc, std::size_t strideC,
std::size_t batchCount);
template <> inline
void gemmStridedBatched<half>(const Handle& handle,
bool transa, bool transb,
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
half alpha, const DevicePtr<const half> A, std::size_t lda, std::size_t strideA,
const DevicePtr<const half> B, std::size_t ldb, std::size_t strideB,
half beta, const DevicePtr<half> C, std::size_t ldc, std::size_t strideC,
std::size_t batchCount)
{
CV_Assert(handle);
const auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
const auto irows_c = static_cast<int>(rows_c),
icols_c = static_cast<int>(cols_c),
icommon_dim = static_cast<int>(common_dim),
ilda = static_cast<int>(lda),
ildb = static_cast<int>(ldb),
ildc = static_cast<int>(ldc);
const auto batch_count = static_cast<int>(batchCount);
const auto stride_a = static_cast<long long int>(strideA),
stride_b = static_cast<long long int>(strideB),
stride_c = static_cast<long long int>(strideC);
CV_Assert(stride_c >= irows_c * icols_c); // output matrices must not overlap
CUDA4DNN_CHECK_CUBLAS(
cublasHgemmStridedBatched(
handle.get(),
opa, opb,
irows_c, icols_c, icommon_dim,
&alpha, A.get(), ilda, stride_a,
B.get(), ildb, stride_b,
&beta, C.get(), ildc, stride_c,
batch_count
)
);
}
template <> inline
void gemmStridedBatched<float>(const Handle& handle,
bool transa, bool transb,
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
float alpha, const DevicePtr<const float> A, std::size_t lda, std::size_t strideA,
const DevicePtr<const float> B, std::size_t ldb, std::size_t strideB,
float beta, const DevicePtr<float> C, std::size_t ldc, std::size_t strideC,
std::size_t batchCount)
{
CV_Assert(handle);
const auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N,
opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
const auto irows_c = static_cast<int>(rows_c),
icols_c = static_cast<int>(cols_c),
icommon_dim = static_cast<int>(common_dim),
ilda = static_cast<int>(lda),
ildb = static_cast<int>(ldb),
ildc = static_cast<int>(ldc);
const auto batch_count = static_cast<int>(batchCount);
const auto stride_a = static_cast<long long int>(strideA),
stride_b = static_cast<long long int>(strideB),
stride_c = static_cast<long long int>(strideC);
CV_Assert(stride_c >= irows_c * icols_c); // output matrices must not overlap
CUDA4DNN_CHECK_CUBLAS(
cublasSgemmStridedBatched(
handle.get(),
opa, opb,
irows_c, icols_c, icommon_dim,
&alpha, A.get(), ilda, stride_a,
B.get(), ildb, stride_b,
&beta, C.get(), ildc, stride_c,
batch_count
)
);
}
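And the corresponding sketch for the batched variant, with the matrices of each operand packed back-to-back in device memory (pointer names again assumed for illustration):
/* sketch: `batch` independent products C_i (M x N) = A_i (M x K) * B_i (K x N) */
std::size_t M = 16, N = 16, K = 16, batch = 8;
csl::cublas::gemmStridedBatched<float>(handle,
    false, false,
    M, N, K,
    1.0f, A_ptr, M, M * K,   /* strideA: elements between consecutive A_i */
          B_ptr, K, K * N,   /* strideB */
    0.0f, C_ptr, M, M * N,   /* strideC >= rows_c * cols_c, per the assert above */
    batch);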
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cublas */
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP */

Some files were not shown because too many files have changed in this diff.