feat: switch the backend to PaddleOCR-NCNN, switch the project to CMake
1. The project backend has been migrated wholesale to the PaddleOCR-NCNN algorithm and has passed basic compatibility testing.
2. The project is now organized with CMake; to better accommodate third-party libraries, a QMake project will no longer be provided.
3. The rights/licensing notices and the code tree have been reorganized to minimize the risk of infringement.
Log: switch the backend to PaddleOCR-NCNN, switch the project to CMake
Change-Id: I4d5d2c5d37505a4a24b389b1a4c5d12f17bfa38c
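For orientation, below is a minimal sketch of the ncnn load/extract cycle that a PaddleOCR-NCNN backend builds on. This is an illustrative outline only, not code from this commit: the `ppocr_det.param`/`ppocr_det.bin` file names, the 640x640 input size, the normalization constants, and the `input`/`output` blob names are all assumptions, and PP-OCR pre/post-processing (DB box extraction, recognition, CTC decoding) is omitted.

```cpp
// Illustrative only: file names, input size and blob names below are assumptions,
// not taken from this commit; error handling and PP-OCR post-processing are omitted.
#include <opencv2/imgcodecs.hpp>
#include "net.h"   // ncnn

int main()
{
    ncnn::Net det;
    det.load_param("ppocr_det.param");   // assumed name of the converted PP-OCR detection model
    det.load_model("ppocr_det.bin");

    cv::Mat bgr = cv::imread("doc.jpg");
    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB,
                                                 bgr.cols, bgr.rows, 640, 640);
    const float mean[3] = {123.675f, 116.28f, 103.53f};             // assumed normalization values
    const float norm[3] = {1 / 58.395f, 1 / 57.12f, 1 / 57.375f};
    in.substract_mean_normalize(mean, norm);

    ncnn::Extractor ex = det.create_extractor();
    ex.input("input", in);       // assumed input blob name
    ncnn::Mat out;
    ex.extract("output", out);   // assumed output blob name; DB thresholding and the
                                 // recognition/CTC stage would follow here
    return 0;
}
```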
This commit is contained in:
24
3rdparty/opencv-4.5.4/samples/dnn/CMakeLists.txt
vendored
Normal file
@@ -0,0 +1,24 @@
ocv_install_example_src(dnn *.cpp *.hpp CMakeLists.txt)

set(OPENCV_DNN_SAMPLES_REQUIRED_DEPS
  opencv_core
  opencv_imgproc
  opencv_dnn
  opencv_objdetect
  opencv_video
  opencv_imgcodecs
  opencv_videoio
  opencv_highgui)
ocv_check_dependencies(${OPENCV_DNN_SAMPLES_REQUIRED_DEPS})

if(NOT BUILD_EXAMPLES OR NOT OCV_DEPENDENCIES_FOUND)
  return()
endif()

project(dnn_samples)
ocv_include_modules_recurse(${OPENCV_DNN_SAMPLES_REQUIRED_DEPS})
file(GLOB_RECURSE dnn_samples RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
foreach(sample_filename ${dnn_samples})
  ocv_define_sample(tgt ${sample_filename} dnn)
  ocv_target_link_libraries(${tgt} PRIVATE ${OPENCV_LINKER_LIBS} ${OPENCV_DNN_SAMPLES_REQUIRED_DEPS})
endforeach()
84
3rdparty/opencv-4.5.4/samples/dnn/README.md
vendored
Normal file
@@ -0,0 +1,84 @@
# OpenCV deep learning module samples

## Model Zoo

Check [a wiki](https://github.com/opencv/opencv/wiki/Deep-Learning-in-OpenCV) for a list of tested models.

If OpenCV is built with [Intel's Inference Engine support](https://github.com/opencv/opencv/wiki/Intel%27s-Deep-Learning-Inference-Engine-backend) you can use [Intel's pre-trained](https://github.com/opencv/open_model_zoo) models.

Different models require different preprocessing parameters, such as mean subtraction or scale factors.
The most popular models and their parameters are listed in the [models.yml](https://github.com/opencv/opencv/blob/master/samples/dnn/models.yml) configuration file. It can also be used to alias sample parameters. For example,

```bash
python object_detection.py opencv_fd --model /path/to/caffemodel --config /path/to/prototxt
```

Check the `-h` option to see which values are used by default:

```bash
python object_detection.py opencv_fd -h
```

### Sample models

You can download sample models using ```download_models.py```. For example, the following command downloads the network weights for the OpenCV Face Detector model and stores them in the FaceDetector folder:

```bash
python download_models.py --save_dir FaceDetector opencv_fd
```

You can use default configuration files adopted for OpenCV from [here](https://github.com/opencv/opencv_extra/tree/master/testdata/dnn).

You can also use the script to download the necessary files from your own code. Assume you have the following code inside ```your_script.py```:

```python
from download_models import downloadFile

filepath1 = downloadFile("https://drive.google.com/uc?export=download&id=0B3gersZ2cHIxRm5PMWRoTkdHdHc", None, filename="MobileNetSSD_deploy.caffemodel", save_dir="save_dir_1")
filepath2 = downloadFile("https://drive.google.com/uc?export=download&id=0B3gersZ2cHIxRm5PMWRoTkdHdHc", "994d30a8afaa9e754d17d2373b2d62a7dfbaaf7a", filename="MobileNetSSD_deploy.caffemodel")
print(filepath1)
print(filepath2)
# Your code
```

Running the following commands will fetch the **MobileNetSSD_deploy.caffemodel** file:
```bash
export OPENCV_DOWNLOAD_DATA_PATH=download_folder
python your_script.py
```

**Note** that you can provide a directory using the **save_dir** parameter or via the **OPENCV_SAVE_DIR** environment variable.

#### Face detection
[The original model](https://github.com/opencv/opencv/tree/master/samples/dnn/face_detector)
with single-precision floating-point weights has been quantized using the [TensorFlow framework](https://www.tensorflow.org/).
For the best accuracy, run the model on BGR images resized to `300x300`, subtracting the mean values
`(104, 177, 123)` from the blue, green and red channels respectively (see the sketch after the table below).

The following accuracy metrics were obtained with the [COCO object detection evaluation
tool](http://cocodataset.org/#detections-eval) on the [FDDB dataset](http://vis-www.cs.umass.edu/fddb/)
(see the [script](https://github.com/opencv/opencv/blob/master/modules/dnn/misc/face_detector_accuracy.py)),
both with a resize to `300x300` and with the original image sizes kept.
```
AP - Average Precision                            | FP32/FP16 | UINT8          | FP32/FP16 | UINT8          |
AR - Average Recall                               | 300x300   | 300x300        | any size  | any size       |
--------------------------------------------------|-----------|----------------|-----------|----------------|
AP @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] | 0.408     | 0.408          | 0.378     | 0.328 (-0.050) |
AP @[ IoU=0.50      | area=   all | maxDets=100 ] | 0.849     | 0.849          | 0.797     | 0.790 (-0.007) |
AP @[ IoU=0.75      | area=   all | maxDets=100 ] | 0.251     | 0.251          | 0.208     | 0.140 (-0.068) |
AP @[ IoU=0.50:0.95 | area= small | maxDets=100 ] | 0.050     | 0.051 (+0.001) | 0.107     | 0.070 (-0.037) |
AP @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] | 0.381     | 0.379 (-0.002) | 0.380     | 0.368 (-0.012) |
AP @[ IoU=0.50:0.95 | area= large | maxDets=100 ] | 0.455     | 0.455          | 0.412     | 0.337 (-0.075) |
AR @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] | 0.299     | 0.299          | 0.279     | 0.246 (-0.033) |
AR @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] | 0.482     | 0.482          | 0.476     | 0.436 (-0.040) |
AR @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] | 0.496     | 0.496          | 0.491     | 0.451 (-0.040) |
AR @[ IoU=0.50:0.95 | area= small | maxDets=100 ] | 0.189     | 0.193 (+0.004) | 0.284     | 0.232 (-0.052) |
AR @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] | 0.481     | 0.480 (-0.001) | 0.470     | 0.458 (-0.012) |
AR @[ IoU=0.50:0.95 | area= large | maxDets=100 ] | 0.528     | 0.528          | 0.520     | 0.462 (-0.058) |
```
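A minimal C++ sketch of the `300x300` / `(104, 177, 123)` preprocessing described above (illustrative only, not part of this sample set; the prototxt/caffemodel file names and `example.jpg` are placeholders):

```cpp
// Hedged sketch of the face-detector preprocessing described above.
#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>

int main()
{
    using namespace cv;
    // Placeholder file names; use the files fetched by download_models.py.
    dnn::Net net = dnn::readNetFromCaffe("opencv_face_detector.prototxt",
                                         "opencv_face_detector.caffemodel");
    Mat img = imread("example.jpg");                        // BGR image
    Mat blob = dnn::blobFromImage(img, 1.0, Size(300, 300),
                                  Scalar(104, 177, 123),    // per-channel mean (B, G, R)
                                  /*swapRB=*/false, /*crop=*/false);
    net.setInput(blob);
    Mat detections = net.forward();                         // 1x1xNx7 detection matrix
    return 0;
}
```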
## References
* [Models downloading script](https://github.com/opencv/opencv/samples/dnn/download_models.py)
* [Configuration files adopted for OpenCV](https://github.com/opencv/opencv_extra/tree/master/testdata/dnn)
* [How to import models from TensorFlow Object Detection API](https://github.com/opencv/opencv/wiki/TensorFlow-Object-Detection-API)
* [Names of classes from different datasets](https://github.com/opencv/opencv/tree/master/samples/data/dnn)
82
3rdparty/opencv-4.5.4/samples/dnn/action_recognition.py
vendored
Normal file
@@ -0,0 +1,82 @@
import os
import numpy as np
import cv2 as cv
import argparse
from common import findFile

parser = argparse.ArgumentParser(description='Use this script to run action recognition using 3D ResNet34',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--input', '-i', help='Path to input video file. Skip this argument to capture frames from a camera.')
parser.add_argument('--model', required=True, help='Path to model.')
parser.add_argument('--classes', default=findFile('action_recongnition_kinetics.txt'), help='Path to classes list.')

# To get net download original repository https://github.com/kenshohara/video-classification-3d-cnn-pytorch
# For correct ONNX export modify file: video-classification-3d-cnn-pytorch/models/resnet.py
# change
# - def downsample_basic_block(x, planes, stride):
# -     out = F.avg_pool3d(x, kernel_size=1, stride=stride)
# -     zero_pads = torch.Tensor(out.size(0), planes - out.size(1),
# -                              out.size(2), out.size(3),
# -                              out.size(4)).zero_()
# -     if isinstance(out.data, torch.cuda.FloatTensor):
# -         zero_pads = zero_pads.cuda()
# -
# -     out = Variable(torch.cat([out.data, zero_pads], dim=1))
# -     return out

# To
# + def downsample_basic_block(x, planes, stride):
# +     out = F.avg_pool3d(x, kernel_size=1, stride=stride)
# +     out = F.pad(out, (0, 0, 0, 0, 0, 0, 0, int(planes - out.size(1)), 0, 0), "constant", 0)
# +     return out

# To ONNX export use torch.onnx.export(model, inputs, model_name)

def get_class_names(path):
    class_names = []
    with open(path) as f:
        for row in f:
            class_names.append(row[:-1])
    return class_names

def classify_video(video_path, net_path):
    SAMPLE_DURATION = 16
    SAMPLE_SIZE = 112
    mean = (114.7748, 107.7354, 99.4750)
    class_names = get_class_names(args.classes)

    net = cv.dnn.readNet(net_path)
    net.setPreferableBackend(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE)
    net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)

    winName = 'Deep learning image classification in OpenCV'
    cv.namedWindow(winName, cv.WINDOW_AUTOSIZE)
    cap = cv.VideoCapture(video_path)
    while cv.waitKey(1) < 0:
        frames = []
        for _ in range(SAMPLE_DURATION):
            hasFrame, frame = cap.read()
            if not hasFrame:
                exit(0)
            frames.append(frame)

        inputs = cv.dnn.blobFromImages(frames, 1, (SAMPLE_SIZE, SAMPLE_SIZE), mean, True, crop=True)
        inputs = np.transpose(inputs, (1, 0, 2, 3))
        inputs = np.expand_dims(inputs, axis=0)
        net.setInput(inputs)
        outputs = net.forward()
        class_pred = np.argmax(outputs)
        label = class_names[class_pred]

        for frame in frames:
            labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
            cv.rectangle(frame, (0, 10 - labelSize[1]),
                         (labelSize[0], 10 + baseLine), (255, 255, 255), cv.FILLED)
            cv.putText(frame, label, (0, 10), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
            cv.imshow(winName, frame)
            if cv.waitKey(1) & 0xFF == ord('q'):
                break

if __name__ == "__main__":
    args, _ = parser.parse_known_args()
    classify_video(args.input if args.input else 0, args.model)
170
3rdparty/opencv-4.5.4/samples/dnn/classification.cpp
vendored
Normal file
@@ -0,0 +1,170 @@
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
|
||||
#include <opencv2/dnn.hpp>
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
std::string keys =
|
||||
"{ help h | | Print help message. }"
|
||||
"{ @alias | | An alias name of model to extract preprocessing parameters from models.yml file. }"
|
||||
"{ zoo | models.yml | An optional path to file with preprocessing parameters }"
|
||||
"{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
|
||||
"{ initial_width | 0 | Preprocess input image by initial resizing to a specific width.}"
|
||||
"{ initial_height | 0 | Preprocess input image by initial resizing to a specific height.}"
|
||||
"{ std | 0.0 0.0 0.0 | Preprocess input image by dividing on a standard deviation.}"
|
||||
"{ crop | false | Preprocess input image by center cropping.}"
|
||||
"{ framework f | | Optional name of an origin framework of the model. Detect it automatically if it does not set. }"
|
||||
"{ classes | | Optional path to a text file with names of classes. }"
|
||||
"{ backend | 0 | Choose one of computation backends: "
|
||||
"0: automatically (by default), "
|
||||
"1: Halide language (http://halide-lang.org/), "
|
||||
"2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"3: OpenCV implementation, "
|
||||
"4: VKCOM, "
|
||||
"5: CUDA },"
|
||||
"{ target | 0 | Choose one of target computation devices: "
|
||||
"0: CPU target (by default), "
|
||||
"1: OpenCL, "
|
||||
"2: OpenCL fp16 (half-float precision), "
|
||||
"3: VPU, "
|
||||
"4: Vulkan, "
|
||||
"6: CUDA, "
|
||||
"7: CUDA fp16 (half-float preprocess) }";
|
||||
|
||||
using namespace cv;
|
||||
using namespace dnn;
|
||||
|
||||
std::vector<std::string> classes;
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
CommandLineParser parser(argc, argv, keys);
|
||||
|
||||
const std::string modelName = parser.get<String>("@alias");
|
||||
const std::string zooFile = parser.get<String>("zoo");
|
||||
|
||||
keys += genPreprocArguments(modelName, zooFile);
|
||||
|
||||
parser = CommandLineParser(argc, argv, keys);
|
||||
parser.about("Use this script to run classification deep learning networks using OpenCV.");
|
||||
if (argc == 1 || parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int rszWidth = parser.get<int>("initial_width");
|
||||
int rszHeight = parser.get<int>("initial_height");
|
||||
float scale = parser.get<float>("scale");
|
||||
Scalar mean = parser.get<Scalar>("mean");
|
||||
Scalar std = parser.get<Scalar>("std");
|
||||
bool swapRB = parser.get<bool>("rgb");
|
||||
bool crop = parser.get<bool>("crop");
|
||||
int inpWidth = parser.get<int>("width");
|
||||
int inpHeight = parser.get<int>("height");
|
||||
String model = findFile(parser.get<String>("model"));
|
||||
String config = findFile(parser.get<String>("config"));
|
||||
String framework = parser.get<String>("framework");
|
||||
int backendId = parser.get<int>("backend");
|
||||
int targetId = parser.get<int>("target");
|
||||
|
||||
// Open file with classes names.
|
||||
if (parser.has("classes"))
|
||||
{
|
||||
std::string file = parser.get<String>("classes");
|
||||
std::ifstream ifs(file.c_str());
|
||||
if (!ifs.is_open())
|
||||
CV_Error(Error::StsError, "File " + file + " not found");
|
||||
std::string line;
|
||||
while (std::getline(ifs, line))
|
||||
{
|
||||
classes.push_back(line);
|
||||
}
|
||||
}
|
||||
|
||||
if (!parser.check())
|
||||
{
|
||||
parser.printErrors();
|
||||
return 1;
|
||||
}
|
||||
CV_Assert(!model.empty());
|
||||
|
||||
//! [Read and initialize network]
|
||||
Net net = readNet(model, config, framework);
|
||||
net.setPreferableBackend(backendId);
|
||||
net.setPreferableTarget(targetId);
|
||||
//! [Read and initialize network]
|
||||
|
||||
// Create a window
|
||||
static const std::string kWinName = "Deep learning image classification in OpenCV";
|
||||
namedWindow(kWinName, WINDOW_NORMAL);
|
||||
|
||||
//! [Open a video file or an image file or a camera stream]
|
||||
VideoCapture cap;
|
||||
if (parser.has("input"))
|
||||
cap.open(parser.get<String>("input"));
|
||||
else
|
||||
cap.open(0);
|
||||
//! [Open a video file or an image file or a camera stream]
|
||||
|
||||
// Process frames.
|
||||
Mat frame, blob;
|
||||
while (waitKey(1) < 0)
|
||||
{
|
||||
cap >> frame;
|
||||
if (frame.empty())
|
||||
{
|
||||
waitKey();
|
||||
break;
|
||||
}
|
||||
|
||||
if (rszWidth != 0 && rszHeight != 0)
|
||||
{
|
||||
resize(frame, frame, Size(rszWidth, rszHeight));
|
||||
}
|
||||
|
||||
//! [Create a 4D blob from a frame]
|
||||
blobFromImage(frame, blob, scale, Size(inpWidth, inpHeight), mean, swapRB, crop);
|
||||
|
||||
// Check std values.
|
||||
if (std.val[0] != 0.0 && std.val[1] != 0.0 && std.val[2] != 0.0)
|
||||
{
|
||||
// Divide blob by std.
|
||||
divide(blob, std, blob);
|
||||
}
|
||||
//! [Create a 4D blob from a frame]
|
||||
|
||||
//! [Set input blob]
|
||||
net.setInput(blob);
|
||||
//! [Set input blob]
|
||||
//! [Make forward pass]
|
||||
Mat prob = net.forward();
|
||||
//! [Make forward pass]
|
||||
|
||||
//! [Get a class with a highest score]
|
||||
Point classIdPoint;
|
||||
double confidence;
|
||||
minMaxLoc(prob.reshape(1, 1), 0, &confidence, 0, &classIdPoint);
|
||||
int classId = classIdPoint.x;
|
||||
//! [Get a class with a highest score]
|
||||
|
||||
// Put efficiency information.
|
||||
std::vector<double> layersTimes;
|
||||
double freq = getTickFrequency() / 1000;
|
||||
double t = net.getPerfProfile(layersTimes) / freq;
|
||||
std::string label = format("Inference time: %.2f ms", t);
|
||||
putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
|
||||
// Print predicted class.
|
||||
label = format("%s: %.4f", (classes.empty() ? format("Class #%d", classId).c_str() :
|
||||
classes[classId].c_str()),
|
||||
confidence);
|
||||
putText(frame, label, Point(0, 40), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
|
||||
imshow(kWinName, frame);
|
||||
}
|
||||
return 0;
|
||||
}
|
117
3rdparty/opencv-4.5.4/samples/dnn/classification.py
vendored
Normal file
@@ -0,0 +1,117 @@
|
||||
import argparse
|
||||
|
||||
import cv2 as cv
|
||||
import numpy as np
|
||||
from common import *
|
||||
|
||||
|
||||
def get_args_parser(func_args):
|
||||
backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE,
|
||||
cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_BACKEND_VKCOM, cv.dnn.DNN_BACKEND_CUDA)
|
||||
targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD,
|
||||
cv.dnn.DNN_TARGET_HDDL, cv.dnn.DNN_TARGET_VULKAN, cv.dnn.DNN_TARGET_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16)
|
||||
|
||||
parser = argparse.ArgumentParser(add_help=False)
|
||||
parser.add_argument('--zoo', default=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models.yml'),
|
||||
help='An optional path to file with preprocessing parameters.')
|
||||
parser.add_argument('--input',
|
||||
help='Path to input image or video file. Skip this argument to capture frames from a camera.')
|
||||
parser.add_argument('--framework', choices=['caffe', 'tensorflow', 'torch', 'darknet'],
|
||||
help='Optional name of an origin framework of the model. '
|
||||
'Detect it automatically if it does not set.')
|
||||
parser.add_argument('--std', nargs='*', type=float,
|
||||
help='Preprocess input image by dividing on a standard deviation.')
|
||||
parser.add_argument('--crop', type=bool, default=False,
|
||||
help='Preprocess input image by dividing on a standard deviation.')
|
||||
parser.add_argument('--initial_width', type=int,
|
||||
help='Preprocess input image by initial resizing to a specific width.')
|
||||
parser.add_argument('--initial_height', type=int,
|
||||
help='Preprocess input image by initial resizing to a specific height.')
|
||||
parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
|
||||
help="Choose one of computation backends: "
|
||||
"%d: automatically (by default), "
|
||||
"%d: Halide language (http://halide-lang.org/), "
|
||||
"%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"%d: OpenCV implementation, "
|
||||
"%d: VKCOM, "
|
||||
"%d: CUDA" % backends)
|
||||
parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
|
||||
help='Choose one of target computation devices: '
|
||||
'%d: CPU target (by default), '
|
||||
'%d: OpenCL, '
|
||||
'%d: OpenCL fp16 (half-float precision), '
|
||||
'%d: NCS2 VPU, '
|
||||
'%d: HDDL VPU, '
|
||||
'%d: Vulkan, '
|
||||
'%d: CUDA, '
|
||||
'%d: CUDA fp16 (half-float preprocess)'% targets)
|
||||
|
||||
args, _ = parser.parse_known_args()
|
||||
add_preproc_args(args.zoo, parser, 'classification')
|
||||
parser = argparse.ArgumentParser(parents=[parser],
|
||||
description='Use this script to run classification deep learning networks using OpenCV.',
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
return parser.parse_args(func_args)
|
||||
|
||||
|
||||
def main(func_args=None):
|
||||
args = get_args_parser(func_args)
|
||||
args.model = findFile(args.model)
|
||||
args.config = findFile(args.config)
|
||||
args.classes = findFile(args.classes)
|
||||
|
||||
# Load names of classes
|
||||
classes = None
|
||||
if args.classes:
|
||||
with open(args.classes, 'rt') as f:
|
||||
classes = f.read().rstrip('\n').split('\n')
|
||||
|
||||
# Load a network
|
||||
net = cv.dnn.readNet(args.model, args.config, args.framework)
|
||||
net.setPreferableBackend(args.backend)
|
||||
net.setPreferableTarget(args.target)
|
||||
|
||||
winName = 'Deep learning image classification in OpenCV'
|
||||
cv.namedWindow(winName, cv.WINDOW_NORMAL)
|
||||
|
||||
cap = cv.VideoCapture(args.input if args.input else 0)
|
||||
while cv.waitKey(1) < 0:
|
||||
hasFrame, frame = cap.read()
|
||||
if not hasFrame:
|
||||
cv.waitKey()
|
||||
break
|
||||
|
||||
# Create a 4D blob from a frame.
|
||||
inpWidth = args.width if args.width else frame.shape[1]
|
||||
inpHeight = args.height if args.height else frame.shape[0]
|
||||
|
||||
if args.initial_width and args.initial_height:
|
||||
frame = cv.resize(frame, (args.initial_width, args.initial_height))
|
||||
|
||||
blob = cv.dnn.blobFromImage(frame, args.scale, (inpWidth, inpHeight), args.mean, args.rgb, crop=args.crop)
|
||||
if args.std:
|
||||
blob[0] /= np.asarray(args.std, dtype=np.float32).reshape(3, 1, 1)
|
||||
|
||||
# Run a model
|
||||
net.setInput(blob)
|
||||
out = net.forward()
|
||||
|
||||
# Get a class with a highest score.
|
||||
out = out.flatten()
|
||||
classId = np.argmax(out)
|
||||
confidence = out[classId]
|
||||
|
||||
# Put efficiency information.
|
||||
t, _ = net.getPerfProfile()
|
||||
label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
|
||||
cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
|
||||
|
||||
# Print predicted class.
|
||||
label = '%s: %.4f' % (classes[classId] if classes else 'Class #%d' % classId, confidence)
|
||||
cv.putText(frame, label, (0, 40), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
|
||||
|
||||
cv.imshow(winName, frame)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
128
3rdparty/opencv-4.5.4/samples/dnn/colorization.cpp
vendored
Normal file
@@ -0,0 +1,128 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html
|
||||
|
||||
#include <opencv2/dnn.hpp>
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
#include <iostream>
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::dnn;
|
||||
using namespace std;
|
||||
|
||||
// the 313 ab cluster centers from pts_in_hull.npy (already transposed)
|
||||
static float hull_pts[] = {
|
||||
-90., -90., -90., -90., -90., -80., -80., -80., -80., -80., -80., -80., -80., -70., -70., -70., -70., -70., -70., -70., -70.,
|
||||
-70., -70., -60., -60., -60., -60., -60., -60., -60., -60., -60., -60., -60., -60., -50., -50., -50., -50., -50., -50., -50., -50.,
|
||||
-50., -50., -50., -50., -50., -50., -40., -40., -40., -40., -40., -40., -40., -40., -40., -40., -40., -40., -40., -40., -40., -30.,
|
||||
-30., -30., -30., -30., -30., -30., -30., -30., -30., -30., -30., -30., -30., -30., -30., -20., -20., -20., -20., -20., -20., -20.,
|
||||
-20., -20., -20., -20., -20., -20., -20., -20., -20., -10., -10., -10., -10., -10., -10., -10., -10., -10., -10., -10., -10., -10.,
|
||||
-10., -10., -10., -10., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 10., 10., 10., 10., 10., 10., 10.,
|
||||
10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 20., 20., 20., 20., 20., 20., 20., 20., 20., 20., 20., 20., 20., 20., 20.,
|
||||
20., 20., 20., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 30., 40., 40., 40., 40.,
|
||||
40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 50., 50., 50., 50., 50., 50., 50., 50., 50., 50.,
|
||||
50., 50., 50., 50., 50., 50., 50., 50., 50., 60., 60., 60., 60., 60., 60., 60., 60., 60., 60., 60., 60., 60., 60., 60., 60., 60.,
|
||||
60., 60., 60., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 70., 80., 80., 80.,
|
||||
80., 80., 80., 80., 80., 80., 80., 80., 80., 80., 80., 80., 80., 80., 80., 80., 90., 90., 90., 90., 90., 90., 90., 90., 90., 90.,
|
||||
90., 90., 90., 90., 90., 90., 90., 90., 90., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 50., 60., 70., 80., 90.,
|
||||
20., 30., 40., 50., 60., 70., 80., 90., 0., 10., 20., 30., 40., 50., 60., 70., 80., 90., -20., -10., 0., 10., 20., 30., 40., 50.,
|
||||
60., 70., 80., 90., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70., 80., 90., 100., -40., -30., -20., -10., 0., 10., 20.,
|
||||
30., 40., 50., 60., 70., 80., 90., 100., -50., -40., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70., 80., 90., 100., -50.,
|
||||
-40., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70., 80., 90., 100., -60., -50., -40., -30., -20., -10., 0., 10., 20.,
|
||||
30., 40., 50., 60., 70., 80., 90., 100., -70., -60., -50., -40., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70., 80., 90.,
|
||||
100., -80., -70., -60., -50., -40., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70., 80., 90., -80., -70., -60., -50.,
|
||||
-40., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70., 80., 90., -90., -80., -70., -60., -50., -40., -30., -20., -10.,
|
||||
0., 10., 20., 30., 40., 50., 60., 70., 80., 90., -100., -90., -80., -70., -60., -50., -40., -30., -20., -10., 0., 10., 20., 30.,
|
||||
40., 50., 60., 70., 80., 90., -100., -90., -80., -70., -60., -50., -40., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70.,
|
||||
80., -110., -100., -90., -80., -70., -60., -50., -40., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70., 80., -110., -100.,
|
||||
-90., -80., -70., -60., -50., -40., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70., 80., -110., -100., -90., -80., -70.,
|
||||
-60., -50., -40., -30., -20., -10., 0., 10., 20., 30., 40., 50., 60., 70., -110., -100., -90., -80., -70., -60., -50., -40., -30.,
|
||||
-20., -10., 0., 10., 20., 30., 40., 50., 60., 70., -90., -80., -70., -60., -50., -40., -30., -20., -10., 0.
|
||||
};
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
const string about =
|
||||
"This sample demonstrates recoloring grayscale images with dnn.\n"
|
||||
"This program is based on:\n"
|
||||
" http://richzhang.github.io/colorization\n"
|
||||
" https://github.com/richzhang/colorization\n"
|
||||
"Download caffemodel and prototxt files:\n"
|
||||
" http://eecs.berkeley.edu/~rich.zhang/projects/2016_colorization/files/demo_v2/colorization_release_v2.caffemodel\n"
|
||||
" https://raw.githubusercontent.com/richzhang/colorization/caffe/models/colorization_deploy_v2.prototxt\n";
|
||||
const string keys =
|
||||
"{ h help | | print this help message }"
|
||||
"{ proto | colorization_deploy_v2.prototxt | model configuration }"
|
||||
"{ model | colorization_release_v2.caffemodel | model weights }"
|
||||
"{ image | space_shuttle.jpg | path to image file }"
|
||||
"{ opencl | | enable OpenCL }";
|
||||
CommandLineParser parser(argc, argv, keys);
|
||||
parser.about(about);
|
||||
if (parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
string modelTxt = samples::findFile(parser.get<string>("proto"));
|
||||
string modelBin = samples::findFile(parser.get<string>("model"));
|
||||
string imageFile = samples::findFile(parser.get<string>("image"));
|
||||
bool useOpenCL = parser.has("opencl");
|
||||
if (!parser.check())
|
||||
{
|
||||
parser.printErrors();
|
||||
return 1;
|
||||
}
|
||||
|
||||
Mat img = imread(imageFile);
|
||||
if (img.empty())
|
||||
{
|
||||
cout << "Can't read image from file: " << imageFile << endl;
|
||||
return 2;
|
||||
}
|
||||
|
||||
// fixed input size for the pretrained network
|
||||
const int W_in = 224;
|
||||
const int H_in = 224;
|
||||
Net net = dnn::readNetFromCaffe(modelTxt, modelBin);
|
||||
if (useOpenCL)
|
||||
net.setPreferableTarget(DNN_TARGET_OPENCL);
|
||||
|
||||
// setup additional layers:
|
||||
int sz[] = {2, 313, 1, 1};
|
||||
const Mat pts_in_hull(4, sz, CV_32F, hull_pts);
|
||||
Ptr<dnn::Layer> class8_ab = net.getLayer("class8_ab");
|
||||
class8_ab->blobs.push_back(pts_in_hull);
|
||||
Ptr<dnn::Layer> conv8_313_rh = net.getLayer("conv8_313_rh");
|
||||
conv8_313_rh->blobs.push_back(Mat(1, 313, CV_32F, Scalar(2.606)));
|
||||
|
||||
// extract L channel and subtract mean
|
||||
Mat lab, L, input;
|
||||
img.convertTo(img, CV_32F, 1.0/255);
|
||||
cvtColor(img, lab, COLOR_BGR2Lab);
|
||||
extractChannel(lab, L, 0);
|
||||
resize(L, input, Size(W_in, H_in));
|
||||
input -= 50;
|
||||
|
||||
// run the L channel through the network
|
||||
Mat inputBlob = blobFromImage(input);
|
||||
net.setInput(inputBlob);
|
||||
Mat result = net.forward();
|
||||
|
||||
// retrieve the calculated a,b channels from the network output
|
||||
Size siz(result.size[2], result.size[3]);
|
||||
Mat a = Mat(siz, CV_32F, result.ptr(0,0));
|
||||
Mat b = Mat(siz, CV_32F, result.ptr(0,1));
|
||||
resize(a, a, img.size());
|
||||
resize(b, b, img.size());
|
||||
|
||||
// merge, and convert back to BGR
|
||||
Mat color, chn[] = {L, a, b};
|
||||
merge(chn, 3, lab);
|
||||
cvtColor(lab, color, COLOR_Lab2BGR);
|
||||
|
||||
imshow("color", color);
|
||||
imshow("original", img);
|
||||
waitKey();
|
||||
return 0;
|
||||
}
|
69
3rdparty/opencv-4.5.4/samples/dnn/colorization.py
vendored
Normal file
@@ -0,0 +1,69 @@
# Script is based on https://github.com/richzhang/colorization/blob/master/colorization/colorize.py
# To download the caffemodel and the prototxt, see: https://github.com/richzhang/colorization/tree/caffe/colorization/models
# To download pts_in_hull.npy, see: https://github.com/richzhang/colorization/tree/caffe/colorization/resources/pts_in_hull.npy
import numpy as np
import argparse
import cv2 as cv

def parse_args():
    parser = argparse.ArgumentParser(description='iColor: deep interactive colorization')
    parser.add_argument('--input', help='Path to image or video. Skip to capture frames from camera')
    parser.add_argument('--prototxt', help='Path to colorization_deploy_v2.prototxt', required=True)
    parser.add_argument('--caffemodel', help='Path to colorization_release_v2.caffemodel', required=True)
    parser.add_argument('--kernel', help='Path to pts_in_hull.npy', required=True)

    args = parser.parse_args()
    return args

if __name__ == '__main__':
    W_in = 224
    H_in = 224
    imshowSize = (640, 480)

    args = parse_args()

    # Select desired model
    net = cv.dnn.readNetFromCaffe(args.prototxt, args.caffemodel)

    pts_in_hull = np.load(args.kernel) # load cluster centers

    # populate cluster centers as 1x1 convolution kernel
    pts_in_hull = pts_in_hull.transpose().reshape(2, 313, 1, 1)
    net.getLayer(net.getLayerId('class8_ab')).blobs = [pts_in_hull.astype(np.float32)]
    net.getLayer(net.getLayerId('conv8_313_rh')).blobs = [np.full([1, 313], 2.606, np.float32)]

    if args.input:
        cap = cv.VideoCapture(args.input)
    else:
        cap = cv.VideoCapture(0)

    while cv.waitKey(1) < 0:
        hasFrame, frame = cap.read()
        if not hasFrame:
            cv.waitKey()
            break

        img_rgb = (frame[:,:,[2, 1, 0]] * 1.0 / 255).astype(np.float32)

        img_lab = cv.cvtColor(img_rgb, cv.COLOR_RGB2Lab)
        img_l = img_lab[:,:,0] # pull out L channel
        (H_orig,W_orig) = img_rgb.shape[:2] # original image size

        # resize image to network input size
        img_rs = cv.resize(img_rgb, (W_in, H_in)) # resize image to network input size
        img_lab_rs = cv.cvtColor(img_rs, cv.COLOR_RGB2Lab)
        img_l_rs = img_lab_rs[:,:,0]
        img_l_rs -= 50 # subtract 50 for mean-centering

        net.setInput(cv.dnn.blobFromImage(img_l_rs))
        ab_dec = net.forward()[0,:,:,:].transpose((1,2,0)) # this is our result

        (H_out,W_out) = ab_dec.shape[:2]
        ab_dec_us = cv.resize(ab_dec, (W_orig, H_orig))
        img_lab_out = np.concatenate((img_l[:,:,np.newaxis],ab_dec_us),axis=2) # concatenate with original image L
        img_bgr_out = np.clip(cv.cvtColor(img_lab_out, cv.COLOR_Lab2BGR), 0, 1)

        frame = cv.resize(frame, imshowSize)
        cv.imshow('origin', frame)
        cv.imshow('gray', cv.cvtColor(frame, cv.COLOR_RGB2GRAY))
        cv.imshow('colorized', cv.resize(img_bgr_out, imshowSize))
95
3rdparty/opencv-4.5.4/samples/dnn/common.hpp
vendored
Normal file
@@ -0,0 +1,95 @@
|
||||
#include <opencv2/core/utils/filesystem.hpp>
|
||||
|
||||
using namespace cv;
|
||||
|
||||
std::string genArgument(const std::string& argName, const std::string& help,
|
||||
const std::string& modelName, const std::string& zooFile,
|
||||
char key = ' ', std::string defaultVal = "");
|
||||
|
||||
std::string genPreprocArguments(const std::string& modelName, const std::string& zooFile);
|
||||
|
||||
std::string findFile(const std::string& filename);
|
||||
|
||||
std::string genArgument(const std::string& argName, const std::string& help,
|
||||
const std::string& modelName, const std::string& zooFile,
|
||||
char key, std::string defaultVal)
|
||||
{
|
||||
if (!modelName.empty())
|
||||
{
|
||||
FileStorage fs(zooFile, FileStorage::READ);
|
||||
if (fs.isOpened())
|
||||
{
|
||||
FileNode node = fs[modelName];
|
||||
if (!node.empty())
|
||||
{
|
||||
FileNode value = node[argName];
|
||||
if (!value.empty())
|
||||
{
|
||||
if (value.isReal())
|
||||
defaultVal = format("%f", (float)value);
|
||||
else if (value.isString())
|
||||
defaultVal = (std::string)value;
|
||||
else if (value.isInt())
|
||||
defaultVal = format("%d", (int)value);
|
||||
else if (value.isSeq())
|
||||
{
|
||||
for (size_t i = 0; i < value.size(); ++i)
|
||||
{
|
||||
FileNode v = value[(int)i];
|
||||
if (v.isInt())
|
||||
defaultVal += format("%d ", (int)v);
|
||||
else if (v.isReal())
|
||||
defaultVal += format("%f ", (float)v);
|
||||
else
|
||||
CV_Error(Error::StsNotImplemented, "Unexpected value format");
|
||||
}
|
||||
}
|
||||
else
|
||||
CV_Error(Error::StsNotImplemented, "Unexpected field format");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return "{ " + argName + " " + key + " | " + defaultVal + " | " + help + " }";
|
||||
}
|
||||
|
||||
std::string findFile(const std::string& filename)
|
||||
{
|
||||
if (filename.empty() || utils::fs::exists(filename))
|
||||
return filename;
|
||||
|
||||
const char* extraPaths[] = {getenv("OPENCV_DNN_TEST_DATA_PATH"),
|
||||
getenv("OPENCV_TEST_DATA_PATH")};
|
||||
for (int i = 0; i < 2; ++i)
|
||||
{
|
||||
if (extraPaths[i] == NULL)
|
||||
continue;
|
||||
std::string absPath = utils::fs::join(extraPaths[i], utils::fs::join("dnn", filename));
|
||||
if (utils::fs::exists(absPath))
|
||||
return absPath;
|
||||
}
|
||||
CV_Error(Error::StsObjectNotFound, "File " + filename + " not found! "
|
||||
"Please specify a path to /opencv_extra/testdata in OPENCV_DNN_TEST_DATA_PATH "
|
||||
"environment variable or pass a full path to model.");
|
||||
}
|
||||
|
||||
std::string genPreprocArguments(const std::string& modelName, const std::string& zooFile)
|
||||
{
|
||||
return genArgument("model", "Path to a binary file of model contains trained weights. "
|
||||
"It could be a file with extensions .caffemodel (Caffe), "
|
||||
".pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet), .bin (OpenVINO).",
|
||||
modelName, zooFile, 'm') +
|
||||
genArgument("config", "Path to a text file of model contains network configuration. "
|
||||
"It could be a file with extensions .prototxt (Caffe), .pbtxt (TensorFlow), .cfg (Darknet), .xml (OpenVINO).",
|
||||
modelName, zooFile, 'c') +
|
||||
genArgument("mean", "Preprocess input image by subtracting mean values. Mean values should be in BGR order and delimited by spaces.",
|
||||
modelName, zooFile) +
|
||||
genArgument("scale", "Preprocess input image by multiplying on a scale factor.",
|
||||
modelName, zooFile, ' ', "1.0") +
|
||||
genArgument("width", "Preprocess input image by resizing to a specific width.",
|
||||
modelName, zooFile, ' ', "-1") +
|
||||
genArgument("height", "Preprocess input image by resizing to a specific height.",
|
||||
modelName, zooFile, ' ', "-1") +
|
||||
genArgument("rgb", "Indicate that model works with RGB input images instead BGR ones.",
|
||||
modelName, zooFile);
|
||||
}
|
112
3rdparty/opencv-4.5.4/samples/dnn/common.py
vendored
Normal file
@@ -0,0 +1,112 @@
|
||||
import sys
|
||||
import os
|
||||
import cv2 as cv
|
||||
|
||||
|
||||
def add_argument(zoo, parser, name, help, required=False, default=None, type=None, action=None, nargs=None):
|
||||
if len(sys.argv) <= 1:
|
||||
return
|
||||
|
||||
modelName = sys.argv[1]
|
||||
|
||||
if os.path.isfile(zoo):
|
||||
fs = cv.FileStorage(zoo, cv.FILE_STORAGE_READ)
|
||||
node = fs.getNode(modelName)
|
||||
if not node.empty():
|
||||
value = node.getNode(name)
|
||||
if not value.empty():
|
||||
if value.isReal():
|
||||
default = value.real()
|
||||
elif value.isString():
|
||||
default = value.string()
|
||||
elif value.isInt():
|
||||
default = int(value.real())
|
||||
elif value.isSeq():
|
||||
default = []
|
||||
for i in range(value.size()):
|
||||
v = value.at(i)
|
||||
if v.isInt():
|
||||
default.append(int(v.real()))
|
||||
elif v.isReal():
|
||||
default.append(v.real())
|
||||
else:
|
||||
print('Unexpected value format')
|
||||
exit(0)
|
||||
else:
|
||||
print('Unexpected field format')
|
||||
exit(0)
|
||||
required = False
|
||||
|
||||
if action == 'store_true':
|
||||
default = 1 if default == 'true' else (0 if default == 'false' else default)
|
||||
assert(default is None or default == 0 or default == 1)
|
||||
parser.add_argument('--' + name, required=required, help=help, default=bool(default),
|
||||
action=action)
|
||||
else:
|
||||
parser.add_argument('--' + name, required=required, help=help, default=default,
|
||||
action=action, nargs=nargs, type=type)
|
||||
|
||||
|
||||
def add_preproc_args(zoo, parser, sample):
|
||||
aliases = []
|
||||
if os.path.isfile(zoo):
|
||||
fs = cv.FileStorage(zoo, cv.FILE_STORAGE_READ)
|
||||
root = fs.root()
|
||||
for name in root.keys():
|
||||
model = root.getNode(name)
|
||||
if model.getNode('sample').string() == sample:
|
||||
aliases.append(name)
|
||||
|
||||
parser.add_argument('alias', nargs='?', choices=aliases,
|
||||
help='An alias name of model to extract preprocessing parameters from models.yml file.')
|
||||
add_argument(zoo, parser, 'model', required=True,
|
||||
help='Path to a binary file of model contains trained weights. '
|
||||
'It could be a file with extensions .caffemodel (Caffe), '
|
||||
'.pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet), .bin (OpenVINO)')
|
||||
add_argument(zoo, parser, 'config',
|
||||
help='Path to a text file of model contains network configuration. '
|
||||
'It could be a file with extensions .prototxt (Caffe), .pbtxt or .config (TensorFlow), .cfg (Darknet), .xml (OpenVINO)')
|
||||
add_argument(zoo, parser, 'mean', nargs='+', type=float, default=[0, 0, 0],
|
||||
help='Preprocess input image by subtracting mean values. '
|
||||
'Mean values should be in BGR order.')
|
||||
add_argument(zoo, parser, 'scale', type=float, default=1.0,
|
||||
help='Preprocess input image by multiplying on a scale factor.')
|
||||
add_argument(zoo, parser, 'width', type=int,
|
||||
help='Preprocess input image by resizing to a specific width.')
|
||||
add_argument(zoo, parser, 'height', type=int,
|
||||
help='Preprocess input image by resizing to a specific height.')
|
||||
add_argument(zoo, parser, 'rgb', action='store_true',
|
||||
help='Indicate that model works with RGB input images instead BGR ones.')
|
||||
add_argument(zoo, parser, 'classes',
|
||||
help='Optional path to a text file with names of classes to label detected objects.')
|
||||
|
||||
|
||||
def findFile(filename):
|
||||
if filename:
|
||||
if os.path.exists(filename):
|
||||
return filename
|
||||
|
||||
fpath = cv.samples.findFile(filename, False)
|
||||
if fpath:
|
||||
return fpath
|
||||
|
||||
samplesDataDir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
|
||||
'..',
|
||||
'data',
|
||||
'dnn')
|
||||
if os.path.exists(os.path.join(samplesDataDir, filename)):
|
||||
return os.path.join(samplesDataDir, filename)
|
||||
|
||||
for path in ['OPENCV_DNN_TEST_DATA_PATH', 'OPENCV_TEST_DATA_PATH']:
|
||||
try:
|
||||
extraPath = os.environ[path]
|
||||
absPath = os.path.join(extraPath, 'dnn', filename)
|
||||
if os.path.exists(absPath):
|
||||
return absPath
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
print('File ' + filename + ' not found! Please specify a path to '
|
||||
'/opencv_extra/testdata in OPENCV_DNN_TEST_DATA_PATH environment '
|
||||
'variable or pass a full path to model.')
|
||||
exit(0)
|
283
3rdparty/opencv-4.5.4/samples/dnn/custom_layers.hpp
vendored
Normal file
@@ -0,0 +1,283 @@
|
||||
#ifndef __OPENCV_SAMPLES_DNN_CUSTOM_LAYERS__
|
||||
#define __OPENCV_SAMPLES_DNN_CUSTOM_LAYERS__
|
||||
|
||||
#include <opencv2/dnn.hpp>
|
||||
#include <opencv2/dnn/shape_utils.hpp> // getPlane
|
||||
|
||||
//! [InterpLayer]
|
||||
class InterpLayer : public cv::dnn::Layer
|
||||
{
|
||||
public:
|
||||
InterpLayer(const cv::dnn::LayerParams ¶ms) : Layer(params)
|
||||
{
|
||||
outWidth = params.get<int>("width", 0);
|
||||
outHeight = params.get<int>("height", 0);
|
||||
}
|
||||
|
||||
static cv::Ptr<cv::dnn::Layer> create(cv::dnn::LayerParams& params)
|
||||
{
|
||||
return cv::Ptr<cv::dnn::Layer>(new InterpLayer(params));
|
||||
}
|
||||
|
||||
virtual bool getMemoryShapes(const std::vector<std::vector<int> > &inputs,
|
||||
const int requiredOutputs,
|
||||
std::vector<std::vector<int> > &outputs,
|
||||
std::vector<std::vector<int> > &internals) const CV_OVERRIDE
|
||||
{
|
||||
CV_UNUSED(requiredOutputs); CV_UNUSED(internals);
|
||||
std::vector<int> outShape(4);
|
||||
outShape[0] = inputs[0][0]; // batch size
|
||||
outShape[1] = inputs[0][1]; // number of channels
|
||||
outShape[2] = outHeight;
|
||||
outShape[3] = outWidth;
|
||||
outputs.assign(1, outShape);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Implementation of this custom layer is based on https://github.com/cdmh/deeplab-public/blob/master/src/caffe/layers/interp_layer.cpp
|
||||
virtual void forward(cv::InputArrayOfArrays inputs_arr,
|
||||
cv::OutputArrayOfArrays outputs_arr,
|
||||
cv::OutputArrayOfArrays internals_arr) CV_OVERRIDE
|
||||
{
|
||||
if (inputs_arr.depth() == CV_16S)
|
||||
{
|
||||
// In case of DNN_TARGET_OPENCL_FP16 target the following method
|
||||
// converts data from FP16 to FP32 and calls this forward again.
|
||||
forward_fallback(inputs_arr, outputs_arr, internals_arr);
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<cv::Mat> inputs, outputs;
|
||||
inputs_arr.getMatVector(inputs);
|
||||
outputs_arr.getMatVector(outputs);
|
||||
|
||||
cv::Mat& inp = inputs[0];
|
||||
cv::Mat& out = outputs[0];
|
||||
const float* inpData = (float*)inp.data;
|
||||
float* outData = (float*)out.data;
|
||||
|
||||
const int batchSize = inp.size[0];
|
||||
const int numChannels = inp.size[1];
|
||||
const int inpHeight = inp.size[2];
|
||||
const int inpWidth = inp.size[3];
|
||||
|
||||
const float rheight = (outHeight > 1) ? static_cast<float>(inpHeight - 1) / (outHeight - 1) : 0.f;
|
||||
const float rwidth = (outWidth > 1) ? static_cast<float>(inpWidth - 1) / (outWidth - 1) : 0.f;
|
||||
for (int h2 = 0; h2 < outHeight; ++h2)
|
||||
{
|
||||
const float h1r = rheight * h2;
|
||||
const int h1 = static_cast<int>(h1r);
|
||||
const int h1p = (h1 < inpHeight - 1) ? 1 : 0;
|
||||
const float h1lambda = h1r - h1;
|
||||
const float h0lambda = 1.f - h1lambda;
|
||||
for (int w2 = 0; w2 < outWidth; ++w2)
|
||||
{
|
||||
const float w1r = rwidth * w2;
|
||||
const int w1 = static_cast<int>(w1r);
|
||||
const int w1p = (w1 < inpWidth - 1) ? 1 : 0;
|
||||
const float w1lambda = w1r - w1;
|
||||
const float w0lambda = 1.f - w1lambda;
|
||||
const float* pos1 = inpData + h1 * inpWidth + w1;
|
||||
float* pos2 = outData + h2 * outWidth + w2;
|
||||
for (int c = 0; c < batchSize * numChannels; ++c)
|
||||
{
|
||||
pos2[0] =
|
||||
h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) +
|
||||
h1lambda * (w0lambda * pos1[h1p * inpWidth] + w1lambda * pos1[h1p * inpWidth + w1p]);
|
||||
pos1 += inpWidth * inpHeight;
|
||||
pos2 += outWidth * outHeight;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int outWidth, outHeight;
|
||||
};
|
||||
//! [InterpLayer]
|
||||
|
||||
//! [ResizeBilinearLayer]
|
||||
class ResizeBilinearLayer CV_FINAL : public cv::dnn::Layer
|
||||
{
|
||||
public:
|
||||
ResizeBilinearLayer(const cv::dnn::LayerParams ¶ms) : Layer(params)
|
||||
{
|
||||
CV_Assert(!params.get<bool>("align_corners", false));
|
||||
CV_Assert(!blobs.empty());
|
||||
|
||||
for (size_t i = 0; i < blobs.size(); ++i)
|
||||
CV_Assert(blobs[i].type() == CV_32SC1);
|
||||
|
||||
// There are two cases of input blob: a single blob which contains output
|
||||
// shape and two blobs with scaling factors.
|
||||
if (blobs.size() == 1)
|
||||
{
|
||||
CV_Assert(blobs[0].total() == 2);
|
||||
outHeight = blobs[0].at<int>(0, 0);
|
||||
outWidth = blobs[0].at<int>(0, 1);
|
||||
factorHeight = factorWidth = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
CV_Assert(blobs.size() == 2); CV_Assert(blobs[0].total() == 1); CV_Assert(blobs[1].total() == 1);
|
||||
factorHeight = blobs[0].at<int>(0, 0);
|
||||
factorWidth = blobs[1].at<int>(0, 0);
|
||||
outHeight = outWidth = 0;
|
||||
}
|
||||
}
|
||||
|
||||
static cv::Ptr<cv::dnn::Layer> create(cv::dnn::LayerParams& params)
|
||||
{
|
||||
return cv::Ptr<cv::dnn::Layer>(new ResizeBilinearLayer(params));
|
||||
}
|
||||
|
||||
virtual bool getMemoryShapes(const std::vector<std::vector<int> > &inputs,
|
||||
const int,
|
||||
std::vector<std::vector<int> > &outputs,
|
||||
std::vector<std::vector<int> > &) const CV_OVERRIDE
|
||||
{
|
||||
std::vector<int> outShape(4);
|
||||
outShape[0] = inputs[0][0]; // batch size
|
||||
outShape[1] = inputs[0][1]; // number of channels
|
||||
outShape[2] = outHeight != 0 ? outHeight : (inputs[0][2] * factorHeight);
|
||||
outShape[3] = outWidth != 0 ? outWidth : (inputs[0][3] * factorWidth);
|
||||
outputs.assign(1, outShape);
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual void finalize(cv::InputArrayOfArrays, cv::OutputArrayOfArrays outputs_arr) CV_OVERRIDE
|
||||
{
|
||||
std::vector<cv::Mat> outputs;
|
||||
outputs_arr.getMatVector(outputs);
|
||||
if (!outWidth && !outHeight)
|
||||
{
|
||||
outHeight = outputs[0].size[2];
|
||||
outWidth = outputs[0].size[3];
|
||||
}
|
||||
}
|
||||
|
||||
// This implementation is based on a reference implementation from
|
||||
// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
|
||||
virtual void forward(cv::InputArrayOfArrays inputs_arr,
|
||||
cv::OutputArrayOfArrays outputs_arr,
|
||||
cv::OutputArrayOfArrays internals_arr) CV_OVERRIDE
|
||||
{
|
||||
if (inputs_arr.depth() == CV_16S)
|
||||
{
|
||||
// In case of DNN_TARGET_OPENCL_FP16 target the following method
|
||||
// converts data from FP16 to FP32 and calls this forward again.
|
||||
forward_fallback(inputs_arr, outputs_arr, internals_arr);
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<cv::Mat> inputs, outputs;
|
||||
inputs_arr.getMatVector(inputs);
|
||||
outputs_arr.getMatVector(outputs);
|
||||
|
||||
cv::Mat& inp = inputs[0];
|
||||
cv::Mat& out = outputs[0];
|
||||
const float* inpData = (float*)inp.data;
|
||||
float* outData = (float*)out.data;
|
||||
|
||||
const int batchSize = inp.size[0];
|
||||
const int numChannels = inp.size[1];
|
||||
const int inpHeight = inp.size[2];
|
||||
const int inpWidth = inp.size[3];
|
||||
|
||||
float heightScale = static_cast<float>(inpHeight) / outHeight;
|
||||
float widthScale = static_cast<float>(inpWidth) / outWidth;
|
||||
for (int b = 0; b < batchSize; ++b)
|
||||
{
|
||||
for (int y = 0; y < outHeight; ++y)
|
||||
{
|
||||
float input_y = y * heightScale;
|
||||
int y0 = static_cast<int>(std::floor(input_y));
|
||||
int y1 = std::min(y0 + 1, inpHeight - 1);
|
||||
for (int x = 0; x < outWidth; ++x)
|
||||
{
|
||||
float input_x = x * widthScale;
|
||||
int x0 = static_cast<int>(std::floor(input_x));
|
||||
int x1 = std::min(x0 + 1, inpWidth - 1);
|
||||
for (int c = 0; c < numChannels; ++c)
|
||||
{
|
||||
float interpolation =
|
||||
inpData[offset(inp.size, c, x0, y0, b)] * (1 - (input_y - y0)) * (1 - (input_x - x0)) +
|
||||
inpData[offset(inp.size, c, x0, y1, b)] * (input_y - y0) * (1 - (input_x - x0)) +
|
||||
inpData[offset(inp.size, c, x1, y0, b)] * (1 - (input_y - y0)) * (input_x - x0) +
|
||||
inpData[offset(inp.size, c, x1, y1, b)] * (input_y - y0) * (input_x - x0);
|
||||
outData[offset(out.size, c, x, y, b)] = interpolation;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
static inline int offset(const cv::MatSize& size, int c, int x, int y, int b)
|
||||
{
|
||||
return x + size[3] * (y + size[2] * (c + size[1] * b));
|
||||
}
|
||||
|
||||
int outWidth, outHeight, factorWidth, factorHeight;
|
||||
};
|
||||
//! [ResizeBilinearLayer]
|
||||
|
||||
//
|
||||
// The following code is used only to generate tutorials documentation.
|
||||
//
|
||||
|
||||
//! [A custom layer interface]
|
||||
class MyLayer : public cv::dnn::Layer
|
||||
{
|
||||
public:
|
||||
//! [MyLayer::MyLayer]
|
||||
MyLayer(const cv::dnn::LayerParams ¶ms);
|
||||
//! [MyLayer::MyLayer]
|
||||
|
||||
//! [MyLayer::create]
|
||||
static cv::Ptr<cv::dnn::Layer> create(cv::dnn::LayerParams& params);
|
||||
//! [MyLayer::create]
|
||||
|
||||
//! [MyLayer::getMemoryShapes]
|
||||
virtual bool getMemoryShapes(const std::vector<std::vector<int> > &inputs,
|
||||
const int requiredOutputs,
|
||||
std::vector<std::vector<int> > &outputs,
|
||||
std::vector<std::vector<int> > &internals) const CV_OVERRIDE;
|
||||
//! [MyLayer::getMemoryShapes]
|
||||
|
||||
//! [MyLayer::forward]
|
||||
virtual void forward(cv::InputArrayOfArrays inputs,
|
||||
cv::OutputArrayOfArrays outputs,
|
||||
cv::OutputArrayOfArrays internals) CV_OVERRIDE;
|
||||
//! [MyLayer::forward]
|
||||
|
||||
//! [MyLayer::finalize]
|
||||
virtual void finalize(cv::InputArrayOfArrays inputs,
|
||||
cv::OutputArrayOfArrays outputs) CV_OVERRIDE;
|
||||
//! [MyLayer::finalize]
|
||||
};
|
||||
//! [A custom layer interface]
|
||||
|
||||
//! [Register a custom layer]
|
||||
#include <opencv2/dnn/layer.details.hpp> // CV_DNN_REGISTER_LAYER_CLASS
|
||||
|
||||
static inline void loadNet()
|
||||
{
|
||||
CV_DNN_REGISTER_LAYER_CLASS(Interp, InterpLayer);
|
||||
// ...
|
||||
//! [Register a custom layer]
|
||||
|
||||
//! [Register InterpLayer]
|
||||
CV_DNN_REGISTER_LAYER_CLASS(Interp, InterpLayer);
|
||||
cv::dnn::Net caffeNet = cv::dnn::readNet("/path/to/config.prototxt", "/path/to/weights.caffemodel");
|
||||
//! [Register InterpLayer]
|
||||
|
||||
//! [Register ResizeBilinearLayer]
|
||||
CV_DNN_REGISTER_LAYER_CLASS(ResizeBilinear, ResizeBilinearLayer);
|
||||
cv::dnn::Net tfNet = cv::dnn::readNet("/path/to/graph.pb");
|
||||
//! [Register ResizeBilinearLayer]
|
||||
|
||||
if (false) loadNet(); // To prevent unused function warning.
|
||||
}
|
||||
|
||||
#endif // __OPENCV_SAMPLES_DNN_CUSTOM_LAYERS__
|
189
3rdparty/opencv-4.5.4/samples/dnn/dasiamrpn_tracker.cpp
vendored
Normal file
@@ -0,0 +1,189 @@
|
||||
// DaSiamRPN tracker.
|
||||
// Original paper: https://arxiv.org/abs/1808.06048
|
||||
// Link to original repo: https://github.com/foolwood/DaSiamRPN
|
||||
// Links to onnx models:
|
||||
// - network: https://www.dropbox.com/s/rr1lk9355vzolqv/dasiamrpn_model.onnx?dl=0
|
||||
// - kernel_r1: https://www.dropbox.com/s/999cqx5zrfi7w4p/dasiamrpn_kernel_r1.onnx?dl=0
|
||||
// - kernel_cls1: https://www.dropbox.com/s/qvmtszx5h339a0w/dasiamrpn_kernel_cls1.onnx?dl=0
|
||||
|
||||
#include <iostream>
|
||||
#include <cmath>
|
||||
|
||||
#include <opencv2/dnn.hpp>
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
#include <opencv2/video.hpp>
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::dnn;
|
||||
|
||||
const char *keys =
|
||||
"{ help h | | Print help message }"
|
||||
"{ input i | | Full path to input video folder, the specific camera index. (empty for camera 0) }"
|
||||
"{ net | dasiamrpn_model.onnx | Path to onnx model of net}"
|
||||
"{ kernel_cls1 | dasiamrpn_kernel_cls1.onnx | Path to onnx model of kernel_r1 }"
|
||||
"{ kernel_r1 | dasiamrpn_kernel_r1.onnx | Path to onnx model of kernel_cls1 }"
|
||||
"{ backend | 0 | Choose one of computation backends: "
|
||||
"0: automatically (by default), "
|
||||
"1: Halide language (http://halide-lang.org/), "
|
||||
"2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"3: OpenCV implementation, "
|
||||
"4: VKCOM, "
|
||||
"5: CUDA },"
|
||||
"{ target | 0 | Choose one of target computation devices: "
|
||||
"0: CPU target (by default), "
|
||||
"1: OpenCL, "
|
||||
"2: OpenCL fp16 (half-float precision), "
|
||||
"3: VPU, "
|
||||
"4: Vulkan, "
|
||||
"6: CUDA, "
|
||||
"7: CUDA fp16 (half-float preprocess) }"
|
||||
;
|
||||
|
||||
static
|
||||
int run(int argc, char** argv)
|
||||
{
|
||||
// Parse command line arguments.
|
||||
CommandLineParser parser(argc, argv, keys);
|
||||
|
||||
if (parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::string inputName = parser.get<String>("input");
|
||||
std::string net = parser.get<String>("net");
|
||||
std::string kernel_cls1 = parser.get<String>("kernel_cls1");
|
||||
std::string kernel_r1 = parser.get<String>("kernel_r1");
|
||||
int backend = parser.get<int>("backend");
|
||||
int target = parser.get<int>("target");
|
||||
|
||||
Ptr<TrackerDaSiamRPN> tracker;
|
||||
try
|
||||
{
|
||||
TrackerDaSiamRPN::Params params;
|
||||
params.model = samples::findFile(net);
|
||||
params.kernel_cls1 = samples::findFile(kernel_cls1);
|
||||
params.kernel_r1 = samples::findFile(kernel_r1);
|
||||
params.backend = backend;
|
||||
params.target = target;
|
||||
tracker = TrackerDaSiamRPN::create(params);
|
||||
}
|
||||
catch (const cv::Exception& ee)
|
||||
{
|
||||
std::cerr << "Exception: " << ee.what() << std::endl;
|
||||
std::cout << "Can't load the network by using the following files:" << std::endl;
|
||||
std::cout << "siamRPN : " << net << std::endl;
|
||||
std::cout << "siamKernelCL1 : " << kernel_cls1 << std::endl;
|
||||
std::cout << "siamKernelR1 : " << kernel_r1 << std::endl;
|
||||
return 2;
|
||||
}
|
||||
|
||||
const std::string winName = "DaSiamRPN";
|
||||
namedWindow(winName, WINDOW_AUTOSIZE);
|
||||
|
||||
// Open a video file or an image file or a camera stream.
|
||||
VideoCapture cap;
|
||||
|
||||
if (inputName.empty() || (isdigit(inputName[0]) && inputName.size() == 1))
|
||||
{
|
||||
int c = inputName.empty() ? 0 : inputName[0] - '0';
|
||||
std::cout << "Trying to open camera #" << c << " ..." << std::endl;
|
||||
if (!cap.open(c))
|
||||
{
|
||||
std::cout << "Capture from camera #" << c << " didn't work. Specify -i=<video> parameter to read from video file" << std::endl;
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
else if (inputName.size())
|
||||
{
|
||||
inputName = samples::findFileOrKeep(inputName);
|
||||
if (!cap.open(inputName))
|
||||
{
|
||||
std::cout << "Could not open: " << inputName << std::endl;
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
// Read the first image.
|
||||
Mat image;
|
||||
cap >> image;
|
||||
if (image.empty())
|
||||
{
|
||||
std::cerr << "Can't capture frame!" << std::endl;
|
||||
return 2;
|
||||
}
|
||||
|
||||
Mat image_select = image.clone();
|
||||
putText(image_select, "Select initial bounding box you want to track.", Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
putText(image_select, "And Press the ENTER key.", Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
|
||||
Rect selectRect = selectROI(winName, image_select);
|
||||
std::cout << "ROI=" << selectRect << std::endl;
|
||||
|
||||
tracker->init(image, selectRect);
|
||||
|
||||
TickMeter tickMeter;
|
||||
|
||||
for (int count = 0; ; ++count)
|
||||
{
|
||||
cap >> image;
|
||||
if (image.empty())
|
||||
{
|
||||
std::cerr << "Can't capture frame " << count << ". End of video stream?" << std::endl;
|
||||
break;
|
||||
}
|
||||
|
||||
Rect rect;
|
||||
|
||||
tickMeter.start();
|
||||
bool ok = tracker->update(image, rect);
|
||||
tickMeter.stop();
|
||||
|
||||
float score = tracker->getTrackingScore();
|
||||
|
||||
std::cout << "frame " << count <<
|
||||
": predicted score=" << score <<
|
||||
" rect=" << rect <<
|
||||
" time=" << tickMeter.getTimeMilli() << "ms" <<
|
||||
std::endl;
|
||||
|
||||
Mat render_image = image.clone();
|
||||
|
||||
if (ok)
|
||||
{
|
||||
rectangle(render_image, rect, Scalar(0, 255, 0), 2);
|
||||
|
||||
std::string timeLabel = format("Inference time: %.2f ms", tickMeter.getTimeMilli());
|
||||
std::string scoreLabel = format("Score: %f", score);
|
||||
putText(render_image, timeLabel, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
putText(render_image, scoreLabel, Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
}
|
||||
|
||||
imshow(winName, render_image);
|
||||
|
||||
tickMeter.reset();
|
||||
|
||||
int c = waitKey(1);
|
||||
if (c == 27 /*ESC*/)
|
||||
break;
|
||||
}
|
||||
|
||||
std::cout << "Exit" << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
try
|
||||
{
|
||||
return run(argc, argv);
|
||||
}
|
||||
catch (const std::exception& e)
|
||||
{
|
||||
std::cerr << "FATAL: C++ exception: " << e.what() << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
23
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/common/abstract_model.py
vendored
Normal file
23
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/common/abstract_model.py
vendored
Normal file
@ -0,0 +1,23 @@
|
||||
from abc import ABC, ABCMeta, abstractmethod
|
||||
|
||||
|
||||
class AbstractModel(ABC):
|
||||
|
||||
@abstractmethod
|
||||
def get_prepared_models(self):
|
||||
pass
|
||||
|
||||
|
||||
class Framework(object):
|
||||
in_blob_name = ''
|
||||
out_blob_name = ''
|
||||
|
||||
__metaclass__ = ABCMeta
|
||||
|
||||
@abstractmethod
|
||||
def get_name(self):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_output(self, input_blob):
|
||||
pass
|
@ -0,0 +1,96 @@
|
||||
import sys
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ...utils import get_final_summary_info
|
||||
|
||||
|
||||
class ClsAccEvaluation:
|
||||
log = sys.stdout
|
||||
img_classes = {}
|
||||
batch_size = 0
|
||||
|
||||
def __init__(self, log_path, img_classes_file, batch_size):
|
||||
self.log = open(log_path, 'w')
|
||||
self.img_classes = self.read_classes(img_classes_file)
|
||||
self.batch_size = batch_size
|
||||
|
||||
# collect the accuracies for both models
|
||||
self.general_quality_metric = []
|
||||
self.general_inference_time = []
|
||||
|
||||
@staticmethod
|
||||
def read_classes(img_classes_file):
|
||||
result = {}
|
||||
with open(img_classes_file) as file:
|
||||
for l in file.readlines():
|
||||
result[l.split()[0]] = int(l.split()[1])
|
||||
return result
|
||||
|
||||
def get_correct_answers(self, img_list, net_output_blob):
|
||||
correct_answers = 0
|
||||
for i in range(len(img_list)):
|
||||
indexes = np.argsort(net_output_blob[i])[-5:]
|
||||
correct_index = self.img_classes[img_list[i]]
|
||||
if correct_index in indexes:
|
||||
correct_answers += 1
|
||||
return correct_answers
|
||||
|
||||
def process(self, frameworks, data_fetcher):
|
||||
sorted_imgs_names = sorted(self.img_classes.keys())
|
||||
correct_answers = [0] * len(frameworks)
|
||||
samples_handled = 0
|
||||
blobs_l1_diff = [0] * len(frameworks)
|
||||
blobs_l1_diff_count = [0] * len(frameworks)
|
||||
blobs_l_inf_diff = [sys.float_info.min] * len(frameworks)
|
||||
inference_time = [0.0] * len(frameworks)
|
||||
|
||||
for x in range(0, len(sorted_imgs_names), self.batch_size):
|
||||
sublist = sorted_imgs_names[x:x + self.batch_size]
|
||||
batch = data_fetcher.get_batch(sublist)
|
||||
|
||||
samples_handled += len(sublist)
|
||||
fw_accuracy = []
|
||||
fw_time = []
|
||||
frameworks_out = []
|
||||
for i in range(len(frameworks)):
|
||||
start = time.time()
|
||||
out = frameworks[i].get_output(batch)
|
||||
end = time.time()
|
||||
correct_answers[i] += self.get_correct_answers(sublist, out)
|
||||
fw_accuracy.append(100 * correct_answers[i] / float(samples_handled))
|
||||
frameworks_out.append(out)
|
||||
inference_time[i] += end - start
|
||||
fw_time.append(inference_time[i] / samples_handled * 1000)
|
||||
print(samples_handled, 'Accuracy for', frameworks[i].get_name() + ':', fw_accuracy[i], file=self.log)
|
||||
print("Inference time, ms ", frameworks[i].get_name(), fw_time[i], file=self.log)
|
||||
|
||||
self.general_quality_metric.append(fw_accuracy)
|
||||
self.general_inference_time.append(fw_time)
|
||||
|
||||
for i in range(1, len(frameworks)):
|
||||
log_str = frameworks[0].get_name() + " vs " + frameworks[i].get_name() + ':'
|
||||
diff = np.abs(frameworks_out[0] - frameworks_out[i])
|
||||
l1_diff = np.sum(diff) / diff.size
|
||||
print(samples_handled, "L1 difference", log_str, l1_diff, file=self.log)
|
||||
blobs_l1_diff[i] += l1_diff
|
||||
blobs_l1_diff_count[i] += 1
|
||||
if np.max(diff) > blobs_l_inf_diff[i]:
|
||||
blobs_l_inf_diff[i] = np.max(diff)
|
||||
print(samples_handled, "L_INF difference", log_str, blobs_l_inf_diff[i], file=self.log)
|
||||
|
||||
self.log.flush()
|
||||
|
||||
for i in range(1, len(blobs_l1_diff)):
|
||||
log_str = frameworks[0].get_name() + " vs " + frameworks[i].get_name() + ':'
|
||||
print('Final l1 diff', log_str, blobs_l1_diff[i] / blobs_l1_diff_count[i], file=self.log)
|
||||
|
||||
print(
|
||||
get_final_summary_info(
|
||||
self.general_quality_metric,
|
||||
self.general_inference_time,
|
||||
"accuracy"
|
||||
),
|
||||
file=self.log
|
||||
)
|
@ -0,0 +1,87 @@
|
||||
import os
|
||||
from abc import ABCMeta, abstractmethod
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
from ...img_utils import read_rgb_img, get_pytorch_preprocess
|
||||
from ...test.configs.default_preprocess_config import PYTORCH_RSZ_HEIGHT, PYTORCH_RSZ_WIDTH
|
||||
|
||||
|
||||
class DataFetch(object):
|
||||
imgs_dir = ''
|
||||
frame_size = 0
|
||||
bgr_to_rgb = False
|
||||
|
||||
__metaclass__ = ABCMeta
|
||||
|
||||
@abstractmethod
|
||||
def preprocess(self, img):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def reshape_img(img):
|
||||
img = img[:, :, 0:3].transpose(2, 0, 1)
|
||||
return np.expand_dims(img, 0)
|
||||
|
||||
def center_crop(self, img):
|
||||
cols = img.shape[1]
|
||||
rows = img.shape[0]
|
||||
|
||||
y1 = round((rows - self.frame_size) / 2)
|
||||
y2 = round(y1 + self.frame_size)
|
||||
x1 = round((cols - self.frame_size) / 2)
|
||||
x2 = round(x1 + self.frame_size)
|
||||
return img[y1:y2, x1:x2]
|
||||
|
||||
def initial_preprocess(self, img):
|
||||
min_dim = min(img.shape[-3], img.shape[-2])
|
||||
resize_ratio = self.frame_size / float(min_dim)
|
||||
|
||||
img = cv2.resize(img, (0, 0), fx=resize_ratio, fy=resize_ratio)
|
||||
img = self.center_crop(img)
|
||||
return img
|
||||
|
||||
def get_preprocessed_img(self, img_path):
|
||||
image_data = read_rgb_img(img_path, self.bgr_to_rgb)
|
||||
image_data = self.preprocess(image_data)
|
||||
return self.reshape_img(image_data)
|
||||
|
||||
def get_batch(self, img_names):
|
||||
assert type(img_names) is list
|
||||
batch = np.zeros((len(img_names), 3, self.frame_size, self.frame_size)).astype(np.float32)
|
||||
|
||||
for i in range(len(img_names)):
|
||||
img_name = img_names[i]
|
||||
img_file = os.path.join(self.imgs_dir, img_name)
|
||||
assert os.path.exists(img_file)
|
||||
|
||||
batch[i] = self.get_preprocessed_img(img_file)
|
||||
return batch
|
||||
|
||||
|
||||
class PyTorchPreprocessedFetch(DataFetch):
|
||||
def __init__(self, pytorch_cls_config, preprocess_input=None):
|
||||
self.imgs_dir = pytorch_cls_config.img_root_dir
|
||||
self.frame_size = pytorch_cls_config.frame_size
|
||||
self.bgr_to_rgb = pytorch_cls_config.bgr_to_rgb
|
||||
self.preprocess_input = preprocess_input
|
||||
|
||||
def preprocess(self, img):
|
||||
img = cv2.resize(img, (PYTORCH_RSZ_WIDTH, PYTORCH_RSZ_HEIGHT))
|
||||
img = self.center_crop(img)
|
||||
if self.preprocess_input:
|
||||
return self.preprocess_input(img)
|
||||
return get_pytorch_preprocess(img)
|
||||
|
||||
|
||||
class TFPreprocessedFetch(DataFetch):
|
||||
def __init__(self, tf_cls_config, preprocess_input):
|
||||
self.imgs_dir = tf_cls_config.img_root_dir
|
||||
self.frame_size = tf_cls_config.frame_size
|
||||
self.bgr_to_rgb = tf_cls_config.bgr_to_rgb
|
||||
self.preprocess_input = preprocess_input
|
||||
|
||||
def preprocess(self, img):
|
||||
img = self.initial_preprocess(img)
|
||||
return self.preprocess_input(img)
|
19
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/common/img_utils.py
vendored
Normal file
19
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/common/img_utils.py
vendored
Normal file
@ -0,0 +1,19 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
from .test.configs.default_preprocess_config import BASE_IMG_SCALE_FACTOR
|
||||
|
||||
|
||||
def read_rgb_img(img_file, is_bgr_to_rgb=True):
|
||||
img = cv2.imread(img_file, cv2.IMREAD_COLOR)
|
||||
if is_bgr_to_rgb:
|
||||
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
|
||||
return img
|
||||
|
||||
|
||||
def get_pytorch_preprocess(img):
|
||||
img = img.astype(np.float32)
|
||||
img *= BASE_IMG_SCALE_FACTOR
|
||||
img -= [0.485, 0.456, 0.406]
|
||||
img /= [0.229, 0.224, 0.225]
|
||||
return img
|
@ -0,0 +1,60 @@
|
||||
from .configs.test_config import TestClsConfig, TestClsModuleConfig
|
||||
from .model_test_pipeline import ModelTestPipeline
|
||||
from ..evaluation.classification.cls_accuracy_evaluator import ClsAccEvaluation
|
||||
from ..utils import get_test_module
|
||||
|
||||
|
||||
class ClsModelTestPipeline(ModelTestPipeline):
|
||||
def __init__(
|
||||
self,
|
||||
network_model,
|
||||
model_processor,
|
||||
dnn_model_processor,
|
||||
data_fetcher,
|
||||
img_processor=None,
|
||||
cls_args_parser=None,
|
||||
default_input_blob_preproc=None
|
||||
):
|
||||
super(ClsModelTestPipeline, self).__init__(
|
||||
network_model,
|
||||
model_processor,
|
||||
dnn_model_processor
|
||||
)
|
||||
|
||||
if cls_args_parser:
|
||||
self._parser = cls_args_parser
|
||||
|
||||
self.test_config = TestClsConfig()
|
||||
|
||||
parser_args = self._parser.parse_args()
|
||||
|
||||
if parser_args.test:
|
||||
self._test_module_config = TestClsModuleConfig()
|
||||
self._test_module = get_test_module(
|
||||
self._test_module_config.test_module_name,
|
||||
self._test_module_config.test_module_path
|
||||
)
|
||||
|
||||
if parser_args.default_img_preprocess:
|
||||
self._default_input_blob_preproc = default_input_blob_preproc
|
||||
if parser_args.evaluate:
|
||||
self._data_fetcher = data_fetcher(self.test_config, img_processor)
|
||||
|
||||
def _configure_test_module_params(self):
|
||||
self._test_module_param_list.extend((
|
||||
'--crop', self._test_module_config.crop,
|
||||
'--std', *self._test_module_config.std
|
||||
))
|
||||
|
||||
if self._test_module_config.rsz_height and self._test_module_config.rsz_width:
|
||||
self._test_module_param_list.extend((
|
||||
'--initial_height', self._test_module_config.rsz_height,
|
||||
'--initial_width', self._test_module_config.rsz_width,
|
||||
))
|
||||
|
||||
def _configure_acc_eval(self, log_path):
|
||||
self._accuracy_evaluator = ClsAccEvaluation(
|
||||
log_path,
|
||||
self.test_config.img_cls_file,
|
||||
self.test_config.batch_size
|
||||
)
|
@ -0,0 +1,37 @@
|
||||
BASE_IMG_SCALE_FACTOR = 1 / 255.0
|
||||
PYTORCH_RSZ_HEIGHT = 256
|
||||
PYTORCH_RSZ_WIDTH = 256
|
||||
|
||||
pytorch_resize_input_blob = {
|
||||
"mean": ["123.675", "116.28", "103.53"],
|
||||
"scale": str(BASE_IMG_SCALE_FACTOR),
|
||||
"std": ["0.229", "0.224", "0.225"],
|
||||
"crop": "True",
|
||||
"rgb": True,
|
||||
"rsz_height": str(PYTORCH_RSZ_HEIGHT),
|
||||
"rsz_width": str(PYTORCH_RSZ_WIDTH)
|
||||
}
|
||||
|
||||
pytorch_input_blob = {
|
||||
"mean": ["123.675", "116.28", "103.53"],
|
||||
"scale": str(BASE_IMG_SCALE_FACTOR),
|
||||
"std": ["0.229", "0.224", "0.225"],
|
||||
"crop": "True",
|
||||
"rgb": True
|
||||
}
|
||||
|
||||
tf_input_blob = {
|
||||
"scale": str(1 / 127.5),
|
||||
"mean": ["127.5", "127.5", "127.5"],
|
||||
"std": [],
|
||||
"crop": "True",
|
||||
"rgb": True
|
||||
}
|
||||
|
||||
tf_model_blob_caffe_mode = {
|
||||
"mean": ["103.939", "116.779", "123.68"],
|
||||
"scale": "1.0",
|
||||
"std": [],
|
||||
"crop": "True",
|
||||
"rgb": False
|
||||
}
|
@ -0,0 +1,40 @@
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List
|
||||
|
||||
|
||||
@dataclass
|
||||
class CommonConfig:
|
||||
output_data_root_dir: str = "dnn_model_runner/dnn_conversion"
|
||||
logs_dir: str = os.path.join(output_data_root_dir, "logs")
|
||||
log_file_path: str = os.path.join(logs_dir, "{}_log.txt")
|
||||
|
||||
|
||||
@dataclass
|
||||
class TestClsConfig:
|
||||
batch_size: int = 1
|
||||
frame_size: int = 224
|
||||
img_root_dir: str = "./ILSVRC2012_img_val"
|
||||
# location of image-class matching
|
||||
img_cls_file: str = "./val.txt"
|
||||
bgr_to_rgb: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
class TestClsModuleConfig:
|
||||
cls_test_data_dir: str = "../data"
|
||||
test_module_name: str = "classification"
|
||||
test_module_path: str = "classification.py"
|
||||
input_img: str = os.path.join(cls_test_data_dir, "squirrel_cls.jpg")
|
||||
model: str = ""
|
||||
|
||||
frame_height: str = str(TestClsConfig.frame_size)
|
||||
frame_width: str = str(TestClsConfig.frame_size)
|
||||
scale: str = "1.0"
|
||||
mean: List[str] = field(default_factory=lambda: ["0.0", "0.0", "0.0"])
|
||||
std: List[str] = field(default_factory=list)
|
||||
crop: str = "False"
|
||||
rgb: str = "True"
|
||||
rsz_height: str = ""
|
||||
rsz_width: str = ""
|
||||
classes: str = os.path.join(cls_test_data_dir, "dnn", "classification_classes_ILSVRC2012.txt")
|
126
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/common/test/model_test_pipeline.py
vendored
Normal file
126
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/common/test/model_test_pipeline.py
vendored
Normal file
@ -0,0 +1,126 @@
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .configs.test_config import CommonConfig
|
||||
from ..utils import create_parser, plot_acc
|
||||
|
||||
|
||||
class ModelTestPipeline:
|
||||
def __init__(
|
||||
self,
|
||||
network_model,
|
||||
model_processor,
|
||||
dnn_model_processor
|
||||
):
|
||||
self._net_model = network_model
|
||||
self._model_processor = model_processor
|
||||
self._dnn_model_processor = dnn_model_processor
|
||||
|
||||
self._parser = create_parser()
|
||||
|
||||
self._test_module = None
|
||||
self._test_module_config = None
|
||||
self._test_module_param_list = None
|
||||
|
||||
self.test_config = None
|
||||
self._data_fetcher = None
|
||||
|
||||
self._default_input_blob_preproc = None
|
||||
self._accuracy_evaluator = None
|
||||
|
||||
def init_test_pipeline(self):
|
||||
cmd_args = self._parser.parse_args()
|
||||
model_dict = self._net_model.get_prepared_models()
|
||||
|
||||
model_names = list(model_dict.keys())
|
||||
print(
|
||||
"The model {} was successfully obtained and converted to OpenCV {}".format(model_names[0], model_names[1])
|
||||
)
|
||||
|
||||
if cmd_args.test:
|
||||
if not self._test_module_config.model:
|
||||
self._test_module_config.model = self._net_model.model_path["full_path"]
|
||||
|
||||
if cmd_args.default_img_preprocess:
|
||||
self._test_module_config.scale = self._default_input_blob_preproc["scale"]
|
||||
self._test_module_config.mean = self._default_input_blob_preproc["mean"]
|
||||
self._test_module_config.std = self._default_input_blob_preproc["std"]
|
||||
self._test_module_config.crop = self._default_input_blob_preproc["crop"]
|
||||
|
||||
if "rsz_height" in self._default_input_blob_preproc and "rsz_width" in self._default_input_blob_preproc:
|
||||
self._test_module_config.rsz_height = self._default_input_blob_preproc["rsz_height"]
|
||||
self._test_module_config.rsz_width = self._default_input_blob_preproc["rsz_width"]
|
||||
|
||||
self._test_module_param_list = [
|
||||
'--model', self._test_module_config.model,
|
||||
'--input', self._test_module_config.input_img,
|
||||
'--width', self._test_module_config.frame_width,
|
||||
'--height', self._test_module_config.frame_height,
|
||||
'--scale', self._test_module_config.scale,
|
||||
'--mean', *self._test_module_config.mean,
|
||||
'--std', *self._test_module_config.std,
|
||||
'--classes', self._test_module_config.classes,
|
||||
]
|
||||
|
||||
if self._default_input_blob_preproc["rgb"]:
|
||||
self._test_module_param_list.append('--rgb')
|
||||
|
||||
self._configure_test_module_params()
|
||||
|
||||
self._test_module.main(
|
||||
self._test_module_param_list
|
||||
)
|
||||
|
||||
if cmd_args.evaluate:
|
||||
original_model_name = model_names[0]
|
||||
dnn_model_name = model_names[1]
|
||||
|
||||
self.run_test_pipeline(
|
||||
[
|
||||
self._model_processor(model_dict[original_model_name], original_model_name),
|
||||
self._dnn_model_processor(model_dict[dnn_model_name], dnn_model_name)
|
||||
],
|
||||
original_model_name.replace(" ", "_")
|
||||
)
|
||||
|
||||
def run_test_pipeline(
|
||||
self,
|
||||
models_list,
|
||||
formatted_exp_name,
|
||||
is_plot_acc=True
|
||||
):
|
||||
log_path, logs_dir = self._configure_eval_log(formatted_exp_name)
|
||||
|
||||
print(
|
||||
"===== Running evaluation of the model with the following params:\n"
|
||||
"\t* val data location: {}\n"
|
||||
"\t* log file location: {}\n".format(
|
||||
self.test_config.img_root_dir,
|
||||
log_path
|
||||
)
|
||||
)
|
||||
|
||||
os.makedirs(logs_dir, exist_ok=True)
|
||||
|
||||
self._configure_acc_eval(log_path)
|
||||
self._accuracy_evaluator.process(models_list, self._data_fetcher)
|
||||
|
||||
if is_plot_acc:
|
||||
plot_acc(
|
||||
np.array(self._accuracy_evaluator.general_inference_time),
|
||||
formatted_exp_name
|
||||
)
|
||||
|
||||
print("===== End of the evaluation pipeline =====")
|
||||
|
||||
def _configure_acc_eval(self, log_path):
|
||||
pass
|
||||
|
||||
def _configure_test_module_params(self):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def _configure_eval_log(formatted_exp_name):
|
||||
common_test_config = CommonConfig()
|
||||
return common_test_config.log_file_path.format(formatted_exp_name), common_test_config.logs_dir
|
153
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/common/utils.py
vendored
Normal file
153
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/common/utils.py
vendored
Normal file
@ -0,0 +1,153 @@
|
||||
import argparse
|
||||
import importlib.util
|
||||
import os
|
||||
import random
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
import torch
|
||||
|
||||
from .test.configs.test_config import CommonConfig
|
||||
|
||||
SEED_VAL = 42
|
||||
DNN_LIB = "DNN"
|
||||
# common path for model savings
|
||||
MODEL_PATH_ROOT = os.path.join(CommonConfig().output_data_root_dir, "{}/models")
|
||||
|
||||
|
||||
def get_full_model_path(lib_name, model_full_name):
|
||||
model_path = MODEL_PATH_ROOT.format(lib_name)
|
||||
return {
|
||||
"path": model_path,
|
||||
"full_path": os.path.join(model_path, model_full_name)
|
||||
}
|
||||
|
||||
|
||||
def plot_acc(data_list, experiment_name):
|
||||
plt.figure(figsize=[8, 6])
|
||||
plt.plot(data_list[:, 0], "r", linewidth=2.5, label="Original Model")
|
||||
plt.plot(data_list[:, 1], "b", linewidth=2.5, label="Converted DNN Model")
|
||||
plt.xlabel("Iterations ", fontsize=15)
|
||||
plt.ylabel("Time (ms)", fontsize=15)
|
||||
plt.title(experiment_name, fontsize=15)
|
||||
plt.legend()
|
||||
full_path_to_fig = os.path.join(CommonConfig().output_data_root_dir, experiment_name + ".png")
|
||||
plt.savefig(full_path_to_fig, bbox_inches="tight")
|
||||
|
||||
|
||||
def get_final_summary_info(general_quality_metric, general_inference_time, metric_name):
|
||||
general_quality_metric = np.array(general_quality_metric)
|
||||
general_inference_time = np.array(general_inference_time)
|
||||
summary_line = "===== End of processing. General results:\n"
|
||||
"\t* mean {} for the original model: {}\t"
|
||||
"\t* mean time (min) for the original model inferences: {}\n"
|
||||
"\t* mean {} for the DNN model: {}\t"
|
||||
"\t* mean time (min) for the DNN model inferences: {}\n".format(
|
||||
metric_name, np.mean(general_quality_metric[:, 0]),
|
||||
np.mean(general_inference_time[:, 0]) / 60000,
|
||||
metric_name, np.mean(general_quality_metric[:, 1]),
|
||||
np.mean(general_inference_time[:, 1]) / 60000,
|
||||
)
|
||||
return summary_line
|
||||
|
||||
|
||||
def set_common_reproducibility():
|
||||
random.seed(SEED_VAL)
|
||||
np.random.seed(SEED_VAL)
|
||||
|
||||
|
||||
def set_pytorch_env():
|
||||
set_common_reproducibility()
|
||||
torch.manual_seed(SEED_VAL)
|
||||
torch.set_printoptions(precision=10)
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.manual_seed_all(SEED_VAL)
|
||||
torch.backends.cudnn.benchmark = False
|
||||
torch.backends.cudnn.deterministic = True
|
||||
|
||||
|
||||
def set_tf_env(is_use_gpu=True):
|
||||
set_common_reproducibility()
|
||||
tf.random.set_seed(SEED_VAL)
|
||||
os.environ["TF_DETERMINISTIC_OPS"] = "1"
|
||||
|
||||
if tf.config.list_physical_devices("GPU") and is_use_gpu:
|
||||
gpu_devices = tf.config.list_physical_devices("GPU")
|
||||
tf.config.experimental.set_visible_devices(gpu_devices[0], "GPU")
|
||||
tf.config.experimental.set_memory_growth(gpu_devices[0], True)
|
||||
os.environ["TF_USE_CUDNN"] = "1"
|
||||
else:
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
|
||||
|
||||
|
||||
def str_bool(input_val):
|
||||
if input_val.lower() in ('yes', 'true', 't', 'y', '1'):
|
||||
return True
|
||||
elif input_val.lower() in ('no', 'false', 'f', 'n', '0'):
|
||||
return False
|
||||
else:
|
||||
raise argparse.ArgumentTypeError('Boolean value was expected')
|
||||
|
||||
|
||||
def get_formatted_model_list(model_list):
|
||||
note_line = 'Please, choose the model from the below list:\n'
|
||||
spaces_to_set = ' ' * (len(note_line) - 2)
|
||||
return note_line + ''.join([spaces_to_set, '{} \n'] * len(model_list)).format(*model_list)
|
||||
|
||||
|
||||
def model_str(model_list):
|
||||
def type_model_list(input_val):
|
||||
if input_val.lower() in model_list:
|
||||
return input_val.lower()
|
||||
else:
|
||||
raise argparse.ArgumentTypeError(
|
||||
'The model is currently unavailable for test.\n' +
|
||||
get_formatted_model_list(model_list)
|
||||
)
|
||||
|
||||
return type_model_list
|
||||
|
||||
|
||||
def get_test_module(test_module_name, test_module_path):
|
||||
module_spec = importlib.util.spec_from_file_location(test_module_name, test_module_path)
|
||||
test_module = importlib.util.module_from_spec(module_spec)
|
||||
module_spec.loader.exec_module(test_module)
|
||||
return test_module
|
||||
|
||||
|
||||
def create_parser():
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
|
||||
parser.add_argument(
|
||||
"--test",
|
||||
type=str_bool,
|
||||
help="Define whether you'd like to run the model with OpenCV for testing.",
|
||||
default=False
|
||||
),
|
||||
parser.add_argument(
|
||||
"--default_img_preprocess",
|
||||
type=str_bool,
|
||||
help="Define whether you'd like to preprocess the input image with defined"
|
||||
" PyTorch or TF functions for model test with OpenCV.",
|
||||
default=False
|
||||
),
|
||||
parser.add_argument(
|
||||
"--evaluate",
|
||||
type=str_bool,
|
||||
help="Define whether you'd like to run evaluation of the models (ex.: TF vs OpenCV networks).",
|
||||
default=True
|
||||
)
|
||||
return parser
|
||||
|
||||
|
||||
def create_extended_parser(model_list):
|
||||
parser = create_parser()
|
||||
parser.add_argument(
|
||||
"--model_name",
|
||||
type=model_str(model_list=model_list),
|
||||
help="\nDefine the model name to test.\n" +
|
||||
get_formatted_model_list(model_list),
|
||||
required=True
|
||||
)
|
||||
return parser
|
78
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/README.md
vendored
Normal file
78
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/README.md
vendored
Normal file
@ -0,0 +1,78 @@
|
||||
# Run PaddlePaddle model using OpenCV
|
||||
|
||||
These two demonstrations show how to run inference on PaddlePaddle models using OpenCV.
|
||||
|
||||
## Environment Setup
|
||||
|
||||
```shell
|
||||
pip install paddlepaddle-gpu
|
||||
pip install paddlehub
|
||||
pip install paddle2onnx
|
||||
```
|
||||
|
||||
## 1. Run PaddlePaddle ResNet50 using OpenCV
|
||||
|
||||
### Run PaddlePaddle model demo
|
||||
|
||||
Run the code sample as follows:
|
||||
|
||||
```shell
|
||||
python paddle_resnet50.py
|
||||
```
|
||||
|
||||
There are three parts to the process:
|
||||
|
||||
1. Export the PaddlePaddle ResNet50 model to ONNX format.
|
||||
2. Use `cv2.dnn.readNetFromONNX` to load the model file.
|
||||
3. Preprocess the image file and run inference (a condensed sketch follows).
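
The full flow lives in `paddle_resnet50.py`; the snippet below is only a condensed sketch of the same steps, assuming the script has already exported the model to `./resnet50.onnx` and that the bundled `./data/cat.jpg` and `./data/labels.txt` are in place.

```python
import cv2 as cv
import numpy as np
import paddlehub.vision.transforms as T

# step 1 (export via paddle.onnx.export) is done by paddle_resnet50.py;
# here we assume ./resnet50.onnx already exists

# step 2: load the exported ONNX model with OpenCV's DNN module
net = cv.dnn.readNetFromONNX('./resnet50.onnx')

# step 3: preprocess the image into a (1, 3, 224, 224) blob and run inference
transforms = T.Compose([T.Resize((256, 256)),
                        T.CenterCrop(224),
                        T.Normalize(mean=[0.485, 0.456, 0.406],
                                    std=[0.229, 0.224, 0.225])],
                       to_rgb=True)
net.setInput(np.expand_dims(transforms('./data/cat.jpg'), axis=0))
out = net.forward()

labels = open('./data/labels.txt').read().strip().split('\n')
print('Predict Category:', labels[int(np.argmax(out[0]))])
```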
|
||||
|
||||
## 2. Run PaddleSeg Portrait Segmentation using OpenCV
|
||||
|
||||
### Convert to ONNX Model
|
||||
|
||||
#### 1. Get Paddle Inference model
|
||||
|
||||
For more details, please refer to [PaddleSeg](https://github.com/PaddlePaddle/PaddleSeg/blob/release/2.1/contrib/HumanSeg/README.md).
|
||||
|
||||
```shell
|
||||
wget https://x2paddle.bj.bcebos.com/inference/models/humanseg_hrnet18_small_v1.zip
|
||||
unzip humanseg_hrnet18_small_v1.zip
|
||||
```
|
||||
|
||||
Notes:
|
||||
|
||||
* The exported model must have a fixed input shape; dynamic input shapes are not supported at the moment.
|
||||
|
||||
#### 2. Convert to ONNX model using paddle2onnx
|
||||
|
||||
To convert the model, use the following command:
|
||||
|
||||
```
|
||||
paddle2onnx --model_dir humanseg_hrnet18_small_v1 \
|
||||
--model_filename model.pdmodel \
|
||||
--params_filename model.pdiparams \
|
||||
--opset_version 11 \
|
||||
--save_file humanseg_hrnet18_tiny.onnx
|
||||
```
|
||||
|
||||
The converted model can be found in the current directory under the name `humanseg_hrnet18_tiny.onnx`.
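
As an optional sanity check, you can confirm that OpenCV is able to read the converted file before running the demo; the short snippet below assumes the file name produced by the command above.

```python
import cv2 as cv

# loading the ONNX file is enough to surface unsupported layers or a broken export
net = cv.dnn.readNetFromONNX('humanseg_hrnet18_tiny.onnx')
print('Loaded layers:', len(net.getLayerNames()))
```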
|
||||
|
||||
### Run PaddleSeg Portrait Segmentation demo
|
||||
|
||||
Run the code sample as follows:
|
||||
|
||||
```shell
|
||||
python paddle_humanseg.py
|
||||
```
|
||||
|
||||
There are three parts to the process:
|
||||
|
||||
1. Use `cv2.dnn.readNetFromONNX` to load the model file.
|
||||
2. Preprocess the image file and run inference.
|
||||
3. Postprocess the output and visualize it (a condensed sketch follows).
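
A condensed sketch of those three steps, using the same paths as `paddle_humanseg.py` (the ONNX file converted above and the bundled `messi5.jpg`); the full script additionally blends the mask with the input and writes the result mentioned below.

```python
import cv2 as cv
import numpy as np
import paddlehub.vision.transforms as T

img_path = '../../../../data/messi5.jpg'

# 1. load the converted model with OpenCV's DNN module
net = cv.dnn.readNetFromONNX('humanseg_hrnet18_tiny.onnx')

# 2. preprocess into a (1, 3, 192, 192) blob and run inference
transforms = T.Compose([T.Resize((192, 192)),
                        T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])],
                       to_rgb=True)
net.setInput(np.expand_dims(transforms(img_path), axis=0))
result = net.forward(['save_infer_model/scale_0.tmp_1'])

# 3. postprocess: per-pixel argmax, resized back to the input resolution
image = cv.imread(img_path)
mask = np.argmax(result[0], axis=1).astype(np.uint8)[0]
mask = cv.resize(mask, (image.shape[1], image.shape[0]), interpolation=cv.INTER_NEAREST)
print('mask shape:', mask.shape)
```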
|
||||
|
||||
The resulting file can be found at `data/result_test_human.jpg`.
|
||||
|
||||
### Portrait segmentation visualization
|
||||
|
||||
<img src="../../../../data/messi5.jpg" width="50%" height="50%"><img src="./data/result_test_human.jpg" width="50%" height="50%">
|
BIN
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/data/cat.jpg
vendored
Normal file
BIN
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/data/cat.jpg
vendored
Normal file
Binary file not shown.
After Width: | Height: | Size: 126 KiB |
1000
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/data/labels.txt
vendored
Normal file
1000
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/data/labels.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
Binary file not shown.
After Width: | Height: | Size: 61 KiB |
112
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/paddle_humanseg.py
vendored
Normal file
112
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/paddle_humanseg.py
vendored
Normal file
@ -0,0 +1,112 @@
|
||||
import os
|
||||
import paddlehub.vision.transforms as T
|
||||
import numpy as np
|
||||
import cv2 as cv
|
||||
|
||||
|
||||
def get_color_map_list(num_classes):
|
||||
"""
|
||||
Returns the color map for visualizing the segmentation mask,
|
||||
which supports an arbitrary number of classes.
|
||||
|
||||
Args:
|
||||
num_classes (int): Number of classes.
|
||||
|
||||
Returns:
|
||||
(list). The color map.
|
||||
"""
|
||||
|
||||
num_classes += 1
|
||||
color_map = num_classes * [0, 0, 0]
|
||||
for i in range(0, num_classes):
|
||||
j = 0
|
||||
lab = i
|
||||
while lab:
|
||||
color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j))
|
||||
color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j))
|
||||
color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j))
|
||||
j += 1
|
||||
lab >>= 3
|
||||
color_map = color_map[3:]
|
||||
return color_map
|
||||
|
||||
|
||||
def visualize(image, result, save_dir=None, weight=0.6):
|
||||
"""
|
||||
Convert the prediction result to a color image, and optionally save the blended image.
|
||||
|
||||
Args:
|
||||
image (str): The path of origin image.
|
||||
result (np.ndarray): The predict result of image.
|
||||
save_dir (str): The directory for saving visual image. Default: None.
|
||||
weight (float): The blending weight of the original image; the prediction result gets (1 - weight). Default: 0.6
|
||||
|
||||
Returns:
|
||||
vis_result (np.ndarray): If `save_dir` is None, return the visualized result.
|
||||
"""
|
||||
|
||||
color_map = get_color_map_list(256)
|
||||
color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)]
|
||||
color_map = np.array(color_map).astype("uint8")
|
||||
# Use OpenCV LUT for color mapping
|
||||
c1 = cv.LUT(result, color_map[:, 0])
|
||||
c2 = cv.LUT(result, color_map[:, 1])
|
||||
c3 = cv.LUT(result, color_map[:, 2])
|
||||
pseudo_img = np.dstack((c1, c2, c3))
|
||||
|
||||
im = cv.imread(image)
|
||||
vis_result = cv.addWeighted(im, weight, pseudo_img, 1 - weight, 0)
|
||||
|
||||
if save_dir is not None:
|
||||
if not os.path.exists(save_dir):
|
||||
os.makedirs(save_dir)
|
||||
image_name = os.path.split(image)[-1]
|
||||
out_path = os.path.join(save_dir, image_name)
|
||||
cv.imwrite(out_path, vis_result)
|
||||
else:
|
||||
return vis_result
|
||||
|
||||
|
||||
def preprocess(image_path):
|
||||
''' preprocess input image file to np.ndarray
|
||||
|
||||
Args:
|
||||
image_path(str): Path of input image file
|
||||
|
||||
Returns:
|
||||
ProcessedImage(numpy.ndarray): A numpy.ndarray
|
||||
whose shape is (1, 3, 192, 192)
|
||||
'''
|
||||
transforms = T.Compose([
|
||||
T.Resize((192, 192)),
|
||||
T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
|
||||
],
|
||||
to_rgb=True)
|
||||
return np.expand_dims(transforms(image_path), axis=0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
img_path = "../../../../data/messi5.jpg"
|
||||
# load PPSeg Model use cv.dnn
|
||||
net = cv.dnn.readNetFromONNX('humanseg_hrnet18_tiny.onnx')
|
||||
# read and preprocess image file
|
||||
im = preprocess(img_path)
|
||||
# inference
|
||||
net.setInput(im)
|
||||
result = net.forward(['save_infer_model/scale_0.tmp_1'])
|
||||
# post process
|
||||
image = cv.imread(img_path)
|
||||
r, c, _ = image.shape
|
||||
result = np.argmax(result[0], axis=1).astype(np.uint8)
|
||||
result = cv.resize(result[0, :, :],
|
||||
dsize=(c, r),
|
||||
interpolation=cv.INTER_NEAREST)
|
||||
|
||||
print("grid_image.shape is: ", result.shape)
|
||||
folder_path = "data"
|
||||
if not os.path.exists(folder_path):
|
||||
os.makedirs(folder_path)
|
||||
file_path = os.path.join(folder_path, '%s.jpg' % "result_test_human")
|
||||
result_color = visualize(img_path, result)
|
||||
cv.imwrite(file_path, result_color)
|
||||
print('%s saved' % file_path)
|
@ -0,0 +1,61 @@
|
||||
import paddle
|
||||
import paddlehub as hub
|
||||
import paddlehub.vision.transforms as T
|
||||
import cv2 as cv
|
||||
import numpy as np
|
||||
|
||||
|
||||
def preprocess(image_path):
|
||||
''' preprocess input image file to np.ndarray
|
||||
|
||||
Args:
|
||||
image_path(str): Path of input image file
|
||||
|
||||
Returns:
|
||||
ProcessedImage(numpy.ndarray): A numpy.ndarray
|
||||
whose shape is (1, 3, 224, 224)
|
||||
'''
|
||||
transforms = T.Compose([
|
||||
T.Resize((256, 256)),
|
||||
T.CenterCrop(224),
|
||||
T.Normalize(mean=[0.485, 0.456, 0.406],
|
||||
std=[0.229, 0.224, 0.225])],
|
||||
to_rgb=True)
|
||||
return np.expand_dims(transforms(image_path), axis=0)
|
||||
|
||||
|
||||
def export_onnx_resnet50(save_path):
|
||||
''' export PaddlePaddle model to ONNX format
|
||||
|
||||
Args:
|
||||
save_path(str): Path to save exported ONNX model
|
||||
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
model = hub.Module(name="resnet50_vd_imagenet_ssld")
|
||||
input_spec = paddle.static.InputSpec(
|
||||
[1, 3, 224, 224], "float32", "image")
|
||||
paddle.onnx.export(model, save_path,
|
||||
input_spec=[input_spec],
|
||||
opset_version=10)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
save_path = './resnet50'
|
||||
image_file = './data/cat.jpg'
|
||||
labels = open('./data/labels.txt').read().strip().split('\n')
|
||||
model = export_onnx_resnet50(save_path)
|
||||
|
||||
# load resnet50 use cv.dnn
|
||||
net = cv.dnn.readNetFromONNX(save_path + '.onnx')
|
||||
# read and preprocess image file
|
||||
im = preprocess(image_file)
|
||||
# inference
|
||||
net.setInput(im)
|
||||
result = net.forward(['save_infer_model/scale_0.tmp_0'])
|
||||
# post process
|
||||
class_id = np.argmax(result[0])
|
||||
label = labels[class_id]
|
||||
print("Image: {}".format(image_file))
|
||||
print("Predict Category: {}".format(label))
|
@ -0,0 +1,71 @@
|
||||
from torchvision import models
|
||||
|
||||
from ..pytorch_model import (
|
||||
PyTorchModelPreparer,
|
||||
PyTorchModelProcessor,
|
||||
PyTorchDnnModelProcessor
|
||||
)
|
||||
from ...common.evaluation.classification.cls_data_fetcher import PyTorchPreprocessedFetch
|
||||
from ...common.test.cls_model_test_pipeline import ClsModelTestPipeline
|
||||
from ...common.test.configs.default_preprocess_config import pytorch_resize_input_blob
|
||||
from ...common.test.configs.test_config import TestClsConfig
|
||||
from ...common.utils import set_pytorch_env, create_extended_parser
|
||||
|
||||
model_dict = {
|
||||
"alexnet": models.alexnet,
|
||||
|
||||
"vgg11": models.vgg11,
|
||||
"vgg13": models.vgg13,
|
||||
"vgg16": models.vgg16,
|
||||
"vgg19": models.vgg19,
|
||||
|
||||
"resnet18": models.resnet18,
|
||||
"resnet34": models.resnet34,
|
||||
"resnet50": models.resnet50,
|
||||
"resnet101": models.resnet101,
|
||||
"resnet152": models.resnet152,
|
||||
|
||||
"squeezenet1_0": models.squeezenet1_0,
|
||||
"squeezenet1_1": models.squeezenet1_1,
|
||||
|
||||
"resnext50_32x4d": models.resnext50_32x4d,
|
||||
"resnext101_32x8d": models.resnext101_32x8d,
|
||||
|
||||
"wide_resnet50_2": models.wide_resnet50_2,
|
||||
"wide_resnet101_2": models.wide_resnet101_2
|
||||
}
|
||||
|
||||
|
||||
class PyTorchClsModel(PyTorchModelPreparer):
|
||||
def __init__(self, height, width, model_name, original_model):
|
||||
super(PyTorchClsModel, self).__init__(height, width, model_name, original_model)
|
||||
|
||||
|
||||
def main():
|
||||
set_pytorch_env()
|
||||
|
||||
parser = create_extended_parser(list(model_dict.keys()))
|
||||
cmd_args = parser.parse_args()
|
||||
model_name = cmd_args.model_name
|
||||
|
||||
cls_model = PyTorchClsModel(
|
||||
height=TestClsConfig().frame_size,
|
||||
width=TestClsConfig().frame_size,
|
||||
model_name=model_name,
|
||||
original_model=model_dict[model_name](pretrained=True)
|
||||
)
|
||||
|
||||
pytorch_cls_pipeline = ClsModelTestPipeline(
|
||||
network_model=cls_model,
|
||||
model_processor=PyTorchModelProcessor,
|
||||
dnn_model_processor=PyTorchDnnModelProcessor,
|
||||
data_fetcher=PyTorchPreprocessedFetch,
|
||||
cls_args_parser=parser,
|
||||
default_input_blob_preproc=pytorch_resize_input_blob
|
||||
)
|
||||
|
||||
pytorch_cls_pipeline.init_test_pipeline()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -0,0 +1,139 @@
|
||||
import os
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.onnx
|
||||
from torch.autograd import Variable
|
||||
from torchvision import models
|
||||
|
||||
|
||||
def get_pytorch_onnx_model(original_model):
|
||||
# define the directory for further converted model save
|
||||
onnx_model_path = "models"
|
||||
# define the name of further converted model
|
||||
onnx_model_name = "resnet50.onnx"
|
||||
|
||||
# create directory for further converted model
|
||||
os.makedirs(onnx_model_path, exist_ok=True)
|
||||
|
||||
# get full path to the converted model
|
||||
full_model_path = os.path.join(onnx_model_path, onnx_model_name)
|
||||
|
||||
# generate model input
|
||||
generated_input = Variable(
|
||||
torch.randn(1, 3, 224, 224)
|
||||
)
|
||||
|
||||
# model export into ONNX format
|
||||
torch.onnx.export(
|
||||
original_model,
|
||||
generated_input,
|
||||
full_model_path,
|
||||
verbose=True,
|
||||
input_names=["input"],
|
||||
output_names=["output"],
|
||||
opset_version=11
|
||||
)
|
||||
|
||||
return full_model_path
|
||||
|
||||
|
||||
def get_preprocessed_img(img_path):
|
||||
# read the image
|
||||
input_img = cv2.imread(img_path, cv2.IMREAD_COLOR)
|
||||
input_img = input_img.astype(np.float32)
|
||||
|
||||
input_img = cv2.resize(input_img, (256, 256))
|
||||
|
||||
# define preprocess parameters
|
||||
mean = np.array([0.485, 0.456, 0.406]) * 255.0
|
||||
scale = 1 / 255.0
|
||||
std = [0.229, 0.224, 0.225]
|
||||
|
||||
# prepare input blob to fit the model input:
|
||||
# 1. subtract mean
|
||||
# 2. scale to set pixel values from 0 to 1
|
||||
input_blob = cv2.dnn.blobFromImage(
|
||||
image=input_img,
|
||||
scalefactor=scale,
|
||||
size=(224, 224), # img target size
|
||||
mean=mean,
|
||||
swapRB=True, # BGR -> RGB
|
||||
crop=True # center crop
|
||||
)
|
||||
# 3. divide by std
|
||||
input_blob[0] /= np.asarray(std, dtype=np.float32).reshape(3, 1, 1)
|
||||
return input_blob
|
||||
|
||||
|
||||
def get_imagenet_labels(labels_path):
|
||||
with open(labels_path) as f:
|
||||
imagenet_labels = [line.strip() for line in f.readlines()]
|
||||
return imagenet_labels
|
||||
|
||||
|
||||
def get_opencv_dnn_prediction(opencv_net, preproc_img, imagenet_labels):
|
||||
# set OpenCV DNN input
|
||||
opencv_net.setInput(preproc_img)
|
||||
|
||||
# OpenCV DNN inference
|
||||
out = opencv_net.forward()
|
||||
print("OpenCV DNN prediction: \n")
|
||||
print("* shape: ", out.shape)
|
||||
|
||||
# get the predicted class ID
|
||||
imagenet_class_id = np.argmax(out)
|
||||
|
||||
# get confidence
|
||||
confidence = out[0][imagenet_class_id]
|
||||
print("* class ID: {}, label: {}".format(imagenet_class_id, imagenet_labels[imagenet_class_id]))
|
||||
print("* confidence: {:.4f}".format(confidence))
|
||||
|
||||
|
||||
def get_pytorch_dnn_prediction(original_net, preproc_img, imagenet_labels):
|
||||
original_net.eval()
|
||||
preproc_img = torch.FloatTensor(preproc_img)
|
||||
|
||||
# inference
|
||||
with torch.no_grad():
|
||||
out = original_net(preproc_img)
|
||||
|
||||
print("\nPyTorch model prediction: \n")
|
||||
print("* shape: ", out.shape)
|
||||
|
||||
# get the predicted class ID
|
||||
imagenet_class_id = torch.argmax(out, axis=1).item()
|
||||
print("* class ID: {}, label: {}".format(imagenet_class_id, imagenet_labels[imagenet_class_id]))
|
||||
|
||||
# get confidence
|
||||
confidence = out[0][imagenet_class_id]
|
||||
print("* confidence: {:.4f}".format(confidence.item()))
|
||||
|
||||
|
||||
def main():
|
||||
# initialize PyTorch ResNet-50 model
|
||||
original_model = models.resnet50(pretrained=True)
|
||||
|
||||
# get the path to the PyTorch model converted to ONNX
|
||||
full_model_path = get_pytorch_onnx_model(original_model)
|
||||
|
||||
# read converted .onnx model with OpenCV API
|
||||
opencv_net = cv2.dnn.readNetFromONNX(full_model_path)
|
||||
print("OpenCV model was successfully read. Layer IDs: \n", opencv_net.getLayerNames())
|
||||
|
||||
# get preprocessed image
|
||||
input_img = get_preprocessed_img("../data/squirrel_cls.jpg")
|
||||
|
||||
# get ImageNet labels
|
||||
imagenet_labels = get_imagenet_labels("../data/dnn/classification_classes_ILSVRC2012.txt")
|
||||
|
||||
# obtain OpenCV DNN predictions
|
||||
get_opencv_dnn_prediction(opencv_net, input_img, imagenet_labels)
|
||||
|
||||
# obtain original PyTorch ResNet50 predictions
|
||||
get_pytorch_dnn_prediction(original_model, input_img, imagenet_labels)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -0,0 +1,50 @@
|
||||
import os
|
||||
|
||||
import torch
|
||||
import torch.onnx
|
||||
from torch.autograd import Variable
|
||||
from torchvision import models
|
||||
|
||||
|
||||
def get_pytorch_onnx_model(original_model):
|
||||
# define the directory for further converted model save
|
||||
onnx_model_path = "models"
|
||||
# define the name of further converted model
|
||||
onnx_model_name = "resnet50.onnx"
|
||||
|
||||
# create directory for further converted model
|
||||
os.makedirs(onnx_model_path, exist_ok=True)
|
||||
|
||||
# get full path to the converted model
|
||||
full_model_path = os.path.join(onnx_model_path, onnx_model_name)
|
||||
|
||||
# generate model input
|
||||
generated_input = Variable(
|
||||
torch.randn(1, 3, 224, 224)
|
||||
)
|
||||
|
||||
# model export into ONNX format
|
||||
torch.onnx.export(
|
||||
original_model,
|
||||
generated_input,
|
||||
full_model_path,
|
||||
verbose=True,
|
||||
input_names=["input"],
|
||||
output_names=["output"],
|
||||
opset_version=11
|
||||
)
|
||||
|
||||
return full_model_path
|
||||
|
||||
|
||||
def main():
|
||||
# initialize PyTorch ResNet-50 model
|
||||
original_model = models.resnet50(pretrained=True)
|
||||
|
||||
# get the path to the PyTorch model converted to ONNX
|
||||
full_model_path = get_pytorch_onnx_model(original_model)
|
||||
print("PyTorch ResNet-50 model was successfully converted: ", full_model_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
98
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/pytorch/pytorch_model.py
vendored
Normal file
98
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/pytorch/pytorch_model.py
vendored
Normal file
@ -0,0 +1,98 @@
|
||||
import os
|
||||
|
||||
import cv2
|
||||
import torch.onnx
|
||||
from torch.autograd import Variable
|
||||
|
||||
from ..common.abstract_model import AbstractModel, Framework
|
||||
from ..common.utils import DNN_LIB, get_full_model_path
|
||||
|
||||
CURRENT_LIB = "PyTorch"
|
||||
MODEL_FORMAT = ".onnx"
|
||||
|
||||
|
||||
class PyTorchModelPreparer(AbstractModel):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
height,
|
||||
width,
|
||||
model_name="default",
|
||||
original_model=object,
|
||||
batch_size=1,
|
||||
default_input_name="input",
|
||||
default_output_name="output"
|
||||
):
|
||||
self._height = height
|
||||
self._width = width
|
||||
self._model_name = model_name
|
||||
self._original_model = original_model
|
||||
self._batch_size = batch_size
|
||||
self._default_input_name = default_input_name
|
||||
self._default_output_name = default_output_name
|
||||
|
||||
self.model_path = self._set_model_path()
|
||||
self._dnn_model = self._set_dnn_model()
|
||||
|
||||
def _set_dnn_model(self):
|
||||
generated_input = Variable(torch.randn(
|
||||
self._batch_size, 3, self._height, self._width)
|
||||
)
|
||||
os.makedirs(self.model_path["path"], exist_ok=True)
|
||||
torch.onnx.export(
|
||||
self._original_model,
|
||||
generated_input,
|
||||
self.model_path["full_path"],
|
||||
verbose=True,
|
||||
input_names=[self._default_input_name],
|
||||
output_names=[self._default_output_name],
|
||||
opset_version=11
|
||||
)
|
||||
|
||||
return cv2.dnn.readNetFromONNX(self.model_path["full_path"])
|
||||
|
||||
def _set_model_path(self):
|
||||
model_to_save = self._model_name + MODEL_FORMAT
|
||||
return get_full_model_path(CURRENT_LIB.lower(), model_to_save)
|
||||
|
||||
def get_prepared_models(self):
|
||||
return {
|
||||
CURRENT_LIB + " " + self._model_name: self._original_model,
|
||||
DNN_LIB + " " + self._model_name: self._dnn_model
|
||||
}
|
||||
|
||||
|
||||
class PyTorchModelProcessor(Framework):
|
||||
def __init__(self, prepared_model, model_name):
|
||||
self._prepared_model = prepared_model
|
||||
self._name = model_name
|
||||
|
||||
def get_output(self, input_blob):
|
||||
tensor = torch.FloatTensor(input_blob)
|
||||
self._prepared_model.eval()
|
||||
|
||||
with torch.no_grad():
|
||||
model_out = self._prepared_model(tensor)
|
||||
|
||||
# segmentation case
|
||||
if len(model_out) == 2:
|
||||
model_out = model_out['out']
|
||||
|
||||
out = model_out.detach().numpy()
|
||||
return out
|
||||
|
||||
def get_name(self):
|
||||
return self._name
|
||||
|
||||
|
||||
class PyTorchDnnModelProcessor(Framework):
|
||||
def __init__(self, prepared_dnn_model, model_name):
|
||||
self._prepared_dnn_model = prepared_dnn_model
|
||||
self._name = model_name
|
||||
|
||||
def get_output(self, input_blob):
|
||||
self._prepared_dnn_model.setInput(input_blob, '')
|
||||
return self._prepared_dnn_model.forward()
|
||||
|
||||
def get_name(self):
|
||||
return self._name
|
15
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/requirements.txt
vendored
Normal file
15
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/requirements.txt
vendored
Normal file
@ -0,0 +1,15 @@
|
||||
# Python 3.7.5
|
||||
onnx>=1.7.0
|
||||
numpy>=1.19.1
|
||||
|
||||
torch>=1.5.1
|
||||
torchvision>=0.6.1
|
||||
|
||||
tensorflow>=2.1.0
|
||||
tensorflow-gpu>=2.1.0
|
||||
|
||||
paddlepaddle>=2.0.0
|
||||
paddlepaddle-gpu>=2.0.0
|
||||
paddlehub>=2.1.0
|
||||
paddle2onnx>=0.5.1
|
||||
paddleseg>=2.0.0
|
104
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/tf/classification/py_to_py_cls.py
vendored
Normal file
104
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/tf/classification/py_to_py_cls.py
vendored
Normal file
@ -0,0 +1,104 @@
|
||||
from tensorflow.keras.applications import (
|
||||
VGG16, vgg16,
|
||||
VGG19, vgg19,
|
||||
|
||||
ResNet50, resnet,
|
||||
ResNet101,
|
||||
ResNet152,
|
||||
|
||||
DenseNet121, densenet,
|
||||
DenseNet169,
|
||||
DenseNet201,
|
||||
|
||||
InceptionResNetV2, inception_resnet_v2,
|
||||
InceptionV3, inception_v3,
|
||||
|
||||
MobileNet, mobilenet,
|
||||
MobileNetV2, mobilenet_v2,
|
||||
|
||||
NASNetLarge, nasnet,
|
||||
NASNetMobile,
|
||||
|
||||
Xception, xception
|
||||
)
|
||||
|
||||
from ..tf_model import TFModelPreparer
|
||||
from ..tf_model import (
|
||||
TFModelProcessor,
|
||||
TFDnnModelProcessor
|
||||
)
|
||||
from ...common.evaluation.classification.cls_data_fetcher import TFPreprocessedFetch
|
||||
from ...common.test.cls_model_test_pipeline import ClsModelTestPipeline
|
||||
from ...common.test.configs.default_preprocess_config import (
|
||||
tf_input_blob,
|
||||
pytorch_input_blob,
|
||||
tf_model_blob_caffe_mode
|
||||
)
|
||||
from ...common.utils import set_tf_env, create_extended_parser
|
||||
|
||||
model_dict = {
|
||||
"vgg16": [VGG16, vgg16, tf_model_blob_caffe_mode],
|
||||
"vgg19": [VGG19, vgg19, tf_model_blob_caffe_mode],
|
||||
|
||||
"resnet50": [ResNet50, resnet, tf_model_blob_caffe_mode],
|
||||
"resnet101": [ResNet101, resnet, tf_model_blob_caffe_mode],
|
||||
"resnet152": [ResNet152, resnet, tf_model_blob_caffe_mode],
|
||||
|
||||
"densenet121": [DenseNet121, densenet, pytorch_input_blob],
|
||||
"densenet169": [DenseNet169, densenet, pytorch_input_blob],
|
||||
"densenet201": [DenseNet201, densenet, pytorch_input_blob],
|
||||
|
||||
"inceptionresnetv2": [InceptionResNetV2, inception_resnet_v2, tf_input_blob],
|
||||
"inceptionv3": [InceptionV3, inception_v3, tf_input_blob],
|
||||
|
||||
"mobilenet": [MobileNet, mobilenet, tf_input_blob],
|
||||
"mobilenetv2": [MobileNetV2, mobilenet_v2, tf_input_blob],
|
||||
|
||||
"nasnetlarge": [NASNetLarge, nasnet, tf_input_blob],
|
||||
"nasnetmobile": [NASNetMobile, nasnet, tf_input_blob],
|
||||
|
||||
"xception": [Xception, xception, tf_input_blob]
|
||||
}
|
||||
|
||||
CNN_CLASS_ID = 0
|
||||
CNN_UTILS_ID = 1
|
||||
DEFAULT_BLOB_PARAMS_ID = 2
|
||||
|
||||
|
||||
class TFClsModel(TFModelPreparer):
|
||||
def __init__(self, model_name, original_model):
|
||||
super(TFClsModel, self).__init__(model_name, original_model)
|
||||
|
||||
|
||||
def main():
|
||||
set_tf_env()
|
||||
|
||||
parser = create_extended_parser(list(model_dict.keys()))
|
||||
cmd_args = parser.parse_args()
|
||||
|
||||
model_name = cmd_args.model_name
|
||||
model_name_val = model_dict[model_name]
|
||||
|
||||
cls_model = TFClsModel(
|
||||
model_name=model_name,
|
||||
original_model=model_name_val[CNN_CLASS_ID](
|
||||
include_top=True,
|
||||
weights="imagenet"
|
||||
)
|
||||
)
|
||||
|
||||
tf_cls_pipeline = ClsModelTestPipeline(
|
||||
network_model=cls_model,
|
||||
model_processor=TFModelProcessor,
|
||||
dnn_model_processor=TFDnnModelProcessor,
|
||||
data_fetcher=TFPreprocessedFetch,
|
||||
img_processor=model_name_val[CNN_UTILS_ID].preprocess_input,
|
||||
cls_args_parser=parser,
|
||||
default_input_blob_preproc=model_name_val[DEFAULT_BLOB_PARAMS_ID]
|
||||
)
|
||||
|
||||
tf_cls_pipeline.init_test_pipeline()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -0,0 +1,142 @@
|
||||
import os
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from tensorflow.keras.applications import MobileNet
|
||||
from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2
|
||||
|
||||
from ...common.utils import set_tf_env
|
||||
|
||||
|
||||
def get_tf_model_proto(tf_model):
|
||||
# define the directory for .pb model
|
||||
pb_model_path = "models"
|
||||
|
||||
# define the name of .pb model
|
||||
pb_model_name = "mobilenet.pb"
|
||||
|
||||
# create directory for further converted model
|
||||
os.makedirs(pb_model_path, exist_ok=True)
|
||||
|
||||
# get model TF graph
|
||||
tf_model_graph = tf.function(lambda x: tf_model(x))
|
||||
|
||||
# get concrete function
|
||||
tf_model_graph = tf_model_graph.get_concrete_function(
|
||||
tf.TensorSpec(tf_model.inputs[0].shape, tf_model.inputs[0].dtype))
|
||||
|
||||
# obtain frozen concrete function
|
||||
frozen_tf_func = convert_variables_to_constants_v2(tf_model_graph)
|
||||
# get frozen graph
|
||||
frozen_tf_func.graph.as_graph_def()
|
||||
|
||||
# save full tf model
|
||||
tf.io.write_graph(graph_or_graph_def=frozen_tf_func.graph,
|
||||
logdir=pb_model_path,
|
||||
name=pb_model_name,
|
||||
as_text=False)
|
||||
|
||||
return os.path.join(pb_model_path, pb_model_name)
|
||||
|
||||
|
||||
def get_preprocessed_img(img_path):
|
||||
# read the image
|
||||
input_img = cv2.imread(img_path, cv2.IMREAD_COLOR)
|
||||
input_img = input_img.astype(np.float32)
|
||||
|
||||
# define preprocess parameters
|
||||
mean = np.array([1.0, 1.0, 1.0]) * 127.5
|
||||
scale = 1 / 127.5
|
||||
|
||||
# prepare input blob to fit the model input:
|
||||
# 1. subtract mean
|
||||
# 2. scale to set pixel values from 0 to 1
|
||||
input_blob = cv2.dnn.blobFromImage(
|
||||
image=input_img,
|
||||
scalefactor=scale,
|
||||
size=(224, 224), # img target size
|
||||
mean=mean,
|
||||
swapRB=True, # BGR -> RGB
|
||||
crop=True # center crop
|
||||
)
|
||||
print("Input blob shape: {}\n".format(input_blob.shape))
|
||||
|
||||
return input_blob
|
||||
|
||||
|
||||
def get_imagenet_labels(labels_path):
|
||||
with open(labels_path) as f:
|
||||
imagenet_labels = [line.strip() for line in f.readlines()]
|
||||
return imagenet_labels
|
||||
|
||||
|
||||
def get_opencv_dnn_prediction(opencv_net, preproc_img, imagenet_labels):
|
||||
# set OpenCV DNN input
|
||||
opencv_net.setInput(preproc_img)
|
||||
|
||||
# OpenCV DNN inference
|
||||
out = opencv_net.forward()
|
||||
print("OpenCV DNN prediction: \n")
|
||||
print("* shape: ", out.shape)
|
||||
|
||||
# get the predicted class ID
|
||||
imagenet_class_id = np.argmax(out)
|
||||
|
||||
# get confidence
|
||||
confidence = out[0][imagenet_class_id]
|
||||
print("* class ID: {}, label: {}".format(imagenet_class_id, imagenet_labels[imagenet_class_id]))
|
||||
print("* confidence: {:.4f}\n".format(confidence))
|
||||
|
||||
|
||||
def get_tf_dnn_prediction(original_net, preproc_img, imagenet_labels):
|
||||
# inference
|
||||
preproc_img = preproc_img.transpose(0, 2, 3, 1)
|
||||
print("TF input blob shape: {}\n".format(preproc_img.shape))
|
||||
|
||||
out = original_net(preproc_img)
|
||||
|
||||
print("\nTensorFlow model prediction: \n")
|
||||
print("* shape: ", out.shape)
|
||||
|
||||
# get the predicted class ID
|
||||
imagenet_class_id = np.argmax(out)
|
||||
print("* class ID: {}, label: {}".format(imagenet_class_id, imagenet_labels[imagenet_class_id]))
|
||||
|
||||
# get confidence
|
||||
confidence = out[0][imagenet_class_id]
|
||||
print("* confidence: {:.4f}".format(confidence))
|
||||
|
||||
|
||||
def main():
|
||||
# configure TF launching
|
||||
set_tf_env()
|
||||
|
||||
# initialize TF MobileNet model
|
||||
original_tf_model = MobileNet(
|
||||
include_top=True,
|
||||
weights="imagenet"
|
||||
)
|
||||
|
||||
# get TF frozen graph path
|
||||
full_pb_path = get_tf_model_proto(original_tf_model)
|
||||
|
||||
# read frozen graph with OpenCV API
|
||||
opencv_net = cv2.dnn.readNetFromTensorflow(full_pb_path)
|
||||
print("OpenCV model was successfully read. Model layers: \n", opencv_net.getLayerNames())
|
||||
|
||||
# get preprocessed image
|
||||
input_img = get_preprocessed_img("../data/squirrel_cls.jpg")
|
||||
|
||||
# get ImageNet labels
|
||||
imagenet_labels = get_imagenet_labels("../data/dnn/classification_classes_ILSVRC2012.txt")
|
||||
|
||||
# obtain OpenCV DNN predictions
|
||||
get_opencv_dnn_prediction(opencv_net, input_img, imagenet_labels)
|
||||
|
||||
# obtain TF model predictions
|
||||
get_tf_dnn_prediction(original_tf_model, input_img, imagenet_labels)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -0,0 +1,45 @@
|
||||
import os
|
||||
import tarfile
|
||||
import urllib
|
||||
|
||||
DETECTION_MODELS_URL = 'http://download.tensorflow.org/models/object_detection/'
|
||||
|
||||
|
||||
def extract_tf_frozen_graph(model_name, extracted_model_path):
|
||||
# define model archive name
|
||||
tf_model_tar = model_name + '.tar.gz'
|
||||
# define link to retrieve model archive
|
||||
model_link = DETECTION_MODELS_URL + tf_model_tar
|
||||
|
||||
tf_frozen_graph_name = 'frozen_inference_graph'
|
||||
|
||||
try:
|
||||
urllib.request.urlretrieve(model_link, tf_model_tar)
|
||||
except Exception:
|
||||
print("TF {} was not retrieved: {}".format(model_name, model_link))
|
||||
return
|
||||
|
||||
print("TF {} was retrieved.".format(model_name))
|
||||
|
||||
tf_model_tar = tarfile.open(tf_model_tar)
|
||||
frozen_graph_path = ""
|
||||
|
||||
for model_tar_elem in tf_model_tar.getmembers():
|
||||
if tf_frozen_graph_name in os.path.basename(model_tar_elem.name):
|
||||
tf_model_tar.extract(model_tar_elem, extracted_model_path)
|
||||
frozen_graph_path = os.path.join(extracted_model_path, model_tar_elem.name)
|
||||
break
|
||||
tf_model_tar.close()
|
||||
|
||||
return frozen_graph_path
|
||||
|
||||
|
||||
def main():
|
||||
tf_model_name = 'ssd_mobilenet_v1_coco_2017_11_17'
|
||||
graph_extraction_dir = "./"
|
||||
frozen_graph_path = extract_tf_frozen_graph(tf_model_name, graph_extraction_dir)
|
||||
print("Frozen graph path for {}: {}".format(tf_model_name, frozen_graph_path))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
112
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/tf/tf_model.py
vendored
Normal file
112
3rdparty/opencv-4.5.4/samples/dnn/dnn_model_runner/dnn_conversion/tf/tf_model.py
vendored
Normal file
@ -0,0 +1,112 @@
|
||||
import cv2
|
||||
import tensorflow as tf
|
||||
from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2
|
||||
|
||||
from ..common.abstract_model import AbstractModel, Framework
|
||||
from ..common.utils import DNN_LIB, get_full_model_path
|
||||
|
||||
CURRENT_LIB = "TF"
|
||||
MODEL_FORMAT = ".pb"
|
||||
|
||||
|
||||
class TFModelPreparer(AbstractModel):
|
||||
""" Class for the preparation of the TF models: original and converted OpenCV Net.
|
||||
|
||||
Args:
|
||||
model_name: TF model name
|
||||
original_model: TF configured model object or session
|
||||
is_ready_graph: indicates whether ready .pb file already exists
|
||||
tf_model_graph_path: path to the existing frozen TF graph
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_name="default",
|
||||
original_model=None,
|
||||
is_ready_graph=False,
|
||||
tf_model_graph_path=""
|
||||
):
|
||||
self._model_name = model_name
|
||||
self._original_model = original_model
|
||||
self._model_to_save = ""
|
||||
|
||||
self._is_ready_to_transfer_graph = is_ready_graph
|
||||
self.model_path = self._set_model_path(tf_model_graph_path)
|
||||
self._dnn_model = self._set_dnn_model()
|
||||
|
||||
def _set_dnn_model(self):
|
||||
if not self._is_ready_to_transfer_graph:
|
||||
# get model TF graph
|
||||
tf_model_graph = tf.function(lambda x: self._original_model(x))
|
||||
|
||||
tf_model_graph = tf_model_graph.get_concrete_function(
|
||||
tf.TensorSpec(self._original_model.inputs[0].shape, self._original_model.inputs[0].dtype))
|
||||
|
||||
# obtain frozen concrete function
|
||||
frozen_tf_func = convert_variables_to_constants_v2(tf_model_graph)
|
||||
frozen_tf_func.graph.as_graph_def()
|
||||
|
||||
# save full TF model
|
||||
tf.io.write_graph(graph_or_graph_def=frozen_tf_func.graph,
|
||||
logdir=self.model_path["path"],
|
||||
name=self._model_to_save,
|
||||
as_text=False)
|
||||
|
||||
return cv2.dnn.readNetFromTensorflow(self.model_path["full_path"])
|
||||
|
||||
def _set_model_path(self, tf_pb_file_path):
|
||||
""" Method for setting model paths.
|
||||
|
||||
Args:
|
||||
tf_pb_file_path: path to the existing TF .pb
|
||||
|
||||
Returns:
|
||||
dictionary, where full_path key means saved model path and its full name.
|
||||
"""
|
||||
model_paths_dict = {
|
||||
"path": "",
|
||||
"full_path": tf_pb_file_path
|
||||
}
|
||||
|
||||
if not self._is_ready_to_transfer_graph:
|
||||
self._model_to_save = self._model_name + MODEL_FORMAT
|
||||
model_paths_dict = get_full_model_path(CURRENT_LIB.lower(), self._model_to_save)
|
||||
|
||||
return model_paths_dict
|
||||
|
||||
def get_prepared_models(self):
|
||||
original_lib_name = CURRENT_LIB + " " + self._model_name
|
||||
configured_model_dict = {
|
||||
original_lib_name: self._original_model,
|
||||
DNN_LIB + " " + self._model_name: self._dnn_model
|
||||
}
|
||||
return configured_model_dict
|
||||
|
||||
|
||||
class TFModelProcessor(Framework):
|
||||
def __init__(self, prepared_model, model_name):
|
||||
self._prepared_model = prepared_model
|
||||
self._name = model_name
|
||||
|
||||
def get_output(self, input_blob):
|
||||
assert len(input_blob.shape) == 4
|
||||
batch_tf = input_blob.transpose(0, 2, 3, 1)
|
||||
out = self._prepared_model(batch_tf)
|
||||
return out
|
||||
|
||||
def get_name(self):
|
||||
return CURRENT_LIB
|
||||
|
||||
|
||||
class TFDnnModelProcessor(Framework):
|
||||
def __init__(self, prepared_dnn_model, model_name):
|
||||
self._prepared_dnn_model = prepared_dnn_model
|
||||
self._name = model_name
|
||||
|
||||
def get_output(self, input_blob):
|
||||
self._prepared_dnn_model.setInput(input_blob)
|
||||
ret_val = self._prepared_dnn_model.forward()
|
||||
return ret_val
|
||||
|
||||
def get_name(self):
|
||||
return DNN_LIB
|
364
3rdparty/opencv-4.5.4/samples/dnn/download_models.py
vendored
Normal file
@ -0,0 +1,364 @@
|
||||
'''
|
||||
Helper module to download extra data from Internet
|
||||
'''
|
||||
from __future__ import print_function
|
||||
import os
|
||||
import cv2
|
||||
import sys
|
||||
import yaml
|
||||
import argparse
|
||||
import tarfile
|
||||
import platform
|
||||
import tempfile
|
||||
import hashlib
|
||||
import requests
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
if sys.version_info[0] < 3:
|
||||
from urllib2 import urlopen
|
||||
else:
|
||||
from urllib.request import urlopen
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
__all__ = ["downloadFile"]
|
||||
|
||||
class HashMismatchException(Exception):
|
||||
def __init__(self, expected, actual):
|
||||
Exception.__init__(self)
|
||||
self.expected = expected
|
||||
self.actual = actual
|
||||
def __str__(self):
|
||||
return 'Hash mismatch: expected {} vs actual of {}'.format(self.expected, self.actual)
|
||||
|
||||
def getHashsumFromFile(filepath):
|
||||
sha = hashlib.sha1()
|
||||
if os.path.exists(filepath):
|
||||
print(' there is already a file with the same name')
|
||||
with open(filepath, 'rb') as f:
|
||||
while True:
|
||||
buf = f.read(10*1024*1024)
|
||||
if not buf:
|
||||
break
|
||||
sha.update(buf)
|
||||
hashsum = sha.hexdigest()
|
||||
return hashsum
|
||||
|
||||
def checkHashsum(expected_sha, filepath, silent=True):
|
||||
print(' expected SHA1: {}'.format(expected_sha))
|
||||
actual_sha = getHashsumFromFile(filepath)
|
||||
print(' actual SHA1: {}'.format(actual_sha))
|
||||
hashes_matched = expected_sha == actual_sha
|
||||
if not hashes_matched and not silent:
|
||||
raise HashMismatchException(expected_sha, actual_sha)
|
||||
return hashes_matched
|
||||
|
||||
def isArchive(filepath):
|
||||
return tarfile.is_tarfile(filepath)
|
||||
|
||||
class DownloadInstance:
|
||||
def __init__(self, **kwargs):
|
||||
self.name = kwargs.pop('name')
|
||||
self.filename = kwargs.pop('filename')
|
||||
self.loader = kwargs.pop('loader', None)
|
||||
self.save_dir = kwargs.pop('save_dir')
|
||||
self.sha = kwargs.pop('sha', None)
|
||||
|
||||
def __str__(self):
|
||||
return 'DownloadInstance <{}>'.format(self.name)
|
||||
|
||||
def get(self):
|
||||
print(" Working on " + self.name)
|
||||
print(" Getting file " + self.filename)
|
||||
if self.sha is None:
|
||||
print(' No expected hashsum provided, loading file')
|
||||
else:
|
||||
filepath = os.path.join(self.save_dir, self.sha, self.filename)
|
||||
if checkHashsum(self.sha, filepath):
|
||||
print(' hash match - file already exists, skipping')
|
||||
return filepath
|
||||
else:
|
||||
print(' hash didn\'t match, loading file')
|
||||
|
||||
if not os.path.exists(self.save_dir):
|
||||
print(' creating directory: ' + self.save_dir)
|
||||
os.makedirs(self.save_dir)
|
||||
|
||||
|
||||
print(' hash check failed - loading')
|
||||
assert self.loader
|
||||
try:
|
||||
self.loader.load(self.filename, self.sha, self.save_dir)
|
||||
print(' done')
|
||||
print(' file {}'.format(self.filename))
|
||||
if self.sha is None:
|
||||
download_path = os.path.join(self.save_dir, self.filename)
|
||||
self.sha = getHashsumFromFile(download_path)
|
||||
new_dir = os.path.join(self.save_dir, self.sha)
|
||||
|
||||
if not os.path.exists(new_dir):
|
||||
os.makedirs(new_dir)
|
||||
filepath = os.path.join(new_dir, self.filename)
|
||||
if not (os.path.exists(filepath)):
|
||||
shutil.move(download_path, new_dir)
|
||||
print(' No expected hashsum provided, actual SHA is {}'.format(self.sha))
|
||||
else:
|
||||
checkHashsum(self.sha, filepath, silent=False)
|
||||
except Exception as e:
|
||||
print(" There was some problem with loading file {} for {}".format(self.filename, self.name))
|
||||
print(" Exception: {}".format(e))
|
||||
return
|
||||
|
||||
print(" Finished " + self.name)
|
||||
return filepath
|
||||
|
||||
class Loader(object):
|
||||
MB = 1024*1024
|
||||
BUFSIZE = 10*MB
|
||||
def __init__(self, download_name, download_sha, archive_member = None):
|
||||
self.download_name = download_name
|
||||
self.download_sha = download_sha
|
||||
self.archive_member = archive_member
|
||||
|
||||
def load(self, requested_file, sha, save_dir):
|
||||
if self.download_sha is None:
|
||||
download_dir = save_dir
|
||||
else:
|
||||
# create a new folder in save_dir to avoid possible name conflicts
|
||||
download_dir = os.path.join(save_dir, self.download_sha)
|
||||
if not os.path.exists(download_dir):
|
||||
os.makedirs(download_dir)
|
||||
download_path = os.path.join(download_dir, self.download_name)
|
||||
print(" Preparing to download file " + self.download_name)
|
||||
if checkHashsum(self.download_sha, download_path):
|
||||
print(' hash match - file already exists, no need to download')
|
||||
else:
|
||||
filesize = self.download(download_path)
|
||||
print(' Downloaded {} with size {} Mb'.format(self.download_name, filesize/self.MB))
|
||||
if self.download_sha is not None:
|
||||
checkHashsum(self.download_sha, download_path, silent=False)
|
||||
if self.download_name == requested_file:
|
||||
return
|
||||
else:
|
||||
if isArchive(download_path):
|
||||
if sha is not None:
|
||||
extract_dir = os.path.join(save_dir, sha)
|
||||
else:
|
||||
extract_dir = save_dir
|
||||
if not os.path.exists(extract_dir):
|
||||
os.makedirs(extract_dir)
|
||||
self.extract(requested_file, download_path, extract_dir)
|
||||
else:
|
||||
raise Exception("Downloaded file has different name")
|
||||
|
||||
def download(self, filepath):
|
||||
print("Warning: download is not implemented, this is a base class")
|
||||
return 0
|
||||
|
||||
def extract(self, requested_file, archive_path, save_dir):
|
||||
filepath = os.path.join(save_dir, requested_file)
|
||||
try:
|
||||
with tarfile.open(archive_path) as f:
|
||||
if self.archive_member is None:
|
||||
pathDict = dict((os.path.split(elem)[1], os.path.split(elem)[0]) for elem in f.getnames())
|
||||
self.archive_member = pathDict[requested_file]
|
||||
assert self.archive_member in f.getnames()
|
||||
self.save(filepath, f.extractfile(self.archive_member))
|
||||
except Exception as e:
|
||||
print(' catch {}'.format(e))
|
||||
|
||||
def save(self, filepath, r):
|
||||
with open(filepath, 'wb') as f:
|
||||
print(' progress ', end="")
|
||||
sys.stdout.flush()
|
||||
while True:
|
||||
buf = r.read(self.BUFSIZE)
|
||||
if not buf:
|
||||
break
|
||||
f.write(buf)
|
||||
print('>', end="")
|
||||
sys.stdout.flush()
|
||||
|
||||
class URLLoader(Loader):
|
||||
def __init__(self, download_name, download_sha, url, archive_member = None):
|
||||
super(URLLoader, self).__init__(download_name, download_sha, archive_member)
|
||||
self.download_name = download_name
|
||||
self.download_sha = download_sha
|
||||
self.url = url
|
||||
|
||||
def download(self, filepath):
|
||||
r = urlopen(self.url, timeout=60)
|
||||
self.printRequest(r)
|
||||
self.save(filepath, r)
|
||||
return os.path.getsize(filepath)
|
||||
|
||||
def printRequest(self, r):
|
||||
def getMB(r):
|
||||
d = dict(r.info())
|
||||
for c in ['content-length', 'Content-Length']:
|
||||
if c in d:
|
||||
return int(d[c]) / self.MB
|
||||
return '<unknown>'
|
||||
print(' {} {} [{} Mb]'.format(r.getcode(), r.msg, getMB(r)))
|
||||
|
||||
class GDriveLoader(Loader):
|
||||
BUFSIZE = 1024 * 1024
|
||||
PROGRESS_SIZE = 10 * 1024 * 1024
|
||||
def __init__(self, download_name, download_sha, gid, archive_member = None):
|
||||
super(GDriveLoader, self).__init__(download_name, download_sha, archive_member)
|
||||
self.download_name = download_name
|
||||
self.download_sha = download_sha
|
||||
self.gid = gid
|
||||
|
||||
def download(self, filepath):
|
||||
session = requests.Session() # re-use cookies
|
||||
|
||||
URL = "https://docs.google.com/uc?export=download"
|
||||
response = session.get(URL, params = { 'id' : self.gid }, stream = True)
|
||||
|
||||
def get_confirm_token(response): # in case of large files
|
||||
for key, value in response.cookies.items():
|
||||
if key.startswith('download_warning'):
|
||||
return value
|
||||
return None
|
||||
token = get_confirm_token(response)
|
||||
|
||||
if token:
|
||||
params = { 'id' : self.gid, 'confirm' : token }
|
||||
response = session.get(URL, params = params, stream = True)
|
||||
|
||||
sz = 0
|
||||
progress_sz = self.PROGRESS_SIZE
|
||||
with open(filepath, "wb") as f:
|
||||
for chunk in response.iter_content(self.BUFSIZE):
|
||||
if not chunk:
|
||||
continue # keep-alive
|
||||
|
||||
f.write(chunk)
|
||||
sz += len(chunk)
|
||||
if sz >= progress_sz:
|
||||
progress_sz += self.PROGRESS_SIZE
|
||||
print('>', end='')
|
||||
sys.stdout.flush()
|
||||
print('')
|
||||
return sz
|
||||
|
||||
def produceDownloadInstance(instance_name, filename, sha, url, save_dir, download_name=None, download_sha=None, archive_member=None):
|
||||
spec_param = url
|
||||
loader = URLLoader
|
||||
if download_name is None:
|
||||
download_name = filename
|
||||
if download_sha is None:
|
||||
download_sha = sha
|
||||
if "drive.google.com" in url:
|
||||
token = ""
|
||||
token_part = url.rsplit('/', 1)[-1]
|
||||
if "&id=" not in token_part:
|
||||
token_part = url.rsplit('/', 1)[-2]
|
||||
for param in token_part.split("&"):
|
||||
if param.startswith("id="):
|
||||
token = param[3:]
|
||||
if token:
|
||||
loader = GDriveLoader
|
||||
spec_param = token
|
||||
else:
|
||||
print("Warning: possibly wrong Google Drive link")
|
||||
return DownloadInstance(
|
||||
name=instance_name,
|
||||
filename=filename,
|
||||
sha=sha,
|
||||
save_dir=save_dir,
|
||||
loader=loader(download_name, download_sha, spec_param, archive_member)
|
||||
)
|
||||
|
||||
def getSaveDir():
|
||||
env_path = os.environ.get("OPENCV_DOWNLOAD_DATA_PATH", None)
|
||||
if env_path:
|
||||
save_dir = env_path
|
||||
else:
|
||||
# TODO reuse binding function cv2.utils.fs.getCacheDirectory when issue #19011 is fixed
|
||||
if platform.system() == "Darwin":
|
||||
#On Apple devices
|
||||
temp_env = os.environ.get("TMPDIR", None)
|
||||
if temp_env is None or not os.path.isdir(temp_env):
|
||||
temp_dir = Path("/tmp")
|
||||
print("Using world accessible cache directory. This may be not secure: ", temp_dir)
|
||||
else:
|
||||
temp_dir = temp_env
|
||||
elif platform.system() == "Windows":
|
||||
temp_dir = tempfile.gettempdir()
|
||||
else:
|
||||
xdg_cache_env = os.environ.get("XDG_CACHE_HOME", None)
|
||||
if (xdg_cache_env and xdg_cache_env[0] and os.path.isdir(xdg_cache_env)):
|
||||
temp_dir = xdg_cache_env
|
||||
else:
|
||||
home_env = os.environ.get("HOME", None)
|
||||
if (home_env and home_env[0] and os.path.isdir(home_env)):
|
||||
home_path = os.path.join(home_env, ".cache/")
|
||||
if os.path.isdir(home_path):
|
||||
temp_dir = home_path
|
||||
else:
|
||||
temp_dir = tempfile.gettempdir()
|
||||
print("Using world accessible cache directory. This may be not secure: ", temp_dir)
|
||||
|
||||
save_dir = os.path.join(temp_dir, "downloads")
|
||||
if not os.path.exists(save_dir):
|
||||
os.makedirs(save_dir)
|
||||
return save_dir
|
||||
|
||||
def downloadFile(url, sha=None, save_dir=None, filename=None):
|
||||
if save_dir is None:
|
||||
save_dir = getSaveDir()
|
||||
if filename is None:
|
||||
filename = "download_" + datetime.now().__str__()
|
||||
name = filename
|
||||
return produceDownloadInstance(name, filename, sha, url, save_dir).get()
|
||||
|
||||
def parseMetalinkFile(metalink_filepath, save_dir):
|
||||
NS = {'ml': 'urn:ietf:params:xml:ns:metalink'}
|
||||
models = []
|
||||
for file_elem in ET.parse(metalink_filepath).getroot().findall('ml:file', NS):
|
||||
url = file_elem.find('ml:url', NS).text
|
||||
fname = file_elem.attrib['name']
|
||||
name = file_elem.find('ml:identity', NS).text
|
||||
hash_sum = file_elem.find('ml:hash', NS).text
|
||||
models.append(produceDownloadInstance(name, fname, hash_sum, url, save_dir))
|
||||
return models
|
||||
|
||||
def parseYAMLFile(yaml_filepath, save_dir):
|
||||
models = []
|
||||
with open(yaml_filepath, 'r') as stream:
|
||||
data_loaded = yaml.safe_load(stream)
|
||||
for name, params in data_loaded.items():
|
||||
load_info = params.get("load_info", None)
|
||||
if load_info:
|
||||
fname = os.path.basename(params.get("model"))
|
||||
hash_sum = load_info.get("sha1")
|
||||
url = load_info.get("url")
|
||||
download_sha = load_info.get("download_sha")
|
||||
download_name = load_info.get("download_name")
|
||||
archive_member = load_info.get("member")
|
||||
models.append(produceDownloadInstance(name, fname, hash_sum, url, save_dir,
|
||||
download_name=download_name, download_sha=download_sha, archive_member=archive_member))
|
||||
|
||||
return models
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='This is a utility script for downloading DNN models for samples.')
|
||||
|
||||
parser.add_argument('--save_dir', action="store", default=os.getcwd(),
|
||||
help='Path to the directory to store downloaded files')
|
||||
parser.add_argument('model_name', type=str, default="", nargs='?', action="store",
|
||||
help='name of the model to download')
|
||||
args = parser.parse_args()
|
||||
models = []
|
||||
save_dir = args.save_dir
|
||||
selected_model_name = args.model_name
|
||||
models.extend(parseMetalinkFile('face_detector/weights.meta4', save_dir))
|
||||
models.extend(parseYAMLFile('models.yml', save_dir))
|
||||
for m in models:
|
||||
print(m)
|
||||
if selected_model_name and not m.name.startswith(selected_model_name):
|
||||
continue
|
||||
print('Model: ' + selected_model_name)
|
||||
m.get()
|
69
3rdparty/opencv-4.5.4/samples/dnn/edge_detection.py
vendored
Normal file
@ -0,0 +1,69 @@
|
||||
import cv2 as cv
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description='This sample shows how to define custom OpenCV deep learning layers in Python. '
|
||||
'Holistically-Nested Edge Detection (https://arxiv.org/abs/1504.06375) neural network '
|
||||
'is used as an example model. Find a pre-trained model at https://github.com/s9xie/hed.')
|
||||
parser.add_argument('--input', help='Path to image or video. Skip to capture frames from camera')
|
||||
parser.add_argument('--prototxt', help='Path to deploy.prototxt', required=True)
|
||||
parser.add_argument('--caffemodel', help='Path to hed_pretrained_bsds.caffemodel', required=True)
|
||||
parser.add_argument('--width', help='Resize input image to a specific width', default=500, type=int)
|
||||
parser.add_argument('--height', help='Resize input image to a specific height', default=500, type=int)
|
||||
args = parser.parse_args()
|
||||
|
||||
#! [CropLayer]
|
||||
class CropLayer(object):
|
||||
def __init__(self, params, blobs):
|
||||
self.xstart = 0
|
||||
self.xend = 0
|
||||
self.ystart = 0
|
||||
self.yend = 0
|
||||
|
||||
# Our layer receives two inputs. We need to crop the first input blob
|
||||
# to match a shape of the second one (keeping batch size and number of channels)
|
||||
def getMemoryShapes(self, inputs):
|
||||
inputShape, targetShape = inputs[0], inputs[1]
|
||||
batchSize, numChannels = inputShape[0], inputShape[1]
|
||||
height, width = targetShape[2], targetShape[3]
|
||||
|
||||
self.ystart = (inputShape[2] - targetShape[2]) // 2
|
||||
self.xstart = (inputShape[3] - targetShape[3]) // 2
|
||||
self.yend = self.ystart + height
|
||||
self.xend = self.xstart + width
|
||||
|
||||
return [[batchSize, numChannels, height, width]]
|
||||
|
||||
def forward(self, inputs):
|
||||
return [inputs[0][:,:,self.ystart:self.yend,self.xstart:self.xend]]
|
||||
#! [CropLayer]
|
||||
|
||||
#! [Register]
|
||||
cv.dnn_registerLayer('Crop', CropLayer)
|
||||
#! [Register]
|
||||
|
||||
# Load the model.
|
||||
net = cv.dnn.readNet(cv.samples.findFile(args.prototxt), cv.samples.findFile(args.caffemodel))
|
||||
|
||||
kWinName = 'Holistically-Nested Edge Detection'
|
||||
cv.namedWindow('Input', cv.WINDOW_NORMAL)
|
||||
cv.namedWindow(kWinName, cv.WINDOW_NORMAL)
|
||||
|
||||
cap = cv.VideoCapture(args.input if args.input else 0)
|
||||
while cv.waitKey(1) < 0:
|
||||
hasFrame, frame = cap.read()
|
||||
if not hasFrame:
|
||||
cv.waitKey()
|
||||
break
|
||||
|
||||
cv.imshow('Input', frame)
|
||||
|
||||
inp = cv.dnn.blobFromImage(frame, scalefactor=1.0, size=(args.width, args.height),
|
||||
mean=(104.00698793, 116.66876762, 122.67891434),
|
||||
swapRB=False, crop=False)
|
||||
net.setInput(inp)
|
||||
|
||||
out = net.forward()
|
||||
out = out[0, 0]
|
||||
out = cv.resize(out, (frame.shape[1], frame.shape[0]))
|
||||
cv.imshow(kWinName, out)
|
132
3rdparty/opencv-4.5.4/samples/dnn/face_detect.cpp
vendored
Normal file
@ -0,0 +1,132 @@
|
||||
#include <opencv2/dnn.hpp>
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
#include <opencv2/objdetect.hpp>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
using namespace cv;
|
||||
using namespace std;
|
||||
|
||||
static Mat visualize(Mat input, Mat faces, int thickness=2)
|
||||
{
|
||||
Mat output = input.clone();
|
||||
for (int i = 0; i < faces.rows; i++)
|
||||
{
|
||||
// Print results
|
||||
cout << "Face " << i
|
||||
<< ", top-left coordinates: (" << faces.at<float>(i, 0) << ", " << faces.at<float>(i, 1) << "), "
|
||||
<< "box width: " << faces.at<float>(i, 2) << ", box height: " << faces.at<float>(i, 3) << ", "
|
||||
<< "score: " << faces.at<float>(i, 14) << "\n";
|
||||
|
||||
// Draw bounding box
|
||||
rectangle(output, Rect2i(int(faces.at<float>(i, 0)), int(faces.at<float>(i, 1)), int(faces.at<float>(i, 2)), int(faces.at<float>(i, 3))), Scalar(0, 255, 0), thickness);
|
||||
// Draw landmarks
|
||||
circle(output, Point2i(int(faces.at<float>(i, 4)), int(faces.at<float>(i, 5))), 2, Scalar(255, 0, 0), thickness);
|
||||
circle(output, Point2i(int(faces.at<float>(i, 6)), int(faces.at<float>(i, 7))), 2, Scalar( 0, 0, 255), thickness);
|
||||
circle(output, Point2i(int(faces.at<float>(i, 8)), int(faces.at<float>(i, 9))), 2, Scalar( 0, 255, 0), thickness);
|
||||
circle(output, Point2i(int(faces.at<float>(i, 10)), int(faces.at<float>(i, 11))), 2, Scalar(255, 0, 255), thickness);
|
||||
circle(output, Point2i(int(faces.at<float>(i, 12)), int(faces.at<float>(i, 13))), 2, Scalar( 0, 255, 255), thickness);
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
CommandLineParser parser(argc, argv,
|
||||
"{help h | | Print this message.}"
|
||||
"{input i | | Path to the input image. Omit for detecting on default camera.}"
|
||||
"{model m | yunet.onnx | Path to the model. Download yunet.onnx in https://github.com/ShiqiYu/libfacedetection.train/tree/master/tasks/task1/onnx.}"
|
||||
"{score_threshold | 0.9 | Filter out faces of score < score_threshold.}"
|
||||
"{nms_threshold | 0.3 | Suppress bounding boxes of iou >= nms_threshold.}"
|
||||
"{top_k | 5000 | Keep top_k bounding boxes before NMS.}"
|
||||
"{save s | false | Set true to save results. This flag is invalid when using camera.}"
|
||||
"{vis v | true | Set true to open a window for result visualization. This flag is invalid when using camera.}"
|
||||
);
|
||||
if (argc == 1 || parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return -1;
|
||||
}
|
||||
|
||||
String modelPath = parser.get<String>("model");
|
||||
|
||||
float scoreThreshold = parser.get<float>("score_threshold");
|
||||
float nmsThreshold = parser.get<float>("nms_threshold");
|
||||
int topK = parser.get<int>("top_k");
|
||||
|
||||
bool save = parser.get<bool>("save");
|
||||
bool vis = parser.get<bool>("vis");
|
||||
|
||||
// Initialize FaceDetectorYN
|
||||
Ptr<FaceDetectorYN> detector = FaceDetectorYN::create(modelPath, "", Size(320, 320), scoreThreshold, nmsThreshold, topK);
|
||||
|
||||
// If input is an image
|
||||
if (parser.has("input"))
|
||||
{
|
||||
String input = parser.get<String>("input");
|
||||
Mat image = imread(input);
|
||||
|
||||
// Set input size before inference
|
||||
detector->setInputSize(image.size());
|
||||
|
||||
// Inference
|
||||
Mat faces;
|
||||
detector->detect(image, faces);
|
||||
|
||||
// Draw results on the input image
|
||||
Mat result = visualize(image, faces);
|
||||
|
||||
// Save results if save is true
|
||||
if(save)
|
||||
{
|
||||
cout << "Results saved to result.jpg\n";
|
||||
imwrite("result.jpg", result);
|
||||
}
|
||||
|
||||
// Visualize results
|
||||
if (vis)
|
||||
{
|
||||
namedWindow(input, WINDOW_AUTOSIZE);
|
||||
imshow(input, result);
|
||||
waitKey(0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
int deviceId = 0;
|
||||
VideoCapture cap;
|
||||
cap.open(deviceId, CAP_ANY);
|
||||
int frameWidth = int(cap.get(CAP_PROP_FRAME_WIDTH));
|
||||
int frameHeight = int(cap.get(CAP_PROP_FRAME_HEIGHT));
|
||||
detector->setInputSize(Size(frameWidth, frameHeight));
|
||||
|
||||
Mat frame;
|
||||
TickMeter tm;
|
||||
String msg = "FPS: ";
|
||||
while(waitKey(1) < 0) // Press any key to exit
|
||||
{
|
||||
// Get frame
|
||||
if (!cap.read(frame))
|
||||
{
|
||||
cerr << "No frames grabbed!\n";
|
||||
break;
|
||||
}
|
||||
|
||||
// Inference
|
||||
Mat faces;
|
||||
tm.start();
|
||||
detector->detect(frame, faces);
|
||||
tm.stop();
|
||||
|
||||
// Draw results on the input image
|
||||
Mat result = visualize(frame, faces);
|
||||
putText(result, msg + to_string(tm.getFPS()), Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
|
||||
// Visualize results
|
||||
imshow("Live", result);
|
||||
|
||||
tm.reset();
|
||||
}
|
||||
}
|
||||
}
|
101
3rdparty/opencv-4.5.4/samples/dnn/face_detect.py
vendored
Normal file
@ -0,0 +1,101 @@
|
||||
import argparse
|
||||
|
||||
import numpy as np
|
||||
import cv2 as cv
|
||||
|
||||
def str2bool(v):
|
||||
if v.lower() in ['on', 'yes', 'true', 'y', 't']:
|
||||
return True
|
||||
elif v.lower() in ['off', 'no', 'false', 'n', 'f']:
|
||||
return False
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--input', '-i', type=str, help='Path to the input image.')
|
||||
parser.add_argument('--model', '-m', type=str, default='yunet.onnx', help='Path to the model. Download the model at https://github.com/ShiqiYu/libfacedetection.train/tree/master/tasks/task1/onnx.')
|
||||
parser.add_argument('--score_threshold', type=float, default=0.9, help='Filtering out faces of score < score_threshold.')
|
||||
parser.add_argument('--nms_threshold', type=float, default=0.3, help='Suppress bounding boxes of iou >= nms_threshold.')
|
||||
parser.add_argument('--top_k', type=int, default=5000, help='Keep top_k bounding boxes before NMS.')
|
||||
parser.add_argument('--save', '-s', type=str2bool, default=False, help='Set true to save results. This flag is invalid when using camera.')
|
||||
parser.add_argument('--vis', '-v', type=str2bool, default=True, help='Set true to open a window for result visualization. This flag is invalid when using camera.')
|
||||
args = parser.parse_args()
|
||||
|
||||
def visualize(input, faces, thickness=2):
|
||||
output = input.copy()
|
||||
if faces[1] is not None:
|
||||
for idx, face in enumerate(faces[1]):
|
||||
print('Face {}, top-left coordinates: ({:.0f}, {:.0f}), box width: {:.0f}, box height {:.0f}, score: {:.2f}'.format(idx, face[0], face[1], face[2], face[3], face[-1]))
|
||||
|
||||
coords = face[:-1].astype(np.int32)
|
||||
cv.rectangle(output, (coords[0], coords[1]), (coords[0]+coords[2], coords[1]+coords[3]), (0, 255, 0), 2)
|
||||
cv.circle(output, (coords[4], coords[5]), 2, (255, 0, 0), 2)
|
||||
cv.circle(output, (coords[6], coords[7]), 2, (0, 0, 255), 2)
|
||||
cv.circle(output, (coords[8], coords[9]), 2, (0, 255, 0), 2)
|
||||
cv.circle(output, (coords[10], coords[11]), 2, (255, 0, 255), 2)
|
||||
cv.circle(output, (coords[12], coords[13]), 2, (0, 255, 255), 2)
|
||||
return output
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
# Instantiate FaceDetectorYN
|
||||
detector = cv.FaceDetectorYN.create(
|
||||
args.model,
|
||||
"",
|
||||
(320, 320),
|
||||
args.score_threshold,
|
||||
args.nms_threshold,
|
||||
args.top_k
|
||||
)
|
||||
|
||||
# If input is an image
|
||||
if args.input is not None:
|
||||
image = cv.imread(args.input)
|
||||
|
||||
# Set input size before inference
|
||||
detector.setInputSize((image.shape[1], image.shape[0]))
|
||||
|
||||
# Inference
|
||||
faces = detector.detect(image)
|
||||
|
||||
# Draw results on the input image
|
||||
result = visualize(image, faces)
|
||||
|
||||
# Save results if save is true
|
||||
if args.save:
|
||||
print('Results saved to result.jpg\n')
|
||||
cv.imwrite('result.jpg', result)
|
||||
|
||||
# Visualize results in a new window
|
||||
if args.vis:
|
||||
cv.namedWindow(args.input, cv.WINDOW_AUTOSIZE)
|
||||
cv.imshow(args.input, result)
|
||||
cv.waitKey(0)
|
||||
else: # Omit input to call default camera
|
||||
deviceId = 0
|
||||
cap = cv.VideoCapture(deviceId)
|
||||
frameWidth = int(cap.get(cv.CAP_PROP_FRAME_WIDTH))
|
||||
frameHeight = int(cap.get(cv.CAP_PROP_FRAME_HEIGHT))
|
||||
detector.setInputSize([frameWidth, frameHeight])
|
||||
|
||||
tm = cv.TickMeter()
|
||||
while cv.waitKey(1) < 0:
|
||||
hasFrame, frame = cap.read()
|
||||
if not hasFrame:
|
||||
print('No frames grabbed!')
|
||||
break
|
||||
|
||||
# Inference
|
||||
tm.start()
|
||||
faces = detector.detect(frame) # faces is a tuple
|
||||
tm.stop()
|
||||
|
||||
# Draw results on the input image
|
||||
frame = visualize(frame, faces)
|
||||
|
||||
cv.putText(frame, 'FPS: {}'.format(tm.getFPS()), (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
|
||||
|
||||
# Visualize results in a new Window
|
||||
cv.imshow('Live', frame)
|
||||
|
||||
tm.reset()
|
1790
3rdparty/opencv-4.5.4/samples/dnn/face_detector/deploy.prototxt
vendored
Normal file
File diff suppressed because it is too large
1790
3rdparty/opencv-4.5.4/samples/dnn/face_detector/deploy_lowres.prototxt
vendored
Normal file
File diff suppressed because it is too large
79
3rdparty/opencv-4.5.4/samples/dnn/face_detector/how_to_train_face_detector.txt
vendored
Normal file
@ -0,0 +1,79 @@
|
||||
This is a brief description of the training process that was used to obtain res10_300x300_ssd_iter_140000.caffemodel.
|
||||
The model was created with the SSD framework, using a ResNet-10-like architecture as a backbone. The channel count of the ResNet-10 convolution layers was reduced significantly (2x or 4x fewer channels).
|
||||
The model was trained with the Caffe framework on a large, publicly available dataset.
|
||||
|
||||
1. Prepare training tools
|
||||
You need the "ssd" branch of this repository: https://github.com/weiliu89/caffe/tree/ssd . Check out this branch and build it (see the instructions in the repo's README).
|
||||
|
||||
2. Prepare training data.
|
||||
The data preparation pipeline can be represented as:
|
||||
|
||||
(a) Download the original face detection dataset -> (b) Convert the annotation to the PASCAL VOC format -> (c) Create an LMDB database with images + annotations for training
|
||||
|
||||
a) Find datasets with face bounding-box annotations. For some reasons I can't provide links here, but you can easily find them on your own. Also study the data: it may contain small or low-quality faces which can spoil the training process. Annotations often carry special flags about object quality; remove such faces from the annotation (faces smaller than 16 pixels along at least one side, blurred, highly occluded, and so on).
|
||||
|
||||
b) The downloaded dataset will come with some annotation format: a single file for all images, a separate file per image, or something else. To train SSD in Caffe you need to convert the annotation to the PASCAL VOC format.
|
||||
A PASCAL VOC annotation consists of one .xml file per image. In this XML file all face bounding boxes should be listed as:
|
||||
|
||||
<annotation>
|
||||
<size>
|
||||
<width>300</width>
|
||||
<height>300</height>
|
||||
</size>
|
||||
<object>
|
||||
<name>face</name>
|
||||
<difficult>0</difficult>
|
||||
<bndbox>
|
||||
<xmin>100</xmin>
|
||||
<ymin>100</ymin>
|
||||
<xmax>200</xmax>
|
||||
<ymax>200</ymax>
|
||||
</bndbox>
|
||||
</object>
|
||||
<object>
|
||||
<name>face</name>
|
||||
<difficult>0</difficult>
|
||||
<bndbox>
|
||||
<xmin>0</xmin>
|
||||
<ymin>0</ymin>
|
||||
<xmax>100</xmax>
|
||||
<ymax>100</ymax>
|
||||
</bndbox>
|
||||
</object>
|
||||
</annotation>
|
||||
|
||||
So, convert your dataset's annotation to the format above.
|
||||
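For illustration only (this helper is not part of the original training tools), here is a small Python sketch that writes one annotation file in this format using xml.etree.ElementTree. The function name, output file name, image size and box coordinates are placeholder values:

# Sketch: write a PASCAL VOC style annotation for one image.
import xml.etree.ElementTree as ET

def write_voc_annotation(xml_path, width, height, boxes):
    # boxes: list of (xmin, ymin, xmax, ymax) tuples, one per face
    root = ET.Element('annotation')
    size = ET.SubElement(root, 'size')
    ET.SubElement(size, 'width').text = str(width)
    ET.SubElement(size, 'height').text = str(height)
    for xmin, ymin, xmax, ymax in boxes:
        obj = ET.SubElement(root, 'object')
        ET.SubElement(obj, 'name').text = 'face'
        ET.SubElement(obj, 'difficult').text = '0'
        bndbox = ET.SubElement(obj, 'bndbox')
        ET.SubElement(bndbox, 'xmin').text = str(xmin)
        ET.SubElement(bndbox, 'ymin').text = str(ymin)
        ET.SubElement(bndbox, 'xmax').text = str(xmax)
        ET.SubElement(bndbox, 'ymax').text = str(ymax)
    ET.ElementTree(root).write(xml_path)

# example: a 300x300 image with two faces, matching the XML above
write_voc_annotation('0.jpg.xml', 300, 300, [(100, 100, 200, 200), (0, 0, 100, 100)])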
Also, you should create a labelmap.prototxt file with the following content:
|
||||
item {
|
||||
name: "none_of_the_above"
|
||||
label: 0
|
||||
display_name: "background"
|
||||
}
|
||||
item {
|
||||
name: "face"
|
||||
label: 1
|
||||
display_name: "face"
|
||||
}
|
||||
|
||||
You need this file to establish the correspondence between the class name and its numeric label.
|
||||
|
||||
For the next step we also need a file in which all image/annotation file name pairs are listed. This file should contain lines similar to:
|
||||
images_val/0.jpg annotations_val/0.jpg.xml
|
||||
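One possible way to generate such a listing is sketched below (assuming the images are in images_val/ and the corresponding .xml files in annotations_val/, named as in the example above; the output name val.txt is arbitrary):

# Sketch: pair every image with its annotation and write the list file.
import os

image_dir, ann_dir = 'images_val', 'annotations_val'
with open('val.txt', 'w') as list_file:
    for image_name in sorted(os.listdir(image_dir)):
        ann_name = image_name + '.xml'  # e.g. 0.jpg -> 0.jpg.xml
        if os.path.exists(os.path.join(ann_dir, ann_name)):
            list_file.write('{}/{} {}/{}\n'.format(image_dir, image_name, ann_dir, ann_name))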
|
||||
c) To create the LMDB databases you need the create_data.sh tool from the caffe/data/VOC0712 directory of Caffe's source code.
|
||||
This script calls create_annoset.py internally, so check which arguments you need to pass to it.
|
||||
|
||||
You need to prepare 2 LMDB databases: one for training images, one for validation images.
|
||||
|
||||
3. Train your detector
|
||||
For training you need 3 files: train.prototxt, test.prototxt and solver.prototxt. You can find these files in the same directory as this readme.
|
||||
Also edit train.prototxt and test.prototxt so that the LMDB paths point to the databases you created in step 2.
|
||||
|
||||
Now everything is ready to launch the training process.
|
||||
Execute the following lines in a terminal:
|
||||
mkdir -p snapshot
|
||||
mkdir -p log
|
||||
/path_for_caffe_build_dir/tools/caffe train -solver="solver.prototxt" -gpu 0 2>&1 | tee -a log/log.log
|
||||
|
||||
Then wait; it will take about 8 hours to finish the process.
|
||||
Afterwards you can use the .caffemodel from the snapshot/ subdirectory in the resnet_face_ssd_python.py sample, or load it directly with OpenCV's dnn module as sketched below.
|
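For reference, here is a minimal sketch of running the trained detector with OpenCV's dnn module (file paths are placeholders; the 300x300 input size and the BGR mean values are the ones commonly used with this face detector):

# Sketch: run the trained SSD face detector with OpenCV dnn.
import cv2 as cv

net = cv.dnn.readNetFromCaffe('deploy.prototxt', 'snapshot/res10_300x300_ssd_iter_140000.caffemodel')
img = cv.imread('test.jpg')
h, w = img.shape[:2]
blob = cv.dnn.blobFromImage(cv.resize(img, (300, 300)), 1.0, (300, 300), (104, 177, 123))
net.setInput(blob)
detections = net.forward()  # shape [1, 1, N, 7]: image_id, class_id, confidence, x1, y1, x2, y2 (relative coords)
for i in range(detections.shape[2]):
    confidence = detections[0, 0, i, 2]
    if confidence > 0.5:
        box = detections[0, 0, i, 3:7] * [w, h, w, h]
        x1, y1, x2, y2 = box.astype(int)
        cv.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)
cv.imwrite('detections.jpg', img)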
2368
3rdparty/opencv-4.5.4/samples/dnn/face_detector/opencv_face_detector.pbtxt
vendored
Normal file
File diff suppressed because it is too large
28
3rdparty/opencv-4.5.4/samples/dnn/face_detector/solver.prototxt
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
train_net: "train.prototxt"
|
||||
test_net: "test.prototxt"
|
||||
|
||||
test_iter: 2312
|
||||
test_interval: 5000
|
||||
test_initialization: true
|
||||
|
||||
base_lr: 0.01
|
||||
display: 10
|
||||
lr_policy: "multistep"
|
||||
max_iter: 140000
|
||||
stepvalue: 80000
|
||||
stepvalue: 120000
|
||||
gamma: 0.1
|
||||
momentum: 0.9
|
||||
weight_decay: 0.0005
|
||||
average_loss: 500
|
||||
iter_size: 1
|
||||
type: "SGD"
|
||||
|
||||
solver_mode: GPU
|
||||
random_seed: 0
|
||||
debug_info: false
|
||||
snapshot: 1000
|
||||
snapshot_prefix: "snapshot/res10_300x300_ssd"
|
||||
|
||||
eval_type: "detection"
|
||||
ap_version: "11point"
|
1831
3rdparty/opencv-4.5.4/samples/dnn/face_detector/test.prototxt
vendored
Normal file
File diff suppressed because it is too large
1898
3rdparty/opencv-4.5.4/samples/dnn/face_detector/train.prototxt
vendored
Normal file
File diff suppressed because it is too large
13
3rdparty/opencv-4.5.4/samples/dnn/face_detector/weights.meta4
vendored
Normal file
@ -0,0 +1,13 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<metalink xmlns="urn:ietf:params:xml:ns:metalink">
|
||||
<file name="res10_300x300_ssd_iter_140000_fp16.caffemodel">
|
||||
<identity>opencv_face_detector_fp16</identity>
|
||||
<hash type="sha-1">31fc22bfdd907567a04bb45b7cfad29966caddc1</hash>
|
||||
<url>https://raw.githubusercontent.com/opencv/opencv_3rdparty/dnn_samples_face_detector_20180205_fp16/res10_300x300_ssd_iter_140000_fp16.caffemodel</url>
|
||||
</file>
|
||||
<file name="opencv_face_detector_uint8.pb">
|
||||
<identity>opencv_face_detector_uint8</identity>
|
||||
<hash type="sha-1">4f2fdf6f231d759d7bbdb94353c5a68690f3d2ae</hash>
|
||||
<url>https://raw.githubusercontent.com/opencv/opencv_3rdparty/dnn_samples_face_detector_20180220_uint8/opencv_face_detector_uint8.pb</url>
|
||||
</file>
|
||||
</metalink>
|
103
3rdparty/opencv-4.5.4/samples/dnn/face_match.cpp
vendored
Normal file
@ -0,0 +1,103 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "opencv2/dnn.hpp"
|
||||
#include "opencv2/imgproc.hpp"
|
||||
#include "opencv2/highgui.hpp"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "opencv2/objdetect.hpp"
|
||||
|
||||
|
||||
using namespace cv;
|
||||
using namespace std;
|
||||
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
if (argc != 5)
|
||||
{
|
||||
std::cerr << "Usage " << argv[0] << ": "
|
||||
<< "<det_onnx_path> "
|
||||
<< "<reg_onnx_path> "
|
||||
<< "<image1>"
|
||||
<< "<image2>\n";
|
||||
return -1;
|
||||
}
|
||||
|
||||
String det_onnx_path = argv[1];
|
||||
String reg_onnx_path = argv[2];
|
||||
String image1_path = argv[3];
|
||||
String image2_path = argv[4];
|
||||
std::cout<<image1_path<<" "<<image2_path<<std::endl;
|
||||
Mat image1 = imread(image1_path);
|
||||
Mat image2 = imread(image2_path);
|
||||
|
||||
float score_thresh = 0.9f;
|
||||
float nms_thresh = 0.3f;
|
||||
double cosine_similar_thresh = 0.363;
|
||||
double l2norm_similar_thresh = 1.128;
|
||||
int top_k = 5000;
|
||||
|
||||
// Initialize FaceDetector
|
||||
Ptr<FaceDetectorYN> faceDetector;
|
||||
|
||||
faceDetector = FaceDetectorYN::create(det_onnx_path, "", image1.size(), score_thresh, nms_thresh, top_k);
|
||||
Mat faces_1;
|
||||
faceDetector->detect(image1, faces_1);
|
||||
if (faces_1.rows < 1)
|
||||
{
|
||||
std::cerr << "Cannot find a face in " << image1_path << "\n";
|
||||
return -1;
|
||||
}
|
||||
|
||||
faceDetector = FaceDetectorYN::create(det_onnx_path, "", image2.size(), score_thresh, nms_thresh, top_k);
|
||||
Mat faces_2;
|
||||
faceDetector->detect(image2, faces_2);
|
||||
if (faces_2.rows < 1)
|
||||
{
|
||||
std::cerr << "Cannot find a face in " << image2_path << "\n";
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Initialize FaceRecognizerSF
|
||||
Ptr<FaceRecognizerSF> faceRecognizer = FaceRecognizerSF::create(reg_onnx_path, "");
|
||||
|
||||
|
||||
Mat aligned_face1, aligned_face2;
|
||||
faceRecognizer->alignCrop(image1, faces_1.row(0), aligned_face1);
|
||||
faceRecognizer->alignCrop(image2, faces_2.row(0), aligned_face2);
|
||||
|
||||
Mat feature1, feature2;
|
||||
faceRecognizer->feature(aligned_face1, feature1);
|
||||
feature1 = feature1.clone();
|
||||
faceRecognizer->feature(aligned_face2, feature2);
|
||||
feature2 = feature2.clone();
|
||||
|
||||
double cos_score = faceRecognizer->match(feature1, feature2, FaceRecognizerSF::DisType::FR_COSINE);
|
||||
double L2_score = faceRecognizer->match(feature1, feature2, FaceRecognizerSF::DisType::FR_NORM_L2);
|
||||
|
||||
if(cos_score >= cosine_similar_thresh)
|
||||
{
|
||||
std::cout << "They have the same identity;";
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "They have different identities;";
|
||||
}
|
||||
std::cout << " Cosine Similarity: " << cos_score << ", threshold: " << cosine_similar_thresh << ". (higher value means higher similarity, max 1.0)\n";
|
||||
|
||||
if(L2_score <= l2norm_similar_thresh)
|
||||
{
|
||||
std::cout << "They have the same identity;";
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "They have different identities.";
|
||||
}
|
||||
std::cout << " NormL2 Distance: " << L2_score << ", threshold: " << l2norm_similar_thresh << ". (lower value means higher similarity, min 0.0)\n";
|
||||
|
||||
return 0;
|
||||
}
|
57
3rdparty/opencv-4.5.4/samples/dnn/face_match.py
vendored
Normal file
@ -0,0 +1,57 @@
|
||||
import argparse
|
||||
|
||||
import numpy as np
|
||||
import cv2 as cv
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--input1', '-i1', type=str, help='Path to the input image1.')
|
||||
parser.add_argument('--input2', '-i2', type=str, help='Path to the input image2.')
|
||||
parser.add_argument('--face_detection_model', '-fd', type=str, help='Path to the face detection model. Download the model at https://github.com/ShiqiYu/libfacedetection.train/tree/master/tasks/task1/onnx.')
|
||||
parser.add_argument('--face_recognition_model', '-fr', type=str, help='Path to the face recognition model. Download the model at https://drive.google.com/file/d/1ClK9WiB492c5OZFKveF3XiHCejoOxINW/view.')
|
||||
args = parser.parse_args()
|
||||
|
||||
# Read the input image
|
||||
img1 = cv.imread(args.input1)
|
||||
img2 = cv.imread(args.input2)
|
||||
|
||||
# Instantiate face detector and recognizer
|
||||
detector = cv.FaceDetectorYN.create(
|
||||
args.face_detection_model,
|
||||
"",
|
||||
(img1.shape[1], img1.shape[0])
|
||||
)
|
||||
recognizer = cv.FaceRecognizerSF.create(
|
||||
args.face_recognition_model,
|
||||
""
|
||||
)
|
||||
|
||||
# Detect face
|
||||
detector.setInputSize((img1.shape[1], img1.shape[0]))
|
||||
face1 = detector.detect(img1)
|
||||
detector.setInputSize((img2.shape[1], img2.shape[0]))
|
||||
face2 = detector.detect(img2)
|
||||
assert face1[1].shape[0] > 0, 'Cannot find a face in {}'.format(args.input1)
|
||||
assert face2[1].shape[0] > 0, 'Cannot find a face in {}'.format(args.input2)
|
||||
|
||||
# Align faces
|
||||
face1_align = recognizer.alignCrop(img1, face1[1][0])
|
||||
face2_align = recognizer.alignCrop(img2, face2[1][0])
|
||||
|
||||
# Extract features
|
||||
face1_feature = recognizer.feature(face1_align)
|
||||
face2_feature = recognizer.feature(face2_align)
|
||||
|
||||
# Calculate distance (0: cosine, 1: L2)
|
||||
cosine_similarity_threshold = 0.363
|
||||
cosine_score = recognizer.match(face1_feature, face2_feature, 0)
|
||||
msg = 'different identities'
|
||||
if cosine_score >= cosine_similarity_threshold:
|
||||
msg = 'the same identity'
|
||||
print('They have {}. Cosine Similarity: {}, threshold: {} (higher value means higher similarity, max 1.0).'.format(msg, cosine_score, cosine_similarity_threshold))
|
||||
|
||||
l2_similarity_threshold = 1.128
|
||||
l2_score = recognizer.match(face1_feature, face2_feature, 1)
|
||||
msg = 'different identities'
|
||||
if l2_score <= l2_similarity_threshold:
|
||||
msg = 'the same identity'
|
||||
print('They have {}. NormL2 Distance: {}, threshold: {} (lower value means higher similarity, min 0.0).'.format(msg, l2_score, l2_similarity_threshold))
|
53
3rdparty/opencv-4.5.4/samples/dnn/fast_neural_style.py
vendored
Normal file
@ -0,0 +1,53 @@
|
||||
from __future__ import print_function
|
||||
import cv2 as cv
|
||||
import numpy as np
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description='This script is used to run style transfer models from '
|
||||
'https://github.com/jcjohnson/fast-neural-style using OpenCV')
|
||||
parser.add_argument('--input', help='Path to image or video. Skip to capture frames from camera')
|
||||
parser.add_argument('--model', help='Path to .t7 model')
|
||||
parser.add_argument('--width', default=-1, type=int, help='Resize input to specific width.')
|
||||
parser.add_argument('--height', default=-1, type=int, help='Resize input to specific height.')
|
||||
parser.add_argument('--median_filter', default=0, type=int, help='Kernel size of postprocessing blurring.')
|
||||
args = parser.parse_args()
|
||||
|
||||
net = cv.dnn.readNetFromTorch(cv.samples.findFile(args.model))
|
||||
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
|
||||
|
||||
if args.input:
|
||||
cap = cv.VideoCapture(args.input)
|
||||
else:
|
||||
cap = cv.VideoCapture(0)
|
||||
|
||||
cv.namedWindow('Styled image', cv.WINDOW_NORMAL)
|
||||
while cv.waitKey(1) < 0:
|
||||
hasFrame, frame = cap.read()
|
||||
if not hasFrame:
|
||||
cv.waitKey()
|
||||
break
|
||||
|
||||
inWidth = args.width if args.width != -1 else frame.shape[1]
|
||||
inHeight = args.height if args.height != -1 else frame.shape[0]
|
||||
inp = cv.dnn.blobFromImage(frame, 1.0, (inWidth, inHeight),
|
||||
(103.939, 116.779, 123.68), swapRB=False, crop=False)
|
||||
|
||||
net.setInput(inp)
|
||||
out = net.forward()
|
||||
|
||||
out = out.reshape(3, out.shape[2], out.shape[3])
|
||||
out[0] += 103.939
|
||||
out[1] += 116.779
|
||||
out[2] += 123.68
|
||||
out /= 255
|
||||
out = out.transpose(1, 2, 0)
|
||||
|
||||
t, _ = net.getPerfProfile()
|
||||
freq = cv.getTickFrequency() / 1000
|
||||
print(t / freq, 'ms')
|
||||
|
||||
if args.median_filter:
|
||||
out = cv.medianBlur(out, args.median_filter)
|
||||
|
||||
cv.imshow('Styled image', out)
|
109
3rdparty/opencv-4.5.4/samples/dnn/human_parsing.cpp
vendored
Normal file
@ -0,0 +1,109 @@
|
||||
//
|
||||
// this sample demonstrates parsing (segmenting) human body parts from an image using opencv's dnn,
|
||||
// based on https://github.com/Engineering-Course/LIP_JPPNet
|
||||
//
|
||||
// get the pretrained model from: https://www.dropbox.com/s/qag9vzambhhkvxr/lip_jppnet_384.pb?dl=0
|
||||
//
|
||||
|
||||
#include <opencv2/dnn.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
#include <opencv2/imgproc.hpp>
|
||||
using namespace cv;
|
||||
|
||||
|
||||
static Mat parse_human(const Mat &image, const std::string &model, int backend=dnn::DNN_BACKEND_DEFAULT, int target=dnn::DNN_TARGET_CPU) {
|
||||
// this network expects an image and a flipped copy as input
|
||||
Mat flipped;
|
||||
flip(image, flipped, 1);
|
||||
std::vector<Mat> batch;
|
||||
batch.push_back(image);
|
||||
batch.push_back(flipped);
|
||||
Mat blob = dnn::blobFromImages(batch, 1.0, Size(), Scalar(104.00698793, 116.66876762, 122.67891434));
|
||||
|
||||
dnn::Net net = dnn::readNet(model);
|
||||
net.setPreferableBackend(backend);
|
||||
net.setPreferableTarget(target);
|
||||
net.setInput(blob);
|
||||
Mat out = net.forward();
|
||||
// expected output: [2, 20, 384, 384], (2 lists(orig, flipped) of 20 body part heatmaps 384x384)
|
||||
|
||||
// LIP classes:
|
||||
// 0 Background, 1 Hat, 2 Hair, 3 Glove, 4 Sunglasses, 5 UpperClothes, 6 Dress, 7 Coat, 8 Socks, 9 Pants
|
||||
// 10 Jumpsuits, 11 Scarf, 12 Skirt, 13 Face, 14 LeftArm, 15 RightArm, 16 LeftLeg, 17 RightLeg, 18 LeftShoe, 19 RightShoe
|
||||
Vec3b colors[] = {
|
||||
Vec3b(0, 0, 0), Vec3b(128, 0, 0), Vec3b(255, 0, 0), Vec3b(0, 85, 0), Vec3b(170, 0, 51), Vec3b(255, 85, 0),
|
||||
Vec3b(0, 0, 85), Vec3b(0, 119, 221), Vec3b(85, 85, 0), Vec3b(0, 85, 85), Vec3b(85, 51, 0), Vec3b(52, 86, 128),
|
||||
Vec3b(0, 128, 0), Vec3b(0, 0, 255), Vec3b(51, 170, 221), Vec3b(0, 255, 255), Vec3b(85, 255, 170),
|
||||
Vec3b(170, 255, 85), Vec3b(255, 255, 0), Vec3b(255, 170, 0)
|
||||
};
|
||||
|
||||
Mat segm(image.size(), CV_8UC3, Scalar(0,0,0));
|
||||
Mat maxval(image.size(), CV_32F, Scalar(0));
|
||||
|
||||
// iterate over body part heatmaps (LIP classes)
|
||||
for (int i=0; i<out.size[1]; i++) {
|
||||
// resize heatmaps to original image size
|
||||
// "head" is the original image result, "tail" the flipped copy
|
||||
Mat head, h(out.size[2], out.size[3], CV_32F, out.ptr<float>(0,i));
|
||||
resize(h, head, image.size());
|
||||
|
||||
// we have to swap the last 3 pairs in the "tail" list
|
||||
static int tail_order[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,15,14,17,16,19,18};
|
||||
Mat tail, t(out.size[2], out.size[3], CV_32F, out.ptr<float>(1,tail_order[i]));
|
||||
resize(t, tail, image.size());
|
||||
flip(tail, tail, 1);
|
||||
|
||||
// mix original and flipped result
|
||||
Mat avg = (head + tail) * 0.5;
|
||||
|
||||
// write color if prob value > maxval
|
||||
Mat cmask;
|
||||
compare(avg, maxval, cmask, CMP_GT);
|
||||
segm.setTo(colors[i], cmask);
|
||||
|
||||
// keep largest values for next iteration
|
||||
max(avg, maxval, maxval);
|
||||
}
|
||||
cvtColor(segm, segm, COLOR_RGB2BGR);
|
||||
return segm;
|
||||
}
|
||||
|
||||
int main(int argc, char**argv)
|
||||
{
|
||||
CommandLineParser parser(argc,argv,
|
||||
"{help h | | show help screen / args}"
|
||||
"{image i | | person image to process }"
|
||||
"{model m |lip_jppnet_384.pb| network model}"
|
||||
"{backend b | 0 | Choose one of computation backends: "
|
||||
"0: automatically (by default), "
|
||||
"1: Halide language (http://halide-lang.org/), "
|
||||
"2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"3: OpenCV implementation, "
|
||||
"4: VKCOM, "
|
||||
"5: CUDA }"
|
||||
"{target t | 0 | Choose one of target computation devices: "
|
||||
"0: CPU target (by default), "
|
||||
"1: OpenCL, "
|
||||
"2: OpenCL fp16 (half-float precision), "
|
||||
"3: VPU, "
|
||||
"4: Vulkan, "
|
||||
"6: CUDA, "
|
||||
"7: CUDA fp16 (half-float preprocess) }"
|
||||
);
|
||||
if (argc == 1 || parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
std::string model = parser.get<std::string>("model");
|
||||
std::string image = parser.get<std::string>("image");
|
||||
int backend = parser.get<int>("backend");
|
||||
int target = parser.get<int>("target");
|
||||
|
||||
Mat input = imread(image);
|
||||
Mat segm = parse_human(input, model, backend, target);
|
||||
|
||||
imshow("human parsing", segm);
|
||||
waitKey();
|
||||
return 0;
|
||||
}
|
190
3rdparty/opencv-4.5.4/samples/dnn/human_parsing.py
vendored
Normal file
@ -0,0 +1,190 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
You can download the converted pb model from https://www.dropbox.com/s/qag9vzambhhkvxr/lip_jppnet_384.pb?dl=0
|
||||
or convert the model yourself.
|
||||
|
||||
Follow these steps if you want to convert the original model yourself:
|
||||
To get the original .meta pre-trained model, download https://drive.google.com/file/d/1BFVXgeln-bek8TCbRjN6utPAgRE0LJZg/view
|
||||
To correctly convert the .meta model to a .pb model, download the original repository https://github.com/Engineering-Course/LIP_JPPNet
|
||||
Change script evaluate_parsing_JPPNet-s2.py for human parsing
|
||||
1. Remove preprocessing to create image_batch_origin:
|
||||
with tf.name_scope("create_inputs"):
|
||||
...
|
||||
Add
|
||||
image_batch_origin = tf.placeholder(tf.float32, shape=(2, None, None, 3), name='input')
|
||||
|
||||
2. Create input
|
||||
image = cv2.imread(path/to/image)
|
||||
image_rev = np.flip(image, axis=1)
|
||||
input = np.stack([image, image_rev], axis=0)
|
||||
|
||||
3. Hardcode image_h and image_w shapes to determine output shapes.
|
||||
We use default INPUT_SIZE = (384, 384) from evaluate_parsing_JPPNet-s2.py.
|
||||
parsing_out1 = tf.reduce_mean(tf.stack([tf.image.resize_images(parsing_out1_100, INPUT_SIZE),
|
||||
tf.image.resize_images(parsing_out1_075, INPUT_SIZE),
|
||||
tf.image.resize_images(parsing_out1_125, INPUT_SIZE)]), axis=0)
|
||||
Do similarly with parsing_out2, parsing_out3
|
||||
4. Remove postprocessing. Last net operation:
|
||||
raw_output = tf.reduce_mean(tf.stack([parsing_out1, parsing_out2, parsing_out3]), axis=0)
|
||||
Change:
|
||||
parsing_ = sess.run(raw_output, feed_dict={'input:0': input})
|
||||
|
||||
5. To save model after sess.run(...) add:
|
||||
input_graph_def = tf.get_default_graph().as_graph_def()
|
||||
output_node = "Mean_3"
|
||||
output_graph_def = tf.graph_util.convert_variables_to_constants(sess, input_graph_def, output_node)
|
||||
|
||||
output_graph = "LIP_JPPNet.pb"
|
||||
with tf.gfile.GFile(output_graph, "wb") as f:
|
||||
f.write(output_graph_def.SerializeToString())
|
||||
'''
|
||||
|
||||
import argparse
|
||||
import os.path
|
||||
import numpy as np
|
||||
import cv2 as cv
|
||||
|
||||
|
||||
backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV,
|
||||
cv.dnn.DNN_BACKEND_VKCOM, cv.dnn.DNN_BACKEND_CUDA)
|
||||
targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD,
|
||||
cv.dnn.DNN_TARGET_HDDL, cv.dnn.DNN_TARGET_VULKAN, cv.dnn.DNN_TARGET_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16)
|
||||
|
||||
|
||||
def preprocess(image):
|
||||
"""
|
||||
Create 4-dimensional blob from image and flip image
|
||||
:param image: input image
|
||||
"""
|
||||
image_rev = np.flip(image, axis=1)
|
||||
input = cv.dnn.blobFromImages([image, image_rev], mean=(104.00698793, 116.66876762, 122.67891434))
|
||||
return input
|
||||
|
||||
|
||||
def run_net(input, model_path, backend, target):
|
||||
"""
|
||||
Read network and infer model
|
||||
:param model_path: path to JPPNet model
|
||||
:param backend: computation backend
|
||||
:param target: computation device
|
||||
"""
|
||||
net = cv.dnn.readNet(model_path)
|
||||
net.setPreferableBackend(backend)
|
||||
net.setPreferableTarget(target)
|
||||
net.setInput(input)
|
||||
out = net.forward()
|
||||
return out
|
||||
|
||||
|
||||
def postprocess(out, input_shape):
|
||||
"""
|
||||
Create a grayscale human segmentation
|
||||
:param out: network output
|
||||
:param input_shape: input image width and height
|
||||
"""
|
||||
# LIP classes
|
||||
# 0 Background
|
||||
# 1 Hat
|
||||
# 2 Hair
|
||||
# 3 Glove
|
||||
# 4 Sunglasses
|
||||
# 5 UpperClothes
|
||||
# 6 Dress
|
||||
# 7 Coat
|
||||
# 8 Socks
|
||||
# 9 Pants
|
||||
# 10 Jumpsuits
|
||||
# 11 Scarf
|
||||
# 12 Skirt
|
||||
# 13 Face
|
||||
# 14 LeftArm
|
||||
# 15 RightArm
|
||||
# 16 LeftLeg
|
||||
# 17 RightLeg
|
||||
# 18 LeftShoe
|
||||
# 19 RightShoe
|
||||
head_output, tail_output = np.split(out, indices_or_sections=[1], axis=0)
|
||||
head_output = head_output.squeeze(0)
|
||||
tail_output = tail_output.squeeze(0)
|
||||
|
||||
head_output = np.stack([cv.resize(img, dsize=input_shape) for img in head_output[:, ...]])
|
||||
tail_output = np.stack([cv.resize(img, dsize=input_shape) for img in tail_output[:, ...]])
|
||||
|
||||
tail_list = np.split(tail_output, indices_or_sections=list(range(1, 20)), axis=0)
|
||||
tail_list = [arr.squeeze(0) for arr in tail_list]
|
||||
tail_list_rev = [tail_list[i] for i in range(14)]
|
||||
tail_list_rev.extend([tail_list[15], tail_list[14], tail_list[17], tail_list[16], tail_list[19], tail_list[18]])
|
||||
tail_output_rev = np.stack(tail_list_rev, axis=0)
|
||||
tail_output_rev = np.flip(tail_output_rev, axis=2)
|
||||
raw_output_all = np.mean(np.stack([head_output, tail_output_rev], axis=0), axis=0, keepdims=True)
|
||||
raw_output_all = np.argmax(raw_output_all, axis=1)
|
||||
raw_output_all = raw_output_all.transpose(1, 2, 0)
|
||||
return raw_output_all
|
||||
|
||||
|
||||
def decode_labels(gray_image):
|
||||
"""
|
||||
Colorize image according to labels
|
||||
:param gray_image: grayscale human segmentation result
|
||||
"""
|
||||
height, width, _ = gray_image.shape
|
||||
colors = [(0, 0, 0), (128, 0, 0), (255, 0, 0), (0, 85, 0), (170, 0, 51), (255, 85, 0),
|
||||
(0, 0, 85), (0, 119, 221), (85, 85, 0), (0, 85, 85), (85, 51, 0), (52, 86, 128),
|
||||
(0, 128, 0), (0, 0, 255), (51, 170, 221), (0, 255, 255),(85, 255, 170),
|
||||
(170, 255, 85), (255, 255, 0), (255, 170, 0)]
|
||||
|
||||
segm = np.stack([colors[idx] for idx in gray_image.flatten()])
|
||||
segm = segm.reshape(height, width, 3).astype(np.uint8)
|
||||
segm = cv.cvtColor(segm, cv.COLOR_BGR2RGB)
|
||||
return segm
|
||||
|
||||
|
||||
def parse_human(image, model_path, backend=cv.dnn.DNN_BACKEND_OPENCV, target=cv.dnn.DNN_TARGET_CPU):
|
||||
"""
|
||||
Prepare input for execution, run net and postprocess output to parse human.
|
||||
:param image: input image
|
||||
:param model_path: path to JPPNet model
|
||||
:param backend: name of computation backend
|
||||
:param target: name of computation target
|
||||
"""
|
||||
input = preprocess(image)
|
||||
input_h, input_w = input.shape[2:]
|
||||
output = run_net(input, model_path, backend, target)
|
||||
grayscale_out = postprocess(output, (input_w, input_h))
|
||||
segmentation = decode_labels(grayscale_out)
|
||||
return segmentation
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='Use this script to run human parsing using JPPNet',
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('--input', '-i', required=True, help='Path to input image.')
|
||||
parser.add_argument('--model', '-m', default='lip_jppnet_384.pb', help='Path to pb model.')
|
||||
parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
|
||||
help="Choose one of computation backends: "
|
||||
"%d: automatically (by default), "
|
||||
"%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"%d: OpenCV implementation, "
|
||||
"%d: VKCOM, "
|
||||
"%d: CUDA"% backends)
|
||||
parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
|
||||
help='Choose one of target computation devices: '
|
||||
'%d: CPU target (by default), '
|
||||
'%d: OpenCL, '
|
||||
'%d: OpenCL fp16 (half-float precision), '
|
||||
'%d: NCS2 VPU, '
|
||||
'%d: HDDL VPU, '
|
||||
'%d: Vulkan, '
|
||||
'%d: CUDA, '
|
||||
'%d: CUDA fp16 (half-float preprocess)' % targets)
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
if not os.path.isfile(args.model):
|
||||
raise OSError("Model file does not exist")
|
||||
|
||||
image = cv.imread(args.input)
|
||||
output = parse_human(image, args.model, args.backend, args.target)
|
||||
winName = 'Deep learning human parsing in OpenCV'
|
||||
cv.namedWindow(winName, cv.WINDOW_AUTOSIZE)
|
||||
cv.imshow(winName, output)
|
||||
cv.waitKey()
|
205
3rdparty/opencv-4.5.4/samples/dnn/js_face_recognition.html
vendored
Normal file
@ -0,0 +1,205 @@
|
||||
<!DOCTYPE html>
|
||||
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<script async src="../../opencv.js" type="text/javascript"></script>
|
||||
<script src="../../utils.js" type="text/javascript"></script>
|
||||
|
||||
<script type='text/javascript'>
|
||||
var netDet = undefined, netRecogn = undefined;
|
||||
var persons = {};
|
||||
|
||||
//! [Run face detection model]
|
||||
function detectFaces(img) {
|
||||
var blob = cv.blobFromImage(img, 1, {width: 192, height: 144}, [104, 117, 123, 0], false, false);
|
||||
netDet.setInput(blob);
|
||||
var out = netDet.forward();
|
||||
|
||||
var faces = [];
|
||||
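// Each detection occupies 7 floats: [batchId, classId, confidence, left, top, right, bottom],
// with the box coordinates given relative to the image size.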
for (var i = 0, n = out.data32F.length; i < n; i += 7) {
|
||||
var confidence = out.data32F[i + 2];
|
||||
var left = out.data32F[i + 3] * img.cols;
|
||||
var top = out.data32F[i + 4] * img.rows;
|
||||
var right = out.data32F[i + 5] * img.cols;
|
||||
var bottom = out.data32F[i + 6] * img.rows;
|
||||
left = Math.min(Math.max(0, left), img.cols - 1);
|
||||
right = Math.min(Math.max(0, right), img.cols - 1);
|
||||
bottom = Math.min(Math.max(0, bottom), img.rows - 1);
|
||||
top = Math.min(Math.max(0, top), img.rows - 1);
|
||||
|
||||
if (confidence > 0.5 && left < right && top < bottom) {
|
||||
faces.push({x: left, y: top, width: right - left, height: bottom - top})
|
||||
}
|
||||
}
|
||||
blob.delete();
|
||||
out.delete();
|
||||
return faces;
|
||||
};
|
||||
//! [Run face detection model]
|
||||
|
||||
//! [Get 128 floating points feature vector]
|
||||
function face2vec(face) {
|
||||
var blob = cv.blobFromImage(face, 1.0 / 255, {width: 96, height: 96}, [0, 0, 0, 0], true, false)
|
||||
netRecogn.setInput(blob);
|
||||
var vec = netRecogn.forward();
|
||||
blob.delete();
|
||||
return vec;
|
||||
};
|
||||
//! [Get 128 floating points feature vector]
|
||||
|
||||
//! [Recognize]
|
||||
function recognize(face) {
|
||||
var vec = face2vec(face);
|
||||
|
||||
var bestMatchName = 'unknown';
|
||||
var bestMatchScore = 0.5; // Actually, the minimum is -1 but we use it as a threshold.
|
||||
for (name in persons) {
|
||||
var personVec = persons[name];
|
||||
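// OpenFace embeddings are (approximately) unit length, so the dot product
// behaves like a cosine similarity between the two face descriptors.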
var score = vec.dot(personVec);
|
||||
if (score > bestMatchScore) {
|
||||
bestMatchScore = score;
|
||||
bestMatchName = name;
|
||||
}
|
||||
}
|
||||
vec.delete();
|
||||
return bestMatchName;
|
||||
};
|
||||
//! [Recognize]
|
||||
|
||||
function loadModels(callback) {
|
||||
var utils = new Utils('');
|
||||
var proto = 'https://raw.githubusercontent.com/opencv/opencv/master/samples/dnn/face_detector/deploy_lowres.prototxt';
|
||||
var weights = 'https://raw.githubusercontent.com/opencv/opencv_3rdparty/dnn_samples_face_detector_20180205_fp16/res10_300x300_ssd_iter_140000_fp16.caffemodel';
|
||||
var recognModel = 'https://raw.githubusercontent.com/pyannote/pyannote-data/master/openface.nn4.small2.v1.t7';
|
||||
utils.createFileFromUrl('face_detector.prototxt', proto, () => {
|
||||
document.getElementById('status').innerHTML = 'Downloading face_detector.caffemodel';
|
||||
utils.createFileFromUrl('face_detector.caffemodel', weights, () => {
|
||||
document.getElementById('status').innerHTML = 'Downloading OpenFace model';
|
||||
utils.createFileFromUrl('face_recognition.t7', recognModel, () => {
|
||||
document.getElementById('status').innerHTML = '';
|
||||
netDet = cv.readNetFromCaffe('face_detector.prototxt', 'face_detector.caffemodel');
|
||||
netRecogn = cv.readNetFromTorch('face_recognition.t7');
|
||||
callback();
|
||||
});
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
function main() {
|
||||
// Create a camera object.
|
||||
var output = document.getElementById('output');
|
||||
var camera = document.createElement("video");
|
||||
camera.setAttribute("width", output.width);
|
||||
camera.setAttribute("height", output.height);
|
||||
|
||||
// Ask the user for permission to use the camera.
|
||||
navigator.mediaDevices.getUserMedia({video: true, audio: false})
|
||||
.then(function(stream) {
|
||||
camera.srcObject = stream;
|
||||
camera.onloadedmetadata = function(e) {
|
||||
camera.play();
|
||||
};
|
||||
});
|
||||
|
||||
//! [Open a camera stream]
|
||||
var cap = new cv.VideoCapture(camera);
|
||||
var frame = new cv.Mat(camera.height, camera.width, cv.CV_8UC4);
|
||||
var frameBGR = new cv.Mat(camera.height, camera.width, cv.CV_8UC3);
|
||||
//! [Open a camera stream]
|
||||
|
||||
//! [Add a person]
|
||||
document.getElementById('addPersonButton').onclick = function() {
|
||||
var rects = detectFaces(frameBGR);
|
||||
if (rects.length > 0) {
|
||||
var face = frameBGR.roi(rects[0]);
|
||||
|
||||
var name = prompt('Say your name:');
|
||||
var cell = document.getElementById("targetNames").insertCell(0);
|
||||
cell.innerHTML = name;
|
||||
|
||||
persons[name] = face2vec(face).clone();
|
||||
|
||||
var canvas = document.createElement("canvas");
|
||||
canvas.setAttribute("width", 96);
|
||||
canvas.setAttribute("height", 96);
|
||||
var cell = document.getElementById("targetImgs").insertCell(0);
|
||||
cell.appendChild(canvas);
|
||||
|
||||
var faceResized = new cv.Mat(canvas.height, canvas.width, cv.CV_8UC3);
|
||||
cv.resize(face, faceResized, {width: canvas.width, height: canvas.height});
|
||||
cv.cvtColor(faceResized, faceResized, cv.COLOR_BGR2RGB);
|
||||
cv.imshow(canvas, faceResized);
|
||||
faceResized.delete();
|
||||
}
|
||||
};
|
||||
//! [Add a person]
|
||||
|
||||
//! [Define frames processing]
|
||||
var isRunning = false;
|
||||
const FPS = 30; // Target number of frames processed per second.
|
||||
function captureFrame() {
|
||||
var begin = Date.now();
|
||||
cap.read(frame); // Read a frame from camera
|
||||
cv.cvtColor(frame, frameBGR, cv.COLOR_RGBA2BGR);
|
||||
|
||||
var faces = detectFaces(frameBGR);
|
||||
faces.forEach(function(rect) {
|
||||
cv.rectangle(frame, {x: rect.x, y: rect.y}, {x: rect.x + rect.width, y: rect.y + rect.height}, [0, 255, 0, 255]);
|
||||
|
||||
var face = frameBGR.roi(rect);
|
||||
var name = recognize(face);
|
||||
cv.putText(frame, name, {x: rect.x, y: rect.y}, cv.FONT_HERSHEY_SIMPLEX, 1.0, [0, 255, 0, 255]);
|
||||
});
|
||||
|
||||
cv.imshow(output, frame);
|
||||
|
||||
// Loop this function.
|
||||
if (isRunning) {
|
||||
var delay = 1000 / FPS - (Date.now() - begin);
|
||||
setTimeout(captureFrame, delay);
|
||||
}
|
||||
};
|
||||
//! [Define frames processing]
|
||||
|
||||
document.getElementById('startStopButton').onclick = function toggle() {
|
||||
if (isRunning) {
|
||||
isRunning = false;
|
||||
document.getElementById('startStopButton').innerHTML = 'Start';
|
||||
document.getElementById('addPersonButton').disabled = true;
|
||||
} else {
|
||||
function run() {
|
||||
isRunning = true;
|
||||
captureFrame();
|
||||
document.getElementById('startStopButton').innerHTML = 'Stop';
|
||||
document.getElementById('startStopButton').disabled = false;
|
||||
document.getElementById('addPersonButton').disabled = false;
|
||||
}
|
||||
if (netDet == undefined || netRecogn == undefined) {
|
||||
document.getElementById('startStopButton').disabled = true;
|
||||
loadModels(run); // Load models and run a pipeline;
|
||||
} else {
|
||||
run();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
document.getElementById('startStopButton').disabled = false;
|
||||
};
|
||||
</script>
|
||||
|
||||
</head>
|
||||
|
||||
<body onload="cv['onRuntimeInitialized']=()=>{ main() }">
|
||||
<button id="startStopButton" type="button" disabled="true">Start</button>
|
||||
<div id="status"></div>
|
||||
<canvas id="output" width=640 height=480 style="max-width: 100%"></canvas>
|
||||
|
||||
<table>
|
||||
<tr id="targetImgs"></tr>
|
||||
<tr id="targetNames"></tr>
|
||||
</table>
|
||||
<button id="addPersonButton" type="button" disabled="true">Add a person</button>
|
||||
</body>
|
||||
|
||||
</html>
|
143
3rdparty/opencv-4.5.4/samples/dnn/mask_rcnn.py
vendored
Normal file
@ -0,0 +1,143 @@
|
||||
import cv2 as cv
|
||||
import argparse
|
||||
import numpy as np
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
'Use this script to run Mask-RCNN object detection and semantic '
|
||||
'segmentation network from TensorFlow Object Detection API.')
|
||||
parser.add_argument('--input', help='Path to input image or video file. Skip this argument to capture frames from a camera.')
|
||||
parser.add_argument('--model', required=True, help='Path to a .pb file with weights.')
|
||||
parser.add_argument('--config', required=True, help='Path to a .pbtxt file that contains network configuration.')
|
||||
parser.add_argument('--classes', help='Optional path to a text file with names of classes.')
|
||||
parser.add_argument('--colors', help='Optional path to a text file with colors for every class. '
|
||||
'Every color is represented with three values from 0 to 255 in BGR channel order.')
|
||||
parser.add_argument('--width', type=int, default=800,
|
||||
help='Preprocess input image by resizing to a specific width.')
|
||||
parser.add_argument('--height', type=int, default=800,
|
||||
help='Preprocess input image by resizing to a specific height.')
|
||||
parser.add_argument('--thr', type=float, default=0.5, help='Confidence threshold')
|
||||
args = parser.parse_args()
|
||||
|
||||
np.random.seed(324)
|
||||
|
||||
# Load names of classes
|
||||
classes = None
|
||||
if args.classes:
|
||||
with open(args.classes, 'rt') as f:
|
||||
classes = f.read().rstrip('\n').split('\n')
|
||||
|
||||
# Load colors
|
||||
colors = None
|
||||
if args.colors:
|
||||
with open(args.colors, 'rt') as f:
|
||||
colors = [np.array(color.split(' '), np.uint8) for color in f.read().rstrip('\n').split('\n')]
|
||||
|
||||
legend = None
|
||||
def showLegend(classes):
|
||||
global legend
|
||||
if classes is not None and legend is None:
|
||||
blockHeight = 30
|
||||
assert(len(classes) == len(colors))
|
||||
|
||||
legend = np.zeros((blockHeight * len(colors), 200, 3), np.uint8)
|
||||
for i in range(len(classes)):
|
||||
block = legend[i * blockHeight:(i + 1) * blockHeight]
|
||||
block[:,:] = colors[i]
|
||||
cv.putText(block, classes[i], (0, blockHeight//2), cv.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
|
||||
|
||||
cv.namedWindow('Legend', cv.WINDOW_NORMAL)
|
||||
cv.imshow('Legend', legend)
|
||||
classes = None
|
||||
|
||||
|
||||
def drawBox(frame, classId, conf, left, top, right, bottom):
|
||||
# Draw a bounding box.
|
||||
cv.rectangle(frame, (left, top), (right, bottom), (0, 255, 0))
|
||||
|
||||
label = '%.2f' % conf
|
||||
|
||||
# Print a label of class.
|
||||
if classes:
|
||||
assert(classId < len(classes))
|
||||
label = '%s: %s' % (classes[classId], label)
|
||||
|
||||
labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
|
||||
top = max(top, labelSize[1])
|
||||
cv.rectangle(frame, (left, top - labelSize[1]), (left + labelSize[0], top + baseLine), (255, 255, 255), cv.FILLED)
|
||||
cv.putText(frame, label, (left, top), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
|
||||
|
||||
|
||||
# Load a network
|
||||
net = cv.dnn.readNet(cv.samples.findFile(args.model), cv.samples.findFile(args.config))
|
||||
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
|
||||
|
||||
winName = 'Mask-RCNN in OpenCV'
|
||||
cv.namedWindow(winName, cv.WINDOW_NORMAL)
|
||||
|
||||
cap = cv.VideoCapture(cv.samples.findFileOrKeep(args.input) if args.input else 0)
|
||||
legend = None
|
||||
while cv.waitKey(1) < 0:
|
||||
hasFrame, frame = cap.read()
|
||||
if not hasFrame:
|
||||
cv.waitKey()
|
||||
break
|
||||
|
||||
frameH = frame.shape[0]
|
||||
frameW = frame.shape[1]
|
||||
|
||||
# Create a 4D blob from a frame.
|
||||
blob = cv.dnn.blobFromImage(frame, size=(args.width, args.height), swapRB=True, crop=False)
|
||||
|
||||
# Run a model
|
||||
net.setInput(blob)
|
||||
|
||||
boxes, masks = net.forward(['detection_out_final', 'detection_masks'])
|
||||
|
||||
numClasses = masks.shape[1]
|
||||
numDetections = boxes.shape[2]
|
||||
|
||||
# Draw segmentation
|
||||
if not colors:
|
||||
# Generate colors
|
||||
colors = [np.array([0, 0, 0], np.uint8)]
|
||||
for i in range(1, numClasses + 1):
|
||||
colors.append((colors[i - 1] + np.random.randint(0, 256, [3], np.uint8)) / 2)
|
||||
del colors[0]
|
||||
|
||||
boxesToDraw = []
|
||||
for i in range(numDetections):
|
||||
box = boxes[0, 0, i]
|
||||
mask = masks[i]
|
||||
score = box[2]
|
||||
if score > args.thr:
|
||||
classId = int(box[1])
|
||||
left = int(frameW * box[3])
|
||||
top = int(frameH * box[4])
|
||||
right = int(frameW * box[5])
|
||||
bottom = int(frameH * box[6])
|
||||
|
||||
left = max(0, min(left, frameW - 1))
|
||||
top = max(0, min(top, frameH - 1))
|
||||
right = max(0, min(right, frameW - 1))
|
||||
bottom = max(0, min(bottom, frameH - 1))
|
||||
|
||||
boxesToDraw.append([frame, classId, score, left, top, right, bottom])
|
||||
|
||||
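# Each detection carries a low-resolution per-class mask (typically 15x15);
# resize it to the box size and threshold it to get a binary object mask.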
classMask = mask[classId]
|
||||
classMask = cv.resize(classMask, (right - left + 1, bottom - top + 1))
|
||||
mask = (classMask > 0.5)
|
||||
|
||||
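# Blend the class color with the original pixels inside the mask (70% color, 30% image).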
roi = frame[top:bottom+1, left:right+1][mask]
|
||||
frame[top:bottom+1, left:right+1][mask] = (0.7 * colors[classId] + 0.3 * roi).astype(np.uint8)
|
||||
|
||||
for box in boxesToDraw:
|
||||
drawBox(*box)
|
||||
|
||||
# Put efficiency information.
|
||||
t, _ = net.getPerfProfile()
|
||||
label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
|
||||
cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
|
||||
|
||||
showLegend(classes)
|
||||
|
||||
cv.imshow(winName, frame)
|
133
3rdparty/opencv-4.5.4/samples/dnn/mobilenet_ssd_accuracy.py
vendored
Normal file
@ -0,0 +1,133 @@
|
||||
from __future__ import print_function
|
||||
# Script to evaluate MobileNet-SSD object detection model trained in TensorFlow
|
||||
# using both TensorFlow and OpenCV. Example:
|
||||
#
|
||||
# python mobilenet_ssd_accuracy.py \
|
||||
# --weights=frozen_inference_graph.pb \
|
||||
# --prototxt=ssd_mobilenet_v1_coco.pbtxt \
|
||||
# --images=val2017 \
|
||||
# --annotations=annotations/instances_val2017.json
|
||||
#
|
||||
# Tested on COCO 2017 object detection dataset, http://cocodataset.org/#download
|
||||
import os
|
||||
import cv2 as cv
|
||||
import json
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Evaluate MobileNet-SSD model using both TensorFlow and OpenCV. '
|
||||
'COCO evaluation framework is required: http://cocodataset.org')
|
||||
parser.add_argument('--weights', required=True,
|
||||
help='Path to frozen_inference_graph.pb of MobileNet-SSD model. '
|
||||
'Download it from http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_coco_11_06_2017.tar.gz')
|
||||
parser.add_argument('--prototxt', help='Path to ssd_mobilenet_v1_coco.pbtxt from opencv_extra.', required=True)
|
||||
parser.add_argument('--images', help='Path to COCO validation images directory.', required=True)
|
||||
parser.add_argument('--annotations', help='Path to COCO annotations file.', required=True)
|
||||
args = parser.parse_args()
|
||||
|
||||
### Get OpenCV predictions #####################################################
|
||||
net = cv.dnn.readNetFromTensorflow(cv.samples.findFile(args.weights), cv.samples.findFile(args.prototxt))
|
||||
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
|
||||
|
||||
detections = []
|
||||
for imgName in os.listdir(args.images):
|
||||
inp = cv.imread(cv.samples.findFile(os.path.join(args.images, imgName)))
|
||||
rows = inp.shape[0]
|
||||
cols = inp.shape[1]
|
||||
inp = cv.resize(inp, (300, 300))
|
||||
|
||||
net.setInput(cv.dnn.blobFromImage(inp, 1.0/127.5, (300, 300), (127.5, 127.5, 127.5), True))
|
||||
out = net.forward()
|
||||
|
||||
for i in range(out.shape[2]):
|
||||
score = float(out[0, 0, i, 2])
|
||||
# Confidence threshold is in prototxt.
|
||||
classId = int(out[0, 0, i, 1])
|
||||
|
||||
x = out[0, 0, i, 3] * cols
|
||||
y = out[0, 0, i, 4] * rows
|
||||
w = out[0, 0, i, 5] * cols - x
|
||||
h = out[0, 0, i, 6] * rows - y
|
||||
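# Derive the numeric COCO image id from the file name, e.g. '000000397133.jpg' -> 397133.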
detections.append({
|
||||
"image_id": int(imgName.rstrip('0')[:imgName.rfind('.')]),
|
||||
"category_id": classId,
|
||||
"bbox": [x, y, w, h],
|
||||
"score": score
|
||||
})
|
||||
|
||||
with open('cv_result.json', 'wt') as f:
|
||||
json.dump(detections, f)
|
||||
|
||||
### Get TensorFlow predictions #################################################
|
||||
import tensorflow as tf
|
||||
|
||||
with tf.gfile.FastGFile(args.weights, 'rb') as f:
|
||||
# Load the model
|
||||
graph_def = tf.GraphDef()
|
||||
graph_def.ParseFromString(f.read())
|
||||
|
||||
with tf.Session() as sess:
|
||||
# Restore session
|
||||
sess.graph.as_default()
|
||||
tf.import_graph_def(graph_def, name='')
|
||||
|
||||
detections = []
|
||||
for imgName in os.listdir(args.images):
|
||||
inp = cv.imread(os.path.join(args.images, imgName))
|
||||
rows = inp.shape[0]
|
||||
cols = inp.shape[1]
|
||||
inp = cv.resize(inp, (300, 300))
|
||||
inp = inp[:, :, [2, 1, 0]] # BGR2RGB
|
||||
out = sess.run([sess.graph.get_tensor_by_name('num_detections:0'),
|
||||
sess.graph.get_tensor_by_name('detection_scores:0'),
|
||||
sess.graph.get_tensor_by_name('detection_boxes:0'),
|
||||
sess.graph.get_tensor_by_name('detection_classes:0')],
|
||||
feed_dict={'image_tensor:0': inp.reshape(1, inp.shape[0], inp.shape[1], 3)})
|
||||
num_detections = int(out[0][0])
|
||||
for i in range(num_detections):
|
||||
classId = int(out[3][0][i])
|
||||
score = float(out[1][0][i])
|
||||
bbox = [float(v) for v in out[2][0][i]]
|
||||
if score > 0.01:
|
||||
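# TensorFlow boxes come as [ymin, xmin, ymax, xmax] in relative coordinates;
# convert them to COCO-style [x, y, width, height] in pixels.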
x = bbox[1] * cols
|
||||
y = bbox[0] * rows
|
||||
w = bbox[3] * cols - x
|
||||
h = bbox[2] * rows - y
|
||||
detections.append({
|
||||
"image_id": int(imgName.rstrip('0')[:imgName.rfind('.')]),
|
||||
"category_id": classId,
|
||||
"bbox": [x, y, w, h],
|
||||
"score": score
|
||||
})
|
||||
|
||||
with open('tf_result.json', 'wt') as f:
|
||||
json.dump(detections, f)
|
||||
|
||||
### Evaluation part ############################################################
|
||||
|
||||
# %matplotlib inline
|
||||
import matplotlib.pyplot as plt
|
||||
from pycocotools.coco import COCO
|
||||
from pycocotools.cocoeval import COCOeval
|
||||
import numpy as np
|
||||
import skimage.io as io
|
||||
import pylab
|
||||
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
|
||||
|
||||
annType = ['segm','bbox','keypoints']
|
||||
annType = annType[1] #specify type here
|
||||
prefix = 'person_keypoints' if annType=='keypoints' else 'instances'
|
||||
print('Running demo for *%s* results.'%(annType))
|
||||
|
||||
#initialize COCO ground truth api
|
||||
cocoGt=COCO(args.annotations)
|
||||
|
||||
#initialize COCO detections api
|
||||
for resFile in ['tf_result.json', 'cv_result.json']:
|
||||
print(resFile)
|
||||
cocoDt=cocoGt.loadRes(resFile)
|
||||
|
||||
cocoEval = COCOeval(cocoGt,cocoDt,annType)
|
||||
cocoEval.evaluate()
|
||||
cocoEval.accumulate()
|
||||
cocoEval.summarize()
|
166
3rdparty/opencv-4.5.4/samples/dnn/models.yml
vendored
Normal file
@ -0,0 +1,166 @@
|
||||
%YAML:1.0
|
||||
---
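# Each entry provides download info (url, sha1) together with the preprocessing
# parameters (mean, scale, width, height, rgb) that the samples pass to blobFromImage.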
|
||||
################################################################################
|
||||
# Object detection models.
|
||||
################################################################################
|
||||
|
||||
# OpenCV's face detection network
|
||||
opencv_fd:
|
||||
load_info:
|
||||
url: "https://github.com/opencv/opencv_3rdparty/raw/dnn_samples_face_detector_20170830/res10_300x300_ssd_iter_140000.caffemodel"
|
||||
sha1: "15aa726b4d46d9f023526d85537db81cbc8dd566"
|
||||
model: "opencv_face_detector.caffemodel"
|
||||
config: "opencv_face_detector.prototxt"
|
||||
mean: [104, 177, 123]
|
||||
scale: 1.0
|
||||
width: 300
|
||||
height: 300
|
||||
rgb: false
|
||||
sample: "object_detection"
|
||||
|
||||
# YOLO4 object detection family from Darknet (https://github.com/AlexeyAB/darknet)
|
||||
# YOLO object detection family from Darknet (https://pjreddie.com/darknet/yolo/)
|
||||
# Can be used for YOLOv2, TinyYolov2, YOLOv3, YOLOv4 and TinyYolov4
|
||||
yolo:
|
||||
load_info:
|
||||
url: "https://pjreddie.com/media/files/yolov3.weights"
|
||||
sha1: "520878f12e97cf820529daea502acca380f1cb8e"
|
||||
model: "yolov3.weights"
|
||||
config: "yolov3.cfg"
|
||||
mean: [0, 0, 0]
|
||||
scale: 0.00392
|
||||
width: 416
|
||||
height: 416
|
||||
rgb: true
|
||||
classes: "object_detection_classes_yolov3.txt"
|
||||
sample: "object_detection"
|
||||
|
||||
tiny-yolo-voc:
|
||||
load_info:
|
||||
url: "https://pjreddie.com/media/files/yolov2-tiny-voc.weights"
|
||||
sha1: "24b4bd049fc4fa5f5e95f684a8967e65c625dff9"
|
||||
model: "tiny-yolo-voc.weights"
|
||||
config: "tiny-yolo-voc.cfg"
|
||||
mean: [0, 0, 0]
|
||||
scale: 0.00392
|
||||
width: 416
|
||||
height: 416
|
||||
rgb: true
|
||||
classes: "object_detection_classes_pascal_voc.txt"
|
||||
sample: "object_detection"
|
||||
|
||||
# Caffe implementation of SSD model from https://github.com/chuanqi305/MobileNet-SSD
|
||||
ssd_caffe:
|
||||
load_info:
|
||||
url: "https://drive.google.com/uc?export=download&id=0B3gersZ2cHIxRm5PMWRoTkdHdHc"
|
||||
sha1: "994d30a8afaa9e754d17d2373b2d62a7dfbaaf7a"
|
||||
model: "MobileNetSSD_deploy.caffemodel"
|
||||
config: "MobileNetSSD_deploy.prototxt"
|
||||
mean: [127.5, 127.5, 127.5]
|
||||
scale: 0.007843
|
||||
width: 300
|
||||
height: 300
|
||||
rgb: false
|
||||
classes: "object_detection_classes_pascal_voc.txt"
|
||||
sample: "object_detection"
|
||||
|
||||
# TensorFlow implementation of SSD model from https://github.com/tensorflow/models/tree/master/research/object_detection
|
||||
ssd_tf:
|
||||
load_info:
|
||||
url: "http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_coco_2017_11_17.tar.gz"
|
||||
sha1: "9e4bcdd98f4c6572747679e4ce570de4f03a70e2"
|
||||
download_sha: "6157ddb6da55db2da89dd561eceb7f944928e317"
|
||||
download_name: "ssd_mobilenet_v1_coco_2017_11_17.tar.gz"
|
||||
member: "ssd_mobilenet_v1_coco_2017_11_17/frozen_inference_graph.pb"
|
||||
model: "ssd_mobilenet_v1_coco_2017_11_17.pb"
|
||||
config: "ssd_mobilenet_v1_coco_2017_11_17.pbtxt"
|
||||
mean: [0, 0, 0]
|
||||
scale: 1.0
|
||||
width: 300
|
||||
height: 300
|
||||
rgb: true
|
||||
classes: "object_detection_classes_coco.txt"
|
||||
sample: "object_detection"
|
||||
|
||||
# TensorFlow implementation of Faster-RCNN model from https://github.com/tensorflow/models/tree/master/research/object_detection
|
||||
faster_rcnn_tf:
|
||||
load_info:
|
||||
url: "http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_v2_coco_2018_01_28.tar.gz"
|
||||
sha1: "f2e4bf386b9bb3e25ddfcbbd382c20f417e444f3"
|
||||
download_sha: "c710f25e5c6a3ce85fe793d5bf266d581ab1c230"
|
||||
download_name: "faster_rcnn_inception_v2_coco_2018_01_28.tar.gz"
|
||||
member: "faster_rcnn_inception_v2_coco_2018_01_28/frozen_inference_graph.pb"
|
||||
model: "faster_rcnn_inception_v2_coco_2018_01_28.pb"
|
||||
config: "faster_rcnn_inception_v2_coco_2018_01_28.pbtxt"
|
||||
mean: [0, 0, 0]
|
||||
scale: 1.0
|
||||
width: 800
|
||||
height: 600
|
||||
rgb: true
|
||||
sample: "object_detection"
|
||||
|
||||
################################################################################
|
||||
# Image classification models.
|
||||
################################################################################
|
||||
|
||||
# SqueezeNet v1.1 from https://github.com/DeepScale/SqueezeNet
|
||||
squeezenet:
|
||||
load_info:
|
||||
url: "https://raw.githubusercontent.com/DeepScale/SqueezeNet/b5c3f1a23713c8b3fd7b801d229f6b04c64374a5/SqueezeNet_v1.1/squeezenet_v1.1.caffemodel"
|
||||
sha1: "3397f026368a45ae236403ccc81cfcbe8ebe1bd0"
|
||||
model: "squeezenet_v1.1.caffemodel"
|
||||
config: "squeezenet_v1.1.prototxt"
|
||||
mean: [0, 0, 0]
|
||||
scale: 1.0
|
||||
width: 227
|
||||
height: 227
|
||||
rgb: false
|
||||
classes: "classification_classes_ILSVRC2012.txt"
|
||||
sample: "classification"
|
||||
|
||||
# Googlenet from https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet
|
||||
googlenet:
|
||||
load_info:
|
||||
url: "http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel"
|
||||
sha1: "405fc5acd08a3bb12de8ee5e23a96bec22f08204"
|
||||
model: "bvlc_googlenet.caffemodel"
|
||||
config: "bvlc_googlenet.prototxt"
|
||||
mean: [104, 117, 123]
|
||||
scale: 1.0
|
||||
width: 224
|
||||
height: 224
|
||||
rgb: false
|
||||
classes: "classification_classes_ILSVRC2012.txt"
|
||||
sample: "classification"
|
||||
|
||||
################################################################################
|
||||
# Semantic segmentation models.
|
||||
################################################################################
|
||||
|
||||
# ENet road scene segmentation network from https://github.com/e-lab/ENet-training
|
||||
# Works fine for different input sizes.
|
||||
enet:
|
||||
load_info:
|
||||
url: "https://www.dropbox.com/s/tdde0mawbi5dugq/Enet-model-best.net?dl=1"
|
||||
sha1: "b4123a73bf464b9ebe9cfc4ab9c2d5c72b161315"
|
||||
model: "Enet-model-best.net"
|
||||
mean: [0, 0, 0]
|
||||
scale: 0.00392
|
||||
width: 512
|
||||
height: 256
|
||||
rgb: true
|
||||
classes: "enet-classes.txt"
|
||||
sample: "segmentation"
|
||||
|
||||
fcn8s:
|
||||
load_info:
|
||||
url: "http://dl.caffe.berkeleyvision.org/fcn8s-heavy-pascal.caffemodel"
|
||||
sha1: "c449ea74dd7d83751d1357d6a8c323fcf4038962"
|
||||
model: "fcn8s-heavy-pascal.caffemodel"
|
||||
config: "fcn8s-heavy-pascal.prototxt"
|
||||
mean: [0, 0, 0]
|
||||
scale: 1.0
|
||||
width: 500
|
||||
height: 500
|
||||
rgb: false
|
||||
sample: "segmentation"
|
476
3rdparty/opencv-4.5.4/samples/dnn/object_detection.cpp
vendored
Normal file
@ -0,0 +1,476 @@
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
|
||||
#include <opencv2/dnn.hpp>
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
|
||||
#ifdef CV_CXX11
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <queue>
|
||||
#endif
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
std::string keys =
|
||||
"{ help h | | Print help message. }"
|
||||
"{ @alias | | An alias name of model to extract preprocessing parameters from models.yml file. }"
|
||||
"{ zoo | models.yml | An optional path to file with preprocessing parameters }"
|
||||
"{ device | 0 | camera device number. }"
|
||||
"{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera. }"
|
||||
"{ framework f | | Optional name of an origin framework of the model. Detect it automatically if it does not set. }"
|
||||
"{ classes | | Optional path to a text file with names of classes to label detected objects. }"
|
||||
"{ thr | .5 | Confidence threshold. }"
|
||||
"{ nms | .4 | Non-maximum suppression threshold. }"
|
||||
"{ backend | 0 | Choose one of computation backends: "
|
||||
"0: automatically (by default), "
|
||||
"1: Halide language (http://halide-lang.org/), "
|
||||
"2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"3: OpenCV implementation, "
|
||||
"4: VKCOM, "
|
||||
"5: CUDA }"
|
||||
"{ target | 0 | Choose one of target computation devices: "
|
||||
"0: CPU target (by default), "
|
||||
"1: OpenCL, "
|
||||
"2: OpenCL fp16 (half-float precision), "
|
||||
"3: VPU, "
|
||||
"4: Vulkan, "
|
||||
"6: CUDA, "
|
||||
"7: CUDA fp16 (half-float preprocess) }"
|
||||
"{ async | 0 | Number of asynchronous forwards at the same time. "
|
||||
"Choose 0 for synchronous mode }";
|
||||
|
||||
using namespace cv;
|
||||
using namespace dnn;
|
||||
|
||||
float confThreshold, nmsThreshold;
|
||||
std::vector<std::string> classes;
|
||||
|
||||
inline void preprocess(const Mat& frame, Net& net, Size inpSize, float scale,
|
||||
const Scalar& mean, bool swapRB);
|
||||
|
||||
void postprocess(Mat& frame, const std::vector<Mat>& out, Net& net, int backend);
|
||||
|
||||
void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame);
|
||||
|
||||
void callback(int pos, void* userdata);
|
||||
|
||||
#ifdef CV_CXX11
|
||||
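// Thread-safe FIFO that also counts pushed items and measures their rate;
// used below to report camera FPS, network FPS and the number of skipped frames.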
template <typename T>
|
||||
class QueueFPS : public std::queue<T>
|
||||
{
|
||||
public:
|
||||
QueueFPS() : counter(0) {}
|
||||
|
||||
void push(const T& entry)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
|
||||
std::queue<T>::push(entry);
|
||||
counter += 1;
|
||||
if (counter == 1)
|
||||
{
|
||||
// Start counting from a second frame (warmup).
|
||||
tm.reset();
|
||||
tm.start();
|
||||
}
|
||||
}
|
||||
|
||||
T get()
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
T entry = this->front();
|
||||
this->pop();
|
||||
return entry;
|
||||
}
|
||||
|
||||
float getFPS()
|
||||
{
|
||||
tm.stop();
|
||||
double fps = counter / tm.getTimeSec();
|
||||
tm.start();
|
||||
return static_cast<float>(fps);
|
||||
}
|
||||
|
||||
void clear()
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
while (!this->empty())
|
||||
this->pop();
|
||||
}
|
||||
|
||||
unsigned int counter;
|
||||
|
||||
private:
|
||||
TickMeter tm;
|
||||
std::mutex mutex;
|
||||
};
|
||||
#endif // CV_CXX11
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
CommandLineParser parser(argc, argv, keys);
|
||||
|
||||
const std::string modelName = parser.get<String>("@alias");
|
||||
const std::string zooFile = parser.get<String>("zoo");
|
||||
|
||||
keys += genPreprocArguments(modelName, zooFile);
|
||||
|
||||
parser = CommandLineParser(argc, argv, keys);
|
||||
parser.about("Use this script to run object detection deep learning networks using OpenCV.");
|
||||
if (argc == 1 || parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
confThreshold = parser.get<float>("thr");
|
||||
nmsThreshold = parser.get<float>("nms");
|
||||
float scale = parser.get<float>("scale");
|
||||
Scalar mean = parser.get<Scalar>("mean");
|
||||
bool swapRB = parser.get<bool>("rgb");
|
||||
int inpWidth = parser.get<int>("width");
|
||||
int inpHeight = parser.get<int>("height");
|
||||
size_t asyncNumReq = parser.get<int>("async");
|
||||
CV_Assert(parser.has("model"));
|
||||
std::string modelPath = findFile(parser.get<String>("model"));
|
||||
std::string configPath = findFile(parser.get<String>("config"));
|
||||
|
||||
// Open file with classes names.
|
||||
if (parser.has("classes"))
|
||||
{
|
||||
std::string file = parser.get<String>("classes");
|
||||
std::ifstream ifs(file.c_str());
|
||||
if (!ifs.is_open())
|
||||
CV_Error(Error::StsError, "File " + file + " not found");
|
||||
std::string line;
|
||||
while (std::getline(ifs, line))
|
||||
{
|
||||
classes.push_back(line);
|
||||
}
|
||||
}
|
||||
|
||||
// Load a model.
|
||||
Net net = readNet(modelPath, configPath, parser.get<String>("framework"));
|
||||
int backend = parser.get<int>("backend");
|
||||
net.setPreferableBackend(backend);
|
||||
net.setPreferableTarget(parser.get<int>("target"));
|
||||
std::vector<String> outNames = net.getUnconnectedOutLayersNames();
|
||||
|
||||
// Create a window
|
||||
static const std::string kWinName = "Deep learning object detection in OpenCV";
|
||||
namedWindow(kWinName, WINDOW_NORMAL);
|
||||
int initialConf = (int)(confThreshold * 100);
|
||||
createTrackbar("Confidence threshold, %", kWinName, &initialConf, 99, callback);
|
||||
|
||||
// Open a video file or an image file or a camera stream.
|
||||
VideoCapture cap;
|
||||
if (parser.has("input"))
|
||||
cap.open(parser.get<String>("input"));
|
||||
else
|
||||
cap.open(parser.get<int>("device"));
|
||||
|
||||
#ifdef CV_CXX11
|
||||
bool process = true;
|
||||
|
||||
// Frames capturing thread
|
||||
QueueFPS<Mat> framesQueue;
|
||||
std::thread framesThread([&](){
|
||||
Mat frame;
|
||||
while (process)
|
||||
{
|
||||
cap >> frame;
|
||||
if (!frame.empty())
|
||||
framesQueue.push(frame.clone());
|
||||
else
|
||||
break;
|
||||
}
|
||||
});
|
||||
|
||||
// Frames processing thread
|
||||
QueueFPS<Mat> processedFramesQueue;
|
||||
QueueFPS<std::vector<Mat> > predictionsQueue;
|
||||
std::thread processingThread([&](){
|
||||
std::queue<AsyncArray> futureOutputs;
|
||||
Mat blob;
|
||||
while (process)
|
||||
{
|
||||
// Get the next frame
|
||||
Mat frame;
|
||||
{
|
||||
if (!framesQueue.empty())
|
||||
{
|
||||
frame = framesQueue.get();
|
||||
if (asyncNumReq)
|
||||
{
|
||||
if (futureOutputs.size() == asyncNumReq)
|
||||
frame = Mat();
|
||||
}
|
||||
else
|
||||
framesQueue.clear(); // Skip the rest of frames
|
||||
}
|
||||
}
|
||||
|
||||
// Process the frame
|
||||
if (!frame.empty())
|
||||
{
|
||||
preprocess(frame, net, Size(inpWidth, inpHeight), scale, mean, swapRB);
|
||||
processedFramesQueue.push(frame);
|
||||
|
||||
if (asyncNumReq)
|
||||
{
|
||||
futureOutputs.push(net.forwardAsync());
|
||||
}
|
||||
else
|
||||
{
|
||||
std::vector<Mat> outs;
|
||||
net.forward(outs, outNames);
|
||||
predictionsQueue.push(outs);
|
||||
}
|
||||
}
|
||||
|
||||
while (!futureOutputs.empty() &&
|
||||
futureOutputs.front().wait_for(std::chrono::seconds(0)))
|
||||
{
|
||||
AsyncArray async_out = futureOutputs.front();
|
||||
futureOutputs.pop();
|
||||
Mat out;
|
||||
async_out.get(out);
|
||||
predictionsQueue.push({out});
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Postprocessing and rendering loop
|
||||
while (waitKey(1) < 0)
|
||||
{
|
||||
if (predictionsQueue.empty())
|
||||
continue;
|
||||
|
||||
std::vector<Mat> outs = predictionsQueue.get();
|
||||
Mat frame = processedFramesQueue.get();
|
||||
|
||||
postprocess(frame, outs, net, backend);
|
||||
|
||||
if (predictionsQueue.counter > 1)
|
||||
{
|
||||
std::string label = format("Camera: %.2f FPS", framesQueue.getFPS());
|
||||
putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
|
||||
label = format("Network: %.2f FPS", predictionsQueue.getFPS());
|
||||
putText(frame, label, Point(0, 30), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
|
||||
label = format("Skipped frames: %d", framesQueue.counter - predictionsQueue.counter);
|
||||
putText(frame, label, Point(0, 45), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
}
|
||||
imshow(kWinName, frame);
|
||||
}
|
||||
|
||||
process = false;
|
||||
framesThread.join();
|
||||
processingThread.join();
|
||||
|
||||
#else // CV_CXX11
|
||||
if (asyncNumReq)
|
||||
CV_Error(Error::StsNotImplemented, "Asynchronous forward is supported only with Inference Engine backend.");
|
||||
|
||||
// Process frames.
|
||||
Mat frame, blob;
|
||||
while (waitKey(1) < 0)
|
||||
{
|
||||
cap >> frame;
|
||||
if (frame.empty())
|
||||
{
|
||||
waitKey();
|
||||
break;
|
||||
}
|
||||
|
||||
preprocess(frame, net, Size(inpWidth, inpHeight), scale, mean, swapRB);
|
||||
|
||||
std::vector<Mat> outs;
|
||||
net.forward(outs, outNames);
|
||||
|
||||
postprocess(frame, outs, net, backend);
|
||||
|
||||
// Put efficiency information.
|
||||
std::vector<double> layersTimes;
|
||||
double freq = getTickFrequency() / 1000;
|
||||
double t = net.getPerfProfile(layersTimes) / freq;
|
||||
std::string label = format("Inference time: %.2f ms", t);
|
||||
putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
|
||||
imshow(kWinName, frame);
|
||||
}
|
||||
#endif // CV_CXX11
|
||||
return 0;
|
||||
}
|
||||
|
||||
inline void preprocess(const Mat& frame, Net& net, Size inpSize, float scale,
|
||||
const Scalar& mean, bool swapRB)
|
||||
{
|
||||
static Mat blob;
|
||||
// Create a 4D blob from a frame.
|
||||
if (inpSize.width <= 0) inpSize.width = frame.cols;
|
||||
if (inpSize.height <= 0) inpSize.height = frame.rows;
|
||||
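// Keep the blob in 8-bit here; the scale factor and mean are applied later in setInput().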
blobFromImage(frame, blob, 1.0, inpSize, Scalar(), swapRB, false, CV_8U);
|
||||
|
||||
// Run a model.
|
||||
net.setInput(blob, "", scale, mean);
|
||||
if (net.getLayer(0)->outputNameToIndex("im_info") != -1) // Faster-RCNN or R-FCN
|
||||
{
|
||||
resize(frame, frame, inpSize);
|
||||
Mat imInfo = (Mat_<float>(1, 3) << inpSize.height, inpSize.width, 1.6f);
|
||||
net.setInput(imInfo, "im_info");
|
||||
}
|
||||
}
|
||||
|
||||
void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net, int backend)
|
||||
{
|
||||
static std::vector<int> outLayers = net.getUnconnectedOutLayers();
|
||||
static std::string outLayerType = net.getLayer(outLayers[0])->type;
|
||||
|
||||
std::vector<int> classIds;
|
||||
std::vector<float> confidences;
|
||||
std::vector<Rect> boxes;
|
||||
if (outLayerType == "DetectionOutput")
|
||||
{
|
||||
// Network produces output blob with a shape 1x1xNx7 where N is a number of
|
||||
// detections and every detection is a vector of values
|
||||
// [batchId, classId, confidence, left, top, right, bottom]
|
||||
CV_Assert(outs.size() > 0);
|
||||
for (size_t k = 0; k < outs.size(); k++)
|
||||
{
|
||||
float* data = (float*)outs[k].data;
|
||||
for (size_t i = 0; i < outs[k].total(); i += 7)
|
||||
{
|
||||
float confidence = data[i + 2];
|
||||
if (confidence > confThreshold)
|
||||
{
|
||||
int left = (int)data[i + 3];
|
||||
int top = (int)data[i + 4];
|
||||
int right = (int)data[i + 5];
|
||||
int bottom = (int)data[i + 6];
|
||||
int width = right - left + 1;
|
||||
int height = bottom - top + 1;
|
||||
if (width <= 2 || height <= 2)
|
||||
{
|
||||
left = (int)(data[i + 3] * frame.cols);
|
||||
top = (int)(data[i + 4] * frame.rows);
|
||||
right = (int)(data[i + 5] * frame.cols);
|
||||
bottom = (int)(data[i + 6] * frame.rows);
|
||||
width = right - left + 1;
|
||||
height = bottom - top + 1;
|
||||
}
|
||||
classIds.push_back((int)(data[i + 1]) - 1); // Skip 0th background class id.
|
||||
boxes.push_back(Rect(left, top, width, height));
|
||||
confidences.push_back(confidence);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (outLayerType == "Region")
|
||||
{
|
||||
for (size_t i = 0; i < outs.size(); ++i)
|
||||
{
|
||||
// Network produces output blob with a shape NxC where N is a number of
|
||||
// detected objects and C is a number of classes + 4 where the first 4
|
||||
// numbers are [center_x, center_y, width, height]
|
||||
float* data = (float*)outs[i].data;
|
||||
for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols)
|
||||
{
|
||||
Mat scores = outs[i].row(j).colRange(5, outs[i].cols);
|
||||
Point classIdPoint;
|
||||
double confidence;
|
||||
minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
|
||||
if (confidence > confThreshold)
|
||||
{
|
||||
int centerX = (int)(data[0] * frame.cols);
|
||||
int centerY = (int)(data[1] * frame.rows);
|
||||
int width = (int)(data[2] * frame.cols);
|
||||
int height = (int)(data[3] * frame.rows);
|
||||
int left = centerX - width / 2;
|
||||
int top = centerY - height / 2;
|
||||
|
||||
classIds.push_back(classIdPoint.x);
|
||||
confidences.push_back((float)confidence);
|
||||
boxes.push_back(Rect(left, top, width, height));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
CV_Error(Error::StsNotImplemented, "Unknown output layer type: " + outLayerType);
|
||||
|
||||
// NMS is applied inside the Region layer only for DNN_BACKEND_OPENCV; for other backends we need to run NMS in the sample,
|
||||
// and it is also required whenever the network has more than one output.
|
||||
if (outLayers.size() > 1 || (outLayerType == "Region" && backend != DNN_BACKEND_OPENCV))
|
||||
{
|
||||
std::map<int, std::vector<size_t> > class2indices;
|
||||
for (size_t i = 0; i < classIds.size(); i++)
|
||||
{
|
||||
if (confidences[i] >= confThreshold)
|
||||
{
|
||||
class2indices[classIds[i]].push_back(i);
|
||||
}
|
||||
}
|
||||
std::vector<Rect> nmsBoxes;
|
||||
std::vector<float> nmsConfidences;
|
||||
std::vector<int> nmsClassIds;
|
||||
for (std::map<int, std::vector<size_t> >::iterator it = class2indices.begin(); it != class2indices.end(); ++it)
|
||||
{
|
||||
std::vector<Rect> localBoxes;
|
||||
std::vector<float> localConfidences;
|
||||
std::vector<size_t> classIndices = it->second;
|
||||
for (size_t i = 0; i < classIndices.size(); i++)
|
||||
{
|
||||
localBoxes.push_back(boxes[classIndices[i]]);
|
||||
localConfidences.push_back(confidences[classIndices[i]]);
|
||||
}
|
||||
std::vector<int> nmsIndices;
|
||||
NMSBoxes(localBoxes, localConfidences, confThreshold, nmsThreshold, nmsIndices);
|
||||
for (size_t i = 0; i < nmsIndices.size(); i++)
|
||||
{
|
||||
size_t idx = nmsIndices[i];
|
||||
nmsBoxes.push_back(localBoxes[idx]);
|
||||
nmsConfidences.push_back(localConfidences[idx]);
|
||||
nmsClassIds.push_back(it->first);
|
||||
}
|
||||
}
|
||||
boxes = nmsBoxes;
|
||||
classIds = nmsClassIds;
|
||||
confidences = nmsConfidences;
|
||||
}
|
||||
|
||||
for (size_t idx = 0; idx < boxes.size(); ++idx)
|
||||
{
|
||||
Rect box = boxes[idx];
|
||||
drawPred(classIds[idx], confidences[idx], box.x, box.y,
|
||||
box.x + box.width, box.y + box.height, frame);
|
||||
}
|
||||
}
|
||||
|
||||
void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame)
|
||||
{
|
||||
rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 255, 0));
|
||||
|
||||
std::string label = format("%.2f", conf);
|
||||
if (!classes.empty())
|
||||
{
|
||||
CV_Assert(classId < (int)classes.size());
|
||||
label = classes[classId] + ": " + label;
|
||||
}
|
||||
|
||||
int baseLine;
|
||||
Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
|
||||
|
||||
top = max(top, labelSize.height);
|
||||
rectangle(frame, Point(left, top - labelSize.height),
|
||||
Point(left + labelSize.width, top + baseLine), Scalar::all(255), FILLED);
|
||||
putText(frame, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 0.5, Scalar());
|
||||
}
|
||||
|
||||
void callback(int pos, void*)
|
||||
{
|
||||
confThreshold = pos * 0.01f;
|
||||
}
|
329
3rdparty/opencv-4.5.4/samples/dnn/object_detection.py
vendored
Normal file
@ -0,0 +1,329 @@
|
||||
import cv2 as cv
|
||||
import argparse
|
||||
import numpy as np
|
||||
import sys
|
||||
import time
|
||||
from threading import Thread
|
||||
if sys.version_info[0] == 2:
|
||||
import Queue as queue
|
||||
else:
|
||||
import queue
|
||||
|
||||
from common import *
|
||||
from tf_text_graph_common import readTextMessage
|
||||
from tf_text_graph_ssd import createSSDGraph
|
||||
from tf_text_graph_faster_rcnn import createFasterRCNNGraph
|
||||
|
||||
backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV,
|
||||
cv.dnn.DNN_BACKEND_VKCOM, cv.dnn.DNN_BACKEND_CUDA)
|
||||
targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD, cv.dnn.DNN_TARGET_HDDL,
|
||||
cv.dnn.DNN_TARGET_VULKAN, cv.dnn.DNN_TARGET_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16)
|
||||
|
||||
parser = argparse.ArgumentParser(add_help=False)
|
||||
parser.add_argument('--zoo', default=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models.yml'),
|
||||
help='An optional path to file with preprocessing parameters.')
|
||||
parser.add_argument('--input', help='Path to input image or video file. Skip this argument to capture frames from a camera.')
|
||||
parser.add_argument('--out_tf_graph', default='graph.pbtxt',
|
||||
help='For models from TensorFlow Object Detection API, you may '
|
||||
'pass a .config file which was used for training through --config '
|
||||
'argument. This way an additional .pbtxt file with TensorFlow graph will be created.')
|
||||
parser.add_argument('--framework', choices=['caffe', 'tensorflow', 'torch', 'darknet', 'dldt'],
|
||||
help='Optional name of an origin framework of the model. '
|
||||
'Detect it automatically if it is not set.')
|
||||
parser.add_argument('--thr', type=float, default=0.5, help='Confidence threshold')
|
||||
parser.add_argument('--nms', type=float, default=0.4, help='Non-maximum suppression threshold')
|
||||
parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
|
||||
help="Choose one of computation backends: "
|
||||
"%d: automatically (by default), "
|
||||
"%d: Halide language (http://halide-lang.org/), "
|
||||
"%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"%d: OpenCV implementation, "
|
||||
"%d: VKCOM, "
|
||||
"%d: CUDA" % backends)
|
||||
parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
|
||||
help='Choose one of target computation devices: '
|
||||
'%d: CPU target (by default), '
|
||||
'%d: OpenCL, '
|
||||
'%d: OpenCL fp16 (half-float precision), '
|
||||
'%d: NCS2 VPU, '
|
||||
'%d: HDDL VPU, '
|
||||
'%d: Vulkan, '
|
||||
'%d: CUDA, '
|
||||
'%d: CUDA fp16 (half-float preprocess)' % targets)
|
||||
parser.add_argument('--async', type=int, default=0,
|
||||
dest='asyncN',
|
||||
help='Number of asynchronous forwards at the same time. '
|
||||
'Choose 0 for synchronous mode')
|
||||
args, _ = parser.parse_known_args()
|
||||
add_preproc_args(args.zoo, parser, 'object_detection')
|
||||
parser = argparse.ArgumentParser(parents=[parser],
|
||||
description='Use this script to run object detection deep learning networks using OpenCV.',
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
args = parser.parse_args()
|
||||
|
||||
args.model = findFile(args.model)
|
||||
args.config = findFile(args.config)
|
||||
args.classes = findFile(args.classes)
|
||||
|
||||
# If config specified, try to load it as TensorFlow Object Detection API's pipeline.
|
||||
config = readTextMessage(args.config)
|
||||
if 'model' in config:
|
||||
print('TensorFlow Object Detection API config detected')
|
||||
if 'ssd' in config['model'][0]:
|
||||
print('Preparing text graph representation for SSD model: ' + args.out_tf_graph)
|
||||
createSSDGraph(args.model, args.config, args.out_tf_graph)
|
||||
args.config = args.out_tf_graph
|
||||
elif 'faster_rcnn' in config['model'][0]:
|
||||
print('Preparing text graph representation for Faster-RCNN model: ' + args.out_tf_graph)
|
||||
createFasterRCNNGraph(args.model, args.config, args.out_tf_graph)
|
||||
args.config = args.out_tf_graph
|
||||
|
||||
|
||||
# Load names of classes
|
||||
classes = None
|
||||
if args.classes:
|
||||
with open(args.classes, 'rt') as f:
|
||||
classes = f.read().rstrip('\n').split('\n')
|
||||
|
||||
# Load a network
|
||||
net = cv.dnn.readNet(cv.samples.findFile(args.model), cv.samples.findFile(args.config), args.framework)
|
||||
net.setPreferableBackend(args.backend)
|
||||
net.setPreferableTarget(args.target)
|
||||
outNames = net.getUnconnectedOutLayersNames()
|
||||
|
||||
confThreshold = args.thr
|
||||
nmsThreshold = args.nms
|
||||
|
||||
def postprocess(frame, outs):
|
||||
frameHeight = frame.shape[0]
|
||||
frameWidth = frame.shape[1]
|
||||
|
||||
def drawPred(classId, conf, left, top, right, bottom):
|
||||
# Draw a bounding box.
|
||||
cv.rectangle(frame, (left, top), (right, bottom), (0, 255, 0))
|
||||
|
||||
label = '%.2f' % conf
|
||||
|
||||
# Print a label of class.
|
||||
if classes:
|
||||
assert(classId < len(classes))
|
||||
label = '%s: %s' % (classes[classId], label)
|
||||
|
||||
labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
|
||||
top = max(top, labelSize[1])
|
||||
cv.rectangle(frame, (left, top - labelSize[1]), (left + labelSize[0], top + baseLine), (255, 255, 255), cv.FILLED)
|
||||
cv.putText(frame, label, (left, top), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
|
||||
|
||||
layerNames = net.getLayerNames()
|
||||
lastLayerId = net.getLayerId(layerNames[-1])
|
||||
lastLayer = net.getLayer(lastLayerId)
|
||||
|
||||
classIds = []
|
||||
confidences = []
|
||||
boxes = []
|
||||
if lastLayer.type == 'DetectionOutput':
|
||||
# Network produces output blob with a shape 1x1xNx7 where N is a number of
|
||||
# detections and every detection is a vector of values
|
||||
# [batchId, classId, confidence, left, top, right, bottom]
|
||||
for out in outs:
|
||||
for detection in out[0, 0]:
|
||||
confidence = detection[2]
|
||||
if confidence > confThreshold:
|
||||
left = int(detection[3])
|
||||
top = int(detection[4])
|
||||
right = int(detection[5])
|
||||
bottom = int(detection[6])
|
||||
width = right - left + 1
|
||||
height = bottom - top + 1
|
||||
if width <= 2 or height <= 2:
|
||||
left = int(detection[3] * frameWidth)
|
||||
top = int(detection[4] * frameHeight)
|
||||
right = int(detection[5] * frameWidth)
|
||||
bottom = int(detection[6] * frameHeight)
|
||||
width = right - left + 1
|
||||
height = bottom - top + 1
|
||||
classIds.append(int(detection[1]) - 1) # Skip background label
|
||||
confidences.append(float(confidence))
|
||||
boxes.append([left, top, width, height])
|
||||
elif lastLayer.type == 'Region':
|
||||
# Network produces output blob with a shape NxC where N is a number of
|
||||
# detected objects and C is a number of classes + 4 where the first 4
|
||||
# numbers are [center_x, center_y, width, height]
|
||||
for out in outs:
|
||||
for detection in out:
|
||||
scores = detection[5:]
|
||||
classId = np.argmax(scores)
|
||||
confidence = scores[classId]
|
||||
if confidence > confThreshold:
|
||||
center_x = int(detection[0] * frameWidth)
|
||||
center_y = int(detection[1] * frameHeight)
|
||||
width = int(detection[2] * frameWidth)
|
||||
height = int(detection[3] * frameHeight)
|
||||
left = int(center_x - width / 2)
|
||||
top = int(center_y - height / 2)
|
||||
classIds.append(classId)
|
||||
confidences.append(float(confidence))
|
||||
boxes.append([left, top, width, height])
|
||||
else:
|
||||
print('Unknown output layer type: ' + lastLayer.type)
|
||||
exit()
|
||||
|
||||
# NMS is applied inside the Region layer only for DNN_BACKEND_OPENCV; for other backends we need to run NMS in the sample,
|
||||
# and it is also required whenever the network has more than one output.
|
||||
if len(outNames) > 1 or lastLayer.type == 'Region' and args.backend != cv.dnn.DNN_BACKEND_OPENCV:
|
||||
indices = []
|
||||
classIds = np.array(classIds)
|
||||
boxes = np.array(boxes)
|
||||
confidences = np.array(confidences)
|
||||
unique_classes = set(classIds)
|
||||
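# Run NMS per class so that boxes of different classes do not suppress each other.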
for cl in unique_classes:
|
||||
class_indices = np.where(classIds == cl)[0]
|
||||
conf = confidences[class_indices]
|
||||
box = boxes[class_indices].tolist()
|
||||
nms_indices = cv.dnn.NMSBoxes(box, conf, confThreshold, nmsThreshold)
|
||||
nms_indices = nms_indices[:, 0] if len(nms_indices) else []
|
||||
indices.extend(class_indices[nms_indices])
|
||||
else:
|
||||
indices = np.arange(0, len(classIds))
|
||||
|
||||
for i in indices:
|
||||
box = boxes[i]
|
||||
left = box[0]
|
||||
top = box[1]
|
||||
width = box[2]
|
||||
height = box[3]
|
||||
drawPred(classIds[i], confidences[i], left, top, left + width, top + height)
|
||||
|
||||
# Process inputs
|
||||
winName = 'Deep learning object detection in OpenCV'
|
||||
cv.namedWindow(winName, cv.WINDOW_NORMAL)
|
||||
|
||||
def callback(pos):
|
||||
global confThreshold
|
||||
confThreshold = pos / 100.0
|
||||
|
||||
cv.createTrackbar('Confidence threshold, %', winName, int(confThreshold * 100), 99, callback)
|
||||
|
||||
cap = cv.VideoCapture(cv.samples.findFileOrKeep(args.input) if args.input else 0)
|
||||
|
||||
class QueueFPS(queue.Queue):
|
||||
def __init__(self):
|
||||
queue.Queue.__init__(self)
|
||||
self.startTime = 0
|
||||
self.counter = 0
|
||||
|
||||
def put(self, v):
|
||||
queue.Queue.put(self, v)
|
||||
self.counter += 1
|
||||
if self.counter == 1:
|
||||
self.startTime = time.time()
|
||||
|
||||
def getFPS(self):
|
||||
return self.counter / (time.time() - self.startTime)
|
||||
|
||||
|
||||
process = True
|
||||
|
||||
#
|
||||
# Frames capturing thread
|
||||
#
|
||||
framesQueue = QueueFPS()
|
||||
def framesThreadBody():
|
||||
global framesQueue, process
|
||||
|
||||
while process:
|
||||
hasFrame, frame = cap.read()
|
||||
if not hasFrame:
|
||||
break
|
||||
framesQueue.put(frame)
|
||||
|
||||
|
||||
#
|
||||
# Frames processing thread
|
||||
#
|
||||
processedFramesQueue = queue.Queue()
|
||||
predictionsQueue = QueueFPS()
|
||||
def processingThreadBody():
|
||||
global processedFramesQueue, predictionsQueue, args, process
|
||||
|
||||
futureOutputs = []
|
||||
while process:
|
||||
# Get the next frame
|
||||
frame = None
|
||||
try:
|
||||
frame = framesQueue.get_nowait()
|
||||
|
||||
if args.asyncN:
|
||||
if len(futureOutputs) == args.asyncN:
|
||||
frame = None # Skip the frame
|
||||
else:
|
||||
framesQueue.queue.clear() # Skip the rest of frames
|
||||
except queue.Empty:
|
||||
pass
|
||||
|
||||
|
||||
if frame is not None:
|
||||
frameHeight = frame.shape[0]
|
||||
frameWidth = frame.shape[1]
|
||||
|
||||
# Create a 4D blob from a frame.
|
||||
inpWidth = args.width if args.width else frameWidth
|
||||
inpHeight = args.height if args.height else frameHeight
|
||||
blob = cv.dnn.blobFromImage(frame, size=(inpWidth, inpHeight), swapRB=args.rgb, ddepth=cv.CV_8U)
|
||||
processedFramesQueue.put(frame)
|
||||
|
||||
# Run a model
|
||||
net.setInput(blob, scalefactor=args.scale, mean=args.mean)
|
||||
if net.getLayer(0).outputNameToIndex('im_info') != -1: # Faster-RCNN or R-FCN
|
||||
frame = cv.resize(frame, (inpWidth, inpHeight))
|
||||
net.setInput(np.array([[inpHeight, inpWidth, 1.6]], dtype=np.float32), 'im_info')
|
||||
|
||||
if args.asyncN:
|
||||
futureOutputs.append(net.forwardAsync())
|
||||
else:
|
||||
outs = net.forward(outNames)
|
||||
predictionsQueue.put(np.copy(outs))
|
||||
|
||||
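# Collect results of asynchronous forwards that have already finished (non-blocking poll).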
while futureOutputs and futureOutputs[0].wait_for(0):
|
||||
out = futureOutputs[0].get()
|
||||
predictionsQueue.put(np.copy([out]))
|
||||
|
||||
del futureOutputs[0]
|
||||
|
||||
|
||||
framesThread = Thread(target=framesThreadBody)
|
||||
framesThread.start()
|
||||
|
||||
processingThread = Thread(target=processingThreadBody)
|
||||
processingThread.start()
|
||||
|
||||
#
|
||||
# Postprocessing and rendering loop
|
||||
#
|
||||
while cv.waitKey(1) < 0:
|
||||
try:
|
||||
# Request predictions first because they are put into the queue after the corresponding frames
|
||||
outs = predictionsQueue.get_nowait()
|
||||
frame = processedFramesQueue.get_nowait()
|
||||
|
||||
postprocess(frame, outs)
|
||||
|
||||
# Put efficiency information.
|
||||
if predictionsQueue.counter > 1:
|
||||
label = 'Camera: %.2f FPS' % (framesQueue.getFPS())
|
||||
cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
|
||||
|
||||
label = 'Network: %.2f FPS' % (predictionsQueue.getFPS())
|
||||
cv.putText(frame, label, (0, 30), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
|
||||
|
||||
label = 'Skipped frames: %d' % (framesQueue.counter - predictionsQueue.counter)
|
||||
cv.putText(frame, label, (0, 45), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
|
||||
|
||||
cv.imshow(winName, frame)
|
||||
except queue.Empty:
|
||||
pass
|
||||
|
||||
|
||||
process = False
|
||||
framesThread.join()
|
||||
processingThread.join()
|
157
3rdparty/opencv-4.5.4/samples/dnn/openpose.cpp
vendored
Normal file
@ -0,0 +1,157 @@
|
||||
//
|
||||
// this sample demonstrates the use of pretrained openpose networks with opencv's dnn module.
|
||||
//
|
||||
// it can be used for body pose detection, using either the COCO model(18 parts):
|
||||
// http://posefs1.perception.cs.cmu.edu/OpenPose/models/pose/coco/pose_iter_440000.caffemodel
|
||||
// https://raw.githubusercontent.com/opencv/opencv_extra/master/testdata/dnn/openpose_pose_coco.prototxt
|
||||
//
|
||||
// or the MPI model(16 parts):
|
||||
// http://posefs1.perception.cs.cmu.edu/OpenPose/models/pose/mpi/pose_iter_160000.caffemodel
|
||||
// https://raw.githubusercontent.com/opencv/opencv_extra/master/testdata/dnn/openpose_pose_mpi_faster_4_stages.prototxt
|
||||
//
|
||||
// (to simplify this sample, the body models are restricted to a single person.)
|
||||
//
|
||||
//
|
||||
// you can also try the hand pose model:
|
||||
// http://posefs1.perception.cs.cmu.edu/OpenPose/models/hand/pose_iter_102000.caffemodel
|
||||
// https://raw.githubusercontent.com/CMU-Perceptual-Computing-Lab/openpose/master/models/hand/pose_deploy.prototxt
|
||||
//
|
||||
|
||||
#include <opencv2/dnn.hpp>
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
using namespace cv;
|
||||
using namespace cv::dnn;
|
||||
|
||||
#include <iostream>
|
||||
using namespace std;
|
||||
|
||||
|
||||
// connection table, in the format [model_id][pair_id][from/to]
|
||||
// please look at the nice explanation at the bottom of:
|
||||
// https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/doc/output.md
|
||||
//
|
||||
const int POSE_PAIRS[3][20][2] = {
|
||||
{ // COCO body
|
||||
{1,2}, {1,5}, {2,3},
|
||||
{3,4}, {5,6}, {6,7},
|
||||
{1,8}, {8,9}, {9,10},
|
||||
{1,11}, {11,12}, {12,13},
|
||||
{1,0}, {0,14},
|
||||
{14,16}, {0,15}, {15,17}
|
||||
},
|
||||
{ // MPI body
|
||||
{0,1}, {1,2}, {2,3},
|
||||
{3,4}, {1,5}, {5,6},
|
||||
{6,7}, {1,14}, {14,8}, {8,9},
|
||||
{9,10}, {14,11}, {11,12}, {12,13}
|
||||
},
|
||||
{ // hand
|
||||
{0,1}, {1,2}, {2,3}, {3,4}, // thumb
|
||||
{0,5}, {5,6}, {6,7}, {7,8}, // index
|
||||
{0,9}, {9,10}, {10,11}, {11,12}, // middle
|
||||
{0,13}, {13,14}, {14,15}, {15,16}, // ring
|
||||
{0,17}, {17,18}, {18,19}, {19,20} // small
|
||||
}};
|
||||
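// Example of how the table is read: POSE_PAIRS[0][0] is {1,2}, i.e. for the
// COCO body model the first drawn limb connects keypoint 1 (neck) to
// keypoint 2 (right shoulder), following the keypoint order used in openpose.py.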
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
CommandLineParser parser(argc, argv,
|
||||
"{ h help | false | print this help message }"
|
||||
"{ p proto | | (required) model configuration, e.g. hand/pose.prototxt }"
|
||||
"{ m model | | (required) model weights, e.g. hand/pose_iter_102000.caffemodel }"
|
||||
"{ i image | | (required) path to image file (containing a single person, or hand) }"
|
||||
"{ d dataset | | specify what kind of model was trained. It could be (COCO, MPI, HAND) depends on dataset. }"
|
||||
"{ width | 368 | Preprocess input image by resizing to a specific width. }"
|
||||
"{ height | 368 | Preprocess input image by resizing to a specific height. }"
|
||||
"{ t threshold | 0.1 | threshold or confidence value for the heatmap }"
|
||||
"{ s scale | 0.003922 | scale for blob }"
|
||||
);
|
||||
|
||||
String modelTxt = samples::findFile(parser.get<string>("proto"));
|
||||
String modelBin = samples::findFile(parser.get<string>("model"));
|
||||
String imageFile = samples::findFile(parser.get<String>("image"));
|
||||
String dataset = parser.get<String>("dataset");
|
||||
int W_in = parser.get<int>("width");
|
||||
int H_in = parser.get<int>("height");
|
||||
float thresh = parser.get<float>("threshold");
|
||||
float scale = parser.get<float>("scale");
|
||||
|
||||
if (parser.get<bool>("help") || modelTxt.empty() || modelBin.empty() || imageFile.empty())
|
||||
{
|
||||
cout << "A sample app to demonstrate human or hand pose detection with a pretrained OpenPose dnn." << endl;
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int midx, npairs, nparts;
|
||||
if (!dataset.compare("COCO")) { midx = 0; npairs = 17; nparts = 18; }
|
||||
else if (!dataset.compare("MPI")) { midx = 1; npairs = 14; nparts = 16; }
|
||||
else if (!dataset.compare("HAND")) { midx = 2; npairs = 20; nparts = 22; }
|
||||
else
|
||||
{
|
||||
std::cerr << "Can't interpret dataset parameter: " << dataset << std::endl;
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// read the network model
|
||||
Net net = readNet(modelBin, modelTxt);
|
||||
// and the image
|
||||
Mat img = imread(imageFile);
|
||||
if (img.empty())
|
||||
{
|
||||
std::cerr << "Can't read image from the file: " << imageFile << std::endl;
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// send it through the network
|
||||
Mat inputBlob = blobFromImage(img, scale, Size(W_in, H_in), Scalar(0, 0, 0), false, false);
|
||||
net.setInput(inputBlob);
|
||||
Mat result = net.forward();
|
||||
// the result is an array of "heatmaps", the probability of a body part being in location x,y
|
||||
|
||||
int H = result.size[2];
|
||||
int W = result.size[3];
|
||||
|
||||
// find the position of the body parts
|
||||
vector<Point> points(22);
|
||||
for (int n=0; n<nparts; n++)
|
||||
{
|
||||
// Slice heatmap of corresponding body's part.
|
||||
Mat heatMap(H, W, CV_32F, result.ptr(0,n));
|
||||
// 1 maximum per heatmap
|
||||
Point p(-1,-1),pm;
|
||||
double conf;
|
||||
minMaxLoc(heatMap, 0, &conf, 0, &pm);
|
||||
if (conf > thresh)
|
||||
p = pm;
|
||||
points[n] = p;
|
||||
}
|
||||
|
||||
// connect body parts and draw it !
|
||||
float SX = float(img.cols) / W;
|
||||
float SY = float(img.rows) / H;
|
||||
for (int n=0; n<npairs; n++)
|
||||
{
|
||||
// lookup 2 connected body/hand parts
|
||||
Point2f a = points[POSE_PAIRS[midx][n][0]];
|
||||
Point2f b = points[POSE_PAIRS[midx][n][1]];
|
||||
|
||||
// we did not find enough confidence before
|
||||
if (a.x<=0 || a.y<=0 || b.x<=0 || b.y<=0)
|
||||
continue;
|
||||
|
||||
// scale to image size
|
||||
a.x*=SX; a.y*=SY;
|
||||
b.x*=SX; b.y*=SY;
|
||||
|
||||
line(img, a, b, Scalar(0,200,0), 2);
|
||||
circle(img, a, 3, Scalar(0,0,200), -1);
|
||||
circle(img, b, 3, Scalar(0,0,200), -1);
|
||||
}
|
||||
|
||||
imshow("OpenPose", img);
|
||||
waitKey();
|
||||
|
||||
return 0;
|
||||
}
|
122
3rdparty/opencv-4.5.4/samples/dnn/openpose.py
vendored
Normal file
@ -0,0 +1,122 @@
|
||||
# To use Inference Engine backend, specify location of plugins:
|
||||
# source /opt/intel/computer_vision_sdk/bin/setupvars.sh
|
||||
import cv2 as cv
|
||||
import numpy as np
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description='This script is used to demonstrate OpenPose human pose estimation network '
|
||||
'from https://github.com/CMU-Perceptual-Computing-Lab/openpose project using OpenCV. '
|
||||
'The sample and model are simplified and can be used for a single person in the frame.')
|
||||
parser.add_argument('--input', help='Path to image or video. Skip to capture frames from camera')
|
||||
parser.add_argument('--proto', help='Path to .prototxt')
|
||||
parser.add_argument('--model', help='Path to .caffemodel')
|
||||
parser.add_argument('--dataset', help='Specify what kind of model was trained. '
|
||||
'It can be COCO, MPI or HAND, depending on the dataset.')
|
||||
parser.add_argument('--thr', default=0.1, type=float, help='Threshold value for pose parts heat map')
|
||||
parser.add_argument('--width', default=368, type=int, help='Resize input to specific width.')
|
||||
parser.add_argument('--height', default=368, type=int, help='Resize input to specific height.')
|
||||
parser.add_argument('--scale', default=0.003922, type=float, help='Scale for blob.')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.dataset == 'COCO':
|
||||
BODY_PARTS = { "Nose": 0, "Neck": 1, "RShoulder": 2, "RElbow": 3, "RWrist": 4,
|
||||
"LShoulder": 5, "LElbow": 6, "LWrist": 7, "RHip": 8, "RKnee": 9,
|
||||
"RAnkle": 10, "LHip": 11, "LKnee": 12, "LAnkle": 13, "REye": 14,
|
||||
"LEye": 15, "REar": 16, "LEar": 17, "Background": 18 }
|
||||
|
||||
POSE_PAIRS = [ ["Neck", "RShoulder"], ["Neck", "LShoulder"], ["RShoulder", "RElbow"],
|
||||
["RElbow", "RWrist"], ["LShoulder", "LElbow"], ["LElbow", "LWrist"],
|
||||
["Neck", "RHip"], ["RHip", "RKnee"], ["RKnee", "RAnkle"], ["Neck", "LHip"],
|
||||
["LHip", "LKnee"], ["LKnee", "LAnkle"], ["Neck", "Nose"], ["Nose", "REye"],
|
||||
["REye", "REar"], ["Nose", "LEye"], ["LEye", "LEar"] ]
|
||||
elif args.dataset == 'MPI':
|
||||
BODY_PARTS = { "Head": 0, "Neck": 1, "RShoulder": 2, "RElbow": 3, "RWrist": 4,
|
||||
"LShoulder": 5, "LElbow": 6, "LWrist": 7, "RHip": 8, "RKnee": 9,
|
||||
"RAnkle": 10, "LHip": 11, "LKnee": 12, "LAnkle": 13, "Chest": 14,
|
||||
"Background": 15 }
|
||||
|
||||
POSE_PAIRS = [ ["Head", "Neck"], ["Neck", "RShoulder"], ["RShoulder", "RElbow"],
|
||||
["RElbow", "RWrist"], ["Neck", "LShoulder"], ["LShoulder", "LElbow"],
|
||||
["LElbow", "LWrist"], ["Neck", "Chest"], ["Chest", "RHip"], ["RHip", "RKnee"],
|
||||
["RKnee", "RAnkle"], ["Chest", "LHip"], ["LHip", "LKnee"], ["LKnee", "LAnkle"] ]
|
||||
elif args.dataset == 'HAND':
|
||||
BODY_PARTS = { "Wrist": 0,
|
||||
"ThumbMetacarpal": 1, "ThumbProximal": 2, "ThumbMiddle": 3, "ThumbDistal": 4,
|
||||
"IndexFingerMetacarpal": 5, "IndexFingerProximal": 6, "IndexFingerMiddle": 7, "IndexFingerDistal": 8,
|
||||
"MiddleFingerMetacarpal": 9, "MiddleFingerProximal": 10, "MiddleFingerMiddle": 11, "MiddleFingerDistal": 12,
|
||||
"RingFingerMetacarpal": 13, "RingFingerProximal": 14, "RingFingerMiddle": 15, "RingFingerDistal": 16,
|
||||
"LittleFingerMetacarpal": 17, "LittleFingerProximal": 18, "LittleFingerMiddle": 19, "LittleFingerDistal": 20,
|
||||
}
|
||||
|
||||
POSE_PAIRS = [ ["Wrist", "ThumbMetacarpal"], ["ThumbMetacarpal", "ThumbProximal"],
|
||||
["ThumbProximal", "ThumbMiddle"], ["ThumbMiddle", "ThumbDistal"],
|
||||
["Wrist", "IndexFingerMetacarpal"], ["IndexFingerMetacarpal", "IndexFingerProximal"],
|
||||
["IndexFingerProximal", "IndexFingerMiddle"], ["IndexFingerMiddle", "IndexFingerDistal"],
|
||||
["Wrist", "MiddleFingerMetacarpal"], ["MiddleFingerMetacarpal", "MiddleFingerProximal"],
|
||||
["MiddleFingerProximal", "MiddleFingerMiddle"], ["MiddleFingerMiddle", "MiddleFingerDistal"],
|
||||
["Wrist", "RingFingerMetacarpal"], ["RingFingerMetacarpal", "RingFingerProximal"],
|
||||
["RingFingerProximal", "RingFingerMiddle"], ["RingFingerMiddle", "RingFingerDistal"],
|
||||
["Wrist", "LittleFingerMetacarpal"], ["LittleFingerMetacarpal", "LittleFingerProximal"],
|
||||
["LittleFingerProximal", "LittleFingerMiddle"], ["LittleFingerMiddle", "LittleFingerDistal"] ]
|
||||
else:
|
||||
raise Exception("you need to specify either 'COCO', 'MPI', or 'HAND' in args.dataset")
|
||||
|
||||
inWidth = args.width
|
||||
inHeight = args.height
|
||||
inScale = args.scale
|
||||
|
||||
net = cv.dnn.readNet(cv.samples.findFile(args.proto), cv.samples.findFile(args.model))
|
||||
|
||||
cap = cv.VideoCapture(args.input if args.input else 0)
|
||||
|
||||
while cv.waitKey(1) < 0:
|
||||
hasFrame, frame = cap.read()
|
||||
if not hasFrame:
|
||||
cv.waitKey()
|
||||
break
|
||||
|
||||
frameWidth = frame.shape[1]
|
||||
frameHeight = frame.shape[0]
|
||||
inp = cv.dnn.blobFromImage(frame, inScale, (inWidth, inHeight),
|
||||
(0, 0, 0), swapRB=False, crop=False)
|
||||
net.setInput(inp)
|
||||
out = net.forward()
|
||||
|
||||
assert(len(BODY_PARTS) <= out.shape[1])
|
||||
|
||||
points = []
|
||||
for i in range(len(BODY_PARTS)):
|
||||
# Slice heatmap of corresponding body's part.
|
||||
heatMap = out[0, i, :, :]
|
||||
|
||||
# Originally, we would find all the local maxima of each heatmap. To simplify the sample
|
||||
# we just find the global one. However, only a single pose at a time
|
||||
# can be detected this way.
|
||||
_, conf, _, point = cv.minMaxLoc(heatMap)
|
||||
x = (frameWidth * point[0]) / out.shape[3]
|
||||
y = (frameHeight * point[1]) / out.shape[2]
|
||||
|
||||
# Add a point if its confidence is higher than the threshold.
|
||||
points.append((int(x), int(y)) if conf > args.thr else None)
|
||||
|
||||
for pair in POSE_PAIRS:
|
||||
partFrom = pair[0]
|
||||
partTo = pair[1]
|
||||
assert(partFrom in BODY_PARTS)
|
||||
assert(partTo in BODY_PARTS)
|
||||
|
||||
idFrom = BODY_PARTS[partFrom]
|
||||
idTo = BODY_PARTS[partTo]
|
||||
|
||||
if points[idFrom] and points[idTo]:
|
||||
cv.line(frame, points[idFrom], points[idTo], (0, 255, 0), 3)
|
||||
cv.ellipse(frame, points[idFrom], (3, 3), 0, 0, 360, (0, 0, 255), cv.FILLED)
|
||||
cv.ellipse(frame, points[idTo], (3, 3), 0, 0, 360, (0, 0, 255), cv.FILLED)
|
||||
|
||||
t, _ = net.getPerfProfile()
|
||||
freq = cv.getTickFrequency() / 1000
|
||||
cv.putText(frame, '%.2fms' % (t / freq), (10, 20), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
|
||||
|
||||
cv.imshow('OpenPose using OpenCV', frame)
|
103
3rdparty/opencv-4.5.4/samples/dnn/optical_flow.py
vendored
Normal file
@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
This sample uses the FlowNet v2 model to calculate optical flow.
|
||||
Original paper: https://arxiv.org/abs/1612.01925.
|
||||
Original repo: https://github.com/lmb-freiburg/flownet2.
|
||||
|
||||
Download the converted .caffemodel model from https://drive.google.com/open?id=16qvE9VNmU39NttpZwZs81Ga8VYQJDaWZ
|
||||
and .prototxt from https://drive.google.com/file/d/1RyNIUsan1ZOh2hpYIH36A-jofAvJlT6a/view?usp=sharing.
|
||||
Otherwise download original model from https://lmb.informatik.uni-freiburg.de/resources/binaries/flownet2/flownet2-models.tar.gz,
|
||||
convert .h5 model to .caffemodel and modify original .prototxt using .prototxt from link above.
|
||||
'''
|
||||
|
||||
import argparse
|
||||
import os.path
|
||||
import numpy as np
|
||||
import cv2 as cv
|
||||
|
||||
|
||||
class OpticalFlow(object):
|
||||
def __init__(self, proto, model, height, width):
|
||||
self.net = cv.dnn.readNetFromCaffe(proto, model)
|
||||
self.net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
|
||||
self.height = height
|
||||
self.width = width
|
||||
|
||||
def compute_flow(self, first_img, second_img):
|
||||
inp0 = cv.dnn.blobFromImage(first_img, size=(self.width, self.height))
|
||||
inp1 = cv.dnn.blobFromImage(second_img, size=(self.width, self.height))
|
||||
self.net.setInput(inp0, "img0")
|
||||
self.net.setInput(inp1, "img1")
|
||||
flow = self.net.forward()
|
||||
output = self.motion_to_color(flow)
|
||||
return output
|
||||
|
||||
def motion_to_color(self, flow):
|
||||
arr = np.arange(0, 255, dtype=np.uint8)
|
||||
colormap = cv.applyColorMap(arr, cv.COLORMAP_HSV)
|
||||
colormap = colormap.squeeze(1)
|
||||
|
||||
flow = flow.squeeze(0)
|
||||
fx, fy = flow[0, ...], flow[1, ...]
|
||||
rad = np.sqrt(fx**2 + fy**2)
|
||||
maxrad = rad.max() if rad.max() != 0 else 1
|
||||
|
||||
ncols = arr.size
|
||||
rad = rad[..., np.newaxis] / maxrad
|
||||
a = np.arctan2(-fy / maxrad, -fx / maxrad) / np.pi
|
||||
fk = (a + 1) / 2.0 * (ncols - 1)
|
||||
k0 = fk.astype(int)  # note: np.int is deprecated in recent NumPy versions
|
||||
k1 = (k0 + 1) % ncols
|
||||
f = fk[..., np.newaxis] - k0[..., np.newaxis]
|
||||
|
||||
col0 = colormap[k0] / 255.0
|
||||
col1 = colormap[k1] / 255.0
|
||||
col = (1 - f) * col0 + f * col1
|
||||
col = np.where(rad <= 1, 1 - rad * (1 - col), col * 0.75)
|
||||
output = (255.0 * col).astype(np.uint8)
|
||||
return output
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='Use this script to calculate optical flow using FlowNetv2',
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('-input', '-i', required=True, help='Path to input video file.')
|
||||
parser.add_argument('--height', default=320, type=int, help='Input height')
|
||||
parser.add_argument('--width', default=448, type=int, help='Input width')
|
||||
parser.add_argument('--proto', '-p', default='FlowNet2_deploy_anysize.prototxt', help='Path to prototxt.')
|
||||
parser.add_argument('--model', '-m', default='FlowNet2_weights.caffemodel', help='Path to caffemodel.')
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
if not os.path.isfile(args.model) or not os.path.isfile(args.proto):
|
||||
raise OSError("Prototxt or caffemodel not exist")
|
||||
|
||||
winName = 'Optical flow calculation in OpenCV'
|
||||
cv.namedWindow(winName, cv.WINDOW_NORMAL)
|
||||
cap = cv.VideoCapture(args.input if args.input else 0)
|
||||
hasFrame, first_frame = cap.read()
|
||||
|
||||
divisor = 64.
|
||||
var = {}
|
||||
var['ADAPTED_WIDTH'] = int(np.ceil(args.width/divisor) * divisor)
|
||||
var['ADAPTED_HEIGHT'] = int(np.ceil(args.height/divisor) * divisor)
|
||||
var['SCALE_WIDTH'] = args.width / float(var['ADAPTED_WIDTH'])
|
||||
var['SCALE_HEIGHT'] = args.height / float(var['ADAPTED_HEIGHT'])
|
||||
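# The deploy prototxt is a template: the requested width/height are rounded up
# to a multiple of 64 (divisor) and substituted for the $ADAPTED_*$ and
# $SCALE_*$ placeholders before the network is constructed below.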
|
||||
config = ''
|
||||
proto = open(args.proto).readlines()
|
||||
for line in proto:
|
||||
for key, value in var.items():
|
||||
tag = "$%s$" % key
|
||||
line = line.replace(tag, str(value))
|
||||
config += line
|
||||
|
||||
caffemodel = open(args.model, 'rb').read()
|
||||
|
||||
opt_flow = OpticalFlow(bytearray(config.encode()), caffemodel, var['ADAPTED_HEIGHT'], var['ADAPTED_WIDTH'])
|
||||
while cv.waitKey(1) < 0:
|
||||
hasFrame, second_frame = cap.read()
|
||||
if not hasFrame:
|
||||
break
|
||||
flow = opt_flow.compute_flow(first_frame, second_frame)
|
||||
first_frame = second_frame
|
||||
cv.imshow(winName, flow)
|
242
3rdparty/opencv-4.5.4/samples/dnn/person_reid.cpp
vendored
Normal file
@ -0,0 +1,242 @@
|
||||
//
|
||||
// You can download a baseline ReID model and sample input from:
|
||||
// https://github.com/ReID-Team/ReID_extra_testdata
|
||||
//
|
||||
// Authors of samples and Youtu ReID baseline:
|
||||
// Xing Sun <winfredsun@tencent.com>
|
||||
// Feng Zheng <zhengf@sustech.edu.cn>
|
||||
// Xinyang Jiang <sevjiang@tencent.com>
|
||||
// Fufu Yu <fufuyu@tencent.com>
|
||||
// Enwei Zhang <miyozhang@tencent.com>
|
||||
//
|
||||
// Copyright (C) 2020-2021, Tencent.
|
||||
// Copyright (C) 2020-2021, SUSTech.
|
||||
//
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
#include <opencv2/dnn.hpp>
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::dnn;
|
||||
|
||||
const char* keys =
|
||||
"{help h | | show help message}"
|
||||
"{model m | | network model}"
|
||||
"{query_list q | | list of query images}"
|
||||
"{gallery_list g | | list of gallery images}"
|
||||
"{batch_size | 32 | batch size of each inference}"
|
||||
"{resize_h | 256 | resize input to specific height.}"
|
||||
"{resize_w | 128 | resize input to specific width.}"
|
||||
"{topk k | 5 | number of gallery images showed in visualization}"
|
||||
"{output_dir | | path for visualization(it should be existed)}"
|
||||
"{backend b | 0 | choose one of computation backends: "
|
||||
"0: automatically (by default), "
|
||||
"1: Halide language (http://halide-lang.org/), "
|
||||
"2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"3: OpenCV implementation, "
|
||||
"4: VKCOM, "
|
||||
"5: CUDA }"
|
||||
"{target t | 0 | choose one of target computation devices: "
|
||||
"0: CPU target (by default), "
|
||||
"1: OpenCL, "
|
||||
"2: OpenCL fp16 (half-float precision), "
|
||||
"4: Vulkan, "
|
||||
"6: CUDA, "
|
||||
"7: CUDA fp16 (half-float preprocess) }";
|
||||
|
||||
namespace cv{
|
||||
namespace reid{
|
||||
|
||||
static Mat preprocess(const Mat& img)
|
||||
{
|
||||
const double mean[3] = {0.485, 0.456, 0.406};
|
||||
const double std[3] = {0.229, 0.224, 0.225};
|
||||
Mat ret = Mat(img.rows, img.cols, CV_32FC3);
|
||||
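// img comes from imread() and is therefore in BGR channel order, while the
// ImageNet mean/std constants above are listed in RGB order; the mean[2 - c]
// and std[2 - c] indexing below accounts for that (channels are swapped to RGB
// later by blobFromImages with swapRB=true).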
for (int y = 0; y < ret.rows; y ++)
|
||||
{
|
||||
for (int x = 0; x < ret.cols; x++)
|
||||
{
|
||||
for (int c = 0; c < 3; c++)
|
||||
{
|
||||
ret.at<Vec3f>(y,x)[c] = (float)((img.at<Vec3b>(y,x)[c] / 255.0 - mean[2 - c]) / std[2 - c]);
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static std::vector<float> normalization(const std::vector<float>& feature)
|
||||
{
|
||||
std::vector<float> ret;
|
||||
float sum = 0.0;
|
||||
for(int i = 0; i < (int)feature.size(); i++)
|
||||
{
|
||||
sum += feature[i] * feature[i];
|
||||
}
|
||||
sum = sqrt(sum);
|
||||
for(int i = 0; i < (int)feature.size(); i++)
|
||||
{
|
||||
ret.push_back(feature[i] / sum);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void extractFeatures(const std::vector<std::string>& imglist, Net* net, const int& batch_size, const int& resize_h, const int& resize_w, std::vector<std::vector<float>>& features)
|
||||
{
|
||||
for(int st = 0; st < (int)imglist.size(); st += batch_size)
|
||||
{
|
||||
std::vector<Mat> batch;
|
||||
for(int delta = 0; delta < batch_size && st + delta < (int)imglist.size(); delta++)
|
||||
{
|
||||
Mat img = imread(imglist[st + delta]);
|
||||
batch.push_back(preprocess(img));
|
||||
}
|
||||
Mat blob = dnn::blobFromImages(batch, 1.0, Size(resize_w, resize_h), Scalar(0.0,0.0,0.0), true, false, CV_32F);
|
||||
net->setInput(blob);
|
||||
Mat out = net->forward();
|
||||
for(int i = 0; i < (int)out.size().height; i++)
|
||||
{
|
||||
std::vector<float> temp_feature;
|
||||
for(int j = 0; j < (int)out.size().width; j++)
|
||||
{
|
||||
temp_feature.push_back(out.at<float>(i,j));
|
||||
}
|
||||
features.push_back(normalization(temp_feature));
|
||||
}
|
||||
}
|
||||
return ;
|
||||
}
|
||||
|
||||
static void getNames(const std::string& ImageList, std::vector<std::string>& result)
|
||||
{
|
||||
std::ifstream img_in(ImageList);
|
||||
std::string img_name;
|
||||
while(img_in >> img_name)
|
||||
{
|
||||
result.push_back(img_name);
|
||||
}
|
||||
return ;
|
||||
}
|
||||
|
||||
static float similarity(const std::vector<float>& feature1, const std::vector<float>& feature2)
|
||||
{
|
||||
float result = 0.0;
|
||||
for(int i = 0; i < (int)feature1.size(); i++)
|
||||
{
|
||||
result += feature1[i] * feature2[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static void getTopK(const std::vector<std::vector<float>>& queryFeatures, const std::vector<std::vector<float>>& galleryFeatures, const int& topk, std::vector<std::vector<int>>& result)
|
||||
{
|
||||
for(int i = 0; i < (int)queryFeatures.size(); i++)
|
||||
{
|
||||
std::vector<float> similarityList;
|
||||
std::vector<int> index;
|
||||
for(int j = 0; j < (int)galleryFeatures.size(); j++)
|
||||
{
|
||||
similarityList.push_back(similarity(queryFeatures[i], galleryFeatures[j]));
|
||||
index.push_back(j);
|
||||
}
|
||||
sort(index.begin(), index.end(), [&](int x,int y){return similarityList[x] > similarityList[y];});
|
||||
std::vector<int> topk_result;
|
||||
for(int j = 0; j < min(topk, (int)index.size()); j++)
|
||||
{
|
||||
topk_result.push_back(index[j]);
|
||||
}
|
||||
result.push_back(topk_result);
|
||||
}
|
||||
return ;
|
||||
}
|
||||
|
||||
static void addBorder(const Mat& img, const Scalar& color, Mat& result)
|
||||
{
|
||||
const int bordersize = 5;
|
||||
copyMakeBorder(img, result, bordersize, bordersize, bordersize, bordersize, cv::BORDER_CONSTANT, color);
|
||||
return ;
|
||||
}
|
||||
|
||||
static void drawRankList(const std::string& queryName, const std::vector<std::string>& galleryImageNames, const std::vector<int>& topk_index, const int& resize_h, const int& resize_w, Mat& result)
|
||||
{
|
||||
const Size outputSize = Size(resize_w, resize_h);
|
||||
Mat q_img = imread(queryName), temp_img;
|
||||
resize(q_img, temp_img, outputSize);
|
||||
addBorder(temp_img, Scalar(0,0,0), q_img);
|
||||
putText(q_img, "Query", Point(10, 30), FONT_HERSHEY_COMPLEX, 1.0, Scalar(0,255,0), 2);
|
||||
std::vector<Mat> Images;
|
||||
Images.push_back(q_img);
|
||||
for(int i = 0; i < (int)topk_index.size(); i++)
|
||||
{
|
||||
Mat g_img = imread(galleryImageNames[topk_index[i]]);
|
||||
resize(g_img, temp_img, outputSize);
|
||||
addBorder(temp_img, Scalar(255,255,255), g_img);
|
||||
putText(g_img, "G" + std::to_string(i), Point(10, 30), FONT_HERSHEY_COMPLEX, 1.0, Scalar(0,255,0), 2);
|
||||
Images.push_back(g_img);
|
||||
}
|
||||
hconcat(Images, result);
|
||||
return ;
|
||||
}
|
||||
|
||||
static void visualization(const std::vector<std::vector<int>>& topk, const std::vector<std::string>& queryImageNames, const std::vector<std::string>& galleryImageNames, const std::string& output_dir, const int& resize_h, const int& resize_w)
|
||||
{
|
||||
for(int i = 0; i < (int)queryImageNames.size(); i++)
|
||||
{
|
||||
Mat img;
|
||||
drawRankList(queryImageNames[i], galleryImageNames, topk[i], resize_h, resize_w, img);
|
||||
std::string output_path = output_dir + "/" + queryImageNames[i].substr(queryImageNames[i].rfind("/")+1);
|
||||
imwrite(output_path, img);
|
||||
}
|
||||
return ;
|
||||
}
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
// Parse command line arguments.
|
||||
CommandLineParser parser(argc, argv, keys);
|
||||
|
||||
if (argc == 1 || parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
parser = CommandLineParser(argc, argv, keys);
|
||||
parser.about("Use this script to run ReID networks using OpenCV.");
|
||||
|
||||
const std::string modelPath = parser.get<String>("model");
|
||||
const std::string queryImageList = parser.get<String>("query_list");
|
||||
const std::string galleryImageList = parser.get<String>("gallery_list");
|
||||
const int backend = parser.get<int>("backend");
|
||||
const int target = parser.get<int>("target");
|
||||
const int batch_size = parser.get<int>("batch_size");
|
||||
const int resize_h = parser.get<int>("resize_h");
|
||||
const int resize_w = parser.get<int>("resize_w");
|
||||
const int topk = parser.get<int>("topk");
|
||||
const std::string output_dir= parser.get<String>("output_dir");
|
||||
|
||||
std::vector<std::string> queryImageNames;
|
||||
reid::getNames(queryImageList, queryImageNames);
|
||||
std::vector<std::string> galleryImageNames;
|
||||
reid::getNames(galleryImageList, galleryImageNames);
|
||||
|
||||
dnn::Net net = dnn::readNet(modelPath);
|
||||
net.setPreferableBackend(backend);
|
||||
net.setPreferableTarget(target);
|
||||
|
||||
std::vector<std::vector<float>> queryFeatures;
|
||||
reid::extractFeatures(queryImageNames, &net, batch_size, resize_h, resize_w, queryFeatures);
|
||||
std::vector<std::vector<float>> galleryFeatures;
|
||||
reid::extractFeatures(galleryImageNames, &net, batch_size, resize_h, resize_w, galleryFeatures);
|
||||
|
||||
std::vector<std::vector<int>> topkResult;
|
||||
reid::getTopK(queryFeatures, galleryFeatures, topk, topkResult);
|
||||
reid::visualization(topkResult, queryImageNames, galleryImageNames, output_dir, resize_h, resize_w);
|
||||
|
||||
return 0;
|
||||
}
|
240
3rdparty/opencv-4.5.4/samples/dnn/person_reid.py
vendored
Normal file
@ -0,0 +1,240 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
You can download a baseline ReID model and sample input from:
|
||||
https://github.com/ReID-Team/ReID_extra_testdata
|
||||
|
||||
Authors of samples and Youtu ReID baseline:
|
||||
Xing Sun <winfredsun@tencent.com>
|
||||
Feng Zheng <zhengf@sustech.edu.cn>
|
||||
Xinyang Jiang <sevjiang@tencent.com>
|
||||
Fufu Yu <fufuyu@tencent.com>
|
||||
Enwei Zhang <miyozhang@tencent.com>
|
||||
|
||||
Copyright (C) 2020-2021, Tencent.
|
||||
Copyright (C) 2020-2021, SUSTech.
|
||||
'''
|
||||
import argparse
|
||||
import os.path
|
||||
import numpy as np
|
||||
import cv2 as cv
|
||||
|
||||
backends = (cv.dnn.DNN_BACKEND_DEFAULT,
|
||||
cv.dnn.DNN_BACKEND_INFERENCE_ENGINE,
|
||||
cv.dnn.DNN_BACKEND_OPENCV,
|
||||
cv.dnn.DNN_BACKEND_VKCOM,
|
||||
cv.dnn.DNN_BACKEND_CUDA)
|
||||
|
||||
targets = (cv.dnn.DNN_TARGET_CPU,
|
||||
cv.dnn.DNN_TARGET_OPENCL,
|
||||
cv.dnn.DNN_TARGET_OPENCL_FP16,
|
||||
cv.dnn.DNN_TARGET_MYRIAD,
|
||||
cv.dnn.DNN_TARGET_HDDL,
|
||||
cv.dnn.DNN_TARGET_VULKAN,
|
||||
cv.dnn.DNN_TARGET_CUDA,
|
||||
cv.dnn.DNN_TARGET_CUDA_FP16)
|
||||
|
||||
MEAN = (0.485, 0.456, 0.406)
|
||||
STD = (0.229, 0.224, 0.225)
|
||||
|
||||
def preprocess(images, height, width):
|
||||
"""
|
||||
Create 4-dimensional blob from image
|
||||
:param images: input images
|
||||
:param height: the height of the resized input image
|
||||
:param width: the width of the resized input image
|
||||
"""
|
||||
img_list = []
|
||||
for image in images:
|
||||
image = cv.resize(image, (width, height))
|
||||
img_list.append(image[:, :, ::-1])
|
||||
|
||||
images = np.array(img_list)
|
||||
images = (images / 255.0 - MEAN) / STD
|
||||
|
||||
input = cv.dnn.blobFromImages(images.astype(np.float32), ddepth = cv.CV_32F)
|
||||
return input
|
||||
|
||||
def extract_feature(img_dir, model_path, batch_size = 32, resize_h = 384, resize_w = 128, backend=cv.dnn.DNN_BACKEND_OPENCV, target=cv.dnn.DNN_TARGET_CPU):
|
||||
"""
|
||||
Extract features from images in a target directory
|
||||
:param img_dir: the input image directory
|
||||
:param model_path: path to ReID model
|
||||
:param batch_size: the batch size for each network inference iteration
|
||||
:param resize_h: the height of the input image
|
||||
:param resize_w: the width of the input image
|
||||
:param backend: name of computation backend
|
||||
:param target: name of computation target
|
||||
"""
|
||||
feat_list = []
|
||||
path_list = os.listdir(img_dir)
|
||||
path_list = [os.path.join(img_dir, img_name) for img_name in path_list]
|
||||
count = 0
|
||||
|
||||
for i in range(0, len(path_list), batch_size):
|
||||
print('Feature Extraction for images in', img_dir, 'Batch:', count, '/', len(path_list))
|
||||
batch = path_list[i : min(i + batch_size, len(path_list))]
|
||||
imgs = read_data(batch)
|
||||
inputs = preprocess(imgs, resize_h, resize_w)
|
||||
|
||||
feat = run_net(inputs, model_path, backend, target)
|
||||
|
||||
feat_list.append(feat)
|
||||
count += batch_size
|
||||
|
||||
feats = np.concatenate(feat_list, axis = 0)
|
||||
return feats, path_list
|
||||
|
||||
def run_net(inputs, model_path, backend=cv.dnn.DNN_BACKEND_OPENCV, target=cv.dnn.DNN_TARGET_CPU):
|
||||
"""
|
||||
Forward propagation for a batch of images.
|
||||
:param inputs: input batch of images
|
||||
:param model_path: path to ReID model
|
||||
:param backend: name of computation backend
|
||||
:param target: name of computation target
|
||||
"""
|
||||
net = cv.dnn.readNet(model_path)
|
||||
net.setPreferableBackend(backend)
|
||||
net.setPreferableTarget(target)
|
||||
net.setInput(inputs)
|
||||
out = net.forward()
|
||||
out = np.reshape(out, (out.shape[0], out.shape[1]))
|
||||
return out
|
||||
|
||||
def read_data(path_list):
|
||||
"""
|
||||
Read all images from a directory into a list
|
||||
:param path_list: the list of image path
|
||||
"""
|
||||
img_list = []
|
||||
for img_path in path_list:
|
||||
img = cv.imread(img_path)
|
||||
if img is None:
|
||||
continue
|
||||
img_list.append(img)
|
||||
return img_list
|
||||
|
||||
def normalize(nparray, order=2, axis=0):
|
||||
"""
|
||||
Normalize a N-D numpy array along the specified axis.
|
||||
:param nparray: the array of vectors to be normalized
|
||||
:param order: order of the norm
|
||||
:param axis: the axis of x along which to compute the vector norms
|
||||
"""
|
||||
norm = np.linalg.norm(nparray, ord=order, axis=axis, keepdims=True)
|
||||
return nparray / (norm + np.finfo(np.float32).eps)
|
||||
|
||||
def similarity(array1, array2):
|
||||
"""
|
||||
Compute the cosine similarity of all pairs of feature vectors.
|
||||
:param array1: numpy array with shape [m1, n]
|
||||
:param array2: numpy array with shape [m2, n]
|
||||
Returns:
|
||||
numpy array with shape [m1, m2]
|
||||
"""
|
||||
array1 = normalize(array1, axis=1)
|
||||
array2 = normalize(array2, axis=1)
|
||||
dist = np.matmul(array1, array2.T)
|
||||
return dist
|
||||
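# Since both inputs are L2-normalized along axis=1, the matrix product above is
# exactly the pairwise cosine similarity: entry (i, j) compares query i with
# gallery j. topk() below argsorts each row of this matrix in descending order
# and keeps the first `topk` column indices.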
|
||||
def topk(query_feat, gallery_feat, topk = 5):
|
||||
"""
|
||||
Return the index of top K gallery images most similar to the query images
|
||||
:param query_feat: array of feature vectors of query images
|
||||
:param gallery_feat: array of feature vectors of gallery images
|
||||
:param topk: number of gallery images to return
|
||||
"""
|
||||
sim = similarity(query_feat, gallery_feat)
|
||||
index = np.argsort(-sim, axis = 1)
|
||||
return [i[0:int(topk)] for i in index]
|
||||
|
||||
def drawRankList(query_name, gallery_list, output_size = (128, 384)):
|
||||
"""
|
||||
Draw the rank list
|
||||
:param query_name: path of the query image
|
||||
:param gallery_list: paths of the gallery images
|
||||
"param output_size: the output size of each image in the rank list
|
||||
"""
|
||||
def addBorder(im, color):
|
||||
bordersize = 5
|
||||
border = cv.copyMakeBorder(
|
||||
im,
|
||||
top = bordersize,
|
||||
bottom = bordersize,
|
||||
left = bordersize,
|
||||
right = bordersize,
|
||||
borderType = cv.BORDER_CONSTANT,
|
||||
value = color
|
||||
)
|
||||
return border
|
||||
query_img = cv.imread(query_name)
|
||||
query_img = cv.resize(query_img, output_size)
|
||||
query_img = addBorder(query_img, [0, 0, 0])
|
||||
cv.putText(query_img, 'Query', (10, 30), cv.FONT_HERSHEY_COMPLEX, 1., (0,255,0), 2)
|
||||
|
||||
gallery_img_list = []
|
||||
for i, gallery_name in enumerate(gallery_list):
|
||||
gallery_img = cv.imread(gallery_name)
|
||||
gallery_img = cv.resize(gallery_img, output_size)
|
||||
gallery_img = addBorder(gallery_img, [255, 255, 255])
|
||||
cv.putText(gallery_img, 'G%02d'%i, (10, 30), cv.FONT_HERSHEY_COMPLEX, 1., (0,255,0), 2)
|
||||
gallery_img_list.append(gallery_img)
|
||||
ret = np.concatenate([query_img] + gallery_img_list, axis = 1)
|
||||
return ret
|
||||
|
||||
|
||||
def visualization(topk_idx, query_names, gallery_names, output_dir = 'vis'):
|
||||
"""
|
||||
Visualize the retrieval results with the person ReID model
|
||||
:param topk_idx: the index of ranked gallery images for each query image
|
||||
:param query_names: the list of paths of query images
|
||||
:param gallery_names: the list of paths of gallery images
|
||||
:param output_dir: the path to save the visualize results
|
||||
"""
|
||||
if not os.path.exists(output_dir):
|
||||
os.mkdir(output_dir)
|
||||
for i, idx in enumerate(topk_idx):
|
||||
query_name = query_names[i]
|
||||
topk_names = [gallery_names[j] for j in idx]
|
||||
vis_img = drawRankList(query_name, topk_names)
|
||||
output_path = os.path.join(output_dir, '%03d_%s'%(i, os.path.basename(query_name)))
|
||||
cv.imwrite(output_path, vis_img)
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='Use this script to run person re-identification networks using OpenCV',
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('--query_dir', '-q', required=True, help='Path to the query image directory.')
|
||||
parser.add_argument('--gallery_dir', '-g', required=True, help='Path to gallery directory.')
|
||||
parser.add_argument('--resize_h', default = 256, help='The height of the input for model inference.')
|
||||
parser.add_argument('--resize_w', default = 128, help='The width of the input for model inference')
|
||||
parser.add_argument('--model', '-m', default='reid.onnx', help='Path to pb model.')
|
||||
parser.add_argument('--visualization_dir', default='vis', help='Path for the visualization results')
|
||||
parser.add_argument('--topk', default=10, help='Number of images visualized in the rank list')
|
||||
parser.add_argument('--batchsize', default=32, help='The batch size of each inference')
|
||||
parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
|
||||
help="Choose one of computation backends: "
|
||||
"%d: automatically (by default), "
|
||||
"%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"%d: OpenCV implementation, "
|
||||
"%d: VKCOM, "
|
||||
"%d: CUDA backend"% backends)
|
||||
parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
|
||||
help='Choose one of target computation devices: '
|
||||
'%d: CPU target (by default), '
|
||||
'%d: OpenCL, '
|
||||
'%d: OpenCL fp16 (half-float precision), '
|
||||
'%d: NCS2 VPU, '
|
||||
'%d: HDDL VPU, '
|
||||
'%d: Vulkan, '
|
||||
'%d: CUDA, '
|
||||
'%d: CUDA FP16'
|
||||
% targets)
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
if not os.path.isfile(args.model):
|
||||
raise OSError("Model not exist")
|
||||
|
||||
query_feat, query_names = extract_feature(args.query_dir, args.model, args.batchsize, args.resize_h, args.resize_w, args.backend, args.target)
|
||||
gallery_feat, gallery_names = extract_feature(args.gallery_dir, args.model, args.batchsize, args.resize_h, args.resize_w, args.backend, args.target)
|
||||
|
||||
topk_idx = topk(query_feat, gallery_feat, args.topk)
|
||||
visualization(topk_idx, query_names, gallery_names, output_dir = args.visualization_dir)
|
BIN
3rdparty/opencv-4.5.4/samples/dnn/results/audrybt1.jpg
vendored
Normal file
Binary file not shown.
165
3rdparty/opencv-4.5.4/samples/dnn/scene_text_detection.cpp
vendored
Normal file
@ -0,0 +1,165 @@
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
#include <opencv2/dnn/dnn.hpp>
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::dnn;
|
||||
|
||||
std::string keys =
|
||||
"{ help h | | Print help message. }"
|
||||
"{ inputImage i | | Path to an input image. Skip this argument to capture frames from a camera. }"
|
||||
"{ modelPath mp | | Path to a binary .onnx file contains trained DB detector model. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"
|
||||
"{ inputHeight ih |736| image height of the model input. It should be multiple by 32.}"
|
||||
"{ inputWidth iw |736| image width of the model input. It should be multiple by 32.}"
|
||||
"{ binaryThreshold bt |0.3| Confidence threshold of the binary map. }"
|
||||
"{ polygonThreshold pt |0.5| Confidence threshold of polygons. }"
|
||||
"{ maxCandidate max |200| Max candidates of polygons. }"
|
||||
"{ unclipRatio ratio |2.0| unclip ratio. }"
|
||||
"{ evaluate e |false| false: predict with input images; true: evaluate on benchmarks. }"
|
||||
"{ evalDataPath edp | | Path to benchmarks for evaluation. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}";
|
||||
|
||||
static
|
||||
void split(const std::string& s, char delimiter, std::vector<std::string>& elems)
|
||||
{
|
||||
elems.clear();
|
||||
size_t prev_pos = 0;
|
||||
size_t pos = 0;
|
||||
while ((pos = s.find(delimiter, prev_pos)) != std::string::npos)
|
||||
{
|
||||
elems.emplace_back(s.substr(prev_pos, pos - prev_pos));
|
||||
prev_pos = pos + 1;
|
||||
}
|
||||
if (prev_pos < s.size())
|
||||
elems.emplace_back(s.substr(prev_pos, s.size() - prev_pos));
|
||||
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
// Parse arguments
|
||||
CommandLineParser parser(argc, argv, keys);
|
||||
parser.about("Use this script to run the official PyTorch implementation (https://github.com/MhLiao/DB) of "
|
||||
"Real-time Scene Text Detection with Differentiable Binarization (https://arxiv.org/abs/1911.08947)\n"
|
||||
"The current version of this script is a variant of the original network without deformable convolution");
|
||||
if (argc == 1 || parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
float binThresh = parser.get<float>("binaryThreshold");
|
||||
float polyThresh = parser.get<float>("polygonThreshold");
|
||||
uint maxCandidates = parser.get<uint>("maxCandidate");
|
||||
String modelPath = parser.get<String>("modelPath");
|
||||
double unclipRatio = parser.get<double>("unclipRatio");
|
||||
int height = parser.get<int>("inputHeight");
|
||||
int width = parser.get<int>("inputWidth");
|
||||
|
||||
if (!parser.check())
|
||||
{
|
||||
parser.printErrors();
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Load the network
|
||||
CV_Assert(!modelPath.empty());
|
||||
TextDetectionModel_DB detector(modelPath);
|
||||
detector.setBinaryThreshold(binThresh)
|
||||
.setPolygonThreshold(polyThresh)
|
||||
.setUnclipRatio(unclipRatio)
|
||||
.setMaxCandidates(maxCandidates);
|
||||
|
||||
double scale = 1.0 / 255.0;
|
||||
Size inputSize = Size(width, height);
|
||||
Scalar mean = Scalar(122.67891434, 116.66876762, 104.00698793);
|
||||
detector.setInputParams(scale, inputSize, mean);
|
||||
|
||||
// Create a window
|
||||
static const std::string winName = "TextDetectionModel";
|
||||
|
||||
if (parser.get<bool>("evaluate")) {
|
||||
// for evaluation
|
||||
String evalDataPath = parser.get<String>("evalDataPath");
|
||||
CV_Assert(!evalDataPath.empty());
|
||||
String testListPath = evalDataPath + "/test_list.txt";
|
||||
std::ifstream testList;
|
||||
testList.open(testListPath);
|
||||
CV_Assert(testList.is_open());
|
||||
|
||||
// Create a window for showing groundtruth
|
||||
static const std::string winNameGT = "GT";
|
||||
|
||||
String testImgPath;
|
||||
while (std::getline(testList, testImgPath)) {
|
||||
String imgPath = evalDataPath + "/test_images/" + testImgPath;
|
||||
std::cout << "Image Path: " << imgPath << std::endl;
|
||||
|
||||
Mat frame = imread(samples::findFile(imgPath), IMREAD_COLOR);
|
||||
CV_Assert(!frame.empty());
|
||||
Mat src = frame.clone();
|
||||
|
||||
// Inference
|
||||
std::vector<std::vector<Point>> results;
|
||||
detector.detect(frame, results);
|
||||
|
||||
polylines(frame, results, true, Scalar(0, 255, 0), 2);
|
||||
imshow(winName, frame);
|
||||
|
||||
// load groundtruth
|
||||
String imgName = testImgPath.substr(0, testImgPath.length() - 4);
|
||||
String gtPath = evalDataPath + "/test_gts/" + imgName + ".txt";
|
||||
// std::cout << gtPath << std::endl;
|
||||
std::ifstream gtFile;
|
||||
gtFile.open(gtPath);
|
||||
CV_Assert(gtFile.is_open());
|
||||
|
||||
std::vector<std::vector<Point>> gts;
|
||||
String gtLine;
|
||||
while (std::getline(gtFile, gtLine)) {
|
||||
size_t splitLoc = gtLine.find_last_of(',');
|
||||
String text = gtLine.substr(splitLoc+1);
|
||||
if ( text == "###\r" || text == "1") {
|
||||
// ignore difficult instances
|
||||
continue;
|
||||
}
|
||||
gtLine = gtLine.substr(0, splitLoc);
|
||||
|
||||
std::vector<std::string> v;
|
||||
split(gtLine, ',', v);
|
||||
|
||||
std::vector<int> loc;
|
||||
std::vector<Point> pts;
|
||||
for (auto && s : v) {
|
||||
loc.push_back(atoi(s.c_str()));
|
||||
}
|
||||
for (size_t i = 0; i < loc.size() / 2; i++) {
|
||||
pts.push_back(Point(loc[2 * i], loc[2 * i + 1]));
|
||||
}
|
||||
gts.push_back(pts);
|
||||
}
|
||||
polylines(src, gts, true, Scalar(0, 255, 0), 2);
|
||||
imshow(winNameGT, src);
|
||||
|
||||
waitKey();
|
||||
}
|
||||
} else {
|
||||
// Open an image file
|
||||
CV_Assert(parser.has("inputImage"));
|
||||
Mat frame = imread(samples::findFile(parser.get<String>("inputImage")));
|
||||
CV_Assert(!frame.empty());
|
||||
|
||||
// Detect
|
||||
std::vector<std::vector<Point>> results;
|
||||
detector.detect(frame, results);
|
||||
|
||||
polylines(frame, results, true, Scalar(0, 255, 0), 2);
|
||||
imshow(winName, frame);
|
||||
waitKey();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
144
3rdparty/opencv-4.5.4/samples/dnn/scene_text_recognition.cpp
vendored
Normal file
@ -0,0 +1,144 @@
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
#include <opencv2/dnn/dnn.hpp>
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::dnn;
|
||||
|
||||
String keys =
|
||||
"{ help h | | Print help message. }"
|
||||
"{ inputImage i | | Path to an input image. Skip this argument to capture frames from a camera. }"
|
||||
"{ modelPath mp | | Path to a binary .onnx file contains trained CRNN text recognition model. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"
|
||||
"{ RGBInput rgb |0| 0: imread with flags=IMREAD_GRAYSCALE; 1: imread with flags=IMREAD_COLOR. }"
|
||||
"{ evaluate e |false| false: predict with input images; true: evaluate on benchmarks. }"
|
||||
"{ evalDataPath edp | | Path to benchmarks for evaluation. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"
|
||||
"{ vocabularyPath vp | alphabet_36.txt | Path to recognition vocabulary. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}";
|
||||
|
||||
String convertForEval(String &input);
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
// Parse arguments
|
||||
CommandLineParser parser(argc, argv, keys);
|
||||
parser.about("Use this script to run the PyTorch implementation of "
|
||||
"An End-to-End Trainable Neural Network for Image-based SequenceRecognition and Its Application to Scene Text Recognition "
|
||||
"(https://arxiv.org/abs/1507.05717)");
|
||||
if (argc == 1 || parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
String modelPath = parser.get<String>("modelPath");
|
||||
String vocPath = parser.get<String>("vocabularyPath");
|
||||
int imreadRGB = parser.get<int>("RGBInput");
|
||||
|
||||
if (!parser.check())
|
||||
{
|
||||
parser.printErrors();
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Load the network
|
||||
CV_Assert(!modelPath.empty());
|
||||
TextRecognitionModel recognizer(modelPath);
|
||||
|
||||
// Load vocabulary
|
||||
CV_Assert(!vocPath.empty());
|
||||
std::ifstream vocFile;
|
||||
vocFile.open(samples::findFile(vocPath));
|
||||
CV_Assert(vocFile.is_open());
|
||||
String vocLine;
|
||||
std::vector<String> vocabulary;
|
||||
while (std::getline(vocFile, vocLine)) {
|
||||
vocabulary.push_back(vocLine);
|
||||
}
|
||||
recognizer.setVocabulary(vocabulary);
|
||||
recognizer.setDecodeType("CTC-greedy");
|
||||
|
||||
// Set parameters
|
||||
double scale = 1.0 / 127.5;
|
||||
Scalar mean = Scalar(127.5, 127.5, 127.5);
|
||||
Size inputSize = Size(100, 32);
|
||||
recognizer.setInputParams(scale, inputSize, mean);
|
||||
|
||||
if (parser.get<bool>("evaluate"))
|
||||
{
|
||||
// For evaluation
|
||||
String evalDataPath = parser.get<String>("evalDataPath");
|
||||
CV_Assert(!evalDataPath.empty());
|
||||
String gtPath = evalDataPath + "/test_gts.txt";
|
||||
std::ifstream evalGts;
|
||||
evalGts.open(gtPath);
|
||||
CV_Assert(evalGts.is_open());
|
||||
|
||||
String gtLine;
|
||||
int cntRight=0, cntAll=0;
|
||||
TickMeter timer;
|
||||
timer.reset();
|
||||
|
||||
while (std::getline(evalGts, gtLine)) {
|
||||
size_t splitLoc = gtLine.find_first_of(' ');
|
||||
String imgPath = evalDataPath + '/' + gtLine.substr(0, splitLoc);
|
||||
String gt = gtLine.substr(splitLoc+1);
|
||||
|
||||
// Inference
|
||||
Mat frame = imread(samples::findFile(imgPath), imreadRGB);
|
||||
CV_Assert(!frame.empty());
|
||||
timer.start();
|
||||
std::string recognitionResult = recognizer.recognize(frame);
|
||||
timer.stop();
|
||||
|
||||
if (gt == convertForEval(recognitionResult))
|
||||
cntRight++;
|
||||
|
||||
cntAll++;
|
||||
}
|
||||
std::cout << "Accuracy(%): " << (double)(cntRight) / (double)(cntAll) << std::endl;
|
||||
std::cout << "Average Inference Time(ms): " << timer.getTimeMilli() / (double)(cntAll) << std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Create a window
|
||||
static const std::string winName = "Input Cropped Image";
|
||||
|
||||
// Open an image file
|
||||
CV_Assert(parser.has("inputImage"));
|
||||
Mat frame = imread(samples::findFile(parser.get<String>("inputImage")), imreadRGB);
|
||||
CV_Assert(!frame.empty());
|
||||
|
||||
// Recognition
|
||||
std::string recognitionResult = recognizer.recognize(frame);
|
||||
|
||||
imshow(winName, frame);
|
||||
std::cout << "Predition: '" << recognitionResult << "'" << std::endl;
|
||||
waitKey();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Convert the predictions to lower case, and remove other characters.
|
||||
// Only for Evaluation
|
||||
String convertForEval(String & input)
|
||||
{
|
||||
String output;
|
||||
for (uint i = 0; i < input.length(); i++){
|
||||
char ch = input[i];
|
||||
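// ASCII codes 97..122 are 'a'..'z' and 65..90 are 'A'..'Z'; adding 32 to an
// upper-case code yields the corresponding lower-case letter.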
if ((int)ch >= 97 && (int)ch <= 122) {
|
||||
output.push_back(ch);
|
||||
} else if ((int)ch >= 65 && (int)ch <= 90) {
|
||||
output.push_back((char)(ch + 32));
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
169
3rdparty/opencv-4.5.4/samples/dnn/scene_text_spotting.cpp
vendored
Normal file
@ -0,0 +1,169 @@
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
#include <opencv2/dnn/dnn.hpp>
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::dnn;
|
||||
|
||||
std::string keys =
|
||||
"{ help h | | Print help message. }"
|
||||
"{ inputImage i | | Path to an input image. Skip this argument to capture frames from a camera. }"
|
||||
"{ detModelPath dmp | | Path to a binary .onnx model for detection. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"
|
||||
"{ recModelPath rmp | | Path to a binary .onnx model for recognition. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"
|
||||
"{ inputHeight ih |736| image height of the model input. It should be multiple by 32.}"
|
||||
"{ inputWidth iw |736| image width of the model input. It should be multiple by 32.}"
|
||||
"{ RGBInput rgb |0| 0: imread with flags=IMREAD_GRAYSCALE; 1: imread with flags=IMREAD_COLOR. }"
|
||||
"{ binaryThreshold bt |0.3| Confidence threshold of the binary map. }"
|
||||
"{ polygonThreshold pt |0.5| Confidence threshold of polygons. }"
|
||||
"{ maxCandidate max |200| Max candidates of polygons. }"
|
||||
"{ unclipRatio ratio |2.0| unclip ratio. }"
|
||||
"{ vocabularyPath vp | alphabet_36.txt | Path to benchmarks for evaluation. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}";
|
||||
|
||||
void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result);
|
||||
bool sortPts(const Point& p1, const Point& p2);
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
// Parse arguments
|
||||
CommandLineParser parser(argc, argv, keys);
|
||||
parser.about("Use this script to run an end-to-end inference sample of textDetectionModel and textRecognitionModel APIs\n"
|
||||
"Use -h for more information");
|
||||
if (argc == 1 || parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
float binThresh = parser.get<float>("binaryThreshold");
|
||||
float polyThresh = parser.get<float>("polygonThreshold");
|
||||
uint maxCandidates = parser.get<uint>("maxCandidate");
|
||||
String detModelPath = parser.get<String>("detModelPath");
|
||||
String recModelPath = parser.get<String>("recModelPath");
|
||||
String vocPath = parser.get<String>("vocabularyPath");
|
||||
double unclipRatio = parser.get<double>("unclipRatio");
|
||||
int height = parser.get<int>("inputHeight");
|
||||
int width = parser.get<int>("inputWidth");
|
||||
int imreadRGB = parser.get<int>("RGBInput");
|
||||
|
||||
if (!parser.check())
|
||||
{
|
||||
parser.printErrors();
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Load networks
|
||||
CV_Assert(!detModelPath.empty());
|
||||
TextDetectionModel_DB detector(detModelPath);
|
||||
detector.setBinaryThreshold(binThresh)
|
||||
.setPolygonThreshold(polyThresh)
|
||||
.setUnclipRatio(unclipRatio)
|
||||
.setMaxCandidates(maxCandidates);
|
||||
|
||||
CV_Assert(!recModelPath.empty());
|
||||
TextRecognitionModel recognizer(recModelPath);
|
||||
|
||||
// Load vocabulary
|
||||
CV_Assert(!vocPath.empty());
|
||||
std::ifstream vocFile;
|
||||
vocFile.open(samples::findFile(vocPath));
|
||||
CV_Assert(vocFile.is_open());
|
||||
String vocLine;
|
||||
std::vector<String> vocabulary;
|
||||
while (std::getline(vocFile, vocLine)) {
|
||||
vocabulary.push_back(vocLine);
|
||||
}
|
||||
recognizer.setVocabulary(vocabulary);
|
||||
recognizer.setDecodeType("CTC-greedy");
|
||||
|
||||
// Parameters for Detection
|
||||
double detScale = 1.0 / 255.0;
|
||||
Size detInputSize = Size(width, height);
|
||||
Scalar detMean = Scalar(122.67891434, 116.66876762, 104.00698793);
|
||||
detector.setInputParams(detScale, detInputSize, detMean);
|
||||
|
||||
// Parameters for Recognition
|
||||
double recScale = 1.0 / 127.5;
|
||||
Scalar recMean = Scalar(127.5);
|
||||
Size recInputSize = Size(100, 32);
|
||||
recognizer.setInputParams(recScale, recInputSize, recMean);
|
||||
|
||||
// Create a window
|
||||
static const std::string winName = "Text_Spotting";
|
||||
|
||||
// Input data
|
||||
Mat frame = imread(samples::findFile(parser.get<String>("inputImage")));
|
||||
std::cout << frame.size << std::endl;
|
||||
|
||||
// Inference
|
||||
std::vector< std::vector<Point> > detResults;
|
||||
detector.detect(frame, detResults);
|
||||
|
||||
if (detResults.size() > 0) {
|
||||
// Text Recognition
|
||||
Mat recInput;
|
||||
if (!imreadRGB) {
|
||||
cvtColor(frame, recInput, cv::COLOR_BGR2GRAY);
|
||||
} else {
|
||||
recInput = frame;
|
||||
}
|
||||
std::vector< std::vector<Point> > contours;
|
||||
for (uint i = 0; i < detResults.size(); i++)
|
||||
{
|
||||
const auto& quadrangle = detResults[i];
|
||||
CV_CheckEQ(quadrangle.size(), (size_t)4, "");
|
||||
|
||||
contours.emplace_back(quadrangle);
|
||||
|
||||
std::vector<Point2f> quadrangle_2f;
|
||||
for (int j = 0; j < 4; j++)
|
||||
quadrangle_2f.emplace_back(quadrangle[j]);
|
||||
|
||||
// Transform and Crop
|
||||
Mat cropped;
|
||||
fourPointsTransform(recInput, &quadrangle_2f[0], cropped);
|
||||
|
||||
std::string recognitionResult = recognizer.recognize(cropped);
|
||||
std::cout << i << ": '" << recognitionResult << "'" << std::endl;
|
||||
|
||||
putText(frame, recognitionResult, quadrangle[3], FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255), 2);
|
||||
}
|
||||
polylines(frame, contours, true, Scalar(0, 255, 0), 2);
|
||||
} else {
|
||||
std::cout << "No Text Detected." << std::endl;
|
||||
}
|
||||
imshow(winName, frame);
|
||||
waitKey();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result)
|
||||
{
|
||||
const Size outputSize = Size(100, 32);
|
||||
|
||||
Point2f targetVertices[4] = {
|
||||
Point(0, outputSize.height - 1),
|
||||
Point(0, 0),
|
||||
Point(outputSize.width - 1, 0),
|
||||
Point(outputSize.width - 1, outputSize.height - 1)
|
||||
};
|
||||
Mat rotationMatrix = getPerspectiveTransform(vertices, targetVertices);
|
||||
|
||||
warpPerspective(frame, result, rotationMatrix, outputSize);
|
||||
|
||||
#if 0
|
||||
imshow("roi", result);
|
||||
waitKey();
|
||||
#endif
|
||||
}
|
||||
|
||||
bool sortPts(const Point& p1, const Point& p2)
|
||||
{
|
||||
return p1.x < p2.x;
|
||||
}
|
252
3rdparty/opencv-4.5.4/samples/dnn/segmentation.cpp
vendored
Normal file
@ -0,0 +1,252 @@
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
|
||||
#include <opencv2/dnn.hpp>
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
std::string keys =
|
||||
"{ help h | | Print help message. }"
|
||||
"{ @alias | | An alias name of model to extract preprocessing parameters from models.yml file. }"
|
||||
"{ zoo | models.yml | An optional path to file with preprocessing parameters }"
|
||||
"{ device | 0 | camera device number. }"
|
||||
"{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera. }"
|
||||
"{ framework f | | Optional name of an origin framework of the model. Detect it automatically if it does not set. }"
|
||||
"{ classes | | Optional path to a text file with names of classes. }"
|
||||
"{ colors | | Optional path to a text file with colors for an every class. "
|
||||
"An every color is represented with three values from 0 to 255 in BGR channels order. }"
|
||||
"{ backend | 0 | Choose one of computation backends: "
|
||||
"0: automatically (by default), "
|
||||
"1: Halide language (http://halide-lang.org/), "
|
||||
"2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"3: OpenCV implementation, "
|
||||
"4: VKCOM, "
|
||||
"5: CUDA }"
|
||||
"{ target | 0 | Choose one of target computation devices: "
|
||||
"0: CPU target (by default), "
|
||||
"1: OpenCL, "
|
||||
"2: OpenCL fp16 (half-float precision), "
|
||||
"3: VPU, "
|
||||
"4: Vulkan, "
|
||||
"6: CUDA, "
|
||||
"7: CUDA fp16 (half-float preprocess) }";
|
||||
|
||||
using namespace cv;
|
||||
using namespace dnn;
|
||||
|
||||
std::vector<std::string> classes;
|
||||
std::vector<Vec3b> colors;
|
||||
|
||||
void showLegend();
|
||||
|
||||
void colorizeSegmentation(const Mat &score, Mat &segm);
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
CommandLineParser parser(argc, argv, keys);
|
||||
|
||||
const std::string modelName = parser.get<String>("@alias");
|
||||
const std::string zooFile = parser.get<String>("zoo");
|
||||
|
||||
keys += genPreprocArguments(modelName, zooFile);
|
||||
|
||||
parser = CommandLineParser(argc, argv, keys);
|
||||
parser.about("Use this script to run semantic segmentation deep learning networks using OpenCV.");
|
||||
if (argc == 1 || parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
float scale = parser.get<float>("scale");
|
||||
Scalar mean = parser.get<Scalar>("mean");
|
||||
bool swapRB = parser.get<bool>("rgb");
|
||||
int inpWidth = parser.get<int>("width");
|
||||
int inpHeight = parser.get<int>("height");
|
||||
String model = findFile(parser.get<String>("model"));
|
||||
String config = findFile(parser.get<String>("config"));
|
||||
String framework = parser.get<String>("framework");
|
||||
int backendId = parser.get<int>("backend");
|
||||
int targetId = parser.get<int>("target");
|
||||
|
||||
// Open file with classes names.
|
||||
if (parser.has("classes"))
|
||||
{
|
||||
std::string file = parser.get<String>("classes");
|
||||
std::ifstream ifs(file.c_str());
|
||||
if (!ifs.is_open())
|
||||
CV_Error(Error::StsError, "File " + file + " not found");
|
||||
std::string line;
|
||||
while (std::getline(ifs, line))
|
||||
{
|
||||
classes.push_back(line);
|
||||
}
|
||||
}
|
||||
|
||||
// Open file with colors.
|
||||
if (parser.has("colors"))
|
||||
{
|
||||
std::string file = parser.get<String>("colors");
|
||||
std::ifstream ifs(file.c_str());
|
||||
if (!ifs.is_open())
|
||||
CV_Error(Error::StsError, "File " + file + " not found");
|
||||
std::string line;
|
||||
while (std::getline(ifs, line))
|
||||
{
|
||||
std::istringstream colorStr(line.c_str());
|
||||
|
||||
Vec3b color;
|
||||
for (int i = 0; i < 3 && !colorStr.eof(); ++i)
|
||||
colorStr >> color[i];
|
||||
colors.push_back(color);
|
||||
}
|
||||
}
|
||||
|
||||
if (!parser.check())
|
||||
{
|
||||
parser.printErrors();
|
||||
return 1;
|
||||
}
|
||||
|
||||
CV_Assert(!model.empty());
|
||||
//! [Read and initialize network]
|
||||
Net net = readNet(model, config, framework);
|
||||
net.setPreferableBackend(backendId);
|
||||
net.setPreferableTarget(targetId);
|
||||
//! [Read and initialize network]
|
||||
|
||||
// Create a window
|
||||
static const std::string kWinName = "Deep learning semantic segmentation in OpenCV";
|
||||
namedWindow(kWinName, WINDOW_NORMAL);
|
||||
|
||||
//! [Open a video file or an image file or a camera stream]
|
||||
VideoCapture cap;
|
||||
if (parser.has("input"))
|
||||
cap.open(parser.get<String>("input"));
|
||||
else
|
||||
cap.open(parser.get<int>("device"));
|
||||
//! [Open a video file or an image file or a camera stream]
|
||||
|
||||
// Process frames.
|
||||
Mat frame, blob;
|
||||
while (waitKey(1) < 0)
|
||||
{
|
||||
cap >> frame;
|
||||
if (frame.empty())
|
||||
{
|
||||
waitKey();
|
||||
break;
|
||||
}
|
||||
|
||||
//! [Create a 4D blob from a frame]
|
||||
blobFromImage(frame, blob, scale, Size(inpWidth, inpHeight), mean, swapRB, false);
|
||||
//! [Create a 4D blob from a frame]
|
||||
|
||||
//! [Set input blob]
|
||||
net.setInput(blob);
|
||||
//! [Set input blob]
|
||||
//! [Make forward pass]
|
||||
Mat score = net.forward();
|
||||
//! [Make forward pass]
|
||||
|
||||
Mat segm;
|
||||
colorizeSegmentation(score, segm);
|
||||
|
||||
resize(segm, segm, frame.size(), 0, 0, INTER_NEAREST);
|
||||
addWeighted(frame, 0.1, segm, 0.9, 0.0, frame);
|
||||
|
||||
// Put efficiency information.
|
||||
std::vector<double> layersTimes;
|
||||
double freq = getTickFrequency() / 1000;
|
||||
double t = net.getPerfProfile(layersTimes) / freq;
|
||||
std::string label = format("Inference time: %.2f ms", t);
|
||||
putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
|
||||
|
||||
imshow(kWinName, frame);
|
||||
if (!classes.empty())
|
||||
showLegend();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void colorizeSegmentation(const Mat &score, Mat &segm)
|
||||
{
|
||||
const int rows = score.size[2];
|
||||
const int cols = score.size[3];
|
||||
const int chns = score.size[1];
|
||||
|
||||
if (colors.empty())
|
||||
{
|
||||
// Generate colors.
|
||||
colors.push_back(Vec3b());
|
||||
for (int i = 1; i < chns; ++i)
|
||||
{
|
||||
Vec3b color;
|
||||
for (int j = 0; j < 3; ++j)
|
||||
color[j] = (colors[i - 1][j] + rand() % 256) / 2;
|
||||
colors.push_back(color);
|
||||
}
|
||||
}
|
||||
else if (chns != (int)colors.size())
|
||||
{
|
||||
CV_Error(Error::StsError, format("Number of output classes does not match "
|
||||
"number of colors (%d != %zu)", chns, colors.size()));
|
||||
}
|
||||
|
||||
Mat maxCl = Mat::zeros(rows, cols, CV_8UC1);
|
||||
Mat maxVal(rows, cols, CV_32FC1, score.data);
|
||||
for (int ch = 1; ch < chns; ch++)
|
||||
{
|
||||
for (int row = 0; row < rows; row++)
|
||||
{
|
||||
const float *ptrScore = score.ptr<float>(0, ch, row);
|
||||
uint8_t *ptrMaxCl = maxCl.ptr<uint8_t>(row);
|
||||
float *ptrMaxVal = maxVal.ptr<float>(row);
|
||||
for (int col = 0; col < cols; col++)
|
||||
{
|
||||
if (ptrScore[col] > ptrMaxVal[col])
|
||||
{
|
||||
ptrMaxVal[col] = ptrScore[col];
|
||||
ptrMaxCl[col] = (uchar)ch;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
segm.create(rows, cols, CV_8UC3);
|
||||
for (int row = 0; row < rows; row++)
|
||||
{
|
||||
const uchar *ptrMaxCl = maxCl.ptr<uchar>(row);
|
||||
Vec3b *ptrSegm = segm.ptr<Vec3b>(row);
|
||||
for (int col = 0; col < cols; col++)
|
||||
{
|
||||
ptrSegm[col] = colors[ptrMaxCl[col]];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void showLegend()
|
||||
{
|
||||
static const int kBlockHeight = 30;
|
||||
static Mat legend;
|
||||
if (legend.empty())
|
||||
{
|
||||
const int numClasses = (int)classes.size();
|
||||
if ((int)colors.size() != numClasses)
|
||||
{
|
||||
CV_Error(Error::StsError, format("Number of output classes does not match "
|
||||
"number of labels (%zu != %zu)", colors.size(), classes.size()));
|
||||
}
|
||||
legend.create(kBlockHeight * numClasses, 200, CV_8UC3);
|
||||
for (int i = 0; i < numClasses; i++)
|
||||
{
|
||||
Mat block = legend.rowRange(i * kBlockHeight, (i + 1) * kBlockHeight);
|
||||
block.setTo(colors[i]);
|
||||
putText(block, classes[i], Point(0, kBlockHeight / 2), FONT_HERSHEY_SIMPLEX, 0.5, Vec3b(255, 255, 255));
|
||||
}
|
||||
namedWindow("Legend", WINDOW_NORMAL);
|
||||
imshow("Legend", legend);
|
||||
}
|
||||
}
|
135
3rdparty/opencv-4.5.4/samples/dnn/segmentation.py
vendored
Normal file
135
3rdparty/opencv-4.5.4/samples/dnn/segmentation.py
vendored
Normal file
@ -0,0 +1,135 @@
|
||||
import cv2 as cv
|
||||
import argparse
|
||||
import numpy as np
|
||||
import sys
|
||||
|
||||
from common import *
|
||||
|
||||
backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV,
|
||||
cv.dnn.DNN_BACKEND_VKCOM, cv.dnn.DNN_BACKEND_CUDA)
|
||||
targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD, cv.dnn.DNN_TARGET_HDDL,
|
||||
cv.dnn.DNN_TARGET_VULKAN, cv.dnn.DNN_TARGET_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16)
|
||||
|
||||
parser = argparse.ArgumentParser(add_help=False)
|
||||
parser.add_argument('--zoo', default=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models.yml'),
|
||||
help='An optional path to file with preprocessing parameters.')
|
||||
parser.add_argument('--input', help='Path to input image or video file. Skip this argument to capture frames from a camera.')
|
||||
parser.add_argument('--framework', choices=['caffe', 'tensorflow', 'torch', 'darknet'],
|
||||
help='Optional name of an origin framework of the model. '
|
||||
'It is detected automatically if not set.')
|
||||
parser.add_argument('--colors', help='Optional path to a text file with colors for every class. '
|
||||
'Every color is represented by three values from 0 to 255 in BGR channel order.')
|
||||
parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
|
||||
help="Choose one of computation backends: "
|
||||
"%d: automatically (by default), "
|
||||
"%d: Halide language (http://halide-lang.org/), "
|
||||
"%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"%d: OpenCV implementation, "
|
||||
"%d: VKCOM, "
|
||||
"%d: CUDA"% backends)
|
||||
parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
|
||||
help='Choose one of target computation devices: '
|
||||
'%d: CPU target (by default), '
|
||||
'%d: OpenCL, '
|
||||
'%d: OpenCL fp16 (half-float precision), '
|
||||
'%d: NCS2 VPU, '
|
||||
'%d: HDDL VPU, '
|
||||
'%d: Vulkan, '
|
||||
'%d: CUDA, '
|
||||
'%d: CUDA fp16 (half-float precision)' % targets)
|
||||
args, _ = parser.parse_known_args()
|
||||
add_preproc_args(args.zoo, parser, 'segmentation')
|
||||
parser = argparse.ArgumentParser(parents=[parser],
|
||||
description='Use this script to run semantic segmentation deep learning networks using OpenCV.',
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
args = parser.parse_args()
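# Example invocation (a sketch, not part of the original sample: the "fcn8s" alias
# and the file names are assumptions and must exist in models.yml / on disk):
#   python segmentation.py fcn8s --input example.jpg --classes classes.txt --colors colors.txt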
|
||||
|
||||
args.model = findFile(args.model)
|
||||
args.config = findFile(args.config)
|
||||
args.classes = findFile(args.classes)
|
||||
|
||||
np.random.seed(324)
|
||||
|
||||
# Load names of classes
|
||||
classes = None
|
||||
if args.classes:
|
||||
with open(args.classes, 'rt') as f:
|
||||
classes = f.read().rstrip('\n').split('\n')
|
||||
|
||||
# Load colors
|
||||
colors = None
|
||||
if args.colors:
|
||||
with open(args.colors, 'rt') as f:
|
||||
colors = [np.array(color.split(' '), np.uint8) for color in f.read().rstrip('\n').split('\n')]
|
||||
|
||||
legend = None
|
||||
def showLegend(classes):
|
||||
global legend
|
||||
if classes is not None and legend is None:
|
||||
blockHeight = 30
|
||||
assert(len(classes) == len(colors))
|
||||
|
||||
legend = np.zeros((blockHeight * len(colors), 200, 3), np.uint8)
|
||||
for i in range(len(classes)):
|
||||
block = legend[i * blockHeight:(i + 1) * blockHeight]
|
||||
block[:,:] = colors[i]
|
||||
cv.putText(block, classes[i], (0, blockHeight//2), cv.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
|
||||
|
||||
cv.namedWindow('Legend', cv.WINDOW_NORMAL)
|
||||
cv.imshow('Legend', legend)
|
||||
classes = None
|
||||
|
||||
# Load a network
|
||||
net = cv.dnn.readNet(args.model, args.config, args.framework)
|
||||
net.setPreferableBackend(args.backend)
|
||||
net.setPreferableTarget(args.target)
|
||||
|
||||
winName = 'Deep learning semantic segmentation in OpenCV'
|
||||
cv.namedWindow(winName, cv.WINDOW_NORMAL)
|
||||
|
||||
cap = cv.VideoCapture(args.input if args.input else 0)
|
||||
legend = None
|
||||
while cv.waitKey(1) < 0:
|
||||
hasFrame, frame = cap.read()
|
||||
if not hasFrame:
|
||||
cv.waitKey()
|
||||
break
|
||||
|
||||
frameHeight = frame.shape[0]
|
||||
frameWidth = frame.shape[1]
|
||||
|
||||
# Create a 4D blob from a frame.
|
||||
inpWidth = args.width if args.width else frameWidth
|
||||
inpHeight = args.height if args.height else frameHeight
|
||||
blob = cv.dnn.blobFromImage(frame, args.scale, (inpWidth, inpHeight), args.mean, args.rgb, crop=False)
|
||||
|
||||
# Run a model
|
||||
net.setInput(blob)
|
||||
score = net.forward()
|
||||
|
||||
numClasses = score.shape[1]
|
||||
height = score.shape[2]
|
||||
width = score.shape[3]
|
||||
|
||||
# Draw segmentation
|
||||
if not colors:
|
||||
# Generate colors
|
||||
colors = [np.array([0, 0, 0], np.uint8)]
|
||||
for i in range(1, numClasses):
|
||||
colors.append((colors[i - 1] + np.random.randint(0, 256, [3], np.uint8)) / 2)
|
||||
|
||||
classIds = np.argmax(score[0], axis=0)
|
||||
segm = np.stack([colors[idx] for idx in classIds.flatten()])
|
||||
segm = segm.reshape(height, width, 3)
|
||||
|
||||
segm = cv.resize(segm, (frameWidth, frameHeight), interpolation=cv.INTER_NEAREST)
|
||||
frame = (0.1 * frame + 0.9 * segm).astype(np.uint8)
|
||||
|
||||
# Put efficiency information.
|
||||
t, _ = net.getPerfProfile()
|
||||
label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
|
||||
cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
|
||||
|
||||
showLegend(classes)
|
||||
|
||||
cv.imshow(winName, frame)
|
62
3rdparty/opencv-4.5.4/samples/dnn/shrink_tf_graph_weights.py
vendored
Normal file
62
3rdparty/opencv-4.5.4/samples/dnn/shrink_tf_graph_weights.py
vendored
Normal file
@ -0,0 +1,62 @@
|
||||
# This file is part of OpenCV project.
|
||||
# It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
# of this distribution and at http://opencv.org/license.html.
|
||||
#
|
||||
# Copyright (C) 2017, Intel Corporation, all rights reserved.
|
||||
# Third party copyrights are property of their respective owners.
|
||||
import tensorflow as tf
|
||||
import struct
|
||||
import argparse
|
||||
import numpy as np
|
||||
|
||||
parser = argparse.ArgumentParser(description='Convert weights of a frozen TensorFlow graph to fp16.')
|
||||
parser.add_argument('--input', required=True, help='Path to frozen graph.')
|
||||
parser.add_argument('--output', required=True, help='Path to output graph.')
|
||||
parser.add_argument('--ops', default=['Conv2D', 'MatMul'], nargs='+',
|
||||
help='List of ops which weights are converted.')
|
||||
args = parser.parse_args()
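# Example invocation (a sketch; the graph file names are assumptions):
#   python shrink_tf_graph_weights.py --input frozen_graph.pb --output graph_fp16.pb --ops Conv2D MatMul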
|
||||
|
||||
DT_FLOAT = 1
|
||||
DT_HALF = 19
|
||||
|
||||
# In frozen graphs, every node that uses weights is connected to Const nodes
|
||||
# through an Identity node. Usually they are named the same way, with a '/read' suffix.
|
||||
# We'll replace all of them with Cast nodes.
|
||||
|
||||
# Load the model
|
||||
with tf.gfile.FastGFile(args.input, 'rb') as f:
|
||||
graph_def = tf.GraphDef()
|
||||
graph_def.ParseFromString(f.read())
|
||||
|
||||
# Set of all inputs from desired nodes.
|
||||
inputs = []
|
||||
for node in graph_def.node:
|
||||
if node.op in args.ops:
|
||||
inputs += node.input
|
||||
|
||||
weightsNodes = []
|
||||
for node in graph_def.node:
|
||||
# From all these inputs we only need to keep the Identity nodes.
|
||||
if node.name in inputs and node.op == 'Identity' and node.attr['T'].type == DT_FLOAT:
|
||||
weightsNodes.append(node.input[0])
|
||||
|
||||
# Replace Identity to Cast.
|
||||
node.op = 'Cast'
|
||||
node.attr['DstT'].type = DT_FLOAT
|
||||
node.attr['SrcT'].type = DT_HALF
|
||||
del node.attr['T']
|
||||
del node.attr['_class']
|
||||
|
||||
# Convert weights to half precision.
|
||||
for node in graph_def.node:
|
||||
if node.name in weightsNodes:
|
||||
node.attr['dtype'].type = DT_HALF
|
||||
node.attr['value'].tensor.dtype = DT_HALF
|
||||
|
||||
floats = node.attr['value'].tensor.tensor_content
|
||||
|
||||
floats = struct.unpack('f' * (len(floats) // 4), floats)
|
||||
halfs = np.array(floats).astype(np.float16).view(np.uint16)
|
||||
node.attr['value'].tensor.tensor_content = struct.pack('H' * len(halfs), *halfs)
|
||||
|
||||
tf.train.write_graph(graph_def, "", args.output, as_text=False)
|
404
3rdparty/opencv-4.5.4/samples/dnn/siamrpnpp.py
vendored
Normal file
404
3rdparty/opencv-4.5.4/samples/dnn/siamrpnpp.py
vendored
Normal file
@ -0,0 +1,404 @@
|
||||
import argparse
|
||||
import cv2 as cv
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
"""
|
||||
Link to original paper : https://arxiv.org/abs/1812.11703
|
||||
Link to original repo : https://github.com/STVIR/pysot
|
||||
|
||||
You can download the pre-trained weights of the Tracker Model from https://drive.google.com/file/d/11bwgPFVkps9AH2NOD1zBDdpF_tQghAB-/view?usp=sharing
|
||||
You can download the target net (target branch of SiamRPN++) from https://drive.google.com/file/d/1dw_Ne3UMcCnFsaD6xkZepwE4GEpqq7U_/view?usp=sharing
|
||||
You can download the search net (search branch of SiamRPN++) from https://drive.google.com/file/d/1Lt4oE43ZSucJvze3Y-Z87CVDreO-Afwl/view?usp=sharing
|
||||
You can download the head model (RPN Head) from https://drive.google.com/file/d/1zT1yu12mtj3JQEkkfKFJWiZ71fJ-dQTi/view?usp=sharing
|
||||
"""
|
||||
|
||||
class ModelBuilder():
|
||||
""" This class generates the SiamRPN++ Tracker Model by using Imported ONNX Nets
|
||||
"""
|
||||
def __init__(self, target_net, search_net, rpn_head):
|
||||
super(ModelBuilder, self).__init__()
|
||||
# Build the target branch
|
||||
self.target_net = target_net
|
||||
# Build the search branch
|
||||
self.search_net = search_net
|
||||
# Build RPN_Head
|
||||
self.rpn_head = rpn_head
|
||||
|
||||
def template(self, z):
|
||||
""" Takes the template of size (1, 1, 127, 127) as an input to generate kernel
|
||||
"""
|
||||
self.target_net.setInput(z)
|
||||
outNames = self.target_net.getUnconnectedOutLayersNames()
|
||||
self.zfs_1, self.zfs_2, self.zfs_3 = self.target_net.forward(outNames)
|
||||
|
||||
def track(self, x):
|
||||
""" Takes the search of size (1, 1, 255, 255) as an input to generate classification score and bounding box regression
|
||||
"""
|
||||
self.search_net.setInput(x)
|
||||
outNames = self.search_net.getUnconnectedOutLayersNames()
|
||||
xfs_1, xfs_2, xfs_3 = self.search_net.forward(outNames)
|
||||
self.rpn_head.setInput(np.stack([self.zfs_1, self.zfs_2, self.zfs_3]), 'input_1')
|
||||
self.rpn_head.setInput(np.stack([xfs_1, xfs_2, xfs_3]), 'input_2')
|
||||
outNames = self.rpn_head.getUnconnectedOutLayersNames()
|
||||
cls, loc = self.rpn_head.forward(outNames)
|
||||
return {'cls': cls, 'loc': loc}
|
||||
|
||||
class Anchors:
|
||||
""" This class generate anchors.
|
||||
"""
|
||||
def __init__(self, stride, ratios, scales, image_center=0, size=0):
|
||||
self.stride = stride
|
||||
self.ratios = ratios
|
||||
self.scales = scales
|
||||
self.image_center = image_center
|
||||
self.size = size
|
||||
self.anchor_num = len(self.scales) * len(self.ratios)
|
||||
self.anchors = self.generate_anchors()
|
||||
|
||||
def generate_anchors(self):
|
||||
"""
|
||||
generate anchors based on predefined configuration
|
||||
"""
|
||||
anchors = np.zeros((self.anchor_num, 4), dtype=np.float32)
|
||||
size = self.stride**2
|
||||
count = 0
|
||||
for r in self.ratios:
|
||||
ws = int(np.sqrt(size * 1. / r))
|
||||
hs = int(ws * r)
|
||||
|
||||
for s in self.scales:
|
||||
w = ws * s
|
||||
h = hs * s
|
||||
anchors[count][:] = [-w * 0.5, -h * 0.5, w * 0.5, h * 0.5][:]
|
||||
count += 1
|
||||
return anchors
|
||||
|
||||
class SiamRPNTracker:
|
||||
def __init__(self, model):
|
||||
super(SiamRPNTracker, self).__init__()
|
||||
self.anchor_stride = 8
|
||||
self.anchor_ratios = [0.33, 0.5, 1, 2, 3]
|
||||
self.anchor_scales = [8]
|
||||
self.track_base_size = 8
|
||||
self.track_context_amount = 0.5
|
||||
self.track_exemplar_size = 127
|
||||
self.track_instance_size = 255
|
||||
self.track_lr = 0.4
|
||||
self.track_penalty_k = 0.04
|
||||
self.track_window_influence = 0.44
|
||||
self.score_size = (self.track_instance_size - self.track_exemplar_size) // \
|
||||
self.anchor_stride + 1 + self.track_base_size
|
||||
self.anchor_num = len(self.anchor_ratios) * len(self.anchor_scales)
|
||||
hanning = np.hanning(self.score_size)
|
||||
window = np.outer(hanning, hanning)
|
||||
self.window = np.tile(window.flatten(), self.anchor_num)
|
||||
self.anchors = self.generate_anchor(self.score_size)
|
||||
self.model = model
|
||||
|
||||
def get_subwindow(self, im, pos, model_sz, original_sz, avg_chans):
|
||||
"""
|
||||
Args:
|
||||
im: bgr based input image frame
|
||||
pos: position of the center of the frame
|
||||
model_sz: exemplar / target image size
|
||||
original_sz: original / search image size
|
||||
avg_chans: channel average
|
||||
Return:
|
||||
im_patch: sub_windows for the given image input
|
||||
"""
|
||||
if isinstance(pos, float):
|
||||
pos = [pos, pos]
|
||||
sz = original_sz
|
||||
im_h, im_w, im_d = im.shape
|
||||
c = (original_sz + 1) / 2
|
||||
cx, cy = pos
|
||||
context_xmin = np.floor(cx - c + 0.5)
|
||||
context_xmax = context_xmin + sz - 1
|
||||
context_ymin = np.floor(cy - c + 0.5)
|
||||
context_ymax = context_ymin + sz - 1
|
||||
left_pad = int(max(0., -context_xmin))
|
||||
top_pad = int(max(0., -context_ymin))
|
||||
right_pad = int(max(0., context_xmax - im_w + 1))
|
||||
bottom_pad = int(max(0., context_ymax - im_h + 1))
|
||||
context_xmin += left_pad
|
||||
context_xmax += left_pad
|
||||
context_ymin += top_pad
|
||||
context_ymax += top_pad
|
||||
|
||||
if any([top_pad, bottom_pad, left_pad, right_pad]):
|
||||
size = (im_h + top_pad + bottom_pad, im_w + left_pad + right_pad, im_d)
|
||||
te_im = np.zeros(size, np.uint8)
|
||||
te_im[top_pad:top_pad + im_h, left_pad:left_pad + im_w, :] = im
|
||||
if top_pad:
|
||||
te_im[0:top_pad, left_pad:left_pad + im_w, :] = avg_chans
|
||||
if bottom_pad:
|
||||
te_im[im_h + top_pad:, left_pad:left_pad + im_w, :] = avg_chans
|
||||
if left_pad:
|
||||
te_im[:, 0:left_pad, :] = avg_chans
|
||||
if right_pad:
|
||||
te_im[:, im_w + left_pad:, :] = avg_chans
|
||||
im_patch = te_im[int(context_ymin):int(context_ymax + 1),
|
||||
int(context_xmin):int(context_xmax + 1), :]
|
||||
else:
|
||||
im_patch = im[int(context_ymin):int(context_ymax + 1),
|
||||
int(context_xmin):int(context_xmax + 1), :]
|
||||
|
||||
if not np.array_equal(model_sz, original_sz):
|
||||
im_patch = cv.resize(im_patch, (model_sz, model_sz))
|
||||
im_patch = im_patch.transpose(2, 0, 1)
|
||||
im_patch = im_patch[np.newaxis, :, :, :]
|
||||
im_patch = im_patch.astype(np.float32)
|
||||
return im_patch
|
||||
|
||||
def generate_anchor(self, score_size):
|
||||
"""
|
||||
Args:
|
||||
score_size: size of the output score map
|
||||
Return:
|
||||
anchor: anchors for pre-determined values of stride, ratio, and scale
|
||||
"""
|
||||
anchors = Anchors(self.anchor_stride, self.anchor_ratios, self.anchor_scales)
|
||||
anchor = anchors.anchors
|
||||
x1, y1, x2, y2 = anchor[:, 0], anchor[:, 1], anchor[:, 2], anchor[:, 3]
|
||||
anchor = np.stack([(x1 + x2) * 0.5, (y1 + y2) * 0.5, x2 - x1, y2 - y1], 1)
|
||||
total_stride = anchors.stride
|
||||
anchor_num = anchors.anchor_num
|
||||
anchor = np.tile(anchor, score_size * score_size).reshape((-1, 4))
|
||||
ori = - (score_size // 2) * total_stride
|
||||
xx, yy = np.meshgrid([ori + total_stride * dx for dx in range(score_size)],
|
||||
[ori + total_stride * dy for dy in range(score_size)])
|
||||
xx, yy = np.tile(xx.flatten(), (anchor_num, 1)).flatten(), \
|
||||
np.tile(yy.flatten(), (anchor_num, 1)).flatten()
|
||||
anchor[:, 0], anchor[:, 1] = xx.astype(np.float32), yy.astype(np.float32)
|
||||
return anchor
|
||||
|
||||
def _convert_bbox(self, delta, anchor):
|
||||
"""
|
||||
Args:
|
||||
delta: localisation
|
||||
anchor: anchor of pre-determined anchor size
|
||||
Return:
|
||||
delta: prediction of bounding box
|
||||
"""
|
||||
delta_transpose = np.transpose(delta, (1, 2, 3, 0))
|
||||
delta_contig = np.ascontiguousarray(delta_transpose)
|
||||
delta = delta_contig.reshape(4, -1)
|
||||
delta[0, :] = delta[0, :] * anchor[:, 2] + anchor[:, 0]
|
||||
delta[1, :] = delta[1, :] * anchor[:, 3] + anchor[:, 1]
|
||||
delta[2, :] = np.exp(delta[2, :]) * anchor[:, 2]
|
||||
delta[3, :] = np.exp(delta[3, :]) * anchor[:, 3]
|
||||
return delta
|
||||
|
||||
def _softmax(self, x):
|
||||
"""
|
||||
Softmax in the direction of the depth of the layer
|
||||
"""
|
||||
x = x.astype(dtype=np.float32)
|
||||
x_max = x.max(axis=1)[:, np.newaxis]
|
||||
e_x = np.exp(x-x_max)
|
||||
div = np.sum(e_x, axis=1)[:, np.newaxis]
|
||||
y = e_x / div
|
||||
return y
|
||||
|
||||
def _convert_score(self, score):
|
||||
"""
|
||||
Args:
|
||||
cls: score
|
||||
Return:
|
||||
cls: score for cls
|
||||
"""
|
||||
score_transpose = np.transpose(score, (1, 2, 3, 0))
|
||||
score_con = np.ascontiguousarray(score_transpose)
|
||||
score_view = score_con.reshape(2, -1)
|
||||
score = np.transpose(score_view, (1, 0))
|
||||
score = self._softmax(score)
|
||||
return score[:,1]
|
||||
|
||||
def _bbox_clip(self, cx, cy, width, height, boundary):
|
||||
"""
|
||||
Adjusting the bounding box
|
||||
"""
|
||||
bbox_h, bbox_w = boundary
|
||||
cx = max(0, min(cx, bbox_w))
|
||||
cy = max(0, min(cy, bbox_h))
|
||||
width = max(10, min(width, bbox_w))
|
||||
height = max(10, min(height, bbox_h))
|
||||
return cx, cy, width, height
|
||||
|
||||
def init(self, img, bbox):
|
||||
"""
|
||||
Args:
|
||||
img(np.ndarray): bgr based input image frame
|
||||
bbox: (x, y, w, h): bounding box
|
||||
"""
|
||||
x, y, w, h = bbox
|
||||
self.center_pos = np.array([x + (w - 1) / 2, y + (h - 1) / 2])
|
||||
self.h = h
|
||||
self.w = w
|
||||
w_z = self.w + self.track_context_amount * np.add(h, w)
|
||||
h_z = self.h + self.track_context_amount * np.add(h, w)
|
||||
s_z = round(np.sqrt(w_z * h_z))
|
||||
self.channel_average = np.mean(img, axis=(0, 1))
|
||||
z_crop = self.get_subwindow(img, self.center_pos, self.track_exemplar_size, s_z, self.channel_average)
|
||||
self.model.template(z_crop)
|
||||
|
||||
def track(self, img):
|
||||
"""
|
||||
Args:
|
||||
img(np.ndarray): BGR image
|
||||
Return:
|
||||
bbox(list):[x, y, width, height]
|
||||
"""
|
||||
w_z = self.w + self.track_context_amount * np.add(self.w, self.h)
|
||||
h_z = self.h + self.track_context_amount * np.add(self.w, self.h)
|
||||
s_z = np.sqrt(w_z * h_z)
|
||||
scale_z = self.track_exemplar_size / s_z
|
||||
s_x = s_z * (self.track_instance_size / self.track_exemplar_size)
|
||||
x_crop = self.get_subwindow(img, self.center_pos, self.track_instance_size, round(s_x), self.channel_average)
|
||||
outputs = self.model.track(x_crop)
|
||||
score = self._convert_score(outputs['cls'])
|
||||
pred_bbox = self._convert_bbox(outputs['loc'], self.anchors)
|
||||
|
||||
def change(r):
|
||||
return np.maximum(r, 1. / r)
|
||||
|
||||
def sz(w, h):
|
||||
pad = (w + h) * 0.5
|
||||
return np.sqrt((w + pad) * (h + pad))
|
||||
|
||||
# scale penalty
|
||||
s_c = change(sz(pred_bbox[2, :], pred_bbox[3, :]) /
|
||||
(sz(self.w * scale_z, self.h * scale_z)))
|
||||
|
||||
# aspect ratio penalty
|
||||
r_c = change((self.w / self.h) /
|
||||
(pred_bbox[2, :] / pred_bbox[3, :]))
|
||||
penalty = np.exp(-(r_c * s_c - 1) * self.track_penalty_k)
|
||||
pscore = penalty * score
|
||||
|
||||
# window penalty
|
||||
pscore = pscore * (1 - self.track_window_influence) + \
|
||||
self.window * self.track_window_influence
|
||||
best_idx = np.argmax(pscore)
|
||||
bbox = pred_bbox[:, best_idx] / scale_z
|
||||
lr = penalty[best_idx] * score[best_idx] * self.track_lr
|
||||
|
||||
cpx, cpy = self.center_pos
|
||||
x,y,w,h = bbox
|
||||
cx = x + cpx
|
||||
cy = y + cpy
|
||||
|
||||
# smooth bbox
|
||||
width = self.w * (1 - lr) + w * lr
|
||||
height = self.h * (1 - lr) + h * lr
|
||||
|
||||
# clip boundary
|
||||
cx, cy, width, height = self._bbox_clip(cx, cy, width, height, img.shape[:2])
|
||||
|
||||
# update state
|
||||
self.center_pos = np.array([cx, cy])
|
||||
self.w = width
|
||||
self.h = height
|
||||
bbox = [cx - width / 2, cy - height / 2, width, height]
|
||||
best_score = score[best_idx]
|
||||
return {'bbox': bbox, 'best_score': best_score}
|
||||
|
||||
def get_frames(video_name):
|
||||
"""
|
||||
Args:
|
||||
video_name: path to the input video file (falls back to the camera if empty)
|
||||
Return:
|
||||
Frame
|
||||
"""
|
||||
cap = cv.VideoCapture(video_name if video_name else 0)
|
||||
while True:
|
||||
ret, frame = cap.read()
|
||||
if ret:
|
||||
yield frame
|
||||
else:
|
||||
break
|
||||
|
||||
def main():
|
||||
""" Sample SiamRPN Tracker
|
||||
"""
|
||||
# Computation backends supported by layers
|
||||
backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV,
|
||||
cv.dnn.DNN_BACKEND_VKCOM, cv.dnn.DNN_BACKEND_CUDA)
|
||||
# Target Devices for computation
|
||||
targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD,
|
||||
cv.dnn.DNN_TARGET_VULKAN, cv.dnn.DNN_TARGET_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16)
|
||||
|
||||
parser = argparse.ArgumentParser(description='Use this script to run SiamRPN++ Visual Tracker',
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('--input_video', type=str, help='Path to input video file. Skip this argument to capture frames from a camera.')
|
||||
parser.add_argument('--target_net', type=str, default='target_net.onnx', help='Path to the part of SiamRPN++ run on the target frame.')
|
||||
parser.add_argument('--search_net', type=str, default='search_net.onnx', help='Path to the part of SiamRPN++ run on the search frame.')
|
||||
parser.add_argument('--rpn_head', type=str, default='rpn_head.onnx', help='Path to RPN Head ONNX model.')
|
||||
parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
|
||||
help="Select a computation backend: "
|
||||
"%d: automatically (by default), "
|
||||
"%d: Halide, "
|
||||
"%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"%d: OpenCV Implementation, "
|
||||
"%d: VKCOM, "
|
||||
"%d: CUDA" % backends)
|
||||
parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
|
||||
help='Select a target device: '
|
||||
'%d: CPU target (by default), '
|
||||
'%d: OpenCL, '
|
||||
'%d: OpenCL FP16, '
|
||||
'%d: Myriad, '
|
||||
'%d: Vulkan, '
|
||||
'%d: CUDA, '
|
||||
'%d: CUDA fp16 (half-float precision)' % targets)
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
if args.input_video and not os.path.isfile(args.input_video):
|
||||
raise OSError("Input video file does not exist")
|
||||
if not os.path.isfile(args.target_net):
|
||||
raise OSError("Target Net does not exist")
|
||||
if not os.path.isfile(args.search_net):
|
||||
raise OSError("Search Net does not exist")
|
||||
if not os.path.isfile(args.rpn_head):
|
||||
raise OSError("RPN Head Net does not exist")
|
||||
|
||||
#Load the Networks
|
||||
target_net = cv.dnn.readNetFromONNX(args.target_net)
|
||||
target_net.setPreferableBackend(args.backend)
|
||||
target_net.setPreferableTarget(args.target)
|
||||
search_net = cv.dnn.readNetFromONNX(args.search_net)
|
||||
search_net.setPreferableBackend(args.backend)
|
||||
search_net.setPreferableTarget(args.target)
|
||||
rpn_head = cv.dnn.readNetFromONNX(args.rpn_head)
|
||||
rpn_head.setPreferableBackend(args.backend)
|
||||
rpn_head.setPreferableTarget(args.target)
|
||||
model = ModelBuilder(target_net, search_net, rpn_head)
|
||||
tracker = SiamRPNTracker(model)
|
||||
|
||||
first_frame = True
|
||||
cv.namedWindow('SiamRPN++ Tracker', cv.WINDOW_AUTOSIZE)
|
||||
for frame in get_frames(args.input_video):
|
||||
if first_frame:
|
||||
try:
|
||||
init_rect = cv.selectROI('SiamRPN++ Tracker', frame, False, False)
|
||||
except:
|
||||
exit()
|
||||
tracker.init(frame, init_rect)
|
||||
first_frame = False
|
||||
else:
|
||||
outputs = tracker.track(frame)
|
||||
bbox = list(map(int, outputs['bbox']))
|
||||
x,y,w,h = bbox
|
||||
cv.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 3)
|
||||
cv.imshow('SiamRPN++ Tracker', frame)
|
||||
key = cv.waitKey(1)
|
||||
if key == ord("q"):
|
||||
break
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
506
3rdparty/opencv-4.5.4/samples/dnn/speech_recognition.py
vendored
Normal file
506
3rdparty/opencv-4.5.4/samples/dnn/speech_recognition.py
vendored
Normal file
@ -0,0 +1,506 @@
|
||||
import numpy as np
|
||||
import cv2 as cv
|
||||
import argparse
|
||||
import os
|
||||
import soundfile as sf # Temporary import to load audio files
|
||||
|
||||
'''
|
||||
You can download the converted onnx model from https://drive.google.com/drive/folders/1wLtxyao4ItAg8tt4Sb63zt6qXzhcQoR6?usp=sharing
|
||||
or convert the model yourself.
|
||||
|
||||
You can get the original pre-trained Jasper model from NVIDIA : https://ngc.nvidia.com/catalog/models/nvidia:jasper_pyt_onnx_fp16_amp/files
|
||||
Download and unzip : `$ wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/jasper_pyt_onnx_fp16_amp/versions/20.10.0/zip -O jasper_pyt_onnx_fp16_amp_20.10.0.zip && unzip -o ./jasper_pyt_onnx_fp16_amp_20.10.0.zip && unzip -o ./jasper_pyt_onnx_fp16_amp.zip`
|
||||
|
||||
you can get the script to convert the model here : https://gist.github.com/spazewalker/507f1529e19aea7e8417f6e935851a01
|
||||
|
||||
You can convert the model using the following steps:
|
||||
1. Import onnx and load the original model
|
||||
```
|
||||
import onnx
|
||||
model = onnx.load("./jasper-onnx/1/model.onnx")
|
||||
```
|
||||
|
||||
3. Change data type of input layer
|
||||
```
|
||||
inp = model.graph.input[0]
|
||||
model.graph.input.remove(inp)
|
||||
inp.type.tensor_type.elem_type = 1
|
||||
model.graph.input.insert(0,inp)
|
||||
```
|
||||
|
||||
4. Change the data type of output layer
|
||||
```
|
||||
out = model.graph.output[0]
|
||||
model.graph.output.remove(out)
|
||||
out.type.tensor_type.elem_type = 1
|
||||
model.graph.output.insert(0,out)
|
||||
```
|
||||
|
||||
5. Change the data type of every initializer and cast its values from FP16 to FP32
|
||||
```
|
||||
for i,init in enumerate(model.graph.initializer):
|
||||
model.graph.initializer.remove(init)
|
||||
init.data_type = 1
|
||||
init.raw_data = np.frombuffer(init.raw_data, count=np.product(init.dims), dtype=np.float16).astype(np.float32).tobytes()
|
||||
model.graph.initializer.insert(i,init)
|
||||
```
|
||||
|
||||
6. Add an additional Reshape node to handle the inconsistent input between the Python and C++ APIs of OpenCV.
|
||||
see https://github.com/opencv/opencv/issues/19091
|
||||
Make & insert a new node with 'Reshape' operation & required initializer
|
||||
```
|
||||
tensor = numpy_helper.from_array(np.array([0,64,-1]),name='shape_reshape')
|
||||
model.graph.initializer.insert(0,tensor)
|
||||
node = onnx.helper.make_node(op_type='Reshape',inputs=['input__0','shape_reshape'], outputs=['input_reshaped'], name='reshape__0')
|
||||
model.graph.node.insert(0,node)
|
||||
model.graph.node[1].input[0] = 'input_reshaped'
|
||||
```
|
||||
|
||||
7. Finally save the model
|
||||
```
|
||||
with open('jasper_dynamic_input_float.onnx','wb') as f:
|
||||
onnx.save_model(model,f)
|
||||
```
|
||||
|
||||
Original Repo : https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechRecognition/Jasper
|
||||
'''
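# Example invocation (a sketch, not part of the original sample: the audio file name and
# output path are assumptions, the model name matches the argparse default defined below):
#   python speech_recognition.py --input_audio sample.wav --model jasper.onnx --output transcript.txt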
|
||||
|
||||
class FilterbankFeatures:
|
||||
def __init__(self,
|
||||
sample_rate=16000, window_size=0.02, window_stride=0.01,
|
||||
n_fft=512, preemph=0.97, n_filt=64, lowfreq=0,
|
||||
highfreq=None, log=True, dither=1e-5):
|
||||
'''
|
||||
Initializes pre-processing class. Default values are the values used by the Jasper
|
||||
architecture for pre-processing. For more details, refer to the paper here:
|
||||
https://arxiv.org/abs/1904.03288
|
||||
'''
|
||||
self.win_length = int(sample_rate * window_size) # frame size
|
||||
self.hop_length = int(sample_rate * window_stride) # stride
|
||||
self.n_fft = n_fft or 2 ** np.ceil(np.log2(self.win_length))
|
||||
self.log = log
|
||||
self.dither = dither
|
||||
self.n_filt = n_filt
|
||||
self.preemph = preemph
|
||||
highfreq = highfreq or sample_rate / 2
|
||||
self.window_tensor = np.hanning(self.win_length)
|
||||
|
||||
self.filterbanks = self.mel(sample_rate, self.n_fft, n_mels=n_filt, fmin=lowfreq, fmax=highfreq)
|
||||
self.filterbanks.dtype=np.float32
|
||||
self.filterbanks = np.expand_dims(self.filterbanks,0)
|
||||
|
||||
def normalize_batch(self, x, seq_len):
|
||||
'''
|
||||
Normalizes the features.
|
||||
'''
|
||||
x_mean = np.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype)
|
||||
x_std = np.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype)
|
||||
for i in range(x.shape[0]):
|
||||
x_mean[i, :] = np.mean(x[i, :, :seq_len[i]],axis=1)
|
||||
x_std[i, :] = np.std(x[i, :, :seq_len[i]],axis=1)
|
||||
# make sure x_std is not zero
|
||||
x_std += 1e-10
|
||||
return (x - np.expand_dims(x_mean,2)) / np.expand_dims(x_std,2)
|
||||
|
||||
def calculate_features(self, x, seq_len):
|
||||
'''
|
||||
Calculates filterbank features.
|
||||
args:
|
||||
x : mono channel audio
|
||||
seq_len : length of the audio sample
|
||||
returns:
|
||||
x : filterbank features
|
||||
'''
|
||||
dtype = x.dtype
|
||||
|
||||
seq_len = np.ceil(seq_len / self.hop_length)
|
||||
seq_len = np.array(seq_len,dtype=np.int32)
|
||||
|
||||
# dither
|
||||
if self.dither > 0:
|
||||
x += self.dither * np.random.randn(*x.shape)
|
||||
|
||||
# do preemphasis
|
||||
if self.preemph is not None:
|
||||
x = np.concatenate(
|
||||
(np.expand_dims(x[0],-1), x[1:] - self.preemph * x[:-1]), axis=0)
|
||||
|
||||
# Short Time Fourier Transform
|
||||
x = self.stft(x, n_fft=self.n_fft, hop_length=self.hop_length,
|
||||
win_length=self.win_length,
|
||||
fft_window=self.window_tensor)
|
||||
|
||||
# get power spectrum
|
||||
x = (x**2).sum(-1)
|
||||
|
||||
# dot with filterbank energies
|
||||
x = np.matmul(np.array(self.filterbanks,dtype=x.dtype), x)
|
||||
|
||||
# log features if required
|
||||
if self.log:
|
||||
x = np.log(x + 1e-20)
|
||||
|
||||
# normalize if required
|
||||
x = self.normalize_batch(x, seq_len).astype(dtype)
|
||||
return x
|
||||
|
||||
# Mel Frequency calculation
|
||||
def hz_to_mel(self, frequencies):
|
||||
'''
|
||||
Converts frequencies from hz to mel scale. Input can be a number or a vector.
|
||||
'''
|
||||
frequencies = np.asanyarray(frequencies)
|
||||
|
||||
f_min = 0.0
|
||||
f_sp = 200.0 / 3
|
||||
|
||||
mels = (frequencies - f_min) / f_sp
|
||||
|
||||
# Fill in the log-scale part
|
||||
min_log_hz = 1000.0 # beginning of log region (Hz)
|
||||
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
|
||||
logstep = np.log(6.4) / 27.0 # step size for log region
|
||||
|
||||
if frequencies.ndim:
|
||||
# If we have array data, vectorize
|
||||
log_t = frequencies >= min_log_hz
|
||||
mels[log_t] = min_log_mel + np.log(frequencies[log_t] / min_log_hz) / logstep
|
||||
elif frequencies >= min_log_hz:
|
||||
# If we have scalar data, directly
|
||||
mels = min_log_mel + np.log(frequencies / min_log_hz) / logstep
|
||||
return mels
|
||||
|
||||
def mel_to_hz(self, mels):
|
||||
'''
|
||||
Converts frequencies from mel to hz scale. Input can be a number or a vector.
|
||||
'''
|
||||
mels = np.asanyarray(mels)
|
||||
|
||||
# Fill in the linear scale
|
||||
f_min = 0.0
|
||||
f_sp = 200.0 / 3
|
||||
freqs = f_min + f_sp * mels
|
||||
|
||||
# And now the nonlinear scale
|
||||
min_log_hz = 1000.0 # beginning of log region (Hz)
|
||||
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
|
||||
logstep = np.log(6.4) / 27.0 # step size for log region
|
||||
|
||||
if mels.ndim:
|
||||
# If we have vector data, vectorize
|
||||
log_t = mels >= min_log_mel
|
||||
freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel))
|
||||
elif mels >= min_log_mel:
|
||||
# If we have scalar data, check directly
|
||||
freqs = min_log_hz * np.exp(logstep * (mels - min_log_mel))
|
||||
|
||||
return freqs
|
||||
|
||||
def mel_frequencies(self, n_mels=128, fmin=0.0, fmax=11025.0):
|
||||
'''
|
||||
Calculates n mel frequencies between 2 frequencies
|
||||
args:
|
||||
n_mels : number of bands
|
||||
fmin : min frequency
|
||||
fmax : max frequency
|
||||
returns:
|
||||
mels : vector of mel frequencies
|
||||
'''
|
||||
# 'Center freqs' of mel bands - uniformly spaced between limits
|
||||
min_mel = self.hz_to_mel(fmin)
|
||||
max_mel = self.hz_to_mel(fmax)
|
||||
|
||||
mels = np.linspace(min_mel, max_mel, n_mels)
|
||||
|
||||
return self.mel_to_hz(mels)
|
||||
|
||||
def mel(self, sr, n_fft, n_mels=128, fmin=0.0, fmax=None, dtype=np.float32):
|
||||
'''
|
||||
Generates mel filterbank
|
||||
args:
|
||||
sr : Sampling rate
|
||||
n_fft : number of FFT components
|
||||
n_mels : number of Mel bands to generate
|
||||
fmin : lowest frequency (in Hz)
|
||||
fmax : highest frequency (in Hz). sr/2.0 if None
|
||||
dtype : the data type of the output basis.
|
||||
returns:
|
||||
mels : Mel transform matrix
|
||||
'''
|
||||
# default Max freq = half of sampling rate
|
||||
if fmax is None:
|
||||
fmax = float(sr) / 2
|
||||
|
||||
# Initialize the weights
|
||||
n_mels = int(n_mels)
|
||||
weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
|
||||
|
||||
# Center freqs of each FFT bin
|
||||
fftfreqs = np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True)
|
||||
|
||||
# 'Center freqs' of mel bands - uniformly spaced between limits
|
||||
mel_f = self.mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax)
|
||||
|
||||
fdiff = np.diff(mel_f)
|
||||
ramps = np.subtract.outer(mel_f, fftfreqs)
|
||||
|
||||
for i in range(n_mels):
|
||||
# lower and upper slopes for all bins
|
||||
lower = -ramps[i] / fdiff[i]
|
||||
upper = ramps[i + 2] / fdiff[i + 1]
|
||||
|
||||
# .. then intersect them with each other and zero
|
||||
weights[i] = np.maximum(0, np.minimum(lower, upper))
|
||||
|
||||
# Using Slaney-style mel which is scaled to be approx constant energy per channel
|
||||
enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
|
||||
weights *= enorm[:, np.newaxis]
|
||||
return weights
|
||||
|
||||
# STFT preparation
|
||||
def pad_window_center(self, data, size, axis=-1, **kwargs):
|
||||
'''
|
||||
Centers the data and pads.
|
||||
args:
|
||||
data : Vector to be padded and centered
|
||||
size : Length to pad data
|
||||
axis : Axis along which to pad and center the data
|
||||
kwargs : arguments passed to np.pad
|
||||
return : centered and padded data
|
||||
'''
|
||||
kwargs.setdefault("mode", "constant")
|
||||
n = data.shape[axis]
|
||||
lpad = int((size - n) // 2)
|
||||
lengths = [(0, 0)] * data.ndim
|
||||
lengths[axis] = (lpad, int(size - n - lpad))
|
||||
if lpad < 0:
|
||||
raise Exception(
|
||||
("Target size ({:d}) must be at least input size ({:d})").format(size, n)
|
||||
)
|
||||
return np.pad(data, lengths, **kwargs)
|
||||
|
||||
def frame(self, x, frame_length, hop_length):
|
||||
'''
|
||||
Slices a data array into (overlapping) frames.
|
||||
args:
|
||||
x : array to frame
|
||||
frame_length : length of frame
|
||||
hop_length : Number of steps to advance between frames
|
||||
return : A framed view of `x`
|
||||
'''
|
||||
if x.shape[-1] < frame_length:
|
||||
raise Exception(
|
||||
"Input is too short (n={:d})"
|
||||
" for frame_length={:d}".format(x.shape[-1], frame_length)
|
||||
)
|
||||
x = np.asfortranarray(x)
|
||||
n_frames = 1 + (x.shape[-1] - frame_length) // hop_length
|
||||
strides = np.asarray(x.strides)
|
||||
new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize
|
||||
shape = list(x.shape)[:-1] + [frame_length, n_frames]
|
||||
strides = list(strides) + [hop_length * new_stride]
|
||||
return np.lib.stride_tricks.as_strided(x, shape=shape, strides=strides)
|
||||
|
||||
def dtype_r2c(self, d, default=np.complex64):
|
||||
'''
|
||||
Find the complex numpy dtype corresponding to a real dtype.
|
||||
args:
|
||||
d : The real-valued dtype to convert to complex.
|
||||
default : The default complex target type, if `d` does not match a known dtype
|
||||
return : The complex dtype
|
||||
'''
|
||||
mapping = {
|
||||
np.dtype(np.float32): np.complex64,
|
||||
np.dtype(np.float64): np.complex128,
|
||||
}
|
||||
dt = np.dtype(d)
|
||||
if dt.kind == "c":
|
||||
return dt
|
||||
return np.dtype(mapping.get(dt, default))
|
||||
|
||||
def stft(self, y, n_fft, hop_length=None, win_length=None, fft_window=None, pad_mode='reflect', return_complex=False):
|
||||
'''
|
||||
Short Time Fourier Transform. The STFT represents a signal in the time-frequency
|
||||
domain by computing discrete Fourier transforms (DFT) over short overlapping windows.
|
||||
args:
|
||||
y : input signal
|
||||
n_fft : length of the windowed signal after padding with zeros.
|
||||
hop_length : number of audio samples between adjacent STFT columns.
|
||||
win_length : Each frame of audio is windowed by window of length win_length and
|
||||
then padded with zeros to match n_fft
|
||||
fft_window : a vector or array of length `n_fft` having values computed by a
|
||||
window function
|
||||
pad_mode : mode used while padding the signal
|
||||
return_complex : returns array with complex data type if `True`
|
||||
return : Matrix of short-term Fourier transform coefficients.
|
||||
'''
|
||||
if win_length is None:
|
||||
win_length = n_fft
|
||||
if hop_length is None:
|
||||
hop_length = int(win_length // 4)
|
||||
if y.ndim!=1:
|
||||
raise Exception(f'Invalid input shape. Only Mono Channeled audio supported. Input must have shape (Audio,). Got {y.shape}')
|
||||
|
||||
# Pad the window out to n_fft size
|
||||
fft_window = self.pad_window_center(fft_window, n_fft)
|
||||
|
||||
# Reshape so that the window can be broadcast
|
||||
fft_window = fft_window.reshape((-1, 1))
|
||||
|
||||
# Pad the time series so that frames are centered
|
||||
y = np.pad(y, int(n_fft // 2), mode=pad_mode)
|
||||
|
||||
# Window the time series.
|
||||
y_frames = self.frame(y, frame_length=n_fft, hop_length=hop_length)
|
||||
|
||||
# Convert data type to complex
|
||||
dtype = self.dtype_r2c(y.dtype)
|
||||
|
||||
# Pre-allocate the STFT matrix
|
||||
stft_matrix = np.empty( (int(1 + n_fft // 2), y_frames.shape[-1]), dtype=dtype, order="F")
|
||||
|
||||
stft_matrix = np.fft.rfft( fft_window * y_frames, axis=0)
|
||||
return stft_matrix if return_complex==True else np.stack((stft_matrix.real,stft_matrix.imag),axis=-1)
|
||||
|
||||
class Decoder:
|
||||
'''
|
||||
Used for decoding the output of jasper model.
|
||||
'''
|
||||
def __init__(self):
|
||||
labels=[' ','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',"'"]
|
||||
self.labels_map = {i: label for i,label in enumerate(labels)}
|
||||
self.blank_id = 28
|
||||
|
||||
def decode(self,x):
|
||||
"""
|
||||
Takes output of Jasper model and performs ctc decoding algorithm to
|
||||
remove duplicates and the blank symbol. Returns the prediction
|
||||
"""
|
||||
x = np.argmax(x,axis=-1)
|
||||
hypotheses = []
|
||||
prediction = x.tolist()
|
||||
# CTC decoding procedure
|
||||
decoded_prediction = []
|
||||
previous = self.blank_id
|
||||
for p in prediction:
|
||||
if (p != previous or previous == self.blank_id) and p != self.blank_id:
|
||||
decoded_prediction.append(p)
|
||||
previous = p
|
||||
hypothesis = ''.join([self.labels_map[c] for c in decoded_prediction])
|
||||
hypotheses.append(hypothesis)
|
||||
return hypotheses
|
||||
|
||||
def predict(features, net, decoder):
|
||||
'''
|
||||
Passes the features through the Jasper model and decodes the output to english transcripts.
|
||||
args:
|
||||
features : input features, calculated using FilterbankFeatures class
|
||||
net : Jasper model dnn.net object
|
||||
decoder : Decoder object
|
||||
return : Predicted text
|
||||
'''
|
||||
# This is a workaround https://github.com/opencv/opencv/issues/19091
|
||||
# expanding 1 dimension allows us to pass it to the network
|
||||
# from python. This should be resolved in the future.
|
||||
features = np.expand_dims(features,axis=3)
|
||||
|
||||
# make prediction
|
||||
net.setInput(features)
|
||||
output = net.forward()
|
||||
|
||||
# decode output to transcript
|
||||
prediction = decoder.decode(output.squeeze(0))
|
||||
return prediction[0]
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
# Computation backends supported by layers
|
||||
backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV)
|
||||
# Target Devices for computation
|
||||
targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16)
|
||||
|
||||
parser = argparse.ArgumentParser(description='This script runs Jasper Speech recognition model',
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('--input_audio', type=str, required=True, help='Path to input audio file. OR Path to a txt file with relative path to multiple audio files in different lines')
|
||||
parser.add_argument('--show_spectrogram', action='store_true', help='Whether to show a spectrogram of the input audio.')
|
||||
parser.add_argument('--model', type=str, default='jasper.onnx', help='Path to the onnx file of Jasper. default="jasper.onnx"')
|
||||
parser.add_argument('--output', type=str, help='Path to file where recognized audio transcript must be saved. Leave this to print on console.')
|
||||
parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
|
||||
help='Select a computation backend: '
|
||||
"%d: automatically (by default) "
|
||||
"%d: OpenVINO Inference Engine "
|
||||
"%d: OpenCV Implementation " % backends)
|
||||
parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
|
||||
help='Select a target device: '
|
||||
"%d: CPU target (by default) "
|
||||
"%d: OpenCL "
|
||||
"%d: OpenCL FP16 " % targets)
|
||||
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
if args.input_audio and not os.path.isfile(args.input_audio):
|
||||
raise OSError("Input audio file does not exist")
|
||||
if not os.path.isfile(args.model):
|
||||
raise OSError("Jasper model file does not exist")
|
||||
if args.input_audio.endswith('.txt'):
|
||||
with open(args.input_audio) as f:
|
||||
content = f.readlines()
|
||||
content = [x.strip() for x in content]
|
||||
audio_file_paths = content
|
||||
for audio_file_path in audio_file_paths:
|
||||
if not os.path.isfile(audio_file_path):
|
||||
raise OSError("Audio file({audio_file_path}) does not exist")
|
||||
else:
|
||||
audio_file_paths = [args.input_audio]
|
||||
audio_file_paths = [os.path.abspath(x) for x in audio_file_paths]
|
||||
|
||||
# Read audio Files
|
||||
features = []
|
||||
try:
|
||||
for audio_file_path in audio_file_paths:
|
||||
audio = sf.read(audio_file_path)
|
||||
# If audio is stereo, just take one channel.
|
||||
X = audio[0] if audio[0].ndim==1 else audio[0][:,0]
|
||||
features.append(X)
|
||||
except:
|
||||
raise Exception(f"Soundfile cannot read {args.input_audio}. Try a different format")
|
||||
|
||||
# Get Filterbank Features
|
||||
feature_extractor = FilterbankFeatures()
|
||||
for i in range(len(features)):
|
||||
X = features[i]
|
||||
seq_len = np.array([X.shape[0]], dtype=np.int32)
|
||||
features[i] = feature_extractor.calculate_features(x=X, seq_len=seq_len)
|
||||
|
||||
# Load Network
|
||||
net = cv.dnn.readNetFromONNX(args.model)
|
||||
net.setPreferableBackend(args.backend)
|
||||
net.setPreferableTarget(args.target)
|
||||
|
||||
# Show spectrogram if required
|
||||
if args.show_spectrogram and not args.input_audio.endswith('.txt'):
|
||||
img = cv.normalize(src=features[0][0], dst=None, alpha=0, beta=255, norm_type=cv.NORM_MINMAX, dtype=cv.CV_8U)
|
||||
img = cv.applyColorMap(img, cv.COLORMAP_JET)
|
||||
cv.imshow('spectrogram', img)
|
||||
cv.waitKey(0)
|
||||
|
||||
# Initialize decoder
|
||||
decoder = Decoder()
|
||||
|
||||
# Make prediction
|
||||
prediction = []
|
||||
print("Predicting...")
|
||||
for feature in features:
|
||||
print(f"\rAudio file {len(prediction)+1}/{len(features)}", end='')
|
||||
prediction.append(predict(feature, net, decoder))
|
||||
print("")
|
||||
|
||||
# save transcript if required
|
||||
if args.output:
|
||||
with open(args.output,'w') as f:
|
||||
for pred in prediction:
|
||||
f.write(pred+'\n')
|
||||
print("Transcript was written to {}".format(args.output))
|
||||
else:
|
||||
print(prediction)
|
||||
cv.destroyAllWindows()
|
177
3rdparty/opencv-4.5.4/samples/dnn/text_detection.cpp
vendored
Normal file
177
3rdparty/opencv-4.5.4/samples/dnn/text_detection.cpp
vendored
Normal file
@ -0,0 +1,177 @@
|
||||
/*
|
||||
Text detection model: https://github.com/argman/EAST
|
||||
Download link: https://www.dropbox.com/s/r2ingd0l3zt8hxs/frozen_east_text_detection.tar.gz?dl=1
|
||||
|
||||
Text recognition models can be downloaded directly here:
|
||||
Download link: https://drive.google.com/drive/folders/1cTbQ3nuZG-EKWak6emD_s8_hHXWz7lAr?usp=sharing
|
||||
and doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown
|
||||
|
||||
How to convert from pb to onnx:
|
||||
Using classes from here: https://github.com/meijieru/crnn.pytorch/blob/master/models/crnn.py
|
||||
import torch
|
||||
from models.crnn import CRNN
|
||||
model = CRNN(32, 1, 37, 256)
|
||||
model.load_state_dict(torch.load('crnn.pth'))
|
||||
dummy_input = torch.randn(1, 1, 32, 100)
|
||||
torch.onnx.export(model, dummy_input, "crnn.onnx", verbose=True)
|
||||
|
||||
For more information, please refer to doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown and doc/tutorials/dnn/dnn_OCR/dnn_OCR.markdown
|
||||
*/
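// Example invocation (a sketch, not from the original sample: the binary name and the
// model/image file names are assumptions):
//   ./example_dnn_text_detection --detModel=frozen_east_text_detection.pb \
//       --recModel=crnn.onnx --vocabularyPath=alphabet_36.txt --input=example.jpg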
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
#include <opencv2/dnn.hpp>
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::dnn;
|
||||
|
||||
const char* keys =
|
||||
"{ help h | | Print help message. }"
|
||||
"{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
|
||||
"{ detModel dmp | | Path to a binary .pb file contains trained detector network.}"
|
||||
"{ width | 320 | Preprocess input image by resizing to a specific width. It should be multiple by 32. }"
|
||||
"{ height | 320 | Preprocess input image by resizing to a specific height. It should be multiple by 32. }"
|
||||
"{ thr | 0.5 | Confidence threshold. }"
|
||||
"{ nms | 0.4 | Non-maximum suppression threshold. }"
|
||||
"{ recModel rmp | | Path to a binary .onnx file contains trained CRNN text recognition model. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"
|
||||
"{ RGBInput rgb |0| 0: imread with flags=IMREAD_GRAYSCALE; 1: imread with flags=IMREAD_COLOR. }"
|
||||
"{ vocabularyPath vp | alphabet_36.txt | Path to benchmarks for evaluation. "
|
||||
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}";
|
||||
|
||||
void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result);
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
// Parse command line arguments.
|
||||
CommandLineParser parser(argc, argv, keys);
|
||||
parser.about("Use this script to run TensorFlow implementation (https://github.com/argman/EAST) of "
|
||||
"EAST: An Efficient and Accurate Scene Text Detector (https://arxiv.org/abs/1704.03155v2)");
|
||||
if (argc == 1 || parser.has("help"))
|
||||
{
|
||||
parser.printMessage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
float confThreshold = parser.get<float>("thr");
|
||||
float nmsThreshold = parser.get<float>("nms");
|
||||
int width = parser.get<int>("width");
|
||||
int height = parser.get<int>("height");
|
||||
int imreadRGB = parser.get<int>("RGBInput");
|
||||
String detModelPath = parser.get<String>("detModel");
|
||||
String recModelPath = parser.get<String>("recModel");
|
||||
String vocPath = parser.get<String>("vocabularyPath");
|
||||
|
||||
if (!parser.check())
|
||||
{
|
||||
parser.printErrors();
|
||||
return 1;
|
||||
}
|
||||
|
||||
    // Load networks.
    CV_Assert(!detModelPath.empty() && !recModelPath.empty());
    TextDetectionModel_EAST detector(detModelPath);
    detector.setConfidenceThreshold(confThreshold)
            .setNMSThreshold(nmsThreshold);

    TextRecognitionModel recognizer(recModelPath);

    // Load vocabulary
    CV_Assert(!vocPath.empty());
    std::ifstream vocFile;
    vocFile.open(samples::findFile(vocPath));
    CV_Assert(vocFile.is_open());
    String vocLine;
    std::vector<String> vocabulary;
    while (std::getline(vocFile, vocLine)) {
        vocabulary.push_back(vocLine);
    }
    recognizer.setVocabulary(vocabulary);
    recognizer.setDecodeType("CTC-greedy");

    // Parameters for Recognition
    double recScale = 1.0 / 127.5;
    Scalar recMean = Scalar(127.5, 127.5, 127.5);
    Size recInputSize = Size(100, 32);
    recognizer.setInputParams(recScale, recInputSize, recMean);

    // Parameters for Detection
    double detScale = 1.0;
    Size detInputSize = Size(width, height);
    Scalar detMean = Scalar(123.68, 116.78, 103.94);
    bool swapRB = true;
    detector.setInputParams(detScale, detInputSize, detMean, swapRB);

    // Open a video file or an image file or a camera stream.
    VideoCapture cap;
    bool openSuccess = parser.has("input") ? cap.open(parser.get<String>("input")) : cap.open(0);
    CV_Assert(openSuccess);

    static const std::string kWinName = "EAST: An Efficient and Accurate Scene Text Detector";

    Mat frame;
    while (waitKey(1) < 0)
    {
        cap >> frame;
        if (frame.empty())
        {
            waitKey();
            break;
        }

        std::cout << frame.size << std::endl;

        // Detection
        std::vector< std::vector<Point> > detResults;
        detector.detect(frame, detResults);

        if (detResults.size() > 0) {
            // Text Recognition
            Mat recInput;
            if (!imreadRGB) {
                cvtColor(frame, recInput, cv::COLOR_BGR2GRAY);
            } else {
                recInput = frame;
            }
            std::vector< std::vector<Point> > contours;
            for (uint i = 0; i < detResults.size(); i++)
            {
                const auto& quadrangle = detResults[i];
                CV_CheckEQ(quadrangle.size(), (size_t)4, "");

                contours.emplace_back(quadrangle);

                std::vector<Point2f> quadrangle_2f;
                for (int j = 0; j < 4; j++)
                    quadrangle_2f.emplace_back(quadrangle[j]);

                Mat cropped;
                fourPointsTransform(recInput, &quadrangle_2f[0], cropped);

                std::string recognitionResult = recognizer.recognize(cropped);
                std::cout << i << ": '" << recognitionResult << "'" << std::endl;

                putText(frame, recognitionResult, quadrangle[3], FONT_HERSHEY_SIMPLEX, 1.5, Scalar(0, 0, 255), 2);
            }
            polylines(frame, contours, true, Scalar(0, 255, 0), 2);
        }
        imshow(kWinName, frame);
    }
    return 0;
}

void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result)
{
    const Size outputSize = Size(100, 32);

    Point2f targetVertices[4] = {
        Point(0, outputSize.height - 1),
        Point(0, 0), Point(outputSize.width - 1, 0),
        Point(outputSize.width - 1, outputSize.height - 1)
    };
    Mat rotationMatrix = getPerspectiveTransform(vertices, targetVertices);

    warpPerspective(frame, result, rotationMatrix, outputSize);
}
239
3rdparty/opencv-4.5.4/samples/dnn/text_detection.py
vendored
Normal file
@ -0,0 +1,239 @@
'''
Text detection model: https://github.com/argman/EAST
Download link: https://www.dropbox.com/s/r2ingd0l3zt8hxs/frozen_east_text_detection.tar.gz?dl=1

CRNN Text recognition model taken from here: https://github.com/meijieru/crnn.pytorch
How to convert from pb to onnx:
Using classes from here: https://github.com/meijieru/crnn.pytorch/blob/master/models/crnn.py

More converted onnx text recognition models can be downloaded directly here:
Download link: https://drive.google.com/drive/folders/1cTbQ3nuZG-EKWak6emD_s8_hHXWz7lAr?usp=sharing
And these models taken from here: https://github.com/clovaai/deep-text-recognition-benchmark

import torch
from models.crnn import CRNN

model = CRNN(32, 1, 37, 256)
model.load_state_dict(torch.load('crnn.pth'))
dummy_input = torch.randn(1, 1, 32, 100)
torch.onnx.export(model, dummy_input, "crnn.onnx", verbose=True)
'''


# Import required modules
import numpy as np
import cv2 as cv
import math
import argparse

############ Add argument parser for command line arguments ############
parser = argparse.ArgumentParser(
    description="Use this script to run TensorFlow implementation (https://github.com/argman/EAST) of "
                "EAST: An Efficient and Accurate Scene Text Detector (https://arxiv.org/abs/1704.03155v2)"
                "The OCR model can be obtained from converting the pretrained CRNN model to .onnx format from the github repository https://github.com/meijieru/crnn.pytorch"
                "Or you can download trained OCR model directly from https://drive.google.com/drive/folders/1cTbQ3nuZG-EKWak6emD_s8_hHXWz7lAr?usp=sharing")
parser.add_argument('--input',
                    help='Path to input image or video file. Skip this argument to capture frames from a camera.')
parser.add_argument('--model', '-m', required=True,
                    help='Path to a binary .pb file contains trained detector network.')
parser.add_argument('--ocr', default="crnn.onnx",
                    help="Path to a binary .pb or .onnx file contains trained recognition network", )
parser.add_argument('--width', type=int, default=320,
                    help='Preprocess input image by resizing to a specific width. It should be multiple by 32.')
parser.add_argument('--height', type=int, default=320,
                    help='Preprocess input image by resizing to a specific height. It should be multiple by 32.')
parser.add_argument('--thr', type=float, default=0.5,
                    help='Confidence threshold.')
parser.add_argument('--nms', type=float, default=0.4,
                    help='Non-maximum suppression threshold.')
args = parser.parse_args()


############ Utility functions ############

def fourPointsTransform(frame, vertices):
    vertices = np.asarray(vertices)
    outputSize = (100, 32)
    targetVertices = np.array([
        [0, outputSize[1] - 1],
        [0, 0],
        [outputSize[0] - 1, 0],
        [outputSize[0] - 1, outputSize[1] - 1]], dtype="float32")

    rotationMatrix = cv.getPerspectiveTransform(vertices, targetVertices)
    result = cv.warpPerspective(frame, rotationMatrix, outputSize)
    return result


def decodeText(scores):
    text = ""
    alphabet = "0123456789abcdefghijklmnopqrstuvwxyz"
    for i in range(scores.shape[0]):
        c = np.argmax(scores[i][0])
        if c != 0:
            text += alphabet[c - 1]
        else:
            text += '-'

    # adjacent same letters as well as background text must be removed to get the final output
    char_list = []
    for i in range(len(text)):
        if text[i] != '-' and (not (i > 0 and text[i] == text[i - 1])):
            char_list.append(text[i])
    return ''.join(char_list)


def decodeBoundingBoxes(scores, geometry, scoreThresh):
    detections = []
    confidences = []

    ############ CHECK DIMENSIONS AND SHAPES OF geometry AND scores ############
    assert len(scores.shape) == 4, "Incorrect dimensions of scores"
    assert len(geometry.shape) == 4, "Incorrect dimensions of geometry"
    assert scores.shape[0] == 1, "Invalid dimensions of scores"
    assert geometry.shape[0] == 1, "Invalid dimensions of geometry"
    assert scores.shape[1] == 1, "Invalid dimensions of scores"
    assert geometry.shape[1] == 5, "Invalid dimensions of geometry"
    assert scores.shape[2] == geometry.shape[2], "Invalid dimensions of scores and geometry"
    assert scores.shape[3] == geometry.shape[3], "Invalid dimensions of scores and geometry"
    height = scores.shape[2]
    width = scores.shape[3]
    for y in range(0, height):

        # Extract data from scores
        scoresData = scores[0][0][y]
        x0_data = geometry[0][0][y]
        x1_data = geometry[0][1][y]
        x2_data = geometry[0][2][y]
        x3_data = geometry[0][3][y]
        anglesData = geometry[0][4][y]
        for x in range(0, width):
            score = scoresData[x]

            # If score is lower than threshold score, move to next x
            if (score < scoreThresh):
                continue

            # Calculate offset
            offsetX = x * 4.0
            offsetY = y * 4.0
            angle = anglesData[x]

            # Calculate cos and sin of angle
            cosA = math.cos(angle)
            sinA = math.sin(angle)
            h = x0_data[x] + x2_data[x]
            w = x1_data[x] + x3_data[x]

            # Calculate offset
            offset = ([offsetX + cosA * x1_data[x] + sinA * x2_data[x], offsetY - sinA * x1_data[x] + cosA * x2_data[x]])

            # Find points for rectangle
            p1 = (-sinA * h + offset[0], -cosA * h + offset[1])
            p3 = (-cosA * w + offset[0], sinA * w + offset[1])
            center = (0.5 * (p1[0] + p3[0]), 0.5 * (p1[1] + p3[1]))
            detections.append((center, (w, h), -1 * angle * 180.0 / math.pi))
            confidences.append(float(score))

    # Return detections and confidences
    return [detections, confidences]


def main():
    # Read and store arguments
    confThreshold = args.thr
    nmsThreshold = args.nms
    inpWidth = args.width
    inpHeight = args.height
    modelDetector = args.model
    modelRecognition = args.ocr

    # Load network
    detector = cv.dnn.readNet(modelDetector)
    recognizer = cv.dnn.readNet(modelRecognition)

    # Create a new named window
    kWinName = "EAST: An Efficient and Accurate Scene Text Detector"
    cv.namedWindow(kWinName, cv.WINDOW_NORMAL)
    outNames = []
    outNames.append("feature_fusion/Conv_7/Sigmoid")
    outNames.append("feature_fusion/concat_3")

    # Open a video file or an image file or a camera stream
    cap = cv.VideoCapture(args.input if args.input else 0)

    tickmeter = cv.TickMeter()
    while cv.waitKey(1) < 0:
        # Read frame
        hasFrame, frame = cap.read()
        if not hasFrame:
            cv.waitKey()
            break

        # Get frame height and width
        height_ = frame.shape[0]
        width_ = frame.shape[1]
        rW = width_ / float(inpWidth)
        rH = height_ / float(inpHeight)

        # Create a 4D blob from frame.
        blob = cv.dnn.blobFromImage(frame, 1.0, (inpWidth, inpHeight), (123.68, 116.78, 103.94), True, False)

        # Run the detection model
        detector.setInput(blob)

        tickmeter.start()
        outs = detector.forward(outNames)
        tickmeter.stop()

        # Get scores and geometry
        scores = outs[0]
        geometry = outs[1]
        [boxes, confidences] = decodeBoundingBoxes(scores, geometry, confThreshold)

        # Apply NMS
        indices = cv.dnn.NMSBoxesRotated(boxes, confidences, confThreshold, nmsThreshold)
        for i in indices:
            # get 4 corners of the rotated rect
            vertices = cv.boxPoints(boxes[i[0]])
            # scale the bounding box coordinates based on the respective ratios
            for j in range(4):
                vertices[j][0] *= rW
                vertices[j][1] *= rH


            # get cropped image using perspective transform
            if modelRecognition:
                cropped = fourPointsTransform(frame, vertices)
                cropped = cv.cvtColor(cropped, cv.COLOR_BGR2GRAY)

                # Create a 4D blob from cropped image
                blob = cv.dnn.blobFromImage(cropped, size=(100, 32), mean=127.5, scalefactor=1 / 127.5)
                recognizer.setInput(blob)

                # Run the recognition model
                tickmeter.start()
                result = recognizer.forward()
                tickmeter.stop()

                # decode the result into text
                wordRecognized = decodeText(result)
                cv.putText(frame, wordRecognized, (int(vertices[1][0]), int(vertices[1][1])), cv.FONT_HERSHEY_SIMPLEX,
                           0.5, (255, 0, 0))

            for j in range(4):
                p1 = (int(vertices[j][0]), int(vertices[j][1]))
                p2 = (int(vertices[(j + 1) % 4][0]), int(vertices[(j + 1) % 4][1]))
                cv.line(frame, p1, p2, (0, 255, 0), 1)

        # Put efficiency information
        label = 'Inference time: %.2f ms' % (tickmeter.getTimeMilli())
        cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))

        # Display the frame
        cv.imshow(kWinName, frame)
        tickmeter.reset()


if __name__ == "__main__":
    main()
333
3rdparty/opencv-4.5.4/samples/dnn/tf_text_graph_common.py
vendored
Normal file
@ -0,0 +1,333 @@
def tokenize(s):
|
||||
tokens = []
|
||||
token = ""
|
||||
isString = False
|
||||
isComment = False
|
||||
for symbol in s:
|
||||
isComment = (isComment and symbol != '\n') or (not isString and symbol == '#')
|
||||
if isComment:
|
||||
continue
|
||||
|
||||
if symbol == ' ' or symbol == '\t' or symbol == '\r' or symbol == '\'' or \
|
||||
symbol == '\n' or symbol == ':' or symbol == '\"' or symbol == ';' or \
|
||||
symbol == ',':
|
||||
|
||||
if (symbol == '\"' or symbol == '\'') and isString:
|
||||
tokens.append(token)
|
||||
token = ""
|
||||
else:
|
||||
if isString:
|
||||
token += symbol
|
||||
elif token:
|
||||
tokens.append(token)
|
||||
token = ""
|
||||
isString = (symbol == '\"' or symbol == '\'') ^ isString
|
||||
|
||||
elif symbol == '{' or symbol == '}' or symbol == '[' or symbol == ']':
|
||||
if token:
|
||||
tokens.append(token)
|
||||
token = ""
|
||||
tokens.append(symbol)
|
||||
else:
|
||||
token += symbol
|
||||
if token:
|
||||
tokens.append(token)
|
||||
return tokens
|
||||
|
||||
|
||||
def parseMessage(tokens, idx):
|
||||
msg = {}
|
||||
assert(tokens[idx] == '{')
|
||||
|
||||
isArray = False
|
||||
while True:
|
||||
if not isArray:
|
||||
idx += 1
|
||||
if idx < len(tokens):
|
||||
fieldName = tokens[idx]
|
||||
else:
|
||||
return None
|
||||
if fieldName == '}':
|
||||
break
|
||||
|
||||
idx += 1
|
||||
fieldValue = tokens[idx]
|
||||
|
||||
if fieldValue == '{':
|
||||
embeddedMsg, idx = parseMessage(tokens, idx)
|
||||
if fieldName in msg:
|
||||
msg[fieldName].append(embeddedMsg)
|
||||
else:
|
||||
msg[fieldName] = [embeddedMsg]
|
||||
elif fieldValue == '[':
|
||||
isArray = True
|
||||
elif fieldValue == ']':
|
||||
isArray = False
|
||||
else:
|
||||
if fieldName in msg:
|
||||
msg[fieldName].append(fieldValue)
|
||||
else:
|
||||
msg[fieldName] = [fieldValue]
|
||||
return msg, idx
|
||||
|
||||
|
||||
def readTextMessage(filePath):
|
||||
if not filePath:
|
||||
return {}
|
||||
with open(filePath, 'rt') as f:
|
||||
content = f.read()
|
||||
|
||||
tokens = tokenize('{' + content + '}')
|
||||
msg = parseMessage(tokens, 0)
|
||||
return msg[0] if msg else {}
|
||||
|
||||
|
||||
def listToTensor(values):
|
||||
if all([isinstance(v, float) for v in values]):
|
||||
dtype = 'DT_FLOAT'
|
||||
field = 'float_val'
|
||||
elif all([isinstance(v, int) for v in values]):
|
||||
dtype = 'DT_INT32'
|
||||
field = 'int_val'
|
||||
else:
|
||||
raise Exception('Wrong values types')
|
||||
|
||||
msg = {
|
||||
'tensor': {
|
||||
'dtype': dtype,
|
||||
'tensor_shape': {
|
||||
'dim': {
|
||||
'size': len(values)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
msg['tensor'][field] = values
|
||||
return msg
|
||||
|
||||
|
||||
def addConstNode(name, values, graph_def):
|
||||
node = NodeDef()
|
||||
node.name = name
|
||||
node.op = 'Const'
|
||||
node.addAttr('value', values)
|
||||
graph_def.node.extend([node])
|
||||
|
||||
|
||||
def addSlice(inp, out, begins, sizes, graph_def):
|
||||
beginsNode = NodeDef()
|
||||
beginsNode.name = out + '/begins'
|
||||
beginsNode.op = 'Const'
|
||||
beginsNode.addAttr('value', begins)
|
||||
graph_def.node.extend([beginsNode])
|
||||
|
||||
sizesNode = NodeDef()
|
||||
sizesNode.name = out + '/sizes'
|
||||
sizesNode.op = 'Const'
|
||||
sizesNode.addAttr('value', sizes)
|
||||
graph_def.node.extend([sizesNode])
|
||||
|
||||
sliced = NodeDef()
|
||||
sliced.name = out
|
||||
sliced.op = 'Slice'
|
||||
sliced.input.append(inp)
|
||||
sliced.input.append(beginsNode.name)
|
||||
sliced.input.append(sizesNode.name)
|
||||
graph_def.node.extend([sliced])
|
||||
|
||||
|
||||
def addReshape(inp, out, shape, graph_def):
|
||||
shapeNode = NodeDef()
|
||||
shapeNode.name = out + '/shape'
|
||||
shapeNode.op = 'Const'
|
||||
shapeNode.addAttr('value', shape)
|
||||
graph_def.node.extend([shapeNode])
|
||||
|
||||
reshape = NodeDef()
|
||||
reshape.name = out
|
||||
reshape.op = 'Reshape'
|
||||
reshape.input.append(inp)
|
||||
reshape.input.append(shapeNode.name)
|
||||
graph_def.node.extend([reshape])
|
||||
|
||||
|
||||
def addSoftMax(inp, out, graph_def):
|
||||
softmax = NodeDef()
|
||||
softmax.name = out
|
||||
softmax.op = 'Softmax'
|
||||
softmax.addAttr('axis', -1)
|
||||
softmax.input.append(inp)
|
||||
graph_def.node.extend([softmax])
|
||||
|
||||
|
||||
def addFlatten(inp, out, graph_def):
|
||||
flatten = NodeDef()
|
||||
flatten.name = out
|
||||
flatten.op = 'Flatten'
|
||||
flatten.input.append(inp)
|
||||
graph_def.node.extend([flatten])
|
||||
|
||||
|
||||
class NodeDef:
|
||||
def __init__(self):
|
||||
self.input = []
|
||||
self.name = ""
|
||||
self.op = ""
|
||||
self.attr = {}
|
||||
|
||||
def addAttr(self, key, value):
|
||||
assert(not key in self.attr)
|
||||
if isinstance(value, bool):
|
||||
self.attr[key] = {'b': value}
|
||||
elif isinstance(value, int):
|
||||
self.attr[key] = {'i': value}
|
||||
elif isinstance(value, float):
|
||||
self.attr[key] = {'f': value}
|
||||
elif isinstance(value, str):
|
||||
self.attr[key] = {'s': value}
|
||||
elif isinstance(value, list):
|
||||
self.attr[key] = listToTensor(value)
|
||||
else:
|
||||
raise Exception('Unknown type of attribute ' + key)
|
||||
|
||||
def Clear(self):
|
||||
self.input = []
|
||||
self.name = ""
|
||||
self.op = ""
|
||||
self.attr = {}
|
||||
|
||||
|
||||
class GraphDef:
|
||||
def __init__(self):
|
||||
self.node = []
|
||||
|
||||
def save(self, filePath):
|
||||
with open(filePath, 'wt') as f:
|
||||
|
||||
def printAttr(d, indent):
|
||||
indent = ' ' * indent
|
||||
for key, value in sorted(d.items(), key=lambda x:x[0].lower()):
|
||||
value = value if isinstance(value, list) else [value]
|
||||
for v in value:
|
||||
if isinstance(v, dict):
|
||||
f.write(indent + key + ' {\n')
|
||||
printAttr(v, len(indent) + 2)
|
||||
f.write(indent + '}\n')
|
||||
else:
|
||||
isString = False
|
||||
if isinstance(v, str) and not v.startswith('DT_'):
|
||||
try:
|
||||
float(v)
|
||||
except:
|
||||
isString = True
|
||||
|
||||
if isinstance(v, bool):
|
||||
printed = 'true' if v else 'false'
|
||||
elif v == 'true' or v == 'false':
|
||||
printed = 'true' if v == 'true' else 'false'
|
||||
elif isString:
|
||||
printed = '\"%s\"' % v
|
||||
else:
|
||||
printed = str(v)
|
||||
f.write(indent + key + ': ' + printed + '\n')
|
||||
|
||||
for node in self.node:
|
||||
f.write('node {\n')
|
||||
f.write(' name: \"%s\"\n' % node.name)
|
||||
f.write(' op: \"%s\"\n' % node.op)
|
||||
for inp in node.input:
|
||||
f.write(' input: \"%s\"\n' % inp)
|
||||
for key, value in sorted(node.attr.items(), key=lambda x:x[0].lower()):
|
||||
f.write(' attr {\n')
|
||||
f.write(' key: \"%s\"\n' % key)
|
||||
f.write(' value {\n')
|
||||
printAttr(value, 6)
|
||||
f.write(' }\n')
|
||||
f.write(' }\n')
|
||||
f.write('}\n')
|
||||
|
||||
|
||||
def parseTextGraph(filePath):
|
||||
msg = readTextMessage(filePath)
|
||||
|
||||
graph = GraphDef()
|
||||
for node in msg['node']:
|
||||
graphNode = NodeDef()
|
||||
graphNode.name = node['name'][0]
|
||||
graphNode.op = node['op'][0]
|
||||
graphNode.input = node['input'] if 'input' in node else []
|
||||
|
||||
if 'attr' in node:
|
||||
for attr in node['attr']:
|
||||
graphNode.attr[attr['key'][0]] = attr['value'][0]
|
||||
|
||||
graph.node.append(graphNode)
|
||||
return graph
|
||||
|
||||
|
||||
# Removes Identity nodes
|
||||
def removeIdentity(graph_def):
|
||||
identities = {}
|
||||
for node in graph_def.node:
|
||||
if node.op == 'Identity' or node.op == 'IdentityN':
|
||||
inp = node.input[0]
|
||||
if inp in identities:
|
||||
identities[node.name] = identities[inp]
|
||||
else:
|
||||
identities[node.name] = inp
|
||||
graph_def.node.remove(node)
|
||||
|
||||
for node in graph_def.node:
|
||||
for i in range(len(node.input)):
|
||||
if node.input[i] in identities:
|
||||
node.input[i] = identities[node.input[i]]
|
||||
|
||||
|
||||
def removeUnusedNodesAndAttrs(to_remove, graph_def):
|
||||
unusedAttrs = ['T', 'Tshape', 'N', 'Tidx', 'Tdim', 'use_cudnn_on_gpu',
|
||||
'Index', 'Tperm', 'is_training', 'Tpaddings']
|
||||
|
||||
removedNodes = []
|
||||
|
||||
for i in reversed(range(len(graph_def.node))):
|
||||
op = graph_def.node[i].op
|
||||
name = graph_def.node[i].name
|
||||
|
||||
if to_remove(name, op):
|
||||
if op != 'Const':
|
||||
removedNodes.append(name)
|
||||
|
||||
del graph_def.node[i]
|
||||
else:
|
||||
for attr in unusedAttrs:
|
||||
if attr in graph_def.node[i].attr:
|
||||
del graph_def.node[i].attr[attr]
|
||||
|
||||
# Remove references to removed nodes except Const nodes.
|
||||
for node in graph_def.node:
|
||||
for i in reversed(range(len(node.input))):
|
||||
if node.input[i] in removedNodes:
|
||||
del node.input[i]
|
||||
|
||||
|
||||
def writeTextGraph(modelPath, outputPath, outNodes):
|
||||
try:
|
||||
import cv2 as cv
|
||||
|
||||
cv.dnn.writeTextGraph(modelPath, outputPath)
|
||||
except:
|
||||
import tensorflow as tf
|
||||
from tensorflow.tools.graph_transforms import TransformGraph
|
||||
|
||||
with tf.gfile.FastGFile(modelPath, 'rb') as f:
|
||||
graph_def = tf.GraphDef()
|
||||
graph_def.ParseFromString(f.read())
|
||||
|
||||
graph_def = TransformGraph(graph_def, ['image_tensor'], outNodes, ['sort_by_execution_order'])
|
||||
|
||||
for node in graph_def.node:
|
||||
if node.op == 'Const':
|
||||
if 'value' in node.attr and node.attr['value'].tensor.tensor_content:
|
||||
node.attr['value'].tensor.tensor_content = b''
|
||||
|
||||
tf.train.write_graph(graph_def, "", outputPath, as_text=True)
|
236
3rdparty/opencv-4.5.4/samples/dnn/tf_text_graph_efficientdet.py
vendored
Normal file
@ -0,0 +1,236 @@
# This file is a part of OpenCV project.
|
||||
# It is a subject to the license terms in the LICENSE file found in the top-level directory
|
||||
# of this distribution and at http://opencv.org/license.html.
|
||||
#
|
||||
# Copyright (C) 2020, Intel Corporation, all rights reserved.
|
||||
# Third party copyrights are property of their respective owners.
|
||||
#
|
||||
# Use this script to get the text graph representation (.pbtxt) of EfficientDet
|
||||
# deep learning network trained in https://github.com/google/automl.
|
||||
# Then you can import it with a binary frozen graph (.pb) using readNetFromTensorflow() function.
|
||||
# See details and examples on the following wiki page: https://github.com/opencv/opencv/wiki/TensorFlow-Object-Detection-API
|
||||
import argparse
|
||||
import re
|
||||
from math import sqrt
|
||||
from tf_text_graph_common import *
|
||||
|
||||
|
||||
class AnchorGenerator:
|
||||
def __init__(self, min_level, aspect_ratios, num_scales, anchor_scale):
|
||||
self.min_level = min_level
|
||||
self.aspect_ratios = aspect_ratios
|
||||
self.anchor_scale = anchor_scale
|
||||
self.scales = [2**(float(s) / num_scales) for s in range(num_scales)]
|
||||
|
||||
def get(self, layer_id):
|
||||
widths = []
|
||||
heights = []
|
||||
for s in self.scales:
|
||||
for a in self.aspect_ratios:
|
||||
base_anchor_size = 2**(self.min_level + layer_id) * self.anchor_scale
|
||||
heights.append(base_anchor_size * s * a[1])
|
||||
widths.append(base_anchor_size * s * a[0])
|
||||
return widths, heights
|
||||
|
||||
|
||||
def createGraph(modelPath, outputPath, min_level, aspect_ratios, num_scales,
|
||||
anchor_scale, num_classes, image_width, image_height):
|
||||
print('Min level: %d' % min_level)
|
||||
print('Anchor scale: %f' % anchor_scale)
|
||||
print('Num scales: %d' % num_scales)
|
||||
print('Aspect ratios: %s' % str(aspect_ratios))
|
||||
print('Number of classes: %d' % num_classes)
|
||||
print('Input image size: %dx%d' % (image_width, image_height))
|
||||
|
||||
# Read the graph.
|
||||
_inpNames = ['image_arrays']
|
||||
outNames = ['detections']
|
||||
|
||||
writeTextGraph(modelPath, outputPath, outNames)
|
||||
graph_def = parseTextGraph(outputPath)
|
||||
|
||||
def getUnconnectedNodes():
|
||||
unconnected = []
|
||||
for node in graph_def.node:
|
||||
if node.op == 'Const':
|
||||
continue
|
||||
unconnected.append(node.name)
|
||||
for inp in node.input:
|
||||
if inp in unconnected:
|
||||
unconnected.remove(inp)
|
||||
return unconnected
|
||||
|
||||
|
||||
nodesToKeep = ['truediv'] # Keep preprocessing nodes
|
||||
|
||||
removeIdentity(graph_def)
|
||||
|
||||
scopesToKeep = ('image_arrays', 'efficientnet', 'resample_p6', 'resample_p7',
|
||||
'fpn_cells', 'class_net', 'box_net', 'Reshape', 'concat')
|
||||
|
||||
addConstNode('scale_w', [2.0], graph_def)
|
||||
addConstNode('scale_h', [2.0], graph_def)
|
||||
nodesToKeep += ['scale_w', 'scale_h']
|
||||
|
||||
for node in graph_def.node:
|
||||
if re.match('efficientnet-(.*)/blocks_\d+/se/mul_1', node.name):
|
||||
node.input[0], node.input[1] = node.input[1], node.input[0]
|
||||
|
||||
if re.match('fpn_cells/cell_\d+/fnode\d+/resample(.*)/nearest_upsampling/Reshape_1$', node.name):
|
||||
node.op = 'ResizeNearestNeighbor'
|
||||
node.input[1] = 'scale_w'
|
||||
node.input.append('scale_h')
|
||||
|
||||
for inpNode in graph_def.node:
|
||||
if inpNode.name == node.name[:node.name.rfind('_')]:
|
||||
node.input[0] = inpNode.input[0]
|
||||
|
||||
if re.match('box_net/box-predict(_\d)*/separable_conv2d$', node.name):
|
||||
node.addAttr('loc_pred_transposed', True)
|
||||
|
||||
# Replace RealDiv to Mul with inversed scale for compatibility
|
||||
if node.op == 'RealDiv':
|
||||
for inpNode in graph_def.node:
|
||||
if inpNode.name != node.input[1] or not 'value' in inpNode.attr:
|
||||
continue
|
||||
|
||||
tensor = inpNode.attr['value']['tensor'][0]
|
||||
if not 'float_val' in tensor:
|
||||
continue
|
||||
scale = float(inpNode.attr['value']['tensor'][0]['float_val'][0])
|
||||
|
||||
addConstNode(inpNode.name + '/inv', [1.0 / scale], graph_def)
|
||||
nodesToKeep.append(inpNode.name + '/inv')
|
||||
node.input[1] = inpNode.name + '/inv'
|
||||
node.op = 'Mul'
|
||||
break
|
||||
|
||||
|
||||
def to_remove(name, op):
|
||||
if name in nodesToKeep:
|
||||
return False
|
||||
return op == 'Const' or not name.startswith(scopesToKeep)
|
||||
|
||||
removeUnusedNodesAndAttrs(to_remove, graph_def)
|
||||
|
||||
# Attach unconnected preprocessing
|
||||
assert(graph_def.node[1].name == 'truediv' and graph_def.node[1].op == 'RealDiv')
|
||||
graph_def.node[1].input.insert(0, 'image_arrays')
|
||||
graph_def.node[2].input.insert(0, 'truediv')
|
||||
|
||||
priors_generator = AnchorGenerator(min_level, aspect_ratios, num_scales, anchor_scale)
|
||||
priorBoxes = []
|
||||
for i in range(5):
|
||||
inpName = ''
|
||||
for node in graph_def.node:
|
||||
if node.name == 'Reshape_%d' % (i * 2 + 1):
|
||||
inpName = node.input[0]
|
||||
break
|
||||
|
||||
priorBox = NodeDef()
|
||||
priorBox.name = 'PriorBox_%d' % i
|
||||
priorBox.op = 'PriorBox'
|
||||
priorBox.input.append(inpName)
|
||||
priorBox.input.append(graph_def.node[0].name) # image_tensor
|
||||
|
||||
priorBox.addAttr('flip', False)
|
||||
priorBox.addAttr('clip', False)
|
||||
|
||||
widths, heights = priors_generator.get(i)
|
||||
|
||||
priorBox.addAttr('width', widths)
|
||||
priorBox.addAttr('height', heights)
|
||||
priorBox.addAttr('variance', [1.0, 1.0, 1.0, 1.0])
|
||||
|
||||
graph_def.node.extend([priorBox])
|
||||
priorBoxes.append(priorBox.name)
|
||||
|
||||
addConstNode('concat/axis_flatten', [-1], graph_def)
|
||||
|
||||
def addConcatNode(name, inputs, axisNodeName):
|
||||
concat = NodeDef()
|
||||
concat.name = name
|
||||
concat.op = 'ConcatV2'
|
||||
for inp in inputs:
|
||||
concat.input.append(inp)
|
||||
concat.input.append(axisNodeName)
|
||||
graph_def.node.extend([concat])
|
||||
|
||||
addConcatNode('PriorBox/concat', priorBoxes, 'concat/axis_flatten')
|
||||
|
||||
sigmoid = NodeDef()
|
||||
sigmoid.name = 'concat/sigmoid'
|
||||
sigmoid.op = 'Sigmoid'
|
||||
sigmoid.input.append('concat')
|
||||
graph_def.node.extend([sigmoid])
|
||||
|
||||
addFlatten(sigmoid.name, sigmoid.name + '/Flatten', graph_def)
|
||||
addFlatten('concat_1', 'concat_1/Flatten', graph_def)
|
||||
|
||||
detectionOut = NodeDef()
|
||||
detectionOut.name = 'detection_out'
|
||||
detectionOut.op = 'DetectionOutput'
|
||||
|
||||
detectionOut.input.append('concat_1/Flatten')
|
||||
detectionOut.input.append(sigmoid.name + '/Flatten')
|
||||
detectionOut.input.append('PriorBox/concat')
|
||||
|
||||
detectionOut.addAttr('num_classes', num_classes)
|
||||
detectionOut.addAttr('share_location', True)
|
||||
detectionOut.addAttr('background_label_id', num_classes + 1)
|
||||
detectionOut.addAttr('nms_threshold', 0.6)
|
||||
detectionOut.addAttr('confidence_threshold', 0.2)
|
||||
detectionOut.addAttr('top_k', 100)
|
||||
detectionOut.addAttr('keep_top_k', 100)
|
||||
detectionOut.addAttr('code_type', "CENTER_SIZE")
|
||||
graph_def.node.extend([detectionOut])
|
||||
|
||||
graph_def.node[0].attr['shape'] = {
|
||||
'shape': {
|
||||
'dim': [
|
||||
{'size': -1},
|
||||
{'size': image_height},
|
||||
{'size': image_width},
|
||||
{'size': 3}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
while True:
|
||||
unconnectedNodes = getUnconnectedNodes()
|
||||
unconnectedNodes.remove(detectionOut.name)
|
||||
if not unconnectedNodes:
|
||||
break
|
||||
|
||||
for name in unconnectedNodes:
|
||||
for i in range(len(graph_def.node)):
|
||||
if graph_def.node[i].name == name:
|
||||
del graph_def.node[i]
|
||||
break
|
||||
|
||||
# Save as text
|
||||
graph_def.save(outputPath)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='Run this script to get a text graph of '
|
||||
'SSD model from TensorFlow Object Detection API. '
|
||||
'Then pass it with .pb file to cv::dnn::readNetFromTensorflow function.')
|
||||
parser.add_argument('--input', required=True, help='Path to frozen TensorFlow graph.')
|
||||
parser.add_argument('--output', required=True, help='Path to output text graph.')
|
||||
parser.add_argument('--min_level', default=3, type=int, help='Parameter from training config')
|
||||
parser.add_argument('--num_scales', default=3, type=int, help='Parameter from training config')
|
||||
parser.add_argument('--anchor_scale', default=4.0, type=float, help='Parameter from training config')
|
||||
parser.add_argument('--aspect_ratios', default=[1.0, 1.0, 1.4, 0.7, 0.7, 1.4],
|
||||
nargs='+', type=float, help='Parameter from training config')
|
||||
parser.add_argument('--num_classes', default=90, type=int, help='Number of classes to detect')
|
||||
parser.add_argument('--width', default=512, type=int, help='Network input width')
|
||||
parser.add_argument('--height', default=512, type=int, help='Network input height')
|
||||
args = parser.parse_args()
|
||||
|
||||
ar = args.aspect_ratios
|
||||
assert(len(ar) % 2 == 0)
|
||||
ar = list(zip(ar[::2], ar[1::2]))
|
||||
|
||||
createGraph(args.input, args.output, args.min_level, ar, args.num_scales,
|
||||
args.anchor_scale, args.num_classes, args.width, args.height)
|
299
3rdparty/opencv-4.5.4/samples/dnn/tf_text_graph_faster_rcnn.py
vendored
Normal file
@ -0,0 +1,299 @@
import argparse
|
||||
import numpy as np
|
||||
from tf_text_graph_common import *
|
||||
|
||||
|
||||
def createFasterRCNNGraph(modelPath, configPath, outputPath):
|
||||
scopesToKeep = ('FirstStageFeatureExtractor', 'Conv',
|
||||
'FirstStageBoxPredictor/BoxEncodingPredictor',
|
||||
'FirstStageBoxPredictor/ClassPredictor',
|
||||
'CropAndResize',
|
||||
'MaxPool2D',
|
||||
'SecondStageFeatureExtractor',
|
||||
'SecondStageBoxPredictor',
|
||||
'Preprocessor/sub',
|
||||
'Preprocessor/mul',
|
||||
'image_tensor')
|
||||
|
||||
scopesToIgnore = ('FirstStageFeatureExtractor/Assert',
|
||||
'FirstStageFeatureExtractor/Shape',
|
||||
'FirstStageFeatureExtractor/strided_slice',
|
||||
'FirstStageFeatureExtractor/GreaterEqual',
|
||||
'FirstStageFeatureExtractor/LogicalAnd')
|
||||
|
||||
# Load a config file.
|
||||
config = readTextMessage(configPath)
|
||||
config = config['model'][0]['faster_rcnn'][0]
|
||||
num_classes = int(config['num_classes'][0])
|
||||
|
||||
grid_anchor_generator = config['first_stage_anchor_generator'][0]['grid_anchor_generator'][0]
|
||||
scales = [float(s) for s in grid_anchor_generator['scales']]
|
||||
aspect_ratios = [float(ar) for ar in grid_anchor_generator['aspect_ratios']]
|
||||
width_stride = float(grid_anchor_generator['width_stride'][0])
|
||||
height_stride = float(grid_anchor_generator['height_stride'][0])
|
||||
|
||||
feature_extractor = config['feature_extractor'][0]
|
||||
if 'type' in feature_extractor and feature_extractor['type'][0] == 'faster_rcnn_nas':
|
||||
features_stride = 16.0
|
||||
else:
|
||||
features_stride = float(feature_extractor['first_stage_features_stride'][0])
|
||||
|
||||
first_stage_nms_iou_threshold = float(config['first_stage_nms_iou_threshold'][0])
|
||||
first_stage_max_proposals = int(config['first_stage_max_proposals'][0])
|
||||
|
||||
print('Number of classes: %d' % num_classes)
|
||||
print('Scales: %s' % str(scales))
|
||||
print('Aspect ratios: %s' % str(aspect_ratios))
|
||||
print('Width stride: %f' % width_stride)
|
||||
print('Height stride: %f' % height_stride)
|
||||
print('Features stride: %f' % features_stride)
|
||||
|
||||
# Read the graph.
|
||||
writeTextGraph(modelPath, outputPath, ['num_detections', 'detection_scores', 'detection_boxes', 'detection_classes'])
|
||||
graph_def = parseTextGraph(outputPath)
|
||||
|
||||
removeIdentity(graph_def)
|
||||
|
||||
nodesToKeep = []
|
||||
def to_remove(name, op):
|
||||
if name in nodesToKeep:
|
||||
return False
|
||||
return op == 'Const' or name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \
|
||||
(name.startswith('CropAndResize') and op != 'CropAndResize')
|
||||
|
||||
# Fuse atrous convolutions (with dilations).
|
||||
nodesMap = {node.name: node for node in graph_def.node}
|
||||
for node in reversed(graph_def.node):
|
||||
if node.op == 'BatchToSpaceND':
|
||||
del node.input[2]
|
||||
conv = nodesMap[node.input[0]]
|
||||
spaceToBatchND = nodesMap[conv.input[0]]
|
||||
|
||||
# Extract paddings
|
||||
stridedSlice = nodesMap[spaceToBatchND.input[2]]
|
||||
assert(stridedSlice.op == 'StridedSlice')
|
||||
pack = nodesMap[stridedSlice.input[0]]
|
||||
assert(pack.op == 'Pack')
|
||||
|
||||
padNodeH = nodesMap[nodesMap[pack.input[0]].input[0]]
|
||||
padNodeW = nodesMap[nodesMap[pack.input[1]].input[0]]
|
||||
padH = int(padNodeH.attr['value']['tensor'][0]['int_val'][0])
|
||||
padW = int(padNodeW.attr['value']['tensor'][0]['int_val'][0])
|
||||
|
||||
paddingsNode = NodeDef()
|
||||
paddingsNode.name = conv.name + '/paddings'
|
||||
paddingsNode.op = 'Const'
|
||||
paddingsNode.addAttr('value', [padH, padH, padW, padW])
|
||||
graph_def.node.insert(graph_def.node.index(spaceToBatchND), paddingsNode)
|
||||
nodesToKeep.append(paddingsNode.name)
|
||||
|
||||
spaceToBatchND.input[2] = paddingsNode.name
|
||||
|
||||
|
||||
removeUnusedNodesAndAttrs(to_remove, graph_def)
|
||||
|
||||
|
||||
# Connect input node to the first layer
|
||||
assert(graph_def.node[0].op == 'Placeholder')
|
||||
graph_def.node[1].input.insert(0, graph_def.node[0].name)
|
||||
|
||||
# Temporarily remove top nodes.
|
||||
topNodes = []
|
||||
while True:
|
||||
node = graph_def.node.pop()
|
||||
topNodes.append(node)
|
||||
if node.op == 'CropAndResize':
|
||||
break
|
||||
|
||||
addReshape('FirstStageBoxPredictor/ClassPredictor/BiasAdd',
|
||||
'FirstStageBoxPredictor/ClassPredictor/reshape_1', [0, -1, 2], graph_def)
|
||||
|
||||
addSoftMax('FirstStageBoxPredictor/ClassPredictor/reshape_1',
|
||||
'FirstStageBoxPredictor/ClassPredictor/softmax', graph_def) # Compare with Reshape_4
|
||||
|
||||
addFlatten('FirstStageBoxPredictor/ClassPredictor/softmax',
|
||||
'FirstStageBoxPredictor/ClassPredictor/softmax/flatten', graph_def)
|
||||
|
||||
# Compare with FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd
|
||||
addFlatten('FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd',
|
||||
'FirstStageBoxPredictor/BoxEncodingPredictor/flatten', graph_def)
|
||||
|
||||
proposals = NodeDef()
|
||||
proposals.name = 'proposals' # Compare with ClipToWindow/Gather/Gather (NOTE: normalized)
|
||||
proposals.op = 'PriorBox'
|
||||
proposals.input.append('FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd')
|
||||
proposals.input.append(graph_def.node[0].name) # image_tensor
|
||||
|
||||
proposals.addAttr('flip', False)
|
||||
proposals.addAttr('clip', True)
|
||||
proposals.addAttr('step', features_stride)
|
||||
proposals.addAttr('offset', 0.0)
|
||||
proposals.addAttr('variance', [0.1, 0.1, 0.2, 0.2])
|
||||
|
||||
widths = []
|
||||
heights = []
|
||||
for a in aspect_ratios:
|
||||
for s in scales:
|
||||
ar = np.sqrt(a)
|
||||
heights.append((height_stride**2) * s / ar)
|
||||
widths.append((width_stride**2) * s * ar)
|
||||
|
||||
proposals.addAttr('width', widths)
|
||||
proposals.addAttr('height', heights)
|
||||
|
||||
graph_def.node.extend([proposals])
|
||||
|
||||
# Compare with Reshape_5
|
||||
detectionOut = NodeDef()
|
||||
detectionOut.name = 'detection_out'
|
||||
detectionOut.op = 'DetectionOutput'
|
||||
|
||||
detectionOut.input.append('FirstStageBoxPredictor/BoxEncodingPredictor/flatten')
|
||||
detectionOut.input.append('FirstStageBoxPredictor/ClassPredictor/softmax/flatten')
|
||||
detectionOut.input.append('proposals')
|
||||
|
||||
detectionOut.addAttr('num_classes', 2)
|
||||
detectionOut.addAttr('share_location', True)
|
||||
detectionOut.addAttr('background_label_id', 0)
|
||||
detectionOut.addAttr('nms_threshold', first_stage_nms_iou_threshold)
|
||||
detectionOut.addAttr('top_k', 6000)
|
||||
detectionOut.addAttr('code_type', "CENTER_SIZE")
|
||||
detectionOut.addAttr('keep_top_k', first_stage_max_proposals)
|
||||
detectionOut.addAttr('clip', False)
|
||||
|
||||
graph_def.node.extend([detectionOut])
|
||||
|
||||
addConstNode('clip_by_value/lower', [0.0], graph_def)
|
||||
addConstNode('clip_by_value/upper', [1.0], graph_def)
|
||||
|
||||
clipByValueNode = NodeDef()
|
||||
clipByValueNode.name = 'detection_out/clip_by_value'
|
||||
clipByValueNode.op = 'ClipByValue'
|
||||
clipByValueNode.input.append('detection_out')
|
||||
clipByValueNode.input.append('clip_by_value/lower')
|
||||
clipByValueNode.input.append('clip_by_value/upper')
|
||||
graph_def.node.extend([clipByValueNode])
|
||||
|
||||
# Save as text.
|
||||
for node in reversed(topNodes):
|
||||
graph_def.node.extend([node])
|
||||
|
||||
addSoftMax('SecondStageBoxPredictor/Reshape_1', 'SecondStageBoxPredictor/Reshape_1/softmax', graph_def)
|
||||
|
||||
addSlice('SecondStageBoxPredictor/Reshape_1/softmax',
|
||||
'SecondStageBoxPredictor/Reshape_1/slice',
|
||||
[0, 0, 1], [-1, -1, -1], graph_def)
|
||||
|
||||
addReshape('SecondStageBoxPredictor/Reshape_1/slice',
|
||||
'SecondStageBoxPredictor/Reshape_1/Reshape', [1, -1], graph_def)
|
||||
|
||||
# Replace Flatten subgraph onto a single node.
|
||||
cropAndResizeNodeName = ''
|
||||
for i in reversed(range(len(graph_def.node))):
|
||||
if graph_def.node[i].op == 'CropAndResize':
|
||||
graph_def.node[i].input.insert(1, 'detection_out/clip_by_value')
|
||||
cropAndResizeNodeName = graph_def.node[i].name
|
||||
|
||||
if graph_def.node[i].name == 'SecondStageBoxPredictor/Reshape':
|
||||
addConstNode('SecondStageBoxPredictor/Reshape/shape2', [1, -1, 4], graph_def)
|
||||
|
||||
graph_def.node[i].input.pop()
|
||||
graph_def.node[i].input.append('SecondStageBoxPredictor/Reshape/shape2')
|
||||
|
||||
if graph_def.node[i].name in ['SecondStageBoxPredictor/Flatten/flatten/Shape',
|
||||
'SecondStageBoxPredictor/Flatten/flatten/strided_slice',
|
||||
'SecondStageBoxPredictor/Flatten/flatten/Reshape/shape',
|
||||
'SecondStageBoxPredictor/Flatten_1/flatten/Shape',
|
||||
'SecondStageBoxPredictor/Flatten_1/flatten/strided_slice',
|
||||
'SecondStageBoxPredictor/Flatten_1/flatten/Reshape/shape']:
|
||||
del graph_def.node[i]
|
||||
|
||||
for node in graph_def.node:
|
||||
if node.name == 'SecondStageBoxPredictor/Flatten/flatten/Reshape' or \
|
||||
node.name == 'SecondStageBoxPredictor/Flatten_1/flatten/Reshape':
|
||||
node.op = 'Flatten'
|
||||
node.input.pop()
|
||||
|
||||
if node.name in ['FirstStageBoxPredictor/BoxEncodingPredictor/Conv2D',
|
||||
'SecondStageBoxPredictor/BoxEncodingPredictor/MatMul']:
|
||||
node.addAttr('loc_pred_transposed', True)
|
||||
|
||||
if node.name.startswith('MaxPool2D'):
|
||||
assert(node.op == 'MaxPool')
|
||||
assert(cropAndResizeNodeName)
|
||||
node.input = [cropAndResizeNodeName]
|
||||
|
||||
################################################################################
|
||||
### Postprocessing
|
||||
################################################################################
|
||||
addSlice('detection_out/clip_by_value', 'detection_out/slice', [0, 0, 0, 3], [-1, -1, -1, 4], graph_def)
|
||||
|
||||
variance = NodeDef()
|
||||
variance.name = 'proposals/variance'
|
||||
variance.op = 'Const'
|
||||
variance.addAttr('value', [0.1, 0.1, 0.2, 0.2])
|
||||
graph_def.node.extend([variance])
|
||||
|
||||
varianceEncoder = NodeDef()
|
||||
varianceEncoder.name = 'variance_encoded'
|
||||
varianceEncoder.op = 'Mul'
|
||||
varianceEncoder.input.append('SecondStageBoxPredictor/Reshape')
|
||||
varianceEncoder.input.append(variance.name)
|
||||
varianceEncoder.addAttr('axis', 2)
|
||||
graph_def.node.extend([varianceEncoder])
|
||||
|
||||
addReshape('detection_out/slice', 'detection_out/slice/reshape', [1, 1, -1], graph_def)
|
||||
addFlatten('variance_encoded', 'variance_encoded/flatten', graph_def)
|
||||
|
||||
detectionOut = NodeDef()
|
||||
detectionOut.name = 'detection_out_final'
|
||||
detectionOut.op = 'DetectionOutput'
|
||||
|
||||
detectionOut.input.append('variance_encoded/flatten')
|
||||
detectionOut.input.append('SecondStageBoxPredictor/Reshape_1/Reshape')
|
||||
detectionOut.input.append('detection_out/slice/reshape')
|
||||
|
||||
detectionOut.addAttr('num_classes', num_classes)
|
||||
detectionOut.addAttr('share_location', False)
|
||||
detectionOut.addAttr('background_label_id', num_classes + 1)
|
||||
detectionOut.addAttr('nms_threshold', 0.6)
|
||||
detectionOut.addAttr('code_type', "CENTER_SIZE")
|
||||
detectionOut.addAttr('keep_top_k', 100)
|
||||
detectionOut.addAttr('clip', True)
|
||||
detectionOut.addAttr('variance_encoded_in_target', True)
|
||||
graph_def.node.extend([detectionOut])
|
||||
|
||||
def getUnconnectedNodes():
|
||||
unconnected = [node.name for node in graph_def.node]
|
||||
for node in graph_def.node:
|
||||
for inp in node.input:
|
||||
if inp in unconnected:
|
||||
unconnected.remove(inp)
|
||||
return unconnected
|
||||
|
||||
while True:
|
||||
unconnectedNodes = getUnconnectedNodes()
|
||||
unconnectedNodes.remove(detectionOut.name)
|
||||
if not unconnectedNodes:
|
||||
break
|
||||
|
||||
for name in unconnectedNodes:
|
||||
for i in range(len(graph_def.node)):
|
||||
if graph_def.node[i].name == name:
|
||||
del graph_def.node[i]
|
||||
break
|
||||
|
||||
# Save as text.
|
||||
graph_def.save(outputPath)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='Run this script to get a text graph of '
|
||||
'Faster-RCNN model from TensorFlow Object Detection API. '
|
||||
'Then pass it with .pb file to cv::dnn::readNetFromTensorflow function.')
|
||||
parser.add_argument('--input', required=True, help='Path to frozen TensorFlow graph.')
|
||||
parser.add_argument('--output', required=True, help='Path to output text graph.')
|
||||
parser.add_argument('--config', required=True, help='Path to a *.config file is used for training.')
|
||||
args = parser.parse_args()
|
||||
|
||||
createFasterRCNNGraph(args.input, args.config, args.output)
|
297
3rdparty/opencv-4.5.4/samples/dnn/tf_text_graph_mask_rcnn.py
vendored
Normal file
@ -0,0 +1,297 @@
import argparse
|
||||
import numpy as np
|
||||
from tf_text_graph_common import *
|
||||
|
||||
parser = argparse.ArgumentParser(description='Run this script to get a text graph of '
|
||||
'Mask-RCNN model from TensorFlow Object Detection API. '
|
||||
'Then pass it with .pb file to cv::dnn::readNetFromTensorflow function.')
|
||||
parser.add_argument('--input', required=True, help='Path to frozen TensorFlow graph.')
|
||||
parser.add_argument('--output', required=True, help='Path to output text graph.')
|
||||
parser.add_argument('--config', required=True, help='Path to a *.config file is used for training.')
|
||||
args = parser.parse_args()
|
||||
|
||||
scopesToKeep = ('FirstStageFeatureExtractor', 'Conv',
|
||||
'FirstStageBoxPredictor/BoxEncodingPredictor',
|
||||
'FirstStageBoxPredictor/ClassPredictor',
|
||||
'CropAndResize',
|
||||
'MaxPool2D',
|
||||
'SecondStageFeatureExtractor',
|
||||
'SecondStageBoxPredictor',
|
||||
'Preprocessor/sub',
|
||||
'Preprocessor/mul',
|
||||
'image_tensor')
|
||||
|
||||
scopesToIgnore = ('FirstStageFeatureExtractor/Assert',
|
||||
'FirstStageFeatureExtractor/Shape',
|
||||
'FirstStageFeatureExtractor/strided_slice',
|
||||
'FirstStageFeatureExtractor/GreaterEqual',
|
||||
'FirstStageFeatureExtractor/LogicalAnd',
|
||||
'Conv/required_space_to_batch_paddings')
|
||||
|
||||
# Load a config file.
|
||||
config = readTextMessage(args.config)
|
||||
config = config['model'][0]['faster_rcnn'][0]
|
||||
num_classes = int(config['num_classes'][0])
|
||||
|
||||
grid_anchor_generator = config['first_stage_anchor_generator'][0]['grid_anchor_generator'][0]
|
||||
scales = [float(s) for s in grid_anchor_generator['scales']]
|
||||
aspect_ratios = [float(ar) for ar in grid_anchor_generator['aspect_ratios']]
|
||||
width_stride = float(grid_anchor_generator['width_stride'][0])
|
||||
height_stride = float(grid_anchor_generator['height_stride'][0])
|
||||
features_stride = float(config['feature_extractor'][0]['first_stage_features_stride'][0])
|
||||
first_stage_nms_iou_threshold = float(config['first_stage_nms_iou_threshold'][0])
|
||||
first_stage_max_proposals = int(config['first_stage_max_proposals'][0])
|
||||
|
||||
print('Number of classes: %d' % num_classes)
|
||||
print('Scales: %s' % str(scales))
|
||||
print('Aspect ratios: %s' % str(aspect_ratios))
|
||||
print('Width stride: %f' % width_stride)
|
||||
print('Height stride: %f' % height_stride)
|
||||
print('Features stride: %f' % features_stride)
|
||||
|
||||
# Read the graph.
|
||||
writeTextGraph(args.input, args.output, ['num_detections', 'detection_scores', 'detection_boxes', 'detection_classes', 'detection_masks'])
|
||||
graph_def = parseTextGraph(args.output)
|
||||
|
||||
removeIdentity(graph_def)
|
||||
|
||||
nodesToKeep = []
|
||||
def to_remove(name, op):
|
||||
if name in nodesToKeep:
|
||||
return False
|
||||
return op == 'Const' or name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \
|
||||
(name.startswith('CropAndResize') and op != 'CropAndResize')
|
||||
|
||||
# Fuse atrous convolutions (with dilations).
|
||||
nodesMap = {node.name: node for node in graph_def.node}
|
||||
for node in reversed(graph_def.node):
|
||||
if node.op == 'BatchToSpaceND':
|
||||
del node.input[2]
|
||||
conv = nodesMap[node.input[0]]
|
||||
spaceToBatchND = nodesMap[conv.input[0]]
|
||||
|
||||
paddingsNode = NodeDef()
|
||||
paddingsNode.name = conv.name + '/paddings'
|
||||
paddingsNode.op = 'Const'
|
||||
paddingsNode.addAttr('value', [2, 2, 2, 2])
|
||||
graph_def.node.insert(graph_def.node.index(spaceToBatchND), paddingsNode)
|
||||
nodesToKeep.append(paddingsNode.name)
|
||||
|
||||
spaceToBatchND.input[2] = paddingsNode.name
|
||||
|
||||
removeUnusedNodesAndAttrs(to_remove, graph_def)
|
||||
|
||||
|
||||
# Connect input node to the first layer
|
||||
assert(graph_def.node[0].op == 'Placeholder')
|
||||
graph_def.node[1].input.insert(0, graph_def.node[0].name)
|
||||
|
||||
# Temporarily remove top nodes.
|
||||
topNodes = []
|
||||
numCropAndResize = 0
|
||||
while True:
|
||||
node = graph_def.node.pop()
|
||||
topNodes.append(node)
|
||||
if node.op == 'CropAndResize':
|
||||
numCropAndResize += 1
|
||||
if numCropAndResize == 2:
|
||||
break
|
||||
|
||||
addReshape('FirstStageBoxPredictor/ClassPredictor/BiasAdd',
|
||||
'FirstStageBoxPredictor/ClassPredictor/reshape_1', [0, -1, 2], graph_def)
|
||||
|
||||
addSoftMax('FirstStageBoxPredictor/ClassPredictor/reshape_1',
|
||||
'FirstStageBoxPredictor/ClassPredictor/softmax', graph_def) # Compare with Reshape_4
|
||||
|
||||
addFlatten('FirstStageBoxPredictor/ClassPredictor/softmax',
|
||||
'FirstStageBoxPredictor/ClassPredictor/softmax/flatten', graph_def)
|
||||
|
||||
# Compare with FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd
|
||||
addFlatten('FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd',
|
||||
'FirstStageBoxPredictor/BoxEncodingPredictor/flatten', graph_def)
|
||||
|
||||
proposals = NodeDef()
|
||||
proposals.name = 'proposals' # Compare with ClipToWindow/Gather/Gather (NOTE: normalized)
|
||||
proposals.op = 'PriorBox'
|
||||
proposals.input.append('FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd')
|
||||
proposals.input.append(graph_def.node[0].name) # image_tensor
|
||||
|
||||
proposals.addAttr('flip', False)
|
||||
proposals.addAttr('clip', True)
|
||||
proposals.addAttr('step', features_stride)
|
||||
proposals.addAttr('offset', 0.0)
|
||||
proposals.addAttr('variance', [0.1, 0.1, 0.2, 0.2])
|
||||
|
||||
widths = []
|
||||
heights = []
|
||||
for a in aspect_ratios:
|
||||
for s in scales:
|
||||
ar = np.sqrt(a)
|
||||
heights.append((height_stride**2) * s / ar)
|
||||
widths.append((width_stride**2) * s * ar)
|
||||
|
||||
proposals.addAttr('width', widths)
|
||||
proposals.addAttr('height', heights)
|
||||
|
||||
graph_def.node.extend([proposals])
|
||||
|
||||
# Compare with Reshape_5
|
||||
detectionOut = NodeDef()
|
||||
detectionOut.name = 'detection_out'
|
||||
detectionOut.op = 'DetectionOutput'
|
||||
|
||||
detectionOut.input.append('FirstStageBoxPredictor/BoxEncodingPredictor/flatten')
|
||||
detectionOut.input.append('FirstStageBoxPredictor/ClassPredictor/softmax/flatten')
|
||||
detectionOut.input.append('proposals')
|
||||
|
||||
detectionOut.addAttr('num_classes', 2)
|
||||
detectionOut.addAttr('share_location', True)
|
||||
detectionOut.addAttr('background_label_id', 0)
|
||||
detectionOut.addAttr('nms_threshold', first_stage_nms_iou_threshold)
|
||||
detectionOut.addAttr('top_k', 6000)
|
||||
detectionOut.addAttr('code_type', "CENTER_SIZE")
|
||||
detectionOut.addAttr('keep_top_k', first_stage_max_proposals)
|
||||
detectionOut.addAttr('clip', True)
|
||||
|
||||
graph_def.node.extend([detectionOut])
|
||||
|
||||
# Save as text.
|
||||
cropAndResizeNodesNames = []
|
||||
for node in reversed(topNodes):
|
||||
if node.op != 'CropAndResize':
|
||||
graph_def.node.extend([node])
|
||||
topNodes.pop()
|
||||
else:
|
||||
cropAndResizeNodesNames.append(node.name)
|
||||
if numCropAndResize == 1:
|
||||
break
|
||||
else:
|
||||
graph_def.node.extend([node])
|
||||
topNodes.pop()
|
||||
numCropAndResize -= 1
|
||||
|
||||
addSoftMax('SecondStageBoxPredictor/Reshape_1', 'SecondStageBoxPredictor/Reshape_1/softmax', graph_def)
|
||||
|
||||
addSlice('SecondStageBoxPredictor/Reshape_1/softmax',
|
||||
'SecondStageBoxPredictor/Reshape_1/slice',
|
||||
[0, 0, 1], [-1, -1, -1], graph_def)
|
||||
|
||||
addReshape('SecondStageBoxPredictor/Reshape_1/slice',
|
||||
'SecondStageBoxPredictor/Reshape_1/Reshape', [1, -1], graph_def)
|
||||
|
||||
# Replace Flatten subgraph onto a single node.
|
||||
for i in reversed(range(len(graph_def.node))):
|
||||
if graph_def.node[i].op == 'CropAndResize':
|
||||
graph_def.node[i].input.insert(1, 'detection_out')
|
||||
|
||||
if graph_def.node[i].name == 'SecondStageBoxPredictor/Reshape':
|
||||
addConstNode('SecondStageBoxPredictor/Reshape/shape2', [1, -1, 4], graph_def)
|
||||
|
||||
graph_def.node[i].input.pop()
|
||||
graph_def.node[i].input.append('SecondStageBoxPredictor/Reshape/shape2')
|
||||
|
||||
if graph_def.node[i].name in ['SecondStageBoxPredictor/Flatten/flatten/Shape',
|
||||
'SecondStageBoxPredictor/Flatten/flatten/strided_slice',
|
||||
'SecondStageBoxPredictor/Flatten/flatten/Reshape/shape',
|
||||
'SecondStageBoxPredictor/Flatten_1/flatten/Shape',
|
||||
'SecondStageBoxPredictor/Flatten_1/flatten/strided_slice',
|
||||
'SecondStageBoxPredictor/Flatten_1/flatten/Reshape/shape']:
|
||||
del graph_def.node[i]
|
||||
|
||||
for node in graph_def.node:
|
||||
if node.name == 'SecondStageBoxPredictor/Flatten/flatten/Reshape' or \
|
||||
node.name == 'SecondStageBoxPredictor/Flatten_1/flatten/Reshape':
|
||||
node.op = 'Flatten'
|
||||
node.input.pop()
|
||||
|
||||
if node.name in ['FirstStageBoxPredictor/BoxEncodingPredictor/Conv2D',
|
||||
'SecondStageBoxPredictor/BoxEncodingPredictor/MatMul']:
|
||||
node.addAttr('loc_pred_transposed', True)
|
||||
|
||||
if node.name.startswith('MaxPool2D'):
|
||||
assert(node.op == 'MaxPool')
|
||||
assert(len(cropAndResizeNodesNames) == 2)
|
||||
node.input = [cropAndResizeNodesNames[0]]
|
||||
del cropAndResizeNodesNames[0]
|
||||
|
||||
################################################################################
|
||||
### Postprocessing
|
||||
################################################################################
|
||||
addSlice('detection_out', 'detection_out/slice', [0, 0, 0, 3], [-1, -1, -1, 4], graph_def)
|
||||
|
||||
variance = NodeDef()
|
||||
variance.name = 'proposals/variance'
|
||||
variance.op = 'Const'
|
||||
variance.addAttr('value', [0.1, 0.1, 0.2, 0.2])
|
||||
graph_def.node.extend([variance])
|
||||
|
||||
varianceEncoder = NodeDef()
|
||||
varianceEncoder.name = 'variance_encoded'
|
||||
varianceEncoder.op = 'Mul'
|
||||
varianceEncoder.input.append('SecondStageBoxPredictor/Reshape')
|
||||
varianceEncoder.input.append(variance.name)
|
||||
varianceEncoder.addAttr('axis', 2)
|
||||
graph_def.node.extend([varianceEncoder])
|
||||
|
||||
addReshape('detection_out/slice', 'detection_out/slice/reshape', [1, 1, -1], graph_def)
|
||||
addFlatten('variance_encoded', 'variance_encoded/flatten', graph_def)
|
||||
|
||||
detectionOut = NodeDef()
|
||||
detectionOut.name = 'detection_out_final'
|
||||
detectionOut.op = 'DetectionOutput'
|
||||
|
||||
detectionOut.input.append('variance_encoded/flatten')
|
||||
detectionOut.input.append('SecondStageBoxPredictor/Reshape_1/Reshape')
|
||||
detectionOut.input.append('detection_out/slice/reshape')
|
||||
|
||||
detectionOut.addAttr('num_classes', num_classes)
|
||||
detectionOut.addAttr('share_location', False)
|
||||
detectionOut.addAttr('background_label_id', num_classes + 1)
|
||||
detectionOut.addAttr('nms_threshold', 0.6)
|
||||
detectionOut.addAttr('code_type', "CENTER_SIZE")
|
||||
detectionOut.addAttr('keep_top_k',100)
|
||||
detectionOut.addAttr('clip', True)
|
||||
detectionOut.addAttr('variance_encoded_in_target', True)
|
||||
detectionOut.addAttr('confidence_threshold', 0.3)
|
||||
detectionOut.addAttr('group_by_classes', False)
|
||||
graph_def.node.extend([detectionOut])
|
||||
|
||||
for node in reversed(topNodes):
|
||||
graph_def.node.extend([node])
|
||||
|
||||
if node.name.startswith('MaxPool2D'):
|
||||
assert(node.op == 'MaxPool')
|
||||
assert(len(cropAndResizeNodesNames) == 1)
|
||||
node.input = [cropAndResizeNodesNames[0]]
|
||||
|
||||
for i in reversed(range(len(graph_def.node))):
|
||||
if graph_def.node[i].op == 'CropAndResize':
|
||||
graph_def.node[i].input.insert(1, 'detection_out_final')
|
||||
break
|
||||
|
||||
graph_def.node[-1].name = 'detection_masks'
|
||||
graph_def.node[-1].op = 'Sigmoid'
|
||||
graph_def.node[-1].input.pop()
|
||||
|
||||
def getUnconnectedNodes():
|
||||
unconnected = [node.name for node in graph_def.node]
|
||||
for node in graph_def.node:
|
||||
for inp in node.input:
|
||||
if inp in unconnected:
|
||||
unconnected.remove(inp)
|
||||
return unconnected
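# A node is considered unconnected when nothing consumes its output; the loop below
# repeatedly prunes such nodes until only the graph feeding the final output remains.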
|
||||
|
||||
while True:
|
||||
unconnectedNodes = getUnconnectedNodes()
|
||||
unconnectedNodes.remove(graph_def.node[-1].name)
|
||||
if not unconnectedNodes:
|
||||
break
|
||||
|
||||
for name in unconnectedNodes:
|
||||
for i in range(len(graph_def.node)):
|
||||
if graph_def.node[i].name == name:
|
||||
del graph_def.node[i]
|
||||
break
|
||||
|
||||
# Save as text.
|
||||
graph_def.save(args.output)
|
413
3rdparty/opencv-4.5.4/samples/dnn/tf_text_graph_ssd.py
vendored
Normal file
@ -0,0 +1,413 @@
|
||||
# This file is a part of OpenCV project.
|
||||
# It is a subject to the license terms in the LICENSE file found in the top-level directory
|
||||
# of this distribution and at http://opencv.org/license.html.
|
||||
#
|
||||
# Copyright (C) 2018, Intel Corporation, all rights reserved.
|
||||
# Third party copyrights are property of their respective owners.
|
||||
#
|
||||
# Use this script to get the text graph representation (.pbtxt) of SSD-based
|
||||
# deep learning network trained in TensorFlow Object Detection API.
|
||||
# Then you can import it with a binary frozen graph (.pb) using readNetFromTensorflow() function.
|
||||
# See details and examples on the following wiki page: https://github.com/opencv/opencv/wiki/TensorFlow-Object-Detection-API
|
||||
import argparse
|
||||
import re
|
||||
from math import sqrt
|
||||
from tf_text_graph_common import *
|
||||
|
||||
class SSDAnchorGenerator:
|
||||
def __init__(self, min_scale, max_scale, num_layers, aspect_ratios,
|
||||
reduce_boxes_in_lowest_layer, image_width, image_height):
|
||||
self.min_scale = min_scale
|
||||
self.aspect_ratios = aspect_ratios
|
||||
self.reduce_boxes_in_lowest_layer = reduce_boxes_in_lowest_layer
|
||||
self.image_width = image_width
|
||||
self.image_height = image_height
|
||||
self.scales = [min_scale + (max_scale - min_scale) * i / (num_layers - 1)
|
||||
for i in range(num_layers)] + [1.0]
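# Anchor scales are spread linearly between min_scale and max_scale across the feature maps;
# the extra 1.0 is only used by get() to build the interpolated sqrt(s_k * s_{k+1}) box.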
|
||||
|
||||
def get(self, layer_id):
|
||||
if layer_id == 0 and self.reduce_boxes_in_lowest_layer:
|
||||
widths = [0.1, self.min_scale * sqrt(2.0), self.min_scale * sqrt(0.5)]
|
||||
heights = [0.1, self.min_scale / sqrt(2.0), self.min_scale / sqrt(0.5)]
|
||||
else:
|
||||
widths = [self.scales[layer_id] * sqrt(ar) for ar in self.aspect_ratios]
|
||||
heights = [self.scales[layer_id] / sqrt(ar) for ar in self.aspect_ratios]
|
||||
|
||||
widths += [sqrt(self.scales[layer_id] * self.scales[layer_id + 1])]
|
||||
heights += [sqrt(self.scales[layer_id] * self.scales[layer_id + 1])]
|
||||
min_size = min(self.image_width, self.image_height)
|
||||
widths = [w * min_size for w in widths]
|
||||
heights = [h * min_size for h in heights]
|
||||
return widths, heights
|
||||
|
||||
|
||||
class MultiscaleAnchorGenerator:
|
||||
def __init__(self, min_level, aspect_ratios, scales_per_octave, anchor_scale):
|
||||
self.min_level = min_level
|
||||
self.aspect_ratios = aspect_ratios
|
||||
self.anchor_scale = anchor_scale
|
||||
self.scales = [2**(float(s) / scales_per_octave) for s in range(scales_per_octave)]
|
||||
|
||||
def get(self, layer_id):
|
||||
widths = []
|
||||
heights = []
|
||||
for a in self.aspect_ratios:
|
||||
for s in self.scales:
|
||||
base_anchor_size = 2**(self.min_level + layer_id) * self.anchor_scale
|
||||
ar = sqrt(a)
|
||||
heights.append(base_anchor_size * s / ar)
|
||||
widths.append(base_anchor_size * s * ar)
|
||||
return widths, heights
|
||||
|
||||
|
||||
def createSSDGraph(modelPath, configPath, outputPath):
|
||||
# Nodes that should be kept.
|
||||
keepOps = ['Conv2D', 'BiasAdd', 'Add', 'AddV2', 'Relu', 'Relu6', 'Placeholder', 'FusedBatchNorm',
|
||||
'DepthwiseConv2dNative', 'ConcatV2', 'Mul', 'MaxPool', 'AvgPool', 'Identity',
|
||||
'Sub', 'ResizeNearestNeighbor', 'Pad', 'FusedBatchNormV3', 'Mean']
|
||||
|
||||
# Nodes whose names start with these prefixes will be removed
|
||||
prefixesToRemove = ('MultipleGridAnchorGenerator/', 'Concatenate/', 'Postprocessor/', 'Preprocessor/map')
|
||||
|
||||
# Load a config file.
|
||||
config = readTextMessage(configPath)
|
||||
config = config['model'][0]['ssd'][0]
|
||||
num_classes = int(config['num_classes'][0])
|
||||
|
||||
fixed_shape_resizer = config['image_resizer'][0]['fixed_shape_resizer'][0]
|
||||
image_width = int(fixed_shape_resizer['width'][0])
|
||||
image_height = int(fixed_shape_resizer['height'][0])
|
||||
|
||||
box_predictor = 'convolutional' if 'convolutional_box_predictor' in config['box_predictor'][0] else 'weight_shared_convolutional'
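# TF Object Detection API SSDs use either per-layer 'convolutional' box predictors
# (BoxPredictor_%d/...) or a shared 'weight_shared_convolutional' head
# (WeightSharedConvolutionalBoxPredictor...); the node names used below depend on this choice.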
|
||||
|
||||
anchor_generator = config['anchor_generator'][0]
|
||||
if 'ssd_anchor_generator' in anchor_generator:
|
||||
ssd_anchor_generator = anchor_generator['ssd_anchor_generator'][0]
|
||||
min_scale = float(ssd_anchor_generator['min_scale'][0])
|
||||
max_scale = float(ssd_anchor_generator['max_scale'][0])
|
||||
num_layers = int(ssd_anchor_generator['num_layers'][0])
|
||||
aspect_ratios = [float(ar) for ar in ssd_anchor_generator['aspect_ratios']]
|
||||
reduce_boxes_in_lowest_layer = True
|
||||
if 'reduce_boxes_in_lowest_layer' in ssd_anchor_generator:
|
||||
reduce_boxes_in_lowest_layer = ssd_anchor_generator['reduce_boxes_in_lowest_layer'][0] == 'true'
|
||||
priors_generator = SSDAnchorGenerator(min_scale, max_scale, num_layers,
|
||||
aspect_ratios, reduce_boxes_in_lowest_layer,
|
||||
image_width, image_height)
|
||||
|
||||
|
||||
print('Scale: [%f-%f]' % (min_scale, max_scale))
|
||||
print('Aspect ratios: %s' % str(aspect_ratios))
|
||||
print('Reduce boxes in the lowest layer: %s' % str(reduce_boxes_in_lowest_layer))
|
||||
elif 'multiscale_anchor_generator' in anchor_generator:
|
||||
multiscale_anchor_generator = anchor_generator['multiscale_anchor_generator'][0]
|
||||
min_level = int(multiscale_anchor_generator['min_level'][0])
|
||||
max_level = int(multiscale_anchor_generator['max_level'][0])
|
||||
anchor_scale = float(multiscale_anchor_generator['anchor_scale'][0])
|
||||
aspect_ratios = [float(ar) for ar in multiscale_anchor_generator['aspect_ratios']]
|
||||
scales_per_octave = int(multiscale_anchor_generator['scales_per_octave'][0])
|
||||
num_layers = max_level - min_level + 1
|
||||
priors_generator = MultiscaleAnchorGenerator(min_level, aspect_ratios,
|
||||
scales_per_octave, anchor_scale)
|
||||
print('Levels: [%d-%d]' % (min_level, max_level))
|
||||
print('Anchor scale: %f' % anchor_scale)
|
||||
print('Scales per octave: %d' % scales_per_octave)
|
||||
print('Aspect ratios: %s' % str(aspect_ratios))
|
||||
else:
|
||||
print('Unknown anchor_generator')
|
||||
exit(0)
|
||||
|
||||
print('Number of classes: %d' % num_classes)
|
||||
print('Number of layers: %d' % num_layers)
|
||||
print('box predictor: %s' % box_predictor)
|
||||
print('Input image size: %dx%d' % (image_width, image_height))
|
||||
|
||||
# Read the graph.
|
||||
outNames = ['num_detections', 'detection_scores', 'detection_boxes', 'detection_classes']
|
||||
|
||||
writeTextGraph(modelPath, outputPath, outNames)
|
||||
graph_def = parseTextGraph(outputPath)
|
||||
|
||||
def getUnconnectedNodes():
|
||||
unconnected = []
|
||||
for node in graph_def.node:
|
||||
unconnected.append(node.name)
|
||||
for inp in node.input:
|
||||
if inp in unconnected:
|
||||
unconnected.remove(inp)
|
||||
return unconnected
|
||||
|
||||
|
||||
def fuse_nodes(nodesToKeep):
|
||||
# Detect unfused batch normalization nodes and fuse them.
|
||||
# Add_0 <-- moving_variance, add_y
|
||||
# Rsqrt <-- Add_0
|
||||
# Mul_0 <-- Rsqrt, gamma
|
||||
# Mul_1 <-- input, Mul_0
|
||||
# Mul_2 <-- moving_mean, Mul_0
|
||||
# Sub_0 <-- beta, Mul_2
|
||||
# Add_1 <-- Mul_1, Sub_0
|
||||
nodesMap = {node.name: node for node in graph_def.node}
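# Each subgraph pattern below is a nested list [op, input_0, input_1, ...]: nested lists are
# matched recursively by checkSubgraph(), while plain strings capture the names of the matched
# inputs into the 'inputs' dict.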
|
||||
subgraphBatchNorm = ['Add',
|
||||
['Mul', 'input', ['Mul', ['Rsqrt', ['Add', 'moving_variance', 'add_y']], 'gamma']],
|
||||
['Sub', 'beta', ['Mul', 'moving_mean', 'Mul_0']]]
|
||||
subgraphBatchNormV2 = ['AddV2',
|
||||
['Mul', 'input', ['Mul', ['Rsqrt', ['AddV2', 'moving_variance', 'add_y']], 'gamma']],
|
||||
['Sub', 'beta', ['Mul', 'moving_mean', 'Mul_0']]]
|
||||
# Detect unfused nearest neighbor resize.
|
||||
subgraphResizeNN = ['Reshape',
|
||||
['Mul', ['Reshape', 'input', ['Pack', 'shape_1', 'shape_2', 'shape_3', 'shape_4', 'shape_5']],
|
||||
'ones'],
|
||||
['Pack', ['StridedSlice', ['Shape', 'input'], 'stack', 'stack_1', 'stack_2'],
|
||||
'out_height', 'out_width', 'out_channels']]
|
||||
def checkSubgraph(node, targetNode, inputs, fusedNodes):
|
||||
op = targetNode[0]
|
||||
if node.op == op and (len(node.input) >= len(targetNode) - 1):
|
||||
fusedNodes.append(node)
|
||||
for i, inpOp in enumerate(targetNode[1:]):
|
||||
if isinstance(inpOp, list):
|
||||
if node.input[i] not in nodesMap or \
|
||||
not checkSubgraph(nodesMap[node.input[i]], inpOp, inputs, fusedNodes):
|
||||
return False
|
||||
else:
|
||||
inputs[inpOp] = node.input[i]
|
||||
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
nodesToRemove = []
|
||||
for node in graph_def.node:
|
||||
inputs = {}
|
||||
fusedNodes = []
|
||||
if checkSubgraph(node, subgraphBatchNorm, inputs, fusedNodes) or \
|
||||
checkSubgraph(node, subgraphBatchNormV2, inputs, fusedNodes):
|
||||
name = node.name
|
||||
node.Clear()
|
||||
node.name = name
|
||||
node.op = 'FusedBatchNorm'
|
||||
node.input.append(inputs['input'])
|
||||
node.input.append(inputs['gamma'])
|
||||
node.input.append(inputs['beta'])
|
||||
node.input.append(inputs['moving_mean'])
|
||||
node.input.append(inputs['moving_variance'])
|
||||
node.addAttr('epsilon', 0.001)
|
||||
nodesToRemove += fusedNodes[1:]
|
||||
|
||||
inputs = {}
|
||||
fusedNodes = []
|
||||
if checkSubgraph(node, subgraphResizeNN, inputs, fusedNodes):
|
||||
name = node.name
|
||||
node.Clear()
|
||||
node.name = name
|
||||
node.op = 'ResizeNearestNeighbor'
|
||||
node.input.append(inputs['input'])
|
||||
node.input.append(name + '/output_shape')
|
||||
|
||||
out_height_node = nodesMap[inputs['out_height']]
|
||||
out_width_node = nodesMap[inputs['out_width']]
|
||||
out_height = int(out_height_node.attr['value']['tensor'][0]['int_val'][0])
|
||||
out_width = int(out_width_node.attr['value']['tensor'][0]['int_val'][0])
|
||||
|
||||
shapeNode = NodeDef()
|
||||
shapeNode.name = name + '/output_shape'
|
||||
shapeNode.op = 'Const'
|
||||
shapeNode.addAttr('value', [out_height, out_width])
|
||||
graph_def.node.insert(graph_def.node.index(node), shapeNode)
|
||||
nodesToKeep.append(shapeNode.name)
|
||||
|
||||
nodesToRemove += fusedNodes[1:]
|
||||
for node in nodesToRemove:
|
||||
graph_def.node.remove(node)
|
||||
|
||||
nodesToKeep = []
|
||||
fuse_nodes(nodesToKeep)
|
||||
|
||||
removeIdentity(graph_def)
|
||||
|
||||
def to_remove(name, op):
|
||||
return (name not in nodesToKeep) and \
|
||||
(op == 'Const' or (op not in keepOps) or name.startswith(prefixesToRemove))
|
||||
|
||||
removeUnusedNodesAndAttrs(to_remove, graph_def)
|
||||
|
||||
|
||||
# Connect input node to the first layer
|
||||
assert(graph_def.node[0].op == 'Placeholder')
|
||||
try:
|
||||
input_shape = graph_def.node[0].attr['shape']['shape'][0]['dim']
|
||||
input_shape[1]['size'] = image_height
|
||||
input_shape[2]['size'] = image_width
|
||||
except:
|
||||
print("Input shapes are undefined")
|
||||
# assert(graph_def.node[1].op == 'Conv2D')
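# With the Preprocessor nodes stripped, rewire the first convolution so it reads the
# input placeholder directly while keeping its weights input.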
|
||||
weights = graph_def.node[1].input[-1]
|
||||
for i in range(len(graph_def.node[1].input)):
|
||||
graph_def.node[1].input.pop()
|
||||
graph_def.node[1].input.append(graph_def.node[0].name)
|
||||
graph_def.node[1].input.append(weights)
|
||||
|
||||
# Check and fix the case when the preprocessing block comes right after the input
|
||||
preproc_id = "Preprocessor/"
|
||||
if graph_def.node[2].name.startswith(preproc_id) and \
|
||||
graph_def.node[2].input[0].startswith(preproc_id):
|
||||
|
||||
if not any(preproc_id in inp for inp in graph_def.node[3].input):
|
||||
graph_def.node[3].input.insert(0, graph_def.node[2].name)
|
||||
|
||||
|
||||
# Create SSD postprocessing head ###############################################
|
||||
|
||||
# Concatenate predictions of classes, predictions of bounding boxes and proposals.
|
||||
def addConcatNode(name, inputs, axisNodeName):
|
||||
concat = NodeDef()
|
||||
concat.name = name
|
||||
concat.op = 'ConcatV2'
|
||||
for inp in inputs:
|
||||
concat.input.append(inp)
|
||||
concat.input.append(axisNodeName)
|
||||
graph_def.node.extend([concat])
|
||||
|
||||
addConstNode('concat/axis_flatten', [-1], graph_def)
|
||||
addConstNode('PriorBox/concat/axis', [-2], graph_def)
|
||||
|
||||
for label in ['ClassPredictor', 'BoxEncodingPredictor' if box_predictor == 'convolutional' else 'BoxPredictor']:
|
||||
concatInputs = []
|
||||
for i in range(num_layers):
|
||||
# Flatten predictions
|
||||
flatten = NodeDef()
|
||||
if box_predictor == 'convolutional':
|
||||
inpName = 'BoxPredictor_%d/%s/BiasAdd' % (i, label)
|
||||
else:
|
||||
if i == 0:
|
||||
inpName = 'WeightSharedConvolutionalBoxPredictor/%s/BiasAdd' % label
|
||||
else:
|
||||
inpName = 'WeightSharedConvolutionalBoxPredictor_%d/%s/BiasAdd' % (i, label)
|
||||
flatten.input.append(inpName)
|
||||
flatten.name = inpName + '/Flatten'
|
||||
flatten.op = 'Flatten'
|
||||
|
||||
concatInputs.append(flatten.name)
|
||||
graph_def.node.extend([flatten])
|
||||
addConcatNode('%s/concat' % label, concatInputs, 'concat/axis_flatten')
|
||||
|
||||
num_matched_layers = 0
|
||||
for node in graph_def.node:
|
||||
if re.match(r'BoxPredictor_\d/BoxEncodingPredictor/convolution', node.name) or \
|
||||
re.match(r'BoxPredictor_\d/BoxEncodingPredictor/Conv2D', node.name) or \
|
||||
re.match(r'WeightSharedConvolutionalBoxPredictor(_\d)*/BoxPredictor/Conv2D', node.name):
|
||||
node.addAttr('loc_pred_transposed', True)
|
||||
num_matched_layers += 1
|
||||
assert(num_matched_layers == num_layers)
|
||||
|
||||
# Add layers that generate anchors (bounding boxes proposals).
|
||||
priorBoxes = []
|
||||
boxCoder = config['box_coder'][0]
|
||||
fasterRcnnBoxCoder = boxCoder['faster_rcnn_box_coder'][0]
|
||||
boxCoderVariance = [1.0/float(fasterRcnnBoxCoder['x_scale'][0]), 1.0/float(fasterRcnnBoxCoder['y_scale'][0]), 1.0/float(fasterRcnnBoxCoder['width_scale'][0]), 1.0/float(fasterRcnnBoxCoder['height_scale'][0])]
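# PriorBox variances are the reciprocals of the box coder scales, matching how
# faster_rcnn_box_coder normalizes the box regression targets.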
|
||||
for i in range(num_layers):
|
||||
priorBox = NodeDef()
|
||||
priorBox.name = 'PriorBox_%d' % i
|
||||
priorBox.op = 'PriorBox'
|
||||
if box_predictor == 'convolutional':
|
||||
priorBox.input.append('BoxPredictor_%d/BoxEncodingPredictor/BiasAdd' % i)
|
||||
else:
|
||||
if i == 0:
|
||||
priorBox.input.append('WeightSharedConvolutionalBoxPredictor/BoxPredictor/Conv2D')
|
||||
else:
|
||||
priorBox.input.append('WeightSharedConvolutionalBoxPredictor_%d/BoxPredictor/BiasAdd' % i)
|
||||
priorBox.input.append(graph_def.node[0].name) # image_tensor
|
||||
|
||||
priorBox.addAttr('flip', False)
|
||||
priorBox.addAttr('clip', False)
|
||||
|
||||
widths, heights = priors_generator.get(i)
|
||||
|
||||
priorBox.addAttr('width', widths)
|
||||
priorBox.addAttr('height', heights)
|
||||
priorBox.addAttr('variance', boxCoderVariance)
|
||||
|
||||
graph_def.node.extend([priorBox])
|
||||
priorBoxes.append(priorBox.name)
|
||||
|
||||
# Compare this layer's output with Postprocessor/Reshape
|
||||
addConcatNode('PriorBox/concat', priorBoxes, 'concat/axis_flatten')
|
||||
|
||||
# Sigmoid for classes predictions and DetectionOutput layer
|
||||
addReshape('ClassPredictor/concat', 'ClassPredictor/concat3d', [0, -1, num_classes + 1], graph_def)
|
||||
|
||||
sigmoid = NodeDef()
|
||||
sigmoid.name = 'ClassPredictor/concat/sigmoid'
|
||||
sigmoid.op = 'Sigmoid'
|
||||
sigmoid.input.append('ClassPredictor/concat3d')
|
||||
graph_def.node.extend([sigmoid])
|
||||
|
||||
addFlatten(sigmoid.name, sigmoid.name + '/Flatten', graph_def)
|
||||
|
||||
detectionOut = NodeDef()
|
||||
detectionOut.name = 'detection_out'
|
||||
detectionOut.op = 'DetectionOutput'
|
||||
|
||||
if box_predictor == 'convolutional':
|
||||
detectionOut.input.append('BoxEncodingPredictor/concat')
|
||||
else:
|
||||
detectionOut.input.append('BoxPredictor/concat')
|
||||
detectionOut.input.append(sigmoid.name + '/Flatten')
|
||||
detectionOut.input.append('PriorBox/concat')
|
||||
|
||||
detectionOut.addAttr('num_classes', num_classes + 1)
|
||||
detectionOut.addAttr('share_location', True)
|
||||
detectionOut.addAttr('background_label_id', 0)
|
||||
|
||||
postProcessing = config['post_processing'][0]
|
||||
batchNMS = postProcessing['batch_non_max_suppression'][0]
|
||||
|
||||
if 'iou_threshold' in batchNMS:
|
||||
detectionOut.addAttr('nms_threshold', float(batchNMS['iou_threshold'][0]))
|
||||
else:
|
||||
detectionOut.addAttr('nms_threshold', 0.6)
|
||||
|
||||
if 'score_threshold' in batchNMS:
|
||||
detectionOut.addAttr('confidence_threshold', float(batchNMS['score_threshold'][0]))
|
||||
else:
|
||||
detectionOut.addAttr('confidence_threshold', 0.01)
|
||||
|
||||
if 'max_detections_per_class' in batchNMS:
|
||||
detectionOut.addAttr('top_k', int(batchNMS['max_detections_per_class'][0]))
|
||||
else:
|
||||
detectionOut.addAttr('top_k', 100)
|
||||
|
||||
if 'max_total_detections' in batchNMS:
|
||||
detectionOut.addAttr('keep_top_k', int(batchNMS['max_total_detections'][0]))
|
||||
else:
|
||||
detectionOut.addAttr('keep_top_k', 100)
|
||||
|
||||
detectionOut.addAttr('code_type', "CENTER_SIZE")
|
||||
|
||||
graph_def.node.extend([detectionOut])
|
||||
|
||||
while True:
|
||||
unconnectedNodes = getUnconnectedNodes()
|
||||
unconnectedNodes.remove(detectionOut.name)
|
||||
if not unconnectedNodes:
|
||||
break
|
||||
|
||||
for name in unconnectedNodes:
|
||||
for i in range(len(graph_def.node)):
|
||||
if graph_def.node[i].name == name:
|
||||
del graph_def.node[i]
|
||||
break
|
||||
|
||||
# Save as text.
|
||||
graph_def.save(outputPath)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='Run this script to get a text graph of '
|
||||
'SSD model from TensorFlow Object Detection API. '
|
||||
'Then pass it with .pb file to cv::dnn::readNetFromTensorflow function.')
|
||||
parser.add_argument('--input', required=True, help='Path to frozen TensorFlow graph.')
|
||||
parser.add_argument('--output', required=True, help='Path to output text graph.')
|
||||
parser.add_argument('--config', required=True, help='Path to the *.config file that was used for training.')
|
||||
args = parser.parse_args()
|
||||
|
||||
createSSDGraph(args.input, args.config, args.output)
|
472
3rdparty/opencv-4.5.4/samples/dnn/virtual_try_on.py
vendored
Normal file
@ -0,0 +1,472 @@
|
||||
#!/usr/bin/env python3
|
||||
'''
|
||||
You can download the Geometric Matching Module model from https://www.dropbox.com/s/tyhc73xa051grjp/cp_vton_gmm.onnx?dl=0
|
||||
You can download the Try-On Module model from https://www.dropbox.com/s/q2x97ve2h53j66k/cp_vton_tom.onnx?dl=0
|
||||
You can download the cloth segmentation model from https://www.dropbox.com/s/qag9vzambhhkvxr/lip_jppnet_384.pb?dl=0
|
||||
You can find the OpenPose proto in opencv_extra/testdata/dnn/openpose_pose_coco.prototxt
|
||||
and get .caffemodel using opencv_extra/testdata/dnn/download_models.py
|
||||
'''
|
||||
|
||||
import argparse
|
||||
import os.path
|
||||
import numpy as np
|
||||
import cv2 as cv
|
||||
|
||||
from numpy import linalg
|
||||
from common import findFile
|
||||
from human_parsing import parse_human
|
||||
|
||||
backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV,
|
||||
cv.dnn.DNN_BACKEND_VKCOM, cv.dnn.DNN_BACKEND_CUDA)
|
||||
targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD, cv.dnn.DNN_TARGET_HDDL,
|
||||
cv.dnn.DNN_TARGET_VULKAN, cv.dnn.DNN_TARGET_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16)
|
||||
|
||||
parser = argparse.ArgumentParser(description='Use this script to run virtual try-on using CP-VTON',
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument('--input_image', '-i', required=True, help='Path to image with person.')
|
||||
parser.add_argument('--input_cloth', '-c', required=True, help='Path to target cloth image')
|
||||
parser.add_argument('--gmm_model', '-gmm', default='cp_vton_gmm.onnx', help='Path to Geometric Matching Module .onnx model.')
|
||||
parser.add_argument('--tom_model', '-tom', default='cp_vton_tom.onnx', help='Path to Try-On Module .onnx model.')
|
||||
parser.add_argument('--segmentation_model', default='lip_jppnet_384.pb', help='Path to cloth segmentation .pb model.')
|
||||
parser.add_argument('--openpose_proto', default='openpose_pose_coco.prototxt', help='Path to the OpenPose .prototxt for the model trained on the COCO dataset.')
|
||||
parser.add_argument('--openpose_model', default='openpose_pose_coco.caffemodel', help='Path to the OpenPose .caffemodel for the model trained on the COCO dataset.')
|
||||
parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
|
||||
help="Choose one of computation backends: "
|
||||
"%d: automatically (by default), "
|
||||
"%d: Halide language (http://halide-lang.org/), "
|
||||
"%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
|
||||
"%d: OpenCV implementation, "
|
||||
"%d: VKCOM, "
|
||||
"%d: CUDA" % backends)
|
||||
parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
|
||||
help='Choose one of target computation devices: '
|
||||
'%d: CPU target (by default), '
|
||||
'%d: OpenCL, '
|
||||
'%d: OpenCL fp16 (half-float precision), '
|
||||
'%d: NCS2 VPU, '
|
||||
'%d: HDDL VPU, '
|
||||
'%d: Vulkan, '
|
||||
'%d: CUDA, '
|
||||
'%d: CUDA fp16 (half-float precision)' % targets)
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
|
||||
def get_pose_map(image, proto_path, model_path, backend, target, height=256, width=192):
|
||||
radius = 5
|
||||
inp = cv.dnn.blobFromImage(image, 1.0 / 255, (width, height))
|
||||
|
||||
net = cv.dnn.readNet(proto_path, model_path)
|
||||
net.setPreferableBackend(backend)
|
||||
net.setPreferableTarget(target)
|
||||
net.setInput(inp)
|
||||
out = net.forward()
|
||||
|
||||
threshold = 0.1
|
||||
_, out_c, out_h, out_w = out.shape
|
||||
pose_map = np.zeros((height, width, out_c - 1))
|
||||
# last label: Background
|
||||
for i in range(0, out.shape[1] - 1):
|
||||
heatMap = out[0, i, :, :]
|
||||
keypoint = np.full((height, width), -1)
|
||||
_, conf, _, point = cv.minMaxLoc(heatMap)
|
||||
x = width * point[0] // out_w
|
||||
y = height * point[1] // out_h
|
||||
if conf > threshold and x > 0 and y > 0:
|
||||
keypoint[y - radius:y + radius, x - radius:x + radius] = 1
|
||||
pose_map[:, :, i] = keypoint
|
||||
|
||||
pose_map = pose_map.transpose(2, 0, 1)
|
||||
return pose_map
|
||||
|
||||
|
||||
class BilinearFilter(object):
|
||||
"""
|
||||
PIL bilinear resize implementation
|
||||
image = image.resize((image_width // 16, image_height // 16), Image.BILINEAR)
|
||||
"""
|
||||
def _precompute_coeffs(self, inSize, outSize):
|
||||
filterscale = max(1.0, inSize / outSize)
|
||||
ksize = int(np.ceil(filterscale)) * 2 + 1
|
||||
|
||||
kk = np.zeros(shape=(outSize * ksize, ), dtype=np.float32)
|
||||
bounds = np.empty(shape=(outSize * 2, ), dtype=np.int32)
|
||||
|
||||
centers = (np.arange(outSize) + 0.5) * filterscale + 0.5
|
||||
bounds[::2] = np.where(centers - filterscale < 0, 0, centers - filterscale)
|
||||
bounds[1::2] = np.where(centers + filterscale > inSize, inSize, centers + filterscale) - bounds[::2]
|
||||
xmins = bounds[::2] - centers + 1
|
||||
|
||||
points = np.array([np.arange(row) + xmins[i] for i, row in enumerate(bounds[1::2])]) / filterscale
|
||||
for xx in range(0, outSize):
|
||||
point = points[xx]
|
||||
bilinear = np.where(point < 1.0, 1.0 - abs(point), 0.0)
|
||||
ww = np.sum(bilinear)
|
||||
kk[xx * ksize : xx * ksize + bilinear.size] = np.where(ww == 0.0, bilinear, bilinear / ww)
|
||||
return bounds, kk, ksize
|
||||
|
||||
def _resample_horizontal(self, out, img, ksize, bounds, kk):
|
||||
for yy in range(0, out.shape[0]):
|
||||
for xx in range(0, out.shape[1]):
|
||||
xmin = bounds[xx * 2 + 0]
|
||||
xmax = bounds[xx * 2 + 1]
|
||||
k = kk[xx * ksize : xx * ksize + xmax]
|
||||
out[yy, xx] = np.round(np.sum(img[yy, xmin : xmin + xmax] * k))
|
||||
|
||||
def _resample_vertical(self, out, img, ksize, bounds, kk):
|
||||
for yy in range(0, out.shape[0]):
|
||||
ymin = bounds[yy * 2 + 0]
|
||||
ymax = bounds[yy * 2 + 1]
|
||||
k = kk[yy * ksize: yy * ksize + ymax]
|
||||
out[yy] = np.round(np.sum(img[ymin : ymin + ymax, 0:out.shape[1]] * k[:, np.newaxis], axis=0))
|
||||
|
||||
def imaging_resample(self, img, xsize, ysize):
|
||||
height, width = img.shape[0:2]
|
||||
bounds_horiz, kk_horiz, ksize_horiz = self._precompute_coeffs(width, xsize)
|
||||
bounds_vert, kk_vert, ksize_vert = self._precompute_coeffs(height, ysize)
|
||||
|
||||
out_hor = np.empty((img.shape[0], xsize), dtype=np.uint8)
|
||||
self._resample_horizontal(out_hor, img, ksize_horiz, bounds_horiz, kk_horiz)
|
||||
out = np.empty((ysize, xsize), dtype=np.uint8)
|
||||
self._resample_vertical(out, out_hor, ksize_vert, bounds_vert, kk_vert)
|
||||
return out
|
||||
|
||||
|
||||
class CpVton(object):
|
||||
def __init__(self, gmm_model, tom_model, backend, target):
|
||||
super(CpVton, self).__init__()
|
||||
self.gmm_net = cv.dnn.readNet(gmm_model)
|
||||
self.tom_net = cv.dnn.readNet(tom_model)
|
||||
self.gmm_net.setPreferableBackend(backend)
|
||||
self.gmm_net.setPreferableTarget(target)
|
||||
self.tom_net.setPreferableBackend(backend)
|
||||
self.tom_net.setPreferableTarget(target)
|
||||
|
||||
def prepare_agnostic(self, segm_image, input_image, pose_map, height=256, width=192):
|
||||
palette = {
|
||||
'Background' : (0, 0, 0),
|
||||
'Hat' : (128, 0, 0),
|
||||
'Hair' : (255, 0, 0),
|
||||
'Glove' : (0, 85, 0),
|
||||
'Sunglasses' : (170, 0, 51),
|
||||
'UpperClothes' : (255, 85, 0),
|
||||
'Dress' : (0, 0, 85),
|
||||
'Coat' : (0, 119, 221),
|
||||
'Socks' : (85, 85, 0),
|
||||
'Pants' : (0, 85, 85),
|
||||
'Jumpsuits' : (85, 51, 0),
|
||||
'Scarf' : (52, 86, 128),
|
||||
'Skirt' : (0, 128, 0),
|
||||
'Face' : (0, 0, 255),
|
||||
'Left-arm' : (51, 170, 221),
|
||||
'Right-arm' : (0, 255, 255),
|
||||
'Left-leg' : (85, 255, 170),
|
||||
'Right-leg' : (170, 255, 85),
|
||||
'Left-shoe' : (255, 255, 0),
|
||||
'Right-shoe' : (255, 170, 0)
|
||||
}
|
||||
color2label = {val: key for key, val in palette.items()}
|
||||
head_labels = ['Hat', 'Hair', 'Sunglasses', 'Face', 'Pants', 'Skirt']
|
||||
|
||||
segm_image = cv.cvtColor(segm_image, cv.COLOR_BGR2RGB)
|
||||
phead = np.zeros((1, height, width), dtype=np.float32)
|
||||
pose_shape = np.zeros((height, width), dtype=np.uint8)
|
||||
for r in range(height):
|
||||
for c in range(width):
|
||||
pixel = tuple(segm_image[r, c])
|
||||
if tuple(pixel) in color2label:
|
||||
if color2label[pixel] in head_labels:
|
||||
phead[0, r, c] = 1
|
||||
if color2label[pixel] != 'Background':
|
||||
pose_shape[r, c] = 255
|
||||
|
||||
input_image = cv.dnn.blobFromImage(input_image, 1.0 / 127.5, (width, height), mean=(127.5, 127.5, 127.5), swapRB=True)
|
||||
input_image = input_image.squeeze(0)
|
||||
|
||||
img_head = input_image * phead - (1 - phead)
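# input_image was normalized to [-1, 1] by blobFromImage, so this keeps head pixels unchanged
# and sets every other pixel to -1 (presumably the background value the GMM expects).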
|
||||
|
||||
downsample = BilinearFilter()
|
||||
down = downsample.imaging_resample(pose_shape, width // 16, height // 16)
|
||||
res_shape = cv.resize(down, (width, height), interpolation=cv.INTER_LINEAR)
|
||||
|
||||
res_shape = cv.dnn.blobFromImage(res_shape, 1.0 / 127.5, mean=(127.5, 127.5, 127.5), swapRB=True)
|
||||
res_shape = res_shape.squeeze(0)
|
||||
|
||||
agnostic = np.concatenate((res_shape, img_head, pose_map), axis=0)
|
||||
agnostic = np.expand_dims(agnostic, axis=0)
|
||||
return agnostic.astype(np.float32)
|
||||
|
||||
def get_warped_cloth(self, cloth_img, agnostic, height=256, width=192):
|
||||
cloth = cv.dnn.blobFromImage(cloth_img, 1.0 / 127.5, (width, height), mean=(127.5, 127.5, 127.5), swapRB=True)
|
||||
|
||||
self.gmm_net.setInput(agnostic, "input.1")
|
||||
self.gmm_net.setInput(cloth, "input.18")
|
||||
theta = self.gmm_net.forward()
|
||||
|
||||
grid = self._generate_grid(theta)
|
||||
warped_cloth = self._bilinear_sampler(cloth, grid).astype(np.float32)
|
||||
return warped_cloth
|
||||
|
||||
def get_tryon(self, agnostic, warp_cloth):
|
||||
inp = np.concatenate([agnostic, warp_cloth], axis=1)
|
||||
self.tom_net.setInput(inp)
|
||||
out = self.tom_net.forward()
|
||||
|
||||
p_rendered, m_composite = np.split(out, [3], axis=1)
|
||||
p_rendered = np.tanh(p_rendered)
|
||||
m_composite = 1 / (1 + np.exp(-m_composite))
|
||||
|
||||
p_tryon = warp_cloth * m_composite + p_rendered * (1 - m_composite)
|
||||
rgb_p_tryon = cv.cvtColor(p_tryon.squeeze(0).transpose(1, 2, 0), cv.COLOR_BGR2RGB)
|
||||
rgb_p_tryon = (rgb_p_tryon + 1) / 2
|
||||
return rgb_p_tryon
|
||||
|
||||
def _compute_L_inverse(self, X, Y):
|
||||
N = X.shape[0]
|
||||
|
||||
Xmat = np.tile(X, (1, N))
|
||||
Ymat = np.tile(Y, (1, N))
|
||||
P_dist_squared = np.power(Xmat - Xmat.transpose(1, 0), 2) + np.power(Ymat - Ymat.transpose(1, 0), 2)
|
||||
|
||||
P_dist_squared[P_dist_squared == 0] = 1
|
||||
K = np.multiply(P_dist_squared, np.log(P_dist_squared))
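# Thin-plate-spline kernel U(r) = r^2 * log(r^2) between control points; together with the
# affine block P = [1, x, y] this forms the standard TPS system matrix L inverted below.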
|
||||
|
||||
O = np.ones([N, 1], dtype=np.float32)
|
||||
Z = np.zeros([3, 3], dtype=np.float32)
|
||||
P = np.concatenate([O, X, Y], axis=1)
|
||||
first = np.concatenate((K, P), axis=1)
|
||||
second = np.concatenate((P.transpose(1, 0), Z), axis=1)
|
||||
L = np.concatenate((first, second), axis=0)
|
||||
Li = linalg.inv(L)
|
||||
return Li
|
||||
|
||||
def _prepare_to_transform(self, out_h=256, out_w=192, grid_size=5):
|
||||
grid_X, grid_Y = np.meshgrid(np.linspace(-1, 1, out_w), np.linspace(-1, 1, out_h))
|
||||
grid_X = np.expand_dims(np.expand_dims(grid_X, axis=0), axis=3)
|
||||
grid_Y = np.expand_dims(np.expand_dims(grid_Y, axis=0), axis=3)
|
||||
|
||||
axis_coords = np.linspace(-1, 1, grid_size)
|
||||
N = grid_size ** 2
|
||||
P_Y, P_X = np.meshgrid(axis_coords, axis_coords)
|
||||
|
||||
P_X = np.reshape(P_X,(-1, 1))
|
||||
P_Y = np.reshape(P_Y,(-1, 1))
|
||||
|
||||
P_X = np.expand_dims(np.expand_dims(np.expand_dims(P_X, axis=2), axis=3), axis=4).transpose(4, 1, 2, 3, 0)
|
||||
P_Y = np.expand_dims(np.expand_dims(np.expand_dims(P_Y, axis=2), axis=3), axis=4).transpose(4, 1, 2, 3, 0)
|
||||
return grid_X, grid_Y, N, P_X, P_Y
|
||||
|
||||
def _expand_torch(self, X, shape):
|
||||
if len(X.shape) != len(shape):
|
||||
return X.flatten().reshape(shape)
|
||||
else:
|
||||
axis = [1 if src == dst else dst for src, dst in zip(X.shape, shape)]
|
||||
return np.tile(X, axis)
|
||||
|
||||
def _apply_transformation(self, theta, points, N, P_X, P_Y):
|
||||
if len(theta.shape) == 2:
|
||||
theta = np.expand_dims(np.expand_dims(theta, axis=2), axis=3)
|
||||
|
||||
batch_size = theta.shape[0]
|
||||
|
||||
P_X_base = np.copy(P_X)
|
||||
P_Y_base = np.copy(P_Y)
|
||||
|
||||
Li = self._compute_L_inverse(np.reshape(P_X, (N, -1)), np.reshape(P_Y, (N, -1)))
|
||||
Li = np.expand_dims(Li, axis=0)
|
||||
|
||||
# split theta into point coordinates
|
||||
Q_X = np.squeeze(theta[:, :N, :, :], axis=3)
|
||||
Q_Y = np.squeeze(theta[:, N:, :, :], axis=3)
|
||||
|
||||
Q_X += self._expand_torch(P_X_base, Q_X.shape)
|
||||
Q_Y += self._expand_torch(P_Y_base, Q_Y.shape)
|
||||
|
||||
points_b = points.shape[0]
|
||||
points_h = points.shape[1]
|
||||
points_w = points.shape[2]
|
||||
|
||||
P_X = self._expand_torch(P_X, (1, points_h, points_w, 1, N))
|
||||
P_Y = self._expand_torch(P_Y, (1, points_h, points_w, 1, N))
|
||||
|
||||
W_X = self._expand_torch(Li[:,:N,:N], (batch_size, N, N)) @ Q_X
|
||||
W_Y = self._expand_torch(Li[:,:N,:N], (batch_size, N, N)) @ Q_Y
|
||||
|
||||
W_X = np.expand_dims(np.expand_dims(W_X, axis=3), axis=4).transpose(0, 4, 2, 3, 1)
|
||||
W_X = np.repeat(W_X, points_h, axis=1)
|
||||
W_X = np.repeat(W_X, points_w, axis=2)
|
||||
|
||||
W_Y = np.expand_dims(np.expand_dims(W_Y, axis=3), axis=4).transpose(0, 4, 2, 3, 1)
|
||||
W_Y = np.repeat(W_Y, points_h, axis=1)
|
||||
W_Y = np.repeat(W_Y, points_w, axis=2)
|
||||
|
||||
A_X = self._expand_torch(Li[:, N:, :N], (batch_size, 3, N)) @ Q_X
|
||||
A_Y = self._expand_torch(Li[:, N:, :N], (batch_size, 3, N)) @ Q_Y
|
||||
|
||||
A_X = np.expand_dims(np.expand_dims(A_X, axis=3), axis=4).transpose(0, 4, 2, 3, 1)
|
||||
A_X = np.repeat(A_X, points_h, axis=1)
|
||||
A_X = np.repeat(A_X, points_w, axis=2)
|
||||
|
||||
A_Y = np.expand_dims(np.expand_dims(A_Y, axis=3), axis=4).transpose(0, 4, 2, 3, 1)
|
||||
A_Y = np.repeat(A_Y, points_h, axis=1)
|
||||
A_Y = np.repeat(A_Y, points_w, axis=2)
|
||||
|
||||
points_X_for_summation = np.expand_dims(np.expand_dims(points[:, :, :, 0], axis=3), axis=4)
|
||||
points_X_for_summation = self._expand_torch(points_X_for_summation, points[:, :, :, 0].shape + (1, N))
|
||||
|
||||
points_Y_for_summation = np.expand_dims(np.expand_dims(points[:, :, :, 1], axis=3), axis=4)
|
||||
points_Y_for_summation = self._expand_torch(points_Y_for_summation, points[:, :, :, 0].shape + (1, N))
|
||||
|
||||
if points_b == 1:
|
||||
delta_X = points_X_for_summation - P_X
|
||||
delta_Y = points_Y_for_summation - P_Y
|
||||
else:
|
||||
delta_X = points_X_for_summation - self._expand_torch(P_X, points_X_for_summation.shape)
|
||||
delta_Y = points_Y_for_summation - self._expand_torch(P_Y, points_Y_for_summation.shape)
|
||||
|
||||
dist_squared = np.power(delta_X, 2) + np.power(delta_Y, 2)
|
||||
dist_squared[dist_squared == 0] = 1
|
||||
U = np.multiply(dist_squared, np.log(dist_squared))
|
||||
|
||||
points_X_batch = np.expand_dims(points[:,:,:,0], axis=3)
|
||||
points_Y_batch = np.expand_dims(points[:,:,:,1], axis=3)
|
||||
|
||||
if points_b == 1:
|
||||
points_X_batch = self._expand_torch(points_X_batch, (batch_size, ) + points_X_batch.shape[1:])
|
||||
points_Y_batch = self._expand_torch(points_Y_batch, (batch_size, ) + points_Y_batch.shape[1:])
|
||||
|
||||
points_X_prime = A_X[:,:,:,:,0]+ \
|
||||
np.multiply(A_X[:,:,:,:,1], points_X_batch) + \
|
||||
np.multiply(A_X[:,:,:,:,2], points_Y_batch) + \
|
||||
np.sum(np.multiply(W_X, self._expand_torch(U, W_X.shape)), 4)
|
||||
|
||||
points_Y_prime = A_Y[:,:,:,:,0]+ \
|
||||
np.multiply(A_Y[:,:,:,:,1], points_X_batch) + \
|
||||
np.multiply(A_Y[:,:,:,:,2], points_Y_batch) + \
|
||||
np.sum(np.multiply(W_Y, self._expand_torch(U, W_Y.shape)), 4)
|
||||
|
||||
return np.concatenate((points_X_prime, points_Y_prime), 3)
|
||||
|
||||
def _generate_grid(self, theta):
|
||||
grid_X, grid_Y, N, P_X, P_Y = self._prepare_to_transform()
|
||||
warped_grid = self._apply_transformation(theta, np.concatenate((grid_X, grid_Y), axis=3), N, P_X, P_Y)
|
||||
return warped_grid
|
||||
|
||||
def _bilinear_sampler(self, img, grid):
|
||||
x, y = grid[:,:,:,0], grid[:,:,:,1]
|
||||
|
||||
H = img.shape[2]
|
||||
W = img.shape[3]
|
||||
max_y = H - 1
|
||||
max_x = W - 1
|
||||
|
||||
# rescale x and y to [0, W-1/H-1]
|
||||
x = 0.5 * (x + 1.0) * (max_x - 1)
|
||||
y = 0.5 * (y + 1.0) * (max_y - 1)
|
||||
|
||||
# grab 4 nearest corner points for each (x_i, y_i)
|
||||
x0 = np.floor(x).astype(int)
|
||||
x1 = x0 + 1
|
||||
y0 = np.floor(y).astype(int)
|
||||
y1 = y0 + 1
|
||||
|
||||
# calculate deltas
|
||||
wa = (x1 - x) * (y1 - y)
|
||||
wb = (x1 - x) * (y - y0)
|
||||
wc = (x - x0) * (y1 - y)
|
||||
wd = (x - x0) * (y - y0)
|
||||
|
||||
# clip to range [0, H-1/W-1] to not violate img boundaries
|
||||
x0 = np.clip(x0, 0, max_x)
|
||||
x1 = np.clip(x1, 0, max_x)
|
||||
y0 = np.clip(y0, 0, max_y)
|
||||
y1 = np.clip(y1, 0, max_y)
|
||||
|
||||
# get pixel value at corner coords
|
||||
img = img.reshape(-1, H, W)
|
||||
Ia = img[:, y0, x0].swapaxes(0, 1)
|
||||
Ib = img[:, y1, x0].swapaxes(0, 1)
|
||||
Ic = img[:, y0, x1].swapaxes(0, 1)
|
||||
Id = img[:, y1, x1].swapaxes(0, 1)
|
||||
|
||||
wa = np.expand_dims(wa, axis=0)
|
||||
wb = np.expand_dims(wb, axis=0)
|
||||
wc = np.expand_dims(wc, axis=0)
|
||||
wd = np.expand_dims(wd, axis=0)
|
||||
|
||||
# compute output
|
||||
out = wa*Ia + wb*Ib + wc*Ic + wd*Id
|
||||
return out
|
||||
|
||||
|
||||
class CorrelationLayer(object):
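# Custom cv.dnn layer implementing the GMM feature-correlation block; it is registered via
# cv.dnn_registerLayer('Correlation', CorrelationLayer) in __main__ so the ONNX model can be imported.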
|
||||
def __init__(self, params, blobs):
|
||||
super(CorrelationLayer, self).__init__()
|
||||
|
||||
def getMemoryShapes(self, inputs):
|
||||
featureAShape = inputs[0]
|
||||
b, _, h, w = featureAShape
|
||||
return [[b, h * w, h, w]]
|
||||
|
||||
def forward(self, inputs):
|
||||
feature_A, feature_B = inputs
|
||||
b, c, h, w = feature_A.shape
|
||||
feature_A = feature_A.transpose(0, 1, 3, 2)
|
||||
feature_A = np.reshape(feature_A, (b, c, h * w))
|
||||
feature_B = np.reshape(feature_B, (b, c, h * w))
|
||||
feature_B = feature_B.transpose(0, 2, 1)
|
||||
feature_mul = feature_B @ feature_A
|
||||
feature_mul = np.reshape(feature_mul, (b, h, w, h * w))
|
||||
feature_mul = feature_mul.transpose(0, 1, 3, 2)
|
||||
correlation_tensor = feature_mul.transpose(0, 2, 1, 3)
|
||||
correlation_tensor = np.ascontiguousarray(correlation_tensor)
|
||||
return [correlation_tensor]
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if not os.path.isfile(args.gmm_model):
|
||||
raise OSError("GMM model not exist")
|
||||
if not os.path.isfile(args.tom_model):
|
||||
raise OSError("TOM model not exist")
|
||||
if not os.path.isfile(args.segmentation_model):
|
||||
raise OSError("Segmentation model not exist")
|
||||
if not os.path.isfile(findFile(args.openpose_proto)):
|
||||
raise OSError("OpenPose proto not exist")
|
||||
if not os.path.isfile(findFile(args.openpose_model)):
|
||||
raise OSError("OpenPose model not exist")
|
||||
|
||||
person_img = cv.imread(args.input_image)
|
||||
ratio = 256 / 192
|
||||
inp_h, inp_w, _ = person_img.shape
|
||||
current_ratio = inp_h / inp_w
|
||||
if current_ratio > ratio:
|
||||
center_h = inp_h // 2
|
||||
out_h = inp_w * ratio
|
||||
start = int(center_h - out_h // 2)
|
||||
end = int(center_h + out_h // 2)
|
||||
person_img = person_img[start:end, ...]
|
||||
else:
|
||||
center_w = inp_w // 2
|
||||
out_w = inp_h / ratio
|
||||
start = int(center_w - out_w // 2)
|
||||
end = int(center_w + out_w // 2)
|
||||
person_img = person_img[:, start:end, :]
|
||||
|
||||
cloth_img = cv.imread(args.input_cloth)
|
||||
pose = get_pose_map(person_img, findFile(args.openpose_proto),
|
||||
findFile(args.openpose_model), args.backend, args.target)
|
||||
segm_image = parse_human(person_img, args.segmentation_model)
|
||||
segm_image = cv.resize(segm_image, (192, 256), interpolation=cv.INTER_LINEAR)
|
||||
|
||||
cv.dnn_registerLayer('Correlation', CorrelationLayer)
|
||||
|
||||
model = CpVton(args.gmm_model, args.tom_model, args.backend, args.target)
|
||||
agnostic = model.prepare_agnostic(segm_image, person_img, pose)
|
||||
warped_cloth = model.get_warped_cloth(cloth_img, agnostic)
|
||||
output = model.get_tryon(agnostic, warped_cloth)
|
||||
|
||||
cv.dnn_unregisterLayer('Correlation')
|
||||
|
||||
winName = 'Virtual Try-On'
|
||||
cv.namedWindow(winName, cv.WINDOW_AUTOSIZE)
|
||||
cv.imshow(winName, output)
|
||||
cv.waitKey()
|