// Tencent is pleased to support the open source community by making ncnn available.
//
// author:BUG1989 (https://github.com/BUG1989/) Long-term support.
// author:JansonZhu (https://github.com/JansonZhu) Implemented the function of entropy calibration.
//
// Copyright (C) 2019 BUG1989. All rights reserved.
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifdef _MSC_VER
#define _CRT_SECURE_NO_DEPRECATE
#endif

#include <ctype.h> // for isdigit() in vstr_to_float()
#include <float.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#elif defined(USE_LOCAL_IMREADWRITE)
#include "imreadwrite.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#endif
#include <string>
#include <vector>

// ncnn public header
#include "benchmark.h"
#include "cpu.h"
#include "net.h"

// ncnn private header
#include "layer/convolution.h"
#include "layer/convolutiondepthwise.h"
#include "layer/innerproduct.h"

class QuantBlobStat
{
public:
    QuantBlobStat()
    {
        threshold = 0.f;
        absmax = 0.f;
        total = 0;
    }

public:
    float threshold;
    float absmax;

    // ACIQ
    int total;

    // KL
    std::vector<uint64_t> histogram;
    std::vector<float> histogram_normed;
};

class QuantNet : public ncnn::Net
{
public:
    QuantNet();

    std::vector<ncnn::Blob>& blobs;
    std::vector<ncnn::Layer*>& layers;

public:
    std::vector<std::vector<std::string> > listspaths;
    std::vector<std::vector<float> > means;
    std::vector<std::vector<float> > norms;
    std::vector<std::vector<int> > shapes;
    std::vector<int> type_to_pixels;
    int quantize_num_threads;

public:
    int init();
    void print_quant_info() const;
    int save_table(const char* tablepath);
    int quantize_KL();
    int quantize_ACIQ();
    int quantize_EQ();

public:
    std::vector<int> input_blobs;
    std::vector<int> conv_layers;
    std::vector<int> conv_bottom_blobs;
    std::vector<int> conv_top_blobs;

    // result
    std::vector<QuantBlobStat> quant_blob_stats;
    std::vector<ncnn::Mat> weight_scales;
    std::vector<ncnn::Mat> bottom_blob_scales;
};

QuantNet::QuantNet()
    : blobs(mutable_blobs()), layers(mutable_layers())
{
    quantize_num_threads = ncnn::get_cpu_count();
}

int QuantNet::init()
{
    // find all input layers
    for (int i = 0; i < (int)layers.size(); i++)
    {
        const ncnn::Layer* layer = layers[i];
        if (layer->type == "Input")
        {
            input_blobs.push_back(layer->tops[0]);
        }
    }

    // find all conv layers
    for (int i = 0; i < (int)layers.size(); i++)
    {
        const ncnn::Layer* layer = layers[i];
        if (layer->type == "Convolution" || layer->type == "ConvolutionDepthWise" || layer->type == "InnerProduct")
        {
            conv_layers.push_back(i);
            conv_bottom_blobs.push_back(layer->bottoms[0]);
            conv_top_blobs.push_back(layer->tops[0]);
        }
    }

    const int conv_layer_count = (int)conv_layers.size();
    const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();

    quant_blob_stats.resize(conv_bottom_blob_count);
    weight_scales.resize(conv_layer_count);
    bottom_blob_scales.resize(conv_bottom_blob_count);

    return 0;
}

int QuantNet::save_table(const char* tablepath)
{
    FILE* fp = fopen(tablepath, "wb");
    if (!fp)
    {
        fprintf(stderr, "fopen %s failed\n", tablepath);
        return -1;
    }

    const int conv_layer_count = (int)conv_layers.size();
    const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();

    for (int i = 0; i < conv_layer_count; i++)
    {
        const ncnn::Mat& weight_scale = weight_scales[i];

        fprintf(fp, "%s_param_0 ", layers[conv_layers[i]]->name.c_str());
        for (int j = 0; j < weight_scale.w; j++)
        {
            fprintf(fp, "%f ", weight_scale[j]);
        }
        fprintf(fp, "\n");
    }

    for (int i = 0; i < conv_bottom_blob_count; i++)
    {
        const ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i];

        fprintf(fp, "%s ", layers[conv_layers[i]]->name.c_str());
        for (int j = 0; j < bottom_blob_scale.w; j++)
        {
            fprintf(fp, "%f ", bottom_blob_scale[j]);
        }
        fprintf(fp, "\n");
    }

    fclose(fp);

fprintf(stderr, "ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\\(^0^)/...233...\n");
|
|
|
|
return 0;
|
|
}
|
|
|
|
void QuantNet::print_quant_info() const
{
    for (int i = 0; i < (int)conv_bottom_blobs.size(); i++)
    {
        const QuantBlobStat& stat = quant_blob_stats[i];

        float scale = 127 / stat.threshold;

        fprintf(stderr, "%-40s : max = %-15f threshold = %-15f scale = %-15f\n", layers[conv_layers[i]]->name.c_str(), stat.absmax, stat.threshold, scale);
    }
}

/**
 * Read and resize image
 * shape is given as [w,h,...]
 * if both w and h are given, the image is resized to exactly that size.
 * if both w and h are zero or negative, the image is not resized.
 * if only h is zero or negative, the image is scaled to width w, keeping the aspect ratio.
 * if only w is zero or negative, the image is scaled to height h, keeping the aspect ratio.
 * @return ncnn::Mat
 */

inline ncnn::Mat read_and_resize_image(const std::vector<int>& shape, const std::string& imagepath, int pixel_convert_type)
{
    int target_w = shape[0];
    int target_h = shape[1];
    cv::Mat bgr = cv::imread(imagepath, 1);
    if (target_h <= 0 && target_w <= 0)
    {
        return ncnn::Mat::from_pixels(bgr.data, pixel_convert_type, bgr.cols, bgr.rows);
    }
    if (target_h <= 0 || target_w <= 0)
    {
        float scale = 1.0;
        if (target_h <= 0)
        {
            scale = 1.0 * bgr.cols / target_w;
            target_h = int(1.0 * bgr.rows / scale);
        }
        if (target_w <= 0)
        {
            scale = 1.0 * bgr.rows / target_h;
            target_w = int(1.0 * bgr.cols / scale);
        }
    }
    return ncnn::Mat::from_pixels_resize(bgr.data, pixel_convert_type, bgr.cols, bgr.rows, target_w, target_h);
}

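// Discrete KL divergence sum(a[i] * log(a[i] / b[i])); both distributions are
// assumed to contain strictly positive entries (the caller pads every bin with
// kl_eps, so there is no log(0) or division by zero).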
static float compute_kl_divergence(const std::vector<float>& a, const std::vector<float>& b)
{
    const size_t length = a.size();

    float result = 0;
    for (size_t i = 0; i < length; i++)
    {
        result += a[i] * log(a[i] / b[i]);
    }

    return result;
}

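// Entropy (KL) calibration: per-output-channel weight scales come from the plain
// absmax of each filter, while each activation scale is found by building a
// 2048-bin histogram of absolute values over the calibration images and picking
// the saturation threshold whose clipped-and-requantized distribution has the
// lowest KL divergence from the original, in the spirit of NVIDIA TensorRT's
// int8 entropy calibration.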
int QuantNet::quantize_KL()
{
    const int input_blob_count = (int)input_blobs.size();
    const int conv_layer_count = (int)conv_layers.size();
    const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
    const int image_count = (int)listspaths[0].size();

    const int num_histogram_bins = 2048;

    std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
    std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);

    // initialize conv weight scales
    #pragma omp parallel for num_threads(quantize_num_threads)
    for (int i = 0; i < conv_layer_count; i++)
    {
        const ncnn::Layer* layer = layers[conv_layers[i]];

        if (layer->type == "Convolution")
        {
            const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer;

            const int num_output = convolution->num_output;
            const int kernel_w = convolution->kernel_w;
            const int kernel_h = convolution->kernel_h;
            const int dilation_w = convolution->dilation_w;
            const int dilation_h = convolution->dilation_h;
            const int stride_w = convolution->stride_w;
            const int stride_h = convolution->stride_h;

            const int weight_data_size_output = convolution->weight_data_size / num_output;

            // int8 winograd F43 needs weight data to use 6bit quantization
            // TODO proper condition for winograd 3x3 int8
            bool quant_6bit = false;
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
                quant_6bit = true;

            weight_scales[i].create(num_output);

            for (int n = 0; n < num_output; n++)
            {
                const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output);

                float absmax = 0.f;
                for (int k = 0; k < weight_data_size_output; k++)
                {
                    absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
                }

                if (quant_6bit)
                {
                    weight_scales[i][n] = 31 / absmax;
                }
                else
                {
                    weight_scales[i][n] = 127 / absmax;
                }
            }
        }

        if (layer->type == "ConvolutionDepthWise")
        {
            const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer;

            const int group = convolutiondepthwise->group;
            const int weight_data_size_output = convolutiondepthwise->weight_data_size / group;

            std::vector<float> scales;

            weight_scales[i].create(group);

            for (int n = 0; n < group; n++)
            {
                const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output);

                float absmax = 0.f;
                for (int k = 0; k < weight_data_size_output; k++)
                {
                    absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
                }

                weight_scales[i][n] = 127 / absmax;
            }
        }

        if (layer->type == "InnerProduct")
        {
            const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer;

            const int num_output = innerproduct->num_output;
            const int weight_data_size_output = innerproduct->weight_data_size / num_output;

            weight_scales[i].create(num_output);

            for (int n = 0; n < num_output; n++)
            {
                const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output);

                float absmax = 0.f;
                for (int k = 0; k < weight_data_size_output; k++)
                {
                    absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
                }

                weight_scales[i][n] = 127 / absmax;
            }
        }
    }

    // count the absmax
    #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
    for (int i = 0; i < image_count; i++)
    {
        if (i % 100 == 0)
        {
            fprintf(stderr, "count the absmax %.2f%% [ %d / %d ]\n", i * 100.f / image_count, i, image_count);
        }

        ncnn::Extractor ex = create_extractor();

        const int thread_num = ncnn::get_omp_thread_num();
        ex.set_blob_allocator(&blob_allocators[thread_num]);
        ex.set_workspace_allocator(&workspace_allocators[thread_num]);

        for (int j = 0; j < input_blob_count; j++)
        {
            const int type_to_pixel = type_to_pixels[j];
            const std::vector<float>& mean_vals = means[j];
            const std::vector<float>& norm_vals = norms[j];

            int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
            if (type_to_pixel != pixel_convert_type)
            {
                pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
            }

            ncnn::Mat in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);

            in.substract_mean_normalize(mean_vals.data(), norm_vals.data());

            ex.input(input_blobs[j], in);
        }

        for (int j = 0; j < conv_bottom_blob_count; j++)
        {
            ncnn::Mat out;
            ex.extract(conv_bottom_blobs[j], out);

            // count absmax
            {
                float absmax = 0.f;

                const int outc = out.c;
                const int outsize = out.w * out.h;
                for (int p = 0; p < outc; p++)
                {
                    const float* ptr = out.channel(p);
                    for (int k = 0; k < outsize; k++)
                    {
                        absmax = std::max(absmax, (float)fabs(ptr[k]));
                    }
                }

                #pragma omp critical
                {
                    QuantBlobStat& stat = quant_blob_stats[j];
                    stat.absmax = std::max(stat.absmax, absmax);
                }
            }
        }
    }

    // initialize histogram
    #pragma omp parallel for num_threads(quantize_num_threads)
    for (int i = 0; i < conv_bottom_blob_count; i++)
    {
        QuantBlobStat& stat = quant_blob_stats[i];

        stat.histogram.resize(num_histogram_bins, 0);
        stat.histogram_normed.resize(num_histogram_bins, 0);
    }

    // build histogram
    #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
    for (int i = 0; i < image_count; i++)
    {
        if (i % 100 == 0)
        {
            fprintf(stderr, "build histogram %.2f%% [ %d / %d ]\n", i * 100.f / image_count, i, image_count);
        }

        ncnn::Extractor ex = create_extractor();

        const int thread_num = ncnn::get_omp_thread_num();
        ex.set_blob_allocator(&blob_allocators[thread_num]);
        ex.set_workspace_allocator(&workspace_allocators[thread_num]);

        for (int j = 0; j < input_blob_count; j++)
        {
            const int type_to_pixel = type_to_pixels[j];
            const std::vector<float>& mean_vals = means[j];
            const std::vector<float>& norm_vals = norms[j];

            int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
            if (type_to_pixel != pixel_convert_type)
            {
                pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
            }

            ncnn::Mat in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);

            in.substract_mean_normalize(mean_vals.data(), norm_vals.data());

            ex.input(input_blobs[j], in);
        }

        for (int j = 0; j < conv_bottom_blob_count; j++)
        {
            ncnn::Mat out;
            ex.extract(conv_bottom_blobs[j], out);

            // count histogram bin
            {
                const float absmax = quant_blob_stats[j].absmax;

                std::vector<uint64_t> histogram(num_histogram_bins, 0);

                const int outc = out.c;
                const int outsize = out.w * out.h;
                for (int p = 0; p < outc; p++)
                {
                    const float* ptr = out.channel(p);
                    for (int k = 0; k < outsize; k++)
                    {
                        if (ptr[k] == 0.f)
                            continue;

                        const int index = std::min((int)(fabs(ptr[k]) / absmax * num_histogram_bins), (num_histogram_bins - 1));

                        histogram[index] += 1;
                    }
                }

                #pragma omp critical
                {
                    QuantBlobStat& stat = quant_blob_stats[j];

                    for (int k = 0; k < num_histogram_bins; k++)
                    {
                        stat.histogram[k] += histogram[k];
                    }
                }
            }
        }
    }

    // using kld to find the best threshold value
    #pragma omp parallel for num_threads(quantize_num_threads)
    for (int i = 0; i < conv_bottom_blob_count; i++)
    {
        QuantBlobStat& stat = quant_blob_stats[i];

        // normalize histogram bin
        {
            uint64_t sum = 0;
            for (int j = 0; j < num_histogram_bins; j++)
            {
                sum += stat.histogram[j];
            }

            for (int j = 0; j < num_histogram_bins; j++)
            {
                stat.histogram_normed[j] = (float)(stat.histogram[j] / (double)sum);
            }
        }

        const int target_bin = 128;

        int target_threshold = target_bin;
        float min_kl_divergence = FLT_MAX;

        for (int threshold = target_bin; threshold < num_histogram_bins; threshold++)
        {
            const float kl_eps = 0.0001f;

            std::vector<float> clip_distribution(threshold, kl_eps);
            {
                for (int j = 0; j < threshold; j++)
                {
                    clip_distribution[j] += stat.histogram_normed[j];
                }
                for (int j = threshold; j < num_histogram_bins; j++)
                {
                    clip_distribution[threshold - 1] += stat.histogram_normed[j];
                }
            }

            const float num_per_bin = (float)threshold / target_bin;

            std::vector<float> quantize_distribution(target_bin, 0.f);
            {
                {
                    const float end = num_per_bin;

                    const int right_lower = (int)floor(end);
                    const float right_scale = end - right_lower;

                    if (right_scale > 0)
                    {
                        quantize_distribution[0] += right_scale * stat.histogram_normed[right_lower];
                    }

                    for (int k = 0; k < right_lower; k++)
                    {
                        quantize_distribution[0] += stat.histogram_normed[k];
                    }

                    quantize_distribution[0] /= right_lower + right_scale;
                }
                for (int j = 1; j < target_bin - 1; j++)
                {
                    const float start = j * num_per_bin;
                    const float end = (j + 1) * num_per_bin;

                    const int left_upper = (int)ceil(start);
                    const float left_scale = left_upper - start;

                    const int right_lower = (int)floor(end);
                    const float right_scale = end - right_lower;

                    if (left_scale > 0)
                    {
                        quantize_distribution[j] += left_scale * stat.histogram_normed[left_upper - 1];
                    }

                    if (right_scale > 0)
                    {
                        quantize_distribution[j] += right_scale * stat.histogram_normed[right_lower];
                    }

                    for (int k = left_upper; k < right_lower; k++)
                    {
                        quantize_distribution[j] += stat.histogram_normed[k];
                    }

                    quantize_distribution[j] /= right_lower - left_upper + left_scale + right_scale;
                }
                {
                    const float start = threshold - num_per_bin;

                    const int left_upper = (int)ceil(start);
                    const float left_scale = left_upper - start;

                    if (left_scale > 0)
                    {
                        quantize_distribution[target_bin - 1] += left_scale * stat.histogram_normed[left_upper - 1];
                    }

                    for (int k = left_upper; k < threshold; k++)
                    {
                        quantize_distribution[target_bin - 1] += stat.histogram_normed[k];
                    }

                    quantize_distribution[target_bin - 1] /= threshold - left_upper + left_scale;
                }
            }

            std::vector<float> expand_distribution(threshold, kl_eps);
            {
                {
                    const float end = num_per_bin;

                    const int right_lower = (int)floor(end);
                    const float right_scale = end - right_lower;

                    if (right_scale > 0)
                    {
                        expand_distribution[right_lower] += right_scale * quantize_distribution[0];
                    }

                    for (int k = 0; k < right_lower; k++)
                    {
                        expand_distribution[k] += quantize_distribution[0];
                    }
                }
                for (int j = 1; j < target_bin - 1; j++)
                {
                    const float start = j * num_per_bin;
                    const float end = (j + 1) * num_per_bin;

                    const int left_upper = (int)ceil(start);
                    const float left_scale = left_upper - start;

                    const int right_lower = (int)floor(end);
                    const float right_scale = end - right_lower;

                    if (left_scale > 0)
                    {
                        expand_distribution[left_upper - 1] += left_scale * quantize_distribution[j];
                    }

                    if (right_scale > 0)
                    {
                        expand_distribution[right_lower] += right_scale * quantize_distribution[j];
                    }

                    for (int k = left_upper; k < right_lower; k++)
                    {
                        expand_distribution[k] += quantize_distribution[j];
                    }
                }
                {
                    const float start = threshold - num_per_bin;

                    const int left_upper = (int)ceil(start);
                    const float left_scale = left_upper - start;

                    if (left_scale > 0)
                    {
                        expand_distribution[left_upper - 1] += left_scale * quantize_distribution[target_bin - 1];
                    }

                    for (int k = left_upper; k < threshold; k++)
                    {
                        expand_distribution[k] += quantize_distribution[target_bin - 1];
                    }
                }
            }

            // kl
            const float kl_divergence = compute_kl_divergence(clip_distribution, expand_distribution);

            // keep the threshold with the smallest kl divergence
            if (kl_divergence < min_kl_divergence)
            {
                min_kl_divergence = kl_divergence;
                target_threshold = threshold;
            }
        }

        stat.threshold = (target_threshold + 0.5f) * stat.absmax / num_histogram_bins;
        float scale = 127 / stat.threshold;

        bottom_blob_scales[i].create(1);
        bottom_blob_scales[i][0] = scale;
    }

    return 0;
}

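// ACIQ analytical clipping: assuming the values are roughly Gaussian, estimate
// the standard deviation from the observed absmax of N samples (via the expected
// range of a Gaussian), then clip at alpha * std, where alpha_gaussian holds the
// precomputed optimal multiplier per bit width, following Banner et al.,
// "Post training 4-bit quantization of convolutional networks" (ACIQ).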
static float compute_aciq_gaussian_clip(float absmax, int N, int num_bits = 8)
{
    const float alpha_gaussian[8] = {0, 1.71063519, 2.15159277, 2.55913646, 2.93620062, 3.28691474, 3.6151146, 3.92403714};

    const double gaussian_const = (0.5 * 0.35) * (1 + sqrt(3.14159265358979323846 * log(4)));

    double std = (absmax * 2 * gaussian_const) / sqrt(2 * log(N));

    return (float)(alpha_gaussian[num_bits - 1] * std);
}

int QuantNet::quantize_ACIQ()
{
    const int input_blob_count = (int)input_blobs.size();
    const int conv_layer_count = (int)conv_layers.size();
    const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
    const int image_count = (int)listspaths[0].size();

    std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
    std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);

    // initialize conv weight scales
    #pragma omp parallel for num_threads(quantize_num_threads)
    for (int i = 0; i < conv_layer_count; i++)
    {
        const ncnn::Layer* layer = layers[conv_layers[i]];

        if (layer->type == "Convolution")
        {
            const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer;

            const int num_output = convolution->num_output;
            const int kernel_w = convolution->kernel_w;
            const int kernel_h = convolution->kernel_h;
            const int dilation_w = convolution->dilation_w;
            const int dilation_h = convolution->dilation_h;
            const int stride_w = convolution->stride_w;
            const int stride_h = convolution->stride_h;

            const int weight_data_size_output = convolution->weight_data_size / num_output;

            // int8 winograd F43 needs weight data to use 6bit quantization
            // TODO proper condition for winograd 3x3 int8
            bool quant_6bit = false;
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
                quant_6bit = true;

            weight_scales[i].create(num_output);

            for (int n = 0; n < num_output; n++)
            {
                const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output);

                float absmax = 0.f;
                for (int k = 0; k < weight_data_size_output; k++)
                {
                    absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
                }

                if (quant_6bit)
                {
                    const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output, 6);
                    weight_scales[i][n] = 31 / threshold;
                }
                else
                {
                    const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
                    weight_scales[i][n] = 127 / threshold;
                }
            }
        }

        if (layer->type == "ConvolutionDepthWise")
        {
            const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer;

            const int group = convolutiondepthwise->group;
            const int weight_data_size_output = convolutiondepthwise->weight_data_size / group;

            std::vector<float> scales;

            weight_scales[i].create(group);

            for (int n = 0; n < group; n++)
            {
                const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output);

                float absmax = 0.f;
                for (int k = 0; k < weight_data_size_output; k++)
                {
                    absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
                }

                const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
                weight_scales[i][n] = 127 / threshold;
            }
        }

        if (layer->type == "InnerProduct")
        {
            const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer;

            const int num_output = innerproduct->num_output;
            const int weight_data_size_output = innerproduct->weight_data_size / num_output;

            weight_scales[i].create(num_output);

            for (int n = 0; n < num_output; n++)
            {
                const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output);

                float absmax = 0.f;
                for (int k = 0; k < weight_data_size_output; k++)
                {
                    absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
                }

                const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
                weight_scales[i][n] = 127 / threshold;
            }
        }
    }

    // count the absmax
    #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
    for (int i = 0; i < image_count; i++)
    {
        if (i % 100 == 0)
        {
            fprintf(stderr, "count the absmax %.2f%% [ %d / %d ]\n", i * 100.f / image_count, i, image_count);
        }

        ncnn::Extractor ex = create_extractor();

        const int thread_num = ncnn::get_omp_thread_num();
        ex.set_blob_allocator(&blob_allocators[thread_num]);
        ex.set_workspace_allocator(&workspace_allocators[thread_num]);

        for (int j = 0; j < input_blob_count; j++)
        {
            const int type_to_pixel = type_to_pixels[j];
            const std::vector<float>& mean_vals = means[j];
            const std::vector<float>& norm_vals = norms[j];

            int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
            if (type_to_pixel != pixel_convert_type)
            {
                pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
            }

            ncnn::Mat in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);

            in.substract_mean_normalize(mean_vals.data(), norm_vals.data());

            ex.input(input_blobs[j], in);
        }

        for (int j = 0; j < conv_bottom_blob_count; j++)
        {
            ncnn::Mat out;
            ex.extract(conv_bottom_blobs[j], out);

            // count absmax
            {
                float absmax = 0.f;

                const int outc = out.c;
                const int outsize = out.w * out.h;
                for (int p = 0; p < outc; p++)
                {
                    const float* ptr = out.channel(p);
                    for (int k = 0; k < outsize; k++)
                    {
                        absmax = std::max(absmax, (float)fabs(ptr[k]));
                    }
                }

                #pragma omp critical
                {
                    QuantBlobStat& stat = quant_blob_stats[j];
                    stat.absmax = std::max(stat.absmax, absmax);
                    stat.total = outc * outsize;
                }
            }
        }
    }

    // alpha gaussian
    #pragma omp parallel for num_threads(quantize_num_threads)
    for (int i = 0; i < conv_bottom_blob_count; i++)
    {
        QuantBlobStat& stat = quant_blob_stats[i];

        stat.threshold = compute_aciq_gaussian_clip(stat.absmax, stat.total);
        float scale = 127 / stat.threshold;

        bottom_blob_scales[i].create(1);
        bottom_blob_scales[i][0] = scale;
    }

    return 0;
}

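// Cosine similarity between two feature maps of identical shape, used below as
// the fp32-vs-int8 agreement metric during the scale search.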
static float cosine_similarity(const ncnn::Mat& a, const ncnn::Mat& b)
{
    const int channels = a.c;
    const int size = a.w * a.h;

    float sa = 0;
    float sb = 0;
    float sum = 0;

    for (int p = 0; p < channels; p++)
    {
        const float* pa = a.channel(p);
        const float* pb = b.channel(p);

        for (int i = 0; i < size; i++)
        {
            sa += pa[i] * pa[i];
            sb += pb[i] * pb[i];
            sum += pa[i] * pb[i];
        }
    }

    float sim = (float)sum / sqrt(sa) / sqrt(sb);

    return sim;
}

static int get_layer_param(const ncnn::Layer* layer, ncnn::ParamDict& pd)
{
    if (layer->type == "Convolution")
    {
        ncnn::Convolution* convolution = (ncnn::Convolution*)layer;

        pd.set(0, convolution->num_output);
        pd.set(1, convolution->kernel_w);
        pd.set(11, convolution->kernel_h);
        pd.set(2, convolution->dilation_w);
        pd.set(12, convolution->dilation_h);
        pd.set(3, convolution->stride_w);
        pd.set(13, convolution->stride_h);
        pd.set(4, convolution->pad_left);
        pd.set(15, convolution->pad_right);
        pd.set(14, convolution->pad_top);
        pd.set(16, convolution->pad_bottom);
        pd.set(18, convolution->pad_value);
        pd.set(5, convolution->bias_term);
        pd.set(6, convolution->weight_data_size);
        pd.set(8, convolution->int8_scale_term);
        pd.set(9, convolution->activation_type);
        pd.set(10, convolution->activation_params);
    }
    else if (layer->type == "ConvolutionDepthWise")
    {
        ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layer;

        pd.set(0, convolutiondepthwise->num_output);
        pd.set(1, convolutiondepthwise->kernel_w);
        pd.set(11, convolutiondepthwise->kernel_h);
        pd.set(2, convolutiondepthwise->dilation_w);
        pd.set(12, convolutiondepthwise->dilation_h);
        pd.set(3, convolutiondepthwise->stride_w);
        pd.set(13, convolutiondepthwise->stride_h);
        pd.set(4, convolutiondepthwise->pad_left);
        pd.set(15, convolutiondepthwise->pad_right);
        pd.set(14, convolutiondepthwise->pad_top);
        pd.set(16, convolutiondepthwise->pad_bottom);
        pd.set(18, convolutiondepthwise->pad_value);
        pd.set(5, convolutiondepthwise->bias_term);
        pd.set(6, convolutiondepthwise->weight_data_size);
        pd.set(7, convolutiondepthwise->group);
        pd.set(8, convolutiondepthwise->int8_scale_term);
        pd.set(9, convolutiondepthwise->activation_type);
        pd.set(10, convolutiondepthwise->activation_params);
    }
    else if (layer->type == "InnerProduct")
    {
        ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layer;

        pd.set(0, innerproduct->num_output);
        pd.set(1, innerproduct->bias_term);
        pd.set(2, innerproduct->weight_data_size);
        pd.set(8, innerproduct->int8_scale_term);
        pd.set(9, innerproduct->activation_type);
        pd.set(10, innerproduct->activation_params);
    }
    else
    {
        fprintf(stderr, "unexpected layer type %s in get_layer_param\n", layer->type.c_str());
        return -1;
    }

    return 0;
}

static int get_layer_weights(const ncnn::Layer* layer, std::vector<ncnn::Mat>& weights)
{
    if (layer->type == "Convolution")
    {
        ncnn::Convolution* convolution = (ncnn::Convolution*)layer;
        weights.push_back(convolution->weight_data);
        if (convolution->bias_term)
            weights.push_back(convolution->bias_data);
    }
    else if (layer->type == "ConvolutionDepthWise")
    {
        ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layer;
        weights.push_back(convolutiondepthwise->weight_data);
        if (convolutiondepthwise->bias_term)
            weights.push_back(convolutiondepthwise->bias_data);
    }
    else if (layer->type == "InnerProduct")
    {
        ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layer;
        weights.push_back(innerproduct->weight_data);
        if (innerproduct->bias_term)
            weights.push_back(innerproduct->bias_data);
    }
    else
    {
        fprintf(stderr, "unexpected layer type %s in get_layer_weights\n", layer->type.c_str());
        return -1;
    }

    return 0;
}

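// EasyQuant-style refinement: starting from the KL result, greedily search each
// weight scale and each bottom blob scale over [0.5x, 2x] of its initial value
// in 100 steps, keeping the candidate that maximizes the average cosine
// similarity between the fp32 layer output and the int8 layer output over up to
// 50 calibration images.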
int QuantNet::quantize_EQ()
{
    // find the initial scale via KL
    quantize_KL();

    print_quant_info();

    const int input_blob_count = (int)input_blobs.size();
    const int conv_layer_count = (int)conv_layers.size();
    const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();

    std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
    std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);

    // max 50 images for EQ
    const int image_count = std::min((int)listspaths[0].size(), 50);

    const float scale_range_lower = 0.5f;
    const float scale_range_upper = 2.0f;
    const int search_steps = 100;

    for (int i = 0; i < conv_layer_count; i++)
    {
        ncnn::Mat& weight_scale = weight_scales[i];
        ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i];

        const ncnn::Layer* layer = layers[conv_layers[i]];

        // search weight scale
        for (int j = 0; j < weight_scale.w; j++)
        {
            const float scale = weight_scale[j];
            const float scale_lower = scale * scale_range_lower;
            const float scale_upper = scale * scale_range_upper;
            const float scale_step = (scale_upper - scale_lower) / search_steps;

            std::vector<double> avgsims(search_steps, 0.0);

            #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
            for (int ii = 0; ii < image_count; ii++)
            {
                if (ii % 100 == 0)
                {
                    fprintf(stderr, "search weight scale %.2f%% [ %d / %d ] for %d / %d of %d / %d\n", ii * 100.f / image_count, ii, image_count, j, weight_scale.w, i, conv_layer_count);
                }

                ncnn::Extractor ex = create_extractor();

                const int thread_num = ncnn::get_omp_thread_num();
                ex.set_blob_allocator(&blob_allocators[thread_num]);
                ex.set_workspace_allocator(&workspace_allocators[thread_num]);

                for (int jj = 0; jj < input_blob_count; jj++)
                {
                    const int type_to_pixel = type_to_pixels[jj];
                    const std::vector<float>& mean_vals = means[jj];
                    const std::vector<float>& norm_vals = norms[jj];

                    int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
                    if (type_to_pixel != pixel_convert_type)
                    {
                        pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
                    }

                    ncnn::Mat in = read_and_resize_image(shapes[jj], listspaths[jj][ii], pixel_convert_type);

                    in.substract_mean_normalize(mean_vals.data(), norm_vals.data());

                    ex.input(input_blobs[jj], in);
                }

                ncnn::Mat in;
                ex.extract(conv_bottom_blobs[i], in);

                ncnn::Mat out;
                ex.extract(conv_top_blobs[i], out);

                ncnn::Layer* layer_int8 = ncnn::create_layer(layer->typeindex);

                ncnn::ParamDict pd;
                get_layer_param(layer, pd);
                pd.set(8, 1); //int8_scale_term
                layer_int8->load_param(pd);

                std::vector<float> sims(search_steps);
                for (int k = 0; k < search_steps; k++)
                {
                    ncnn::Mat new_weight_scale = weight_scale.clone();
                    new_weight_scale[j] = scale_lower + k * scale_step;

                    std::vector<ncnn::Mat> weights;
                    get_layer_weights(layer, weights);
                    weights.push_back(new_weight_scale);
                    weights.push_back(bottom_blob_scale);
                    layer_int8->load_model(ncnn::ModelBinFromMatArray(weights.data()));

                    ncnn::Option opt_int8;
                    opt_int8.use_packing_layout = false;

                    layer_int8->create_pipeline(opt_int8);

                    ncnn::Mat out_int8;
                    layer_int8->forward(in, out_int8, opt_int8);

                    layer_int8->destroy_pipeline(opt_int8);

                    sims[k] = cosine_similarity(out, out_int8);
                }

                delete layer_int8;

                #pragma omp critical
                {
                    for (int k = 0; k < search_steps; k++)
                    {
                        avgsims[k] += sims[k];
                    }
                }
            }

            double max_avgsim = 0.0;
            float new_scale = scale;

            // find the scale with the max average cosine similarity (min cosine distance)
            for (int k = 0; k < search_steps; k++)
            {
                if (max_avgsim < avgsims[k])
                {
                    max_avgsim = avgsims[k];
                    new_scale = scale_lower + k * scale_step;
                }
            }

            fprintf(stderr, "%s w %d = %f -> %f\n", layer->name.c_str(), j, scale, new_scale);
            weight_scale[j] = new_scale;
        }

        // search bottom blob scale
        for (int j = 0; j < bottom_blob_scale.w; j++)
        {
            const float scale = bottom_blob_scale[j];
            const float scale_lower = scale * scale_range_lower;
            const float scale_upper = scale * scale_range_upper;
            const float scale_step = (scale_upper - scale_lower) / search_steps;

            std::vector<double> avgsims(search_steps, 0.0);

            #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
            for (int ii = 0; ii < image_count; ii++)
            {
                if (ii % 100 == 0)
                {
                    fprintf(stderr, "search bottom blob scale %.2f%% [ %d / %d ] for %d / %d of %d / %d\n", ii * 100.f / image_count, ii, image_count, j, bottom_blob_scale.w, i, conv_layer_count);
                }

                ncnn::Extractor ex = create_extractor();

                const int thread_num = ncnn::get_omp_thread_num();
                ex.set_blob_allocator(&blob_allocators[thread_num]);
                ex.set_workspace_allocator(&workspace_allocators[thread_num]);

                for (int jj = 0; jj < input_blob_count; jj++)
                {
                    const int type_to_pixel = type_to_pixels[jj];
                    const std::vector<float>& mean_vals = means[jj];
                    const std::vector<float>& norm_vals = norms[jj];

                    int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
                    if (type_to_pixel != pixel_convert_type)
                    {
                        pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
                    }

                    ncnn::Mat in = read_and_resize_image(shapes[jj], listspaths[jj][ii], pixel_convert_type);

                    in.substract_mean_normalize(mean_vals.data(), norm_vals.data());

                    ex.input(input_blobs[jj], in);
                }

                ncnn::Mat in;
                ex.extract(conv_bottom_blobs[i], in);

                ncnn::Mat out;
                ex.extract(conv_top_blobs[i], out);

                ncnn::Layer* layer_int8 = ncnn::create_layer(layer->typeindex);

                ncnn::ParamDict pd;
                get_layer_param(layer, pd);
                pd.set(8, 1); //int8_scale_term
                layer_int8->load_param(pd);

                std::vector<float> sims(search_steps);
                for (int k = 0; k < search_steps; k++)
                {
                    ncnn::Mat new_bottom_blob_scale = bottom_blob_scale.clone();
                    new_bottom_blob_scale[j] = scale_lower + k * scale_step;

                    std::vector<ncnn::Mat> weights;
                    get_layer_weights(layer, weights);
                    weights.push_back(weight_scale);
                    weights.push_back(new_bottom_blob_scale);
                    layer_int8->load_model(ncnn::ModelBinFromMatArray(weights.data()));

                    ncnn::Option opt_int8;
                    opt_int8.use_packing_layout = false;

                    layer_int8->create_pipeline(opt_int8);

                    ncnn::Mat out_int8;
                    layer_int8->forward(in, out_int8, opt_int8);

                    layer_int8->destroy_pipeline(opt_int8);

                    sims[k] = cosine_similarity(out, out_int8);
                }

                delete layer_int8;

                #pragma omp critical
                {
                    for (int k = 0; k < search_steps; k++)
                    {
                        avgsims[k] += sims[k];
                    }
                }
            }

            double max_avgsim = 0.0;
            float new_scale = scale;

            // find the scale with the max average cosine similarity (min cosine distance)
            for (int k = 0; k < search_steps; k++)
            {
                if (max_avgsim < avgsims[k])
                {
                    max_avgsim = avgsims[k];
                    new_scale = scale_lower + k * scale_step;
                }
            }

            fprintf(stderr, "%s b %d = %f -> %f\n", layer->name.c_str(), j, scale, new_scale);
            bottom_blob_scale[j] = new_scale;
        }

        // update quant info
        QuantBlobStat& stat = quant_blob_stats[i];
        stat.threshold = 127 / bottom_blob_scale[0];
    }

    return 0;
}

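// Parse a comma-separated list of list files ("list1.txt,list2.txt"), one list
// file per input blob; each list file contains one image path per line.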
static std::vector<std::vector<std::string> > parse_comma_path_list(char* s)
{
    std::vector<std::vector<std::string> > aps;

    char* pch = strtok(s, ",");
    while (pch != NULL)
    {
        FILE* fp = fopen(pch, "rb");
        if (!fp)
        {
            fprintf(stderr, "fopen %s failed\n", pch);
            break;
        }

        std::vector<std::string> paths;

        // one filepath per line
        char line[1024];
        while (!feof(fp))
        {
            char* ss = fgets(line, 1024, fp);
            if (!ss)
                break;

            char filepath[256];
            int nscan = sscanf(line, "%255s", filepath);
            if (nscan != 1)
                continue;

            paths.push_back(std::string(filepath));
        }

        fclose(fp);

        aps.push_back(paths);

        pch = strtok(NULL, ",");
    }

    return aps;
}

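// Hand-rolled string-to-float conversion covering the simple forms used in the
// command line arguments (sign, decimal point, exponent). Unlike atof it never
// consults the C locale, so "1.5" parses the same way everywhere.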
static float vstr_to_float(const char vstr[20])
{
    double v = 0.0;

    const char* p = vstr;

    // sign
    bool sign = *p != '-';
    if (*p == '+' || *p == '-')
    {
        p++;
    }

    // digits before decimal point or exponent
    uint64_t v1 = 0;
    while (isdigit(*p))
    {
        v1 = v1 * 10 + (*p - '0');
        p++;
    }

    v = (double)v1;

    // digits after decimal point
    if (*p == '.')
    {
        p++;

        uint64_t pow10 = 1;
        uint64_t v2 = 0;

        while (isdigit(*p))
        {
            v2 = v2 * 10 + (*p - '0');
            pow10 *= 10;
            p++;
        }

        v += v2 / (double)pow10;
    }

    // exponent
    if (*p == 'e' || *p == 'E')
    {
        p++;

        // sign of exponent
        bool fact = *p != '-';
        if (*p == '+' || *p == '-')
        {
            p++;
        }

        // digits of exponent
        uint64_t expon = 0;
        while (isdigit(*p))
        {
            expon = expon * 10 + (*p - '0');
            p++;
        }

        double scale = 1.0;
        while (expon >= 8)
        {
            scale *= 1e8;
            expon -= 8;
        }
        while (expon > 0)
        {
            scale *= 10.0;
            expon -= 1;
        }

        v = fact ? v * scale : v / scale;
    }

    // fprintf(stderr, "v = %f\n", v);
    return sign ? (float)v : (float)-v;
}

static std::vector<std::vector<float> > parse_comma_float_array_list(char* s)
{
    std::vector<std::vector<float> > aaf;

    char* pch = strtok(s, "[]");
    while (pch != NULL)
    {
        // parse a,b,c
        char vstr[20];
        int nconsumed = 0;
        int nscan = sscanf(pch, "%19[^,]%n", vstr, &nconsumed);
        if (nscan == 1)
        {
            // ok we get array
            pch += nconsumed;

            std::vector<float> af;
            float v = vstr_to_float(vstr);
            af.push_back(v);

            nscan = sscanf(pch, ",%19[^,]%n", vstr, &nconsumed);
            while (nscan == 1)
            {
                pch += nconsumed;

                float v = vstr_to_float(vstr);
                af.push_back(v);

                nscan = sscanf(pch, ",%19[^,]%n", vstr, &nconsumed);
            }

            // array end
            aaf.push_back(af);
        }

        pch = strtok(NULL, "[]");
    }

    return aaf;
}

static std::vector<std::vector<int> > parse_comma_int_array_list(char* s)
{
    std::vector<std::vector<int> > aai;

    char* pch = strtok(s, "[]");
    while (pch != NULL)
    {
        // parse a,b,c
        int v;
        int nconsumed = 0;
        int nscan = sscanf(pch, "%d%n", &v, &nconsumed);
        if (nscan == 1)
        {
            // ok we get array
            pch += nconsumed;

            std::vector<int> ai;
            ai.push_back(v);

            nscan = sscanf(pch, ",%d%n", &v, &nconsumed);
            while (nscan == 1)
            {
                pch += nconsumed;

                ai.push_back(v);

                nscan = sscanf(pch, ",%d%n", &v, &nconsumed);
            }

            // array end
            aai.push_back(ai);
        }

        pch = strtok(NULL, "[]");
    }

    return aai;
}

static std::vector<int> parse_comma_pixel_type_list(char* s)
{
    std::vector<int> aps;

    char* pch = strtok(s, ",");
    while (pch != NULL)
    {
        // RAW/RGB/BGR/GRAY/RGBA/BGRA
        if (strcmp(pch, "RAW") == 0)
            aps.push_back(-233);
        if (strcmp(pch, "RGB") == 0)
            aps.push_back(ncnn::Mat::PIXEL_RGB);
        if (strcmp(pch, "BGR") == 0)
            aps.push_back(ncnn::Mat::PIXEL_BGR);
        if (strcmp(pch, "GRAY") == 0)
            aps.push_back(ncnn::Mat::PIXEL_GRAY);
        if (strcmp(pch, "RGBA") == 0)
            aps.push_back(ncnn::Mat::PIXEL_RGBA);
        if (strcmp(pch, "BGRA") == 0)
            aps.push_back(ncnn::Mat::PIXEL_BGRA);

        pch = strtok(NULL, ",");
    }

    return aps;
}

static void print_float_array_list(const std::vector<std::vector<float> >& list)
{
    for (size_t i = 0; i < list.size(); i++)
    {
        const std::vector<float>& array = list[i];
        fprintf(stderr, "[");
        for (size_t j = 0; j < array.size(); j++)
        {
            fprintf(stderr, "%f", array[j]);
            if (j != array.size() - 1)
                fprintf(stderr, ",");
        }
        fprintf(stderr, "]");
        if (i != list.size() - 1)
            fprintf(stderr, ",");
    }
}

static void print_int_array_list(const std::vector<std::vector<int> >& list)
{
    for (size_t i = 0; i < list.size(); i++)
    {
        const std::vector<int>& array = list[i];
        fprintf(stderr, "[");
        for (size_t j = 0; j < array.size(); j++)
        {
            fprintf(stderr, "%d", array[j]);
            if (j != array.size() - 1)
                fprintf(stderr, ",");
        }
        fprintf(stderr, "]");
        if (i != list.size() - 1)
            fprintf(stderr, ",");
    }
}

static void print_pixel_type_list(const std::vector<int>& list)
{
    for (size_t i = 0; i < list.size(); i++)
    {
        const int type = list[i];
        if (type == -233)
            fprintf(stderr, "RAW");
        if (type == ncnn::Mat::PIXEL_RGB)
            fprintf(stderr, "RGB");
        if (type == ncnn::Mat::PIXEL_BGR)
            fprintf(stderr, "BGR");
        if (type == ncnn::Mat::PIXEL_GRAY)
            fprintf(stderr, "GRAY");
        if (type == ncnn::Mat::PIXEL_RGBA)
            fprintf(stderr, "RGBA");
        if (type == ncnn::Mat::PIXEL_BGRA)
            fprintf(stderr, "BGRA");
        if (i != list.size() - 1)
            fprintf(stderr, ",");
    }
}

static void show_usage()
{
    fprintf(stderr, "Usage: ncnn2table [ncnnparam] [ncnnbin] [list,...] [ncnntable] [(key=value)...]\n");
    fprintf(stderr, " mean=[104.0,117.0,123.0],...\n");
    fprintf(stderr, " norm=[1.0,1.0,1.0],...\n");
    fprintf(stderr, " shape=[224,224,3],...[w,h,c] or [w,h] **[0,0] will not resize\n");
    fprintf(stderr, " pixel=RAW/RGB/BGR/GRAY/RGBA/BGRA,...\n");
    fprintf(stderr, " thread=8\n");
    fprintf(stderr, " method=kl/aciq/eq\n");
    fprintf(stderr, "Sample usage: ncnn2table squeezenet.param squeezenet.bin imagelist.txt squeezenet.table mean=[104.0,117.0,123.0] norm=[1.0,1.0,1.0] shape=[227,227,3] pixel=BGR method=kl\n");
}

int main(int argc, char** argv)
{
    if (argc < 5)
    {
        show_usage();
        return -1;
    }

    for (int i = 1; i < argc; i++)
    {
        if (argv[i][0] == '-')
        {
            show_usage();
            return -1;
        }
    }

    const char* inparam = argv[1];
    const char* inbin = argv[2];
    char* lists = argv[3];
    const char* outtable = argv[4];

    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_fp16_packed = false;
    opt.use_fp16_storage = false;
    opt.use_fp16_arithmetic = false;

    QuantNet net;
    net.opt = opt;
    net.load_param(inparam);
    net.load_model(inbin);

    net.init();

    // load lists
    net.listspaths = parse_comma_path_list(lists);

    std::string method = "kl";

    for (int i = 5; i < argc; i++)
    {
        // key=value
        char* kv = argv[i];

        char* eqs = strchr(kv, '=');
        if (eqs == NULL)
        {
            fprintf(stderr, "unrecognized arg %s\n", kv);
            continue;
        }

        // split k v
        eqs[0] = '\0';
        const char* key = kv;
        char* value = eqs + 1;

        // load mean norm shape
        if (memcmp(key, "mean", 4) == 0)
            net.means = parse_comma_float_array_list(value);
        if (memcmp(key, "norm", 4) == 0)
            net.norms = parse_comma_float_array_list(value);
        if (memcmp(key, "shape", 5) == 0)
            net.shapes = parse_comma_int_array_list(value);
        if (memcmp(key, "pixel", 5) == 0)
            net.type_to_pixels = parse_comma_pixel_type_list(value);
        if (memcmp(key, "thread", 6) == 0)
            net.quantize_num_threads = atoi(value);
        if (memcmp(key, "method", 6) == 0)
            method = std::string(value);
    }

    // sanity check
    const size_t input_blob_count = net.input_blobs.size();
    if (net.listspaths.size() != input_blob_count)
    {
        fprintf(stderr, "expect %d lists, but got %d\n", (int)input_blob_count, (int)net.listspaths.size());
        return -1;
    }
    if (net.means.size() != input_blob_count)
    {
        fprintf(stderr, "expect %d means, but got %d\n", (int)input_blob_count, (int)net.means.size());
        return -1;
    }
    if (net.norms.size() != input_blob_count)
    {
        fprintf(stderr, "expect %d norms, but got %d\n", (int)input_blob_count, (int)net.norms.size());
        return -1;
    }
    if (net.shapes.size() != input_blob_count)
    {
        fprintf(stderr, "expect %d shapes, but got %d\n", (int)input_blob_count, (int)net.shapes.size());
        return -1;
    }
    if (net.type_to_pixels.size() != input_blob_count)
    {
        fprintf(stderr, "expect %d pixels, but got %d\n", (int)input_blob_count, (int)net.type_to_pixels.size());
        return -1;
    }
    if (net.quantize_num_threads < 0)
    {
        fprintf(stderr, "malformed thread %d\n", net.quantize_num_threads);
        return -1;
    }

    // print quantnet config
    {
        fprintf(stderr, "mean = ");
        print_float_array_list(net.means);
        fprintf(stderr, "\n");
        fprintf(stderr, "norm = ");
        print_float_array_list(net.norms);
        fprintf(stderr, "\n");
        fprintf(stderr, "shape = ");
        print_int_array_list(net.shapes);
        fprintf(stderr, "\n");
        fprintf(stderr, "pixel = ");
        print_pixel_type_list(net.type_to_pixels);
        fprintf(stderr, "\n");
        fprintf(stderr, "thread = %d\n", net.quantize_num_threads);
        fprintf(stderr, "method = %s\n", method.c_str());
        fprintf(stderr, "---------------------------------------\n");
    }

    if (method == "kl")
    {
        net.quantize_KL();
    }
    else if (method == "aciq")
    {
        net.quantize_ACIQ();
    }
    else if (method == "eq")
    {
        net.quantize_EQ();
    }
    else
    {
        fprintf(stderr, "not implemented yet !\n");
        fprintf(stderr, "unknown method %s, expect kl / aciq / eq\n", method.c_str());
        return -1;
    }

    net.print_quant_info();

    net.save_table(outtable);

    return 0;
}