#include "yolocrowd.hpp" #include "ilogger.hpp" #include "trt_infer.hpp" #include "infer_controller.hpp" #include "preprocess_kernel.cuh" #include "monopoly_allocator.hpp" #include #include #include #include namespace YoloCrowd { using namespace cv; using namespace std; const char *type_name(Type type) { switch (type) { case Type::V5: return "YoloV5"; case Type::V3: return "YoloV3"; case Type::V7: return "YoloV7"; case Type::X: return "YoloX"; default: return "Unknow"; } } void decode_kernel_invoker( float *predict, int num_bboxes, int num_classes, float confidence_threshold, float *invert_affine_matrix, float *parray, int max_objects, cudaStream_t stream); void nms_kernel_invoker( float *parray, float nms_threshold, int max_objects, cudaStream_t stream); struct AffineMatrix { float i2d[6]; // image to dst(network), 2x3 matrix float d2i[6]; // dst to image, 2x3 matrix void compute(const cv::Size &from, const cv::Size &to) { float scale_x = to.width / (float)from.width; float scale_y = to.height / (float)from.height; float scale = std::min(scale_x, scale_y); // + scale * 0.5 - 0.5 的主要原因是使得中心更加对齐,下采样不明显,但是上采样时就比较明显 i2d[0] = scale; i2d[1] = 0; i2d[2] = -scale * from.width * 0.5 + to.width * 0.5 + scale * 0.5 - 0.5; i2d[3] = 0; i2d[4] = scale; i2d[5] = -scale * from.height * 0.5 + to.height * 0.5 + scale * 0.5 - 0.5; cv::Mat m2x3_i2d(2, 3, CV_32F, i2d); cv::Mat m2x3_d2i(2, 3, CV_32F, d2i); cv::invertAffineTransform(m2x3_i2d, m2x3_d2i); } cv::Mat i2d_mat() { return cv::Mat(2, 3, CV_32F, i2d); } }; static float iou(const Box &a, const Box &b) { float cleft = max(a.left, b.left); float ctop = max(a.top, b.top); float cright = min(a.right, b.right); float cbottom = min(a.bottom, b.bottom); float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f); if (c_area == 0.0f) return 0.0f; float a_area = max(0.0f, a.right - a.left) * max(0.0f, a.bottom - a.top); float b_area = max(0.0f, b.right - b.left) * max(0.0f, b.bottom - b.top); return c_area / (a_area + b_area - c_area); } static BoxArray cpu_nms(BoxArray &boxes, float threshold) { std::sort(boxes.begin(), boxes.end(), [](BoxArray::const_reference a, BoxArray::const_reference b) { return a.confidence > b.confidence; }); BoxArray output; output.reserve(boxes.size()); std::vector remove_flags(boxes.size()); for (int i = 0; i < boxes.size(); ++i) { if (remove_flags[i]) continue; auto &a = boxes[i]; output.emplace_back(a); for (int j = i + 1; j < boxes.size(); ++j) { if (remove_flags[j]) continue; auto &b = boxes[j]; if (b.class_label == a.class_label) { if (iou(a, b) >= threshold) remove_flags[j] = true; } } } return output; } using ControllerImpl = InferController< Mat, // input BoxArray, // output tuple, // start param AffineMatrix // additional >; class InferImpl : public Infer, public ControllerImpl { public: /** 要求在InferImpl里面执行stop,而不是在基类执行stop **/ virtual ~InferImpl() { stop(); } virtual bool startup( const string &file, Type type, int gpuid, float confidence_threshold, float nms_threshold, NMSMethod nms_method, int max_objects, bool use_multi_preprocess_stream) { if (type == Type::V5 || type == Type::V3 || type == Type::V7) { normalize_ = CUDAKernel::Norm::alpha_beta(1 / 255.0f, 0.0f, CUDAKernel::ChannelType::Invert); } else if (type == Type::X) { // float mean[] = {0.485, 0.456, 0.406}; // float std[] = {0.229, 0.224, 0.225}; // normalize_ = CUDAKernel::Norm::mean_std(mean, std, 1/255.0f, CUDAKernel::ChannelType::Invert); normalize_ = CUDAKernel::Norm::None(); } else { INFOE("Unsupport type %d", type); } use_multi_preprocess_stream_ 
    static float iou(const Box &a, const Box &b) {
        float cleft   = max(a.left, b.left);
        float ctop    = max(a.top, b.top);
        float cright  = min(a.right, b.right);
        float cbottom = min(a.bottom, b.bottom);

        float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f);
        if (c_area == 0.0f)
            return 0.0f;

        float a_area = max(0.0f, a.right - a.left) * max(0.0f, a.bottom - a.top);
        float b_area = max(0.0f, b.right - b.left) * max(0.0f, b.bottom - b.top);
        return c_area / (a_area + b_area - c_area);
    }

    static BoxArray cpu_nms(BoxArray &boxes, float threshold) {
        std::sort(boxes.begin(), boxes.end(), [](BoxArray::const_reference a, BoxArray::const_reference b) {
            return a.confidence > b.confidence;
        });

        BoxArray output;
        output.reserve(boxes.size());

        std::vector<bool> remove_flags(boxes.size());
        for (int i = 0; i < boxes.size(); ++i) {
            if (remove_flags[i]) continue;

            auto &a = boxes[i];
            output.emplace_back(a);

            for (int j = i + 1; j < boxes.size(); ++j) {
                if (remove_flags[j]) continue;

                auto &b = boxes[j];
                if (b.class_label == a.class_label) {
                    if (iou(a, b) >= threshold)
                        remove_flags[j] = true;
                }
            }
        }
        return output;
    }

    using ControllerImpl = InferController<
        Mat,                // input
        BoxArray,           // output
        tuple<string, int>, // start param
        AffineMatrix        // additional
    >;

    class InferImpl : public Infer, public ControllerImpl {
    public:
        /** stop() must be executed in InferImpl, not in the base class **/
        virtual ~InferImpl() {
            stop();
        }

        virtual bool startup(
            const string &file, Type type, int gpuid,
            float confidence_threshold, float nms_threshold,
            NMSMethod nms_method, int max_objects,
            bool use_multi_preprocess_stream) {
            if (type == Type::V5 || type == Type::V3 || type == Type::V7) {
                normalize_ = CUDAKernel::Norm::alpha_beta(1 / 255.0f, 0.0f, CUDAKernel::ChannelType::Invert);
            } else if (type == Type::X) {
                // float mean[] = {0.485, 0.456, 0.406};
                // float std[]  = {0.229, 0.224, 0.225};
                // normalize_ = CUDAKernel::Norm::mean_std(mean, std, 1/255.0f, CUDAKernel::ChannelType::Invert);
                normalize_ = CUDAKernel::Norm::None();
            } else {
                INFOE("Unsupported type %d", type);
            }

            use_multi_preprocess_stream_ = use_multi_preprocess_stream;
            confidence_threshold_ = confidence_threshold;
            nms_threshold_        = nms_threshold;
            nms_method_           = nms_method;
            max_objects_          = max_objects;
            return ControllerImpl::startup(make_tuple(file, gpuid));
        }

        virtual void worker(promise<bool> &result) override {
            string file = get<0>(start_param_);
            int gpuid   = get<1>(start_param_);

            TRT::set_device(gpuid);
            auto engine = TRT::load_infer(file);
            if (engine == nullptr) {
                INFOE("Engine %s load failed", file.c_str());
                result.set_value(false);
                return;
            }

            engine->print();

            const int MAX_IMAGE_BBOX  = max_objects_;
            const int NUM_BOX_ELEMENT = 7; // left, top, right, bottom, confidence, class, keepflag
            TRT::Tensor affin_matrix_device(TRT::DataType::Float);
            TRT::Tensor output_array_device(TRT::DataType::Float);
            int max_batch_size = engine->get_max_batch_size();
            auto input         = engine->tensor("images");
            auto output        = engine->tensor("output0");
            int num_classes    = output->size(2) - 5;

            input_width_      = input->size(3);
            input_height_     = input->size(2);
            tensor_allocator_ = make_shared<MonopolyAllocator<TRT::Tensor>>(max_batch_size * 2);
            stream_           = engine->get_stream();
            gpu_              = gpuid;
            result.set_value(true);

            input->resize_single_dim(0, max_batch_size).to_gpu();
            affin_matrix_device.set_stream(stream_);

            // 8 values per matrix so that 8 * sizeof(float) % 32 == 0
            affin_matrix_device.resize(max_batch_size, 8).to_gpu();

            // the "1 + MAX_IMAGE_BBOX" layout is: counter + bboxes...
            output_array_device.resize(max_batch_size, 1 + MAX_IMAGE_BBOX * NUM_BOX_ELEMENT).to_gpu();

            std::vector<Job> fetch_jobs;
            while (get_jobs_and_wait(fetch_jobs, max_batch_size)) {
                int infer_batch_size = fetch_jobs.size();
                input->resize_single_dim(0, infer_batch_size);

                for (int ibatch = 0; ibatch < infer_batch_size; ++ibatch) {
                    auto &job  = fetch_jobs[ibatch];
                    auto &mono = job.mono_tensor->data();

                    if (mono->get_stream() != stream_) {
                        // wait for the preprocess stream to finish
                        checkCudaRuntime(cudaStreamSynchronize(mono->get_stream()));
                    }

                    affin_matrix_device.copy_from_gpu(affin_matrix_device.offset(ibatch), mono->get_workspace()->gpu(), 6);
                    input->copy_from_gpu(input->offset(ibatch), mono->gpu(), mono->count());
                    job.mono_tensor->release();
                }

                engine->forward(false);
                output_array_device.to_gpu(false);
                for (int ibatch = 0; ibatch < infer_batch_size; ++ibatch) {
                    auto &job                 = fetch_jobs[ibatch];
                    float *image_based_output = output->gpu<float>(ibatch);
                    float *output_array_ptr   = output_array_device.gpu<float>(ibatch);
                    auto affine_matrix        = affin_matrix_device.gpu<float>(ibatch);

                    // reset the box counter before decoding
                    checkCudaRuntime(cudaMemsetAsync(output_array_ptr, 0, sizeof(int), stream_));
                    decode_kernel_invoker(image_based_output, output->size(1), num_classes, confidence_threshold_, affine_matrix, output_array_ptr, MAX_IMAGE_BBOX, stream_);

                    if (nms_method_ == NMSMethod::FastGPU) {
                        nms_kernel_invoker(output_array_ptr, nms_threshold_, MAX_IMAGE_BBOX, stream_);
                    }
                }

                output_array_device.to_cpu();
                for (int ibatch = 0; ibatch < infer_batch_size; ++ibatch) {
                    float *parray = output_array_device.cpu<float>(ibatch);
                    int count     = min(MAX_IMAGE_BBOX, (int)*parray);
                    auto &job     = fetch_jobs[ibatch];
                    auto &image_based_boxes = job.output;

                    for (int i = 0; i < count; ++i) {
                        float *pbox  = parray + 1 + i * NUM_BOX_ELEMENT;
                        int label    = pbox[5];
                        int keepflag = pbox[6];
                        if (keepflag == 1) {
                            image_based_boxes.emplace_back(pbox[0], pbox[1], pbox[2], pbox[3], pbox[4], label);
                        }
                    }

                    if (nms_method_ == NMSMethod::CPU) {
                        image_based_boxes = cpu_nms(image_based_boxes, nms_threshold_);
                    }
                    job.pro->set_value(image_based_boxes);
                }
                fetch_jobs.clear();
            }
            stream_ = nullptr;
            tensor_allocator_.reset();
            INFO("Engine destroy.");
        }
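        // Per-image preprocess workspace layout, mirrored on host and device:
        //   [ d2i matrix (padded up to a 32-byte boundary) | raw BGR image bytes ]
        // preprocess() fills the host copy, issues two async H2D copies, then
        // launches the warp/normalize kernel on the tensor's stream; worker()
        // later reads the matrix back out of the same workspace (6 floats).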
        virtual bool preprocess(Job &job, const Mat &image) override {
            if (tensor_allocator_ == nullptr) {
                INFOE("tensor_allocator_ is nullptr");
                return false;
            }

            if (image.empty()) {
                INFOE("Image is empty");
                return false;
            }

            job.mono_tensor = tensor_allocator_->query();
            if (job.mono_tensor == nullptr) {
                INFOE("Tensor allocator query failed.");
                return false;
            }

            CUDATools::AutoDevice auto_device(gpu_);
            auto &tensor = job.mono_tensor->data();
            TRT::CUStream preprocess_stream = nullptr;

            if (tensor == nullptr) {
                // not initialized yet
                tensor = make_shared<TRT::Tensor>();
                tensor->set_workspace(make_shared<TRT::MixMemory>());

                if (use_multi_preprocess_stream_) {
                    checkCudaRuntime(cudaStreamCreate(&preprocess_stream));

                    // owner = true, the stream is freed when the tensor is destroyed
                    tensor->set_stream(preprocess_stream, true);
                } else {
                    preprocess_stream = stream_;

                    // owner = false, the tensor does not own the stream
                    tensor->set_stream(preprocess_stream, false);
                }
            }

            Size input_size(input_width_, input_height_);
            job.additional.compute(image.size(), input_size);

            preprocess_stream = tensor->get_stream();
            tensor->resize(1, 3, input_height_, input_width_);

            size_t size_image  = image.cols * image.rows * 3;
            size_t size_matrix = iLogger::upbound(sizeof(job.additional.d2i), 32);
            auto workspace     = tensor->get_workspace();

            uint8_t *gpu_workspace      = (uint8_t *)workspace->gpu(size_matrix + size_image);
            float *affine_matrix_device = (float *)gpu_workspace;
            uint8_t *image_device       = size_matrix + gpu_workspace;

            uint8_t *cpu_workspace    = (uint8_t *)workspace->cpu(size_matrix + size_image);
            float *affine_matrix_host = (float *)cpu_workspace;
            uint8_t *image_host       = size_matrix + cpu_workspace;

            // checkCudaRuntime(cudaMemcpyAsync(image_host, image.data, size_image, cudaMemcpyHostToHost, stream_));
            // speed up
            memcpy(image_host, image.data, size_image);
            memcpy(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i));
            checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, preprocess_stream));
            checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, preprocess_stream));

            CUDAKernel::warp_affine_bilinear_and_normalize_plane(
                image_device, image.cols * 3, image.cols, image.rows,
                tensor->gpu<float>(), input_width_, input_height_,
                affine_matrix_device, 114,
                normalize_, preprocess_stream);
            return true;
        }

        virtual std::vector<shared_future<BoxArray>> commits(const std::vector<Mat> &images) override {
            return ControllerImpl::commits(images);
        }

        virtual std::shared_future<BoxArray> commit(const Mat &image) override {
            return ControllerImpl::commit(image);
        }

    private:
        int input_width_            = 0;
        int input_height_           = 0;
        int gpu_                    = 0;
        float confidence_threshold_ = 0;
        float nms_threshold_        = 0;
        int max_objects_            = 1024;
        NMSMethod nms_method_       = NMSMethod::FastGPU;
        TRT::CUStream stream_       = nullptr;
        bool use_multi_preprocess_stream_ = false;
        CUDAKernel::Norm normalize_;
    };

    shared_ptr<Infer> create_infer(
        const string &engine_file, Type type, int gpuid,
        float confidence_threshold, float nms_threshold,
        NMSMethod nms_method, int max_objects,
        bool use_multi_preprocess_stream) {
        shared_ptr<InferImpl> instance(new InferImpl());
        if (!instance->startup(
                engine_file, type, gpuid, confidence_threshold,
                nms_threshold, nms_method, max_objects, use_multi_preprocess_stream)) {
            instance.reset();
        }
        return instance;
    }
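    // Minimal usage sketch (illustrative; "yolov5s_crowd.trt" and "crowd.jpg"
    // are placeholder paths, and the type/threshold values are examples):
    //
    //   auto detector = YoloCrowd::create_infer(
    //       "yolov5s_crowd.trt", YoloCrowd::Type::V5, 0 /* gpuid */,
    //       0.25f /* confidence */, 0.45f /* nms */,
    //       YoloCrowd::NMSMethod::FastGPU, 1024, false);
    //   if (detector) {
    //       cv::Mat image = cv::imread("crowd.jpg");
    //       auto boxes = detector->commit(image).get(); // blocks until inference finishes
    //       for (auto &box : boxes)
    //           printf("label=%d conf=%.2f rect=(%.0f,%.0f,%.0f,%.0f)\n",
    //                  box.class_label, box.confidence,
    //                  box.left, box.top, box.right, box.bottom);
    //   }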
    void image_to_tensor(const cv::Mat &image, shared_ptr<TRT::Tensor> &tensor, Type type, int ibatch) {
        CUDAKernel::Norm normalize;
        if (type == Type::V5 || type == Type::V3 || type == Type::V7) {
            normalize = CUDAKernel::Norm::alpha_beta(1 / 255.0f, 0.0f, CUDAKernel::ChannelType::Invert);
        } else if (type == Type::X) {
            // float mean[] = {0.485, 0.456, 0.406};
            // float std[]  = {0.229, 0.224, 0.225};
            // normalize = CUDAKernel::Norm::mean_std(mean, std, 1/255.0f, CUDAKernel::ChannelType::Invert);
            normalize = CUDAKernel::Norm::None();
        } else {
            INFOE("Unsupported type %d", type);
        }

        Size input_size(tensor->size(3), tensor->size(2));
        AffineMatrix affine;
        affine.compute(image.size(), input_size);

        size_t size_image  = image.cols * image.rows * 3;
        size_t size_matrix = iLogger::upbound(sizeof(affine.d2i), 32);
        auto workspace     = tensor->get_workspace();
        uint8_t *gpu_workspace      = (uint8_t *)workspace->gpu(size_matrix + size_image);
        float *affine_matrix_device = (float *)gpu_workspace;
        uint8_t *image_device       = size_matrix + gpu_workspace;

        uint8_t *cpu_workspace    = (uint8_t *)workspace->cpu(size_matrix + size_image);
        float *affine_matrix_host = (float *)cpu_workspace;
        uint8_t *image_host       = size_matrix + cpu_workspace;

        auto stream = tensor->get_stream();
        memcpy(image_host, image.data, size_image);
        memcpy(affine_matrix_host, affine.d2i, sizeof(affine.d2i));
        checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream));
        checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(affine.d2i), cudaMemcpyHostToDevice, stream));

        CUDAKernel::warp_affine_bilinear_and_normalize_plane(
            image_device, image.cols * 3, image.cols, image.rows,
            tensor->gpu<float>(ibatch), input_size.width, input_size.height,
            affine_matrix_device, 114,
            normalize, stream);
        tensor->synchronize();
    }
}; // namespace YoloCrowd
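// Sketch of calling image_to_tensor directly, e.g. for INT8 calibration or for
// debugging preprocessing (illustrative; assumes an engine whose input binding
// is named "images", with placeholder engine/image paths):
//
//   auto engine = TRT::load_infer("yolov5s_crowd.trt");
//   auto input  = engine->tensor("images");
//   YoloCrowd::image_to_tensor(cv::imread("crowd.jpg"), input, YoloCrowd::Type::V5, 0);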