// yolocrowd.cpp

#include "yolocrowd.hpp"
#include "ilogger.hpp"
#include "trt_infer.hpp"
#include "infer_controller.hpp"
#include "preprocess_kernel.cuh"
#include "monopoly_allocator.hpp"
#include <algorithm>
#include <cstring>
#include <atomic>
#include <mutex>
#include <queue>
#include <condition_variable>

namespace YoloCrowd
{
    using namespace cv;
    using namespace std;

    const char *type_name(Type type)
    {
        switch (type)
        {
        case Type::V5:
            return "YoloV5";
        case Type::V3:
            return "YoloV3";
        case Type::V7:
            return "YoloV7";
        case Type::X:
            return "YoloX";
        default:
            return "Unknown";
        }
    }
    void decode_kernel_invoker(
        float *predict, int num_bboxes, int num_classes, float confidence_threshold,
        float *invert_affine_matrix, float *parray,
        int max_objects, cudaStream_t stream);

    void nms_kernel_invoker(
        float *parray, float nms_threshold, int max_objects, cudaStream_t stream);

    struct AffineMatrix
    {
        float i2d[6]; // image to dst (network input), 2x3 matrix
        float d2i[6]; // dst to image, 2x3 matrix

        void compute(const cv::Size &from, const cv::Size &to)
        {
            float scale_x = to.width / (float)from.width;
            float scale_y = to.height / (float)from.height;
            float scale = std::min(scale_x, scale_y);

            // The "+ scale * 0.5 - 0.5" term keeps the image center aligned after scaling;
            // the effect is barely visible when downsampling but noticeable when upsampling.
            i2d[0] = scale;
            i2d[1] = 0;
            i2d[2] = -scale * from.width * 0.5 + to.width * 0.5 + scale * 0.5 - 0.5;
            i2d[3] = 0;
            i2d[4] = scale;
            i2d[5] = -scale * from.height * 0.5 + to.height * 0.5 + scale * 0.5 - 0.5;

            cv::Mat m2x3_i2d(2, 3, CV_32F, i2d);
            cv::Mat m2x3_d2i(2, 3, CV_32F, d2i);
            cv::invertAffineTransform(m2x3_i2d, m2x3_d2i);
        }

        cv::Mat i2d_mat()
        {
            return cv::Mat(2, 3, CV_32F, i2d);
        }
    };
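
    // A hedged worked example of the letterbox mapping above (illustrative numbers, not part of the build):
    // for a 1920x1080 source mapped to a 640x640 network input, scale = min(640/1920, 640/1080) = 1/3, so
    //   i2d = [ 1/3, 0, -1/3*960 + 320 + 1/6 - 0.5,   0, 1/3, -1/3*540 + 320 + 1/6 - 0.5 ]
    //       = [ 0.3333, 0, -0.3333,                   0, 0.3333, 139.6667 ].
    // The 1920-pixel width fills all 640 columns, while the 1080-pixel height shrinks to 360 rows and is
    // centered vertically, leaving ~140-pixel borders that the warp fills with the constant value 114.
    //
    //     AffineMatrix am;
    //     am.compute(cv::Size(1920, 1080), cv::Size(640, 640));
    //     // am.d2i now maps network-space boxes back to original image coordinates.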
    static float iou(const Box &a, const Box &b)
    {
        float cleft = max(a.left, b.left);
        float ctop = max(a.top, b.top);
        float cright = min(a.right, b.right);
        float cbottom = min(a.bottom, b.bottom);
        float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f);
        if (c_area == 0.0f)
            return 0.0f;

        float a_area = max(0.0f, a.right - a.left) * max(0.0f, a.bottom - a.top);
        float b_area = max(0.0f, b.right - b.left) * max(0.0f, b.bottom - b.top);
        return c_area / (a_area + b_area - c_area);
    }

    static BoxArray cpu_nms(BoxArray &boxes, float threshold)
    {
        std::sort(boxes.begin(), boxes.end(), [](BoxArray::const_reference a, BoxArray::const_reference b)
                  { return a.confidence > b.confidence; });

        BoxArray output;
        output.reserve(boxes.size());

        std::vector<bool> remove_flags(boxes.size());
        for (int i = 0; i < boxes.size(); ++i)
        {
            if (remove_flags[i])
                continue;

            auto &a = boxes[i];
            output.emplace_back(a);

            for (int j = i + 1; j < boxes.size(); ++j)
            {
                if (remove_flags[j])
                    continue;

                auto &b = boxes[j];
                if (b.class_label == a.class_label)
                {
                    if (iou(a, b) >= threshold)
                        remove_flags[j] = true;
                }
            }
        }
        return output;
    }
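
    // A minimal sketch of the CPU NMS behaviour (assuming BoxArray is a std::vector<Box> and Box is
    // constructible as {left, top, right, bottom, confidence, class_label}, matching the emplace_back
    // in worker() below):
    //
    //     BoxArray dets{ Box(10, 10, 110, 110, 0.9f, 0),     // highest score -> kept
    //                    Box(20, 20, 120, 120, 0.8f, 0),     // IoU ~0.68 with the first -> suppressed at 0.5
    //                    Box(300, 300, 400, 400, 0.7f, 0) }; // disjoint -> kept
    //     auto kept = cpu_nms(dets, 0.5f);                   // kept.size() == 2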
    using ControllerImpl = InferController<
        Mat,                // input
        BoxArray,           // output
        tuple<string, int>, // start param
        AffineMatrix        // additional
        >;

    class InferImpl : public Infer, public ControllerImpl
    {
    public:
        /** stop() must be executed here in InferImpl, not in the base class **/
        virtual ~InferImpl()
        {
            stop();
        }

        virtual bool startup(
            const string &file, Type type, int gpuid,
            float confidence_threshold, float nms_threshold,
            NMSMethod nms_method, int max_objects,
            bool use_multi_preprocess_stream)
        {
            if (type == Type::V5 || type == Type::V3 || type == Type::V7)
            {
                normalize_ = CUDAKernel::Norm::alpha_beta(1 / 255.0f, 0.0f, CUDAKernel::ChannelType::Invert);
            }
            else if (type == Type::X)
            {
                // float mean[] = {0.485, 0.456, 0.406};
                // float std[]  = {0.229, 0.224, 0.225};
                // normalize_ = CUDAKernel::Norm::mean_std(mean, std, 1/255.0f, CUDAKernel::ChannelType::Invert);
                normalize_ = CUDAKernel::Norm::None();
            }
            else
            {
                INFOE("Unsupported type %d", type);
            }

            use_multi_preprocess_stream_ = use_multi_preprocess_stream;
            confidence_threshold_ = confidence_threshold;
            nms_threshold_ = nms_threshold;
            nms_method_ = nms_method;
            max_objects_ = max_objects;
            return ControllerImpl::startup(make_tuple(file, gpuid));
        }
        virtual void worker(promise<bool> &result) override
        {
            string file = get<0>(start_param_);
            int gpuid = get<1>(start_param_);

            TRT::set_device(gpuid);
            auto engine = TRT::load_infer(file);
            if (engine == nullptr)
            {
                INFOE("Engine %s load failed", file.c_str());
                result.set_value(false);
                return;
            }

            engine->print();

            const int MAX_IMAGE_BBOX = max_objects_;
            const int NUM_BOX_ELEMENT = 7; // left, top, right, bottom, confidence, class, keepflag
            TRT::Tensor affin_matrix_device(TRT::DataType::Float);
            TRT::Tensor output_array_device(TRT::DataType::Float);
            int max_batch_size = engine->get_max_batch_size();
            auto input = engine->tensor("images");
            auto output = engine->tensor("output0");
            int num_classes = output->size(2) - 5;

            input_width_ = input->size(3);
            input_height_ = input->size(2);
            tensor_allocator_ = make_shared<MonopolyAllocator<TRT::Tensor>>(max_batch_size * 2);
            stream_ = engine->get_stream();
            gpu_ = gpuid;
            result.set_value(true);

            input->resize_single_dim(0, max_batch_size).to_gpu();
            affin_matrix_device.set_stream(stream_);

            // 8 values per matrix so that 8 * sizeof(float) % 32 == 0
            affin_matrix_device.resize(max_batch_size, 8).to_gpu();
            // The 1 + MAX_IMAGE_BBOX layout is: counter + bboxes ...
            output_array_device.resize(max_batch_size, 1 + MAX_IMAGE_BBOX * NUM_BOX_ELEMENT).to_gpu();

            std::vector<Job> fetch_jobs;
            while (get_jobs_and_wait(fetch_jobs, max_batch_size))
            {
                int infer_batch_size = fetch_jobs.size();
                input->resize_single_dim(0, infer_batch_size);

                for (int ibatch = 0; ibatch < infer_batch_size; ++ibatch)
                {
                    auto &job = fetch_jobs[ibatch];
                    auto &mono = job.mono_tensor->data();

                    if (mono->get_stream() != stream_)
                    {
                        // wait for the preprocess stream to finish
                        checkCudaRuntime(cudaStreamSynchronize(mono->get_stream()));
                    }

                    affin_matrix_device.copy_from_gpu(affin_matrix_device.offset(ibatch), mono->get_workspace()->gpu(), 6);
                    input->copy_from_gpu(input->offset(ibatch), mono->gpu(), mono->count());
                    job.mono_tensor->release();
                }

                engine->forward(false);
                output_array_device.to_gpu(false);

                for (int ibatch = 0; ibatch < infer_batch_size; ++ibatch)
                {
                    auto &job = fetch_jobs[ibatch];
                    float *image_based_output = output->gpu<float>(ibatch);
                    float *output_array_ptr = output_array_device.gpu<float>(ibatch);
                    auto affine_matrix = affin_matrix_device.gpu<float>(ibatch);
                    checkCudaRuntime(cudaMemsetAsync(output_array_ptr, 0, sizeof(int), stream_));
                    decode_kernel_invoker(image_based_output, output->size(1), num_classes, confidence_threshold_, affine_matrix, output_array_ptr, MAX_IMAGE_BBOX, stream_);

                    if (nms_method_ == NMSMethod::FastGPU)
                    {
                        nms_kernel_invoker(output_array_ptr, nms_threshold_, MAX_IMAGE_BBOX, stream_);
                    }
                }

                output_array_device.to_cpu();
                for (int ibatch = 0; ibatch < infer_batch_size; ++ibatch)
                {
                    float *parray = output_array_device.cpu<float>(ibatch);
                    int count = min(MAX_IMAGE_BBOX, (int)*parray);
                    auto &job = fetch_jobs[ibatch];
                    auto &image_based_boxes = job.output;

                    for (int i = 0; i < count; ++i)
                    {
                        float *pbox = parray + 1 + i * NUM_BOX_ELEMENT;
                        int label = pbox[5];
                        int keepflag = pbox[6];
                        if (keepflag == 1)
                        {
                            image_based_boxes.emplace_back(pbox[0], pbox[1], pbox[2], pbox[3], pbox[4], label);
                        }
                    }

                    if (nms_method_ == NMSMethod::CPU)
                    {
                        image_based_boxes = cpu_nms(image_based_boxes, nms_threshold_);
                    }
                    job.pro->set_value(image_based_boxes);
                }
                fetch_jobs.clear();
            }
            stream_ = nullptr;
            tensor_allocator_.reset();
            INFO("Engine destroy.");
        }
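
        // Layout note for the output_array_device buffer consumed in worker() above (per image slot):
        //   float[0]                       -> number of decoded boxes (clamped to MAX_IMAGE_BBOX on read-back)
        //   float[1 + i * NUM_BOX_ELEMENT] -> box i: left, top, right, bottom, confidence, class, keepflag
        // e.g. with the default max_objects_ = 1024 a slot holds 1 + 1024 * 7 = 7169 floats (~28 KB) per image.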
        virtual bool preprocess(Job &job, const Mat &image) override
        {
            if (tensor_allocator_ == nullptr)
            {
                INFOE("tensor_allocator_ is nullptr");
                return false;
            }

            if (image.empty())
            {
                INFOE("Image is empty");
                return false;
            }

            job.mono_tensor = tensor_allocator_->query();
            if (job.mono_tensor == nullptr)
            {
                INFOE("Tensor allocator query failed.");
                return false;
            }

            CUDATools::AutoDevice auto_device(gpu_);
            auto &tensor = job.mono_tensor->data();
            TRT::CUStream preprocess_stream = nullptr;

            if (tensor == nullptr)
            {
                // not yet initialized
                tensor = make_shared<TRT::Tensor>();
                tensor->set_workspace(make_shared<TRT::MixMemory>());

                if (use_multi_preprocess_stream_)
                {
                    checkCudaRuntime(cudaStreamCreate(&preprocess_stream));
                    // owner = true: the tensor must free the stream on destruction
                    tensor->set_stream(preprocess_stream, true);
                }
                else
                {
                    preprocess_stream = stream_;
                    // owner = false: the tensor does not own the shared inference stream
                    tensor->set_stream(preprocess_stream, false);
                }
            }

            Size input_size(input_width_, input_height_);
            job.additional.compute(image.size(), input_size);

            preprocess_stream = tensor->get_stream();
            tensor->resize(1, 3, input_height_, input_width_);

            size_t size_image = image.cols * image.rows * 3;
            size_t size_matrix = iLogger::upbound(sizeof(job.additional.d2i), 32);
            auto workspace = tensor->get_workspace();
            uint8_t *gpu_workspace = (uint8_t *)workspace->gpu(size_matrix + size_image);
            float *affine_matrix_device = (float *)gpu_workspace;
            uint8_t *image_device = size_matrix + gpu_workspace;

            uint8_t *cpu_workspace = (uint8_t *)workspace->cpu(size_matrix + size_image);
            float *affine_matrix_host = (float *)cpu_workspace;
            uint8_t *image_host = size_matrix + cpu_workspace;

            // checkCudaRuntime(cudaMemcpyAsync(image_host, image.data, size_image, cudaMemcpyHostToHost, stream_));
            // plain memcpy is faster here than a host-to-host cudaMemcpyAsync
            memcpy(image_host, image.data, size_image);
            memcpy(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i));
            checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, preprocess_stream));
            checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, preprocess_stream));

            CUDAKernel::warp_affine_bilinear_and_normalize_plane(
                image_device, image.cols * 3, image.cols, image.rows,
                tensor->gpu<float>(), input_width_, input_height_,
                affine_matrix_device, 114,
                normalize_, preprocess_stream);
            return true;
        }

        virtual std::vector<shared_future<BoxArray>> commits(const std::vector<Mat> &images) override
        {
            return ControllerImpl::commits(images);
        }

        virtual std::shared_future<BoxArray> commit(const Mat &image) override
        {
            return ControllerImpl::commit(image);
        }

    private:
        int input_width_ = 0;
        int input_height_ = 0;
        int gpu_ = 0;
        float confidence_threshold_ = 0;
        float nms_threshold_ = 0;
        int max_objects_ = 1024;
        NMSMethod nms_method_ = NMSMethod::FastGPU;
        TRT::CUStream stream_ = nullptr;
        bool use_multi_preprocess_stream_ = false;
        CUDAKernel::Norm normalize_;
    };
    shared_ptr<Infer> create_infer(
        const string &engine_file, Type type, int gpuid,
        float confidence_threshold, float nms_threshold,
        NMSMethod nms_method, int max_objects,
        bool use_multi_preprocess_stream)
    {
        shared_ptr<InferImpl> instance(new InferImpl());
        if (!instance->startup(
                engine_file, type, gpuid, confidence_threshold,
                nms_threshold, nms_method, max_objects, use_multi_preprocess_stream))
        {
            instance.reset();
        }
        return instance;
    }
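
    // A hedged usage sketch for create_infer() (engine path, image file and thresholds are illustrative;
    // the Type/NMSMethod values come from yolocrowd.hpp):
    //
    //     auto infer = YoloCrowd::create_infer(
    //         "yolocrowd.trt", YoloCrowd::Type::V5, /*gpuid=*/0,
    //         /*confidence_threshold=*/0.25f, /*nms_threshold=*/0.5f,
    //         YoloCrowd::NMSMethod::FastGPU, /*max_objects=*/1024,
    //         /*use_multi_preprocess_stream=*/false);
    //     if (infer == nullptr) return;                                   // startup failed, see log output
    //     auto boxes = infer->commit(cv::imread("crowd.jpg")).get();      // blocks until inference finishes
    //     for (auto &box : boxes)
    //         printf("%.1f %.1f %.1f %.1f conf=%.2f label=%d\n",
    //                box.left, box.top, box.right, box.bottom, box.confidence, box.class_label);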
    void image_to_tensor(const cv::Mat &image, shared_ptr<TRT::Tensor> &tensor, Type type, int ibatch)
    {
        CUDAKernel::Norm normalize;
        if (type == Type::V5 || type == Type::V3 || type == Type::V7)
        {
            normalize = CUDAKernel::Norm::alpha_beta(1 / 255.0f, 0.0f, CUDAKernel::ChannelType::Invert);
        }
        else if (type == Type::X)
        {
            // float mean[] = {0.485, 0.456, 0.406};
            // float std[]  = {0.229, 0.224, 0.225};
            // normalize = CUDAKernel::Norm::mean_std(mean, std, 1/255.0f, CUDAKernel::ChannelType::Invert);
            normalize = CUDAKernel::Norm::None();
        }
        else
        {
            INFOE("Unsupported type %d", type);
        }

        Size input_size(tensor->size(3), tensor->size(2));
        AffineMatrix affine;
        affine.compute(image.size(), input_size);

        size_t size_image = image.cols * image.rows * 3;
        size_t size_matrix = iLogger::upbound(sizeof(affine.d2i), 32);
        auto workspace = tensor->get_workspace();
        uint8_t *gpu_workspace = (uint8_t *)workspace->gpu(size_matrix + size_image);
        float *affine_matrix_device = (float *)gpu_workspace;
        uint8_t *image_device = size_matrix + gpu_workspace;

        uint8_t *cpu_workspace = (uint8_t *)workspace->cpu(size_matrix + size_image);
        float *affine_matrix_host = (float *)cpu_workspace;
        uint8_t *image_host = size_matrix + cpu_workspace;

        auto stream = tensor->get_stream();
        memcpy(image_host, image.data, size_image);
        memcpy(affine_matrix_host, affine.d2i, sizeof(affine.d2i));
        checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream));
        checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(affine.d2i), cudaMemcpyHostToDevice, stream));

        CUDAKernel::warp_affine_bilinear_and_normalize_plane(
            image_device, image.cols * 3, image.cols, image.rows,
            tensor->gpu<float>(ibatch), input_size.width, input_size.height,
            affine_matrix_device, 114,
            normalize, stream);
        tensor->synchronize();
    }
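
    // A hedged sketch of calling image_to_tensor() directly, assuming a preallocated NCHW float tensor
    // of the network's input shape (shape and file name are illustrative):
    //
    //     auto tensor = std::make_shared<TRT::Tensor>();
    //     tensor->set_workspace(std::make_shared<TRT::MixMemory>());
    //     tensor->resize(1, 3, 640, 640).to_gpu();                          // 640x640 input assumed
    //     image_to_tensor(cv::imread("crowd.jpg"), tensor, Type::V5, /*ibatch=*/0);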
}; // namespace YoloCrowd