test_trt.cpp

#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <vector>

#include <cuda_runtime_api.h>
#include <NvInfer.h>
#include </home/cl/package/TensorRT-8.6.1.6/samples/common/logger.h>

// Abort on any non-zero CUDA status code.
void CHECK(int status)
{
    if (status != 0) {
        std::cerr << "Cuda failure: " << status << std::endl;
        std::abort();
    }
}

using namespace nvinfer1;
using namespace sample;

// Binding names of the yolov5-crowd-n engine: one input and three output heads.
const char* IN_NAME = "images";
const char* OUT_NAME1 = "output";
const char* OUT_NAME2 = "947";
const char* OUT_NAME3 = "961";
static const int IN_H = 640;
static const int IN_W = 640;
static const int BATCH_SIZE = 1;
static const int EXPLICIT_BATCH = 1 << (int)(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to the engine.
    // The engine requires exactly IEngine::getNbBindings() buffers.
    assert(engine.getNbBindings() == 4);
    void* buffers[4];

    // To bind the buffers, we need the names of the input and output tensors.
    // Binding indices are guaranteed to be less than IEngine::getNbBindings().
    const int inputIndex = engine.getBindingIndex(IN_NAME);
    const int outputIndex1 = engine.getBindingIndex(OUT_NAME1);
    const int outputIndex2 = engine.getBindingIndex(OUT_NAME2);
    const int outputIndex3 = engine.getBindingIndex(OUT_NAME3);
    printf("outputIndex1: %d, outputIndex2: %d, outputIndex3: %d\n", outputIndex1, outputIndex2, outputIndex3);

    // Create GPU buffers on the device. Each output head holds
    // 3 anchors * (IN_H/s * IN_W/s) cells * 6 values per box, for s = 4, 8, 16.
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * IN_H * IN_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex1], batchSize * 3 * IN_H / 4 * IN_W / 4 * 6 * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex2], batchSize * 3 * IN_H / 8 * IN_W / 8 * 6 * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex3], batchSize * 3 * IN_H / 16 * IN_W / 16 * 6 * sizeof(float)));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to the device, infer on the batch asynchronously,
    // and DMA the three outputs back to host, packed one after another.
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * IN_H * IN_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    // enqueue() is deprecated for explicit-batch engines in TensorRT 8; use enqueueV2().
    context.enqueueV2(buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex1], batchSize * 3 * IN_H / 4 * IN_W / 4 * 6 * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(output + 3 * (IN_H / 4 * IN_W / 4) * 6, buffers[outputIndex2], batchSize * 3 * IN_H / 8 * IN_W / 8 * 6 * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(output + 3 * (IN_H / 4 * IN_W / 4 + IN_H / 8 * IN_W / 8) * 6, buffers[outputIndex3], batchSize * 3 * IN_H / 16 * IN_W / 16 * 6 * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex1]));
    CHECK(cudaFree(buffers[outputIndex2]));
    CHECK(cudaFree(buffers[outputIndex3]));
}

int main(int argc, char** argv)
{
    // Read the serialized engine from disk.
    char* trtModelStream{ nullptr };
    size_t size{ 0 };
    std::ifstream file("yolov5-crowd-n.engine", std::ios::binary);
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    }

    // Deserialize the engine and create an execution context.
    Logger m_logger;
    IRuntime* runtime = createInferRuntime(m_logger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Generate dummy input data (all ones); heap-allocated to avoid a ~4.9 MB stack array.
    std::vector<float> data(BATCH_SIZE * 3 * IN_H * IN_W, 1.0f);

    // Run inference. The three heads are packed back to back in `prob`,
    // 6 floats per candidate box.
    int num_total = 3 * (IN_H / 4 * IN_W / 4 + IN_H / 8 * IN_W / 8 + IN_H / 16 * IN_W / 16) * 6;
    std::vector<float> prob(num_total);
    printf("num_total: %d\n", num_total);
    doInference(*context, data.data(), prob.data(), BATCH_SIZE);

    // Count boxes whose objectness score (element 4 of each 6-float box) is in range.
    float* prob_ptr = prob.data();
    int count = 0;
    int invalid_count = 0;
    for (int i = 0; i < num_total / 6; i++) {
        //for (int j = 0; j < 6; j++)
        //    printf("%f ", prob_ptr[j]);
        //printf("\n");
        if (prob_ptr[4] > 1) {
            invalid_count++;
        } else {
            count++;
        }
        prob_ptr += 6;
    }
    printf("invalid_count: %d\n", invalid_count);
    printf("count: %d\n", count);
    printf("inference done\n");

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}
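
// Build sketch (assumptions: TensorRT 8.6 installed at the path used in the
// include above, CUDA under /usr/local/cuda, and the samples' logger.cpp
// compiled alongside this file; adjust paths to your setup):
//
//   g++ -std=c++11 test_trt.cpp \
//       /home/cl/package/TensorRT-8.6.1.6/samples/common/logger.cpp \
//       -I/home/cl/package/TensorRT-8.6.1.6/include -I/usr/local/cuda/include \
//       -L/home/cl/package/TensorRT-8.6.1.6/lib -L/usr/local/cuda/lib64 \
//       -lnvinfer -lcudart -o test_trt
//
// Run it from the directory containing yolov5-crowd-n.engine:
//
//   ./test_trt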