#include <cassert>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <vector>

#include <cuda_runtime_api.h>
#include "NvInfer.h"
#include "logging.h" // sample::Logger helper shipped with the TensorRT samples

// Abort on any non-zero CUDA status code.
void CHECK(int status)
{
    if (status != 0) {
        std::cerr << "Cuda failure: " << status << std::endl;
        std::abort();
    }
}

using namespace nvinfer1;
using namespace sample;

// Binding names as exported to ONNX: "output" is the stride-4 head,
// "947" and "961" are the auto-generated names of the stride-8 and
// stride-16 heads.
const char* IN_NAME = "images";
const char* OUT_NAME1 = "output";
const char* OUT_NAME2 = "947";
const char* OUT_NAME3 = "961";
static const int IN_H = 640;
static const int IN_W = 640;
static const int BATCH_SIZE = 1;
static const int EXPLICIT_BATCH = 1 << (int)(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to the engine.
    // The engine requires exactly IEngine::getNbBindings() buffers.
    assert(engine.getNbBindings() == 4);
    void* buffers[4];

    // To bind the buffers, we need the names of the input and output tensors.
    // Indices are guaranteed to be less than IEngine::getNbBindings().
    const int inputIndex = engine.getBindingIndex(IN_NAME);
    const int outputIndex1 = engine.getBindingIndex(OUT_NAME1);
    const int outputIndex2 = engine.getBindingIndex(OUT_NAME2);
    const int outputIndex3 = engine.getBindingIndex(OUT_NAME3);
    printf("outputIndex1: %d, outputIndex2: %d, outputIndex3: %d\n",
           outputIndex1, outputIndex2, outputIndex3);

    // Create GPU buffers on the device. Each head holds 3 anchors *
    // (IN_H/stride) * (IN_W/stride) cells * 6 floats per prediction
    // (x, y, w, h, objectness, class score).
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * IN_H * IN_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex1], batchSize * 3 * IN_H / 4 * IN_W / 4 * 6 * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex2], batchSize * 3 * IN_H / 8 * IN_W / 8 * 6 * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex3], batchSize * 3 * IN_H / 16 * IN_W / 16 * 6 * sizeof(float)));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA the input to the device, infer on the batch asynchronously, and DMA
    // the three heads back to host, packed one after another in `output`.
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input,
                          batchSize * 3 * IN_H * IN_W * sizeof(float),
                          cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex1],
                          batchSize * 3 * IN_H / 4 * IN_W / 4 * 6 * sizeof(float),
                          cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(output + 3 * (IN_H / 4 * IN_W / 4) * 6, buffers[outputIndex2],
                          batchSize * 3 * IN_H / 8 * IN_W / 8 * 6 * sizeof(float),
                          cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(output + 3 * (IN_H / 4 * IN_W / 4 + IN_H / 8 * IN_W / 8) * 6, buffers[outputIndex3],
                          batchSize * 3 * IN_H / 16 * IN_W / 16 * 6 * sizeof(float),
                          cudaMemcpyDeviceToHost, stream));
    CHECK(cudaStreamSynchronize(stream));

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex1]));
    CHECK(cudaFree(buffers[outputIndex2]));
    CHECK(cudaFree(buffers[outputIndex3]));
}

int main(int argc, char** argv)
{
    // Read the serialized engine from disk.
    char* trtModelStream{ nullptr };
    size_t size{ 0 };

    std::ifstream file("yolov5-crowd-n.engine", std::ios::binary);
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    }

    // Deserialize the engine and create an execution context.
    Logger m_logger;
    IRuntime* runtime = createInferRuntime(m_logger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // Generate dummy input data (all ones). Heap storage is used because the
    // input (~4.7 MB) plus the outputs (~2.3 MB) would risk overflowing the
    // stack, and a variable-length array is not standard C++.
    std::vector<float> data(BATCH_SIZE * 3 * IN_H * IN_W, 1.0f);

    // Run inference. Each head contributes 3 * (IN_H/stride) * (IN_W/stride) * 6 floats.
    const int num_total = 3 * (IN_H / 4 * IN_W / 4 + IN_H / 8 * IN_W / 8 + IN_H / 16 * IN_W / 16) * 6;
    std::vector<float> prob(num_total);
    printf("num_total: %d\n", num_total);
    doInference(*context, data.data(), prob.data(), BATCH_SIZE);

    // Sanity-check the output: a raw objectness score (element 4 of each
    // 6-float prediction) above 1 cannot come from a sigmoid, so count it
    // as an invalid prediction.
    float* prob_ptr = prob.data();
    int count = 0;
    int invalid_count = 0;
    for (int i = 0; i < num_total / 6; i++) {
        if (prob_ptr[4] > 1) {
            invalid_count++;
        } else {
            count++;
        }
        prob_ptr += 6;
    }
    printf("invalid_count: %d\n", invalid_count);
    printf("count: %d\n", count);
    printf("inference done\n");

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}
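// ---------------------------------------------------------------------------
// Sketch, not part of the original program: one way the serialized
// "yolov5-crowd-n.engine" loaded by main() could be produced from an ONNX
// export, using the TensorRT 7 ONNX parser. The function name
// buildEngineFromOnnx, the 256 MiB workspace size, and the file paths are
// illustrative assumptions; a real build step would tune these (and possibly
// enable FP16) for the target GPU.
// ---------------------------------------------------------------------------
#include "NvOnnxParser.h"

bool buildEngineFromOnnx(const char* onnxPath, const char* enginePath, ILogger& logger)
{
    IBuilder* builder = createInferBuilder(logger);
    // EXPLICIT_BATCH is the network-creation flag defined at the top of this file.
    INetworkDefinition* network = builder->createNetworkV2(EXPLICIT_BATCH);
    nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, logger);
    if (!parser->parseFromFile(onnxPath, (int)ILogger::Severity::kWARNING)) {
        std::cerr << "Failed to parse " << onnxPath << std::endl;
        return false;
    }

    IBuilderConfig* config = builder->createBuilderConfig();
    config->setMaxWorkspaceSize(1ULL << 28); // 256 MiB of build scratch space (assumed)

    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    if (engine == nullptr) {
        return false;
    }

    // Serialize the engine to disk so main() can deserialize it later.
    IHostMemory* serialized = engine->serialize();
    std::ofstream out(enginePath, std::ios::binary);
    out.write(static_cast<const char*>(serialized->data()), serialized->size());

    // TensorRT 7 objects are released with destroy(), newest first.
    serialized->destroy();
    engine->destroy();
    config->destroy();
    parser->destroy();
    network->destroy();
    builder->destroy();
    return true;
}

// Example call (hypothetical paths):
//   Logger logger;
//   buildEngineFromOnnx("yolov5-crowd-n.onnx", "yolov5-crowd-n.engine", logger);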