#include <iostream>
#include <fstream>
#include <vector>
#include <cassert>
#include <cstdlib>
#include <cuda_runtime_api.h>
#include <NvInfer.h>
#include </home/cl/package/TensorRT-8.6.1.6/samples/common/logger.h>

// Abort on any failing CUDA call, printing a readable error message.
void CHECK(cudaError_t status) {
    if (status != cudaSuccess) {
        std::cerr << "Cuda failure: " << cudaGetErrorString(status) << std::endl;
        std::abort();
    }
}

using namespace nvinfer1;
using namespace sample;
// Tensor names as exported to ONNX ("947" and "961" are auto-generated node names).
const char* IN_NAME = "images";
const char* OUT_NAME1 = "output";
const char* OUT_NAME2 = "947";
const char* OUT_NAME3 = "961";
static const int IN_H = 640;
static const int IN_W = 640;
static const int BATCH_SIZE = 1;
static const int EXPLICIT_BATCH = 1 << static_cast<int>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); // unused in this inference-only program
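
// Assumed output layout (inferred from the buffer sizes used below): three
// detection heads at strides 4, 8, and 16, each grid cell predicting
// 3 anchors x 6 values (x, y, w, h, objectness, class score).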
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to the engine.
    // The engine requires exactly IEngine::getNbBindings() buffers.
    assert(engine.getNbBindings() == 4);
    void* buffers[4];

    // To bind the buffers we need the names of the input and output tensors.
    // Indices are guaranteed to be less than IEngine::getNbBindings().
    const int inputIndex = engine.getBindingIndex(IN_NAME);
    const int outputIndex1 = engine.getBindingIndex(OUT_NAME1);
    const int outputIndex2 = engine.getBindingIndex(OUT_NAME2);
    const int outputIndex3 = engine.getBindingIndex(OUT_NAME3);
    printf("outputIndex1: %d, outputIndex2: %d, outputIndex3: %d\n", outputIndex1, outputIndex2, outputIndex3);

    // Create GPU buffers on the device.
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * IN_H * IN_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex1], batchSize * 3 * (IN_H / 4) * (IN_W / 4) * 6 * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex2], batchSize * 3 * (IN_H / 8) * (IN_W / 8) * 6 * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex3], batchSize * 3 * (IN_H / 16) * (IN_W / 16) * 6 * sizeof(float)));

    // Create a stream.
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA the input to the device, run inference asynchronously, and DMA the
    // three outputs back to consecutive regions of the host buffer
    // (the offsets below assume batchSize == 1).
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * IN_H * IN_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    // enqueueV2 is required for engines built with an explicit batch dimension;
    // the legacy enqueue(batchSize, ...) only supports implicit-batch engines.
    context.enqueueV2(buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex1], batchSize * 3 * (IN_H / 4) * (IN_W / 4) * 6 * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(output + 3 * (IN_H / 4 * IN_W / 4) * 6, buffers[outputIndex2], batchSize * 3 * (IN_H / 8) * (IN_W / 8) * 6 * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(output + 3 * (IN_H / 4 * IN_W / 4 + IN_H / 8 * IN_W / 8) * 6, buffers[outputIndex3], batchSize * 3 * (IN_H / 16) * (IN_W / 16) * 6 * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaStreamSynchronize(stream));

    // Release the stream and device buffers.
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex1]));
    CHECK(cudaFree(buffers[outputIndex2]));
    CHECK(cudaFree(buffers[outputIndex3]));
}
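
// Sketch of the same copy-infer-copy flow using the name-based tensor API
// available since TensorRT 8.5, which deprecates binding indices. This is an
// illustrative alternative, not part of the original program; it assumes the
// same tensor names and head layout as doInference() above.
void doInferenceV3(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const size_t inBytes   = (size_t)batchSize * 3 * IN_H * IN_W * sizeof(float);
    const size_t outBytes1 = (size_t)batchSize * 3 * (IN_H / 4)  * (IN_W / 4)  * 6 * sizeof(float);
    const size_t outBytes2 = (size_t)batchSize * 3 * (IN_H / 8)  * (IN_W / 8)  * 6 * sizeof(float);
    const size_t outBytes3 = (size_t)batchSize * 3 * (IN_H / 16) * (IN_W / 16) * 6 * sizeof(float);

    void *dIn, *dOut1, *dOut2, *dOut3;
    CHECK(cudaMalloc(&dIn, inBytes));
    CHECK(cudaMalloc(&dOut1, outBytes1));
    CHECK(cudaMalloc(&dOut2, outBytes2));
    CHECK(cudaMalloc(&dOut3, outBytes3));

    // Bind device pointers to tensors by name instead of by binding index.
    context.setTensorAddress(IN_NAME, dIn);
    context.setTensorAddress(OUT_NAME1, dOut1);
    context.setTensorAddress(OUT_NAME2, dOut2);
    context.setTensorAddress(OUT_NAME3, dOut3);

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    CHECK(cudaMemcpyAsync(dIn, input, inBytes, cudaMemcpyHostToDevice, stream));
    context.enqueueV3(stream); // batch size is fixed by the explicit-batch engine
    CHECK(cudaMemcpyAsync(output, dOut1, outBytes1, cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(output + outBytes1 / sizeof(float), dOut2, outBytes2, cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(output + (outBytes1 + outBytes2) / sizeof(float), dOut3, outBytes3, cudaMemcpyDeviceToHost, stream));
    CHECK(cudaStreamSynchronize(stream));

    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(dIn));
    CHECK(cudaFree(dOut1));
    CHECK(cudaFree(dOut2));
    CHECK(cudaFree(dOut3));
}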
int main(int argc, char** argv)
{
    // Read the serialized engine from disk.
    char* trtModelStream{ nullptr };
    size_t size{ 0 };
    std::ifstream file("yolov5-crowd-n.engine", std::ios::binary);
    if (!file.good()) {
        std::cerr << "failed to open yolov5-crowd-n.engine" << std::endl;
        return -1;
    }
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    trtModelStream = new char[size];
    assert(trtModelStream);
    file.read(trtModelStream, size);
    file.close();

    // Deserialize the engine and create an execution context.
    Logger m_logger;
    IRuntime* runtime = createInferRuntime(m_logger);
    assert(runtime != nullptr);
    // TensorRT 8 dropped the IPluginFactory argument; deserializeCudaEngine
    // takes only the blob and its size.
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    delete[] trtModelStream;
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    // Generate dummy input data (all ones). Heap storage is used for both the
    // input and output buffers: at 640x640 they total several MB, which would
    // overflow a typical stack as plain arrays.
    std::vector<float> data(BATCH_SIZE * 3 * IN_H * IN_W, 1.0f);

    // Run inference; the host output buffer holds all three heads back to back.
    const int num_total = 3 * (IN_H / 4 * IN_W / 4 + IN_H / 8 * IN_W / 8 + IN_H / 16 * IN_W / 16) * 6;
    std::vector<float> prob(num_total);
    printf("num_total: %d\n", num_total);
    doInference(*context, data.data(), prob.data(), BATCH_SIZE);
    // Sanity-check the raw predictions: with sigmoid-activated outputs, the
    // objectness score (element 4 of each 6-value box) should never exceed 1,
    // so any box above that threshold is counted as invalid.
    float* prob_ptr = prob.data();
    int count = 0;
    int invalid_count = 0;
    for (int i = 0; i < num_total / 6; i++) {
        if (prob_ptr[4] > 1) {
            invalid_count++;
        } else {
            count++;
        }
        prob_ptr += 6;
    }
    printf("invalid_count: %d\n", invalid_count);
    printf("count: %d\n", count);
    printf("inference done\n");
    // Destroy the engine (destroy() is deprecated since TensorRT 8 but still works).
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}
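
A possible compile line, assuming the TensorRT install path from the include above and that the file is saved as main.cpp (file names are illustrative; logger.cpp from samples/common provides the sample Logger):

    g++ main.cpp logger.cpp -o trt_infer \
        -I/usr/local/cuda/include -I/home/cl/package/TensorRT-8.6.1.6/include \
        -L/usr/local/cuda/lib64 -L/home/cl/package/TensorRT-8.6.1.6/lib \
        -lnvinfer -lcudart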