#include <iostream>
#include <fstream>
#include <vector>
#include <cassert>
#include <cstdlib>
#include <cuda_runtime_api.h>
#include <NvInfer.h>
#include </home/cl/package/TensorRT-8.6.1.6/samples/common/logger.h>

// Abort on any failing CUDA call, printing a readable error message.
void CHECK(cudaError_t status) {
    if (status != cudaSuccess) {
        std::cerr << "Cuda failure: " << cudaGetErrorString(status) << std::endl;
        std::abort();
    }
}

using namespace nvinfer1;
using namespace sample;
// Tensor names as exported to ONNX ("947" and "961" are auto-generated node names).
const char* IN_NAME = "images";
const char* OUT_NAME1 = "output";
const char* OUT_NAME2 = "947";
const char* OUT_NAME3 = "961";
static const int IN_H = 640;
static const int IN_W = 640;
static const int BATCH_SIZE = 1;
static const int EXPLICIT_BATCH = 1 << static_cast<int>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); // unused in this inference-only program
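
// Assumed output layout (inferred from the buffer sizes used below): three
// detection heads at strides 4, 8, and 16, each grid cell predicting
// 3 anchors x 6 values (x, y, w, h, objectness, class score).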
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to the engine.
    // The engine requires exactly IEngine::getNbBindings() buffers.
    assert(engine.getNbBindings() == 4);
    void* buffers[4];

    // To bind the buffers we need the names of the input and output tensors.
    // Indices are guaranteed to be less than IEngine::getNbBindings().
    const int inputIndex = engine.getBindingIndex(IN_NAME);
    const int outputIndex1 = engine.getBindingIndex(OUT_NAME1);
    const int outputIndex2 = engine.getBindingIndex(OUT_NAME2);
    const int outputIndex3 = engine.getBindingIndex(OUT_NAME3);
    printf("outputIndex1: %d, outputIndex2: %d, outputIndex3: %d\n", outputIndex1, outputIndex2, outputIndex3);

    // Create GPU buffers on the device.
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * IN_H * IN_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex1], batchSize * 3 * (IN_H / 4) * (IN_W / 4) * 6 * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex2], batchSize * 3 * (IN_H / 8) * (IN_W / 8) * 6 * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex3], batchSize * 3 * (IN_H / 16) * (IN_W / 16) * 6 * sizeof(float)));

    // Create a stream.
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA the input to the device, run inference asynchronously, and DMA the
    // three outputs back to consecutive regions of the host buffer
    // (the offsets below assume batchSize == 1).
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * IN_H * IN_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    // enqueueV2 is required for engines built with an explicit batch dimension;
    // the legacy enqueue(batchSize, ...) only supports implicit-batch engines.
    context.enqueueV2(buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex1], batchSize * 3 * (IN_H / 4) * (IN_W / 4) * 6 * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(output + 3 * (IN_H / 4 * IN_W / 4) * 6, buffers[outputIndex2], batchSize * 3 * (IN_H / 8) * (IN_W / 8) * 6 * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(output + 3 * (IN_H / 4 * IN_W / 4 + IN_H / 8 * IN_W / 8) * 6, buffers[outputIndex3], batchSize * 3 * (IN_H / 16) * (IN_W / 16) * 6 * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaStreamSynchronize(stream));

    // Release the stream and device buffers.
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex1]));
    CHECK(cudaFree(buffers[outputIndex2]));
    CHECK(cudaFree(buffers[outputIndex3]));
}
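
// Sketch of the same copy-infer-copy flow using the name-based tensor API
// available since TensorRT 8.5, which deprecates binding indices. This is an
// illustrative alternative, not part of the original program; it assumes the
// same tensor names and head layout as doInference() above.
void doInferenceV3(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const size_t inBytes   = (size_t)batchSize * 3 * IN_H * IN_W * sizeof(float);
    const size_t outBytes1 = (size_t)batchSize * 3 * (IN_H / 4)  * (IN_W / 4)  * 6 * sizeof(float);
    const size_t outBytes2 = (size_t)batchSize * 3 * (IN_H / 8)  * (IN_W / 8)  * 6 * sizeof(float);
    const size_t outBytes3 = (size_t)batchSize * 3 * (IN_H / 16) * (IN_W / 16) * 6 * sizeof(float);

    void *dIn, *dOut1, *dOut2, *dOut3;
    CHECK(cudaMalloc(&dIn, inBytes));
    CHECK(cudaMalloc(&dOut1, outBytes1));
    CHECK(cudaMalloc(&dOut2, outBytes2));
    CHECK(cudaMalloc(&dOut3, outBytes3));

    // Bind device pointers to tensors by name instead of by binding index.
    context.setTensorAddress(IN_NAME, dIn);
    context.setTensorAddress(OUT_NAME1, dOut1);
    context.setTensorAddress(OUT_NAME2, dOut2);
    context.setTensorAddress(OUT_NAME3, dOut3);

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    CHECK(cudaMemcpyAsync(dIn, input, inBytes, cudaMemcpyHostToDevice, stream));
    context.enqueueV3(stream); // batch size is fixed by the explicit-batch engine
    CHECK(cudaMemcpyAsync(output, dOut1, outBytes1, cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(output + outBytes1 / sizeof(float), dOut2, outBytes2, cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(output + (outBytes1 + outBytes2) / sizeof(float), dOut3, outBytes3, cudaMemcpyDeviceToHost, stream));
    CHECK(cudaStreamSynchronize(stream));

    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(dIn));
    CHECK(cudaFree(dOut1));
    CHECK(cudaFree(dOut2));
    CHECK(cudaFree(dOut3));
}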
int main(int argc, char** argv)
{
    // Read the serialized engine from disk.
    char* trtModelStream{ nullptr };
    size_t size{ 0 };
    std::ifstream file("yolov5-crowd-n.engine", std::ios::binary);
    if (!file.good()) {
        std::cerr << "failed to open yolov5-crowd-n.engine" << std::endl;
        return -1;
    }
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    trtModelStream = new char[size];
    assert(trtModelStream);
    file.read(trtModelStream, size);
    file.close();

    // Deserialize the engine and create an execution context.
    Logger m_logger;
    IRuntime* runtime = createInferRuntime(m_logger);
    assert(runtime != nullptr);
    // TensorRT 8 dropped the IPluginFactory argument; deserializeCudaEngine
    // takes only the blob and its size.
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    delete[] trtModelStream;
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    // Generate dummy input data (all ones). Heap storage is used for both the
    // input and output buffers: at 640x640 they total several MB, which would
    // overflow a typical stack as plain arrays.
    std::vector<float> data(BATCH_SIZE * 3 * IN_H * IN_W, 1.0f);

    // Run inference; the host output buffer holds all three heads back to back.
    const int num_total = 3 * (IN_H / 4 * IN_W / 4 + IN_H / 8 * IN_W / 8 + IN_H / 16 * IN_W / 16) * 6;
    std::vector<float> prob(num_total);
    printf("num_total: %d\n", num_total);
    doInference(*context, data.data(), prob.data(), BATCH_SIZE);
    // Sanity-check the raw predictions: with sigmoid-activated outputs, the
    // objectness score (element 4 of each 6-value box) should never exceed 1,
    // so any box above that threshold is counted as invalid.
    float* prob_ptr = prob.data();
    int count = 0;
    int invalid_count = 0;
    for (int i = 0; i < num_total / 6; i++) {
        if (prob_ptr[4] > 1) {
            invalid_count++;
        } else {
            count++;
        }
        prob_ptr += 6;
    }
    printf("invalid_count: %d\n", invalid_count);
    printf("count: %d\n", count);
    printf("inference done\n");
    // Destroy the engine (destroy() is deprecated since TensorRT 8 but still works).
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}
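
A possible compile line, assuming the TensorRT install path from the include above and that the file is saved as main.cpp (file names are illustrative; logger.cpp from samples/common provides the sample Logger):

    g++ main.cpp logger.cpp -o trt_infer \
        -I/usr/local/cuda/include -I/home/cl/package/TensorRT-8.6.1.6/include \
        -L/usr/local/cuda/lib64 -L/home/cl/package/TensorRT-8.6.1.6/lib \
        -lnvinfer -lcudart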