// yolocrowd.cpp

#include "yolocrowd.hpp"
#include "ilogger.hpp"
#include "trt_infer.hpp"
#include "infer_controller.hpp"
#include "preprocess_kernel.cuh"
#include "monopoly_allocator.hpp"
#include <algorithm>
#include <cstring>
#include <atomic>
#include <mutex>
#include <queue>
#include <condition_variable>

namespace YoloCrowd
{
    using namespace cv;
    using namespace std;

    const char *type_name(Type type)
    {
        switch (type)
        {
        case Type::V5:
            return "YoloV5";
        case Type::V3:
            return "YoloV3";
        case Type::V7:
            return "YoloV7";
        case Type::X:
            return "YoloX";
        default:
            return "Unknown";
        }
    }
    void decode_kernel_invoker(
        float *predict, int num_bboxes, int num_classes, float confidence_threshold,
        float *invert_affine_matrix, float *parray,
        int max_objects, cudaStream_t stream);

    void nms_kernel_invoker(
        float *parray, float nms_threshold, int max_objects, cudaStream_t stream);

    struct AffineMatrix
    {
        float i2d[6]; // image to dst (network input), 2x3 matrix
        float d2i[6]; // dst to image, 2x3 matrix

        void compute(const cv::Size &from, const cv::Size &to)
        {
            float scale_x = to.width / (float)from.width;
            float scale_y = to.height / (float)from.height;
            float scale = std::min(scale_x, scale_y);

            // The "+ scale * 0.5 - 0.5" term keeps the image center aligned after scaling;
            // the effect is barely visible when downsampling but noticeable when upsampling.
            i2d[0] = scale;
            i2d[1] = 0;
            i2d[2] = -scale * from.width * 0.5 + to.width * 0.5 + scale * 0.5 - 0.5;
            i2d[3] = 0;
            i2d[4] = scale;
            i2d[5] = -scale * from.height * 0.5 + to.height * 0.5 + scale * 0.5 - 0.5;

            cv::Mat m2x3_i2d(2, 3, CV_32F, i2d);
            cv::Mat m2x3_d2i(2, 3, CV_32F, d2i);
            cv::invertAffineTransform(m2x3_i2d, m2x3_d2i);
        }

        cv::Mat i2d_mat()
        {
            return cv::Mat(2, 3, CV_32F, i2d);
        }
    };
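
    // A hedged worked example of the letterbox mapping above (illustrative numbers, not part of the build):
    // for a 1920x1080 source mapped to a 640x640 network input, scale = min(640/1920, 640/1080) = 1/3, so
    //   i2d = [ 1/3, 0, -1/3*960 + 320 + 1/6 - 0.5,   0, 1/3, -1/3*540 + 320 + 1/6 - 0.5 ]
    //       = [ 0.3333, 0, -0.3333,                   0, 0.3333, 139.6667 ].
    // The 1920-pixel width fills all 640 columns, while the 1080-pixel height shrinks to 360 rows and is
    // centered vertically, leaving ~140-pixel borders that the warp fills with the constant value 114.
    //
    //     AffineMatrix am;
    //     am.compute(cv::Size(1920, 1080), cv::Size(640, 640));
    //     // am.d2i now maps network-space boxes back to original image coordinates.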
    static float iou(const Box &a, const Box &b)
    {
        float cleft = max(a.left, b.left);
        float ctop = max(a.top, b.top);
        float cright = min(a.right, b.right);
        float cbottom = min(a.bottom, b.bottom);
        float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f);
        if (c_area == 0.0f)
            return 0.0f;

        float a_area = max(0.0f, a.right - a.left) * max(0.0f, a.bottom - a.top);
        float b_area = max(0.0f, b.right - b.left) * max(0.0f, b.bottom - b.top);
        return c_area / (a_area + b_area - c_area);
    }

    static BoxArray cpu_nms(BoxArray &boxes, float threshold)
    {
        std::sort(boxes.begin(), boxes.end(), [](BoxArray::const_reference a, BoxArray::const_reference b)
                  { return a.confidence > b.confidence; });

        BoxArray output;
        output.reserve(boxes.size());

        std::vector<bool> remove_flags(boxes.size());
        for (int i = 0; i < boxes.size(); ++i)
        {
            if (remove_flags[i])
                continue;

            auto &a = boxes[i];
            output.emplace_back(a);

            for (int j = i + 1; j < boxes.size(); ++j)
            {
                if (remove_flags[j])
                    continue;

                auto &b = boxes[j];
                if (b.class_label == a.class_label)
                {
                    if (iou(a, b) >= threshold)
                        remove_flags[j] = true;
                }
            }
        }
        return output;
    }
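
    // A minimal sketch of the CPU NMS behaviour (assuming BoxArray is a std::vector<Box> and Box is
    // constructible as {left, top, right, bottom, confidence, class_label}, matching the emplace_back
    // in worker() below):
    //
    //     BoxArray dets{ Box(10, 10, 110, 110, 0.9f, 0),     // highest score -> kept
    //                    Box(20, 20, 120, 120, 0.8f, 0),     // IoU ~0.68 with the first -> suppressed at 0.5
    //                    Box(300, 300, 400, 400, 0.7f, 0) }; // disjoint -> kept
    //     auto kept = cpu_nms(dets, 0.5f);                   // kept.size() == 2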
    using ControllerImpl = InferController<
        Mat,                // input
        BoxArray,           // output
        tuple<string, int>, // start param
        AffineMatrix        // additional
        >;

    class InferImpl : public Infer, public ControllerImpl
    {
    public:
        /** stop() must be executed here in InferImpl, not in the base class **/
        virtual ~InferImpl()
        {
            stop();
        }

        virtual bool startup(
            const string &file, Type type, int gpuid,
            float confidence_threshold, float nms_threshold,
            NMSMethod nms_method, int max_objects,
            bool use_multi_preprocess_stream)
        {
            if (type == Type::V5 || type == Type::V3 || type == Type::V7)
            {
                normalize_ = CUDAKernel::Norm::alpha_beta(1 / 255.0f, 0.0f, CUDAKernel::ChannelType::Invert);
            }
            else if (type == Type::X)
            {
                // float mean[] = {0.485, 0.456, 0.406};
                // float std[]  = {0.229, 0.224, 0.225};
                // normalize_ = CUDAKernel::Norm::mean_std(mean, std, 1/255.0f, CUDAKernel::ChannelType::Invert);
                normalize_ = CUDAKernel::Norm::None();
            }
            else
            {
                INFOE("Unsupported type %d", type);
            }

            use_multi_preprocess_stream_ = use_multi_preprocess_stream;
            confidence_threshold_ = confidence_threshold;
            nms_threshold_ = nms_threshold;
            nms_method_ = nms_method;
            max_objects_ = max_objects;
            return ControllerImpl::startup(make_tuple(file, gpuid));
        }
        virtual void worker(promise<bool> &result) override
        {
            string file = get<0>(start_param_);
            int gpuid = get<1>(start_param_);

            TRT::set_device(gpuid);
            auto engine = TRT::load_infer(file);
            if (engine == nullptr)
            {
                INFOE("Engine %s load failed", file.c_str());
                result.set_value(false);
                return;
            }

            engine->print();

            const int MAX_IMAGE_BBOX = max_objects_;
            const int NUM_BOX_ELEMENT = 7; // left, top, right, bottom, confidence, class, keepflag
            TRT::Tensor affin_matrix_device(TRT::DataType::Float);
            TRT::Tensor output_array_device(TRT::DataType::Float);
            int max_batch_size = engine->get_max_batch_size();
            auto input = engine->tensor("images");
            auto output = engine->tensor("output0");
            int num_classes = output->size(2) - 5;

            input_width_ = input->size(3);
            input_height_ = input->size(2);
            tensor_allocator_ = make_shared<MonopolyAllocator<TRT::Tensor>>(max_batch_size * 2);
            stream_ = engine->get_stream();
            gpu_ = gpuid;
            result.set_value(true);

            input->resize_single_dim(0, max_batch_size).to_gpu();
            affin_matrix_device.set_stream(stream_);

            // 8 values per matrix so that 8 * sizeof(float) % 32 == 0
            affin_matrix_device.resize(max_batch_size, 8).to_gpu();
            // The 1 + MAX_IMAGE_BBOX layout is: counter + bboxes ...
            output_array_device.resize(max_batch_size, 1 + MAX_IMAGE_BBOX * NUM_BOX_ELEMENT).to_gpu();

            std::vector<Job> fetch_jobs;
            while (get_jobs_and_wait(fetch_jobs, max_batch_size))
            {
                int infer_batch_size = fetch_jobs.size();
                input->resize_single_dim(0, infer_batch_size);

                for (int ibatch = 0; ibatch < infer_batch_size; ++ibatch)
                {
                    auto &job = fetch_jobs[ibatch];
                    auto &mono = job.mono_tensor->data();

                    if (mono->get_stream() != stream_)
                    {
                        // wait for the preprocess stream to finish
                        checkCudaRuntime(cudaStreamSynchronize(mono->get_stream()));
                    }

                    affin_matrix_device.copy_from_gpu(affin_matrix_device.offset(ibatch), mono->get_workspace()->gpu(), 6);
                    input->copy_from_gpu(input->offset(ibatch), mono->gpu(), mono->count());
                    job.mono_tensor->release();
                }

                engine->forward(false);
                output_array_device.to_gpu(false);

                for (int ibatch = 0; ibatch < infer_batch_size; ++ibatch)
                {
                    auto &job = fetch_jobs[ibatch];
                    float *image_based_output = output->gpu<float>(ibatch);
                    float *output_array_ptr = output_array_device.gpu<float>(ibatch);
                    auto affine_matrix = affin_matrix_device.gpu<float>(ibatch);
                    checkCudaRuntime(cudaMemsetAsync(output_array_ptr, 0, sizeof(int), stream_));
                    decode_kernel_invoker(image_based_output, output->size(1), num_classes, confidence_threshold_, affine_matrix, output_array_ptr, MAX_IMAGE_BBOX, stream_);

                    if (nms_method_ == NMSMethod::FastGPU)
                    {
                        nms_kernel_invoker(output_array_ptr, nms_threshold_, MAX_IMAGE_BBOX, stream_);
                    }
                }

                output_array_device.to_cpu();
                for (int ibatch = 0; ibatch < infer_batch_size; ++ibatch)
                {
                    float *parray = output_array_device.cpu<float>(ibatch);
                    int count = min(MAX_IMAGE_BBOX, (int)*parray);
                    auto &job = fetch_jobs[ibatch];
                    auto &image_based_boxes = job.output;

                    for (int i = 0; i < count; ++i)
                    {
                        float *pbox = parray + 1 + i * NUM_BOX_ELEMENT;
                        int label = pbox[5];
                        int keepflag = pbox[6];
                        if (keepflag == 1)
                        {
                            image_based_boxes.emplace_back(pbox[0], pbox[1], pbox[2], pbox[3], pbox[4], label);
                        }
                    }

                    if (nms_method_ == NMSMethod::CPU)
                    {
                        image_based_boxes = cpu_nms(image_based_boxes, nms_threshold_);
                    }
                    job.pro->set_value(image_based_boxes);
                }
                fetch_jobs.clear();
            }
            stream_ = nullptr;
            tensor_allocator_.reset();
            INFO("Engine destroy.");
        }
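
        // Layout note for the output_array_device buffer consumed in worker() above (per image slot):
        //   float[0]                       -> number of decoded boxes (clamped to MAX_IMAGE_BBOX on read-back)
        //   float[1 + i * NUM_BOX_ELEMENT] -> box i: left, top, right, bottom, confidence, class, keepflag
        // e.g. with the default max_objects_ = 1024 a slot holds 1 + 1024 * 7 = 7169 floats (~28 KB) per image.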
        virtual bool preprocess(Job &job, const Mat &image) override
        {
            if (tensor_allocator_ == nullptr)
            {
                INFOE("tensor_allocator_ is nullptr");
                return false;
            }

            if (image.empty())
            {
                INFOE("Image is empty");
                return false;
            }

            job.mono_tensor = tensor_allocator_->query();
            if (job.mono_tensor == nullptr)
            {
                INFOE("Tensor allocator query failed.");
                return false;
            }

            CUDATools::AutoDevice auto_device(gpu_);
            auto &tensor = job.mono_tensor->data();
            TRT::CUStream preprocess_stream = nullptr;

            if (tensor == nullptr)
            {
                // not yet initialized
                tensor = make_shared<TRT::Tensor>();
                tensor->set_workspace(make_shared<TRT::MixMemory>());

                if (use_multi_preprocess_stream_)
                {
                    checkCudaRuntime(cudaStreamCreate(&preprocess_stream));
                    // owner = true: the tensor must free the stream on destruction
                    tensor->set_stream(preprocess_stream, true);
                }
                else
                {
                    preprocess_stream = stream_;
                    // owner = false: the tensor does not own the shared inference stream
                    tensor->set_stream(preprocess_stream, false);
                }
            }

            Size input_size(input_width_, input_height_);
            job.additional.compute(image.size(), input_size);

            preprocess_stream = tensor->get_stream();
            tensor->resize(1, 3, input_height_, input_width_);

            size_t size_image = image.cols * image.rows * 3;
            size_t size_matrix = iLogger::upbound(sizeof(job.additional.d2i), 32);
            auto workspace = tensor->get_workspace();
            uint8_t *gpu_workspace = (uint8_t *)workspace->gpu(size_matrix + size_image);
            float *affine_matrix_device = (float *)gpu_workspace;
            uint8_t *image_device = size_matrix + gpu_workspace;

            uint8_t *cpu_workspace = (uint8_t *)workspace->cpu(size_matrix + size_image);
            float *affine_matrix_host = (float *)cpu_workspace;
            uint8_t *image_host = size_matrix + cpu_workspace;

            // checkCudaRuntime(cudaMemcpyAsync(image_host, image.data, size_image, cudaMemcpyHostToHost, stream_));
            // plain memcpy is faster here than a host-to-host cudaMemcpyAsync
            memcpy(image_host, image.data, size_image);
            memcpy(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i));
            checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, preprocess_stream));
            checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, preprocess_stream));

            CUDAKernel::warp_affine_bilinear_and_normalize_plane(
                image_device, image.cols * 3, image.cols, image.rows,
                tensor->gpu<float>(), input_width_, input_height_,
                affine_matrix_device, 114,
                normalize_, preprocess_stream);
            return true;
        }

        virtual std::vector<shared_future<BoxArray>> commits(const std::vector<Mat> &images) override
        {
            return ControllerImpl::commits(images);
        }

        virtual std::shared_future<BoxArray> commit(const Mat &image) override
        {
            return ControllerImpl::commit(image);
        }

    private:
        int input_width_ = 0;
        int input_height_ = 0;
        int gpu_ = 0;
        float confidence_threshold_ = 0;
        float nms_threshold_ = 0;
        int max_objects_ = 1024;
        NMSMethod nms_method_ = NMSMethod::FastGPU;
        TRT::CUStream stream_ = nullptr;
        bool use_multi_preprocess_stream_ = false;
        CUDAKernel::Norm normalize_;
    };
    shared_ptr<Infer> create_infer(
        const string &engine_file, Type type, int gpuid,
        float confidence_threshold, float nms_threshold,
        NMSMethod nms_method, int max_objects,
        bool use_multi_preprocess_stream)
    {
        shared_ptr<InferImpl> instance(new InferImpl());
        if (!instance->startup(
                engine_file, type, gpuid, confidence_threshold,
                nms_threshold, nms_method, max_objects, use_multi_preprocess_stream))
        {
            instance.reset();
        }
        return instance;
    }
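
    // A hedged usage sketch for create_infer() (engine path, image file and thresholds are illustrative;
    // the Type/NMSMethod values come from yolocrowd.hpp):
    //
    //     auto infer = YoloCrowd::create_infer(
    //         "yolocrowd.trt", YoloCrowd::Type::V5, /*gpuid=*/0,
    //         /*confidence_threshold=*/0.25f, /*nms_threshold=*/0.5f,
    //         YoloCrowd::NMSMethod::FastGPU, /*max_objects=*/1024,
    //         /*use_multi_preprocess_stream=*/false);
    //     if (infer == nullptr) return;                                   // startup failed, see log output
    //     auto boxes = infer->commit(cv::imread("crowd.jpg")).get();      // blocks until inference finishes
    //     for (auto &box : boxes)
    //         printf("%.1f %.1f %.1f %.1f conf=%.2f label=%d\n",
    //                box.left, box.top, box.right, box.bottom, box.confidence, box.class_label);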
    void image_to_tensor(const cv::Mat &image, shared_ptr<TRT::Tensor> &tensor, Type type, int ibatch)
    {
        CUDAKernel::Norm normalize;
        if (type == Type::V5 || type == Type::V3 || type == Type::V7)
        {
            normalize = CUDAKernel::Norm::alpha_beta(1 / 255.0f, 0.0f, CUDAKernel::ChannelType::Invert);
        }
        else if (type == Type::X)
        {
            // float mean[] = {0.485, 0.456, 0.406};
            // float std[]  = {0.229, 0.224, 0.225};
            // normalize = CUDAKernel::Norm::mean_std(mean, std, 1/255.0f, CUDAKernel::ChannelType::Invert);
            normalize = CUDAKernel::Norm::None();
        }
        else
        {
            INFOE("Unsupported type %d", type);
        }

        Size input_size(tensor->size(3), tensor->size(2));
        AffineMatrix affine;
        affine.compute(image.size(), input_size);

        size_t size_image = image.cols * image.rows * 3;
        size_t size_matrix = iLogger::upbound(sizeof(affine.d2i), 32);
        auto workspace = tensor->get_workspace();
        uint8_t *gpu_workspace = (uint8_t *)workspace->gpu(size_matrix + size_image);
        float *affine_matrix_device = (float *)gpu_workspace;
        uint8_t *image_device = size_matrix + gpu_workspace;

        uint8_t *cpu_workspace = (uint8_t *)workspace->cpu(size_matrix + size_image);
        float *affine_matrix_host = (float *)cpu_workspace;
        uint8_t *image_host = size_matrix + cpu_workspace;

        auto stream = tensor->get_stream();
        memcpy(image_host, image.data, size_image);
        memcpy(affine_matrix_host, affine.d2i, sizeof(affine.d2i));
        checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream));
        checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(affine.d2i), cudaMemcpyHostToDevice, stream));

        CUDAKernel::warp_affine_bilinear_and_normalize_plane(
            image_device, image.cols * 3, image.cols, image.rows,
            tensor->gpu<float>(ibatch), input_size.width, input_size.height,
            affine_matrix_device, 114,
            normalize, stream);
        tensor->synchronize();
    }
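
    // A hedged sketch of calling image_to_tensor() directly, assuming a preallocated NCHW float tensor
    // of the network's input shape (shape and file name are illustrative):
    //
    //     auto tensor = std::make_shared<TRT::Tensor>();
    //     tensor->set_workspace(std::make_shared<TRT::MixMemory>());
    //     tensor->resize(1, 3, 640, 640).to_gpu();                          // 640x640 input assumed
    //     image_to_tensor(cv::imread("crowd.jpg"), tensor, Type::V5, /*ibatch=*/0);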
}; // namespace YoloCrowd