NVIDIA DeepStream SDK API Reference (7.1 Release)
#ifndef __NVDSINFER_TRTIS_SERVER_H__
#define __NVDSINFER_TRTIS_SERVER_H__

// Default Triton server settings. The header carries two sets of defaults;
// the platform conditional that selects between them (the 5.3 / 64 MB pair
// applies to Tegra, the 6.0 / 256 MB pair to x86) is elided in this listing.
#define TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY 5.3
#define TRITON_DEFAULT_PINNED_MEMORY_BYTES (1 << 26)

#define TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY 6.0
#define TRITON_DEFAULT_PINNED_MEMORY_BYTES (1 << 28)

#define TRITON_DEFAULT_BACKEND_DIR "/opt/tritonserver/backends"

struct TRITONSERVER_Server;

namespace ni = inference;

class TrtServerRequest;
class TrtServerResponse;
class TrtServerAllocator;

// Wrapper class for a Triton inference request (abridged).
class TrtServerRequest {
    // ...
    NvDsInferStatus init(
        const std::string& model, int64_t version, SharedBatchArray& inputs,
        const std::vector<std::string>& outputs, uint64_t reqId,
        const std::vector<TritonClassParams>& clasList);

    NvDsInferStatus setResponseComplete(
        ShrTritonAllocator& allocator,
        TRITONSERVER_InferenceResponseCompleteFn_t responseCompleteCb,
        void* responseUserPtr);

    static void RequestOnRelease(
        TRITONSERVER_InferenceRequest* request, const uint32_t flags,
        void* userp);

    TRITONSERVER_InferenceRequest* ptr() { return m_ReqPtr.get(); }
    const std::string& model() const { return m_Model; }
    uint64_t id() const { return m_ReqId; }
    uint64_t bufId() const { return m_BufId; }
    const std::vector<std::string>& outputs() const { return m_Outputs; }
    const std::map<std::string, TritonClassParams>& classParams() const {
        return m_ClasList; }

    // ...
    std::weak_ptr<TrtISServer> m_Server;
    uint64_t m_ReqId = UINT64_C(0);
    uint64_t m_BufId = UINT64_C(0);
    std::vector<std::string> m_Outputs;
    std::map<std::string, TritonClassParams> m_ClasList;
};

// Wrapper class for Triton output parsing (abridged).
class TrtServerResponse {
    // ...
    const std::string& model() const { return m_Model; }

    // ... uint32_t tensorIdx, const void* base);  // tail of an elided declaration

    uint64_t m_ResponseId = UINT64_C(0);
    std::weak_ptr<TrtISServer> m_Server;
    int64_t m_ModelVersion = UINT64_C(1);
    std::vector<SharedBatchBuf> m_BufOutputs;
};

// Wrapper class for the Triton server output memory allocator (abridged).
class TrtServerAllocator
    : public std::enable_shared_from_this<TrtServerAllocator> {
    // ...
    TRITONSERVER_ResponseAllocator* ptr() { return m_Allocator.get(); }

    static TRITONSERVER_Error* ResponseAlloc(
        TRITONSERVER_ResponseAllocator* allocator, const char* tensorName,
        /* ... */ int64_t preferredDevId, void* userP, void** buffer,
        void** bufferUserP /* , ... */);

    static TRITONSERVER_Error* ResponseRelease(
        TRITONSERVER_ResponseAllocator* allocator, void* buffer /* , ... */);
    // ...
};

namespace triton {
// Model repository settings (abridged).
struct RepoSettings {
    // ...
    bool initFrom(const ic::TritonModelRepo& repo, const std::vector<int>& devIds);
    // ...
};
}  // namespace triton

// Wrapper class for creating the Triton Inference Server instance (abridged).
class TrtISServer : public std::enable_shared_from_this<TrtISServer> {
    // ...
    bool isModelReady(const std::string& model, int64_t version);

    NvDsInferStatus getModelConfig(
        const std::string& model, int64_t version, ni::ModelConfig& config);

    SharedRequest createRequest(
        const std::string& model, int64_t version, SharedBatchArray& inputs,
        const std::vector<std::string>& outputs,
        const std::vector<TritonClassParams>& clasList);

    using InferUserData =
        std::tuple<SharedRequest, TritonInferAsyncDone, TrtISServer*>;

    static void InferComplete(
        TRITONSERVER_InferenceResponse* response, const uint32_t flags,
        void* userp);

    TRITONSERVER_Server* serverPtr() const { return m_Impl.get(); }

    static std::weak_ptr<TrtISServer> sTrtServerInstance;
    static std::mutex sTrtServerMutex;

    UniqTritonT<TRITONSERVER_Server> m_Impl;
    std::atomic<uint64_t> m_LastRequestId{UINT64_C(0)};
    triton::RepoSettings m_RepoSettings;
};

Referenced symbols and brief descriptions:
double minComputeCapacity
The minimum supported compute capability for the Triton server.
This is a header file for pre-processing cuda kernels with normalization and mean subtraction require...
bool initFrom(const ic::TritonModelRepo &repo, const std::vector< int > &devIds)
Populate the RepoSettings instance with the values from the TritonModelRepo protobuf message.
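A minimal usage sketch for the call above, assuming an already-parsed ic::TritonModelRepo message and a single GPU with ID 0; namespace qualifiers and error logging are omitted, and the helper name is hypothetical.

  // Hypothetical helper: map the TritonModelRepo protobuf onto RepoSettings.
  bool buildRepoSettings(
      const ic::TritonModelRepo& repoProto, triton::RepoSettings& settings) {
      const std::vector<int> devIds{0};             // GPU(s) this server should use
      return settings.initFrom(repoProto, devIds);  // false if population fails
  }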
const std::vector< std::string > & outputs() const
Get the list of requested output layer names.
std::unique_ptr< TrtServerResponse > UniqResponse
#define TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY
bool isModelReady(const std::string &model, int64_t version)
Check if the server is ready for inference using the specified model.
NvDsInferStatus setRequestComplete(TRITONSERVER_InferenceRequestReleaseFn_t requestCompleteCb, void *userPtr)
Set the release callback function for the request.
#define TRITON_DEFAULT_BACKEND_DIR
Wrapper class for creating a Triton Inference Server instance.
std::set< std::string > roots
Set of model repository directories.
uint64_t id() const
Get the request ID.
const std::map< std::string, TritonClassParams > & classParams() const
Get the Triton classification parameters list (tensor name : classification parameters).
std::string debugStr
Debug string of the TritonModelRepo protobuf message.
std::shared_ptr< SysMem > SharedSysMem
Wrapper class for Triton output parsing.
NvDsInferStatus getStatus() const
Check if the response could be parsed correctly.
std::shared_ptr< T > ShrTritonT
static TrtServerPtr getInstance(const triton::RepoSettings *repo)
Get a new or existing instance of the Triton Inference Server.
SharedRequest createRequest(const std::string &model, int64_t version, SharedBatchArray &inputs, const std::vector< std::string > &outputs, const std::vector< TritonClassParams > &clasList)
Create and initialize a new inference request.
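The two entry points above combine roughly as in the following sketch. This is an illustration only: the RepoSettings object settings, the input SharedBatchArray inputBatch, the model name and the output layer name are all assumptions.

  TrtServerPtr server = TrtISServer::getInstance(&settings);
  if (!server || !server->isServerLive() || !server->isServerReady())
      return;                                   // server could not be created or is not ready

  std::vector<std::string> outNames{"output_tensor"};   // hypothetical layer name
  std::vector<TritonClassParams> classList;              // empty: no classification parsing
  SharedRequest req = server->createRequest(
      "my_model", 1 /* version */, inputBatch, outNames, classList);
  if (!req)
      return;                                   // request could not be initialized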
enum TRITONSERVER_memorytype_enum TRITONSERVER_MemoryType
TRITONSERVER_MemoryType.
uint64_t pinnedMemBytes
Pre-allocated pinned host memory for the Triton runtime.
bool strictModelConfig
Flag to enable/disable Triton strict model configuration.
std::weak_ptr< TrtServerAllocator > WeakTritonAllocator
@ NVDSINFER_SUCCESS
NvDsInferContext operation succeeded.
Header file of batch buffer related class declarations.
Wrapper class for Triton server output memory allocator.
std::vector< SharedBatchBuf > & mutableOutputs()
Get the list of output batch buffers.
uint64_t bufId() const
Get the input buffer ID associated with the request.
std::shared_ptr< TrtServerAllocator > ShrTritonAllocator
bool isServerReady()
Check if the server is ready.
#define TRITON_DEFAULT_PINNED_MEMORY_BYTES
uint32_t logLevel
Level of the Triton log output.
std::shared_ptr< TrtISServer > TrtServerPtr
NvDsInferStatus initialize()
Create a new instance of the Triton Inference Server.
std::string backend
Name of the backend.
NvDsInferStatus getModelConfig(const std::string &model, int64_t version, ni::ModelConfig &config)
Get the model configuration for the specified model.
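For example, a caller might fetch the configuration of a model before building requests. A sketch, assuming server is a TrtServerPtr from TrtISServer::getInstance() and the model name is hypothetical:

  ni::ModelConfig config;                        // inference::ModelConfig protobuf
  if (server->getModelConfig("my_model", 1, config) != NVDSINFER_SUCCESS) {
      // model or version is not known to the repository
  } else {
      // fields such as config.max_batch_size() are now available
  }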
Model repository settings for the Triton Inference Server.
TRITONSERVER_ResponseAllocator * ptr()
Get the pointer to the TRITONSERVER_ResponseAllocator instance.
std::string key
Name of the setting.
TRITONSERVER_InferenceRequest * ptr()
Get the pointer to the Triton inference request object.
std::shared_ptr< TrtServerResponse > SharedResponse
TrtISServer(const triton::RepoSettings &repo)
Constructor.
std::unique_ptr< T, std::function< void(T *)> > UniqTritonT
void(* TRITONSERVER_InferenceResponseCompleteFn_t)(struct TRITONSERVER_InferenceResponse *response, const uint32_t flags, void *userp)
Type for callback function indicating that an inference response has completed.
float tfGpuMemoryFraction
TensorFlow GPU memory fraction per process.
int32_t controlMode
Triton model control mode.
std::function< void(SharedRequest, UniqResponse)> TritonInferAsyncDone
SharedOptions takeoverOptions()
Get and own the options list.
std::string value
Value of the setting.
@ TRITONSERVER_MODEL_CONTROL_EXPLICIT
TrtServerRequest(TrtServerPtr server)
Constructor.
std::shared_ptr< IOptions > SharedOptions
NvDsInferStatus init(const std::string &model, int64_t version, SharedBatchArray &inputs, const std::vector< std::string > &outputs, uint64_t reqId, const std::vector< TritonClassParams > &clasList)
Create a new Triton inference request with the specified inputs and parameters.
static void RequestOnRelease(TRITONSERVER_InferenceRequest *request, const uint32_t flags, void *userp)
The callback function to release the request instance.
const std::string & model() const
Get the model name parsed from the Triton response.
virtual ~TrtServerAllocator()=default
Destructor.
bool tfAllowSoftPlacement
Flag to enable/disable soft placement of TF operators.
std::function< SharedSysMem(const std::string &, size_t, InferMemType, int64_t)> AllocFn
~TrtServerRequest()
Destructor.
The backend configuration settings.
std::function< void(const std::string &, SharedSysMem)> FreeFn
bool isServerLive()
Check if the server is live.
std::string backendDirectory
The path to the Triton backends directory.
NvDsInferStatus parse(const TrtServerRequest *req)
Check for errors and parse the inference output.
NvDsInferStatus setResponseComplete(ShrTritonAllocator &allocator, TRITONSERVER_InferenceResponseCompleteFn_t responseCompleteCb, void *responseUserPtr)
Set the allocator and response callback for the request.
std::vector< BackendConfig > backendConfigs
Array of backend configuration settings.
bool operator!=(const RepoSettings &other) const
InferMemType
The memory types of inference buffers.
#define DISABLE_CLASS_COPY(NoCopyClass)
NvDsInferStatus loadModel(const std::string &modelName)
Load or reload the specified model.
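When the repository runs with explicit model control (TRITONSERVER_MODEL_CONTROL_EXPLICIT), load and unload calls are typically paired with the readiness check, roughly as below. A sketch only; server and the model name are assumptions.

  if (!server->isModelReady("my_model", 1)) {
      if (server->loadModel("my_model") != NVDSINFER_SUCCESS) {
          // load failed: bad repository path, bad model config, ...
      }
  }
  // ... run inference ...
  if (server->unloadModel("my_model") != NVDSINFER_SUCCESS) {
      // unload failed or the model was not loaded
  }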
bool operator==(const RepoSettings &other) const
Comparison operators.
void(* TRITONSERVER_InferenceRequestReleaseFn_t)(struct TRITONSERVER_InferenceRequest *request, const uint32_t flags, void *userp)
Type for inference request release callback function.
~TrtISServer()
Destructor.
const std::string & model() const
Get the model name.
TrtServerResponse(TrtServerPtr server, UniqTritonT< TRITONSERVER_InferenceResponse > data, uint64_t id)
Constructor.
Wrapper class for Triton inference request.
std::shared_ptr< TrtServerRequest > SharedRequest
std::shared_ptr< BaseBatchArray > SharedBatchArray
NvDsInferStatus unloadModel(const std::string &modelName)
Unload the specified model.
Triton Inference Server utilities header file.
NvDsInferStatus inferAsync(SharedRequest request, WeakTritonAllocator allocator, TritonInferAsyncDone done)
Submit a request for asynchronous inference.
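A sketch of the asynchronous path, assuming req comes from createRequest() and allocator is a ShrTritonAllocator (see the TrtServerAllocator sketch further below); the lambda matches the TritonInferAsyncDone signature.

  WeakTritonAllocator weakAlloc = allocator;     // pass a weak handle to the allocator
  NvDsInferStatus status = server->inferAsync(
      req, weakAlloc,
      [](SharedRequest request, UniqResponse response) {
          // Completion callback: ownership of the response wrapper is
          // transferred here; see TrtServerResponse for parsing/status APIs.
      });
  if (status != NVDSINFER_SUCCESS) {
      // the request was rejected before being queued
  }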
const triton::RepoSettings & getRepoSettings()
Get the model repository settings.
SharedBatchArray releaseInputs()
Release the ownership of the input batch buffer array.
TrtServerAllocator(AllocFn alloc, FreeFn release)
Constructor; creates an instance of the type TRITONSERVER_ResponseAllocator which calls provided alloc...
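The constructor takes callables matching the AllocFn and FreeFn typedefs shown above. A sketch with hypothetical pool helpers (myPoolAcquire/myPoolRelease are not part of the SDK):

  auto allocator = std::make_shared<TrtServerAllocator>(
      // AllocFn: provide SharedSysMem for a named output tensor.
      [](const std::string& tensor, size_t bytes, InferMemType memType,
         int64_t devId) -> SharedSysMem {
          return myPoolAcquire(tensor, bytes, memType, devId);   // hypothetical
      },
      // FreeFn: hand the memory of a named tensor back to the pool.
      [](const std::string& tensor, SharedSysMem mem) {
          myPoolRelease(tensor, std::move(mem));                 // hypothetical
      });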
NvDsInferStatus
Enum for the status codes returned by NvDsInferContext.
Holds the information about an inference buffer.
std::map< uint32_t, uint64_t > cudaDevMemMap
Map of device IDs to the corresponding size of the CUDA memory pool to be allocated.