NVIDIA DeepStream SDK API Reference (7.1 Release)
#ifndef __NVDSINFER_TRTIS_SERVER_H__
#define __NVDSINFER_TRTIS_SERVER_H__

// Default Triton server settings. The header carries two sets of defaults;
// the platform conditional that selects between them (the 5.3 / 64 MB pair
// applies to Tegra, the 6.0 / 256 MB pair to x86) is elided in this listing.
#define TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY 5.3
#define TRITON_DEFAULT_PINNED_MEMORY_BYTES (1 << 26)

#define TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY 6.0
#define TRITON_DEFAULT_PINNED_MEMORY_BYTES (1 << 28)

#define TRITON_DEFAULT_BACKEND_DIR "/opt/tritonserver/backends"

struct TRITONSERVER_Server;

namespace ni = inference;

class TrtServerRequest;
class TrtServerResponse;
class TrtServerAllocator;

// Wrapper class for a Triton inference request (abridged).
class TrtServerRequest {
    // ...
    NvDsInferStatus init(
        const std::string& model, int64_t version, SharedBatchArray& inputs,
        const std::vector<std::string>& outputs, uint64_t reqId,
        const std::vector<TritonClassParams>& clasList);

    NvDsInferStatus setResponseComplete(
        ShrTritonAllocator& allocator,
        TRITONSERVER_InferenceResponseCompleteFn_t responseCompleteCb,
        void* responseUserPtr);

    static void RequestOnRelease(
        TRITONSERVER_InferenceRequest* request, const uint32_t flags,
        void* userp);

    TRITONSERVER_InferenceRequest* ptr() { return m_ReqPtr.get(); }
    const std::string& model() const { return m_Model; }
    uint64_t id() const { return m_ReqId; }
    uint64_t bufId() const { return m_BufId; }
    const std::vector<std::string>& outputs() const { return m_Outputs; }
    const std::map<std::string, TritonClassParams>& classParams() const {
        return m_ClasList; }

    // ...
    std::weak_ptr<TrtISServer> m_Server;
    uint64_t m_ReqId = UINT64_C(0);
    uint64_t m_BufId = UINT64_C(0);
    std::vector<std::string> m_Outputs;
    std::map<std::string, TritonClassParams> m_ClasList;
};

// Wrapper class for Triton output parsing (abridged).
class TrtServerResponse {
    // ...
    const std::string& model() const { return m_Model; }

    // ... uint32_t tensorIdx, const void* base);  // tail of an elided declaration

    uint64_t m_ResponseId = UINT64_C(0);
    std::weak_ptr<TrtISServer> m_Server;
    int64_t m_ModelVersion = UINT64_C(1);
    std::vector<SharedBatchBuf> m_BufOutputs;
};

// Wrapper class for the Triton server output memory allocator (abridged).
class TrtServerAllocator
    : public std::enable_shared_from_this<TrtServerAllocator> {
    // ...
    TRITONSERVER_ResponseAllocator* ptr() { return m_Allocator.get(); }

    static TRITONSERVER_Error* ResponseAlloc(
        TRITONSERVER_ResponseAllocator* allocator, const char* tensorName,
        /* ... */ int64_t preferredDevId, void* userP, void** buffer,
        void** bufferUserP /* , ... */);

    static TRITONSERVER_Error* ResponseRelease(
        TRITONSERVER_ResponseAllocator* allocator, void* buffer /* , ... */);
    // ...
};

namespace triton {
// Model repository settings (abridged).
struct RepoSettings {
    // ...
    bool initFrom(const ic::TritonModelRepo& repo, const std::vector<int>& devIds);
    // ...
};
}  // namespace triton

// Wrapper class for creating the Triton Inference Server instance (abridged).
class TrtISServer : public std::enable_shared_from_this<TrtISServer> {
    // ...
    bool isModelReady(const std::string& model, int64_t version);

    NvDsInferStatus getModelConfig(
        const std::string& model, int64_t version, ni::ModelConfig& config);

    SharedRequest createRequest(
        const std::string& model, int64_t version, SharedBatchArray& inputs,
        const std::vector<std::string>& outputs,
        const std::vector<TritonClassParams>& clasList);

    using InferUserData =
        std::tuple<SharedRequest, TritonInferAsyncDone, TrtISServer*>;

    static void InferComplete(
        TRITONSERVER_InferenceResponse* response, const uint32_t flags,
        void* userp);

    TRITONSERVER_Server* serverPtr() const { return m_Impl.get(); }

    static std::weak_ptr<TrtISServer> sTrtServerInstance;
    static std::mutex sTrtServerMutex;

    UniqTritonT<TRITONSERVER_Server> m_Impl;
    std::atomic<uint64_t> m_LastRequestId{UINT64_C(0)};
    triton::RepoSettings m_RepoSettings;
};

Referenced symbols and brief descriptions:
double minComputeCapacity
The minimum supported compute capability for the Triton server.
This is a header file for pre-processing cuda kernels with normalization and mean subtraction require...
bool initFrom(const ic::TritonModelRepo &repo, const std::vector< int > &devIds)
Populate the RepoSettings instance with the values from the TritonModelRepo protobuf message.
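A minimal usage sketch for the call above, assuming an already-parsed ic::TritonModelRepo message and a single GPU with ID 0; namespace qualifiers and error logging are omitted, and the helper name is hypothetical.

  // Hypothetical helper: map the TritonModelRepo protobuf onto RepoSettings.
  bool buildRepoSettings(
      const ic::TritonModelRepo& repoProto, triton::RepoSettings& settings) {
      const std::vector<int> devIds{0};             // GPU(s) this server should use
      return settings.initFrom(repoProto, devIds);  // false if population fails
  }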
const std::vector< std::string > & outputs() const
Get the list of requested output layer names.
std::unique_ptr< TrtServerResponse > UniqResponse
#define TRITON_DEFAULT_MINIMUM_COMPUTE_CAPABILITY
bool isModelReady(const std::string &model, int64_t version)
Check if the server is ready for inference using the specified model.
NvDsInferStatus setRequestComplete(TRITONSERVER_InferenceRequestReleaseFn_t requestCompleteCb, void *userPtr)
Set the release callback function for the request.
#define TRITON_DEFAULT_BACKEND_DIR
Wrapper class for creating a Triton Inference Server instance.
std::set< std::string > roots
Set of model repository directories.
uint64_t id() const
Get the request ID.
const std::map< std::string, TritonClassParams > & classParams() const
Get the Triton classification parameters list (tensor name : classification parameters).
std::string debugStr
Debug string of the TritonModelRepo protobuf message.
std::shared_ptr< SysMem > SharedSysMem
Wrapper class for Triton output parsing.
NvDsInferStatus getStatus() const
Check if the response could be parsed correctly.
std::shared_ptr< T > ShrTritonT
static TrtServerPtr getInstance(const triton::RepoSettings *repo)
Get a new or existing instance of the Triton Inference Server.
SharedRequest createRequest(const std::string &model, int64_t version, SharedBatchArray &inputs, const std::vector< std::string > &outputs, const std::vector< TritonClassParams > &clasList)
Create and initialize a new inference request.
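The two entry points above combine roughly as in the following sketch. This is an illustration only: the RepoSettings object settings, the input SharedBatchArray inputBatch, the model name and the output layer name are all assumptions.

  TrtServerPtr server = TrtISServer::getInstance(&settings);
  if (!server || !server->isServerLive() || !server->isServerReady())
      return;                                   // server could not be created or is not ready

  std::vector<std::string> outNames{"output_tensor"};   // hypothetical layer name
  std::vector<TritonClassParams> classList;              // empty: no classification parsing
  SharedRequest req = server->createRequest(
      "my_model", 1 /* version */, inputBatch, outNames, classList);
  if (!req)
      return;                                   // request could not be initialized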
enum TRITONSERVER_memorytype_enum TRITONSERVER_MemoryType
TRITONSERVER_MemoryType.
uint64_t pinnedMemBytes
Pre-allocated pinned host memory for the Triton runtime.
bool strictModelConfig
Flag to enable/disable Triton strict model configuration.
std::weak_ptr< TrtServerAllocator > WeakTritonAllocator
@ NVDSINFER_SUCCESS
NvDsInferContext operation succeeded.
Header file of batch buffer related class declarations.
Wrapper class for Triton server output memory allocator.
std::vector< SharedBatchBuf > & mutableOutputs()
Get the list of output batch buffers.
uint64_t bufId() const
Get the input buffer ID associated with the request.
std::shared_ptr< TrtServerAllocator > ShrTritonAllocator
bool isServerReady()
Check if the server is ready.
#define TRITON_DEFAULT_PINNED_MEMORY_BYTES
uint32_t logLevel
Level of the Triton log output.
std::shared_ptr< TrtISServer > TrtServerPtr
NvDsInferStatus initialize()
Create a new instance of the Triton Inference Server.
std::string backend
Name of the backend.
NvDsInferStatus getModelConfig(const std::string &model, int64_t version, ni::ModelConfig &config)
Get the model configuration for the specified model.
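For example, a caller might fetch the configuration of a model before building requests. A sketch, assuming server is a TrtServerPtr from TrtISServer::getInstance() and the model name is hypothetical:

  ni::ModelConfig config;                        // inference::ModelConfig protobuf
  if (server->getModelConfig("my_model", 1, config) != NVDSINFER_SUCCESS) {
      // model or version is not known to the repository
  } else {
      // fields such as config.max_batch_size() are now available
  }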
Model repository settings for the Triton Inference Server.
TRITONSERVER_ResponseAllocator * ptr()
Get the pointer to the TRITONSERVER_ResponseAllocator instance.
std::string key
Name of the setting.
TRITONSERVER_InferenceRequest * ptr()
Get the pointer to the Triton inference request object.
std::shared_ptr< TrtServerResponse > SharedResponse
TrtISServer(const triton::RepoSettings &repo)
Constructor.
std::unique_ptr< T, std::function< void(T *)> > UniqTritonT
void(* TRITONSERVER_InferenceResponseCompleteFn_t)(struct TRITONSERVER_InferenceResponse *response, const uint32_t flags, void *userp)
Type for callback function indicating that an inference response has completed.
float tfGpuMemoryFraction
TensorFlow GPU memory fraction per process.
int32_t controlMode
Triton model control mode.
std::function< void(SharedRequest, UniqResponse)> TritonInferAsyncDone
SharedOptions takeoverOptions()
Get and own the options list.
std::string value
Value of the setting.
@ TRITONSERVER_MODEL_CONTROL_EXPLICIT
TrtServerRequest(TrtServerPtr server)
Constructor.
std::shared_ptr< IOptions > SharedOptions
NvDsInferStatus init(const std::string &model, int64_t version, SharedBatchArray &inputs, const std::vector< std::string > &outputs, uint64_t reqId, const std::vector< TritonClassParams > &clasList)
Create a new Triton inference request with the specified inputs and parameters.
static void RequestOnRelease(TRITONSERVER_InferenceRequest *request, const uint32_t flags, void *userp)
The callback function to release the request instance.
const std::string & model() const
Get the model name parsed from the Triton response.
virtual ~TrtServerAllocator()=default
Destructor.
bool tfAllowSoftPlacement
Flag to enable/disable soft placement of TF operators.
std::function< SharedSysMem(const std::string &, size_t, InferMemType, int64_t)> AllocFn
~TrtServerRequest()
Destructor.
The backend configuration settings.
std::function< void(const std::string &, SharedSysMem)> FreeFn
bool isServerLive()
Check if the server is live.
std::string backendDirectory
The path to the Triton backends directory.
NvDsInferStatus parse(const TrtServerRequest *req)
Check for errors and parse the inference output.
NvDsInferStatus setResponseComplete(ShrTritonAllocator &allocator, TRITONSERVER_InferenceResponseCompleteFn_t responseCompleteCb, void *responseUserPtr)
Set the allocator and response callback for the request.
std::vector< BackendConfig > backendConfigs
Array of backend configuration settings.
bool operator!=(const RepoSettings &other) const
InferMemType
The memory types of inference buffers.
#define DISABLE_CLASS_COPY(NoCopyClass)
NvDsInferStatus loadModel(const std::string &modelName)
Load or reload the specified model.
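When the repository runs with explicit model control (TRITONSERVER_MODEL_CONTROL_EXPLICIT), load and unload calls are typically paired with the readiness check, roughly as below. A sketch only; server and the model name are assumptions.

  if (!server->isModelReady("my_model", 1)) {
      if (server->loadModel("my_model") != NVDSINFER_SUCCESS) {
          // load failed: bad repository path, bad model config, ...
      }
  }
  // ... run inference ...
  if (server->unloadModel("my_model") != NVDSINFER_SUCCESS) {
      // unload failed or the model was not loaded
  }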
bool operator==(const RepoSettings &other) const
Comparison operators.
void(* TRITONSERVER_InferenceRequestReleaseFn_t)(struct TRITONSERVER_InferenceRequest *request, const uint32_t flags, void *userp)
Type for inference request release callback function.
~TrtISServer()
Destructor.
const std::string & model() const
Get the model name.
TrtServerResponse(TrtServerPtr server, UniqTritonT< TRITONSERVER_InferenceResponse > data, uint64_t id)
Constructor.
Wrapper class for Triton inference request.
std::shared_ptr< TrtServerRequest > SharedRequest
std::shared_ptr< BaseBatchArray > SharedBatchArray
NvDsInferStatus unloadModel(const std::string &modelName)
Unload the specified model.
Triton Inference Server utilities header file.
NvDsInferStatus inferAsync(SharedRequest request, WeakTritonAllocator allocator, TritonInferAsyncDone done)
Submit a request for asynchronous inference.
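A sketch of the asynchronous path, assuming req comes from createRequest() and allocator is a ShrTritonAllocator (see the TrtServerAllocator sketch further below); the lambda matches the TritonInferAsyncDone signature.

  WeakTritonAllocator weakAlloc = allocator;     // pass a weak handle to the allocator
  NvDsInferStatus status = server->inferAsync(
      req, weakAlloc,
      [](SharedRequest request, UniqResponse response) {
          // Completion callback: ownership of the response wrapper is
          // transferred here; see TrtServerResponse for parsing/status APIs.
      });
  if (status != NVDSINFER_SUCCESS) {
      // the request was rejected before being queued
  }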
const triton::RepoSettings & getRepoSettings()
Get the model repository settings.
SharedBatchArray releaseInputs()
Release the ownership of the input batch buffer array.
TrtServerAllocator(AllocFn alloc, FreeFn release)
Constructor; creates an instance of the type TRITONSERVER_ResponseAllocator which calls provided alloc...
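The constructor takes callables matching the AllocFn and FreeFn typedefs shown above. A sketch with hypothetical pool helpers (myPoolAcquire/myPoolRelease are not part of the SDK):

  auto allocator = std::make_shared<TrtServerAllocator>(
      // AllocFn: provide SharedSysMem for a named output tensor.
      [](const std::string& tensor, size_t bytes, InferMemType memType,
         int64_t devId) -> SharedSysMem {
          return myPoolAcquire(tensor, bytes, memType, devId);   // hypothetical
      },
      // FreeFn: hand the memory of a named tensor back to the pool.
      [](const std::string& tensor, SharedSysMem mem) {
          myPoolRelease(tensor, std::move(mem));                 // hypothetical
      });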
NvDsInferStatus
Enum for the status codes returned by NvDsInferContext.
Holds the information about an inference buffer.
std::map< uint32_t, uint64_t > cudaDevMemMap
Map of device IDs to the corresponding size of the CUDA memory pool to be allocated.