Jetson Inference
DNN Vision Library
tensorNet.h
/*
 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#ifndef __TENSOR_NET_H__
#define __TENSOR_NET_H__

// forward declaration of IInt8Calibrator
namespace nvinfer1 { class IInt8Calibrator; }

// includes
#include <NvInfer.h>

#include <jetson-utils/cudaUtility.h>
#include <jetson-utils/timespec.h>

#include <vector>
#include <sstream>
#include <math.h>

#if NV_TENSORRT_MAJOR > 1
typedef nvinfer1::DimsCHW Dims3;

#define DIMS_C(x) x.d[0]
#define DIMS_H(x) x.d[1]
#define DIMS_W(x) x.d[2]

#else
typedef nvinfer1::Dims3 Dims3;

#define DIMS_C(x) x.c
#define DIMS_H(x) x.h
#define DIMS_W(x) x.w

#ifndef NV_TENSORRT_MAJOR
#define NV_TENSORRT_MAJOR 1
#define NV_TENSORRT_MINOR 0
#endif
#endif

/** Default maximum batch size. */
#define DEFAULT_MAX_BATCH_SIZE  1

/** Prefix used for tagging printed log output from TensorRT. */
#define LOG_TRT "[TRT] "

/**
 * Enumeration for indicating the desired precision that
 * the network should run in, if available in hardware.
 */
enum precisionType
{
	TYPE_DISABLED = 0,	/**< Unknown, unspecified, or disabled type */
	TYPE_FASTEST,		/**< The fastest detected precision should be used (i.e. INT8 or FP16, when natively supported) */
	TYPE_FP32,		/**< 32-bit floating-point precision (FP32) */
	TYPE_FP16,		/**< 16-bit floating-point half precision (FP16) */
	TYPE_INT8,		/**< 8-bit integer precision (INT8) */
	NUM_PRECISIONS		/**< Number of precision types defined */
};

/** Stringize function that returns precisionType in text. */
const char* precisionTypeToStr( precisionType type );

/** Parse the precision type from a string. */
precisionType precisionTypeFromStr( const char* str );

/**
 * Enumeration for indicating the desired device that
 * the network should run on, if available in hardware.
 */
enum deviceType
{
	DEVICE_GPU = 0,			/**< GPU (if multiple GPUs are present, a specific GPU can be selected with cudaSetDevice()) */
	DEVICE_DLA,				/**< Deep Learning Accelerator (DLA) Core 0 (only on Jetson Xavier) */
	DEVICE_DLA_0 = DEVICE_DLA,	/**< Deep Learning Accelerator (DLA) Core 0 (only on Jetson Xavier) */
	DEVICE_DLA_1,			/**< Deep Learning Accelerator (DLA) Core 1 (only on Jetson Xavier) */
	NUM_DEVICES				/**< Number of device types defined */
};

/** Stringize function that returns deviceType in text. */
const char* deviceTypeToStr( deviceType type );

/** Parse the device type from a string. */
deviceType deviceTypeFromStr( const char* str );

/**
 * Enumeration indicating the format of the model that's
 * imported in TensorRT (either caffe, ONNX, or UFF).
 */
enum modelType
{
	MODEL_CUSTOM = 0,	/**< Created directly with TensorRT API */
	MODEL_CAFFE,		/**< caffemodel */
	MODEL_ONNX,		/**< ONNX */
	MODEL_UFF			/**< UFF */
};

/** Stringize function that returns modelType in text. */
const char* modelTypeToStr( modelType type );

/** Parse the model format from a string. */
modelType modelTypeFromStr( const char* str );

/**
 * Profiling queries
 * @see tensorNet::GetProfilerTime()
 */
enum profilerQuery
{
	PROFILER_PREPROCESS = 0,
	PROFILER_NETWORK,
	PROFILER_POSTPROCESS,
	PROFILER_VISUALIZE,
	PROFILER_TOTAL,
};

/** Stringize function that returns profilerQuery in text. */
const char* profilerQueryToStr( profilerQuery query );

/** Profiler device */
enum profilerDevice
{
	PROFILER_CPU = 0,	/**< CPU walltime */
	PROFILER_CUDA,		/**< CUDA kernel time */
};
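
/*
 * Usage sketch (illustrative): parsing a user-supplied precision string and
 * falling back to the fastest native type. This assumes that unrecognized
 * strings map to TYPE_DISABLED, and that argv[1] holds a name like "fp16":
 *
 *    precisionType precision = precisionTypeFromStr(argv[1]);
 *
 *    if( precision == TYPE_DISABLED )
 *       precision = TYPE_FASTEST;    // resolved against the hardware at load time
 *
 *    printf(LOG_TRT "requested precision: %s\n", precisionTypeToStr(precision));
 */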

/**
 * Abstract class for loading a tensor network with TensorRT.
 */
class tensorNet
{
public:
	/** Destroy */
	virtual ~tensorNet();

	/**
	 * Load a new network instance that has a single output layer.
	 */
	bool LoadNetwork( const char* prototxt, const char* model, const char* mean=NULL,
				   const char* input_blob="data", const char* output_blob="prob",
				   uint32_t maxBatchSize=DEFAULT_MAX_BATCH_SIZE, precisionType precision=TYPE_FASTEST,
				   deviceType device=DEVICE_GPU, bool allowGPUFallback=true,
				   nvinfer1::IInt8Calibrator* calibrator=NULL, cudaStream_t stream=NULL );

	/**
	 * Load a new network instance with multiple output layers.
	 */
	bool LoadNetwork( const char* prototxt, const char* model, const char* mean,
				   const char* input_blob, const std::vector<std::string>& output_blobs,
				   uint32_t maxBatchSize=DEFAULT_MAX_BATCH_SIZE, precisionType precision=TYPE_FASTEST,
				   deviceType device=DEVICE_GPU, bool allowGPUFallback=true,
				   nvinfer1::IInt8Calibrator* calibrator=NULL, cudaStream_t stream=NULL );

	/**
	 * Load a new network instance with explicitly-specified input dimensions
	 * (this variant is used when the input shape must be given up front, e.g. for UFF models).
	 */
	bool LoadNetwork( const char* prototxt, const char* model, const char* mean,
				   const char* input_blob, const Dims3& input_dims,
				   const std::vector<std::string>& output_blobs,
				   uint32_t maxBatchSize=DEFAULT_MAX_BATCH_SIZE,
				   precisionType precision=TYPE_FASTEST,
				   deviceType device=DEVICE_GPU, bool allowGPUFallback=true,
				   nvinfer1::IInt8Calibrator* calibrator=NULL, cudaStream_t stream=NULL );

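	/*
	 * Usage sketch (illustrative; the subclass name and file paths below are
	 * hypothetical examples, not defined by this header):
	 *
	 *    myNet* net = new myNet();    // a tensorNet-derived class
	 *
	 *    if( !net->LoadNetwork("networks/deploy.prototxt",
	 *                          "networks/snapshot.caffemodel",
	 *                          NULL, "data", "prob") )
	 *       printf(LOG_TRT "failed to load the network\n");
	 */
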
	/** Manually enable layer profiling times. */
	void EnableLayerProfiler();

	/** Manually enable debug messages and synchronization. */
	void EnableDebug();

	/** Return true if GPU fallback is enabled. */
	inline bool AllowGPUFallback() const { return mAllowGPUFallback; }

	/** Retrieve the device being used for execution. */
	inline deviceType GetDevice() const { return mDevice; }

	/** Retrieve the type of precision being used. */
	inline precisionType GetPrecision() const { return mPrecision; }

	/** Check if a particular precision is being used. */
	inline bool IsPrecision( precisionType type ) const { return (mPrecision == type); }

	/** Determine the fastest native precision on a device. */
	static precisionType FindFastestPrecision( deviceType device=DEVICE_GPU, bool allowInt8=true );

	/** Detect the precisions supported natively on a device. */
	static std::vector<precisionType> DetectNativePrecisions( deviceType device=DEVICE_GPU );

	/** Detect if a particular precision is supported natively. */
	static bool DetectNativePrecision( const std::vector<precisionType>& nativeTypes, precisionType type );

	/** Detect if a particular precision is supported natively. */
	static bool DetectNativePrecision( precisionType precision, deviceType device=DEVICE_GPU );

	/** Retrieve the stream that the device is operating on. */
	inline cudaStream_t GetStream() const { return mStream; }

	/** Create and use a new stream for execution. */
	cudaStream_t CreateStream( bool nonBlocking=true );

	/** Set the stream that the device is operating on. */
	void SetStream( cudaStream_t stream );

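	/*
	 * Usage sketch (illustrative): running inference on a dedicated
	 * non-blocking CUDA stream rather than the default stream. This
	 * assumes CreateStream() also applies the new stream via SetStream():
	 *
	 *    cudaStream_t stream = net->CreateStream();
	 *    // ... process frames ...
	 *    CUDA(cudaStreamSynchronize(stream));
	 */
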
	/** Retrieve the path to the network prototxt file. */
	inline const char* GetPrototxtPath() const { return mPrototxtPath.c_str(); }

	/** Retrieve the path to the network model file. */
	inline const char* GetModelPath() const { return mModelPath.c_str(); }

	/** Retrieve the format of the network model. */
	inline modelType GetModelType() const { return mModelType; }

	/** Return true if the model is of the specified format. */
	inline bool IsModelType( modelType type ) const { return (mModelType == type); }

	/** Retrieve the network runtime (in milliseconds). */
	inline float GetNetworkTime() { return GetProfilerTime(PROFILER_NETWORK, PROFILER_CUDA); }

	/** Retrieve the profiler runtime (in milliseconds) for both CPU and CUDA. */
	inline float2 GetProfilerTime( profilerQuery query ) { PROFILER_QUERY(query); return mProfilerTimes[query]; }

	/** Retrieve the profiler runtime (in milliseconds) for the given device. */
	inline float GetProfilerTime( profilerQuery query, profilerDevice device ) { PROFILER_QUERY(query); return (device == PROFILER_CPU) ? mProfilerTimes[query].x : mProfilerTimes[query].y; }

	/** Print the profiler times (in milliseconds). */
	inline void PrintProfilerTimes()
	{
		printf("\n");
		printf(LOG_TRT "----------------------------------------------\n");
		printf(LOG_TRT "Timing Report %s\n", GetModelPath());
		printf(LOG_TRT "----------------------------------------------\n");

		for( uint32_t n=0; n <= PROFILER_TOTAL; n++ )
		{
			const profilerQuery query = (profilerQuery)n;

			if( PROFILER_QUERY(query) )
				printf(LOG_TRT "%-12s CPU %8.5fms CUDA %8.5fms\n", profilerQueryToStr(query), mProfilerTimes[n].x, mProfilerTimes[n].y);
		}

		printf(LOG_TRT "----------------------------------------------\n\n");

		static bool first_run=true;

		if( first_run )
		{
			printf(LOG_TRT "note -- when processing a single image, run 'sudo jetson_clocks' before\n"
				  "        to disable DVFS for more accurate profiling/timing measurements\n\n");

			first_run = false;
		}
	}

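	/*
	 * Usage sketch (illustrative; assumes 'net' points to a tensorNet-derived
	 * object that has already processed a frame):
	 *
	 *    const float2 t = net->GetProfilerTime(PROFILER_NETWORK);
	 *    printf("network: CPU %.2f ms, CUDA %.2f ms\n", t.x, t.y);
	 *
	 *    net->PrintProfilerTimes();    // full timing report across all queries
	 */
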
protected:

	/** Constructor. */
	tensorNet();

	/** Create and output an optimized network model. */
	bool ProfileModel( const std::string& deployFile, const std::string& modelFile,
				    const char* input, const Dims3& inputDims,
				    const std::vector<std::string>& outputs, uint32_t maxBatchSize,
				    precisionType precision, deviceType device, bool allowGPUFallback,
				    nvinfer1::IInt8Calibrator* calibrator, std::ostream& modelStream);

	/** Logger class for GIE info/warning/errors. */
	class Logger : public nvinfer1::ILogger
	{
		void log( Severity severity, const char* msg ) override
		{
			if( severity != Severity::kINFO /*|| mEnableDebug*/ )
				printf(LOG_TRT "%s\n", msg);
		}
	} gLogger;

	/** Profiler interface for measuring layer timings. */
	class Profiler : public nvinfer1::IProfiler
	{
	public:
		Profiler() : timingAccumulator(0.0f) { }

		virtual void reportLayerTime(const char* layerName, float ms)
		{
			printf(LOG_TRT "layer %s - %f ms\n", layerName, ms);
			timingAccumulator += ms;
		}

		float timingAccumulator;

	} gProfiler;

	/** Begin a profiling query, before the network is run. */
	inline void PROFILER_BEGIN( profilerQuery query )
	{
		const uint32_t evt  = query*2;
		const uint32_t flag = (1 << query);

		CUDA(cudaEventRecord(mEventsGPU[evt], mStream));
		timestamp(&mEventsCPU[evt]);

		mProfilerQueriesUsed |= flag;
		mProfilerQueriesDone &= ~flag;
	}

	/** End a profiling query, after the network is run. */
	inline void PROFILER_END( profilerQuery query )
	{
		const uint32_t evt = query*2+1;

		CUDA(cudaEventRecord(mEventsGPU[evt]));
		timestamp(&mEventsCPU[evt]);

		timespec cpuTime;
		timeDiff(mEventsCPU[evt-1], mEventsCPU[evt], &cpuTime);
		mProfilerTimes[query].x = timeFloat(cpuTime);

		if( mEnableProfiler && query == PROFILER_NETWORK )
		{
			printf(LOG_TRT "layer network time - %f ms\n", gProfiler.timingAccumulator);
			gProfiler.timingAccumulator = 0.0f;
			printf(LOG_TRT "note -- when processing a single image, run 'sudo jetson_clocks' before\n"
				  "        to disable DVFS for more accurate profiling/timing measurements\n");
		}
	}

	/** Query the CUDA part of a profiler query. */
	inline bool PROFILER_QUERY( profilerQuery query )
	{
		const uint32_t flag = (1 << query);

		if( query == PROFILER_TOTAL )
		{
			mProfilerTimes[PROFILER_TOTAL].x = 0.0f;
			mProfilerTimes[PROFILER_TOTAL].y = 0.0f;

			for( uint32_t n=0; n < PROFILER_TOTAL; n++ )
			{
				if( PROFILER_QUERY((profilerQuery)n) )
				{
					mProfilerTimes[PROFILER_TOTAL].x += mProfilerTimes[n].x;
					mProfilerTimes[PROFILER_TOTAL].y += mProfilerTimes[n].y;
				}
			}

			return true;
		}
		else if( mProfilerQueriesUsed & flag )
		{
			if( !(mProfilerQueriesDone & flag) )
			{
				const uint32_t evt = query*2;
				float cuda_time = 0.0f;
				CUDA(cudaEventElapsedTime(&cuda_time, mEventsGPU[evt], mEventsGPU[evt+1]));
				mProfilerTimes[query].y = cuda_time;
				mProfilerQueriesDone |= flag;
				//mProfilerQueriesUsed &= ~flag;
			}

			return true;
		}

		return false;
	}

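	/*
	 * Usage sketch for subclasses (illustrative; the enqueue step stands in
	 * for whatever the derived class actually runs):
	 *
	 *    PROFILER_BEGIN(PROFILER_NETWORK);
	 *    // ... execute inference with mContext on mStream ...
	 *    PROFILER_END(PROFILER_NETWORK);
	 *
	 *    // later, PROFILER_QUERY(PROFILER_NETWORK) resolves the CUDA event
	 *    // pair into mProfilerTimes[PROFILER_NETWORK].y
	 */
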
protected:

	/* Member Variables */
	std::string mPrototxtPath;
	std::string mModelPath;
	std::string mMeanPath;
	std::string mInputBlobName;
	std::string mCacheEnginePath;
	std::string mCacheCalibrationPath;

	deviceType    mDevice;
	precisionType mPrecision;
	modelType     mModelType;
	cudaStream_t  mStream;
	cudaEvent_t   mEventsGPU[PROFILER_TOTAL * 2];
	timespec      mEventsCPU[PROFILER_TOTAL * 2];

	nvinfer1::IRuntime* mInfer;
	nvinfer1::ICudaEngine* mEngine;
	nvinfer1::IExecutionContext* mContext;

	uint32_t mWidth;
	uint32_t mHeight;
	uint32_t mInputSize;
	float*   mInputCPU;
	float*   mInputCUDA;

	float2   mProfilerTimes[PROFILER_TOTAL + 1];
	uint32_t mProfilerQueriesUsed;
	uint32_t mProfilerQueriesDone;

	uint32_t mMaxBatchSize;
	bool     mEnableProfiler;
	bool     mEnableDebug;
	bool     mAllowGPUFallback;

	Dims3 mInputDims;

	struct outputLayer
	{
		std::string name;
		Dims3 dims;
		uint32_t size;
		float* CPU;
		float* CUDA;
	};

	std::vector<outputLayer> mOutputs;
};

#endif
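
Putting it together: tensorNet is abstract with a protected constructor, so applications derive from it (as the library's network classes like imageNet do) and call LoadNetwork() from a factory function. A minimal sketch follows; the class name customNet and the file/layer names are illustrative assumptions, not defined by this header:

#include "tensorNet.h"

class customNet : public tensorNet
{
public:
	static customNet* Create()
	{
		customNet* net = new customNet();

		// build or deserialize the TensorRT engine
		// (paths and blob names are example values)
		if( !net->LoadNetwork("networks/deploy.prototxt",
						  "networks/snapshot.caffemodel",
						  NULL, "data", "prob") )
		{
			delete net;
			return NULL;
		}

		return net;
	}

protected:
	customNet() { }
};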