Jetson Inference
DNN Vision Library
tensorNet.h
/*
 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#ifndef __TENSOR_NET_H__
#define __TENSOR_NET_H__

// forward declaration of IInt8Calibrator
namespace nvinfer1 { class IInt8Calibrator; }

// includes
#include <NvInfer.h>

#include <jetson-utils/cudaUtility.h>
#include <jetson-utils/commandLine.h>
#include <jetson-utils/imageFormat.h>
#include <jetson-utils/timespec.h>
#include <jetson-utils/logging.h>

#include <vector>
#include <sstream>
#include <math.h>

#if NV_TENSORRT_MAJOR >= 6
typedef nvinfer1::Dims3 Dims3;

#define DIMS_C(x) x.d[0]
#define DIMS_H(x) x.d[1]
#define DIMS_W(x) x.d[2]

#elif NV_TENSORRT_MAJOR >= 2
typedef nvinfer1::DimsCHW Dims3;

#define DIMS_C(x) x.d[0]
#define DIMS_H(x) x.d[1]
#define DIMS_W(x) x.d[2]

#else
typedef nvinfer1::Dims3 Dims3;

#define DIMS_C(x) x.c
#define DIMS_H(x) x.h
#define DIMS_W(x) x.w

#ifndef NV_TENSORRT_MAJOR
#define NV_TENSORRT_MAJOR 1
#define NV_TENSORRT_MINOR 0
#endif
#endif

#if NV_TENSORRT_MAJOR >= 8
#define NOEXCEPT noexcept
#else
#define NOEXCEPT
#endif


/**
 * Check that the installed TensorRT version is at least (major, minor, patch).
 */
#define TENSORRT_VERSION_CHECK(major, minor, patch) (NV_TENSORRT_MAJOR > major || (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR > minor) || (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && NV_TENSORRT_PATCH >= patch))

/**
 * Default maximum batch size.
 */
#define DEFAULT_MAX_BATCH_SIZE 1

/**
 * Prefix used for tagging printed log output from TensorRT.
 */
#define LOG_TRT "[TRT] "

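// For instance, code that needs a TensorRT 8.5+ API could guard itself with the
// version-check macro above -- an illustrative usage, not part of the original header:
//
//    #if TENSORRT_VERSION_CHECK(8, 5, 0)
//        // ... use features introduced in TensorRT >= 8.5 ...
//    #endif
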
/**
 * Enumeration for indicating the desired precision that
 * the network should run in, if available in hardware.
 */
enum precisionType
{
    TYPE_DISABLED = 0,  /**< Unknown, unspecified, or disabled type */
    TYPE_FASTEST,       /**< The fastest detected precision should be used (i.e. the default behavior) */
    TYPE_FP32,          /**< 32-bit floating-point precision (FP32) */
    TYPE_FP16,          /**< 16-bit floating-point half precision (FP16) */
    TYPE_INT8,          /**< 8-bit integer precision (INT8) */
    NUM_PRECISIONS      /**< Number of precision types defined */
};

/**
 * Stringize function that returns precisionType in text.
 */
const char* precisionTypeToStr( precisionType type );

/**
 * Parse the precision type from a string.
 */
precisionType precisionTypeFromStr( const char* str );

/**
 * Enumeration for indicating the desired device that
 * the network should run on, if available in hardware.
 */
enum deviceType
{
    DEVICE_GPU = 0,             /**< GPU (if multiple GPUs are present, a specific GPU can be selected with cudaSetDevice()) */
    DEVICE_DLA,                 /**< Deep Learning Accelerator (DLA) Core 0 (only on Jetson Xavier) */
    DEVICE_DLA_0 = DEVICE_DLA,  /**< Deep Learning Accelerator (DLA) Core 0 (only on Jetson Xavier) */
    DEVICE_DLA_1,               /**< Deep Learning Accelerator (DLA) Core 1 (only on Jetson Xavier) */
    NUM_DEVICES                 /**< Number of device types defined */
};

/**
 * Stringize function that returns deviceType in text.
 */
const char* deviceTypeToStr( deviceType type );

/**
 * Parse the device type from a string.
 */
deviceType deviceTypeFromStr( const char* str );

/**
 * Enumeration indicating the format of the model that's
 * imported in TensorRT (either caffe, ONNX, or UFF).
 */
enum modelType
{
    MODEL_CUSTOM = 0,  /**< Created directly with TensorRT API */
    MODEL_CAFFE,       /**< caffemodel */
    MODEL_ONNX,        /**< ONNX */
    MODEL_UFF,         /**< UFF */
    MODEL_ENGINE       /**< TensorRT engine/plan */
};

/**
 * Stringize function that returns modelType in text.
 */
const char* modelTypeToStr( modelType type );

/**
 * Parse the model format from a string.
 */
modelType modelTypeFromStr( const char* str );

/**
 * Parse the model format from a file path.
 */
modelType modelTypeFromPath( const char* path );

/**
 * Profiling queries.
 */
enum profilerQuery
{
    PROFILER_PREPROCESS = 0,
    PROFILER_NETWORK,
    PROFILER_POSTPROCESS,
    PROFILER_VISUALIZE,
    PROFILER_TOTAL,
};

/**
 * Stringize function that returns profilerQuery in text.
 */
const char* profilerQueryToStr( profilerQuery query );

/**
 * Profiler device.
 */
enum profilerDevice
{
    PROFILER_CPU = 0,  /**< CPU walltime */
    PROFILER_CUDA,     /**< CUDA kernel time */
};

/**
 * Abstract class for loading a tensor network with TensorRT.
 */
class tensorNet
{
public:
    /**
     * Destroy.
     */
    virtual ~tensorNet();

    /**
     * Load a new network instance.
     */
    bool LoadNetwork( const char* prototxt, const char* model, const char* mean=NULL,
                      const char* input_blob="data", const char* output_blob="prob",
                      uint32_t maxBatchSize=DEFAULT_MAX_BATCH_SIZE, precisionType precision=TYPE_FASTEST,
                      deviceType device=DEVICE_GPU, bool allowGPUFallback=true,
                      nvinfer1::IInt8Calibrator* calibrator=NULL, cudaStream_t stream=NULL );

    /**
     * Load a new network instance with multiple output layers.
     */
    bool LoadNetwork( const char* prototxt, const char* model, const char* mean,
                      const char* input_blob, const std::vector<std::string>& output_blobs,
                      uint32_t maxBatchSize=DEFAULT_MAX_BATCH_SIZE, precisionType precision=TYPE_FASTEST,
                      deviceType device=DEVICE_GPU, bool allowGPUFallback=true,
                      nvinfer1::IInt8Calibrator* calibrator=NULL, cudaStream_t stream=NULL );

    /**
     * Load a new network instance with multiple input and output layers.
     */
    bool LoadNetwork( const char* prototxt, const char* model, const char* mean,
                      const std::vector<std::string>& input_blobs,
                      const std::vector<std::string>& output_blobs,
                      uint32_t maxBatchSize=DEFAULT_MAX_BATCH_SIZE,
                      precisionType precision=TYPE_FASTEST,
                      deviceType device=DEVICE_GPU, bool allowGPUFallback=true,
                      nvinfer1::IInt8Calibrator* calibrator=NULL, cudaStream_t stream=NULL );

    /**
     * Load a new network instance, with explicit input layer dimensions.
     */
    bool LoadNetwork( const char* prototxt, const char* model, const char* mean,
                      const char* input_blob, const Dims3& input_dims,
                      const std::vector<std::string>& output_blobs,
                      uint32_t maxBatchSize=DEFAULT_MAX_BATCH_SIZE,
                      precisionType precision=TYPE_FASTEST,
                      deviceType device=DEVICE_GPU, bool allowGPUFallback=true,
                      nvinfer1::IInt8Calibrator* calibrator=NULL, cudaStream_t stream=NULL );

    /**
     * Load a new network instance, with multiple input layers of explicit dimensions.
     */
    bool LoadNetwork( const char* prototxt, const char* model, const char* mean,
                      const std::vector<std::string>& input_blobs,
                      const std::vector<Dims3>& input_dims,
                      const std::vector<std::string>& output_blobs,
                      uint32_t maxBatchSize=DEFAULT_MAX_BATCH_SIZE,
                      precisionType precision=TYPE_FASTEST,
                      deviceType device=DEVICE_GPU, bool allowGPUFallback=true,
                      nvinfer1::IInt8Calibrator* calibrator=NULL, cudaStream_t stream=NULL );

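    // As an illustrative sketch (not from the original header), a derived class
    // loading a single-input/single-output ONNX model could use the first overload,
    // passing NULL for the caffe-specific prototxt/mean arguments -- the filename
    // and blob names here are hypothetical:
    //
    //    LoadNetwork(NULL, "resnet18.onnx", NULL, "input_0", "output_0");
    //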
    /**
     * Load a network instance from a serialized engine plan file.
     */
    bool LoadEngine( const char* engine_filename,
                     const std::vector<std::string>& input_blobs,
                     const std::vector<std::string>& output_blobs,
                     nvinfer1::IPluginFactory* pluginFactory=NULL,
                     deviceType device=DEVICE_GPU,
                     cudaStream_t stream=NULL );

    /**
     * Load a network instance from a serialized engine plan held in memory.
     */
    bool LoadEngine( char* engine_stream, size_t engine_size,
                     const std::vector<std::string>& input_blobs,
                     const std::vector<std::string>& output_blobs,
                     nvinfer1::IPluginFactory* pluginFactory=NULL,
                     deviceType device=DEVICE_GPU,
                     cudaStream_t stream=NULL );

    /**
     * Load a network instance from an already-created TensorRT engine.
     */
    bool LoadEngine( nvinfer1::ICudaEngine* engine,
                     const std::vector<std::string>& input_blobs,
                     const std::vector<std::string>& output_blobs,
                     deviceType device=DEVICE_GPU,
                     cudaStream_t stream=NULL );

    /**
     * Load a serialized engine plan file into memory.
     */
    bool LoadEngine( const char* filename, char** stream, size_t* size );

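    // Similarly, a prebuilt TensorRT plan could be deserialized directly -- a sketch,
    // with a hypothetical engine filename and binding names:
    //
    //    std::vector<std::string> inputs  = { "input_0" };
    //    std::vector<std::string> outputs = { "output_0" };
    //
    //    LoadEngine("resnet18.engine", inputs, outputs);
    //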
    /**
     * Load class descriptions from a label file.
     */
    static bool LoadClassLabels( const char* filename, std::vector<std::string>& descriptions, int expectedClasses=-1 );

    /**
     * Load class descriptions and synset strings from a label file.
     */
    static bool LoadClassLabels( const char* filename, std::vector<std::string>& descriptions, std::vector<std::string>& synsets, int expectedClasses=-1 );

    /**
     * Load class colors from a text file.
     */
    static bool LoadClassColors( const char* filename, float4* colors, int expectedClasses, float defaultAlpha=255.0f );

    /**
     * Load class colors from a text file, allocating the output array.
     */
    static bool LoadClassColors( const char* filename, float4** colors, int expectedClasses, float defaultAlpha=255.0f );

    /**
     * Procedurally generate a color for a given class index with the specified alpha value.
     */
    static float4 GenerateColor( uint32_t classID, float alpha=255.0f );

    /**
     * Manually enable layer profiling times.
     */
    void EnableLayerProfiler();

    /**
     * Manually enable debug messages and synchronization.
     */
    void EnableDebug();

    /**
     * Return true if GPU fallback is enabled.
     */
    inline bool AllowGPUFallback() const  { return mAllowGPUFallback; }

    /**
     * Retrieve the device being used for execution.
     */
    inline deviceType GetDevice() const  { return mDevice; }

    /**
     * Retrieve the type of precision being used.
     */
    inline precisionType GetPrecision() const  { return mPrecision; }

    /**
     * Check if a particular precision is being used.
     */
    inline bool IsPrecision( precisionType type ) const  { return (mPrecision == type); }

    /**
     * Resolve a desired precision to a specific one that's available.
     */
    static precisionType SelectPrecision( precisionType precision, deviceType device=DEVICE_GPU, bool allowInt8=true );

    /**
     * Determine the fastest native precision on a device.
     */
    static precisionType FindFastestPrecision( deviceType device=DEVICE_GPU, bool allowInt8=true );

    /**
     * Detect the precisions supported natively on a device.
     */
    static std::vector<precisionType> DetectNativePrecisions( deviceType device=DEVICE_GPU );

    /**
     * Detect if a particular precision is supported natively.
     */
    static bool DetectNativePrecision( const std::vector<precisionType>& nativeTypes, precisionType type );

    /**
     * Detect if a particular precision is supported natively.
     */
    static bool DetectNativePrecision( precisionType precision, deviceType device=DEVICE_GPU );

    /**
     * Retrieve the stream that the device is operating on.
     */
    inline cudaStream_t GetStream() const  { return mStream; }

    /**
     * Create and use a new stream for execution.
     */
    cudaStream_t CreateStream( bool nonBlocking=true );

    /**
     * Set the stream that the device is operating on.
     */
    void SetStream( cudaStream_t stream );

    /**
     * Retrieve the path to the network prototxt file.
     */
    inline const char* GetPrototxtPath() const  { return mPrototxtPath.c_str(); }

    /**
     * Retrieve the full path to the model file, including the filename.
     */
    inline const char* GetModelPath() const  { return mModelPath.c_str(); }

    /**
     * Retrieve the filename of the model file, excluding the directory.
     */
    inline const char* GetModelFilename() const  { return mModelFile.c_str(); }

    /**
     * Retrieve the format of the network model.
     */
    inline modelType GetModelType() const  { return mModelType; }

    /**
     * Return true if the model is of the specified format.
     */
    inline bool IsModelType( modelType type ) const  { return (mModelType == type); }

    /**
     * Retrieve the number of input layers to the network.
     */
    inline uint32_t GetInputLayers() const  { return mInputs.size(); }

    /**
     * Retrieve the number of output layers to the network.
     */
    inline uint32_t GetOutputLayers() const  { return mOutputs.size(); }

    /**
     * Retrieve the dimensions of network input layer.
     */
    inline Dims3 GetInputDims( uint32_t layer=0 ) const  { return mInputs[layer].dims; }

    /**
     * Retrieve the width of network input layer.
     */
    inline uint32_t GetInputWidth( uint32_t layer=0 ) const  { return DIMS_W(mInputs[layer].dims); }

    /**
     * Retrieve the height of network input layer.
     */
    inline uint32_t GetInputHeight( uint32_t layer=0 ) const  { return DIMS_H(mInputs[layer].dims); }

    /**
     * Retrieve the size (in bytes) of network input layer.
     */
    inline uint32_t GetInputSize( uint32_t layer=0 ) const  { return mInputs[layer].size; }

    /**
     * Get the CUDA pointer to the input layer's memory.
     */
    inline float* GetInputPtr( uint32_t layer=0 ) const  { return mInputs[layer].CUDA; }

    /**
     * Retrieve the dimensions of network output layer.
     */
    inline Dims3 GetOutputDims( uint32_t layer=0 ) const  { return mOutputs[layer].dims; }

    /**
     * Retrieve the width of network output layer.
     */
    inline uint32_t GetOutputWidth( uint32_t layer=0 ) const  { return DIMS_W(mOutputs[layer].dims); }

    /**
     * Retrieve the height of network output layer.
     */
    inline uint32_t GetOutputHeight( uint32_t layer=0 ) const  { return DIMS_H(mOutputs[layer].dims); }

    /**
     * Retrieve the size (in bytes) of network output layer.
     */
    inline uint32_t GetOutputSize( uint32_t layer=0 ) const  { return mOutputs[layer].size; }

    /**
     * Get the CUDA pointer to the output memory.
     */
    inline float* GetOutputPtr( uint32_t layer=0 ) const  { return mOutputs[layer].CUDA; }

    /**
     * Retrieve the network frames per second (FPS).
     */
    inline float GetNetworkFPS()  { return 1000.0f / GetNetworkTime(); }

    /**
     * Retrieve the network runtime (in milliseconds).
     */
    inline float GetNetworkTime()  { return GetProfilerTime(PROFILER_NETWORK, PROFILER_CUDA); }

    /**
     * Retrieve the network name (its filename).
     */
    inline const char* GetNetworkName() const  { return mModelFile.c_str(); }

    /**
     * Retrieve the profiler runtime (in milliseconds).
     */
    inline float2 GetProfilerTime( profilerQuery query )  { PROFILER_QUERY(query); return mProfilerTimes[query]; }

    /**
     * Retrieve the profiler runtime (in milliseconds).
     */
    inline float GetProfilerTime( profilerQuery query, profilerDevice device )  { PROFILER_QUERY(query); return (device == PROFILER_CPU) ? mProfilerTimes[query].x : mProfilerTimes[query].y; }

    /**
     * Print the profiler times (in milliseconds).
     */
    inline void PrintProfilerTimes()
    {
        LogInfo("\n");
        LogInfo(LOG_TRT "------------------------------------------------\n");
        LogInfo(LOG_TRT "Timing Report %s\n", GetModelPath());
        LogInfo(LOG_TRT "------------------------------------------------\n");

        for( uint32_t n=0; n <= PROFILER_TOTAL; n++ )
        {
            const profilerQuery query = (profilerQuery)n;

            if( PROFILER_QUERY(query) )
                LogInfo(LOG_TRT "%-12s CPU %9.5fms CUDA %9.5fms\n", profilerQueryToStr(query), mProfilerTimes[n].x, mProfilerTimes[n].y);
        }

        LogInfo(LOG_TRT "------------------------------------------------\n\n");

        static bool first_run = true;

        if( first_run )
        {
            LogWarning(LOG_TRT "note -- when processing a single image, run 'sudo jetson_clocks' before\n"
                       "          to disable DVFS for more accurate profiling/timing measurements\n\n");

            first_run = false;
        }
    }

protected:

    /**
     * Constructor.
     */
    tensorNet();

    /**
     * Execute processing of the network.
     */
    bool ProcessNetwork( bool sync=true );

    /**
     * Create and output an optimized network model.
     */
    bool ProfileModel( const std::string& deployFile, const std::string& modelFile,
                       const std::vector<std::string>& inputs, const std::vector<Dims3>& inputDims,
                       const std::vector<std::string>& outputs, uint32_t maxBatchSize,
                       precisionType precision, deviceType device, bool allowGPUFallback,
                       nvinfer1::IInt8Calibrator* calibrator, char** engineStream, size_t* engineSize );

    /**
     * Configure builder options.
     */
#if NV_TENSORRT_MAJOR >= 8
    bool ConfigureBuilder( nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                           uint32_t maxBatchSize, uint32_t workspaceSize, precisionType precision,
                           deviceType device, bool allowGPUFallback,
                           nvinfer1::IInt8Calibrator* calibrator );
#else
    bool ConfigureBuilder( nvinfer1::IBuilder* builder, uint32_t maxBatchSize,
                           uint32_t workspaceSize, precisionType precision,
                           deviceType device, bool allowGPUFallback,
                           nvinfer1::IInt8Calibrator* calibrator );
#endif

    /**
     * Validate that the model already has a built TensorRT engine that exists and doesn't need updating.
     */
    bool ValidateEngine( const char* model_path, const char* cache_path, const char* checksum_path );

    /**
     * Logger class for GIE info/warning/errors.
     */
    class Logger : public nvinfer1::ILogger
    {
    public:
        void log( Severity severity, const char* msg ) NOEXCEPT override
        {
            if( severity == Severity::kWARNING )
            {
                LogWarning(LOG_TRT "%s\n", msg);
            }
            else if( severity == Severity::kINFO )
            {
                LogInfo(LOG_TRT "%s\n", msg);
            }
        #if NV_TENSORRT_MAJOR >= 6
            else if( severity == Severity::kVERBOSE )
            {
                LogVerbose(LOG_TRT "%s\n", msg);
            }
        #endif
            else
            {
                LogError(LOG_TRT "%s\n", msg);
            }
        }
    } static gLogger;

    /**
     * Profiler interface for measuring layer timings.
     */
    class Profiler : public nvinfer1::IProfiler
    {
    public:
        Profiler() : timingAccumulator(0.0f)  { }

        virtual void reportLayerTime( const char* layerName, float ms ) NOEXCEPT
        {
            LogVerbose(LOG_TRT "layer %s - %f ms\n", layerName, ms);
            timingAccumulator += ms;
        }

        float timingAccumulator;
    } gProfiler;

    /**
     * Begin a profiling query, before the network is run.
     */
    inline void PROFILER_BEGIN( profilerQuery query )
    {
        const uint32_t evt  = query*2;
        const uint32_t flag = (1 << query);

        CUDA(cudaEventRecord(mEventsGPU[evt], mStream));
        timestamp(&mEventsCPU[evt]);

        mProfilerQueriesUsed |= flag;
        mProfilerQueriesDone &= ~flag;
    }

    /**
     * End a profiling query, after the network is run.
     */
    inline void PROFILER_END( profilerQuery query )
    {
        const uint32_t evt = query*2+1;

        CUDA(cudaEventRecord(mEventsGPU[evt]));
        timestamp(&mEventsCPU[evt]);

        timespec cpuTime;
        timeDiff(mEventsCPU[evt-1], mEventsCPU[evt], &cpuTime);
        mProfilerTimes[query].x = timeFloat(cpuTime);

        if( mEnableProfiler && query == PROFILER_NETWORK )
        {
            LogVerbose(LOG_TRT "layer network time - %f ms\n", gProfiler.timingAccumulator);
            gProfiler.timingAccumulator = 0.0f;

            LogWarning(LOG_TRT "note -- when processing a single image, run 'sudo jetson_clocks' before\n"
                       "          to disable DVFS for more accurate profiling/timing measurements\n");
        }
    }

    /**
     * Query the CUDA part of a profiler query.
     */
    inline bool PROFILER_QUERY( profilerQuery query )
    {
        const uint32_t flag = (1 << query);

        if( query == PROFILER_TOTAL )
        {
            mProfilerTimes[PROFILER_TOTAL].x = 0.0f;
            mProfilerTimes[PROFILER_TOTAL].y = 0.0f;

            for( uint32_t n=0; n < PROFILER_TOTAL; n++ )
            {
                if( PROFILER_QUERY((profilerQuery)n) )
                {
                    mProfilerTimes[PROFILER_TOTAL].x += mProfilerTimes[n].x;
                    mProfilerTimes[PROFILER_TOTAL].y += mProfilerTimes[n].y;
                }
            }

            return true;
        }
        else if( mProfilerQueriesUsed & flag )
        {
            if( !(mProfilerQueriesDone & flag) )
            {
                const uint32_t evt = query*2;
                float cuda_time = 0.0f;

                CUDA(cudaEventElapsedTime(&cuda_time, mEventsGPU[evt], mEventsGPU[evt+1]));

                mProfilerTimes[query].y = cuda_time;
                mProfilerQueriesDone |= flag;
                //mProfilerQueriesUsed &= ~flag;
            }

            return true;
        }

        return false;
    }

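    // Taken together, a derived class typically brackets each processing phase
    // with these helpers -- a minimal sketch of the pattern (illustrative, not
    // part of the original header):
    //
    //    PROFILER_BEGIN(PROFILER_NETWORK);
    //
    //    if( !ProcessNetwork() )     // enqueue the TensorRT execution
    //        return false;
    //
    //    PROFILER_END(PROFILER_NETWORK);
    //
    //    PrintProfilerTimes();       // PROFILER_QUERY() then resolves the CUDA event timings
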
protected:

    /* Member Variables */
    std::string mPrototxtPath;
    std::string mModelPath;
    std::string mModelFile;
    std::string mMeanPath;
    std::string mCacheEnginePath;
    std::string mCacheCalibrationPath;
    std::string mChecksumPath;

    deviceType    mDevice;
    precisionType mPrecision;
    modelType     mModelType;

    cudaStream_t mStream;
    cudaEvent_t  mEventsGPU[PROFILER_TOTAL * 2];
    timespec     mEventsCPU[PROFILER_TOTAL * 2];

    nvinfer1::IRuntime* mInfer;
    nvinfer1::ICudaEngine* mEngine;
    nvinfer1::IExecutionContext* mContext;

    float2   mProfilerTimes[PROFILER_TOTAL + 1];
    uint32_t mProfilerQueriesUsed;
    uint32_t mProfilerQueriesDone;

    uint32_t mWorkspaceSize;
    uint32_t mMaxBatchSize;

    bool mEnableProfiler;
    bool mEnableDebug;
    bool mAllowGPUFallback;

    void** mBindings;

    struct layerInfo
    {
        std::string name;
        Dims3    dims;
        uint32_t size;
        uint32_t binding;
        float*   CPU;
        float*   CUDA;
    };

    std::vector<layerInfo> mInputs;
    std::vector<layerInfo> mOutputs;
};

#endif
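
Usage sketch (not part of the header): tensorNet's constructor is protected, so applications normally use a derived class such as imageNet or detectNet. The minimal subclass below, with a hypothetical "model.onnx" and blob names, illustrates the load/infer flow the API above implies; it assumes the input/output bindings are mapped zero-copy memory (as jetson-utils allocates on Jetson), so the host can write through GetInputPtr() directly.

    #include "tensorNet.h"
    #include <string.h>

    // minimal (hypothetical) subclass -- tensorNet is abstract with a protected
    // constructor, and ProcessNetwork() is protected, so expose a public wrapper
    class myNet : public tensorNet
    {
    public:
        myNet()        { }
        bool Process() { return ProcessNetwork(); }  // synchronous inference
    };

    int main()
    {
        myNet net;

        // build or load the TensorRT engine (prototxt/mean are NULL for ONNX)
        if( !net.LoadNetwork(NULL, "model.onnx", NULL, "input_0", "output_0") )
            return 1;

        // fill the input binding with pre-processed data -- with mapped
        // zero-copy memory, host writes via this pointer are visible to CUDA
        float* input = net.GetInputPtr();
        memset(input, 0, net.GetInputSize());  // dummy all-zero input for illustration

        if( !net.Process() )  // run inference
            return 1;

        // read back the results and the network's CUDA runtime
        const float* output = net.GetOutputPtr();
        LogInfo("output[0]=%f  (network time %.2f ms)\n", output[0], net.GetNetworkTime());

        return 0;
    }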