Jetson Inference
DNN Vision Library
tensorNet.h
/*
 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#ifndef __TENSOR_NET_H__
#define __TENSOR_NET_H__

// forward declaration of IInt8Calibrator
namespace nvinfer1 { class IInt8Calibrator; }

// includes
#include <NvInfer.h>

#include <jetson-utils/cudaUtility.h>
#include <jetson-utils/timespec.h>

#include <vector>
#include <sstream>
#include <math.h>

#if NV_TENSORRT_MAJOR > 1
typedef nvinfer1::DimsCHW Dims3;

#define DIMS_C(x) x.d[0]
#define DIMS_H(x) x.d[1]
#define DIMS_W(x) x.d[2]

#else
typedef nvinfer1::Dims3 Dims3;

#define DIMS_C(x) x.c
#define DIMS_H(x) x.h
#define DIMS_W(x) x.w

#ifndef NV_TENSORRT_MAJOR
#define NV_TENSORRT_MAJOR 1
#define NV_TENSORRT_MINOR 0
#endif
#endif

/** Default maximum batch size. */
#define DEFAULT_MAX_BATCH_SIZE  1

/** Prefix used for tagging printed log output from TensorRT. */
#define LOG_TRT "[TRT] "

/**
 * Enumeration for indicating the desired precision that
 * the network should run in, if available in hardware.
 */
enum precisionType
{
	TYPE_DISABLED = 0,	/**< Unknown, unspecified, or disabled type */
	TYPE_FASTEST,		/**< The fastest detected precision should be used (i.e. INT8 or FP16, when natively supported) */
	TYPE_FP32,		/**< 32-bit floating-point precision (FP32) */
	TYPE_FP16,		/**< 16-bit floating-point half precision (FP16) */
	TYPE_INT8,		/**< 8-bit integer precision (INT8) */
	NUM_PRECISIONS		/**< Number of precision types defined */
};

/** Stringize function that returns precisionType in text. */
const char* precisionTypeToStr( precisionType type );

/** Parse the precision type from a string. */
precisionType precisionTypeFromStr( const char* str );

/**
 * Enumeration for indicating the desired device that
 * the network should run on, if available in hardware.
 */
enum deviceType
{
	DEVICE_GPU = 0,			/**< GPU (if multiple GPUs are present, a specific GPU can be selected with cudaSetDevice()) */
	DEVICE_DLA,				/**< Deep Learning Accelerator (DLA) Core 0 (only on Jetson Xavier) */
	DEVICE_DLA_0 = DEVICE_DLA,	/**< Deep Learning Accelerator (DLA) Core 0 (only on Jetson Xavier) */
	DEVICE_DLA_1,			/**< Deep Learning Accelerator (DLA) Core 1 (only on Jetson Xavier) */
	NUM_DEVICES				/**< Number of device types defined */
};

/** Stringize function that returns deviceType in text. */
const char* deviceTypeToStr( deviceType type );

/** Parse the device type from a string. */
deviceType deviceTypeFromStr( const char* str );

/**
 * Enumeration indicating the format of the model that's
 * imported in TensorRT (either caffe, ONNX, or UFF).
 */
enum modelType
{
	MODEL_CUSTOM = 0,	/**< Created directly with TensorRT API */
	MODEL_CAFFE,		/**< caffemodel */
	MODEL_ONNX,		/**< ONNX */
	MODEL_UFF			/**< UFF */
};

/** Stringize function that returns modelType in text. */
const char* modelTypeToStr( modelType type );

/** Parse the model format from a string. */
modelType modelTypeFromStr( const char* str );

/**
 * Profiling queries
 * @see tensorNet::GetProfilerTime()
 */
enum profilerQuery
{
	PROFILER_PREPROCESS = 0,
	PROFILER_NETWORK,
	PROFILER_POSTPROCESS,
	PROFILER_VISUALIZE,
	PROFILER_TOTAL,
};

/** Stringize function that returns profilerQuery in text. */
const char* profilerQueryToStr( profilerQuery query );

/** Profiler device */
enum profilerDevice
{
	PROFILER_CPU = 0,	/**< CPU walltime */
	PROFILER_CUDA,		/**< CUDA kernel time */
};
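
/*
 * Usage sketch (illustrative): parsing a user-supplied precision string and
 * falling back to the fastest native type. This assumes that unrecognized
 * strings map to TYPE_DISABLED, and that argv[1] holds a name like "fp16":
 *
 *    precisionType precision = precisionTypeFromStr(argv[1]);
 *
 *    if( precision == TYPE_DISABLED )
 *       precision = TYPE_FASTEST;    // resolved against the hardware at load time
 *
 *    printf(LOG_TRT "requested precision: %s\n", precisionTypeToStr(precision));
 */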

/**
 * Abstract class for loading a tensor network with TensorRT.
 */
class tensorNet
{
public:
	/** Destroy */
	virtual ~tensorNet();

	/**
	 * Load a new network instance that has a single output layer.
	 */
	bool LoadNetwork( const char* prototxt, const char* model, const char* mean=NULL,
				   const char* input_blob="data", const char* output_blob="prob",
				   uint32_t maxBatchSize=DEFAULT_MAX_BATCH_SIZE, precisionType precision=TYPE_FASTEST,
				   deviceType device=DEVICE_GPU, bool allowGPUFallback=true,
				   nvinfer1::IInt8Calibrator* calibrator=NULL, cudaStream_t stream=NULL );

	/**
	 * Load a new network instance with multiple output layers.
	 */
	bool LoadNetwork( const char* prototxt, const char* model, const char* mean,
				   const char* input_blob, const std::vector<std::string>& output_blobs,
				   uint32_t maxBatchSize=DEFAULT_MAX_BATCH_SIZE, precisionType precision=TYPE_FASTEST,
				   deviceType device=DEVICE_GPU, bool allowGPUFallback=true,
				   nvinfer1::IInt8Calibrator* calibrator=NULL, cudaStream_t stream=NULL );

	/**
	 * Load a new network instance with explicitly-specified input dimensions
	 * (this variant is used when the input shape must be given up front, e.g. for UFF models).
	 */
	bool LoadNetwork( const char* prototxt, const char* model, const char* mean,
				   const char* input_blob, const Dims3& input_dims,
				   const std::vector<std::string>& output_blobs,
				   uint32_t maxBatchSize=DEFAULT_MAX_BATCH_SIZE,
				   precisionType precision=TYPE_FASTEST,
				   deviceType device=DEVICE_GPU, bool allowGPUFallback=true,
				   nvinfer1::IInt8Calibrator* calibrator=NULL, cudaStream_t stream=NULL );

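	/*
	 * Usage sketch (illustrative; the subclass name and file paths below are
	 * hypothetical examples, not defined by this header):
	 *
	 *    myNet* net = new myNet();    // a tensorNet-derived class
	 *
	 *    if( !net->LoadNetwork("networks/deploy.prototxt",
	 *                          "networks/snapshot.caffemodel",
	 *                          NULL, "data", "prob") )
	 *       printf(LOG_TRT "failed to load the network\n");
	 */
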
	/** Manually enable layer profiling times. */
	void EnableLayerProfiler();

	/** Manually enable debug messages and synchronization. */
	void EnableDebug();

	/** Return true if GPU fallback is enabled. */
	inline bool AllowGPUFallback() const { return mAllowGPUFallback; }

	/** Retrieve the device being used for execution. */
	inline deviceType GetDevice() const { return mDevice; }

	/** Retrieve the type of precision being used. */
	inline precisionType GetPrecision() const { return mPrecision; }

	/** Check if a particular precision is being used. */
	inline bool IsPrecision( precisionType type ) const { return (mPrecision == type); }

	/** Determine the fastest native precision on a device. */
	static precisionType FindFastestPrecision( deviceType device=DEVICE_GPU, bool allowInt8=true );

	/** Detect the precisions supported natively on a device. */
	static std::vector<precisionType> DetectNativePrecisions( deviceType device=DEVICE_GPU );

	/** Detect if a particular precision is supported natively. */
	static bool DetectNativePrecision( const std::vector<precisionType>& nativeTypes, precisionType type );

	/** Detect if a particular precision is supported natively. */
	static bool DetectNativePrecision( precisionType precision, deviceType device=DEVICE_GPU );

	/** Retrieve the stream that the device is operating on. */
	inline cudaStream_t GetStream() const { return mStream; }

	/** Create and use a new stream for execution. */
	cudaStream_t CreateStream( bool nonBlocking=true );

	/** Set the stream that the device is operating on. */
	void SetStream( cudaStream_t stream );

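	/*
	 * Usage sketch (illustrative): running inference on a dedicated
	 * non-blocking CUDA stream rather than the default stream. This
	 * assumes CreateStream() also applies the new stream via SetStream():
	 *
	 *    cudaStream_t stream = net->CreateStream();
	 *    // ... process frames ...
	 *    CUDA(cudaStreamSynchronize(stream));
	 */
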
	/** Retrieve the path to the network prototxt file. */
	inline const char* GetPrototxtPath() const { return mPrototxtPath.c_str(); }

	/** Retrieve the path to the network model file. */
	inline const char* GetModelPath() const { return mModelPath.c_str(); }

	/** Retrieve the format of the network model. */
	inline modelType GetModelType() const { return mModelType; }

	/** Return true if the model is of the specified format. */
	inline bool IsModelType( modelType type ) const { return (mModelType == type); }

	/** Retrieve the network runtime (in milliseconds). */
	inline float GetNetworkTime() { return GetProfilerTime(PROFILER_NETWORK, PROFILER_CUDA); }

	/** Retrieve the profiler runtime (in milliseconds) for both CPU and CUDA. */
	inline float2 GetProfilerTime( profilerQuery query ) { PROFILER_QUERY(query); return mProfilerTimes[query]; }

	/** Retrieve the profiler runtime (in milliseconds) for the given device. */
	inline float GetProfilerTime( profilerQuery query, profilerDevice device ) { PROFILER_QUERY(query); return (device == PROFILER_CPU) ? mProfilerTimes[query].x : mProfilerTimes[query].y; }

	/** Print the profiler times (in milliseconds). */
	inline void PrintProfilerTimes()
	{
		printf("\n");
		printf(LOG_TRT "----------------------------------------------\n");
		printf(LOG_TRT "Timing Report %s\n", GetModelPath());
		printf(LOG_TRT "----------------------------------------------\n");

		for( uint32_t n=0; n <= PROFILER_TOTAL; n++ )
		{
			const profilerQuery query = (profilerQuery)n;

			if( PROFILER_QUERY(query) )
				printf(LOG_TRT "%-12s CPU %8.5fms CUDA %8.5fms\n", profilerQueryToStr(query), mProfilerTimes[n].x, mProfilerTimes[n].y);
		}

		printf(LOG_TRT "----------------------------------------------\n\n");

		static bool first_run=true;

		if( first_run )
		{
			printf(LOG_TRT "note -- when processing a single image, run 'sudo jetson_clocks' before\n"
				  "        to disable DVFS for more accurate profiling/timing measurements\n\n");

			first_run = false;
		}
	}

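	/*
	 * Usage sketch (illustrative; assumes 'net' points to a tensorNet-derived
	 * object that has already processed a frame):
	 *
	 *    const float2 t = net->GetProfilerTime(PROFILER_NETWORK);
	 *    printf("network: CPU %.2f ms, CUDA %.2f ms\n", t.x, t.y);
	 *
	 *    net->PrintProfilerTimes();    // full timing report across all queries
	 */
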
protected:

	/** Constructor. */
	tensorNet();

	/** Create and output an optimized network model. */
	bool ProfileModel( const std::string& deployFile, const std::string& modelFile,
				    const char* input, const Dims3& inputDims,
				    const std::vector<std::string>& outputs, uint32_t maxBatchSize,
				    precisionType precision, deviceType device, bool allowGPUFallback,
				    nvinfer1::IInt8Calibrator* calibrator, std::ostream& modelStream);

	/** Logger class for GIE info/warning/errors. */
	class Logger : public nvinfer1::ILogger
	{
		void log( Severity severity, const char* msg ) override
		{
			if( severity != Severity::kINFO /*|| mEnableDebug*/ )
				printf(LOG_TRT "%s\n", msg);
		}
	} gLogger;

	/** Profiler interface for measuring layer timings. */
	class Profiler : public nvinfer1::IProfiler
	{
	public:
		Profiler() : timingAccumulator(0.0f) { }

		virtual void reportLayerTime(const char* layerName, float ms)
		{
			printf(LOG_TRT "layer %s - %f ms\n", layerName, ms);
			timingAccumulator += ms;
		}

		float timingAccumulator;

	} gProfiler;

	/** Begin a profiling query, before the network is run. */
	inline void PROFILER_BEGIN( profilerQuery query )
	{
		const uint32_t evt  = query*2;
		const uint32_t flag = (1 << query);

		CUDA(cudaEventRecord(mEventsGPU[evt], mStream));
		timestamp(&mEventsCPU[evt]);

		mProfilerQueriesUsed |= flag;
		mProfilerQueriesDone &= ~flag;
	}

	/** End a profiling query, after the network is run. */
	inline void PROFILER_END( profilerQuery query )
	{
		const uint32_t evt = query*2+1;

		CUDA(cudaEventRecord(mEventsGPU[evt]));
		timestamp(&mEventsCPU[evt]);

		timespec cpuTime;
		timeDiff(mEventsCPU[evt-1], mEventsCPU[evt], &cpuTime);
		mProfilerTimes[query].x = timeFloat(cpuTime);

		if( mEnableProfiler && query == PROFILER_NETWORK )
		{
			printf(LOG_TRT "layer network time - %f ms\n", gProfiler.timingAccumulator);
			gProfiler.timingAccumulator = 0.0f;
			printf(LOG_TRT "note -- when processing a single image, run 'sudo jetson_clocks' before\n"
				  "        to disable DVFS for more accurate profiling/timing measurements\n");
		}
	}

	/** Query the CUDA part of a profiler query. */
	inline bool PROFILER_QUERY( profilerQuery query )
	{
		const uint32_t flag = (1 << query);

		if( query == PROFILER_TOTAL )
		{
			mProfilerTimes[PROFILER_TOTAL].x = 0.0f;
			mProfilerTimes[PROFILER_TOTAL].y = 0.0f;

			for( uint32_t n=0; n < PROFILER_TOTAL; n++ )
			{
				if( PROFILER_QUERY((profilerQuery)n) )
				{
					mProfilerTimes[PROFILER_TOTAL].x += mProfilerTimes[n].x;
					mProfilerTimes[PROFILER_TOTAL].y += mProfilerTimes[n].y;
				}
			}

			return true;
		}
		else if( mProfilerQueriesUsed & flag )
		{
			if( !(mProfilerQueriesDone & flag) )
			{
				const uint32_t evt = query*2;
				float cuda_time = 0.0f;
				CUDA(cudaEventElapsedTime(&cuda_time, mEventsGPU[evt], mEventsGPU[evt+1]));
				mProfilerTimes[query].y = cuda_time;
				mProfilerQueriesDone |= flag;
				//mProfilerQueriesUsed &= ~flag;
			}

			return true;
		}

		return false;
	}

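	/*
	 * Usage sketch for subclasses (illustrative; the enqueue step stands in
	 * for whatever the derived class actually runs):
	 *
	 *    PROFILER_BEGIN(PROFILER_NETWORK);
	 *    // ... execute inference with mContext on mStream ...
	 *    PROFILER_END(PROFILER_NETWORK);
	 *
	 *    // later, PROFILER_QUERY(PROFILER_NETWORK) resolves the CUDA event
	 *    // pair into mProfilerTimes[PROFILER_NETWORK].y
	 */
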
protected:

	/* Member Variables */
	std::string mPrototxtPath;
	std::string mModelPath;
	std::string mMeanPath;
	std::string mInputBlobName;
	std::string mCacheEnginePath;
	std::string mCacheCalibrationPath;

	deviceType    mDevice;
	precisionType mPrecision;
	modelType     mModelType;
	cudaStream_t  mStream;
	cudaEvent_t   mEventsGPU[PROFILER_TOTAL * 2];
	timespec      mEventsCPU[PROFILER_TOTAL * 2];

	nvinfer1::IRuntime* mInfer;
	nvinfer1::ICudaEngine* mEngine;
	nvinfer1::IExecutionContext* mContext;

	uint32_t mWidth;
	uint32_t mHeight;
	uint32_t mInputSize;
	float*   mInputCPU;
	float*   mInputCUDA;

	float2   mProfilerTimes[PROFILER_TOTAL + 1];
	uint32_t mProfilerQueriesUsed;
	uint32_t mProfilerQueriesDone;

	uint32_t mMaxBatchSize;
	bool     mEnableProfiler;
	bool     mEnableDebug;
	bool     mAllowGPUFallback;

	Dims3 mInputDims;

	struct outputLayer
	{
		std::string name;
		Dims3 dims;
		uint32_t size;
		float* CPU;
		float* CUDA;
	};

	std::vector<outputLayer> mOutputs;
};

#endif
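
Putting it together: tensorNet is abstract with a protected constructor, so applications derive from it (as the library's network classes like imageNet do) and call LoadNetwork() from a factory function. A minimal sketch follows; the class name customNet and the file/layer names are illustrative assumptions, not defined by this header:

#include "tensorNet.h"

class customNet : public tensorNet
{
public:
	static customNet* Create()
	{
		customNet* net = new customNet();

		// build or deserialize the TensorRT engine
		// (paths and blob names are example values)
		if( !net->LoadNetwork("networks/deploy.prototxt",
						  "networks/snapshot.caffemodel",
						  NULL, "data", "prob") )
		{
			delete net;
			return NULL;
		}

		return net;
	}

protected:
	customNet() { }
};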