LCOV - code coverage report
Current view: top level - sources/inference - TensorRTInferencer.cpp (source / functions)
                                                      Hit    Total    Coverage
Test: filtered.info                    Lines:         124      162      76.5 %
Date: 2025-08-07 15:31:19              Functions:       8       10      80.0 %

          Line data    Source code
       1             : #include "../../includes/inference/TensorRTInferencer.hpp"  // Include TensorRTInferencer header
       2             : #include <fstream>
       3             : #include <iostream>
       4             : #include <stdexcept>
       5             : #include <numeric>
       6             : 
       7             : // Constructor: loads TensorRT engine, allocates memory and sets up execution context
       8          24 : TensorRTInferencer::TensorRTInferencer(const std::string& enginePath) :
       9             :         runtime(nullptr),        // Initialize runtime pointer to nullptr
      10             :         engine(nullptr),         // Initialize engine pointer to nullptr
      11             :         context(nullptr),        // Initialize execution context pointer to nullptr
      12             :         inputBindingIndex(-1),   // Initialize input binding index
      13             :         outputBindingIndex(-1),  // Initialize output binding index
      14             :         inputSize(208, 208),     // Set default input image size
      15             :         deviceInput(nullptr),    // Initialize device input pointer to nullptr
      16             :         deviceOutput(nullptr),   // Initialize device output pointer to nullptr
      17             :         stream(nullptr),         // Initialize CUDA stream pointer to nullptr
      18             :         hostInput(nullptr),      // Initialize host input pointer to nullptr
      19          31 :         hostOutput(nullptr) {    // Initialize host output pointer to nullptr
      20             : 
      21          24 :         cudaSetDevice(0);  // Set CUDA device to GPU 0
      22             : 
      23          24 :         engineData = readEngineFile(enginePath);  // Load serialized engine file into memory
      24             : 
      25          23 :         runtime = nvinfer1::createInferRuntime(Logger::instance());  // Create TensorRT runtime with logger
      26          23 :         if (!runtime) {  // Check if runtime creation failed
      27           0 :                 throw std::runtime_error("Failed to create TensorRT Runtime");
      28             :         }
      29             : 
      30          23 :         engine = runtime->deserializeCudaEngine(engineData.data(), engineData.size());  // Deserialize the engine from the loaded data
      31          23 :         if (!engine) {  // Check if deserialization failed
      32           0 :                 throw std::runtime_error("Failed to deserialize engine");
      33             :         }
      34             : 
      35          23 :         context = engine->createExecutionContext();  // Create execution context from the engine
      36          23 :         if (!context) {  // Check if context creation failed
      37           0 :                 throw std::runtime_error("Failed to create execution context");
      38             :         }
      39             : 
      40          23 :         lanePostProcessor = new LanePostProcessor(350, 260, 10.0f, 10.0f);  // Initialize lane post-processor with parameters
      41          23 :         laneCurveFitter = new LaneCurveFitter(5.0f, 20, 20, 300); // Initialize lane curve fitter with parameters
      42             : 
      43          69 :         for (int i = 0; i < engine->getNbBindings(); i++) {  // Loop through all bindings
      44          46 :                 if (engine->bindingIsInput(i)) {  // If binding is input
      45          23 :                         inputBindingIndex = i;  // Save input binding index
      46             :                 } else {
      47          23 :                         outputBindingIndex = i;  // Otherwise, save output binding index
      48             :                 }
      49             :         }
      50             : 
      51          23 :         if (inputBindingIndex == -1 || outputBindingIndex == -1) {  // Verify both input and output were found
      52           0 :                 throw std::runtime_error("Could not find input and output bindings");
      53             :         }
      54             : 
      55          23 :         inputDims = engine->getBindingDimensions(inputBindingIndex);  // Get input tensor dimensions
      56          23 :         outputDims = engine->getBindingDimensions(outputBindingIndex);  // Get output tensor dimensions
      57             : 
      58          23 :         if (inputDims.d[0] == -1) {  // If input has dynamic batch dimension
      59           0 :                 nvinfer1::Dims4 explicitDims(1, inputSize.height, inputSize.width, 1);  // Define explicit batch size and dimensions
      60           0 :                 context->setBindingDimensions(inputBindingIndex, explicitDims);  // Set explicit input dimensions
      61           0 :                 inputDims = context->getBindingDimensions(inputBindingIndex);  // Update inputDims after setting
      62             :         }
      63             : 
      64          23 :         outputDims = context->getBindingDimensions(outputBindingIndex);  // Confirm and update outputDims
      65             : 
      66         115 :         for (int i = 0; i < outputDims.nbDims; i++) {  // Check if any output dimension is dynamic
      67          92 :                 if (outputDims.d[i] < 0) {
      68           0 :                         throw std::runtime_error("Output shape is undefined or dynamic");  // Throw error if output is not fully defined
      69             :                 }
      70             :         }
      71             : 
      72          23 :         inputElementCount = 1;  // Initialize input element count
      73         115 :         for (int i = 0; i < inputDims.nbDims; i++) {  // Multiply all input dimensions
      74          92 :                 inputElementCount *= static_cast<size_t>(inputDims.d[i]);
      75             :         }
      76          23 :         inputByteSize = inputElementCount * sizeof(float);  // Calculate input buffer size in bytes
      77             : 
      78          23 :         outputElementCount = 1;  // Initialize output element count
      79         115 :         for (int i = 0; i < outputDims.nbDims; i++) {  // Multiply all output dimensions
      80          92 :                 outputElementCount *= static_cast<size_t>(outputDims.d[i]);
      81             :         }
      82          23 :         outputByteSize = outputElementCount * sizeof(float);  // Calculate output buffer size in bytes
      83             : 
      84             :         cudaError_t status;  // Define variable for checking CUDA errors
      85             : 
      86          23 :         status = cudaStreamCreate(&stream);  // Create a CUDA stream for async operations
      87          23 :         if (status != cudaSuccess) {  // Check stream creation
      88           0 :                 throw std::runtime_error("Failed to create CUDA stream: " + std::string(cudaGetErrorString(status)));
      89             :         }
      90             : 
      91          23 :         status = cudaMalloc(&deviceInput, inputByteSize);  // Allocate device memory for input tensor
      92          23 :         if (status != cudaSuccess) {  // Check input memory allocation
      93           0 :                 throw std::runtime_error("Failed to allocate input memory on GPU: " + std::string(cudaGetErrorString(status)));
      94             :         }
      95             : 
      96          23 :         status = cudaMalloc(&deviceOutput, outputByteSize);  // Allocate device memory for output tensor
      97          23 :         if (status != cudaSuccess) {  // Check output memory allocation
      98           0 :                 cudaFree(deviceInput);  // Free previously allocated input memory if failed
      99           0 :                 throw std::runtime_error("Failed to allocate output memory on GPU: " + std::string(cudaGetErrorString(status)));
     100             :         }
     101             : 
     102          23 :         bindings.resize(engine->getNbBindings());  // Resize bindings array to number of bindings
     103          23 :         bindings[inputBindingIndex] = deviceInput;  // Assign device input buffer
     104          23 :         bindings[outputBindingIndex] = deviceOutput;  // Assign device output buffer
     105             : 
     106          23 :         Publisher::instance(5556); // Initialize publisher for inference results
     107             : 
     108          23 :         initUndistortMaps();  // Initialize undistortion maps for camera calibration
      109          23 :         cudaStream = cv::cuda::Stream();  // OpenCV CUDA stream for asynchronous OpenCV calls (distinct from the raw cudaStream_t `stream`)
     110          23 : }
     111             : 
      112             : // Clean up allocated resources (device memory, CUDA stream, post-processing helpers)
     113          23 : void TensorRTInferencer::cleanupResources() {
     114          23 :         if (deviceInput) cudaFree(deviceInput);   // Free input buffer if allocated
     115          23 :         if (deviceOutput) cudaFree(deviceOutput); // Free output buffer if allocated
     116          23 :         if (stream) cudaStreamDestroy(stream);    // Destroy CUDA stream if created
     117          23 :         if (lanePostProcessor) delete lanePostProcessor; // Delete post-processor object
      118          23 :         if (laneCurveFitter) delete laneCurveFitter; // Delete curve fitter object
     119          23 :         deviceInput = nullptr;    // Set pointers to nullptr after freeing
     120          23 :         deviceOutput = nullptr;
     121          23 :         stream = nullptr;
     122          23 : }
     123             : 
     124             : // Destructor: free all allocated resources
     125          23 : TensorRTInferencer::~TensorRTInferencer() {
     126          23 :         if (hostInput) cudaFreeHost(hostInput);   // Free pinned host memory for input
     127          23 :         if (hostOutput) cudaFreeHost(hostOutput); // Free pinned host memory for output
     128          23 :         cleanupResources();  // Free GPU resources
     129             : 
     130          23 :         if (context) {
     131          23 :                 context->destroy();   // Destroy TensorRT execution context
     132             :         }
     133          23 :         if (engine) {
     134          23 :                 engine->destroy();    // Destroy TensorRT engine
     135             :         }
     136          23 :         if (runtime) {
     137          23 :                 runtime->destroy();   // Destroy TensorRT runtime
     138             :         }
     139          23 : }
     140             : 
     141             : // Read the serialized TensorRT engine file into memory
     142          24 : std::vector<char> TensorRTInferencer::readEngineFile(const std::string& enginePath) {
     143          48 :         std::ifstream file(enginePath, std::ios::binary | std::ios::ate);  // Open file in binary mode, go to end
     144          24 :         if (!file.good()) {   // Check if file opened successfully
     145           1 :                 throw std::runtime_error("Engine file not found: " + enginePath);
     146             :         }
     147             : 
     148          23 :         size_t size = file.tellg();  // Get file size
     149          23 :         file.seekg(0, std::ios::beg);  // Go back to beginning of file
     150             : 
     151          23 :         std::vector<char> buffer(size);  // Create buffer of the correct size
     152          23 :         if (!file.read(buffer.data(), size)) {  // Read file into buffer
     153           0 :                 throw std::runtime_error("Failed to read engine file");
     154             :         }
     155             : 
     156          46 :         return buffer;  // Return loaded engine buffer
     157             : }
     158             : 
     159             : // Preprocess input image on GPU: convert to grayscale, resize, normalize
     160          28 : cv::cuda::GpuMat TensorRTInferencer::preprocessImage(const cv::cuda::GpuMat& gpuImage) {
     161          28 :         if (gpuImage.empty()) {  // Validate input image
     162           2 :                 throw std::runtime_error("Input image is empty");
     163             :         }
     164             : 
     165          26 :         if (gpuImage.type() != CV_8UC3 && gpuImage.type() != CV_8UC1) {  // Check if input image is in expected format
     166           2 :                 throw std::runtime_error("Input image must be CV_8UC3 (color) or CV_8UC1 (grayscale)");
     167             :         }
     168             : 
     169          48 :         cv::cuda::GpuMat gpuGray;
     170          24 :         if (gpuImage.channels() > 1) {   // If input has multiple channels (color)
     171          22 :                 cv::cuda::cvtColor(gpuImage, gpuGray, cv::COLOR_BGR2GRAY); // Convert to grayscale
     172             :         } else {
     173           2 :                 gpuGray = gpuImage;  // Already grayscale, no conversion needed
     174             :         }
     175             : 
     176          48 :         cv::cuda::GpuMat gpuResized;
     177          24 :         cv::cuda::resize(gpuGray, gpuResized, inputSize, 0, 0, cv::INTER_LINEAR); // Resize to network input size
     178             : 
     179          24 :         cv::cuda::GpuMat gpuFloat;
     180          24 :         gpuResized.convertTo(gpuFloat, CV_32F, 1.0 / 255.0); // Normalize to [0,1] and convert to float32
     181             : 
     182          48 :         return gpuFloat;  // Return preprocessed image (still on GPU)
     183             : }
     184             : 
     185             : // Run inference given a GpuMat input (already preprocessed)
     186          21 : void TensorRTInferencer::runInference(const cv::cuda::GpuMat& gpuInput) {
     187          21 :         if (gpuInput.rows != inputSize.height || gpuInput.cols != inputSize.width) {  // Verify input dimensions
     188           2 :                 throw std::runtime_error("Input image dimensions do not match expected dimensions");
     189             :         }
     190             : 
     191          57 :         cudaError_t err = cudaMemcpy2DAsync(
     192             :                 deviceInput,                          // Destination: TensorRT input buffer
     193          19 :                 inputSize.width * sizeof(float),      // Destination row stride
     194          19 :                 gpuInput.ptr<float>(),                // Source pointer: GpuMat data
     195          19 :                 gpuInput.step,                        // Source stride
     196          19 :                 inputSize.width * sizeof(float),      // Width to copy in bytes
     197          19 :                 inputSize.height,                     // Height to copy (rows)
     198             :                 cudaMemcpyDeviceToDevice,             // Type of copy: GPU to GPU
     199             :                 stream                                // Use CUDA stream
     200             :         );
     201             : 
     202          19 :         if (err != cudaSuccess) {  // Check if memory copy failed
     203           0 :                 throw std::runtime_error("cudaMemcpy2DAsync failed: " + std::string(cudaGetErrorString(err)));
     204             :         }
     205             : 
     206             :         /* if (!context->enqueueV2(bindings.data(), stream, nullptr)) {  // Enqueue inference on the GPU
     207             :                 throw std::runtime_error("TensorRT inference execution failed");
     208             :         } */
     209             : 
      210          19 :         context->executeV2(bindings.data()); // Synchronous execution; the commented-out enqueueV2 above would run asynchronously on the stream
     211          19 : }
     212             : 
     213             : // Perform full prediction pipeline: preprocess, inference, and extract output
     214          17 : cv::cuda::GpuMat TensorRTInferencer::makePrediction(const cv::cuda::GpuMat& gpuImage) {
     215          34 :         cv::cuda::GpuMat gpuInputFloat = preprocessImage(gpuImage);  // Preprocess input image on GPU
     216             : 
     217          17 :         runInference(gpuInputFloat);  // Run inference
     218             : 
     219          17 :         int height = outputDims.d[1];
     220          17 :         int width  = outputDims.d[2];
     221             : 
     222             :         // Allocate or resize the output mask on GPU if it's not allocated or has wrong size
     223          17 :         if (outputMaskGpu.empty() || outputMaskGpu.rows != height || outputMaskGpu.cols != width) {
     224           7 :                 outputMaskGpu = cv::cuda::GpuMat(height, width, CV_32F);
     225             :         }
     226             : 
     227             :         // Copy the raw prediction output from TensorRT device memory to `outputMaskGpu`
     228             :         // - Assumes output is already in device memory (deviceOutput)
     229             :         // - No CPU-GPU transfer, all device-to-device
     230          17 :         cudaMemcpy2DAsync(
     231          17 :                 outputMaskGpu.ptr<float>(), outputMaskGpu.step,
     232          17 :                 deviceOutput, width * sizeof(float),
     233          17 :                 width * sizeof(float), height,
     234             :                 cudaMemcpyDeviceToDevice, stream
     235             :         );
     236             : 
      237             :         // Post-processing starts here (currently commented out)
     238             : /*      cv::cuda::GpuMat postProcessedMaskGpu = lanePostProcessor->process(outputMaskGpu);
     239             : 
     240             :         // Download post-processed binary mask for polyfitting
     241             :         cv::Mat maskCpu;
     242             :         postProcessedMaskGpu.download(maskCpu);
     243             : 
     244             :         // Fit lanes and compute centerline
     245             :         std::vector<LaneCurveFitter::LaneCurve> lanes = laneCurveFitter->fitLanes(maskCpu);
     246             :         std::cout << "[DEBUG] Number of fitted lanes: " << lanes.size() << std::endl;
     247             :         auto centerlineOpt = laneCurveFitter->computeVirtualCenterline(lanes, maskCpu.cols, maskCpu.rows);
     248             :         if (!centerlineOpt.has_value()) {
     249             :                 std::cout << "[DEBUG] No centerline could be computed." << std::endl;
     250             :         }
     251             : 
     252             :         // Draw centerline on CPU
     253             :         if (centerlineOpt.has_value()) {
     254             :                 const auto& centerline = centerlineOpt.value().blended;
     255             :                 for (size_t i = 1; i < centerline.size(); ++i) {
     256             :                         cv::line(maskCpu,
     257             :                                         centerline[i - 1],
     258             :                                         centerline[i],
     259             :                                         cv::Scalar(255), // White
     260             :                                         2,               // Thickness
     261             :                                         cv::LINE_AA);
     262             :                 }
     263             :         }
     264             : 
     265             :         // Upload mask with centerline back to GPU
     266             :         postProcessedMaskGpu.upload(maskCpu); */
     267             : 
     268          34 :         return outputMaskGpu;
     269             : }
     270             : 
     271          23 : void TensorRTInferencer::initUndistortMaps() {
     272          23 :         cv::Mat cameraMatrix, distCoeffs;
     273          46 :         cv::FileStorage fs("/home/jetson/models/lane-detection/camera_calibration.yml", cv::FileStorage::READ);  // Open calibration file
     274             : 
     275          23 :         if (!fs.isOpened()) {
     276           0 :                 std::cerr << "[Error] Failed to open camera_calibration.yml" << std::endl;
     277           0 :                 return;  // Handle file opening error
     278             :         }
     279             : 
     280          23 :         fs["camera_matrix"] >> cameraMatrix;  // Read camera matrix
     281          23 :         fs["distortion_coefficients"] >> distCoeffs;  // Read distortion coefficients
     282          23 :         fs.release();  // Close file
     283             : 
     284          46 :         cv::Mat mapx, mapy;
     285          69 :         cv::initUndistortRectifyMap(
     286          46 :                 cameraMatrix, distCoeffs, cv::Mat(), cameraMatrix,
     287             :                 cv::Size(1280, 720),
     288             :                 CV_32FC1, mapx, mapy
     289             :         );  // Compute undistortion mapping
     290             : 
     291          23 :         d_mapx.upload(mapx);  // Upload X map to GPU
     292          23 :         d_mapy.upload(mapy);  // Upload Y map to GPU
     293             : }
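
initUndistortMaps() expects camera_calibration.yml in OpenCV FileStorage format with the two keys it reads ("camera_matrix" and "distortion_coefficients"). A minimal sketch of how such a file could be written; the intrinsic and distortion values are placeholders, not the project's real calibration:

        #include <opencv2/opencv.hpp>

        int main() {
                // Placeholder intrinsics for a 1280x720 sensor; real values come from camera calibration.
                cv::Mat cameraMatrix = (cv::Mat_<double>(3, 3) <<
                        1000.0,    0.0, 640.0,
                           0.0, 1000.0, 360.0,
                           0.0,    0.0,   1.0);
                cv::Mat distCoeffs = (cv::Mat_<double>(1, 5) << -0.30, 0.10, 0.0, 0.0, 0.0);

                cv::FileStorage fs("camera_calibration.yml", cv::FileStorage::WRITE);
                fs << "camera_matrix" << cameraMatrix;              // key read back by initUndistortMaps()
                fs << "distortion_coefficients" << distCoeffs;      // key read back by initUndistortMaps()
                fs.release();
                return 0;
        }
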
     294             : 
     295           0 : void TensorRTInferencer::doInference(const cv::Mat& frame) {
     296           0 :         if (frame.empty()) {
     297           0 :                 throw std::runtime_error("Input frame is empty");
     298             :         }
     299             : 
     300           0 :         cv::cuda::GpuMat d_frame(frame);  // Upload frame to GPU
     301           0 :         cv::cuda::GpuMat d_undistorted;
     302           0 :         cv::cuda::remap(d_frame, d_undistorted, d_mapx, d_mapy, cv::INTER_LINEAR, 0, cv::Scalar(), cudaStream);  // Undistort frame
     303             : 
     304           0 :         cv::cuda::GpuMat d_prediction_mask = makePrediction(d_undistorted);  // Run model inference
     305             : 
     306             :         // Convert to 8-bit (0 or 255) in a new GpuMat
     307           0 :         cv::cuda::GpuMat d_mask_u8;
     308           0 :         d_prediction_mask.convertTo(d_mask_u8, CV_8U, 255.0);  // Multiply 0/1 float to 0/255
     309             : 
     310           0 :         cv::Mat binary_mask_cpu;
     311           0 :         d_mask_u8.download(binary_mask_cpu, cudaStream);
     312           0 :         cv::threshold(binary_mask_cpu, binary_mask_cpu, 128, 255, cv::THRESH_BINARY);
     313           0 :         cudaStream.waitForCompletion();  // Ensure async operations are complete
     314             : 
      315             :         // Convert model output to 8-bit again on GPU, asynchronously, for the visualization copy
     316           0 :         cv::cuda::GpuMat d_visualization;
     317           0 :         d_prediction_mask.convertTo(d_visualization, CV_8U, 255.0, 0, cudaStream);
     318             : 
     319           0 :         cv::cuda::GpuMat d_resized_mask;
     320             : 
     321           0 :         cv::cuda::resize(d_visualization, d_resized_mask,
     322           0 :                                                 cv::Size(frame.cols * 0.5, frame.rows * 0.5),
     323           0 :                                                 0, 0, cv::INTER_LINEAR, cudaStream);  // Resize for display
     324           0 :         cudaStream.waitForCompletion();  // Synchronize
     325             : 
      326           0 :         Publisher::instance(5556)->publishInferenceFrame("inference_frame", d_resized_mask); // Publish frame to ZeroMQ publisher
     327           0 : }
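
doInference() is the one routine in this listing with no coverage (0 hits on every line). A minimal sketch of a test that would exercise it, assuming it runs on the target device with a serialized TensorRT engine, the calibration YAML path hard-coded above, and a reachable Publisher endpoint; the file paths below are placeholders:

        #include <opencv2/opencv.hpp>
        #include "TensorRTInferencer.hpp"  // adjust to the test's include path

        int main() {
                // Placeholder paths: substitute the real serialized engine and a 1280x720 BGR frame.
                TensorRTInferencer inferencer("/path/to/lane_model.engine");
                cv::Mat frame = cv::imread("/path/to/test_frame.png");
                if (frame.empty()) return 1;    // doInference() throws on an empty frame
                inferencer.doInference(frame);  // undistort, predict, downscale, publish
                return 0;
        }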

Generated by: LCOV version 1.14