#include "../../includes/inference/TensorRTInferencer.hpp" // Include TensorRTInferencer header
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <numeric>

// Constructor: loads TensorRT engine, allocates memory and sets up execution context
TensorRTInferencer::TensorRTInferencer(const std::string& enginePath) :
	runtime(nullptr), // Initialize runtime pointer to nullptr
	engine(nullptr), // Initialize engine pointer to nullptr
	context(nullptr), // Initialize execution context pointer to nullptr
	inputBindingIndex(-1), // Initialize input binding index
	outputBindingIndex(-1), // Initialize output binding index
	inputSize(208, 208), // Set default input image size
	deviceInput(nullptr), // Initialize device input pointer to nullptr
	deviceOutput(nullptr), // Initialize device output pointer to nullptr
	stream(nullptr), // Initialize CUDA stream pointer to nullptr
	hostInput(nullptr), // Initialize host input pointer to nullptr
	hostOutput(nullptr) { // Initialize host output pointer to nullptr

	cudaSetDevice(0); // Set CUDA device to GPU 0

	engineData = readEngineFile(enginePath); // Load serialized engine file into memory

	runtime = nvinfer1::createInferRuntime(Logger::instance()); // Create TensorRT runtime with logger
	if (!runtime) { // Check if runtime creation failed
		throw std::runtime_error("Failed to create TensorRT Runtime");
	}

	engine = runtime->deserializeCudaEngine(engineData.data(), engineData.size()); // Deserialize the engine from the loaded data
	if (!engine) { // Check if deserialization failed
		throw std::runtime_error("Failed to deserialize engine");
	}

	context = engine->createExecutionContext(); // Create execution context from the engine
	if (!context) { // Check if context creation failed
		throw std::runtime_error("Failed to create execution context");
	}

	lanePostProcessor = new LanePostProcessor(350, 260, 10.0f, 10.0f); // Initialize lane post-processor with parameters
	laneCurveFitter = new LaneCurveFitter(5.0f, 20, 20, 300); // Initialize lane curve fitter with parameters

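	// Scan all bindings to locate the input and output tensors. This assumes
	// the engine exposes exactly one input and one output binding; with more
	// than one output, only the last output index found would be kept.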
	for (int i = 0; i < engine->getNbBindings(); i++) { // Loop through all bindings
		if (engine->bindingIsInput(i)) { // If binding is input
			inputBindingIndex = i; // Save input binding index
		} else {
			outputBindingIndex = i; // Otherwise, save output binding index
		}
	}

	if (inputBindingIndex == -1 || outputBindingIndex == -1) { // Verify both input and output were found
		throw std::runtime_error("Could not find input and output bindings");
	}

	inputDims = engine->getBindingDimensions(inputBindingIndex); // Get input tensor dimensions
	outputDims = engine->getBindingDimensions(outputBindingIndex); // Get output tensor dimensions

	if (inputDims.d[0] == -1) { // If input has dynamic batch dimension
		nvinfer1::Dims4 explicitDims(1, inputSize.height, inputSize.width, 1); // Define explicit batch size and dimensions
		context->setBindingDimensions(inputBindingIndex, explicitDims); // Set explicit input dimensions
		inputDims = context->getBindingDimensions(inputBindingIndex); // Update inputDims after setting
	}

	outputDims = context->getBindingDimensions(outputBindingIndex); // Confirm and update outputDims

	for (int i = 0; i < outputDims.nbDims; i++) { // Check if any output dimension is dynamic
		if (outputDims.d[i] < 0) {
			throw std::runtime_error("Output shape is undefined or dynamic"); // Throw error if output is not fully defined
		}
	}

	inputElementCount = 1; // Initialize input element count
	for (int i = 0; i < inputDims.nbDims; i++) { // Multiply all input dimensions
		inputElementCount *= static_cast<size_t>(inputDims.d[i]);
	}
	inputByteSize = inputElementCount * sizeof(float); // Calculate input buffer size in bytes

	outputElementCount = 1; // Initialize output element count
	for (int i = 0; i < outputDims.nbDims; i++) { // Multiply all output dimensions
		outputElementCount *= static_cast<size_t>(outputDims.d[i]);
	}
	outputByteSize = outputElementCount * sizeof(float); // Calculate output buffer size in bytes

	cudaError_t status; // Define variable for checking CUDA errors

	status = cudaStreamCreate(&stream); // Create a CUDA stream for async operations
	if (status != cudaSuccess) { // Check stream creation
		throw std::runtime_error("Failed to create CUDA stream: " + std::string(cudaGetErrorString(status)));
	}

	status = cudaMalloc(&deviceInput, inputByteSize); // Allocate device memory for input tensor
	if (status != cudaSuccess) { // Check input memory allocation
		throw std::runtime_error("Failed to allocate input memory on GPU: " + std::string(cudaGetErrorString(status)));
	}

	status = cudaMalloc(&deviceOutput, outputByteSize); // Allocate device memory for output tensor
	if (status != cudaSuccess) { // Check output memory allocation
		cudaFree(deviceInput); // Free previously allocated input memory if failed
		throw std::runtime_error("Failed to allocate output memory on GPU: " + std::string(cudaGetErrorString(status)));
	}

	bindings.resize(engine->getNbBindings()); // Resize bindings array to number of bindings
	bindings[inputBindingIndex] = deviceInput; // Assign device input buffer
	bindings[outputBindingIndex] = deviceOutput; // Assign device output buffer

	Publisher::instance(5556); // Initialize publisher for inference results

	initUndistortMaps(); // Initialize undistortion maps for camera calibration
	cudaStream = cv::cuda::Stream(); // OpenCV CUDA stream for asynchronous operations
}

// Clean up allocated GPU resources (device memory, streams)
void TensorRTInferencer::cleanupResources() {
	if (deviceInput) cudaFree(deviceInput); // Free input buffer if allocated
	if (deviceOutput) cudaFree(deviceOutput); // Free output buffer if allocated
	if (stream) cudaStreamDestroy(stream); // Destroy CUDA stream if created
	delete lanePostProcessor; // Delete post-processor object (delete on nullptr is safe)
	delete laneCurveFitter; // Delete curve fitter object
	deviceInput = nullptr; // Set pointers to nullptr after freeing
	deviceOutput = nullptr;
	stream = nullptr;
	lanePostProcessor = nullptr;
	laneCurveFitter = nullptr;
}

// Destructor: free all allocated resources
TensorRTInferencer::~TensorRTInferencer() {
	if (hostInput) cudaFreeHost(hostInput); // Free pinned host memory for input
	if (hostOutput) cudaFreeHost(hostOutput); // Free pinned host memory for output
	cleanupResources(); // Free GPU resources

	if (context) {
		context->destroy(); // Destroy TensorRT execution context
	}
	if (engine) {
		engine->destroy(); // Destroy TensorRT engine
	}
	if (runtime) {
		runtime->destroy(); // Destroy TensorRT runtime
	}
}

// Read the serialized TensorRT engine file into memory
std::vector<char> TensorRTInferencer::readEngineFile(const std::string& enginePath) {
	std::ifstream file(enginePath, std::ios::binary | std::ios::ate); // Open file in binary mode, positioned at the end
	if (!file.good()) { // Check if file opened successfully
		throw std::runtime_error("Engine file not found: " + enginePath);
	}

	size_t size = file.tellg(); // Get file size
	file.seekg(0, std::ios::beg); // Go back to beginning of file

	std::vector<char> buffer(size); // Create buffer of the correct size
	if (!file.read(buffer.data(), size)) { // Read file into buffer
		throw std::runtime_error("Failed to read engine file");
	}

	return buffer; // Return loaded engine buffer
}
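
// Note: the serialized engine consumed by readEngineFile() is typically built
// offline, e.g. with TensorRT's trtexec tool (illustrative invocation; the
// file names are placeholders):
//   trtexec --onnx=lane_model.onnx --saveEngine=lane_model.engine
// A serialized engine is specific to the GPU and TensorRT version that built it.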

// Preprocess input image on GPU: convert to grayscale, resize, normalize
cv::cuda::GpuMat TensorRTInferencer::preprocessImage(const cv::cuda::GpuMat& gpuImage) {
	if (gpuImage.empty()) { // Validate input image
		throw std::runtime_error("Input image is empty");
	}

	if (gpuImage.type() != CV_8UC3 && gpuImage.type() != CV_8UC1) { // Check if input image is in expected format
		throw std::runtime_error("Input image must be CV_8UC3 (color) or CV_8UC1 (grayscale)");
	}

	cv::cuda::GpuMat gpuGray;
	if (gpuImage.channels() > 1) { // If input has multiple channels (color)
		cv::cuda::cvtColor(gpuImage, gpuGray, cv::COLOR_BGR2GRAY); // Convert to grayscale
	} else {
		gpuGray = gpuImage; // Already grayscale, no conversion needed
	}

	cv::cuda::GpuMat gpuResized;
	cv::cuda::resize(gpuGray, gpuResized, inputSize, 0, 0, cv::INTER_LINEAR); // Resize to network input size

	cv::cuda::GpuMat gpuFloat;
	gpuResized.convertTo(gpuFloat, CV_32F, 1.0 / 255.0); // Normalize to [0,1] and convert to float32

	return gpuFloat; // Return preprocessed image (still on GPU)
}

// Run inference given a GpuMat input (already preprocessed)
void TensorRTInferencer::runInference(const cv::cuda::GpuMat& gpuInput) {
	if (gpuInput.rows != inputSize.height || gpuInput.cols != inputSize.width) { // Verify input dimensions
		throw std::runtime_error("Input image dimensions do not match expected dimensions");
	}

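	// A 2D copy is required here because cv::cuda::GpuMat rows are pitch-aligned:
	// the source stride (gpuInput.step) may be wider than the tightly packed rows
	// the TensorRT input buffer expects.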
	cudaError_t err = cudaMemcpy2DAsync(
		deviceInput, // Destination: TensorRT input buffer
		inputSize.width * sizeof(float), // Destination row stride
		gpuInput.ptr<float>(), // Source pointer: GpuMat data
		gpuInput.step, // Source stride
		inputSize.width * sizeof(float), // Width to copy in bytes
		inputSize.height, // Height to copy (rows)
		cudaMemcpyDeviceToDevice, // Type of copy: GPU to GPU
		stream // Use CUDA stream
	);

	if (err != cudaSuccess) { // Check if memory copy failed
		throw std::runtime_error("cudaMemcpy2DAsync failed: " + std::string(cudaGetErrorString(err)));
	}

	/* if (!context->enqueueV2(bindings.data(), stream, nullptr)) { // Asynchronous alternative: enqueue inference on the stream
		throw std::runtime_error("TensorRT inference execution failed");
	} */

	if (!context->executeV2(bindings.data())) { // Synchronous execution (use enqueueV2 above for async)
		throw std::runtime_error("TensorRT inference execution failed");
	}
}

// Perform full prediction pipeline: preprocess, inference, and extract output
cv::cuda::GpuMat TensorRTInferencer::makePrediction(const cv::cuda::GpuMat& gpuImage) {
	cv::cuda::GpuMat gpuInputFloat = preprocessImage(gpuImage); // Preprocess input image on GPU

	runInference(gpuInputFloat); // Run inference

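	// Indexing d[1]/d[2] as height/width assumes an NHWC-shaped output tensor
	// (batch, height, width, channels) with batch 1, mirroring the explicit
	// Dims4(1, H, W, 1) input shape set in the constructor.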
	int height = outputDims.d[1];
	int width = outputDims.d[2];

	// Allocate or resize the output mask on GPU if it's not allocated or has wrong size
	if (outputMaskGpu.empty() || outputMaskGpu.rows != height || outputMaskGpu.cols != width) {
		outputMaskGpu = cv::cuda::GpuMat(height, width, CV_32F);
	}

	// Copy the raw prediction output from TensorRT device memory to `outputMaskGpu`
	// - Assumes output is already in device memory (deviceOutput)
	// - No CPU-GPU transfer, all device-to-device
	cudaError_t err = cudaMemcpy2DAsync(
		outputMaskGpu.ptr<float>(), outputMaskGpu.step,
		deviceOutput, width * sizeof(float),
		width * sizeof(float), height,
		cudaMemcpyDeviceToDevice, stream
	);
	if (err != cudaSuccess) { // Check if memory copy failed
		throw std::runtime_error("cudaMemcpy2DAsync failed: " + std::string(cudaGetErrorString(err)));
	}
	cudaStreamSynchronize(stream); // Ensure the copy has finished before the mask is consumed on another stream

	// Post-processing starts here
	/* cv::cuda::GpuMat postProcessedMaskGpu = lanePostProcessor->process(outputMaskGpu);

	// Download post-processed binary mask for polyfitting
	cv::Mat maskCpu;
	postProcessedMaskGpu.download(maskCpu);

	// Fit lanes and compute centerline
	std::vector<LaneCurveFitter::LaneCurve> lanes = laneCurveFitter->fitLanes(maskCpu);
	std::cout << "[DEBUG] Number of fitted lanes: " << lanes.size() << std::endl;
	auto centerlineOpt = laneCurveFitter->computeVirtualCenterline(lanes, maskCpu.cols, maskCpu.rows);
	if (!centerlineOpt.has_value()) {
		std::cout << "[DEBUG] No centerline could be computed." << std::endl;
	}

	// Draw centerline on CPU
	if (centerlineOpt.has_value()) {
		const auto& centerline = centerlineOpt.value().blended;
		for (size_t i = 1; i < centerline.size(); ++i) {
			cv::line(maskCpu,
				centerline[i - 1],
				centerline[i],
				cv::Scalar(255), // White
				2, // Thickness
				cv::LINE_AA);
		}
	}

	// Upload mask with centerline back to GPU
	postProcessedMaskGpu.upload(maskCpu); */

	return outputMaskGpu;
}

void TensorRTInferencer::initUndistortMaps() {
	cv::Mat cameraMatrix, distCoeffs;
	cv::FileStorage fs("/home/jetson/models/lane-detection/camera_calibration.yml", cv::FileStorage::READ); // Open calibration file

	if (!fs.isOpened()) {
		std::cerr << "[Error] Failed to open camera_calibration.yml" << std::endl;
		return; // Handle file opening error
	}

	fs["camera_matrix"] >> cameraMatrix; // Read camera matrix
	fs["distortion_coefficients"] >> distCoeffs; // Read distortion coefficients
	fs.release(); // Close file

	cv::Mat mapx, mapy;
	cv::initUndistortRectifyMap(
		cameraMatrix, distCoeffs, cv::Mat(), cameraMatrix,
		cv::Size(1280, 720),
		CV_32FC1, mapx, mapy
	); // Compute undistortion mapping

	d_mapx.upload(mapx); // Upload X map to GPU
	d_mapy.upload(mapy); // Upload Y map to GPU
}

void TensorRTInferencer::doInference(const cv::Mat& frame) {
	if (frame.empty()) {
		throw std::runtime_error("Input frame is empty");
	}

	cv::cuda::GpuMat d_frame(frame); // Upload frame to GPU
	cv::cuda::GpuMat d_undistorted;
	cv::cuda::remap(d_frame, d_undistorted, d_mapx, d_mapy, cv::INTER_LINEAR, 0, cv::Scalar(), cudaStream); // Undistort frame

	cv::cuda::GpuMat d_prediction_mask = makePrediction(d_undistorted); // Run model inference

	// Convert to 8-bit (0 or 255) in a new GpuMat
	cv::cuda::GpuMat d_mask_u8;
	d_prediction_mask.convertTo(d_mask_u8, CV_8U, 255.0); // Scale 0/1 float mask to 0/255

	cv::Mat binary_mask_cpu;
	d_mask_u8.download(binary_mask_cpu, cudaStream);
	cudaStream.waitForCompletion(); // Wait for the async download before the CPU reads the data
	cv::threshold(binary_mask_cpu, binary_mask_cpu, 128, 255, cv::THRESH_BINARY);

	// Convert model output to 8-bit binary mask on GPU
	cv::cuda::GpuMat d_visualization;
	d_prediction_mask.convertTo(d_visualization, CV_8U, 255.0, 0, cudaStream);

	cv::cuda::GpuMat d_resized_mask;

	cv::cuda::resize(d_visualization, d_resized_mask,
		cv::Size(frame.cols * 0.5, frame.rows * 0.5),
		0, 0, cv::INTER_LINEAR, cudaStream); // Resize for display
	cudaStream.waitForCompletion(); // Synchronize

	Publisher::instance(5556)->publishInferenceFrame("inference_frame", d_resized_mask); // Publish frame to ZeroMQ publisher
}
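
/*
 * Minimal usage sketch (illustrative only; the engine path and capture device
 * are placeholders, not part of this file):
 *
 *   TensorRTInferencer inferencer("/home/jetson/models/lane-detection/model.engine");
 *   cv::VideoCapture capture(0);
 *   cv::Mat frame;
 *   while (capture.read(frame)) {
 *       inferencer.doInference(frame); // undistort, run the model, publish the mask
 *   }
 */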