ReUseX  0.0.1
3D Point Cloud Processing for Building Reuse
Loading...
Searching...
No Matches
Sam3.hpp
Go to the documentation of this file.
1#pragma once
4
11#include <unordered_map>
12#include <vector>
13
14#include <tokenizers_cpp.h>
15
17
18/* TensorRTSam3 is an implementation of the SAM3 model using TensorRT for
19efficient inference. It supports batch processing of images and prompts, with
20careful memory management to optimize performance on GPU. The class handles the
21loading of TensorRT engines for the vision encoder, text encoder, geometry
22encoder, and decoder, and provides a forward method to process input data and
23produce segmentation results. The implementation includes preprocessing of
24images, gathering of features from the vision encoder, and post-processing of
25the decoder outputs to generate final masks and bounding boxes. The design
26allows for flexibility in handling different types of prompts (text and
27geometry) and is optimized for use in real-time applications where latency is a
28concern. */
29class TensorRTSam3 : public IModel {
30 private:
32 using InferResultArray =
33 std::vector<::ReUseX::vision::common::object::DetectionBoxArray>;
34
35 public:
36 /* Constructs a TensorRTSam3 from engine and tokenizer file paths.
37 * @param vision_encoder_path: Path to the vision encoder engine file.
38 * @param text_encoder_path: Path to the text encoder engine file.
39 * @param geometry_encoder_path: Path to the geometry encoder engine file.
40 * @param decoder_path: Path to the TensorRT engine file for the decoder.
41 * @param tokenizer_path: Path to the tokenizer file.
42 * @param gpu_id: ID of the GPU to use for inference.
43 * NOTE(review): the definition is not visible in this header, so how each
44 * path is consumed (and when the engines are loaded) should be confirmed.
45 */
46 TensorRTSam3(const std::string vision_encoder_path,
47 const std::string text_encoder_path,
48 const std::string geometry_encoder_path,
49 const std::string decoder_path, const std::string tokenizer_path,
50 int gpu_id);
51
52 /* Factory function for TensorRTSam3 (this is not a destructor).
53 * @param model_path: Path to the model files. NOTE(review): previously
54 * documented as unused by this implementation - confirm in the definition.
55 * @return A unique pointer to an instance of TensorRTSam3.
56 */
57 static std::unique_ptr<TensorRTSam3>
58 create(const std::filesystem::path &model_path);
59
60 /* Forward method for TensorRTSam3. Takes a span of input pairs and processes
61 * them through the model to produce output pairs. This method handles the
62 * entire inference pipeline, including preprocessing, feature extraction,
63 * decoding, and post-processing to generate the final segmentation results.
64 * @param input: A span of input pairs containing the data to be processed.
65 * @return A vector of output pairs containing the results of the inference.
66 */
67 std::vector<IDataset::Pair>
68 forward(const std::span<IDataset::Pair> &input) override;
69
70 protected:
71 // virtual ~TensorRTSam3() = default;
72
73 /* Loads the TensorRT engines for the vision encoder, text encoder, geometry
74 * encoder, and decoder. This method is responsible for initializing the
75 * engines based on the provided model paths and ensuring that they are ready
76 * for inference. It returns true if all engines are loaded successfully, and
77 * false otherwise.
78 * @return A boolean indicating whether the engines were loaded successfully.
79 */
81
82 static std::string load_bytes_from_file(const std::string &file_path);
83
85 const cv::Mat &image, const std::string &label,
86 const std::vector<std::pair<std::string, std::array<float, 4>>> &boxes);
87
88 /*
89 // Core implementation
90 virtual InferResultArray forwards(const std::vector<Sam3Input> &inputs,
91 bool return_mask = false,
92 void *stream = nullptr) override;
93 virtual InferResultArray forwards(const std::vector<Sam3Input> &inputs,
94 const std::string &geom_label,
95 bool return_mask = false,
96 void *stream = nullptr) override;
97 */
98
99 private:
100 // Define internal structure for flattening Prompt
101 struct PromptMeta {
102 int image_idx; // Which image this Prompt belongs to
103 int original_idx; // The index of this Prompt in the original image vector
104 const Sam3PromptUnit *ptr; // Pointer to the original Prompt data
105 };
106
107 // Internal processing function
108 void preprocess(const TensorRTData &input, int ibatch, void *stream);
109
110 bool encode_image(int batch_size, void *stream);
111
112 // Modification: Gather features, collect data from Vision features according
113 // to the image index corresponding to the current Prompt Batch
114 void gather_vision_features(const std::vector<PromptMeta> &batch_prompts,
115 int batch_size, void *stream);
116
117 // Modified encoding function, based on the current batch size
118 bool encode_text(const std::vector<PromptMeta> &batch_prompts, int batch_size,
119 void *stream);
120 bool encode_boxes(const std::vector<PromptMeta> &batch_prompts,
121 int batch_size, int max_boxes, void *stream);
122 bool decode(int batch_size, int prompt_len, void *stream);
123
124 // Post-processing
125 void postprocess(InferResult &image_result, int batch_idx, int image_idx,
126 const std::string &label, const int label_id,
127 float confidence_threshold, bool return_mask, void *stream);
128
129 /* Allocates memory for all the necessary buffers used during inference. This
130 * method is designed to be called only once during the initialization phase
131 * of the model, and it sets up the memory structures based on the maximum
132 * batch sizes and input dimensions defined in the class. It ensures that all
133 * buffers are properly allocated and ready for use during the forward pass,
134 * optimizing memory usage and performance on the GPU.
135 */
136 void allocate_memory_once();
137
138 void set_binding_dim(std::shared_ptr<TensorRT::Engine> &engine,
139 int binding_index, const std::vector<int> &dims);
140
141 private:
142 // Configuration
143 bool isdynamic_model_ = true;
144 int input_image_width_ = 1008;
145 int input_image_height_ = 1008;
146 int gpu_id_ = 0;
147
148 // --- Batch processing limit configuration ---
149 // Can be adjusted according to VRAM size
150 const int max_image_batch_ =
151 2; // This Vision Encoder is relatively large, limit the number of images
152 // processed simultaneously
153 const int max_prompt_batch_ =
154 4; // Decoder is smaller, but VRAM is limited, limit the number of Prompts
155 // decoded each time
156 const int max_boxes_per_prompt_ =
157 20; // Preset maximum number of supported Boxes
158
159 // State variables
160 std::vector<std::pair<int, int>>
161 original_image_sizes_; // Size: max_image_batch_
162 int num_queries_ = 200;
163 int mask_height_ = 288;
164 int mask_width_ = 288;
165
166 // Model path
167 std::string vision_encoder_path_;
168 std::string text_encoder_path_;
169 std::string geometry_encoder_path_;
170 std::string decoder_path_;
171
172 // TRT engine
173 std::shared_ptr<TensorRT::Engine> vision_encoder_trt_;
174 std::shared_ptr<TensorRT::Engine> text_encoder_trt_;
175 std::shared_ptr<TensorRT::Engine> decoder_trt_;
176 std::shared_ptr<TensorRT::Engine> geometry_encoder_trt_;
177
178 // std::unordered_map<
179 // std::string, std::pair<std::array<int64_t, 32>, std::array<int64_t,
180 // 32>>> text_input_map_;
181 // INFO:
182 // The first array is for input_ids, the second is for attention_mask. The
183 // last int is the prompt ID
184 std::unordered_map<std::string, std::tuple<std::array<int64_t, 32>,
185 std::array<int64_t, 32>, int>>
186 text_input_map_;
187
188 // --- Memory management ---
190 1.0f / 127.5f, -1.0f, norm_image::ChannelType::SwapRB);
191
192 std::vector<int> vision_input_shape_;
193 std::vector<int> fpn_feat_0_shape_;
194 std::vector<int> text_ids_shape_;
195 std::vector<int> geom_box_shape_;
196
197 // Image Batch buffers (Size: max_image_batch_)
198 tensor::Memory<float> preprocessed_images_;
199 std::vector<std::shared_ptr<tensor::Memory<uint8_t>>> original_images_buf_;
200 tensor::Memory<float> affine_matrix_;
201 // Mask post-processing requires the corresponding matrix of the original
202 // image (Size: max_image_batch_)
203 tensor::Memory<float> mask_affine_matrix_;
204
205 // Vision Encoder Outputs (Size: max_image_batch_)
206 tensor::Memory<float> fpn_feat_0_;
207 tensor::Memory<float> fpn_feat_1_;
208 tensor::Memory<float> fpn_feat_2_;
209 tensor::Memory<float> fpn_pos_2_;
210
211 // Decoder Input Buffers (Size: max_prompt_batch_)
212 // These are gathered from Vision Output
213 tensor::Memory<float> fpn_feat_0_gather_;
214 tensor::Memory<float> fpn_feat_1_gather_;
215 tensor::Memory<float> fpn_feat_2_gather_;
216 tensor::Memory<float> fpn_pos_2_gather_;
217
218 // Prompt Inputs (Size: max_prompt_batch_)
219 tensor::Memory<int64_t> text_input_ids_;
220 tensor::Memory<int64_t> text_attention_mask_;
221
222 tensor::Memory<float> geom_boxes_;
223 tensor::Memory<int64_t> geom_labels_;
224
225 tensor::Memory<float> text_features_;
226 tensor::Memory<bool> text_mask_;
227
228 tensor::Memory<float> geom_features_;
229 tensor::Memory<bool> geom_mask_;
230
231 // Used to store the results of pre-set geometry models
232 std::unordered_map<std::string, std::shared_ptr<tensor::Memory<float>>>
233 geom_features_cache_;
234 std::unordered_map<std::string, std::shared_ptr<tensor::Memory<bool>>>
235 geom_mask_cache_;
236
237 tensor::Memory<float> prompt_features_;
238 tensor::Memory<bool> prompt_mask_;
239
240 // Decoder Output (Size: max_prompt_batch_)
241 tensor::Memory<float> pred_masks_;
242 tensor::Memory<float> pred_boxes_;
243 tensor::Memory<float> pred_logits_;
244 tensor::Memory<float> presence_logits_;
245
246 // Postprocess (Size: max_prompt_batch_)
247 tensor::Memory<float> filter_boxes_;
248 tensor::Memory<float> filter_scores_;
249 tensor::Memory<int> filter_indices_;
250 tensor::Memory<int> box_count_;
251 tensor::Memory<uint8_t> mask_buffer_;
253 box_affine_matrices_; // Matrix for each Box during Mask recovery
254
255 // Tokenizer
256 std::unique_ptr<tokenizers::Tokenizer> tokenizer_;
257};
258} // namespace ReUseX::vision::tensor_rt
bool setup_geometry_input(const cv::Mat &image, const std::string &label, const std::vector< std::pair< std::string, std::array< float, 4 > > > &boxes)
static std::unique_ptr< TensorRTSam3 > create(const std::filesystem::path &model_path)
TensorRTSam3(const std::string vision_encoder_path, const std::string text_encoder_path, const std::string geometry_encoder_path, const std::string decoder_path, const std::string tokenizer_path, int gpu_id)
static std::string load_bytes_from_file(const std::string &file_path)
std::vector< IDataset::Pair > forward(const std::span< IDataset::Pair > &input) override
std::vector< DetectionBox > DetectionBoxArray
Convenience alias for a collection of DetectionBox results.
Definition object.hpp:210
static Norm alpha_beta(float alpha, float beta=0, ChannelType channel_type=ChannelType::None)