// Alias for a vector of ReUseX::vision::common::object::DetectionBoxArray.
// NOTE(review): presumably one DetectionBoxArray per input image — confirm
// against the code that fills it.
32 using InferResultArray =
33 std::vector<::ReUseX::vision::common::object::DetectionBoxArray>;
47 const std::string text_encoder_path,
48 const std::string geometry_encoder_path,
49 const std::string decoder_path,
const std::string tokenizer_path,
// Static factory: returns a uniquely-owned TensorRTSam3 built from model_path.
// NOTE(review): model_path is presumably a directory (or base path) from which
// the encoder/decoder engines and the tokenizer are located — confirm in the
// definition, including what is returned on failure.
57 static std::unique_ptr<TensorRTSam3>
58 create(
const std::filesystem::path &model_path);
// Overrides a base-class virtual: takes a span of IDataset::Pair and returns a
// vector of IDataset::Pair.
// NOTE(review): presumably runs the full inference pipeline and yields one
// output pair per input pair — confirm in the definition.
67 std::vector<IDataset::Pair>
68 forward(
const std::span<IDataset::Pair> &input)
override;
85 const cv::Mat &image,
const std::string &label,
86 const std::vector<std::pair<std::string, std::array<float, 4>>> &boxes);
// Declares the preprocessing step for a single TensorRTData input.
// NOTE(review): ibatch is presumably the slot of this input within the current
// batch, and stream presumably an opaque CUDA stream handle — confirm in the
// definition.
108 void preprocess(
const TensorRTData &input,
int ibatch,
void *stream);
// Declares the image-encoding step; returns bool.
// NOTE(review): presumably runs the vision encoder engine for batch_size
// images on stream and returns true on success — confirm in the definition.
110 bool encode_image(
int batch_size,
void *stream);
// Declares a step that receives the batch's prompt metadata, a batch size and
// a stream pointer; returns nothing.
// NOTE(review): presumably arranges vision-encoder outputs so that each prompt
// in batch_prompts is paired with the features of its image — confirm.
114 void gather_vision_features(
const std::vector<PromptMeta> &batch_prompts,
115 int batch_size,
void *stream);
118 bool encode_text(
const std::vector<PromptMeta> &batch_prompts,
int batch_size,
// Declares the box-prompt encoding step; returns bool.
// NOTE(review): presumably runs the geometry encoder over the boxes carried by
// batch_prompts, with max_boxes as the per-prompt box count the input is
// padded to, and returns true on success — confirm in the definition.
120 bool encode_boxes(
const std::vector<PromptMeta> &batch_prompts,
121 int batch_size,
int max_boxes,
void *stream);
// Declares the decoding step; returns bool.
// NOTE(review): presumably runs the decoder engine for batch_size prompts of
// length prompt_len on stream and returns true on success — confirm.
122 bool decode(
int batch_size,
int prompt_len,
void *stream);
// Declares the postprocessing step. image_result is taken by non-const
// reference, so it is the parameter this step can write results into.
// NOTE(review): batch_idx presumably indexes the decoder batch and image_idx
// the source image; label/label_id presumably tag the produced detections;
// confidence_threshold presumably filters low-score queries; return_mask
// presumably controls whether masks are emitted — confirm each in the
// definition.
125 void postprocess(InferResult &image_result,
int batch_idx,
int image_idx,
126 const std::string &label,
const int label_id,
127 float confidence_threshold,
bool return_mask,
void *stream);
// NOTE(review): the name suggests a one-time allocation of the working buffers
// — confirm in the definition what is allocated and what guards repeat calls.
136 void allocate_memory_once();
// Declares a helper taking an engine handle (non-const shared_ptr reference),
// a binding index and a list of integer dimensions.
// NOTE(review): presumably sets the runtime shape of binding binding_index on
// engine to dims — confirm in the definition.
138 void set_binding_dim(std::shared_ptr<TensorRT::Engine> &engine,
139 int binding_index,
const std::vector<int> &dims);
// Defaults to true.
// NOTE(review): presumably marks the engines as built with dynamic shapes —
// confirm where it is read.
143 bool isdynamic_model_ =
true;
// Input image width and height, both defaulting to 1008.
// NOTE(review): presumably the resolution the vision encoder expects — confirm.
144 int input_image_width_ = 1008;
145 int input_image_height_ = 1008;
150 const int max_image_batch_ =
153 const int max_prompt_batch_ =
156 const int max_boxes_per_prompt_ =
// Vector of (int, int) pairs.
// NOTE(review): presumably the pre-resize size of each source image; confirm
// whether the pair order is (width, height) or (height, width).
160 std::vector<std::pair<int, int>>
161 original_image_sizes_;
// Defaults to 200.
// NOTE(review): presumably the number of decoder queries per prompt — confirm.
162 int num_queries_ = 200;
// Mask height and width, both defaulting to 288.
// NOTE(review): presumably the resolution of the decoder's mask output — confirm.
163 int mask_height_ = 288;
164 int mask_width_ = 288;
// Paths stored as strings, one per model component (vision encoder, text
// encoder, geometry encoder, decoder).
// NOTE(review): presumably the serialized TensorRT engine files — confirm.
167 std::string vision_encoder_path_;
168 std::string text_encoder_path_;
169 std::string geometry_encoder_path_;
170 std::string decoder_path_;
// Shared handles to one TensorRT::Engine per model component.
// NOTE(review): presumably loaded from the matching *_path_ members — confirm
// where they are assigned.
173 std::shared_ptr<TensorRT::Engine> vision_encoder_trt_;
174 std::shared_ptr<TensorRT::Engine> text_encoder_trt_;
175 std::shared_ptr<TensorRT::Engine> decoder_trt_;
176 std::shared_ptr<TensorRT::Engine> geometry_encoder_trt_;
184 std::unordered_map<std::string, std::tuple<std::array<int64_t, 32>,
185 std::array<int64_t, 32>,
int>>
// Integer dimension lists.
// NOTE(review): presumably the binding shapes of the vision input, the first
// FPN feature output, the text-id input and the geometry box input — confirm
// the axis order where they are filled.
192 std::vector<int> vision_input_shape_;
193 std::vector<int> fpn_feat_0_shape_;
194 std::vector<int> text_ids_shape_;
195 std::vector<int> geom_box_shape_;
// Vector of shared uint8_t tensor::Memory buffers.
// NOTE(review): presumably one buffer per batch slot holding the raw source
// image before preprocessing — confirm.
199 std::vector<std::shared_ptr<tensor::Memory<uint8_t>>> original_images_buf_;
// Map from a string key to a shared float tensor::Memory buffer.
// NOTE(review): presumably caches geometry-encoder features per prompt/label
// key — confirm what the key is and when entries are invalidated.
232 std::unordered_map<std::string, std::shared_ptr<tensor::Memory<float>>>
233 geom_features_cache_;
234 std::unordered_map<std::string, std::shared_ptr<tensor::Memory<bool>>>
253 box_affine_matrices_;
// Uniquely-owned tokenizers::Tokenizer instance.
// NOTE(review): presumably used to turn text prompts into token ids for the
// text encoder — confirm where it is created and called.
256 std::unique_ptr<tokenizers::Tokenizer> tokenizer_;
TensorRTSam3(const std::string vision_encoder_path, const std::string text_encoder_path, const std::string geometry_encoder_path, const std::string decoder_path, const std::string tokenizer_path, int gpu_id)