从LeNet到实战：手把手教你用ONNX Runtime和TensorRT实现多Batch推理（Python/C++双版本）

张

张建站

2026/6/1 17:56:13

10分钟阅读

# 从LeNet到实战：手把手教你用ONNX Runtime和TensorRT实现多Batch推理（Python/C++双版本）

在工业级AI部署中，处理批量数据是提升推理效率的关键。本文将以经典LeNet模型为例，深入对比ONNX Runtime与TensorRT在多Batch推理中的实现差异，涵盖Python和C++双语言版本。我们将从工程化角度剖析内存管理、流水线设计等核心问题，帮助开发者掌握生产环境部署的关键技术。

## 1. 环境准备与模型导出

### 1.1 LeNet模型的多Batch适配

传统LeNet模型输入为单张28x28灰度图像。为支持多Batch推理，需在模型导出时显式指定动态Batch维度。以PyTorch导出ONNX为例：

```python
import torch
import torch.nn as nn

class LeNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16*4*4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.max_pool2d(x, 2)
        x = torch.relu(self.conv2(x))
        x = torch.max_pool2d(x, 2)
        x = x.view(x.size(0), -1)  # 保持Batch维度
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

model = LeNet()
dummy_input = torch.randn(2, 1, 28, 28)  # Batch=2的示例输入
torch.onnx.export(model, dummy_input, "lenet.onnx",
                  input_names=["input"], output_names=["output"],
                  dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}})
```

关键修改点：

- view操作保留Batch维度
- 导出时通过dynamic_axes指定动态Batch

### 1.2 TensorRT引擎构建

TensorRT需要从ONNX转换生成优化后的引擎文件：

```python
import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

with open("lenet.onnx", "rb") as f:
    parser.parse(f.read())

config = builder.create_builder_config()
config.max_workspace_size = 1 << 30  # 1GB
profile = builder.create_optimization_profile()
# 设置动态Batch范围
profile.set_shape("input", (1,1,28,28), (2,1,28,28), (4,1,28,28))
config.add_optimization_profile(profile)

engine = builder.build_engine(network, config)
with open("lenet.engine", "wb") as f:
    f.write(engine.serialize())
```

## 2. ONNX Runtime多Batch推理实现

### 2.1 Python版本

```python
import cv2
import numpy as np
import onnxruntime

def preprocess_image(image_path):
    img = cv2.imread(image_path, 0)
    blob = cv2.dnn.blobFromImage(img, 1/255., (28,28), swapRB=True)
    return blob

# 初始化推理会话
onnx_session = onnxruntime.InferenceSession(
    "lenet.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)

# 构建多Batch输入
batch_images = ["2.png", "10.png", "3.png", "7.png"]  # 示例图像
batch_data = np.concatenate([preprocess_image(img) for img in batch_images])

# 执行推理
input_name = onnx_session.get_inputs()[0].name
outputs = onnx_session.run(None, {input_name: batch_data})[0]

# 解析结果
predictions = np.argmax(outputs, axis=1)
print(f"Batch predictions: {predictions}")
```

性能优化技巧：

- 使用IOBinding减少数据拷贝
- 设置线程数优化CPU推理

```python
options = onnxruntime.SessionOptions()
options.intra_op_num_threads = 4
options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
```

### 2.2 C++版本

```cpp
#include <onnxruntime_cxx_api.h>
#include <opencv2/opencv.hpp>
#include <numeric>

struct ONNXModel {
    Ort::Env env;
    Ort::Session session;
    Ort::AllocatorWithDefaultOptions allocator;

    ONNXModel(const wchar_t* model_path)
        : env(ORT_LOGGING_LEVEL_WARNING, "onnx"),
          session(env, model_path, Ort::SessionOptions{}) {}
};

std::vector<float> preprocess_image(const cv::Mat& image) {
    cv::Mat processed;
    image.convertTo(processed, CV_32F, 1.0/255);
    return std::vector<float>(processed.begin<float>(), processed.end<float>());
}

int main() {
    ONNXModel model(L"lenet.onnx");

    // 准备Batch数据
    std::vector<cv::Mat> images = {
        cv::imread("2.png", 0),
        cv::imread("10.png", 0)
    };

    // 合并Batch
    std::vector<float> input_tensor;
    for (const auto& img : images) {
        auto img_data = preprocess_image(img);
        input_tensor.insert(input_tensor.end(), img_data.begin(), img_data.end());
    }

    // 创建输入Tensor
    std::vector<int64_t> input_shape = {2, 1, 28, 28};
    Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
        Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault),
        input_tensor.data(), input_tensor.size(),
        input_shape.data(), input_shape.size()
    );

    // 执行推理
    const char* input_names[] = {"input"};
    const char* output_names[] = {"output"};
    auto outputs = model.session.Run(
        Ort::RunOptions{nullptr},
        input_names, &input_tensor, 1,
        output_names, 1
    );

    // 解析输出
    float* output_data = outputs[0].GetTensorData<float>();
    std::vector<int> predictions = {
        std::max_element(output_data, output_data+10) - output_data,
        std::max_element(output_data+10, output_data+20) - (output_data+10)
    };

    std::cout << "Predictions: ";
    for (auto pred : predictions) std::cout << pred << " ";
    return 0;
}
```

## 3. TensorRT多Batch推理实现

### 3.1 Python版本

```python
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit

class TRTInference:
    def __init__(self, engine_path):
        self.logger = trt.Logger(trt.Logger.WARNING)
        with open(engine_path, "rb") as f, trt.Runtime(self.logger) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()

        # 绑定输入输出
        self.bindings = []
        for binding in self.engine:
            size = trt.volume(self.engine.get_binding_shape(binding))
            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
            if self.engine.binding_is_input(binding):
                self.input_shape = self.engine.get_binding_shape(binding)
                self.input_size = size
                self.input_dtype = dtype
                device_mem = cuda.mem_alloc(size * dtype.itemsize)
            else:
                self.output_size = size
                self.output_dtype = dtype
                device_mem = cuda.mem_alloc(size * dtype.itemsize)
            self.bindings.append(int(device_mem))

        self.stream = cuda.Stream()

    def infer(self, batch_data):
        # 设置动态Batch维度
        self.context.set_binding_shape(0, batch_data.shape)

        # 拷贝输入数据
        host_input = cuda.pagelocked_empty(self.input_size, dtype=self.input_dtype)
        np.copyto(host_input, batch_data.ravel())
        cuda.memcpy_htod_async(self.bindings[0], host_input, self.stream)

        # 执行推理
        self.context.execute_async_v2(
            bindings=self.bindings,
            stream_handle=self.stream.handle
        )

        # 获取输出
        host_output = cuda.pagelocked_empty(self.output_size, dtype=self.output_dtype)
        cuda.memcpy_dtoh_async(host_output, self.bindings[1], self.stream)
        self.stream.synchronize()

        return host_output.reshape(batch_data.shape[0], -1)

# 使用示例
trt_engine = TRTInference("lenet.engine")
batch_images = np.concatenate([
    cv2.dnn.blobFromImage(cv2.imread("2.png", 0), 1/255., (28,28)),
    cv2.dnn.blobFromImage(cv2.imread("10.png", 0), 1/255., (28,28))
])
output = trt_engine.infer(batch_images)
print("Predictions:", np.argmax(output, axis=1))
```

### 3.2 C++版本

```cpp
#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <opencv2/opencv.hpp>

class TensorRTInference {
    nvinfer1::ICudaEngine* engine;
    nvinfer1::IExecutionContext* context;
    void* bindings[2];
    cudaStream_t stream;

public:
    TensorRTInference(const std::string& engine_path) {
        std::ifstream engine_file(engine_path, std::ios::binary);
        engine_file.seekg(0, std::ios::end);
        size_t size = engine_file.tellg();
        engine_file.seekg(0, std::ios::beg);
        std::vector<char> engine_data(size);
        engine_file.read(engine_data.data(), size);

        nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
        engine = runtime->deserializeCudaEngine(engine_data.data(), size);
        context = engine->createExecutionContext();

        // 分配设备内存
        for (int i = 0; i < engine->getNbBindings(); i++) {
            size_t binding_size = getSizeByDim(engine->getBindingDimensions(i)) * sizeof(float);
            cudaMalloc(&bindings[i], binding_size);
        }
        cudaStreamCreate(&stream);
    }

    std::vector<int> infer(const std::vector<cv::Mat>& images) {
        // 预处理并合并Batch
        float* host_input = new float[images.size() * 1 * 28 * 28];
        for (size_t i = 0; i < images.size(); i++) {
            cv::Mat processed;
            images[i].convertTo(processed, CV_32F, 1.0/255);
            memcpy(host_input + i*28*28, processed.data, 28*28*sizeof(float));
        }

        // 拷贝到设备
        cudaMemcpyAsync(bindings[0], host_input,
                        images.size()*1*28*28*sizeof(float),
                        cudaMemcpyHostToDevice, stream);

        // 设置动态Batch
        nvinfer1::Dims input_dims = engine->getBindingDimensions(0);
        input_dims.d[0] = images.size();
        context->setBindingDimensions(0, input_dims);

        // 执行推理
        context->enqueueV2(bindings, stream, nullptr);

        // 获取输出
        float host_output[20];  // 假设最大Batch=2
        cudaMemcpyAsync(host_output, bindings[1],
                        images.size()*10*sizeof(float),
                        cudaMemcpyDeviceToHost, stream);
        cudaStreamSynchronize(stream);

        // 解析结果
        std::vector<int> predictions;
        for (size_t i = 0; i < images.size(); i++) {
            predictions.push_back(std::max_element(
                host_output + i*10, host_output + (i+1)*10
            ) - (host_output + i*10));
        }

        delete[] host_input;
        return predictions;
    }

    ~TensorRTInference() {
        cudaFree(bindings[0]);
        cudaFree(bindings[1]);
        cudaStreamDestroy(stream);
        context->destroy();
        engine->destroy();
    }
};
```

## 4. 工程化部署关键考量

### 4.1 性能对比与选型建议

| 特性 | ONNX Runtime | TensorRT |
| --- | --- | --- |
| 部署复杂度 | 低（单一DLL依赖） | 中（需CUDA环境） |
| 硬件支持 | CPU/GPU/专用加速器 | NVIDIA GPU only |
| 动态Batch支持 | 完善 | 需要显式配置 |
| 延迟（Batch=2） | 12ms（CPU）/ 5ms（GPU） | 3ms |
| 内存占用 | 中等 | 低（显存优化） |
| 适用场景 | 多硬件部署/快速原型开发 | 高性能GPU服务器部署 |

选型建议：

- 当需要跨平台部署或快速验证时，选择ONNX Runtime
- 当追求极致性能且运行在NVIDIA环境时，选择TensorRT
- 对于边缘设备，考虑ONNX Runtime+OpenVINO组合

### 4.2 常见问题解决方案

**内存管理陷阱**

- ONNX Runtime内存泄漏：
  - C++中确保Ort::Value生命周期管理
  - 使用Ort::Allocator统一管理内存
- TensorRT显存碎片：

```python
# 在长时间运行的推理服务中定期重置context
def reset_context(trt_engine):
    trt_engine.context = trt_engine.engine.create_execution_context()
```

**动态Batch处理技巧**

预处理阶段实现队列缓冲：

```python
class BatchProcessor:
    def __init__(self, batch_size=4):
        self.batch_queue = []
        self.batch_size = batch_size

    def add_image(self, image):
        self.batch_queue.append(image)
        if len(self.batch_queue) >= self.batch_size:
            return self.process_batch()
        return None

    def process_batch(self):
        batch = np.stack(self.batch_queue)
        self.batch_queue.clear()
        return batch
```

### 4.3 生产环境最佳实践

**服务化部署架构**

```
Client → Load Balancer → [Inference Server x N] → Result Aggregator
                                  ↑
                           Model Repository
```

**性能监控指标**

- 吞吐量（requests/sec）
- 平均/百分位延迟
- GPU利用率
- 显存占用率

**自动化测试方案**

```python
def benchmark(model, batch_sizes=[1,2,4,8], iterations=100):
    results = {}
    for bs in batch_sizes:
        dummy_input = np.random.randn(bs, 1, 28, 28).astype(np.float32)
        start = time.time()
        for _ in range(iterations):
            model.infer(dummy_input)
        avg_time = (time.time()-start)/iterations
        results[bs] = avg_time*1000  # ms
    return results
```

Sora 2短片爆款公式（含17个已验证Prompt模板+时序控制参数表）

更多请点击： https://codechina.net 第一章：Sora 2短片爆款底层逻辑与创作范式演进 Sora 2并非单纯的技术迭代，而是视频生成范式从“帧序列拟合”向“时空语义编排”的结构性跃迁。其爆款短片的共性不在于分辨率或时长，而在于对人…...

2026/6/1 17:54:03 阅读更多 →

智能开关双控方案：从传统电路到WiFi模块的接线与避坑指南

1. 项目概述：从传统开关到智能联动的跨越如果你和我一样，是个喜欢折腾家里各种电器的“技术宅”，那么对传统墙壁开关的局限性一定深有体会。下班回家摸黑找开关、出门后总在担心灯有没有关、想躺在床上关掉客厅的灯……这些场景催生了我们对智…...

2026/6/1 17:51:37 阅读更多 →

树莓派GPIO自制复古游戏手柄：从硬件连接到GPIOnext配置全攻略

1. 项目概述与核心思路想给家里的树莓派复古游戏机配个趁手又有个性的手柄，但市面上的成品要么太贵，要么手感不对味？或者，你手头正好有一堆闲置的微动开关和面包板，想折腾点硬核又有趣的东西？那这个基于树莓…...

2026/6/1 17:46:15 阅读更多 →

掌握Markdown实时预览：打造高效写作工作流的3个关键策略

掌握Markdown实时预览：打造高效写作工作流的3个关键策略【免费下载链接】markn Lightweight markdown viewer. 项目地址: https://gitcode.com/gh_mirrors/ma/markn 在当今数字创作时代，Markdown已成为技术文档、博客文章和个人笔记的首选格式。…...

2026/6/1 1:01:46 阅读更多 →

Win10/Win11下Realtek 8188GU网卡驱动感叹号？别急着扔，试试这个手动安装的野路子

Realtek 8188GU网卡驱动故障深度修复指南：从原理到实战当设备管理器里那个顽固的黄色感叹号挥之不去，而你已经尝试了所有"标准操作"——Windows自动更新、第三方驱动工具、甚至重启大法——却依然无济于事时，是时候换个思路了。这篇…...

2026/5/31 0:02:02 阅读更多 →

前轮驱动自行车机器人建模与自适应控制策略优化【附代码】

✨ 长期致力于自行车机器人、前轮驱动、Lagrange方程、自适应模糊控制、RBF网络自适应控制研究工作，擅长数据搜集与处理、建模仿真、程序编写、仿真设计。 ✅ 专业定制毕设、代码 ✅ 如需沟通交流，点击《获取方式》 （1）基于瞬时转…...

2026/5/31 0:03:05 阅读更多 →

ModTheSpire终极指南：5分钟安全安装《杀戮尖塔》模组管理器

ModTheSpire终极指南：5分钟安全安装《杀戮尖塔》模组管理器【免费下载链接】ModTheSpire External mod loader for Slay The Spire 项目地址: https://gitcode.com/gh_mirrors/mo/ModTheSpire 还在为《杀戮尖塔》模组安装的复杂流程而头疼吗？Mod…...

2026/5/31 0:04:06 阅读更多 →