【TensorRT】TensorRT从安装到推理——Python 环境下 MobileNetV4 三分类任务
我想开发一个基于深度学习的分类小软件,逐渐了解到了TensorRT在模型推理速度上的优势,经过一下午资料的查找实现了将onnx模型转为TensorRT格式模型的推理及测试过程。将实现过程记录下来方便日后查看。
实际上本文的测试方法并不准确:在我的代码中,TensorRT分配和释放显存的时间、数据预处理的时间都被计入了推理时间。如果要准确地测量推理时间,应该只测量模型推理本身的耗时,而将其余步骤排除。尽管如此,TensorRT在我的设备上也比ONNX快得多。
本文实验设备是MX350显卡 2G显存
一 、安装TensorRT
点击TensorRT下载链接,选择合适的TensorRT版本下载,读者选择使用TensorRT进行推理,默认已经配置好cuda和cudnn环境,如果没配置好请移步这篇博客Windows配置深度学习环境(从查询合适的torch版本开始)——torch+CUDA+cuDNN
TensorRT与cuda版本对应方式查看如下:
-
点击TensorRT版本
-
点击同意
-
点击版本号
-
查看cuda版本是否符合你设备,点击下载即可
二、环境配置
- 下载后得到文件结构如下所示
- 添加环境变量,右键此电脑点击属性,根据图中序号依次点击并添加环境变量
我的环境变量如下所示
D:\Software\TensorRT-8.6.1.6\lib
D:\Software\TensorRT-8.6.1.6\bin
三、模型转换
打开命令行窗口,切换到D:\Software\TensorRT-8.6.1.6\bin目录,执行如下命令
trtexec --onnx=mymodel.onnx --saveEngine=model.trt --fp16
这里的 --fp16 应该也可以改成 --int8,但是精度损失会有点大,我没有实验
这个mymodel.onnx需要你自己的onnx文件名,这个model.trt 就随便起名字了
如下图所示为转换成功
四、TensorRT与ONNX推理速度与精度测试
(1)推理时间测试
- TensorRTPredictor
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import os
import numpy as np
import time
from typing import Tuple
from utils.utils import softmax, preprocess_image


class TensorRTPredictor:
    """Run a serialized TensorRT engine, reusing one set of device buffers.

    Device input/output buffers, the page-locked host output buffer and the
    CUDA stream are created once in ``__init__`` and reused by every
    ``warmup`` / ``infer`` call.
    """

    def __init__(self, engine_path: str):
        """Load the engine from ``engine_path`` and allocate all buffers."""
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.engine = self._load_engine(engine_path)
        self.context = self.engine.create_execution_context()
        # Tensor 0 is treated as the input and tensor 1 as the output.
        self.input_shape = tuple(
            self.engine.get_tensor_shape(self.engine.get_tensor_name(0))
        )
        self.output_shape = tuple(
            self.engine.get_tensor_shape(self.engine.get_tensor_name(1))
        )
        # Allocate device / host buffers once.
        self._setup_buffers()
        # One stream shared by all copies and executions.
        self.stream = cuda.Stream()
        self.is_warmed_up = False

    def _load_engine(self, engine_path: str) -> trt.ICudaEngine:
        """Deserialize the engine file and print how long loading took."""
        load_start_time = time.perf_counter()
        with open(engine_path, "rb") as f, trt.Runtime(self.logger) as runtime:
            engine = runtime.deserialize_cuda_engine(f.read())
        load_time = (time.perf_counter() - load_start_time) * 1000
        print(f"加载引擎时间: {load_time:.2f} ms")
        return engine

    def _setup_buffers(self):
        """Allocate device input/output buffers and the host output buffer."""
        itemsize = np.dtype(np.float32).itemsize
        input_size = int(np.prod(self.input_shape)) * itemsize
        output_size = int(np.prod(self.output_shape)) * itemsize
        self.d_input = cuda.mem_alloc(input_size)
        self.d_output = cuda.mem_alloc(output_size)
        # Page-locked host memory that receives the device output.
        self.h_output = cuda.pagelocked_empty(self.output_shape, dtype=np.float32)

    def warmup(self, iterations: int = 10):
        """Run ``iterations`` dummy inferences; a second call is a no-op."""
        if self.is_warmed_up:
            print("模型已经预热,跳过预热步骤")
            return
        warmup_start_time = time.perf_counter()
        dummy_input = np.random.rand(*self.input_shape).astype(np.float32)
        bindings = [int(self.d_input), int(self.d_output)]
        for _ in range(iterations):
            cuda.memcpy_htod_async(self.d_input, dummy_input, self.stream)
            self.context.execute_async_v2(
                bindings=bindings, stream_handle=self.stream.handle
            )
            self.stream.synchronize()
        warmup_time = (time.perf_counter() - warmup_start_time) * 1000
        print(f" 预热时间: {warmup_time:.2f} ms")
        self.is_warmed_up = True

    def infer(self, image) -> Tuple[np.ndarray, np.ndarray]:
        """Preprocess ``image`` and run one inference.

        Args:
            image: Whatever ``preprocess_image`` accepts (the test script in
                this article passes an image path).

        Returns:
            ``(outputs, confidence)``: a copy of the raw output buffer and
            ``softmax`` applied to its first row.

        Raises:
            ValueError: If the preprocessed array cannot be reshaped to the
                engine's input shape.
        """
        image = preprocess_image(image)
        if not self.is_warmed_up:
            print("警告:模型尚未预热,推理性能可能受影响")
        # The original added another batch axis here, which yields a 5-D array
        # when preprocess_image already returns a batched tensor (presumably
        # the case for utils.utils.preprocess_image - confirm). Reshaping to
        # the engine's own input shape accepts both layouts and rejects arrays
        # whose size does not match the device buffer.
        input_data = np.ascontiguousarray(image, dtype=np.float32).reshape(
            self.input_shape
        )
        cuda.memcpy_htod_async(self.d_input, input_data, self.stream)
        self.context.execute_async_v2(
            bindings=[int(self.d_input), int(self.d_output)],
            stream_handle=self.stream.handle,
        )
        cuda.memcpy_dtoh_async(self.h_output, self.d_output, self.stream)
        # Wait for the copy back before reading h_output.
        self.stream.synchronize()
        confidence = softmax(self.h_output[0])
        # Return a copy: h_output is overwritten by the next call.
        return self.h_output.copy(), confidence

    def __del__(self):
        """Free the device buffers if they were allocated."""
        for attr in ("d_input", "d_output"):
            buffer = getattr(self, attr, None)
            if buffer is not None:
                buffer.free()
        print("显存资源已释放")
- ONNXPredictor
import onnxruntime as ort
import numpy as np
from torchvision import transforms
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
import time
import os
from utils.utils import softmax, preprocess_image


class ONNXPredictor:
    """Image classifier backed by an ONNX Runtime session."""

    def __init__(self, model_path="mobilenetv4_hybrid_medium.onnx", size=224):
        """Create the session for ``model_path``.

        Args:
            model_path: Path of the ONNX model file.
            size: Side length used for the torchvision transform and as the
                fallback spatial size for warm-up input.
        """
        # Pick the execution provider (CUDA only, else CPU only).
        self.providers = self._get_available_providers()
        print(f"可用推理后端: {self.providers}")
        self.session = ort.InferenceSession(model_path, providers=self.providers)
        current_provider = self.session.get_providers()
        print(f"实际使用的推理后端: {current_provider}")
        self.input_name = self.session.get_inputs()[0].name
        self.output_name = self.session.get_outputs()[0].name
        self.size = size
        self.transform = self.build_transform(size)
        self.is_warmed_up = False

    def _get_available_providers(self):
        """Return ``['CUDAExecutionProvider']`` if available, else CPU.

        Raises:
            RuntimeError: If neither provider is available.
        """
        available_providers = ort.get_available_providers()
        for provider in ("CUDAExecutionProvider", "CPUExecutionProvider"):
            if provider in available_providers:
                return [provider]
        raise RuntimeError("没有可用的执行提供程序(既没有CUDA也没有CPU)")

    def build_transform(self, size: int):
        """Build the resize / center-crop / tensor / normalize pipeline."""
        return transforms.Compose([
            transforms.Resize(
                size, interpolation=transforms.InterpolationMode.BICUBIC
            ),
            transforms.CenterCrop(size),
            transforms.ToTensor(),
            transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
        ])

    def _warmup_shape(self):
        """Return the input shape to use for warm-up.

        Uses the dimensions declared by the model; any dimension that is not
        a positive integer (a dynamic axis) falls back to the matching entry
        of ``(1, 3, size, size)``.
        """
        fallback = (1, 3, self.size, self.size)
        declared = self.session.get_inputs()[0].shape
        if declared is None or len(declared) != len(fallback):
            return fallback
        return tuple(
            dim if isinstance(dim, int) and dim > 0 else default
            for dim, default in zip(declared, fallback)
        )

    def warmup(self, iterations=10):
        """Run ``iterations`` inferences on random input."""
        # The original hard-coded 1x3x224x224 regardless of the model.
        dummy_input = np.random.rand(*self._warmup_shape()).astype(np.float32)
        for _ in range(iterations):
            self.session.run([self.output_name], {self.input_name: dummy_input})
        self.is_warmed_up = True
        print(f"模型已预热 {iterations} 次")

    def preprocess(self, image):
        """Delegate to ``preprocess_image`` from ``utils.utils``."""
        return preprocess_image(image)

    def infer(self, image):
        """Preprocess ``image``, run the model, return ``(outputs, confidence)``.

        ``confidence`` is ``softmax`` applied to the first row of the output.
        """
        input_data = self.preprocess(image)
        outputs = self.session.run(
            [self.output_name], {self.input_name: input_data}
        )[0]
        confidence = softmax(outputs[0])
        return outputs, confidence
- Predictor
import numpy as np
from typing import Tuple
from onnxPredictor import ONNXPredictor
from TensorRTPredictor import TensorRTPredictor
import time
from typing import Optional


class Predictor:
    """Facade that selects a TensorRT or an ONNX backend by ``mode``."""

    def __init__(
        self,
        model_path: Optional[str] = None,
        mode: str = "speed",
        engine_path: Optional[str] = None,
        size: int = 224,
    ):
        """Create the backend predictor.

        Args:
            model_path: ONNX model path (required when ``mode="precision"``).
            mode: ``"speed"`` (TensorRT) or ``"precision"`` (ONNX).
            engine_path: TensorRT engine path (required when ``mode="speed"``).
            size: Input image size, forwarded to the ONNX backend.

        Raises:
            AssertionError: If ``mode`` is not one of the two allowed values.
            ValueError: If the path required by the selected mode is missing.
        """
        # Kept as an assert so the exception type seen by callers is unchanged.
        assert mode in ["speed", "precision"], "mode 必须是 'speed' 或 'precision'"
        self.mode = mode
        self.size = size
        if self.mode == "speed":
            if engine_path is None:
                raise ValueError("TensorRT 引擎路径必须提供当选择速度优先模式")
            print("初始化 TensorRT 预测器 (速度优先模式)...")
            self.predictor = TensorRTPredictor(engine_path)
        else:
            # Previously None was forwarded to the ONNX backend and failed
            # there with an unrelated error; reject it here instead.
            if model_path is None:
                raise ValueError("ONNX 模型路径必须提供当选择精度优先模式")
            print("初始化 ONNX 预测器 (精度优先模式)...")
            self.predictor = ONNXPredictor(model_path, size)
        self.is_warmed_up = False

    def warmup(self, iterations: int = 10):
        """Warm up the backend and remember that it was done."""
        self.predictor.warmup(iterations)
        self.is_warmed_up = True

    def infer(self, image) -> Tuple[np.ndarray, np.ndarray]:
        """Run the backend's ``infer`` and return ``(outputs, confidence)``.

        Both modes return the same pair, so no per-mode branching is needed.
        """
        if not self.is_warmed_up:
            print("警告: 模型尚未预热,性能可能受影响")
        outputs, confidence = self.predictor.infer(image)
        return outputs, confidence

    def get_backend(self) -> str:
        """Return ``"TensorRT"`` in speed mode, otherwise ``"ONNX"``."""
        return "TensorRT" if self.mode == "speed" else "ONNX"
- 测试代码
if __name__ == "__main__":
    # Timing script: run every image under the folder through the predictor
    # and report per-image and average latency. The timed interval covers the
    # whole infer() call, so it includes whatever preprocessing infer() does.
    PATHS = {
        "image_folder": "D:/Desktop/DATA/balance_bei_liao_hu/temp",  # image folder
        "engine": "../assets/weights/mnv4.engine",  # TensorRT engine file
        "model_path": "../assets/weights/mobilenetv4_hybrid_medium.onnx",  # ONNX model file
    }

    # Fail early when the inputs are missing.
    if not os.path.exists(PATHS["image_folder"]):
        print(f"错误: 图片文件夹不存在 -> {os.path.abspath(PATHS['image_folder'])}")
        raise SystemExit(1)
    if not os.path.exists(PATHS["engine"]):
        print(f"错误: 引擎文件不存在 -> {os.path.abspath(PATHS['engine'])}")
        raise SystemExit(1)

    # Collect image files recursively; compare extensions case-insensitively
    # so files such as "a.JPG" are not skipped.
    image_extensions = ('.jpg', '.png', '.bmp', '.jpeg')
    image_files = [
        os.path.join(root, file)
        for root, _, files in os.walk(PATHS["image_folder"])
        for file in files
        if file.lower().endswith(image_extensions)
    ]
    if not image_files:
        print(f"错误: 文件夹中没有图片文件 -> {PATHS['image_folder']}")
        raise SystemExit(1)

    predictor = Predictor(engine_path=PATHS["engine"], mode="speed", size=224)
    predictor.warmup(iterations=10)
    # Label the output with the backend actually in use; the original printed
    # "ONNX" even though the TensorRT backend was selected.
    backend = predictor.get_backend()

    total_time = 0
    processed = 0
    for image_path in image_files:
        try:
            print(f"处理图片: {image_path}")
            start_time = time.perf_counter()
            predictions, confidence = predictor.infer(image_path)
            inference_time = (time.perf_counter() - start_time) * 1000  # ms
            print(f" {backend} 推理时间: {inference_time:.2f} ms")
            print(f" {backend} 输出: {np.argmax(predictions)} (置信度: {np.max(confidence):.4f})")
            total_time += inference_time
            processed += 1
        except Exception as e:
            print(f"处理图片时出错: {image_path} -> {str(e)}")

    # Average over the images that were actually processed, not over all
    # files, so failures do not deflate the reported latency.
    if processed:
        avg_time = total_time / processed
        print(f"\n平均推理时间: {avg_time:.2f} ms")
    else:
        print("\n没有成功处理任何图片")
TensorRT推理150张224×224图片平均速度为6.40ms,而ONNX推理需要10ms左右
两种格式的模型分别预测了150张尺寸为224×224的三类图片,每一类有50张,调用TensorRT平均每张图片需要5.17ms,而onnx平均每张图片需要11.11ms,TensorRT模型的推理时间约缩短为onnx的二分之一。根据查找的资料显示,转换后的模型推理时间的缩短幅度可能与设备有关。
(2)精度测试
- TensorRT推理代码
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import cv2
import os
import numpy as np
import time
from typing import Tuple
from sklearn.metrics import classification_report, accuracy_score, f1_score
from collections import Counter


class TensorRTPredictor:
    """TensorRT predictor used for the accuracy test.

    Unlike a buffer-reusing design, this version allocates and frees device
    memory inside every ``warmup`` / ``infer`` call.
    """

    def __init__(self, engine_path: str):
        """Load the engine, create the context and warm up immediately."""
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.engine = self._load_engine(engine_path)
        self.context = self.engine.create_execution_context()
        # Tensor 0 is treated as the input and tensor 1 as the output.
        self.input_shape = tuple(
            self.engine.get_tensor_shape(self.engine.get_tensor_name(0))
        )
        self.output_shape = tuple(
            self.engine.get_tensor_shape(self.engine.get_tensor_name(1))
        )
        self.is_warmed_up = False
        self.warmup(iterations=10)

    def _load_engine(self, engine_path: str) -> "trt.ICudaEngine":
        """Deserialize the engine file and print how long loading took."""
        load_start_time = time.time()
        with open(engine_path, "rb") as f, trt.Runtime(self.logger) as runtime:
            engine = runtime.deserialize_cuda_engine(f.read())
        load_time = (time.time() - load_start_time) * 1000
        print(f"加载引擎时间: {load_time:.2f} ms")
        return engine

    def preprocess_image(self, image_path: str) -> np.ndarray:
        """Read an image and convert it to a normalized CHW array.

        Steps: resize to 224x224, BGR->RGB, HWC->CHW, scale to [0, 1], then
        normalize with the per-channel mean/std below. No batch axis is added.

        Raises:
            FileNotFoundError: If ``image_path`` does not exist.
            ValueError: If OpenCV cannot decode the file.
            RuntimeError: If any preprocessing step fails.
        """
        preprocess_start_time = time.time()
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"图像文件不存在: {os.path.abspath(image_path)}")
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError("无法读取图像,请检查文件格式和完整性")
        try:
            image = cv2.resize(image, (224, 224))
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            image = np.ascontiguousarray(
                image.transpose(2, 0, 1).astype(np.float32) / 255.0
            )
            mean = np.array([0.362, 0.279, 0.258]).reshape(3, 1, 1)
            std = np.array([0.222, 0.191, 0.185]).reshape(3, 1, 1)
            image = (image - mean) / std
        except Exception as e:
            # Chain the cause so the original traceback is preserved.
            raise RuntimeError(f"图像预处理失败: {str(e)}") from e
        preprocess_time = (time.time() - preprocess_start_time) * 1000
        print(f" 预处理时间: {preprocess_time:.2f} ms")
        return image

    def _buffer_sizes(self) -> Tuple[int, int]:
        """Return ``(input_bytes, output_bytes)`` for float32 tensors."""
        itemsize = np.dtype(np.float32).itemsize
        return (
            int(np.prod(self.input_shape)) * itemsize,
            int(np.prod(self.output_shape)) * itemsize,
        )

    @staticmethod
    def _free_buffers(*buffers) -> None:
        """Free every device allocation that was actually made."""
        for buffer in buffers:
            if buffer is not None:
                buffer.free()

    def warmup(self, iterations: int = 10):
        """Run ``iterations`` dummy inferences; a second call is a no-op."""
        if self.is_warmed_up:
            print("模型已经预热,跳过预热步骤")
            return
        warmup_start_time = time.time()
        input_size, output_size = self._buffer_sizes()
        d_input = d_output = None
        try:
            d_input = cuda.mem_alloc(input_size)
            d_output = cuda.mem_alloc(output_size)
            stream = cuda.Stream()
            dummy_input = np.random.rand(*self.input_shape).astype(np.float32)
            for _ in range(iterations):
                cuda.memcpy_htod_async(d_input, dummy_input, stream)
                self.context.execute_async_v2(
                    bindings=[int(d_input), int(d_output)],
                    stream_handle=stream.handle,
                )
                stream.synchronize()
        finally:
            # Free device memory even if a copy or execution raised.
            self._free_buffers(d_input, d_output)
        warmup_time = (time.time() - warmup_start_time) * 1000
        print(f" 预热时间: {warmup_time:.2f} ms")
        self.is_warmed_up = True

    def infer(self, image: np.ndarray) -> Tuple[float, np.ndarray]:
        """Run one inference on a preprocessed CHW image.

        Args:
            image: Array without batch axis; a leading axis is added here.

        Returns:
            ``(infer_time_ms, output)`` where the time covers the host-to-device
            copy, the execution and the stream synchronization.
        """
        if not self.is_warmed_up:
            print("警告:模型尚未预热,推理性能可能受影响")
        input_size, output_size = self._buffer_sizes()
        d_input = d_output = None
        try:
            d_input = cuda.mem_alloc(input_size)
            d_output = cuda.mem_alloc(output_size)
            stream = cuda.Stream()
            input_data = np.ascontiguousarray(
                np.expand_dims(image, axis=0), dtype=np.float32
            )

            infer_start_time = time.time()
            cuda.memcpy_htod_async(d_input, input_data, stream)
            self.context.execute_async_v2(
                bindings=[int(d_input), int(d_output)],
                stream_handle=stream.handle,
            )
            stream.synchronize()
            infer_time = (time.time() - infer_start_time) * 1000
            print(f" TensorRT 推理时间: {infer_time:.2f} ms")

            output_data = np.empty(self.output_shape, dtype=np.float32)
            output_start_time = time.time()
            cuda.memcpy_dtoh_async(output_data, d_output, stream)
            stream.synchronize()
            output_time = (time.time() - output_start_time) * 1000
            print(f" 获取输出时间: {output_time:.2f} ms")
        finally:
            # Previously an exception above leaked both allocations.
            self._free_buffers(d_input, d_output)
        return infer_time, output_data


if __name__ == "__main__":
    # Accuracy script: classify every image under the folder; the true label
    # of an image is the name of its parent directory.
    PATHS = {
        "image_folder": "D:/Desktop/DATA/balance_bei_liao_hu/temp",  # image folder
        "engine": "mnv4.engine",  # TensorRT engine file
    }

    if not os.path.exists(PATHS["image_folder"]):
        print(f"错误: 图片文件夹不存在 -> {os.path.abspath(PATHS['image_folder'])}")
        exit(1)
    if not os.path.exists(PATHS["engine"]):
        print(f"错误: 引擎文件不存在 -> {os.path.abspath(PATHS['engine'])}")
        exit(1)

    # Collect image files recursively.
    image_files = []
    for root, _, files in os.walk(PATHS["image_folder"]):
        for file in files:
            if file.endswith(('.jpg', '.png', '.bmp', '.jpeg')):
                image_files.append(os.path.join(root, file))
    if not image_files:
        print(f"错误: 文件夹中没有图片文件 -> {PATHS['image_folder']}")
        exit(1)

    predictor = TensorRTPredictor(PATHS["engine"])

    true_labels = []
    predicted_labels = []
    # Class index -> folder name.
    label_mapping = {0: "B", 1: "D", 2: "E"}
    total_time = 0
    for image_path in image_files:
        try:
            print(f"处理图片: {image_path}")
            img = predictor.preprocess_image(image_path)
            # infer() prints the inference time itself, so it is not repeated.
            trt_time, trt_out = predictor.infer(img)
            predicted_index = int(np.argmax(trt_out))
            # Record both labels together so the two lists stay aligned.
            predicted_labels.append(label_mapping[predicted_index])
            true_labels.append(os.path.basename(os.path.dirname(image_path)))
            total_time += trt_time
        except Exception as e:
            print(f"处理图片时出错: {image_path} -> {str(e)}")

    if not predicted_labels:
        # Nothing to average or score; the metrics below would fail.
        print("错误: 没有成功处理任何图片")
        exit(1)

    # Average over successfully processed images only.
    avg_time = total_time / len(predicted_labels)
    print(f"\n平均推理时间: {avg_time:.2f} ms")

    print("\n分类结果统计:")
    print(f"图片总数: {len(image_files)}")
    print(f"分类结果: {Counter(predicted_labels)}")

    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels, average='weighted')
    print(f"准确率: {accuracy:.4f}")
    print(f"F1 分数: {f1:.4f}")

    print("\n分类报告:")
    print(classification_report(true_labels, predicted_labels, digits=4))
- onnx推理代码
from datasets.split_data import read_split_data
from datasets.mydataset import MyDataset
from torchvision import transforms
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
import torch
from estimate_model import Predictor, Plot_ROC
from timm.models import create_model
import os, cv2, json, random
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt


def read_test_data(root, plot_image=False):
    """Collect test image paths and integer labels from a class-per-folder tree.

    Every sub-directory of ``root`` is one class; classes are sorted by name
    and numbered from 0. The index->class mapping is written to
    ``output/classes_indices.json``. Size statistics of up to 50 sampled
    images are printed.

    Args:
        root: Dataset root directory.
        plot_image: When true, show a bar chart of images per class.

    Returns:
        ``(test_image_path, test_image_label)``: parallel lists of file paths
        and class indices.

    Raises:
        AssertionError: If ``root`` does not exist.
    """
    filepaths = []
    labels = []
    bad_images = []
    random.seed(0)
    assert os.path.exists(root), 'Your root does not exists!!!'

    classes = sorted(
        cla for cla in os.listdir(root) if os.path.isdir(os.path.join(root, cla))
    )
    class_indices = {k: v for v, k in enumerate(classes)}
    json_str = json.dumps({v: k for k, v in class_indices.items()}, indent=4)
    # Make sure the output directory exists before writing into it.
    os.makedirs('output', exist_ok=True)
    with open('output/classes_indices.json', 'w') as json_file:
        json_file.write(json_str)

    every_class_num = []
    # Extensions are compared lower-cased, so only lower-case entries are needed.
    supported = {'.jpg', '.png', '.jpeg', '.bmp'}
    for klass in classes:
        classpath = os.path.join(root, klass)
        flist = sorted(os.listdir(classpath))
        # Count supported files with the same case-insensitive rule used
        # below (the original counted case-sensitively here).
        every_class_num.append(
            sum(1 for name in flist if os.path.splitext(name)[-1].lower() in supported)
        )
        desc = f'{klass:23s}'
        for f in tqdm(flist, ncols=110, desc=desc, unit='file', colour='blue'):
            fpath = os.path.join(classpath, f)
            ext = os.path.splitext(f)[-1].lower()
            if ext not in supported:
                bad_images.append(fpath)
                continue
            # cv2.imread returns None for an unreadable file instead of
            # raising, so the failure must be checked explicitly.
            if cv2.imread(fpath) is None:
                bad_images.append(fpath)
                print('defective image file: ', fpath)
                continue
            filepaths.append(fpath)
            labels.append(klass)

    Fseries = pd.Series(filepaths, name='filepaths')
    Lseries = pd.Series(labels, name='labels')
    df = pd.concat([Fseries, Lseries], axis=1)
    print(f'{len(df.labels.unique())} kind of images were found in the dataset')
    test_image_path = df['filepaths'].tolist()
    test_image_label = [class_indices[i] for i in df['labels'].tolist()]

    # Sample at most 50 images; asking for more rows than exist raises.
    sample_df = df.sample(n=min(50, len(df)), replace=False)
    ht, wt, count = 0, 0, 0
    for fpath in sample_df['filepaths']:
        img = cv2.imread(fpath)
        if img is None:
            continue
        ht += img.shape[0]
        wt += img.shape[1]
        count += 1

    print('{} images were found in the dataset.\n{} for test'.format(sum(every_class_num), len(test_image_path)))
    if count:
        have = int(ht / count)
        wave = int(wt / count)
        aspect_ratio = have / wave
        print('average image height= ', have, ' average image width= ', wave, ' aspect ratio h/w= ', aspect_ratio)

    if plot_image:
        plt.bar(range(len(classes)), every_class_num, align='center')
        plt.xticks(range(len(classes)), classes)
        for i, v in enumerate(every_class_num):
            plt.text(x=i, y=v + 5, s=str(v), ha='center')
        plt.xlabel('image class')
        plt.ylabel('number of images')
        plt.title('class distribution')
        plt.show()

    return test_image_path, test_image_label


test_image_path, test_image_label = read_test_data('D:/Desktop/DATA/balance_bei_liao_hu/temp', False)


def build_transform(img_size):
    """Build the evaluation transform: resize, center-crop, tensor, normalize."""
    return transforms.Compose([
        # to maintain same ratio w.r.t. 224 images
        # InterpolationMode.BICUBIC replaces the integer code 3.
        transforms.Resize(img_size, interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.CenterCrop(img_size),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
    ])


test_transform = build_transform(224)
test_set = MyDataset(test_image_path, test_image_label, test_transform)
sampler_val = torch.utils.data.SequentialSampler(test_set)
data_loader_val = torch.utils.data.DataLoader(
    test_set,
    sampler=sampler_val,
    batch_size=int(1.5 * 24),
    num_workers=0,
    pin_memory=True,
    drop_last=False,
)
# Build the timm model with a 3-class head, move it to the GPU, then run the
# project's evaluation and ROC helpers on the same checkpoint.
device = torch.device('cuda')
model_predict = create_model('mobilenetv4_hybrid_medium')
model_predict.reset_classifier(num_classes=3)
model_predict.to(device)

checkpoint_path = './output/mobilenetv4_hybrid_medium_best_checkpoint.pth'
Predictor(model_predict, data_loader_val, checkpoint_path, device)
Plot_ROC(model_predict, data_loader_val, checkpoint_path, device)
- 结果:
-
TensorRT:
-
onnx:
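可以观察到在转成TensorRT推理后模型精度下降明显,宏平均Precision下降了约4%,宏平均召回下降了约10%,宏平均F1下降了约10%。需要注意的是,上面两段测试代码的预处理并不一致:TensorRT 代码直接用 cv2.resize 缩放到 224×224,并使用均值 [0.362, 0.279, 0.258]、标准差 [0.222, 0.191, 0.185] 做标准化;而另一段代码使用 Resize + CenterCrop 以及 IMAGENET_DEFAULT_MEAN / IMAGENET_DEFAULT_STD,并且是通过 timm 的 create_model 加载 .pth 权重进行评估。再加上引擎是用 --fp16 转换的,因此这里的精度差异不一定完全来自 TensorRT 转换本身。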
注:
(1)预处理方式
def preprocess_image(
    image_path: str,
    size: int = 224,
    mean=(0.362, 0.279, 0.258),
    std=(0.222, 0.191, 0.185),
) -> np.ndarray:
    """Load an image file and turn it into a normalized NCHW float32 tensor.

    Steps: read with OpenCV, resize to ``size`` x ``size``, BGR->RGB,
    HWC->CHW, scale to [0, 1], normalize per channel, add a batch axis.

    Args:
        image_path: Path of the input image.
        size: Side length of the square output (default 224).
        mean: Per-channel mean (RGB order) subtracted after scaling.
        std: Per-channel standard deviation (RGB order) divided after that.

    Returns:
        Array of shape ``(1, 3, size, size)`` and dtype float32.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
        ValueError: If OpenCV cannot decode the file.
        RuntimeError: If any preprocessing step fails; the original
            exception is attached as the cause.
    """
    preprocess_start_time = time.perf_counter()
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"图像文件不存在: {os.path.abspath(image_path)}")

    image = cv2.imread(image_path)
    if image is None:
        raise ValueError("无法读取图像,请检查文件格式和完整性")

    try:
        image = cv2.resize(image, (size, size))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # HWC -> CHW, then float32 so all later arithmetic stays in float32.
        image = np.ascontiguousarray(image.transpose(2, 0, 1)).astype(np.float32)
        image /= 255.0
        mean_arr = np.asarray(mean, dtype=np.float32).reshape(3, 1, 1)
        std_arr = np.asarray(std, dtype=np.float32).reshape(3, 1, 1)
        image = (image - mean_arr) / std_arr
        image = np.expand_dims(image, axis=0)
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise RuntimeError(f"图像预处理失败: {str(e)}") from e

    preprocess_time = (time.perf_counter() - preprocess_start_time) * 1000
    print(f"预处理时间: {preprocess_time:.2f} ms")
    return image
(2)TensorRT与torchvision包导入
torchvision包与TensorRT包同时导入可能会抢夺cuda资源从而报错
当先导入tensorrt后导入torchvision时
import tensorrt as trt
import torchvision.transforms as transforms
Traceback (most recent call last):
  File "D:\Desktop\SRM\srm_GUI\Predictor\TensorRTPredictor.py", line 2, in <module>
    import torchvision.transforms as transforms
  File "D:\Software\anaconda3\envs\CV\lib\site-packages\torchvision\__init__.py", line 5, in <module>
    import torch
  File "D:\Software\anaconda3\envs\CV\lib\site-packages\torch\__init__.py", line 122, in <module>
    raise err
OSError: [WinError 127] 找不到指定的程序。 Error loading "D:\Software\anaconda3\envs\CV\lib\site-packages\torch\lib\cublas64_11.dll" or one of its dependencies.