tensorrt内存释放 笔记
目录
load_model char * 释放
cudaMallocHost float*释放
void* buffers和 cudaStream_t 释放
推理释放模型内存:
load_model char * 释放
yolo 代码示例
bool YOLO::load_model(std::string trt_path) {size_t size{ 0 };char *trtModelStream{ nullptr };std::ifstream file(trt_path, std::ios::binary);if (file.good()) {file.seekg(0, file.end);size = file.tellg();file.seekg(0, file.beg);trtModelStream = new char[size];assert(trtModelStream);file.read(trtModelStream, size);file.close();}std::cout << "engine init finished" << std::endl;runtime = createInferRuntime(gLogger);assert(runtime != nullptr);engine = runtime->deserializeCudaEngine(trtModelStream, size);assert(engine != nullptr);context = engine->createExecutionContext();assert(context != nullptr);delete[] trtModelStream;return true;
}
cudaMallocHost float*释放
float* input_data_host = nullptr;//cudaMallocHost(&input_data_host, batch_size * sizeof(float));cudaMallocHost(&input_data_host, batch_size * 3 * this->INPUT_H * this->INPUT_W * sizeof(float));auto t_1 = std::chrono::high_resolution_clock::now();for (int i = 0; i < image_list.size(); i++) {cv::Mat img_o = image_list.at(i);cv::Mat img_raw = this->static_resize(img_o);int input_height = img_raw.rows;int input_width = img_raw.cols;int image_area = img_raw.cols * img_raw.rows;unsigned char* pimage = img_raw.data;float* phost_b = input_data_host + image_area * 0 + i * input_channel * input_height * input_width;float* phost_g = input_data_host + image_area * 1 + i * input_channel * input_height * input_width;float* phost_r = input_data_host + image_area * 2 + i * input_channel * input_height * input_width;for (int mm = 0; mm < image_area; ++mm, pimage += 3) {*phost_r++ = pimage[0] / 255.0f;// (pimage[0] / 255.0f - mean[0]) / std[0];*phost_g++ = pimage[1] / 255.0f;;// (pimage[1] / 255.0f - mean[1]) / std[1];*phost_b++ = pimage[2] / 255.0f;;//(pimage[2] / 255.0f - mean[2]) / std[2];}}...checkRuntime(cudaFreeHost(input_data_host));
void* buffers和 cudaStream_t 释放
void* buffers[2];// In order to bind the buffers, we need to know the names of the input and output tensors.// Note that indices are guaranteed to be less than IEngine::getNbBindings()const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);assert(engine.getBindingDataType(inputIndex) == nvinfer1::DataType::kFLOAT);const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);assert(engine.getBindingDataType(outputIndex) == nvinfer1::DataType::kFLOAT);int mBatchSize = engine.getMaxBatchSize();// Create GPU buffers on deviceCHECK(cudaMalloc(&buffers[inputIndex], 3 * input_shape.height * input_shape.width * sizeof(float)));CHECK(cudaMalloc(&buffers[outputIndex], output_size * sizeof(float)));// Create streamcudaStream_t stream;CHECK(cudaStreamCreate(&stream));// DMA input batch data to device, infer on the batch asynchronously, and DMA output back to hostCHECK(cudaMemcpyAsync(buffers[inputIndex], input, 3 * input_shape.height * input_shape.width * sizeof(float), cudaMemcpyHostToDevice, stream));//context.enqueue(1, buffers, stream, nullptr);context->enqueueV2(buffers, stream, nullptr);CHECK(cudaMemcpyAsync(output, buffers[outputIndex], output_size * sizeof(float), cudaMemcpyDeviceToHost, stream));cudaStreamSynchronize(stream);// Release stream and bufferscudaStreamDestroy(stream);CHECK(cudaFree(buffers[inputIndex]));CHECK(cudaFree(buffers[outputIndex]));
推理释放模型内存:
按下面的顺序调用 destroy() 释放时出现报错:
context->destroy();engine->destroy();runtime->destroy();
后来发现把engine->destroy();注释掉,就不报错了。
原因尚未完全确认。推测可能是 engine 在别处(例如类的析构函数或其他封装对象)已经被释放过,再次调用 engine->destroy() 造成重复释放;也可能与 TensorRT 版本有关——TensorRT 8.x 起 destroy() 已被弃用,官方推荐直接 delete 对象,并按 context、engine、runtime 的顺序释放。建议排查是否存在重复释放后再确认。
本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!
