Inference Pipeline

To infer a model with OpenVINO™ Runtime, you usually need to perform the following steps in the application pipeline:

  1. Create a Core object

  2. Read a model from a drive

  3. Load the model to the device

  4. Create an inference request

  5. Fill input tensors with data

  6. Start inference

  7. Process the inference results

Based on these steps, the code below demonstrates how to change your application code to migrate to API 2.0.
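
For orientation, here is a minimal sketch of the whole pipeline with API 2.0 in Python. The file name "model.xml", the device name "CPU", and the single-input/single-output assumption are placeholders for illustration only:

import openvino.runtime as ov

# 1. Create a Core object
core = ov.Core()
# 2. Read the model from a drive
model = core.read_model("model.xml")
# 3. Compile (load) the model for a device
compiled_model = core.compile_model(model, "CPU")
# 4. Create an inference request
infer_request = compiled_model.create_infer_request()
# 5. Fill the input tensor (dummy zero data for illustration)
input_tensor = infer_request.get_input_tensor(0)
input_tensor.data[:] = 0
# 6. Start inference (synchronously here)
infer_request.infer()
# 7. Process the inference results
output_tensor = infer_request.get_output_tensor(0)
print(output_tensor.data)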

1. Create a Core Object

Inference Engine API

import numpy as np
import openvino.inference_engine as ie
core = ie.IECore()
    InferenceEngine::Core core;
    ie_core_t *core = nullptr;
    ie_core_create("", &core);

API 2.0

import openvino.runtime as ov
core = ov.Core()
    ov::Core core;
    ov_core_t* core = NULL;
    ov_core_create(&core);

1.1 (Optional) Load Extensions

To load a model with custom operations, you need to add extensions for those operations. It is highly recommended to use the OpenVINO™ Extensibility API to write extensions. However, you can also load old extensions into the new OpenVINO™ Runtime:

Inference Engine API

core.add_extension("path_to_extension_library.so", "CPU")
    core.AddExtension(std::make_shared<InferenceEngine::Extension>("path_to_extension_library.so"));
    ie_core_add_extension(core, "path_to_extension_library.so", "CPU");

API 2.0

core.add_extension("path_to_extension_library.so")
    core.add_extension(std::make_shared<InferenceEngine::Extension>("path_to_extension_library.so"));
    // For C API 2.0 "add_extension()" is not supported for now

2. Read a Model from a Drive

Inference Engine API

network = core.read_network("model.xml")
    InferenceEngine::CNNNetwork network = core.ReadNetwork("model.xml");
    ie_network_t *network = nullptr;
    ie_core_read_network(core, "model.xml", nullptr, &network);

API 2.0

model = core.read_model("model.xml")
    std::shared_ptr<ov::Model> model = core.read_model("model.xml");
    ov_model_t* model = NULL;
    ov_core_read_model(core, "model.xml", NULL, &model);

Reading a model uses the same structure as the example in the Model Creation migration guide.

You can combine reading and compiling a model into a single ov::Core::compile_model(filename, devicename) call.
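
As a minimal sketch in Python, the combined call looks like this ("model.xml" and "CPU" are placeholder names):

import openvino.runtime as ov

core = ov.Core()
# Read and compile the model in a single call
compiled_model = core.compile_model("model.xml", "CPU")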

2.1 (Optional) Perform Model Preprocessing

Preprocessing may be required when the application input data does not perfectly match the model input format. See Preprocessing in API 2.0 for more details.
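
For illustration only, the sketch below embeds preprocessing into a model with the API 2.0 PrePostProcessor; the assumption that the application supplies u8 data in NHWC layout while the model expects NCHW input is made up for this example:

import openvino.runtime as ov
from openvino.preprocess import PrePostProcessor

core = ov.Core()
model = core.read_model("model.xml")

ppp = PrePostProcessor(model)
# Describe the data the application will actually provide (assumed: u8 tensor, NHWC layout)
ppp.input().tensor().set_element_type(ov.Type.u8).set_layout(ov.Layout("NHWC"))
# Declare the layout the model expects, so the conversion is inserted automatically
ppp.input().model().set_layout(ov.Layout("NCHW"))
# Build a new model with the preprocessing steps embedded
model = ppp.build()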

3. Load the Model to the Device

Inference Engine API

# Load network to the device and create infer requests
exec_network = core.load_network(network, "CPU", num_requests=4)
    InferenceEngine::ExecutableNetwork exec_network = core.LoadNetwork(network, "CPU");
    ie_executable_network_t *exe_network = nullptr;
    ie_core_load_network(core, network, "CPU", nullptr, &exe_network);

API 2.0

compiled_model = core.compile_model(model, "CPU")
    ov::CompiledModel compiled_model = core.compile_model(model, "CPU");
    ov_compiled_model_t* compiled_model = NULL;
    ov_core_compile_model(core, model, "CPU", 0, &compiled_model);

If you need to configure devices with additional parameters for OpenVINO™ Runtime, refer to Configuring Devices.
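
For example, device properties can be passed as an extra argument to compile_model; the property below (PERFORMANCE_HINT set to THROUGHPUT) is just one illustrative configuration:

import openvino.runtime as ov

core = ov.Core()
model = core.read_model("model.xml")
# Pass the device configuration as a dictionary of properties
compiled_model = core.compile_model(model, "CPU", {"PERFORMANCE_HINT": "THROUGHPUT"})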

4. Create an Inference Request

Inference Engine API

# Done in the previous step
    InferenceEngine::InferRequest infer_request = exec_network.CreateInferRequest();
    ie_infer_request_t *infer_request = nullptr;
    ie_exec_network_create_infer_request(exe_network, &infer_request);

API 2.0

infer_request = compiled_model.create_infer_request()
    ov::InferRequest infer_request = compiled_model.create_infer_request();
    ov_infer_request_t* infer_request = NULL;
    ov_compiled_model_create_infer_request(compiled_model, &infer_request);

5. Fill Input Tensors with Data

Inference Engine API

The Inference Engine API fills inputs with data of the I32 precision, which is not aligned with the original model (the original I64 precision is converted to I32):

infer_request = exec_network.requests[0]
# Get input blobs mapped to input layers names
input_blobs = infer_request.input_blobs
data = input_blobs["data1"].buffer
# Original I64 precision was converted to I32
assert data.dtype == np.int32
# Fill the first blob ...
    InferenceEngine::Blob::Ptr input_blob1 = infer_request.GetBlob(inputs.begin()->first);
    // fill first blob
    InferenceEngine::MemoryBlob::Ptr minput1 = InferenceEngine::as<InferenceEngine::MemoryBlob>(input_blob1);
    if (minput1) {
        // locked memory holder should be alive all time while access to its
        // buffer happens
        auto minputHolder = minput1->wmap();
        // Original I64 precision was converted to I32
        auto data = minputHolder.as<InferenceEngine::PrecisionTrait<InferenceEngine::Precision::I32>::value_type*>();
        // Fill data ...
    }

    InferenceEngine::Blob::Ptr input_blob2 = infer_request.GetBlob("data2");
    // fill second blob
    InferenceEngine::MemoryBlob::Ptr minput2 = InferenceEngine::as<InferenceEngine::MemoryBlob>(input_blob2);
    if (minput2) {
        // locked memory holder should be alive all time while access to its
        // buffer happens
        auto minputHolder = minput2->wmap();
        // Original I64 precision was converted to I32
        auto data = minputHolder.as<InferenceEngine::PrecisionTrait<InferenceEngine::Precision::I32>::value_type*>();
        // Fill data ...
    }
    // fill first blob
    ie_blob_t *input_blob1 = nullptr;
    {
    ie_infer_request_get_blob(infer_request, input_name, &input_blob1);
    ie_blob_buffer_t buffer;
    ie_blob_get_buffer(input_blob1, &buffer);
    // Original I64 precision was converted to I32
    int32_t* blob_internal_buffer = (int32_t*)buffer.buffer;
    // Fill data ...
    }
    // fill second blob
    ie_blob_t *input_blob2 = nullptr;
    {
    ie_infer_request_get_blob(infer_request, "data2", &input_blob2);
    ie_blob_buffer_t buffer;
    ie_blob_get_buffer(input_blob2, &buffer);
    // Original I64 precision was converted to I32
    int32_t* blob_internal_buffer = (int32_t*)buffer.buffer;
    // Fill data ...
    }

API 2.0

API 2.0 fills inputs with data of the I64 precision (aligned with the original model) for OpenVINO IR v11, ONNX, ov::Model and PaddlePaddle models, and keeps the converted I32 precision for OpenVINO IR v10 models to match the old behavior. The IR v10 case is shown first, followed by the IR v11/ONNX/ov::Model case:

# Get input tensor by index
input_tensor1 = infer_request.get_input_tensor(0)
# IR v10 works with converted precisions (i64 -> i32)
assert input_tensor1.data.dtype == np.int32
# Fill the first data ...

# Get input tensor by tensor name
input_tensor2 = infer_request.get_tensor("data2_t")
# IR v10 works with converted precisions (i64 -> i32)
assert input_tensor2.data.dtype == np.int32
# Fill the second data ..
    // Get input tensor by index
    ov::Tensor input_tensor1 = infer_request.get_input_tensor(0);
    // IR v10 works with converted precisions (i64 -> i32)
    auto data1 = input_tensor1.data<int32_t>();
    // Fill first data ...

    // Get input tensor by tensor name
    ov::Tensor input_tensor2 = infer_request.get_tensor("data2_t");
    // IR v10 works with converted precisions (i64 -> i32)
    auto data2 = input_tensor2.data<int32_t>();
    // Fill the second data ...
    ov_tensor_t* input_tensor1 = NULL;
    ov_tensor_t* input_tensor2 = NULL;
    void* data = NULL;

    {
    // Get input tensor by index
    ov_infer_request_get_input_tensor_by_index(infer_request, 0, &input_tensor1);
    // IR v10 works with converted precisions (i64 -> i32)
    ov_tensor_data(input_tensor1, &data);
    int32_t* data1 = (int32_t*)data;
    // Fill first data ...
    }

    {
    // Get input tensor by tensor name
    ov_infer_request_get_tensor(infer_request, "data2_t", &input_tensor2);
    // IR v10 works with converted precisions (i64 -> i32)
    ov_tensor_data(input_tensor2, &data);
    int32_t* data2 = (int32_t*)data;
    // Fill the second data ...
    }

    ov_tensor_free(input_tensor1);
    ov_tensor_free(input_tensor2);

# Get input tensor by index
input_tensor1 = infer_request.get_input_tensor(0)
# Element types, names and layouts are aligned with framework
assert input_tensor1.data.dtype == np.int64
# Fill the first data ...

# Get input tensor by tensor name
input_tensor2 = infer_request.get_tensor("data2_t")
assert input_tensor2.data.dtype == np.int64
# Fill the second data ...
    // Get input tensor by index
    ov::Tensor input_tensor1 = infer_request.get_input_tensor(0);
    // Element types, names and layouts are aligned with framework
    auto data1 = input_tensor1.data<int64_t>();
    // Fill first data ...

    // Get input tensor by tensor name
    ov::Tensor input_tensor2 = infer_request.get_tensor("data2_t");
    // Element types, names and layouts are aligned with framework
    auto data2 = input_tensor2.data<int64_t>();
    // Fill the second data ...
    ov_tensor_t* input_tensor1 = NULL;
    ov_tensor_t* input_tensor2 = NULL;
    void* data = NULL;
    {
    // Get input tensor by index
    ov_infer_request_get_input_tensor_by_index(infer_request, 0, &input_tensor1);
    // Element types, names and layouts are aligned with framework
    ov_tensor_data(input_tensor1, &data);
    // Fill first data ...
    }

    {
    // Get input tensor by tensor name
    ov_infer_request_get_tensor(infer_request, "data2_t", &input_tensor2);
    // Element types, names and layouts are aligned with framework
    ov_tensor_data(input_tensor2, &data);
    // Fill the second data ...
    }

    ov_tensor_free(input_tensor1);
    ov_tensor_free(input_tensor2);

6. Start Inference

Inference Engine API

results = infer_request.infer()
    infer_request.Infer();
    ie_infer_request_infer(infer_request);
# Start async inference on a single infer request
infer_request.async_infer()
# Wait for 1 millisecond
infer_request.wait(1)
# Wait for inference completion
infer_request.wait()

# Demonstrates async pipeline using ExecutableNetwork

results = []

# Callback to process inference results
def callback(output_blobs, _):
    # Copy the data from output blobs to numpy array
    results_copy = {out_name: out_blob.buffer[:] for out_name, out_blob in output_blobs.items()}
    results.append(process_results(results_copy))

# Set the callback for each infer request
for infer_request in exec_network.requests:
    infer_request.set_completion_callback(callback, py_data=infer_request.output_blobs)

# The async pipeline is managed by ExecutableNetwork
total_frames = 100
for _ in range(total_frames):
    # Wait for at least one free request
    exec_network.wait(num_request=1)
    # Get idle id
    idle_id = exec_network.get_idle_request_id()
    # Start asynchronous inference on idle request
    exec_network.start_async(request_id=idle_id, inputs=next(input_data))
# Wait for all requests to complete
exec_network.wait()
    // NOTE: For demonstration purposes we are trying to set callback
    // which restarts inference inside one more time, so two inferences happen here

    // Start inference without blocking current thread
    auto restart_once = true;
    infer_request.SetCompletionCallback<std::function<void(InferenceEngine::InferRequest, InferenceEngine::StatusCode)>>(
        [&, restart_once](InferenceEngine::InferRequest request, InferenceEngine::StatusCode status) mutable {
            if (status != InferenceEngine::OK) {
                // Process error code
            } else {
                // Extract inference result
                InferenceEngine::Blob::Ptr output_blob = request.GetBlob(outputs.begin()->first);
                // Restart inference if needed
                if (restart_once) {
                    request.StartAsync();
                    restart_once = false;
                }
            }
        });
    infer_request.StartAsync();
    // Get inference status immediately
    InferenceEngine::StatusCode status = infer_request.Wait(InferenceEngine::InferRequest::STATUS_ONLY);
    // Wait for 1 millisecond
    status = infer_request.Wait(1);
    // Wait for inference completion
    infer_request.Wait(InferenceEngine::InferRequest::RESULT_READY);
    // NOTE: For demonstration purposes we are trying to set callback
    ie_complete_call_back_t callback;
    callback.completeCallBackFunc = completion_callback;
    callback.args = infer_request;
    ie_infer_set_completion_callback(infer_request, &callback);
    // Start inference without blocking current thread
    ie_infer_request_infer_async(infer_request);
    // Wait for 10 milliseconds
    IEStatusCode waitStatus = ie_infer_request_wait(infer_request, 10);
    // Wait for inference completion
    ie_infer_request_wait(infer_request, -1);

API 2.0

results = infer_request.infer()
    infer_request.infer();
    ov_infer_request_infer(infer_request);
# Start async inference on a single infer request
infer_request.start_async()
# Wait for 1 millisecond
infer_request.wait_for(1)
# Wait for inference completion
infer_request.wait()

# Demonstrates async pipeline using AsyncInferQueue

results = []

def callback(request, frame_id):
    # Copy the data from output tensors to numpy array and process it
    results_copy = {output: data[:] for output, data in request.results.items()}
    results.append(process_results(results_copy, frame_id))

# Create AsyncInferQueue with 4 infer requests
infer_queue = ov.AsyncInferQueue(compiled_model, jobs=4)
# Set callback for each infer request in the queue
infer_queue.set_callback(callback)

total_frames = 100
for i in range(total_frames):
    # Wait for at least one available infer request and start asynchronous inference
    infer_queue.start_async(next(input_data), userdata=i)
# Wait for all requests to complete
infer_queue.wait_all()
    // NOTE: For demonstration purposes we are trying to set callback
    // which restarts inference inside one more time, so two inferences happen here

    auto restart_once = true;
    infer_request.set_callback([&, restart_once](std::exception_ptr exception_ptr) mutable {
        if (exception_ptr) {
            // process the exception or rethrow it
            std::rethrow_exception(exception_ptr);
        } else {
            // Extract inference result
            ov::Tensor output_tensor = infer_request.get_output_tensor();
            // Restart inference if needed
            if (restart_once) {
                infer_request.start_async();
                restart_once = false;
            }
        }
    });
    // Start inference without blocking current thread
    infer_request.start_async();
    // Get inference status immediately
    bool status = infer_request.wait_for(std::chrono::milliseconds{0});
    // Wait for one millisecond
    status = infer_request.wait_for(std::chrono::milliseconds{1});
    // Wait for inference completion
    infer_request.wait();
    // NOTE: For demonstration purposes we are trying to set callback
    ov_callback_t callback;
    callback.callback_func = infer_request_callback;
    callback.args = infer_request;
    ov_infer_request_set_callback(infer_request, &callback);
    // Start inference without blocking current thread
    ov_infer_request_start_async(infer_request);
    // Wait for inference completion
    ov_infer_request_wait(infer_request);
    // Wait for 10 milliseconds
    ov_infer_request_wait_for(infer_request, 10);

7. Process the Inference Results

Inference Engine API

The Inference Engine API processes outputs as being of the I32 precision (not aligned with the original model):

# Get output blobs mapped to output layers names
output_blobs = infer_request.output_blobs
data = output_blobs["out1"].buffer
# Original I64 precision was converted to I32
assert data.dtype == np.int32
# Process output data
    InferenceEngine::Blob::Ptr output_blob = infer_request.GetBlob(outputs.begin()->first);
    InferenceEngine::MemoryBlob::Ptr moutput = InferenceEngine::as<InferenceEngine::MemoryBlob>(output_blob);
    if (moutput) {
        // locked memory holder should be alive all time while access to its
        // buffer happens
        auto minputHolder = moutput->rmap();
        // Original I64 precision was converted to I32
        auto data =
            minputHolder.as<const InferenceEngine::PrecisionTrait<InferenceEngine::Precision::I32>::value_type*>();
        // process output data
    }
    // get output blob by name
    ie_blob_t *output_blob = nullptr;
    ie_infer_request_get_blob(infer_request, "output_name", &output_blob);
    // get blob buffer
    ie_blob_buffer_t out_buffer;
    ie_blob_get_buffer(output_blob, &out_buffer);
    // get data
    float *data = (float *)(out_buffer.buffer);
    // process output data

API 2.0

API 2.0 processes outputs as being of the following precisions:

  • I32 for OpenVINO IR v10 models (not aligned with the original model), matching the old behavior.

  • I64 for OpenVINO IR v11, ONNX, ov::Model and PaddlePaddle models (aligned with the original model), matching the new behavior.

# Model has only one output
output_tensor = infer_request.get_output_tensor()
# IR v10 works with converted precisions (i64 -> i32)
assert output_tensor.data.dtype == np.int32
# process output data ...
    // model has only one output
    ov::Tensor output_tensor = infer_request.get_output_tensor();
    // IR v10 works with converted precisions (i64 -> i32)
    auto out_data = output_tensor.data<int32_t>();
    // process output data
    ov_tensor_t* output_tensor = NULL;
    void* data = NULL;

    // model has only one output
    ov_infer_request_get_output_tensor(infer_request, &output_tensor);
    // IR v10 works with converted precisions (i64 -> i32)
    ov_tensor_data(output_tensor, &data);
    int32_t* out_data = (int32_t*)data;
    // process output data
    
    ov_tensor_free(output_tensor);

# Model has only one output
output_tensor = infer_request.get_output_tensor()
# Element types, names and layouts are aligned with framework
assert output_tensor.data.dtype == np.int64
# process output data ...
    // model has only one output
    ov::Tensor output_tensor = infer_request.get_output_tensor();
    // Element types, names and layouts are aligned with framework
    auto out_data = output_tensor.data<int64_t>();
    // process output data
    ov_tensor_t* output_tensor = NULL;
    void* out_data = NULL;
    
    // model has only one output
    ov_infer_request_get_output_tensor(infer_request, &output_tensor);
    // Element types, names and layouts are aligned with framework
    ov_tensor_data(output_tensor, &out_data);
    // process output data
    
    ov_tensor_free(output_tensor);