Inference Pipeline¶
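To infer models with OpenVINO™ Runtime, you usually need to perform the following steps in the application pipeline:
1. Create a Core object (optionally, load extensions).
2. Read a model from a drive (optionally, perform model preprocessing).
3. Load the model to the device.
4. Create an inference request.
5. Fill input tensors with data.
6. Start inference.
7. Process the inference results.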
Based on the steps, the following code demonstrates how to change the application code to migrate to API 2.0.
1. Create a Core Object¶
Inference Engine API
import numpy as np
import openvino.inference_engine as ie
core = ie.IECore()
InferenceEngine::Core core;
ie_core_t *core = nullptr;
ie_core_create("", &core);
API 2.0
import openvino as ov
core = ov.Core()
ov::Core core;
ov_core_t* core = NULL;
ov_core_create(&core);
1.1 (Optional) Load Extensions¶
To load a model with custom operations, you need to add extensions for these operations. It is highly recommended to use OpenVINO Extensibility API to write extensions. However, you can also load the old extensions to the new OpenVINO™ Runtime:
Inference Engine API
core.add_extension(path_to_extension_library, "CPU")
core.AddExtension(std::make_shared<InferenceEngine::Extension>("path_to_extension_library.so"));
ie_core_add_extension(core, "path_to_extension_library.so", "CPU");
API 2.0
core.add_extension(path_to_extension_library)
core.add_extension(std::make_shared<InferenceEngine::Extension>("path_to_extension_library.so"));
// For C API 2.0 "add_extension()" is not supported for now
2. Read a Model from a Drive¶
Inference Engine API
network = core.read_network(model_path)
InferenceEngine::CNNNetwork network = core.ReadNetwork("model.xml");
ie_network_t *network = nullptr;
ie_core_read_network(core, "model.xml", nullptr, &network);
API 2.0
model = core.read_model(model_path)
std::shared_ptr<ov::Model> model = core.read_model("model.xml");
ov_model_t* model = NULL;
ov_core_read_model(core, "model.xml", NULL, &model);
Reading a model has the same structure as the example in the model creation migration guide.
You can combine reading and compiling a model into a single call: ov::Core::compile_model(filename, devicename).
2.1 (Optional) Perform Model Preprocessing¶
When the application input data does not perfectly match the model input format, preprocessing may be necessary. See preprocessing in API 2.0 for more details.
3. Load the Model to the Device¶
Inference Engine API
# Load network to the device and create infer requests
exec_network = core.load_network(network, "CPU", num_requests=4)
InferenceEngine::ExecutableNetwork exec_network = core.LoadNetwork(network, "CPU");
ie_executable_network_t *exe_network = nullptr;
ie_core_load_network(core, network, "CPU", nullptr, &exe_network);
API 2.0
compiled_model = core.compile_model(model, "CPU")
ov::CompiledModel compiled_model = core.compile_model(model, "CPU");
ov_compiled_model_t* compiled_model = NULL;
ov_core_compile_model(core, model, "CPU", 0, &compiled_model);
If you need to configure devices with additional parameters for OpenVINO Runtime, refer to Configuring Devices.
4. Create an Inference Request¶
Inference Engine API
# Done in the previous step
InferenceEngine::InferRequest infer_request = exec_network.CreateInferRequest();
ie_infer_request_t *infer_request = nullptr;
ie_exec_network_create_infer_request(exe_network, &infer_request);
API 2.0
infer_request = compiled_model.create_infer_request()
ov::InferRequest infer_request = compiled_model.create_infer_request();
ov_infer_request_t* infer_request = NULL;
ov_compiled_model_create_infer_request(compiled_model, &infer_request);
5. Fill Input Tensors with Data¶
Inference Engine API
The Inference Engine API fills inputs with data of the I32 precision (not aligned with the original model):
infer_request = exec_network.requests[0]
# Get input blobs mapped to input layers names
input_blobs = infer_request.input_blobs
data = input_blobs["data"].buffer
# Original I64 precision was converted to I32
assert data.dtype == np.int32
# Fill the first blob ...
InferenceEngine::Blob::Ptr input_blob1 = infer_request.GetBlob(inputs.begin()->first);
// fill first blob
InferenceEngine::MemoryBlob::Ptr minput1 = InferenceEngine::as<InferenceEngine::MemoryBlob>(input_blob1);
if (minput1) {
// locked memory holder should be alive all time while access to its
// buffer happens
auto minputHolder = minput1->wmap();
// Original I64 precision was converted to I32
auto data = minputHolder.as<InferenceEngine::PrecisionTrait<InferenceEngine::Precision::I32>::value_type*>();
// Fill data ...
}
InferenceEngine::Blob::Ptr input_blob2 = infer_request.GetBlob("data2");
// fill second blob
InferenceEngine::MemoryBlob::Ptr minput2 = InferenceEngine::as<InferenceEngine::MemoryBlob>(input_blob2);
if (minput2) {
// locked memory holder should be alive all time while access to its
// buffer happens
auto minputHolder = minput2->wmap();
// Original I64 precision was converted to I32
auto data = minputHolder.as<InferenceEngine::PrecisionTrait<InferenceEngine::Precision::I32>::value_type*>();
// Fill data ...
}
// fill first blob
ie_blob_t *input_blob1 = nullptr;
{
ie_infer_request_get_blob(infer_request, input_name, &input_blob1);
ie_blob_buffer_t buffer;
ie_blob_get_buffer(input_blob1, &buffer);
// Original I64 precision was converted to I32
int32_t* blob_internal_buffer = (int32_t*)buffer.buffer;
// Fill data ...
}
// fill second blob
ie_blob_t *input_blob2 = nullptr;
{
ie_infer_request_get_blob(infer_request, "data2", &input_blob2);
ie_blob_buffer_t buffer;
ie_blob_get_buffer(input_blob2, &buffer);
// Original I64 precision was converted to I32
int32_t* blob_internal_buffer = (int32_t*)buffer.buffer;
// Fill data ...
}
infer_request = exec_network.requests[0]
# Get input blobs mapped to input layers names
input_blobs = infer_request.input_blobs
data = input_blobs["data"].buffer
# Original I64 precision was converted to I32
assert data.dtype == np.int32
# Fill the first blob ...
InferenceEngine::Blob::Ptr input_blob1 = infer_request.GetBlob(inputs.begin()->first);
// fill first blob
InferenceEngine::MemoryBlob::Ptr minput1 = InferenceEngine::as<InferenceEngine::MemoryBlob>(input_blob1);
if (minput1) {
// locked memory holder should be alive all time while access to its
// buffer happens
auto minputHolder = minput1->wmap();
// Original I64 precision was converted to I32
auto data = minputHolder.as<InferenceEngine::PrecisionTrait<InferenceEngine::Precision::I32>::value_type*>();
// Fill data ...
}
InferenceEngine::Blob::Ptr input_blob2 = infer_request.GetBlob("data2");
// fill second blob
InferenceEngine::MemoryBlob::Ptr minput2 = InferenceEngine::as<InferenceEngine::MemoryBlob>(input_blob2);
if (minput2) {
// locked memory holder should be alive all time while access to its
// buffer happens
auto minputHolder = minput2->wmap();
// Original I64 precision was converted to I32
auto data = minputHolder.as<InferenceEngine::PrecisionTrait<InferenceEngine::Precision::I32>::value_type*>();
// Fill data ...
}
// fill first blob
ie_blob_t *input_blob1 = nullptr;
{
ie_infer_request_get_blob(infer_request, input_name, &input_blob1);
ie_blob_buffer_t buffer;
ie_blob_get_buffer(input_blob1, &buffer);
// Original I64 precision was converted to I32
int32_t* blob_internal_buffer = (int32_t*)buffer.buffer;
// Fill data ...
}
// fill second blob
ie_blob_t *input_blob2 = nullptr;
{
ie_infer_request_get_blob(infer_request, "data2", &input_blob2);
ie_blob_buffer_t buffer;
ie_blob_get_buffer(input_blob2, &buffer);
// Original I64 precision was converted to I32
int32_t* blob_internal_buffer = (int32_t*)buffer.buffer;
// Fill data ...
}
infer_request = exec_network.requests[0]
# Get input blobs mapped to input layers names
input_blobs = infer_request.input_blobs
data = input_blobs["data"].buffer
# Original I64 precision was converted to I32
assert data.dtype == np.int32
# Fill the first blob ...
InferenceEngine::Blob::Ptr input_blob1 = infer_request.GetBlob(inputs.begin()->first);
// fill first blob
InferenceEngine::MemoryBlob::Ptr minput1 = InferenceEngine::as<InferenceEngine::MemoryBlob>(input_blob1);
if (minput1) {
// locked memory holder should be alive all time while access to its
// buffer happens
auto minputHolder = minput1->wmap();
// Original I64 precision was converted to I32
auto data = minputHolder.as<InferenceEngine::PrecisionTrait<InferenceEngine::Precision::I32>::value_type*>();
// Fill data ...
}
InferenceEngine::Blob::Ptr input_blob2 = infer_request.GetBlob("data2");
// fill second blob
InferenceEngine::MemoryBlob::Ptr minput2 = InferenceEngine::as<InferenceEngine::MemoryBlob>(input_blob2);
if (minput2) {
// locked memory holder should be alive all time while access to its
// buffer happens
auto minputHolder = minput2->wmap();
// Original I64 precision was converted to I32
auto data = minputHolder.as<InferenceEngine::PrecisionTrait<InferenceEngine::Precision::I32>::value_type*>();
// Fill data ...
}
// fill first blob
ie_blob_t *input_blob1 = nullptr;
{
ie_infer_request_get_blob(infer_request, input_name, &input_blob1);
ie_blob_buffer_t buffer;
ie_blob_get_buffer(input_blob1, &buffer);
// Original I64 precision was converted to I32
int32_t* blob_internal_buffer = (int32_t*)buffer.buffer;
// Fill data ...
}
// fill second blob
ie_blob_t *input_blob2 = nullptr;
{
ie_infer_request_get_blob(infer_request, "data2", &input_blob2);
ie_blob_buffer_t buffer;
ie_blob_get_buffer(input_blob2, &buffer);
// Original I64 precision was converted to I32
int32_t* blob_internal_buffer = (int32_t*)buffer.buffer;
// Fill data ...
}
infer_request = exec_network.requests[0]
# Get input blobs mapped to input layers names
input_blobs = infer_request.input_blobs
data = input_blobs["data"].buffer
# Original I64 precision was converted to I32
assert data.dtype == np.int32
# Fill the first blob ...
InferenceEngine::Blob::Ptr input_blob1 = infer_request.GetBlob(inputs.begin()->first);
// fill first blob
InferenceEngine::MemoryBlob::Ptr minput1 = InferenceEngine::as<InferenceEngine::MemoryBlob>(input_blob1);
if (minput1) {
// locked memory holder should be alive all time while access to its
// buffer happens
auto minputHolder = minput1->wmap();
// Original I64 precision was converted to I32
auto data = minputHolder.as<InferenceEngine::PrecisionTrait<InferenceEngine::Precision::I32>::value_type*>();
// Fill data ...
}
InferenceEngine::Blob::Ptr input_blob2 = infer_request.GetBlob("data2");
// fill second blob
InferenceEngine::MemoryBlob::Ptr minput2 = InferenceEngine::as<InferenceEngine::MemoryBlob>(input_blob2);
if (minput2) {
// locked memory holder should be alive all time while access to its
// buffer happens
auto minputHolder = minput2->wmap();
// Original I64 precision was converted to I32
auto data = minputHolder.as<InferenceEngine::PrecisionTrait<InferenceEngine::Precision::I32>::value_type*>();
// Fill data ...
}
// fill first blob
ie_blob_t *input_blob1 = nullptr;
{
ie_infer_request_get_blob(infer_request, input_name, &input_blob1);
ie_blob_buffer_t buffer;
ie_blob_get_buffer(input_blob1, &buffer);
// Original I64 precision was converted to I32
int32_t* blob_internal_buffer = (int32_t*)buffer.buffer;
// Fill data ...
}
// fill second blob
ie_blob_t *input_blob2 = nullptr;
{
ie_infer_request_get_blob(infer_request, "data2", &input_blob2);
ie_blob_buffer_t buffer;
ie_blob_get_buffer(input_blob2, &buffer);
// Original I64 precision was converted to I32
int32_t* blob_internal_buffer = (int32_t*)buffer.buffer;
// Fill data ...
}
API 2.0
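API 2.0 fills inputs with data of the I64 precision (aligned with the original model), except for OpenVINO IR v10 models, which keep the converted I32 precision to match the old behavior: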
# Get input tensor by index
input_tensor1 = infer_request.get_input_tensor(0)
# IR v10 works with converted precisions (i64 -> i32)
assert input_tensor1.data.dtype == np.int32
# Fill the first data ...
# Get input tensor by tensor name
input_tensor2 = infer_request.get_tensor("input")
# IR v10 works with converted precisions (i64 -> i32)
assert input_tensor2.data.dtype == np.int32
# Fill the second data ..
// Get input tensor by index
ov::Tensor input_tensor1 = infer_request.get_input_tensor(0);
// IR v10 works with converted precisions (i64 -> i32)
auto data1 = input_tensor1.data<int32_t>();
// Fill first data ...
// Get input tensor by tensor name
ov::Tensor input_tensor2 = infer_request.get_tensor("data2_t");
// IR v10 works with converted precisions (i64 -> i32)
auto data2 = input_tensor2.data<int32_t>();
// Fill second data ...
ov_tensor_t* input_tensor1 = NULL;
ov_tensor_t* input_tensor2 = NULL;
void* data = NULL;
{
// Get input tensor by index
ov_infer_request_get_input_tensor_by_index(infer_request, 0, &input_tensor1);
// IR v10 works with converted precisions (i64 -> i32)
ov_tensor_data(input_tensor1, &data);
int32_t* data1 = (int32_t*)data;
// Fill first data ...
}
{
// Get input tensor by tensor name
ov_infer_request_get_tensor(infer_request, "data2_t", &input_tensor2);
// IR v10 works with converted precisions (i64 -> i32)
ov_tensor_data(input_tensor2, &data);
int32_t* data2 = (int32_t*)data;
// Fill second data ...
}
ov_tensor_free(input_tensor1);
ov_tensor_free(input_tensor2);
# Get input tensor by index
input_tensor1 = infer_request.get_input_tensor(0)
# Element types, names and layouts are aligned with framework
assert input_tensor1.data.dtype == np.int64
# Fill the first data ...
# Get input tensor by tensor name
input_tensor2 = infer_request.get_tensor("input")
assert input_tensor2.data.dtype == np.int64
# Fill the second data ...
// Get input tensor by index
ov::Tensor input_tensor1 = infer_request.get_input_tensor(0);
// Element types, names and layouts are aligned with framework
auto data1 = input_tensor1.data<int64_t>();
// Fill first data ...
// Get input tensor by tensor name
ov::Tensor input_tensor2 = infer_request.get_tensor("data2_t");
// Element types, names and layouts are aligned with framework
auto data2 = input_tensor2.data<int64_t>();
// Fill second data ...
ov_tensor_t* input_tensor1 = NULL;
ov_tensor_t* input_tensor2 = NULL;
void* data = NULL;
{
// Get input tensor by index
ov_infer_request_get_input_tensor_by_index(infer_request, 0, &input_tensor1);
// Element types, names and layouts are aligned with framework
ov_tensor_data(input_tensor1, &data);
// Fill first data ...
}
{
// Get input tensor by tensor name
ov_infer_request_get_tensor(infer_request, "data2_t", &input_tensor2);
// Element types, names and layouts are aligned with framework
ov_tensor_data(input_tensor2, &data);
// Fill second data ...
}
ov_tensor_free(input_tensor1);
ov_tensor_free(input_tensor2);
# Get input tensor by index
input_tensor1 = infer_request.get_input_tensor(0)
# Element types, names and layouts are aligned with framework
assert input_tensor1.data.dtype == np.int64
# Fill the first data ...
# Get input tensor by tensor name
input_tensor2 = infer_request.get_tensor("input")
assert input_tensor2.data.dtype == np.int64
# Fill the second data ...
// Get input tensor by index
ov::Tensor input_tensor1 = infer_request.get_input_tensor(0);
// Element types, names and layouts are aligned with framework
auto data1 = input_tensor1.data<int64_t>();
// Fill first data ...
// Get input tensor by tensor name
ov::Tensor input_tensor2 = infer_request.get_tensor("data2_t");
// Element types, names and layouts are aligned with framework
auto data2 = input_tensor2.data<int64_t>();
// Fill second data ...
ov_tensor_t* input_tensor1 = NULL;
ov_tensor_t* input_tensor2 = NULL;
void* data = NULL;
{
// Get input tensor by index
ov_infer_request_get_input_tensor_by_index(infer_request, 0, &input_tensor1);
// Element types, names and layouts are aligned with framework
ov_tensor_data(input_tensor1, &data);
// Fill first data ...
}
{
// Get input tensor by tensor name
ov_infer_request_get_tensor(infer_request, "data2_t", &input_tensor2);
// Element types, names and layouts are aligned with framework
ov_tensor_data(input_tensor2, &data);
// Fill second data ...
}
ov_tensor_free(input_tensor1);
ov_tensor_free(input_tensor2);
# Get input tensor by index
input_tensor1 = infer_request.get_input_tensor(0)
# Element types, names and layouts are aligned with framework
assert input_tensor1.data.dtype == np.int64
# Fill the first data ...
# Get input tensor by tensor name
input_tensor2 = infer_request.get_tensor("input")
assert input_tensor2.data.dtype == np.int64
# Fill the second data ...
// Get input tensor by index
ov::Tensor input_tensor1 = infer_request.get_input_tensor(0);
// Element types, names and layouts are aligned with framework
auto data1 = input_tensor1.data<int64_t>();
// Fill first data ...
// Get input tensor by tensor name
ov::Tensor input_tensor2 = infer_request.get_tensor("data2_t");
// Element types, names and layouts are aligned with framework
auto data2 = input_tensor2.data<int64_t>();
// Fill second data ...
ov_tensor_t* input_tensor1 = NULL;
ov_tensor_t* input_tensor2 = NULL;
void* data = NULL;
{
// Get input tensor by index
ov_infer_request_get_input_tensor_by_index(infer_request, 0, &input_tensor1);
// Element types, names and layouts are aligned with framework
ov_tensor_data(input_tensor1, &data);
// Fill first data ...
}
{
// Get input tensor by tensor name
ov_infer_request_get_tensor(infer_request, "data2_t", &input_tensor2);
// Element types, names and layouts are aligned with framework
ov_tensor_data(input_tensor2, &data);
// Fill second data ...
}
ov_tensor_free(input_tensor1);
ov_tensor_free(input_tensor2);
6. Start Inference¶
Inference Engine API
results = infer_request.infer()
infer_request.Infer();
ie_infer_request_infer(infer_request);
# Start async inference on a single infer request
infer_request.async_infer()
# Wait for 1 millisecond
infer_request.wait(1)
# Wait for inference completion
infer_request.wait()
# Demonstrates async pipeline using ExecutableNetwork
results = []
# Callback to process inference results
def callback(output_blobs, _):
# Copy the data from output blobs to numpy array
results_copy = {out_name: out_blob.buffer[:] for out_name, out_blob in output_blobs.items()}
results.append(process_results(results_copy))
# Setting callback for each infer request
for infer_request in exec_network.requests:
infer_request.set_completion_callback(callback, py_data=infer_request.output_blobs)
# Async pipeline is managed by ExecutableNetwork
total_frames = 100
for _ in range(total_frames):
# Wait for at least one free request
exec_network.wait(num_requests=1)
# Get idle id
idle_id = exec_network.get_idle_request_id()
# Start asynchronous inference on idle request
exec_network.start_async(request_id=idle_id, inputs={"data": input_data})
# Wait for all requests to complete
exec_network.wait()
// NOTE: For demonstration purposes we are trying to set callback
// which restarts inference inside one more time, so two inferences happen here
// Start inference without blocking current thread
auto restart_once = true;
infer_request.SetCompletionCallback<std::function<void(InferenceEngine::InferRequest, InferenceEngine::StatusCode)>>(
[&, restart_once](InferenceEngine::InferRequest request, InferenceEngine::StatusCode status) mutable {
if (status != InferenceEngine::OK) {
// Process error code
} else {
// Extract inference result
InferenceEngine::Blob::Ptr output_blob = request.GetBlob(outputs.begin()->first);
// Restart inference if needed
if (restart_once) {
request.StartAsync();
restart_once = false;
}
}
});
infer_request.StartAsync();
// Get inference status immediately
InferenceEngine::StatusCode status = infer_request.Wait(InferenceEngine::InferRequest::STATUS_ONLY);
// Wait for 1 millisecond
status = infer_request.Wait(1);
// Wait for inference completion
infer_request.Wait(InferenceEngine::InferRequest::RESULT_READY);
// NOTE: For demonstration purposes we are trying to set callback
ie_complete_call_back_t callback;
callback.completeCallBackFunc = completion_callback;
callback.args = infer_request;
ie_infer_set_completion_callback(infer_request, &callback);
// Start inference without blocking current thread
ie_infer_request_infer_async(infer_request);
// Wait for 10 milliseconds
IEStatusCode waitStatus = ie_infer_request_wait(infer_request, 10);
// Wait for inference completion
ie_infer_request_wait(infer_request, -1);
API 2.0
results = infer_request.infer()
infer_request.infer();
ov_infer_request_infer(infer_request);
# Start async inference on a single infer request
infer_request.start_async()
# Wait for 1 millisecond
infer_request.wait_for(1)
# Wait for inference completion
infer_request.wait()
# Demonstrates async pipeline using AsyncInferQueue
results = []
def callback(request, frame_id):
# Copy the data from output tensors to numpy array and process it
results_copy = {output: data[:] for output, data in request.results.items()}
results.append(process_results(results_copy, frame_id))
# Create AsyncInferQueue with 4 infer requests
infer_queue = ov.AsyncInferQueue(compiled_model, jobs=4)
# Set callback for each infer request in the queue
infer_queue.set_callback(callback)
total_frames = 100
for i in range(total_frames):
# Wait for at least one available infer request and start asynchronous inference
infer_queue.start_async(input_data, userdata=i)
# Wait for all requests to complete
infer_queue.wait_all()
// NOTE: For demonstration purposes we are trying to set callback
// which restarts inference inside one more time, so two inferences happen here
auto restart_once = true;
infer_request.set_callback([&, restart_once](std::exception_ptr exception_ptr) mutable {
if (exception_ptr) {
// Process the exception or rethrow it.
std::rethrow_exception(exception_ptr);
} else {
// Extract inference result
ov::Tensor output_tensor = infer_request.get_output_tensor();
// Restart inference if needed
if (restart_once) {
infer_request.start_async();
restart_once = false;
}
}
});
// Start inference without blocking current thread
infer_request.start_async();
// Get inference status immediately
bool status = infer_request.wait_for(std::chrono::milliseconds{0});
// Wait for one millisecond
status = infer_request.wait_for(std::chrono::milliseconds{1});
// Wait for inference completion
infer_request.wait();
// NOTE: For demonstration purposes we are trying to set callback
ov_callback_t callback;
callback.callback_func = infer_request_callback;
callback.args = infer_request;
ov_infer_request_set_callback(infer_request, &callback);
// Start inference without blocking current thread
ov_infer_request_start_async(infer_request);
// Wait for inference completion
ov_infer_request_wait(infer_request);
// Wait for 10 milliseconds
ov_infer_request_wait_for(infer_request, 10);
7. Process the Inference Results¶
Inference Engine API
The Inference Engine API processes outputs as they are of the I32 precision (not aligned with the original model):
# Get output blobs mapped to output layers names
output_blobs = infer_request.output_blobs
data = output_blobs["relu"].buffer
# Original I64 precision was converted to I32
assert data.dtype == np.int32
# Process output data
InferenceEngine::Blob::Ptr output_blob = infer_request.GetBlob(outputs.begin()->first);
InferenceEngine::MemoryBlob::Ptr moutput = InferenceEngine::as<InferenceEngine::MemoryBlob>(output_blob);
if (moutput) {
// locked memory holder should be alive all time while access to its
// buffer happens
auto minputHolder = moutput->rmap();
// Original I64 precision was converted to I32
auto data =
minputHolder.as<const InferenceEngine::PrecisionTrait<InferenceEngine::Precision::I32>::value_type*>();
// process output data
}
// get output blob by name
ie_blob_t *output_blob = nullptr;
ie_infer_request_get_blob(infer_request, "output_name", &output_blob);
// get blob buffer
ie_blob_buffer_t out_buffer;
ie_blob_get_buffer(output_blob, &out_buffer);
// get data
float *data = (float *)(out_buffer.buffer);
// process output data
# Get output blobs mapped to output layers names
output_blobs = infer_request.output_blobs
data = output_blobs["relu"].buffer
# Original I64 precision was converted to I32
assert data.dtype == np.int32
# Process output data
InferenceEngine::Blob::Ptr output_blob = infer_request.GetBlob(outputs.begin()->first);
InferenceEngine::MemoryBlob::Ptr moutput = InferenceEngine::as<InferenceEngine::MemoryBlob>(output_blob);
if (moutput) {
// locked memory holder should be alive all time while access to its
// buffer happens
auto minputHolder = moutput->rmap();
// Original I64 precision was converted to I32
auto data =
minputHolder.as<const InferenceEngine::PrecisionTrait<InferenceEngine::Precision::I32>::value_type*>();
// process output data
}
// get output blob by name
ie_blob_t *output_blob = nullptr;
ie_infer_request_get_blob(infer_request, "output_name", &output_blob);
// get blob buffer
ie_blob_buffer_t out_buffer;
ie_blob_get_buffer(output_blob, &out_buffer);
// get data
float *data = (float *)(out_buffer.buffer);
// process output data
# Get output blobs mapped to output layers names
output_blobs = infer_request.output_blobs
data = output_blobs["relu"].buffer
# Original I64 precision was converted to I32
assert data.dtype == np.int32
# Process output data
InferenceEngine::Blob::Ptr output_blob = infer_request.GetBlob(outputs.begin()->first);
InferenceEngine::MemoryBlob::Ptr moutput = InferenceEngine::as<InferenceEngine::MemoryBlob>(output_blob);
if (moutput) {
// locked memory holder should be alive all time while access to its
// buffer happens
auto minputHolder = moutput->rmap();
// Original I64 precision was converted to I32
auto data =
minputHolder.as<const InferenceEngine::PrecisionTrait<InferenceEngine::Precision::I32>::value_type*>();
// process output data
}
// get output blob by name
ie_blob_t *output_blob = nullptr;
ie_infer_request_get_blob(infer_request, "output_name", &output_blob);
// get blob buffer
ie_blob_buffer_t out_buffer;
ie_blob_get_buffer(output_blob, &out_buffer);
// get data
float *data = (float *)(out_buffer.buffer);
// process output data
# Get output blobs mapped to output layers names
output_blobs = infer_request.output_blobs
data = output_blobs["relu"].buffer
# Original I64 precision was converted to I32
assert data.dtype == np.int32
# Process output data
InferenceEngine::Blob::Ptr output_blob = infer_request.GetBlob(outputs.begin()->first);
InferenceEngine::MemoryBlob::Ptr moutput = InferenceEngine::as<InferenceEngine::MemoryBlob>(output_blob);
if (moutput) {
// locked memory holder should be alive all time while access to its
// buffer happens
auto minputHolder = moutput->rmap();
// Original I64 precision was converted to I32
auto data =
minputHolder.as<const InferenceEngine::PrecisionTrait<InferenceEngine::Precision::I32>::value_type*>();
// process output data
}
// get output blob by name
ie_blob_t *output_blob = nullptr;
ie_infer_request_get_blob(infer_request, "output_name", &output_blob);
// get blob buffer
ie_blob_buffer_t out_buffer;
ie_blob_get_buffer(output_blob, &out_buffer);
// get data
float *data = (float *)(out_buffer.buffer);
// process output data
API 2.0
API 2.0 processes outputs as they are of:
- the I32 precision (not aligned with the original model) for OpenVINO IR v10 models, to match the old behavior.
- the I64 precision (aligned with the original model) for OpenVINO IR v11, ONNX, ov::Model, PaddlePaddle and TensorFlow models, to match the new behavior.
# Model has only one output
output_tensor = infer_request.get_output_tensor()
# IR v10 works with converted precisions (i64 -> i32)
assert output_tensor.data.dtype == np.int32
# process output data ...
// model has only one output
ov::Tensor output_tensor = infer_request.get_output_tensor();
// IR v10 works with converted precisions (i64 -> i32)
auto out_data = output_tensor.data<int32_t>();
// process output data
ov_tensor_t* output_tensor = NULL;
void* data = NULL;
// model has only one output
ov_infer_request_get_output_tensor(infer_request, &output_tensor);
// IR v10 works with converted precisions (i64 -> i32)
ov_tensor_data(output_tensor, &data);
int32_t* out_data = (int32_t*)data;
// process output data
ov_tensor_free(output_tensor);
# Model has only one output
output_tensor = infer_request.get_output_tensor()
# Element types, names and layouts are aligned with framework
assert output_tensor.data.dtype == np.int64
# process output data ...
// model has only one output
ov::Tensor output_tensor = infer_request.get_output_tensor();
// Element types, names and layouts are aligned with framework
auto out_data = output_tensor.data<int64_t>();
// process output data
ov_tensor_t* output_tensor = NULL;
void* out_data = NULL;
// model has only one output
ov_infer_request_get_output_tensor(infer_request, &output_tensor);
// Element types, names and layouts are aligned with framework
ov_tensor_data(output_tensor, &out_data);
// process output data
ov_tensor_free(output_tensor);
# Model has only one output
output_tensor = infer_request.get_output_tensor()
# Element types, names and layouts are aligned with framework
assert output_tensor.data.dtype == np.int64
# process output data ...
// model has only one output
ov::Tensor output_tensor = infer_request.get_output_tensor();
// Element types, names and layouts are aligned with framework
auto out_data = output_tensor.data<int64_t>();
// process output data
ov_tensor_t* output_tensor = NULL;
void* out_data = NULL;
// model has only one output
ov_infer_request_get_output_tensor(infer_request, &output_tensor);
// Element types, names and layouts are aligned with framework
ov_tensor_data(output_tensor, &out_data);
// process output data
ov_tensor_free(output_tensor);
# Model has only one output
output_tensor = infer_request.get_output_tensor()
# Element types, names and layouts are aligned with framework
assert output_tensor.data.dtype == np.int64
# process output data ...
// model has only one output
ov::Tensor output_tensor = infer_request.get_output_tensor();
// Element types, names and layouts are aligned with framework
auto out_data = output_tensor.data<int64_t>();
// process output data
ov_tensor_t* output_tensor = NULL;
void* out_data = NULL;
// model has only one output
ov_infer_request_get_output_tensor(infer_request, &output_tensor);
// Element types, names and layouts are aligned with framework
ov_tensor_data(output_tensor, &out_data);
// process output data
ov_tensor_free(output_tensor);