Synchronous Inference Request¶
InferRequest class functionality:

- Allocate input and output tensors needed for a backend-dependent network inference.
- Define functions for the inference process stages (for example, preprocess, upload, infer, download, postprocess). These functions can later be used to define an execution pipeline during the Asynchronous Inference Request implementation (see the sketch after this list).
- Call inference stages one by one synchronously.
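For illustration only, the sketch below shows, in a heavily simplified and hypothetical form, how such stage functions can be grouped into executor/task pairs that an asynchronous request later runs as a pipeline. Executor, make_pipeline, and the stage functions are placeholders for this sketch, not OpenVINO API; the actual stage methods of the template plugin (infer_preprocess, start_pipeline, wait_pipeline, infer_postprocess) are shown in the class declaration below.

#include <functional>
#include <utility>
#include <vector>

// Illustrative sketch only (not OpenVINO API): per-stage functions are grouped into
// {executor, task} pairs so that an asynchronous request can run them as a pipeline.
struct Executor {};  // stands in for a task executor provided by the plugin

void preprocess()  { /* validate user tensors and convert them to backend tensors */ }
void infer()       { /* run the backend computation */ }
void postprocess() { /* convert backend tensors back to user tensors */ }

using Stage = std::pair<Executor*, std::function<void()>>;

// Build a two-stage pipeline: the first task starts the inference,
// the second one finishes it and publishes the results.
std::vector<Stage> make_pipeline(Executor* request_executor, Executor* wait_executor) {
    return {{request_executor, [] { preprocess(); infer(); }},
            {wait_executor,    [] { postprocess(); }}};
}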
InferRequest Class¶
OpenVINO Plugin API provides the ov::ISyncInferRequest interface, which should be used as the base class for a synchronous inference request implementation. Based on that, a declaration of a synchronous request class can look as follows:
class InferRequest : public ov::ISyncInferRequest {
public:
explicit InferRequest(const std::shared_ptr<const ov::template_plugin::CompiledModel>& compiled_model);
~InferRequest();
void infer() override;
std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override;
std::vector<ov::ProfilingInfo> get_profiling_info() const override;
// pipeline methods-stages which are used in async infer request implementation and assigned to particular executor
void infer_preprocess();
void start_pipeline();
void wait_pipeline();
void infer_postprocess();
void cancel();
void set_tensors_impl(const ov::Output<const ov::Node> port,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors) override;
private:
std::shared_ptr<const CompiledModel> get_template_model() const;
enum { Preprocess, Postprocess, StartPipeline, WaitPipeline, numOfStages };
std::array<openvino::itt::handle_t, numOfStages> m_profiling_task;
// for performance counters
std::array<std::chrono::duration<float, std::micro>, numOfStages> m_durations;
std::vector<ov::Tensor> m_backend_input_tensors;
std::vector<ov::Tensor> m_backend_output_tensors;
std::shared_ptr<ov::runtime::Executable> m_executable;
ov::EvaluationContext m_eval_context;
std::vector<ov::SoPtr<ov::IVariableState>> m_variable_states;
};
Class Fields¶
The example class has several fields:
- m_profiling_task - array of the std::array<openvino::itt::handle_t, numOfStages> type. Defines names for pipeline stages. Used to profile an inference pipeline execution with the Intel® Instrumentation and Tracing Technology (ITT).
- m_durations - array of durations of each pipeline stage.
- Backend-specific fields:
  - m_backend_input_tensors - input backend tensors.
  - m_backend_output_tensors - output backend tensors.
  - m_executable - an executable object / backend computational graph.
  - m_eval_context - an evaluation context to save backend states after the inference.
  - m_variable_states - a vector of variable states.
InferRequest Constructor¶
The constructor initializes helper fields and calls methods which allocate tensors:
ov::template_plugin::InferRequest::InferRequest(const std::shared_ptr<const ov::template_plugin::CompiledModel>& model)
: ov::ISyncInferRequest(model) {
// TODO: allocate infer request device and host buffers if needed, fill actual list of profiling tasks
auto requestID = std::to_string(get_template_model()->m_request_id.fetch_add(1));
std::string name = get_template_model()->m_model->get_friendly_name() + "_Req" + requestID;
m_profiling_task = {
openvino::itt::handle("Template" + std::to_string(get_template_model()->m_cfg.device_id) + "_" + name +
"_Preprocess"),
openvino::itt::handle("Template" + std::to_string(get_template_model()->m_cfg.device_id) + "_" + name +
"_Postprocess"),
openvino::itt::handle("Template" + std::to_string(get_template_model()->m_cfg.device_id) + "_" + name +
"_StartPipeline"),
openvino::itt::handle("Template" + std::to_string(get_template_model()->m_cfg.device_id) + "_" + name +
"_WaitPipline"),
};
m_executable = get_template_model()->get_template_plugin()->m_backend->compile(get_template_model()->m_model);
// Allocate plugin backend specific memory handles
m_backend_input_tensors.resize(get_inputs().size());
m_backend_output_tensors.resize(get_outputs().size());
// Allocate input/output tensors
for (const auto& input : get_inputs()) {
allocate_tensor(input, [input](ov::SoPtr<ov::ITensor>& tensor) {
// Can add a check to avoid double work in case of shared tensors
allocate_tensor_impl(tensor,
input.get_element_type(),
input.get_partial_shape().is_dynamic() ? ov::Shape{0} : input.get_shape());
});
}
for (const auto& output : get_outputs()) {
allocate_tensor(output, [output](ov::SoPtr<ov::ITensor>& tensor) {
// Can add a check to avoid double work in case of shared tensors
allocate_tensor_impl(tensor,
output.get_element_type(),
output.get_partial_shape().is_dynamic() ? ov::Shape{0} : output.get_shape());
});
}
// Save variable states
ov::op::util::VariableContext variable_context;
for (const auto& variable : m_executable->get_model()->get_variables()) {
if (!variable_context.get_variable_value(variable)) {
auto shape = variable->get_info().data_shape.is_dynamic() ? ov::Shape{0}
: variable->get_info().data_shape.to_shape();
ov::Tensor tensor = ov::Tensor(variable->get_info().data_type, shape);
variable_context.set_variable_value(variable, std::make_shared<ov::op::util::VariableValue>(tensor));
}
auto state = std::make_shared<VariableState>(variable->get_info().variable_id,
variable_context.get_variable_value(variable));
m_variable_states.emplace_back(state);
}
m_eval_context.emplace("VariableContext", variable_context);
}
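The allocate_tensor callbacks above delegate to an allocate_tensor_impl helper whose definition is not shown in this snippet. A minimal sketch of such a helper, assuming it only needs to allocate a host tensor when none exists (or when the element type changed) and otherwise reshape the existing one, could look like this:

// Assumed sketch of an allocate_tensor_impl helper (relies on ov::make_tensor from
// the OpenVINO developer API): reuse the existing tensor when possible, otherwise
// allocate a new host tensor with the requested element type and shape.
static void allocate_tensor_impl(ov::SoPtr<ov::ITensor>& tensor,
                                 const ov::element::Type& element_type,
                                 const ov::Shape& shape) {
    if (!tensor || tensor->get_element_type() != element_type) {
        tensor = ov::make_tensor(element_type, shape);
    } else {
        tensor->set_shape(shape);
    }
}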
Note
Use the input/output information from the compiled model to determine the shape and element type of tensors, which you can set with ov::InferRequest::set_tensor and get with ov::InferRequest::get_tensor. A plugin uses these hints to determine its internal layouts and element types for input and output tensors, if needed.
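From the application side, these tensors are exchanged through the public API. A hypothetical usage sketch (the model path and the TEMPLATE device name are placeholders, and a static input shape is assumed):

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // Placeholders: any model and any device served by this plugin would do.
    ov::CompiledModel compiled_model = core.compile_model("model.xml", "TEMPLATE");
    ov::InferRequest request = compiled_model.create_infer_request();

    // Use the compiled model input to learn the expected element type and shape.
    auto input = compiled_model.input();
    ov::Tensor user_input(input.get_element_type(), input.get_shape());
    request.set_tensor(input, user_input);   // hand a user tensor to the plugin

    request.infer();                         // triggers the synchronous pipeline below

    ov::Tensor result = request.get_tensor(compiled_model.output());
    return 0;
}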
~InferRequest Destructor¶
The destructor can contain plugin-specific logic to finish and destroy the infer request.
ov::template_plugin::InferRequest::~InferRequest() = default;
set_tensors_impl()¶
The method sets batched tensors for the given input port, in case the plugin supports them.
void ov::template_plugin::InferRequest::set_tensors_impl(const ov::Output<const ov::Node> port,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors) {
for (const auto& input : get_inputs()) {
if (input == port) {
m_batched_tensors[input.get_tensor_ptr()] = tensors;
return;
}
}
OPENVINO_THROW("Cannot find input tensors for port ", port);
}
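On the application side, batched tensors reach this method through ov::InferRequest::set_tensors. A hypothetical sketch, assuming the compiled model has a single input with batch size 4; the per-element shape below is a placeholder:

#include <openvino/openvino.hpp>
#include <vector>

int main() {
    ov::Core core;
    // Placeholders: a model whose input has batch size 4, compiled for this plugin.
    ov::CompiledModel compiled_model = core.compile_model("model.xml", "TEMPLATE");
    ov::InferRequest request = compiled_model.create_infer_request();

    auto input = compiled_model.input();
    std::vector<ov::Tensor> batch;
    for (size_t i = 0; i < 4; ++i) {
        // One tensor per batch element; the shape is a placeholder for this sketch.
        batch.emplace_back(input.get_element_type(), ov::Shape{1, 3, 224, 224});
    }
    request.set_tensors(input, batch);  // delivered to the plugin as batched tensors
    request.infer();
    return 0;
}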
query_state()¶
The method returns variable states from the model.
std::vector<ov::SoPtr<ov::IVariableState>> ov::template_plugin::InferRequest::query_state() const {
return m_variable_states;
}
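From the user API, these states are accessed through ov::InferRequest::query_state. A brief sketch, assuming the request was created from a stateful model:

#include <iostream>
#include <openvino/openvino.hpp>

// Sketch: inspect and reset the variable states of a stateful model.
void dump_and_reset_states(ov::InferRequest& request) {
    auto states = request.query_state();
    for (auto& state : states) {
        std::cout << state.get_name() << std::endl;  // variable_id reported by the plugin
        ov::Tensor value = state.get_state();        // current state tensor
        state.reset();                               // restore the initial state value
    }
}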
infer()¶
The method calls the actual pipeline stages synchronously. Inside the method, the plugin should check input/output tensors, move external tensors to the backend, and run the inference.
void ov::template_plugin::InferRequest::infer() {
// TODO: fill with actual list of pipeline stages, which are executed synchronously for sync infer requests
infer_preprocess();
start_pipeline();
wait_pipeline(); // does nothing in current implementation
infer_postprocess();
}
1. infer_preprocess()¶
Below is the code of the infer_preprocess() method. It checks user input/output tensors and demonstrates the conversion from a user tensor to a backend-specific representation:
void ov::template_plugin::InferRequest::infer_preprocess() {
OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, m_profiling_task[Preprocess]);
auto start = Time::now();
convert_batched_tensors();
check_tensors();
// Allocate backend tensors
OPENVINO_ASSERT(get_inputs().size() == m_backend_input_tensors.size());
for (size_t i = 0; i < get_inputs().size(); i++) {
auto tensor = get_tensor(get_inputs()[i]);
if (std::dynamic_pointer_cast<ov::IRemoteTensor>(tensor._ptr)) {
auto vector_tensor = std::dynamic_pointer_cast<ov::template_plugin::VectorImpl>(tensor._ptr);
OPENVINO_ASSERT(vector_tensor, "Template plugin supports only VectorTensor with remote context.");
auto element_type = vector_tensor->get_element_type();
void* data = vector_tensor->get_data();
OPENVINO_ASSERT(data != nullptr);
// Create backend tensor
m_backend_input_tensors[i] =
get_template_model()->get_template_plugin()->m_backend->create_tensor(element_type,
vector_tensor->get_shape(),
data);
} else if (tensor->is_continuous()) {
// No ROI extraction is needed
m_backend_input_tensors[i] =
get_template_model()->get_template_plugin()->m_backend->create_tensor(tensor->get_element_type(),
tensor->get_shape(),
tensor->data());
} else {
OPENVINO_ASSERT(tensor->get_element_type().bitwidth() % 8 == 0,
"Template plugin: Unsupported ROI tensor with element type having ",
std::to_string(tensor->get_element_type().bitwidth()),
" bits size");
ov::Shape shape = tensor->get_shape();
// Perform manual extraction of ROI tensor
// Basic implementation doesn't take axis order into account `desc.getBlockingDesc().getOrder()`
// Performance of manual extraction is not optimal, but it is ok for template implementation
m_backend_input_tensors[i] =
get_template_model()->get_template_plugin()->m_backend->create_tensor(tensor->get_element_type(),
tensor->get_shape());
tensor->copy_to(ov::get_tensor_impl(m_backend_input_tensors[i])._ptr);
}
}
// Tensors can be dynamic, so in this case we need to allocate tensors with right shape
OPENVINO_ASSERT(get_outputs().size() == m_backend_output_tensors.size());
for (size_t i = 0; i < get_outputs().size(); i++) {
const auto& result = get_template_model()->m_model->get_results()[i];
if (result->get_output_partial_shape(0).is_dynamic()) {
m_backend_output_tensors[i] = get_template_model()->get_template_plugin()->m_backend->create_tensor();
continue;
}
auto tensor = make_tensor(get_tensor(get_outputs()[i]));
if (tensor.is_continuous() && !tensor.is<ov::RemoteTensor>())
m_backend_output_tensors[i] =
get_template_model()->get_template_plugin()->m_backend->create_tensor(tensor.get_element_type(),
tensor.get_shape(),
tensor.data());
else
m_backend_output_tensors[i] =
get_template_model()->get_template_plugin()->m_backend->create_tensor(tensor.get_element_type(),
tensor.get_shape());
}
m_durations[Preprocess] = Time::now() - start;
}
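The non-continuous branch above is typically taken for region-of-interest (ROI) tensors. The sketch below shows how such a tensor can be produced on the user side (shapes and coordinates are placeholders); setting it as an input makes the plugin copy the data into a dense backend tensor:

#include <openvino/openvino.hpp>

int main() {
    // A full frame and a cropped ROI view of it; the view shares memory with the
    // original tensor and is therefore not continuous.
    ov::Tensor full(ov::element::f32, ov::Shape{1, 3, 480, 640});
    ov::Tensor roi(full, ov::Coordinate{0, 0, 100, 100}, ov::Coordinate{1, 3, 324, 324});
    bool continuous = roi.is_continuous();  // false: triggers the manual extraction branch
    (void)continuous;
    return 0;
}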
2. start_pipeline()¶
Executes the pipeline synchronously using the m_executable object:
void ov::template_plugin::InferRequest::start_pipeline() {
OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, m_profiling_task[StartPipeline])
auto start = Time::now();
m_executable->call(m_backend_output_tensors,
m_backend_input_tensors,
m_eval_context,
get_template_model()->m_cfg.perf_count);
m_durations[StartPipeline] = Time::now() - start;
}
3. wait_pipeline()¶
Waits for the pipeline to finish in case of asynchronous plugin execution:
void ov::template_plugin::InferRequest::wait_pipeline() {
OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, m_profiling_task[WaitPipeline])
auto start = Time::now();
// TODO: Wait pipeline using driver API or other synchronizations methods
// NOTE: not used in the current implementation, since `start_pipeline` executes the pipeline synchronously
m_durations[WaitPipeline] = Time::now() - start;
}
4. infer_postprocess()¶
Converts backend-specific tensors to the tensors passed by the user:
void ov::template_plugin::InferRequest::infer_postprocess() {
OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, m_profiling_task[Postprocess]);
auto start = Time::now();
OPENVINO_ASSERT(get_outputs().size() == m_backend_output_tensors.size());
for (size_t i = 0; i < get_outputs().size(); i++) {
const auto& result = get_template_model()->m_model->get_results()[i];
auto host_tensor = m_backend_output_tensors[i];
auto tensor = get_tensor(get_outputs()[i]);
if (result->get_output_partial_shape(0).is_dynamic()) {
ov::Output<const ov::Node> output{result->output(0).get_node(), result->output(0).get_index()};
allocate_tensor(output, [host_tensor](ov::SoPtr<ov::ITensor>& tensor) {
allocate_tensor_impl(tensor, host_tensor.get_element_type(), host_tensor.get_shape());
host_tensor.copy_to(ov::make_tensor(tensor));
});
} else if (!tensor->is_continuous()) {
host_tensor.copy_to(ov::make_tensor(tensor));
} else if (std::dynamic_pointer_cast<ov::IRemoteTensor>(tensor._ptr)) {
auto vector_tensor = std::dynamic_pointer_cast<ov::template_plugin::VectorImpl>(tensor._ptr);
OPENVINO_ASSERT(vector_tensor, "Template plugin supports only VectorTensor with remote context.");
void* data = vector_tensor->get_data();
// Copy to vector
std::memcpy(data, host_tensor.data(), tensor->get_byte_size());
}
}
m_durations[Postprocess] = Time::now() - start;
}
get_profiling_info()¶
The method returns the profiling information measured during the execution of the pipeline stages:
std::vector<ov::ProfilingInfo> ov::template_plugin::InferRequest::get_profiling_info() const {
std::vector<ov::ProfilingInfo> info;
const auto fill_profiling_info = [](const std::string& name,
const std::chrono::duration<float, std::micro>& time) -> ov::ProfilingInfo {
ov::ProfilingInfo p_info;
p_info.status = ov::ProfilingInfo::Status::EXECUTED;
p_info.node_name = name;
p_info.cpu_time = p_info.real_time = std::chrono::duration_cast<std::chrono::milliseconds>(time);
return p_info;
};
info.emplace_back(fill_profiling_info("input preprocessing", m_durations[Preprocess]));
info.emplace_back(fill_profiling_info("execution time", m_durations[StartPipeline]));
auto template_model = get_template_model();
for (const auto& op : template_model->get_runtime_model()->get_ops()) {
auto rt_info = op->get_rt_info();
const auto& it = rt_info.find(ov::runtime::interpreter::PERF_COUNTER_NAME);
OPENVINO_ASSERT(it != rt_info.end(), "Operation ", op, " doesn't contain performance counter");
auto counter = it->second.as<std::shared_ptr<ov::runtime::interpreter::PerfCounter>>();
info.emplace_back(fill_profiling_info(op->get_friendly_name(), counter->duration()));
}
info.emplace_back(fill_profiling_info("output postprocessing", m_durations[Postprocess]));
return info;
}
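On the application side, this data is retrieved with ov::InferRequest::get_profiling_info after an inference (profiling has to be enabled in the plugin configuration, for example via ov::enable_profiling). A brief sketch:

#include <iostream>
#include <openvino/openvino.hpp>

// Sketch: print the measured time of every reported stage and operation.
void print_profiling(const ov::InferRequest& request) {
    for (const ov::ProfilingInfo& info : request.get_profiling_info()) {
        std::cout << info.node_name << ": " << info.real_time.count() << " us" << std::endl;
    }
}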
cancel()¶
This plugin-specific method allows the AsyncInferRequest to interrupt the synchronous execution:
void ov::template_plugin::InferRequest::cancel() {
m_executable->cancel();
}
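From the user API, cancellation is requested with ov::InferRequest::cancel on an in-flight asynchronous run; the call eventually reaches the plugin method above through the AsyncInferRequest. A brief sketch:

#include <openvino/openvino.hpp>

// Sketch: start an asynchronous inference and cancel it before completion.
void run_and_cancel(ov::InferRequest& request) {
    request.start_async();
    // ... the application decides the result is no longer needed ...
    request.cancel();
    // A subsequent wait() may report the cancellation (e.g. by throwing ov::Cancelled),
    // depending on how far the pipeline has progressed.
}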
The next step in the plugin library implementation is the Asynchronous Inference Request class.