Inference with OpenVINO GenAI#

OpenVINO™ GenAI is a library of pipelines and methods, extending the OpenVINO runtime to work with generative AI models more efficiently. This article provides reference code and guidance on its usage. Note that the base OpenVINO version will not work with these instructions; make sure to install OpenVINO with GenAI.

Here is sample code for several Generative AI use case scenarios. Note that these are very basic examples and may need adjustments for your specific needs, like changing the inference device.

For more extensive instructions and additional options, see the step-by-step chat-bot guide below.

Text-to-Image Generation

OpenVINO GenAI introduces openvino_genai.Text2ImagePipeline for inference of text-to-image models such as Stable Diffusion 1.5, 2.1, XL, LCM, Flux, and more. See the following usage example for reference.

Python

text2image.py

import argparse

import openvino_genai
from PIL import Image


def main():
    """Generate one 512x512 image from a command-line prompt and save it as image.bmp.

    Two positional command-line arguments are read: the model directory and
    the text prompt.
    """
    cli = argparse.ArgumentParser()
    for positional in ('model_dir', 'prompt'):
        cli.add_argument(positional)
    options = cli.parse_args()

    # GPU can be used as well
    target_device = 'CPU'
    pipeline = openvino_genai.Text2ImagePipeline(options.model_dir, target_device)

    generation_settings = {
        'width': 512,
        'height': 512,
        'num_inference_steps': 20,
        'num_images_per_prompt': 1,
    }
    generated = pipeline.generate(options.prompt, **generation_settings)

    # Index 0 selects the first entry of the returned tensor data.
    first_image = generated.data[0]
    Image.fromarray(first_image).save("image.bmp")

lora_text2image.py

import openvino as ov
import openvino_genai

def image_write(path: str, image_tensor: ov.Tensor):
    """Save the first entry of ``image_tensor.data`` as an image file at ``path``."""
    # Pillow is imported inside the function, so it is only required when an image is written.
    from PIL import Image
    first_frame = image_tensor.data[0]
    Image.fromarray(first_frame).save(path)


def main():
    """Generate the same prompt twice: with LoRA adapters applied and without them.

    Command line: ``<models_path> <prompt> [<adapter_path> <alpha> ...]``.
    The two positional arguments are declared on the parser; every remaining
    argument is read as alternating adapter path / alpha values. A trailing
    adapter path without an alpha is ignored.

    The adapted image is written to ``lora.bmp`` and the baseline image to
    ``baseline.bmp``.
    """
    # Imported locally because this sample's own import block does not bring in argparse.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('models_path')
    parser.add_argument('prompt')
    # Arguments not declared above are returned separately as the adapter/alpha list.
    args, adapters = parser.parse_known_args()

    prompt = args.prompt

    device = "CPU"  # GPU, NPU can be used as well
    adapter_config = openvino_genai.AdapterConfig()

    # Multiple LoRA adapters applied simultaneously are supported, parse them all and corresponding alphas from cmd parameters:
    # even positions hold adapter paths, odd positions hold the matching alpha.
    for adapter_path, alpha in zip(adapters[::2], adapters[1::2]):
        adapter_config.add(openvino_genai.Adapter(adapter_path), float(alpha))

    # LoRA adapters passed to the constructor will be activated by default in next generates
    pipe = openvino_genai.Text2ImagePipeline(args.models_path, device, adapters=adapter_config)

    print("Generating image with LoRA adapters applied, resulting image will be in lora.bmp")
    image = pipe.generate(prompt,
                          width=512,
                          height=896,
                          num_inference_steps=20,
                          rng_seed=42)

    image_write("lora.bmp", image)
    print("Generating image without LoRA adapters applied, resulting image will be in baseline.bmp")
    # Same size, step count and seed as above, so the two runs differ only in the adapters argument.
    image = pipe.generate(prompt,
                          # passing adapters in generate overrides adapters set in the constructor; openvino_genai.AdapterConfig() means no adapters
                          adapters=openvino_genai.AdapterConfig(),
                          width=512,
                          height=896,
                          num_inference_steps=20,
                          rng_seed=42)
    image_write("baseline.bmp", image)

For more information, refer to the Python sample

C++

text2image.cpp

#include "openvino/genai/image_generation/text2image_pipeline.hpp"

#include "imwrite.hpp"

int32_t main(int32_t argc, char* argv[]) try {
    OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " <MODEL_DIR> '<PROMPT>'");

    const std::string models_path = argv[1], prompt = argv[2];
    const std::string device = "CPU";  // GPU can be used as well

    ov::genai::Text2ImagePipeline pipe(models_path, device);
    ov::Tensor image = pipe.generate(prompt,
        ov::genai::width(512),
        ov::genai::height(512),
        ov::genai::num_inference_steps(20),
        ov::genai::num_images_per_prompt(1));

    // writes `num_images_per_prompt` images by pattern name
    imwrite("image_%d.bmp", image, true);

    return EXIT_SUCCESS;
} catch (const std::exception& error) {
    try {
        std::cerr << error.what() << '\n';
    } catch (const std::ios_base::failure&) {}
    return EXIT_FAILURE;
} catch (...) {
    try {
        std::cerr << "Non-exception object thrown\n";
    } catch (const std::ios_base::failure&) {}
    return EXIT_FAILURE;
}

lora_text2image.cpp

#include "openvino/genai/image_generation/text2image_pipeline.hpp"

#include "imwrite.hpp"

int32_t main(int32_t argc, char* argv[]) try {
    OPENVINO_ASSERT(argc >= 3 && (argc - 3) % 2 == 0, "Usage: ", argv[0], " <MODEL_DIR> '<PROMPT>' [<LORA_SAFETENSORS> <ALPHA> ...]]");

    const std::string models_path = argv[1], prompt = argv[2];
    const std::string device = "CPU";  // GPU, NPU can be used as well

    ov::genai::AdapterConfig adapter_config;
    // Multiple LoRA adapters applied simultaneously are supported, parse them all and corresponding alphas from cmd parameters:
    for(size_t i = 0; i < (argc - 3)/2; ++i) {
        ov::genai::Adapter adapter(argv[3 + 2*i]);
        float alpha = std::atof(argv[3 + 2*i + 1]);
        adapter_config.add(adapter, alpha);
    }

    // LoRA adapters passed to the constructor will be activated by default in next generates
    ov::genai::Text2ImagePipeline pipe(models_path, device, ov::genai::adapters(adapter_config));

    std::cout << "Generating image with LoRA adapters applied, resulting image will be in lora.bmp\n";
    ov::Tensor image = pipe.generate(prompt,
        ov::genai::width(512),
        ov::genai::height(896),
        ov::genai::num_inference_steps(20),
        ov::genai::rng_seed(42));
    imwrite("lora.bmp", image, true);

    std::cout << "Generating image without LoRA adapters applied, resulting image will be in baseline.bmp\n";
    image = pipe.generate(prompt,
        ov::genai::adapters(),  // passing adapters in generate overrides adapters set in the constructor; adapters() means no adapters
        ov::genai::width(512),
        ov::genai::height(896),
        ov::genai::num_inference_steps(20),
        ov::genai::rng_seed(42));
    imwrite("baseline.bmp", image, true);

    return EXIT_SUCCESS;
} catch (const std::exception& error) {
    try {
        std::cerr << error.what() << '\n';
    } catch (const std::ios_base::failure&) {}
    return EXIT_FAILURE;
} catch (...) {
    try {
        std::cerr << "Non-exception object thrown\n";
    } catch (const std::ios_base::failure&) {}
    return EXIT_FAILURE;
}

For more information, refer to the C++ sample

Speech Recognition

The application performs inference on Whisper speech recognition models. The samples use the WhisperPipeline class and take audio files in WAV format at a sampling rate of 16 kHz as input.

Python

import openvino_genai
import librosa


def read_wav(filepath):
    """Load ``filepath`` with librosa at a 16000 Hz sampling rate and return the samples as a list."""
    # librosa.load also returns the sampling rate, which is not needed here.
    samples, _ = librosa.load(filepath, sr=16000)
    return samples.tolist()


def infer(model_dir: str, wav_file_path: str):
    """Transcribe a WAV file with ``WhisperPipeline`` and print the result.

    Args:
        model_dir: Directory passed to ``openvino_genai.WhisperPipeline``.
        wav_file_path: Audio file that is loaded through ``read_wav``.

    The whole result is printed first, followed by one line per entry of
    ``result.chunks`` with its start/end timestamps and text.
    """
    # GPU or NPU can be used as well.
    target_device = "CPU"
    pipeline = openvino_genai.WhisperPipeline(model_dir, target_device)

    generation_options = {
        "max_new_tokens": 100,
        "language": "<|en|>",
        "task": "transcribe",
        "return_timestamps": True,
    }
    # The pipeline expects normalized audio with a sampling rate of 16kHz.
    transcription = pipeline.generate(read_wav(wav_file_path), **generation_options)

    print(transcription)

    for segment in transcription.chunks:
        print(f"timestamps: [{segment.start_ts}, {segment.end_ts}] text: {segment.text}")

For more information, refer to the Python sample.

C++

#include "audio_utils.hpp"
#include "openvino/genai/whisper_pipeline.hpp"

// Transcribes a WAV file with WhisperPipeline and prints the result, followed by one line per
// timestamped chunk.
// Arguments: <MODEL_DIR> "<WAV_FILE_PATH>".
// Returns EXIT_SUCCESS, or EXIT_FAILURE when an exception escapes the body.
int main(int argc, char* argv[]) try {
    if (3 > argc) {
        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> \"<WAV_FILE_PATH>\"");
    }

    std::filesystem::path models_path = argv[1];
    std::string wav_file_path = argv[2];
    std::string device = "CPU";  // GPU or NPU can be used as well.

    ov::genai::WhisperPipeline pipeline(models_path, device);

    // Start from the generation_config.json in the model directory, then override selected fields.
    ov::genai::WhisperGenerationConfig config(models_path / "generation_config.json");
    config.max_new_tokens = 100;
    config.language = "<|en|>";
    config.task = "transcribe";
    config.return_timestamps = true;

    // The pipeline expects normalized audio with a sampling rate of 16kHz.
    ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path);
    auto result = pipeline.generate(raw_speech, config);

    std::cout << result << "\n";

    // The chunks are only read, so they are bound by const reference.
    for (const auto& chunk : *result.chunks) {
        std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n";
    }

    // Explicit success status, matching the other samples on this page.
    return EXIT_SUCCESS;
} catch (const std::exception& error) {
    // Reporting is best effort: a failure while writing to stderr is ignored.
    try {
        std::cerr << error.what() << '\n';
    } catch (const std::ios_base::failure&) {
    }
    return EXIT_FAILURE;
} catch (...) {
    try {
        std::cerr << "Non-exception object thrown\n";
    } catch (const std::ios_base::failure&) {
    }
    return EXIT_FAILURE;
}

For more information, refer to the C++ sample.

Using GenAI with Vision Language Models

OpenVINO GenAI introduces the openvino_genai.VLMPipeline pipeline for inference of multimodal text-generation Vision Language Models (VLMs). With a text prompt and an image as input, VLMPipeline can generate text using models such as LLaVA or MiniCPM-V. See the chat scenario presented in the samples below:

Python

import numpy as np
import openvino_genai
from PIL import Image
from openvino import Tensor
from pathlib import Path


def streamer(subword: str) -> bool:
    """Print a generated sub-word immediately, without a trailing newline.

    Args:
        subword: Piece of generated text handed to this callback.

    Returns:
        Always ``False``. Per the OpenVINO GenAI streamer documentation the
        returned flag tells the pipeline whether generation should stop, so
        ``False`` lets it continue.
    """
    print(subword, end='', flush=True)
    # Returned explicitly so the result matches the declared ``bool`` type
    # instead of falling through to an implicit ``None``.
    return False


def read_image(path: str) -> Tensor:
    """Open the image at ``path``, convert it to RGB and wrap it in a ``Tensor``.

    The pixel data is reshaped to (1, height, width, 3) and stored as uint8.
    """
    rgb = Image.open(path).convert("RGB")
    # PIL reports the size as (width, height); the array layout is height-first.
    width, height = rgb.size
    pixels = np.array(rgb.getdata())
    batched = pixels.reshape(1, height, width, 3)
    return Tensor(batched.astype(np.uint8))


def read_images(path: str) -> list[Tensor]:
    """Read one image, or every entry of a directory, into a list of tensors.

    When ``path`` is a directory, its entries are read in sorted order;
    otherwise ``path`` itself is read as a single image.
    """
    location = Path(path)
    if not location.is_dir():
        return [read_image(path)]
    tensors = []
    for child in sorted(location.iterdir()):
        tensors.append(read_image(str(child)))
    return tensors


def infer(model_dir: str, image_dir: str):
    """Run an interactive chat with ``VLMPipeline`` over the given image(s).

    Args:
        model_dir: Directory passed to ``openvino_genai.VLMPipeline``.
        image_dir: Image file or directory of images, loaded via ``read_images``.

    Questions are read from standard input until end-of-file; each answer is
    streamed through ``streamer`` with at most 100 new tokens.
    """
    images = read_images(image_dir)
    # GPU can be used as well.
    target_device = 'CPU'
    # A compile cache directory is requested only when running on GPU.
    pipeline_properties = {"CACHE_DIR": "vlm_cache"} if target_device == "GPU" else {}
    vlm = openvino_genai.VLMPipeline(model_dir, target_device, **pipeline_properties)

    generation_config = openvino_genai.GenerationConfig()
    generation_config.max_new_tokens = 100

    vlm.start_chat()
    # The images are passed only together with the first question.
    question = input('question:\n')
    vlm.generate(question, images=images, generation_config=generation_config, streamer=streamer)

    follow_up_banner = "\n----------\n" "question:\n"
    while True:
        try:
            question = input(follow_up_banner)
        except EOFError:
            # End of input finishes the conversation.
            break
        vlm.generate(question, generation_config=generation_config, streamer=streamer)
    vlm.finish_chat()

For more information, refer to the Python sample.

C++

#include "load_image.hpp"
#include <openvino/genai/visual_language/pipeline.hpp>
#include <filesystem>

// Streams a generated sub-word to stdout and flushes it right away.
// Returns true when std::cout is in a failed state afterwards, false otherwise.
bool print_subword(std::string&& subword) {
    std::cout << subword;
    std::cout.flush();
    return std::cout.fail();
}

// Runs an interactive chat with VLMPipeline over one image or a directory of images.
// Arguments: <MODEL_DIR> <IMAGE_FILE OR DIR_WITH_IMAGES>.
// Questions are read line by line from stdin until end of input; answers are streamed
// through print_subword.
// Returns EXIT_SUCCESS, or EXIT_FAILURE when an exception escapes the body.
int main(int argc, char* argv[]) try {
    if (3 != argc) {
        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> <IMAGE_FILE OR DIR_WITH_IMAGES>");
    }

    std::vector<ov::Tensor> rgbs = utils::load_images(argv[2]);

    std::string device = "CPU";  // GPU can be used as well.
    // A compile cache directory is requested only when running on GPU.
    ov::AnyMap enable_compile_cache;
    if ("GPU" == device) {
        enable_compile_cache.insert({ov::cache_dir("vlm_cache")});
    }
    ov::genai::VLMPipeline pipe(argv[1], device, enable_compile_cache);

    ov::genai::GenerationConfig generation_config;
    generation_config.max_new_tokens = 100;

    std::string prompt;

    pipe.start_chat();
    std::cout << "question:\n";

    // The images are passed only together with the first question.
    std::getline(std::cin, prompt);
    pipe.generate(prompt,
                  ov::genai::images(rgbs),
                  ov::genai::generation_config(generation_config),
                  ov::genai::streamer(print_subword));
    std::cout << "\n----------\n"
        "question:\n";
    while (std::getline(std::cin, prompt)) {
        pipe.generate(prompt,
                      ov::genai::generation_config(generation_config),
                      ov::genai::streamer(print_subword));
        std::cout << "\n----------\n"
            "question:\n";
    }
    pipe.finish_chat();

    // Explicit success status, matching the other samples on this page.
    return EXIT_SUCCESS;
} catch (const std::exception& error) {
    // Reporting is best effort: a failure while writing to stderr is ignored.
    try {
        std::cerr << error.what() << '\n';
    } catch (const std::ios_base::failure&) {}
    return EXIT_FAILURE;
} catch (...) {
    try {
        std::cerr << "Non-exception object thrown\n";
    } catch (const std::ios_base::failure&) {}
    return EXIT_FAILURE;
}

For more information, refer to the C++ sample

Comparing with Hugging Face Results#

You can compare the results of the above example with those generated by Hugging Face models by running the following code:

Python

from transformers import AutoTokenizer, AutoModelForCausalLM
import openvino_genai as ov_genai

max_new_tokens = 32
prompt = 'table is made of'

# Reference run with the Hugging Face checkpoint.
hf_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(hf_model_id)
model = AutoModelForCausalLM.from_pretrained(hf_model_id)

encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=False)
# Sampling is disabled for the reference generation.
hf_encoded_output = model.generate(
    encoded_prompt,
    max_new_tokens=max_new_tokens,
    do_sample=False,
)
# Skip the prompt tokens so that only the generated continuation is decoded.
prompt_length = encoded_prompt.shape[1]
hf_output = tokenizer.decode(hf_encoded_output[0, prompt_length:])
print(f'hf_output: {hf_output}')

# Same prompt and token budget through the OpenVINO GenAI pipeline.
pipe = ov_genai.LLMPipeline('TinyLlama-1.1B-Chat-v1.0')
ov_output = pipe.generate(prompt, max_new_tokens=max_new_tokens)
print(f'ov_output: {ov_output}')

# Both runs are expected to produce the same text.
assert hf_output == ov_output

GenAI API#

The use cases described here rely on the following OpenVINO GenAI API classes:

generation_config - defines a configuration class for text generation, enabling customization of the generation process such as the maximum length of the generated text, whether to ignore end-of-sentence tokens, and the specifics of the decoding strategy (greedy, beam search, or multinomial sampling).
llm_pipeline - provides classes and utilities for processing inputs, text generation, and managing outputs with configurable options.
streamer_base - an abstract base class for creating streamers.
tokenizer - the tokenizer class for text encoding and decoding.

Learn more from the GenAI API reference.

Additional Resources#

OpenVINO GenAI Repo
OpenVINO GenAI Samples
A Jupyter notebook demonstrating Visual-language assistant with MiniCPM-V2 and OpenVINO
OpenVINO Tokenizers
Neural Network Compression Framework

Inference with OpenVINO GenAI#

Chat-bot use case - step by step#

Running the model#

Streaming the Output#

Optimizing Generation with Grouped Beam Search#

Efficient Text Generation via Speculative Decoding#

Comparing with Hugging Face Results#

GenAI API#

Additional Resources#