Generative AI Use Cases#
Introduction#
Besides the TensorFlow Serving API (/v1) and KServe API (/v2) frontends, the model server supports a range of endpoints for generative use cases (/v3). They are extensible using MediaPipe graphs.
Currently supported endpoints are:
OpenAI compatible endpoints:
- chat/completions
- completions
- embeddings
- images/generations
Cohere compatible endpoint:
- rerank
OpenAI API Clients#
When creating a Python-based client application, you can use the OpenAI client library - openai. Alternatively, it is possible to use just a curl command or the requests Python library.
Install the Package#
pip3 install openai
pip3 install requests
Request chat completions with unary calls#
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v3", api_key="unused")
response = client.chat.completions.create(
    model="meta-llama/Llama-2-7b-chat-hf",
    messages=[{"role": "user", "content": "Say this is a test"}],
    stream=False,
)
print(response.choices[0].message)
Alternatively, with the requests library:
import requests
payload = {"model": "meta-llama/Llama-2-7b-chat-hf", "messages": [{"role": "user", "content": "Say this is a test"}]}
headers = {"Content-Type": "application/json", "Authorization": "not used"}
response = requests.post("http://localhost:8000/v3/chat/completions", json=payload, headers=headers)
print(response.text)
Or with a curl command:
curl http://localhost:8000/v3/chat/completions \
-H "Content-Type: application/json" \
-d '{"model": "meta-llama/Llama-2-7b-chat-hf", "messages": [ {"role": "user","content": "Say this is a test" }]}'
Request chat completions with unary calls (with image input)#
import base64
from openai import OpenAI

def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

image_path = "/path/to/image"
image = encode_image(image_path)

client = OpenAI(base_url="http://localhost:8000/v3", api_key="unused")
response = client.chat.completions.create(
    model="openbmb/MiniCPM-V-2_6",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What is in this image?",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{image}"},
                },
            ],
        }
    ],
    stream=False,
)
print(response.choices[0].message)
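The same request can also be sent without the OpenAI client. Below is a minimal sketch using the requests library; it assumes the same /v3/chat/completions endpoint and the standard OpenAI payload shape for image content parts:
import base64
import requests

# Encode the image as base64 for a data URL (path is a placeholder)
with open("/path/to/image", "rb") as image_file:
    image = base64.b64encode(image_file.read()).decode("utf-8")

payload = {
    "model": "openbmb/MiniCPM-V-2_6",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this image?"},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}"}},
            ],
        }
    ],
}
headers = {"Content-Type": "application/json", "Authorization": "not used"}
response = requests.post("http://localhost:8000/v3/chat/completions", json=payload, headers=headers)
print(response.text)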
Check the LLM quick start and the end-to-end demo of text generation.
Request completions with unary calls#
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v3", api_key="unused")
response = client.completions.create(
    model="meta-llama/Llama-2-7b",
    prompt="Say this is a test",
    stream=False,
)
print(response.choices[0].text)
Alternatively, with the requests library:
import requests
payload = {"model": "meta-llama/Llama-2-7b", "prompt": "Say this is a test"}
headers = {"Content-Type": "application/json", "Authorization": "not used"}
response = requests.post("http://localhost:8000/v3/completions", json=payload, headers=headers)
print(response.text)
Or with a curl command:
curl http://localhost:8000/v3/completions \
-H "Content-Type: application/json" \
-d '{"model": "meta-llama/Llama-2-7b", "prompt": "Say this is a test"}'
Check the LLM quick start and the end-to-end demo of text generation.
Request chat completions with streaming#
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v3",
    api_key="unused"
)

stream = client.chat.completions.create(
    model="meta-llama/Llama-2-7b-chat-hf",
    messages=[{"role": "user", "content": "Say this is a test"}],
    stream=True,
)
for chunk in stream:
    if chunk.choices[0].delta.content is not None:
        print(chunk.choices[0].delta.content, end="")
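Streaming is not limited to the Python client. As a hedged sketch, adding "stream": true to the payload should make the server return the response as OpenAI-style server-sent event chunks; the -N flag keeps curl from buffering the output:
curl -N http://localhost:8000/v3/chat/completions \
-H "Content-Type: application/json" \
-d '{"model": "meta-llama/Llama-2-7b-chat-hf", "stream": true, "messages": [{"role": "user", "content": "Say this is a test"}]}'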
Request chat completions with streaming (with image input)#
import base64
from openai import OpenAI

def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

image_path = "/path/to/image"
image = encode_image(image_path)

client = OpenAI(base_url="http://localhost:8000/v3", api_key="unused")
stream = client.chat.completions.create(
    model="openbmb/MiniCPM-V-2_6",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What is in this image?",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{image}"},
                },
            ],
        }
    ],
    stream=True,
)
for chunk in stream:
    if chunk.choices[0].delta.content is not None:
        print(chunk.choices[0].delta.content, end="")
Check the LLM quick start and the end-to-end demo of text generation.
Request completions with streaming#
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v3",
    api_key="unused"
)

stream = client.completions.create(
    model="meta-llama/Llama-2-7b",
    prompt="Say this is a test",
    stream=True,
)
for chunk in stream:
    if chunk.choices[0].text is not None:
        print(chunk.choices[0].text, end="")
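As with chat completions, a hedged curl sketch for streaming, assuming the server emits OpenAI-style SSE chunks when "stream": true is set:
curl -N http://localhost:8000/v3/completions \
-H "Content-Type: application/json" \
-d '{"model": "meta-llama/Llama-2-7b", "prompt": "Say this is a test", "stream": true}'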
Check the LLM quick start and the end-to-end demo of text generation.
Text embeddings#
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v3",
    api_key="unused"
)
responses = client.embeddings.create(input=["hello world"], model="Alibaba-NLP/gte-large-en-v1.5")
for data in responses.data:
    print(data.embedding)
Alternatively, with the requests library:
import requests
payload = {"model": "Alibaba-NLP/gte-large-en-v1.5", "input": "hello world"}
headers = {"Content-Type": "application/json", "Authorization": "not used"}
response = requests.post("http://localhost:8000/v3/embeddings", json=payload, headers=headers)
print(response.text)
Or with a curl command:
curl http://localhost:8000/v3/embeddings \
-H "Content-Type: application/json" \
-d '{"model": "Alibaba-NLP/gte-large-en-v1.5", "input": "hello world"}'
Image generation#
Install the pillow package to be able to save images to disk:
pip3 install pillow
from openai import OpenAI
import base64
from io import BytesIO
from PIL import Image

client = OpenAI(
    base_url="http://localhost:8000/v3",
    api_key="unused"
)

response = client.images.generate(
    model="OpenVINO/FLUX.1-schnell-int4-ov",
    prompt="three cute cats sitting on a bench",
    size="512x512",
    extra_body={
        "rng_seed": 45,
        "num_inference_steps": 3
    }
)
base64_image = response.data[0].b64_json
image_data = base64.b64decode(base64_image)
image = Image.open(BytesIO(image_data))
image.save("output.png")
Or with a curl command, decoding the returned base64 image with jq and base64:
curl http://localhost:8000/v3/images/generations \
-H "Content-Type: application/json" \
-d '{
"model": "OpenVINO/FLUX.1-schnell-int4-ov",
"prompt": "three cute cats sitting on a bench",
"num_inference_steps": 3,
"size": "512x512"
}' | jq -r '.data[0].b64_json' | base64 --decode > output.png
Or with PowerShell:
$response = Invoke-WebRequest -Uri "http://localhost:8000/v3/images/generations" `
-Method POST `
-Headers @{ "Content-Type" = "application/json" } `
-Body '{"model": "OpenVINO/FLUX.1-schnell-int4-ov", "prompt": "three cute cats sitting on a bench", "num_inference_steps": 3, "size": "512x512"}'
$base64 = ($response.Content | ConvertFrom-Json).data[0].b64_json
[IO.File]::WriteAllBytes('output.png', [Convert]::FromBase64String($base64))
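For parity with the other sections, here is a minimal requests sketch; it assumes the same /v3/images/generations endpoint and response shape as the examples above:
import base64
import requests

payload = {
    "model": "OpenVINO/FLUX.1-schnell-int4-ov",
    "prompt": "three cute cats sitting on a bench",
    "num_inference_steps": 3,
    "size": "512x512",
}
headers = {"Content-Type": "application/json"}
response = requests.post("http://localhost:8000/v3/images/generations", json=payload, headers=headers)
# The image comes back base64-encoded in data[0].b64_json
b64 = response.json()["data"][0]["b64_json"]
with open("output.png", "wb") as f:
    f.write(base64.b64decode(b64))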
Cohere Python Client#
Clients can use the rerank endpoint via the Cohere Python package - cohere. Just like with the OpenAI endpoints, an alternative is to use a curl command or the requests Python library.
Install the Package#
pip3 install cohere
pip3 install requests
Documents reranking#
import cohere

client = cohere.Client(base_url="http://localhost:8000/v3", api_key="not_used")
responses = client.rerank(query="Hello", documents=["Welcome", "Farewell"], model="BAAI/bge-reranker-large")
for res in responses.results:
    print(res.index, res.relevance_score)
Alternatively, with the requests library:
import requests
payload = {"model": "BAAI/bge-reranker-large", "query": "Hello", "documents": ["Welcome", "Farewell"]}
headers = {"Content-Type": "application/json", "Authorization": "not used"}
response = requests.post("http://localhost:8000/v3/rerank", json=payload, headers=headers)
print(response.text)
Or with a curl command:
curl http://localhost:8000/v3/rerank \
-H "Content-Type: application/json" \
-d '{"model": "BAAI/bge-reranker-large", "query": "Hello", "documents":["Welcome","Farewell"]}'