Multi-Modal Image Query
This example demonstrates how to use the AgentOpera framework together with OpenAI GPT-4o to send a multi-modal message (image and text) and receive a multi-modal response from the AI assistant.
from io import BytesIO
import os
import asyncio
import PIL
import requests
from agentopera.chatflow.media import PilImage
from agentopera.models.openai import OpenAIChatCompletionClient
from agentopera.chatflow.agents import AssistantAgent
from agentopera.chatflow.messages import MultiModalMessage
from agentopera.engine.types.agent import CancellationToken

1️⃣ Initialize the OpenAI GPT-4o Model Agent
from dotenv import load_dotenv
import os
load_dotenv()
True
model_client = OpenAIChatCompletionClient(
base_url=os.getenv("MODEL_BASE_URL"),
api_key=os.getenv("MODEL_API_KEY"),
model=os.getenv("MODEL_ID")
)
agent = AssistantAgent(
name="assistant",
model_client=model_client,
)

2️⃣ Create a multimodal message and send it to the AI assistant
3️⃣ Execute the multimodal query
Last updated