Multi-Modal Image Query
This example demonstrates how to use the AgentOpera framework together with OpenAI GPT-4o to send a multi-modal message (an image plus text) to the AI assistant and receive a text response describing the image.
from io import BytesIO
import os
import asyncio
import PIL
import requests
from agentopera.chatflow.media import PilImage
from agentopera.models.openai import OpenAIChatCompletionClient
from agentopera.chatflow.agents import AssistantAgent
from agentopera.chatflow.messages import MultiModalMessage
from agentopera.engine.types.agent import CancellationToken
1️⃣ Initialize the OpenAI GPT-4o Model Agent
from dotenv import load_dotenv
import os
load_dotenv()
True
# Connection settings are read from the environment (populated by load_dotenv()).
client_settings = {
    "base_url": os.getenv("MODEL_BASE_URL"),
    "api_key": os.getenv("MODEL_API_KEY"),
    "model": os.getenv("MODEL_ID"),
}
model_client = OpenAIChatCompletionClient(**client_settings)

# The assistant agent that will answer the multi-modal query below.
agent = AssistantAgent(name="assistant", model_client=model_client)
2️⃣ Create a multimodal message and send it to the AI assistant
async def assistant_run_multi_modal():
    """Download a sample image, ask the assistant to describe it, and print the reply.

    Fetches an image from picsum.photos, wraps it in a ``PilImage``, bundles it
    with a text question into a ``MultiModalMessage`` and sends that message to
    the module-level ``agent``. The resulting ``response.chat_message`` is
    printed; nothing is returned.

    Raises:
        requests.RequestException: If the image download times out or the
            server answers with an HTTP error status.
    """
    # ``import PIL`` alone does not load the ``PIL.Image`` submodule; import it
    # explicitly so this function does not depend on another module having
    # loaded it first.
    from PIL import Image

    # Bound the download time and fail early on an HTTP error instead of
    # handing an error page to the image decoder.
    http_response = requests.get("https://picsum.photos/300/200", timeout=30)
    http_response.raise_for_status()

    pil_image = Image.open(BytesIO(http_response.content))
    img = PilImage(pil_image)

    multi_modal_message = MultiModalMessage(
        content=["Can you describe the content of this image?", img],
        source="user",
    )
    response = await agent.on_messages(
        [multi_modal_message],
        cancellation_token=CancellationToken(),
    )
    print(response.chat_message)
3️⃣ Execute the multimodal query
# A bare top-level ``await`` only works in an async-aware REPL such as Jupyter/IPython.
# In a plain Python script, use ``asyncio.run(assistant_run_multi_modal())`` instead.
await assistant_run_multi_modal()
source='assistant' models_usage=RequestUsage(prompt_tokens=8545, completion_tokens=99) metadata={} content='The image depicts a mountainous landscape, where the peak of the mountain is covered in snow, indicating a cold or wintery environment. Below the peak, there are slopes that transition into a forested area, primarily consisting of evergreen trees, suggesting a diverse alpine ecosystem. The sky appears to be overcast, with clouds partially covering it, enhancing the dramatic effect of the snow-capped mountain. Overall, the scene conveys a sense of natural beauty and tranquility associated with mountainous terrains. \n\nTERMINATE' context=[] type='TextMessage'
Last updated