Multi-Modal Image Query

This example demonstrates how to use the AgentOpera framework together with OpenAI GPT-4o to send a multi-modal message (image and text) and receive a multi-modal response from the AI assistant.

from io import BytesIO
import os
import asyncio
from PIL import Image
import requests

from agentopera.chatflow.media import PilImage
from agentopera.models.openai import OpenAIChatCompletionClient
from agentopera.chatflow.agents import AssistantAgent
from agentopera.chatflow.messages import MultiModalMessage
from agentopera.engine.types.agent import CancellationToken

1️⃣ Initialize the OpenAI GPT-4o Model Agent

from dotenv import load_dotenv
load_dotenv()
model_client = OpenAIChatCompletionClient(
    base_url=os.getenv("MODEL_BASE_URL"),
    api_key=os.getenv("MODEL_API_KEY"),
    model=os.getenv("MODEL_ID")
)

agent = AssistantAgent(
    name="assistant",
    model_client=model_client,
)

2️⃣ Create a multimodal message and send it to the AI assistant
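
The message pairs a text prompt with an image fetched over HTTP and decoded in memory via BytesIO, so nothing is written to disk. The sketch below assumes a placeholder image URL (https://picsum.photos/300/200), that PilImage wraps a Pillow image object, and that MultiModalMessage accepts content and source keyword arguments; adjust these to match your AgentOpera version.

# Download a sample image and decode it into a Pillow image in memory.
# The picsum.photos URL is only a placeholder; use any image source you like.
response = requests.get("https://picsum.photos/300/200")
pil_image = Image.open(BytesIO(response.content))

# Wrap the Pillow image in the framework's image type
# (PilImage is assumed to accept a PIL image directly).
img = PilImage(pil_image)

# Combine the text prompt and the image into a single multi-modal message.
multi_modal_message = MultiModalMessage(
    content=["Can you describe the content of this image?", img],
    source="user",
)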

3️⃣ Execute the multimodal query
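
Finally, send the message to the assistant and print its reply. This is a minimal sketch that assumes AssistantAgent exposes an async on_messages method taking a list of messages and a CancellationToken, and that the returned object carries the reply on chat_message; check the AgentOpera API if the names differ.

async def main() -> None:
    # Send the multi-modal message to the assistant.
    # on_messages and response.chat_message are assumed from the
    # AssistantAgent interface; adapt if the AgentOpera API differs.
    response = await agent.on_messages([multi_modal_message], CancellationToken())
    print(response.chat_message.content)

asyncio.run(main())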
