Multi-Modal Image Query

This example demonstrates how to use the AgentOpera framework together with OpenAI GPT-4o to send a multi-modal message (an image plus text) to the AI assistant and receive a text response describing the image.

from io import BytesIO
import os
import asyncio
import PIL
import requests

from agentopera.chatflow.media import PilImage
from agentopera.models.openai import OpenAIChatCompletionClient
from agentopera.chatflow.agents import AssistantAgent
from agentopera.chatflow.messages import MultiModalMessage
from agentopera.engine.types.agent import CancellationToken

1️⃣ Initialize the OpenAI GPT-4o Model Agent

# Load settings from a local .env file into the process environment so the
# os.getenv() lookups for the model configuration can find them.
import os

from dotenv import load_dotenv

load_dotenv()

True

# Build the chat-completion client from environment settings; any variable
# that is unset is passed through as None.
model_client = OpenAIChatCompletionClient(
    model=os.environ.get("MODEL_ID"),
    api_key=os.environ.get("MODEL_API_KEY"),
    base_url=os.environ.get("MODEL_BASE_URL"),
)

# Wrap the client in an assistant agent named "assistant".
agent = AssistantAgent(model_client=model_client, name="assistant")

2️⃣ Create a multimodal message and send it to the AI assistant

async def assistant_run_multi_modal():
    """Fetch a sample image, ask the assistant to describe it, and print the reply.

    Downloads an image from picsum.photos, wraps it in a ``PilImage``, sends
    it together with a text question as a single ``MultiModalMessage`` to the
    module-level ``agent``, and prints ``response.chat_message``.

    Raises:
        requests.HTTPError: If the image download returns an error status.
        requests.Timeout: If the image host does not answer in time.
    """
    # ``import PIL`` alone does not load the ``PIL.Image`` submodule, so import
    # it explicitly instead of relying on another package having loaded it.
    from PIL import Image

    # Bound the download and fail loudly on an HTTP error, so an error page is
    # never handed to the image decoder.
    # NOTE: requests is blocking; acceptable for this one-shot demo.
    http_response = requests.get("https://picsum.photos/300/200", timeout=30)
    http_response.raise_for_status()

    pil_image = Image.open(BytesIO(http_response.content))
    img = PilImage(pil_image)

    # A multi-modal message carries a mixed list of text and image parts.
    multi_modal_message = MultiModalMessage(
        content=["Can you describe the content of this image?", img],
        source="user",
    )

    response = await agent.on_messages(
        [multi_modal_message],
        cancellation_token=CancellationToken(),
    )
    print(response.chat_message)

3️⃣ Execute the multimodal query

# Top-level ``await`` is valid only where an event loop is already running
# (e.g. a Jupyter/IPython cell); in a plain script, call
# ``asyncio.run(assistant_run_multi_modal())`` instead.
await assistant_run_multi_modal()

source='assistant' models_usage=RequestUsage(prompt_tokens=8545, completion_tokens=99) metadata={} content='The image depicts a mountainous landscape, where the peak of the mountain is covered in snow, indicating a cold or wintery environment. Below the peak, there are slopes that transition into a forested area, primarily consisting of evergreen trees, suggesting a diverse alpine ecosystem. The sky appears to be overcast, with clouds partially covering it, enhancing the dramatic effect of the snow-capped mountain. Overall, the scene conveys a sense of natural beauty and tranquility associated with mountainous terrains. \n\nTERMINATE' context=[] type='TextMessage'

Previous: Deployment | Next: Tool Invocation

Last updated 8 months ago