Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 22 additions & 7 deletions src/llm/apis/openai_completions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <set>
#include <string>
#include <string.h>
#include <string_view>
#include <vector>

#include <openvino/genai/llm_pipeline.hpp>
Expand All @@ -46,6 +47,10 @@ using namespace rapidjson;

namespace ovms {

// Reports whether `text` contains the substring "<ov_genai_image_" anywhere.
// The search covers the full length of the view, so embedded NUL bytes do not
// truncate it.
static bool containsReservedImageTag(std::string_view text) {
    constexpr std::string_view reservedTagPrefix{"<ov_genai_image_"};
    const auto matchPosition = text.find(reservedTagPrefix);
    return matchPosition != std::string_view::npos;
}

static bool hasToolCallsInStreamingDelta(const rapidjson::Document& delta) {
if (!delta.HasMember("delta") || !delta["delta"].IsObject()) {
return false;
Expand Down Expand Up @@ -176,6 +181,9 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optional<std::stri
return absl::InvalidArgumentError("Invalid message structure");
std::string memberName = member->name.GetString();
if (member->value.IsString() && (memberName == "role" || memberName == "content")) {
if (memberName == "content" && containsReservedImageTag({member->value.GetString(), member->value.GetStringLength()})) {
return absl::InvalidArgumentError("Message contains restricted <ov_genai_image> tag");
}
// Add new field to the last message in history
request.chatHistory.last()[memberName] = member->value.GetString();
continue;
Expand All @@ -201,7 +209,8 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optional<std::stri
return absl::InvalidArgumentError("Invalid message structure - content array is empty");
}
jsonChanged = true;
std::string combinedText;
std::string contentWithImageTags;
bool previousPartWasText = false;
for (auto& v : member->value.GetArray()) {
if (!v.IsObject()) {
return absl::InvalidArgumentError("Invalid message structure - content array should contain objects");
Expand All @@ -215,10 +224,14 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optional<std::stri
if (!entry.HasMember("text") || !entry["text"].IsString()) {
return absl::InvalidArgumentError("Invalid message structure - content text missing");
}
if (!combinedText.empty()) {
combinedText += "\n";
if (containsReservedImageTag({entry["text"].GetString(), entry["text"].GetStringLength()})) {
return absl::InvalidArgumentError("Message contains restricted <ov_genai_image> tag");
}
if (previousPartWasText) {
contentWithImageTags += "\n";
}
combinedText.append(entry["text"].GetString(), entry["text"].GetStringLength());
contentWithImageTags.append(entry["text"].GetString(), entry["text"].GetStringLength());
previousPartWasText = true;
continue;
} else if (entryType == std::string("image_url")) {
if (!entry.HasMember("image_url") || !entry["image_url"].IsObject()) {
Expand All @@ -233,15 +246,17 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optional<std::stri
if (!tensorResult.ok()) {
return tensorResult.status();
}
contentWithImageTags += "<ov_genai_image_" + std::to_string(request.imageHistory.size()) + ">\n";
request.imageHistory.push_back({i, tensorResult.value()});
previousPartWasText = false;
} else {
return absl::InvalidArgumentError("Unsupported content type");
}
}
// Flatten all text parts (joined with newlines) into the "content" field.
// Images are stored separately in request.imageHistory.
// Preserve multipart content order by replacing image parts with
// the placeholders consumed by the VLM pipeline.
Value contentText(rapidjson::kStringType);
contentText.SetString(combinedText.c_str(), combinedText.length(), doc.GetAllocator());
contentText.SetString(contentWithImageTags.c_str(), contentWithImageTags.length(), doc.GetAllocator());
member->value = contentText;
// Add new field to the last message in history if content is text
if (member->value.IsString()) {
Expand Down
45 changes: 28 additions & 17 deletions src/llm/visual_language_model/continuous_batching/servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,27 +71,38 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr<GenAiSer
}
if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) {
ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory();
const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory();

for (size_t i = 0; i < chatHistory.size(); i++) {
const auto& message = chatHistory[i];
if (message["content"].as_string().value_or("").find("<ov_genai_image_") != std::string::npos) {
return absl::InvalidArgumentError("Message contains restricted <ov_genai_image> tag");
// Responses parsing keeps images in imageHistory only; Chat Completions parsing
// already inserts placeholders in content to preserve multipart order.
if (executionContext->endpoint == Endpoint::RESPONSES) {
for (size_t i = 0; i < chatHistory.size(); i++) {
const auto& message = chatHistory[i];
if (message["content"].as_string().value_or("").find("<ov_genai_image_") != std::string::npos) {
return absl::InvalidArgumentError("Message contains restricted <ov_genai_image> tag");
}
}
}

const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory();
size_t imageIndex = 0;
std::unordered_map<size_t, std::string> imageTags;
for (const auto& image : imageHistory) {
const auto& [chatTurnIndex, imageTensor] = image;
std::string imageTag = "<ov_genai_image_" + std::to_string(imageIndex++) + ">\n";
imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag;
vlmExecutionContext->inputImages.push_back(imageTensor);
}

for (const auto& [chatTurnIndex, imageTagString] : imageTags) {
std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or("");
chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent;
if (executionContext->endpoint == Endpoint::RESPONSES) {
size_t imageIndex = 0;
std::unordered_map<size_t, std::string> imageTags;
for (const auto& image : imageHistory) {
const auto& [chatTurnIndex, imageTensor] = image;
std::string imageTag = "<ov_genai_image_" + std::to_string(imageIndex++) + ">\n";
imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag;
vlmExecutionContext->inputImages.push_back(imageTensor);
}
for (const auto& [chatTurnIndex, imageTagString] : imageTags) {
std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or("");
chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent;
}
} else {
for (const auto& image : imageHistory) {
const auto& [chatTurnIndex, imageTensor] = image;
(void)chatTurnIndex;
vlmExecutionContext->inputImages.push_back(imageTensor);
}
}
Comment on lines +78 to 106

constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded
Expand Down
44 changes: 28 additions & 16 deletions src/llm/visual_language_model/legacy/servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -275,26 +275,38 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr<Ge
}
if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) {
ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory();
const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory();

for (size_t i = 0; i < chatHistory.size(); i++) {
const auto& message = chatHistory[i];
if (message["content"].as_string().value_or("").find("<ov_genai_image_") != std::string::npos) {
return absl::InvalidArgumentError("Message contains restricted <ov_genai_image> tag");
// Responses parsing keeps images in imageHistory only; Chat Completions parsing
// already inserts placeholders in content to preserve multipart order.
if (executionContext->endpoint == Endpoint::RESPONSES) {
for (size_t i = 0; i < chatHistory.size(); i++) {
const auto& message = chatHistory[i];
if (message["content"].as_string().value_or("").find("<ov_genai_image_") != std::string::npos) {
return absl::InvalidArgumentError("Message contains restricted <ov_genai_image> tag");
}
}
}

const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory();
size_t imageIndex = 0;
std::unordered_map<size_t, std::string> imageTags;
for (const auto& image : imageHistory) {
const auto& [chatTurnIndex, imageTensor] = image;
std::string imageTag = "<ov_genai_image_" + std::to_string(imageIndex++) + ">\n";
imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag;
vlmExecutionContext->inputImages.push_back(imageTensor);
}
for (const auto& [chatTurnIndex, imageTagString] : imageTags) {
std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or("");
chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent;
if (executionContext->endpoint == Endpoint::RESPONSES) {
size_t imageIndex = 0;
std::unordered_map<size_t, std::string> imageTags;
for (const auto& image : imageHistory) {
const auto& [chatTurnIndex, imageTensor] = image;
std::string imageTag = "<ov_genai_image_" + std::to_string(imageIndex++) + ">\n";
imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag;
vlmExecutionContext->inputImages.push_back(imageTensor);
}
for (const auto& [chatTurnIndex, imageTagString] : imageTags) {
std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or("");
chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent;
}
} else {
for (const auto& image : imageHistory) {
const auto& [chatTurnIndex, imageTensor] = image;
(void)chatTurnIndex;
vlmExecutionContext->inputImages.push_back(imageTensor);
}
}
Comment on lines +282 to 310

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the review. I checked the parser ownership here: Endpoint::RESPONSES does not go through OpenAIChatCompletionsHandler::parseMessages(). It is parsed by openai_responses.cpp, where image content is still stored only in imageHistory and placeholders are not injected into message content. So the Responses servable path should keep owning placeholder insertion. I added comments in both VLM servables to make this contract explicit.


constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded
Expand Down
Loading