From ba1ae1bb49c285fb0b160d65fe196b91d17112d4 Mon Sep 17 00:00:00 2001 From: Junrui Zhang Date: Fri, 5 Jun 2026 14:14:18 +0800 Subject: [PATCH] Preserve VLM multipart content order --- src/llm/apis/openai_completions.cpp | 29 ++++++-- .../continuous_batching/servable.cpp | 45 +++++++----- .../visual_language_model/legacy/servable.cpp | 44 +++++++----- src/test/http_openai_handler_test.cpp | 69 ++++++++++++++----- 4 files changed, 129 insertions(+), 58 deletions(-) diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index 2fd5e12005..572bbfcf6b 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -46,6 +47,10 @@ using namespace rapidjson; namespace ovms { +static bool containsReservedImageTag(std::string_view text) { + return text.find("name.GetString(); if (member->value.IsString() && (memberName == "role" || memberName == "content")) { + if (memberName == "content" && containsReservedImageTag({member->value.GetString(), member->value.GetStringLength()})) { + return absl::InvalidArgumentError("Message contains restricted tag"); + } // Add new field to the last message in history request.chatHistory.last()[memberName] = member->value.GetString(); continue; @@ -201,7 +209,8 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optionalvalue.GetArray()) { if (!v.IsObject()) { return absl::InvalidArgumentError("Invalid message structure - content array should contain objects"); @@ -215,10 +224,14 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optional tag"); + } + if (previousPartWasText) { + contentWithImageTags += "\n"; } - combinedText.append(entry["text"].GetString(), entry["text"].GetStringLength()); + contentWithImageTags.append(entry["text"].GetString(), entry["text"].GetStringLength()); + previousPartWasText = true; continue; } else if (entryType == std::string("image_url")) { if (!entry.HasMember("image_url") || !entry["image_url"].IsObject()) { @@ -233,15 +246,17 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optional\n"; request.imageHistory.push_back({i, tensorResult.value()}); + previousPartWasText = false; } else { return absl::InvalidArgumentError("Unsupported content type"); } } - // Flatten all text parts (joined with newlines) into the "content" field. - // Images are stored separately in request.imageHistory. + // Preserve multipart content order by replacing image parts with + // the placeholders consumed by the VLM pipeline. Value contentText(rapidjson::kStringType); - contentText.SetString(combinedText.c_str(), combinedText.length(), doc.GetAllocator()); + contentText.SetString(contentWithImageTags.c_str(), contentWithImageTags.length(), doc.GetAllocator()); member->value = contentText; // Add new field to the last message in history if content is text if (member->value.IsString()) { diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp index 13b7e73a62..379adbe643 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.cpp +++ b/src/llm/visual_language_model/continuous_batching/servable.cpp @@ -71,27 +71,38 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptrendpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) { ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory(); + const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory(); - for (size_t i = 0; i < chatHistory.size(); i++) { - const auto& message = chatHistory[i]; - if (message["content"].as_string().value_or("").find(" tag"); + // Responses parsing keeps images in imageHistory only; Chat Completions parsing + // already inserts placeholders in content to preserve multipart order. + if (executionContext->endpoint == Endpoint::RESPONSES) { + for (size_t i = 0; i < chatHistory.size(); i++) { + const auto& message = chatHistory[i]; + if (message["content"].as_string().value_or("").find(" tag"); + } } } - const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory(); - size_t imageIndex = 0; - std::unordered_map imageTags; - for (const auto& image : imageHistory) { - const auto& [chatTurnIndex, imageTensor] = image; - std::string imageTag = "\n"; - imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag; - vlmExecutionContext->inputImages.push_back(imageTensor); - } - - for (const auto& [chatTurnIndex, imageTagString] : imageTags) { - std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or(""); - chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent; + if (executionContext->endpoint == Endpoint::RESPONSES) { + size_t imageIndex = 0; + std::unordered_map imageTags; + for (const auto& image : imageHistory) { + const auto& [chatTurnIndex, imageTensor] = image; + std::string imageTag = "\n"; + imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag; + vlmExecutionContext->inputImages.push_back(imageTensor); + } + for (const auto& [chatTurnIndex, imageTagString] : imageTags) { + std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or(""); + chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent; + } + } else { + for (const auto& image : imageHistory) { + const auto& [chatTurnIndex, imageTensor] = image; + (void)chatTurnIndex; + vlmExecutionContext->inputImages.push_back(imageTensor); + } } constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index 033cb8641d..7ba09af59a 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -275,26 +275,38 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptrendpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) { ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory(); + const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory(); - for (size_t i = 0; i < chatHistory.size(); i++) { - const auto& message = chatHistory[i]; - if (message["content"].as_string().value_or("").find(" tag"); + // Responses parsing keeps images in imageHistory only; Chat Completions parsing + // already inserts placeholders in content to preserve multipart order. + if (executionContext->endpoint == Endpoint::RESPONSES) { + for (size_t i = 0; i < chatHistory.size(); i++) { + const auto& message = chatHistory[i]; + if (message["content"].as_string().value_or("").find(" tag"); + } } } - const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory(); - size_t imageIndex = 0; - std::unordered_map imageTags; - for (const auto& image : imageHistory) { - const auto& [chatTurnIndex, imageTensor] = image; - std::string imageTag = "\n"; - imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag; - vlmExecutionContext->inputImages.push_back(imageTensor); - } - for (const auto& [chatTurnIndex, imageTagString] : imageTags) { - std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or(""); - chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent; + if (executionContext->endpoint == Endpoint::RESPONSES) { + size_t imageIndex = 0; + std::unordered_map imageTags; + for (const auto& image : imageHistory) { + const auto& [chatTurnIndex, imageTensor] = image; + std::string imageTag = "\n"; + imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag; + vlmExecutionContext->inputImages.push_back(imageTensor); + } + for (const auto& [chatTurnIndex, imageTagString] : imageTags) { + std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or(""); + chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent; + } + } else { + for (const auto& image : imageHistory) { + const auto& [chatTurnIndex, imageTensor] = image; + (void)chatTurnIndex; + vlmExecutionContext->inputImages.push_back(imageTensor); + } } constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 46a31a3337..ed8d18a9ec 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -2249,7 +2249,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsBase64) { EXPECT_EQ(expectedBytes[i], ((uint8_t*)image.data())[i]); } json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\\n\"}]}")); } TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttp) { @@ -2286,7 +2286,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttp) { EXPECT_EQ(image.get_element_type(), ov::element::u8); EXPECT_EQ(image.get_size(), 225792); json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\\n\"}]}")); } TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttpMultipleAllowedDomains) { @@ -2323,7 +2323,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttpMultipleAllow EXPECT_EQ(image.get_element_type(), ov::element::u8); EXPECT_EQ(image.get_size(), 225792); json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\\n\"}]}")); } TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttps) { @@ -2360,7 +2360,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttps) { EXPECT_EQ(image.get_element_type(), ov::element::u8); EXPECT_EQ(image.get_size(), 225792); json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\\n\"}]}")); } TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttpsAllowedDomainAll) { @@ -2397,7 +2397,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttpsAllowedDomai EXPECT_EQ(image.get_element_type(), ov::element::u8); EXPECT_EQ(image.get_size(), 225792); json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\\n\"}]}")); } TEST_F(HttpOpenAIHandlerParsingTest, ParsingImageJpegWithNoTextSucceeds) { @@ -2432,7 +2432,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingImageJpegWithNoTextSucceeds) { EXPECT_EQ(expectedBytes[i], ((uint8_t*)image.data())[i]); } json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"\"}]}")); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"\\n\"}]}")); } TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageStringWithNoPrefixFails) { @@ -2607,7 +2607,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystem) { EXPECT_EQ(image.get_element_type(), ov::element::u8); EXPECT_EQ(image.get_size(), 3); json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\\n\"}]}")); } TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemWithinAllowedPath) { @@ -2643,7 +2643,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemWithinAl EXPECT_EQ(image.get_element_type(), ov::element::u8); EXPECT_EQ(image.get_size(), 3); json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\\n\"}]}")); } TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemWithinAllowedPathMixedSeparators) { @@ -3094,9 +3094,9 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMultipleMessagesSucceeds) { } } json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}," + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\\n\"}," "{\"role\":\"assistant\",\"content\":\"No idea my friend.\"}," - "{\"role\":\"user\",\"content\":\"What about this one?\"}," + "{\"role\":\"user\",\"content\":\"What about this one?\\n\"}," "{\"role\":\"assistant\",\"content\":\"Same thing. I'm not very good with images.\"}," "{\"role\":\"user\",\"content\":\"You were not trained with images, were you?\"}]}")); } @@ -3187,7 +3187,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesEmptyContentArrayFails) { EXPECT_EQ(apiHandler->parseMessages(), absl::InvalidArgumentError("Invalid message structure - content array is empty")); } -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesMultipleTextItemsConcatenatesWithNewline) { +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesMultipleTextItemsPreservesTextParts) { std::string json = R"({ "model": "llama", "messages": [ @@ -3210,15 +3210,13 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesMultipleTextItemsConcatenate ASSERT_FALSE(doc.HasParseError()); std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); ASSERT_EQ(apiHandler->parseMessages(), absl::OkStatus()); - // Non-Python path: chatHistory content is the concatenated string auto& chatHistory = apiHandler->getChatHistory(); ASSERT_EQ(chatHistory.size(), 1); EXPECT_EQ(chatHistory[0]["content"], "First part.\nSecond part."); - // Python Jinja path: processedJson carries the same flattened content for applyChatTemplate EXPECT_EQ(apiHandler->getProcessedJson(), R"({"model":"llama","messages":[{"role":"user","content":"First part.\nSecond part."}]})"); } -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesTextBeforeAndAfterImageConcatenatesAllText) { +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesTextBeforeAndAfterImagePreservesMultipartOrder) { std::string json = R"({ "model": "llama", "messages": [ @@ -3247,13 +3245,48 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesTextBeforeAndAfterImageConca ASSERT_FALSE(doc.HasParseError()); std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); ASSERT_EQ(apiHandler->parseMessages(), absl::OkStatus()); - // Non-Python path: chatHistory content is the concatenated string auto& chatHistory = apiHandler->getChatHistory(); ASSERT_EQ(chatHistory.size(), 1); - EXPECT_EQ(chatHistory[0]["content"], "Before image.\nAfter image."); + EXPECT_EQ(chatHistory[0]["content"], "Before image.\nAfter image."); EXPECT_EQ(apiHandler->getImageHistory().size(), 1); - // Python Jinja path: processedJson carries the same flattened content for applyChatTemplate - EXPECT_EQ(apiHandler->getProcessedJson(), R"({"model":"llama","messages":[{"role":"user","content":"Before image.\nAfter image."}]})"); + EXPECT_EQ(apiHandler->getProcessedJson(), R"({"model":"llama","messages":[{"role":"user","content":"Before image.\nAfter image."}]})"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesStringContentWithReservedImageTagFails) { + std::string json = R"({ + "model": "llama", + "messages": [ + { + "role": "user", + "content": "Please use " + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseMessages(), absl::InvalidArgumentError("Message contains restricted tag")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesTextPartWithReservedImageTagFails) { + std::string json = R"({ + "model": "llama", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Please use " + } + ] + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseMessages(), absl::InvalidArgumentError("Message contains restricted tag")); } TEST_F(HttpOpenAIHandlerParsingTest, maxTokensValueDefaultToMaxTokensLimit) {