diff --git a/src/capi_frontend/server_settings.hpp b/src/capi_frontend/server_settings.hpp index 1cfb9919ec..4b8f851184 100644 --- a/src/capi_frontend/server_settings.hpp +++ b/src/capi_frontend/server_settings.hpp @@ -226,6 +226,10 @@ struct ServerSettingsImpl { std::optional> allowedMediaDomains; std::string logLevel = "INFO"; std::string logPath; + // When enabled, LLM chat/text completion unary responses include an extra + // "__verbose" object with the raw model input (after chat template) and the + // raw model output (before tool/reasoning parsing). Inspired by llama.cpp -v. + bool verboseResponse = false; bool allowCredentials = false; std::string allowedOrigins{"*"}; std::string allowedMethods{"*"}; diff --git a/src/cli_parser.cpp b/src/cli_parser.cpp index e6f6365c26..6eb770d1b6 100644 --- a/src/cli_parser.cpp +++ b/src/cli_parser.cpp @@ -105,6 +105,12 @@ std::variant> CLIParser::parse(int argc, char* ("log_path", "Optional path to the log file", cxxopts::value(), "LOG_PATH") + ("verbose_response", + "When enabled, LLM chat/text completion unary responses include an extra " + "\"__verbose\" object with the raw prompt (after chat template) and the raw " + "model output (before tool/reasoning parsing). 
Useful for debugging.", + cxxopts::value()->default_value("false"), + "VERBOSE_RESPONSE") #ifdef MTR_ENABLED ("trace_path", "Path to the trace file", @@ -522,6 +528,8 @@ void CLIParser::prepareServer(ServerSettingsImpl& serverSettings) { serverSettings.logLevel = result->operator[]("log_level").as(); if (result->count("log_path")) serverSettings.logPath = result->operator[]("log_path").as(); + if (result->count("verbose_response")) + serverSettings.verboseResponse = result->operator[]("verbose_response").as(); if (result->count("grpc_channel_arguments")) serverSettings.grpcChannelArguments = result->operator[]("grpc_channel_arguments").as(); diff --git a/src/llm/apis/openai_api_handler.hpp b/src/llm/apis/openai_api_handler.hpp index 7c56bcbf95..8c1dd0357a 100644 --- a/src/llm/apis/openai_api_handler.hpp +++ b/src/llm/apis/openai_api_handler.hpp @@ -99,6 +99,16 @@ class OpenAIApiHandler { // Output parser is used to parse chat completions response to extract specific fields like tool calls and reasoning. std::unique_ptr outputParser = nullptr; + // Verbose response support (enabled via --verbose_response). When set, the + // serialized unary response includes a "__verbose" object with the raw prompt + // (post chat template) and raw decoded model output (before tool/reasoning + // parsing). Inspired by llama.cpp -v. + bool verboseResponse = false; + std::string verbosePrompt; + // Streaming accumulators for raw model output. 
+    std::vector<int64_t> verboseRawTokens;
+    std::string verboseRawText;
+
     // Shared parsing helpers
     absl::Status parseCommonPart(std::optional maxTokensLimit, uint32_t bestOfLimit, std::optional maxModelLength);
     absl::Status parseResponseFormat();
@@ -156,6 +166,35 @@ class OpenAIApiHandler {
     std::string getToolChoice() const;
     const std::unique_ptr& getOutputParser() const;
 
+    // Verbose response configuration.
+    // Switches verbose mode on and stores the prompt (after chat template) that
+    // the serializers emit as "__verbose.prompt". The argument is taken by value
+    // and moved into place, so a caller that is done with its copy can pass
+    // std::move(text) and avoid duplicating a potentially large prompt.
+    void enableVerboseResponse(std::string promptAfterTemplate) {
+        verboseResponse = true;
+        verbosePrompt = std::move(promptAfterTemplate);
+    }
+    // Reports whether enableVerboseResponse() has been called on this handler.
+    bool isVerboseResponse() const { return verboseResponse; }
+    // Prompt stored by enableVerboseResponse(); empty if it was never called.
+    const std::string& getVerbosePrompt() const { return verbosePrompt; }
+    // Accumulators used to assemble the "raw model output" for streaming responses.
+    // Appends generated token ids; they are decoded when the final chunk is built.
+    void appendVerboseRawTokens(const std::vector<int64_t>& tokens) {
+        verboseRawTokens.insert(verboseRawTokens.end(), tokens.begin(), tokens.end());
+    }
+    // Appends already-decoded text, used where only streamed text chunks are available.
+    void appendVerboseRawText(const std::string& chunk) {
+        verboseRawText.append(chunk);
+    }
+    // Replaces the accumulated raw text in one step.
+    void setVerboseRawText(std::string text) {
+        verboseRawText = std::move(text);
+    }
+    const std::vector<int64_t>& getVerboseRawTokens() const { return verboseRawTokens; }
+    const std::string& getVerboseRawText() const { return verboseRawText; }
+
     // Usage tracking
     void setPromptTokensUsage(size_t promptTokens);
     void setCompletionTokensUsage(size_t completionTokens);
diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp
index 2fd5e12005..89009c0d74 100644
--- a/src/llm/apis/openai_completions.cpp
+++ b/src/llm/apis/openai_completions.cpp
@@ -394,6 +394,17 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vect
     // TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with.
     // Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism.
+ if (isVerboseResponse()) { + jsonResponse.StartObject("__verbose"); + jsonResponse.String("prompt", getVerbosePrompt()); + std::string rawContent; + if (!generationOutputs.empty()) { + rawContent = tokenizer.decode(generationOutputs.front().generated_ids, ov::genai::skip_special_tokens(false)); + } + jsonResponse.String("content", rawContent); + jsonResponse.EndObject(); + } + // finish response object jsonResponse.EndObject(); return jsonResponse.ToString(); @@ -458,6 +469,17 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco // TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. // Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism. + if (isVerboseResponse()) { + jsonResponse.StartObject("__verbose"); + jsonResponse.String("prompt", getVerbosePrompt()); + std::string rawContent; + if (!results.tokens.empty()) { + rawContent = tokenizer.decode(results.tokens.front(), ov::genai::skip_special_tokens(false)); + } + jsonResponse.String("content", rawContent); + jsonResponse.EndObject(); + } + // finish response object jsonResponse.EndObject(); return jsonResponse.ToString(); @@ -528,6 +550,14 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD // TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. // Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism. + if (isVerboseResponse()) { + jsonResponse.StartObject("__verbose"); + jsonResponse.String("prompt", getVerbosePrompt()); + // For VLM the raw decoded text is provided by GenAI directly. 
+ jsonResponse.String("content", textResponse); + jsonResponse.EndObject(); + } + // finish response object jsonResponse.EndObject(); return jsonResponse.ToString(); @@ -622,6 +652,21 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str // TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. // Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism. + // Verbose mode: attach prompt and raw model output to the FINAL chunk only. + if (isVerboseResponse() && finishReason != ov::genai::GenerationFinishReason::NONE) { + std::string rawOutput; + if (!getVerboseRawTokens().empty()) { + rawOutput = tokenizer.decode(getVerboseRawTokens(), ov::genai::skip_special_tokens(false)); + } else { + rawOutput = getVerboseRawText(); + } + + Value verboseObject(kObjectType); + verboseObject.AddMember("prompt", Value(getVerbosePrompt().c_str(), allocator), allocator); + verboseObject.AddMember("content", Value(rawOutput.c_str(), allocator), allocator); + doc.AddMember("__verbose", verboseObject, allocator); + } + StringBuffer buffer; Writer writer(buffer); doc.Accept(writer); diff --git a/src/llm/language_model/legacy/servable.cpp b/src/llm/language_model/legacy/servable.cpp index 8e244df219..7dffb22afe 100644 --- a/src/llm/language_model/legacy/servable.cpp +++ b/src/llm/language_model/legacy/servable.cpp @@ -234,6 +234,9 @@ absl::Status LegacyServable::preparePartialResponse(std::shared_ptrresults.finish_reasons.empty() ? 
ov::genai::GenerationFinishReason::STOP : legacyExecutionContext->results.finish_reasons[0]; + if (executionContext->apiHandler->isVerboseResponse() && !legacyExecutionContext->results.tokens.empty()) { + executionContext->apiHandler->appendVerboseRawTokens(legacyExecutionContext->results.tokens[0]); + } std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason); if (!serializedChunk.empty()) { executionContext->response = wrapTextInServerSideEventMessage(serializedChunk); diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index cb8d87fcee..b4d5ca8185 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -269,6 +269,9 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptrapiHandler->enableVerboseResponse(inputText); + } bool encodeAddSpecialTokens = (executionContext->endpoint == Endpoint::COMPLETIONS); executionContext->inputIds = getProperties()->tokenizer.encode(inputText, ov::genai::add_special_tokens(encodeAddSpecialTokens)).input_ids; if (getProperties()->maxModelLength.has_value()) { @@ -305,6 +308,9 @@ absl::Status GenAiServable::preparePartialResponse(std::shared_ptrgenerationOutputs[0]; executionContext->apiHandler->incrementProcessedTokens(generationOutput.generated_ids.size()); + if (executionContext->apiHandler->isVerboseResponse()) { + executionContext->apiHandler->appendVerboseRawTokens(generationOutput.generated_ids); + } std::stringstream ss; executionContext->textStreamer->write(generationOutput.generated_ids); diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp index 13b7e73a62..7779d9c0be 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.cpp +++ b/src/llm/visual_language_model/continuous_batching/servable.cpp @@ -22,6 +22,7 @@ #include #include +#include "../../../config.hpp" #include "../../../logging.hpp" #include "../../text_utils.hpp" #include 
"../../../tokenize/tokenize_parser.hpp" @@ -121,6 +122,10 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptrapiHandler->enableVerboseResponse(vlmExecutionContext->inputText); + } + // Below logic is used only for the statistics and debugging purposes and does not affect the model execution. SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM input text: {}", vlmExecutionContext->inputText); bool encodeAddSpecialTokens = false; // assuming chat template application added special tokens diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index 033cb8641d..9c8e02c5df 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -224,6 +224,9 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar lastTextChunk = executionContext->lastStreamerCallbackOutput; executionContext->lastStreamerCallbackOutput = ""; } + if (executionContext->apiHandler->isVerboseResponse() && !lastTextChunk.empty()) { + executionContext->apiHandler->appendVerboseRawText(lastTextChunk); + } if (generationStatus != std::future_status::ready) { // continue // For RESPONSES endpoint, always call serializeStreamingChunk so that // output item initialization events are emitted even before the tokenizer produces text. 
@@ -244,6 +247,9 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar // if streamer::put returned a value, streamer::end() result will not contain it, so we add it manually if (!executionContext->lastStreamerCallbackOutput.empty()) { lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput; + if (executionContext->apiHandler->isVerboseResponse()) { + executionContext->apiHandler->appendVerboseRawText(executionContext->lastStreamerCallbackOutput); + } } if (legacyExecutionContext->results.finish_reasons.empty()) { SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in legacy VLM streaming generation result, defaulting to STOP"); @@ -316,6 +322,10 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptrapiHandler->enableVerboseResponse(vlmExecutionContext->inputText); + } + // Below logic is used only for the statistics and debugging purposes and does not affect the model execution. SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM input text: {}", vlmExecutionContext->inputText); bool encodeAddSpecialTokens = false; // assuming chat template application added special tokens