Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/capi_frontend/server_settings.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,10 @@ struct ServerSettingsImpl {
std::optional<std::vector<std::string>> allowedMediaDomains;
std::string logLevel = "INFO";
std::string logPath;
// When enabled, LLM chat/text completion unary responses (and the final chunk
// of streaming chat completion responses) include an extra "__verbose" object
// with the raw model input (after chat template) and the raw model output
// (before tool/reasoning parsing). Inspired by llama.cpp -v.
bool verboseResponse = false;
bool allowCredentials = false;
std::string allowedOrigins{"*"};
std::string allowedMethods{"*"};
Expand Down
8 changes: 8 additions & 0 deletions src/cli_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,12 @@ std::variant<bool, std::pair<int, std::string>> CLIParser::parse(int argc, char*
("log_path",
"Optional path to the log file",
cxxopts::value<std::string>(), "LOG_PATH")
("verbose_response",
"When enabled, LLM chat/text completion unary responses include an extra "
"\"__verbose\" object with the raw prompt (after chat template) and the raw "
"model output (before tool/reasoning parsing). Useful for debugging.",
cxxopts::value<bool>()->default_value("false"),
"VERBOSE_RESPONSE")
#ifdef MTR_ENABLED
("trace_path",
"Path to the trace file",
Expand Down Expand Up @@ -522,6 +528,8 @@ void CLIParser::prepareServer(ServerSettingsImpl& serverSettings) {
serverSettings.logLevel = result->operator[]("log_level").as<std::string>();
if (result->count("log_path"))
serverSettings.logPath = result->operator[]("log_path").as<std::string>();
if (result->count("verbose_response"))
serverSettings.verboseResponse = result->operator[]("verbose_response").as<bool>();

if (result->count("grpc_channel_arguments"))
serverSettings.grpcChannelArguments = result->operator[]("grpc_channel_arguments").as<std::string>();
Expand Down
30 changes: 30 additions & 0 deletions src/llm/apis/openai_api_handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,16 @@ class OpenAIApiHandler {
// Output parser is used to parse chat completions response to extract specific fields like tool calls and reasoning.
std::unique_ptr<OutputParser> outputParser = nullptr;

// Verbose response support (enabled via --verbose_response). When set, the
// serialized unary response, as well as the final chunk of a streaming
// response, includes a "__verbose" object with the raw prompt (post chat
// template) and the raw decoded model output (before tool/reasoning parsing).
// Inspired by llama.cpp -v.
bool verboseResponse = false;
std::string verbosePrompt;
// Streaming accumulators for raw model output.
std::vector<int64_t> verboseRawTokens;
std::string verboseRawText;

// Shared parsing helpers
absl::Status parseCommonPart(std::optional<uint32_t> maxTokensLimit, uint32_t bestOfLimit, std::optional<uint32_t> maxModelLength);
absl::Status parseResponseFormat();
Expand Down Expand Up @@ -156,6 +166,26 @@ class OpenAIApiHandler {
std::string getToolChoice() const;
const std::unique_ptr<OutputParser>& getOutputParser() const;

// Verbose response configuration
// Switches this handler into verbose mode and keeps a copy of the prompt
// (as it looks after chat template application) for later serialization.
void enableVerboseResponse(const std::string& promptAfterTemplate) {
    verbosePrompt.assign(promptAfterTemplate);
    verboseResponse = true;
}
// Reports whether verbose mode has been switched on for this handler.
bool isVerboseResponse() const {
    return verboseResponse;
}
// Read-only access to the prompt stored by enableVerboseResponse().
// The reference stays bound to the handler's own member.
const std::string& getVerbosePrompt() const {
    return verbosePrompt;
}
// Accumulators used to assemble the "raw model output" for streaming responses.
// Adds the given token ids, in order, to the end of the accumulated raw
// token sequence.
void appendVerboseRawTokens(const std::vector<int64_t>& tokens) {
    const auto firstToken = tokens.cbegin();
    const auto pastLastToken = tokens.cend();
    verboseRawTokens.insert(verboseRawTokens.cend(), firstToken, pastLastToken);
}
// Concatenates one more piece of text onto the accumulated raw output text.
void appendVerboseRawText(const std::string& chunk) {
    verboseRawText += chunk;
}
// Replaces the accumulated raw output text with the supplied string.
void setVerboseRawText(std::string text) {
    // `text` is our own by-value copy, so exchanging buffers hands its
    // contents to the member; the previous contents die with `text`.
    verboseRawText.swap(text);
}
// Read-only access to the token ids collected via appendVerboseRawTokens().
const std::vector<int64_t>& getVerboseRawTokens() const {
    return verboseRawTokens;
}
// Read-only access to the raw output text collected via
// appendVerboseRawText() / setVerboseRawText().
const std::string& getVerboseRawText() const {
    return verboseRawText;
}

// Usage tracking
void setPromptTokensUsage(size_t promptTokens);
void setCompletionTokensUsage(size_t completionTokens);
Expand Down
45 changes: 45 additions & 0 deletions src/llm/apis/openai_completions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,17 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vect
// TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with.
// Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism.

if (isVerboseResponse()) {
jsonResponse.StartObject("__verbose");
jsonResponse.String("prompt", getVerbosePrompt());
std::string rawContent;
if (!generationOutputs.empty()) {
rawContent = tokenizer.decode(generationOutputs.front().generated_ids, ov::genai::skip_special_tokens(false));
}
jsonResponse.String("content", rawContent);
jsonResponse.EndObject();
}

// finish response object
jsonResponse.EndObject();
return jsonResponse.ToString();
Expand Down Expand Up @@ -458,6 +469,17 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco
// TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with.
// Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism.

if (isVerboseResponse()) {
jsonResponse.StartObject("__verbose");
jsonResponse.String("prompt", getVerbosePrompt());
std::string rawContent;
if (!results.tokens.empty()) {
rawContent = tokenizer.decode(results.tokens.front(), ov::genai::skip_special_tokens(false));
}
jsonResponse.String("content", rawContent);
jsonResponse.EndObject();
}

// finish response object
jsonResponse.EndObject();
return jsonResponse.ToString();
Expand Down Expand Up @@ -528,6 +550,14 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD
// TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with.
// Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism.

if (isVerboseResponse()) {
jsonResponse.StartObject("__verbose");
jsonResponse.String("prompt", getVerbosePrompt());
// For VLM the raw decoded text is provided by GenAI directly.
jsonResponse.String("content", textResponse);
jsonResponse.EndObject();
}

// finish response object
jsonResponse.EndObject();
return jsonResponse.ToString();
Expand Down Expand Up @@ -622,6 +652,21 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str
// TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with.
// Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism.

// Verbose mode: attach prompt and raw model output to the FINAL chunk only.
if (isVerboseResponse() && finishReason != ov::genai::GenerationFinishReason::NONE) {
std::string rawOutput;
if (!getVerboseRawTokens().empty()) {
rawOutput = tokenizer.decode(getVerboseRawTokens(), ov::genai::skip_special_tokens(false));
} else {
rawOutput = getVerboseRawText();
}

Value verboseObject(kObjectType);
verboseObject.AddMember("prompt", Value(getVerbosePrompt().c_str(), allocator), allocator);
verboseObject.AddMember("content", Value(rawOutput.c_str(), allocator), allocator);
doc.AddMember("__verbose", verboseObject, allocator);
}

StringBuffer buffer;
Writer<StringBuffer> writer(buffer);
doc.Accept(writer);
Expand Down
3 changes: 3 additions & 0 deletions src/llm/language_model/legacy/servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,9 @@ absl::Status LegacyServable::preparePartialResponse(std::shared_ptr<GenAiServabl
}
// Legacy generation path always runs with batch=1, so we read the single finish reason at index 0.
ov::genai::GenerationFinishReason finishReason = legacyExecutionContext->results.finish_reasons.empty() ? ov::genai::GenerationFinishReason::STOP : legacyExecutionContext->results.finish_reasons[0];
if (executionContext->apiHandler->isVerboseResponse() && !legacyExecutionContext->results.tokens.empty()) {
executionContext->apiHandler->appendVerboseRawTokens(legacyExecutionContext->results.tokens[0]);
}
std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);
if (!serializedChunk.empty()) {
executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
Expand Down
6 changes: 6 additions & 0 deletions src/llm/servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,9 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptr<GenAiServableExecution
case Endpoint::TOKENIZE:
return absl::InternalError("Tokenize endpoint should not reach prepareInputs stage");
}
if (Config::instance().getServerSettings().verboseResponse) {
executionContext->apiHandler->enableVerboseResponse(inputText);
}
bool encodeAddSpecialTokens = (executionContext->endpoint == Endpoint::COMPLETIONS);
executionContext->inputIds = getProperties()->tokenizer.encode(inputText, ov::genai::add_special_tokens(encodeAddSpecialTokens)).input_ids;
if (getProperties()->maxModelLength.has_value()) {
Expand Down Expand Up @@ -305,6 +308,9 @@ absl::Status GenAiServable::preparePartialResponse(std::shared_ptr<GenAiServable
}
auto& generationOutput = executionContext->generationOutputs[0];
executionContext->apiHandler->incrementProcessedTokens(generationOutput.generated_ids.size());
if (executionContext->apiHandler->isVerboseResponse()) {
executionContext->apiHandler->appendVerboseRawTokens(generationOutput.generated_ids);
}

std::stringstream ss;
executionContext->textStreamer->write(generationOutput.generated_ids);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <unordered_map>
#include <vector>

#include "../../../config.hpp"
#include "../../../logging.hpp"
#include "../../text_utils.hpp"
#include "../../../tokenize/tokenize_parser.hpp"
Expand Down Expand Up @@ -121,6 +122,10 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr<GenAiSer
return absl::InvalidArgumentError("Unsupported endpoint");
}

if (Config::instance().getServerSettings().verboseResponse) {
vlmExecutionContext->apiHandler->enableVerboseResponse(vlmExecutionContext->inputText);
}

// Below logic is used only for the statistics and debugging purposes and does not affect the model execution.
SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM input text: {}", vlmExecutionContext->inputText);
bool encodeAddSpecialTokens = false; // assuming chat template application added special tokens
Expand Down
10 changes: 10 additions & 0 deletions src/llm/visual_language_model/legacy/servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,9 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar
lastTextChunk = executionContext->lastStreamerCallbackOutput;
executionContext->lastStreamerCallbackOutput = "";
}
if (executionContext->apiHandler->isVerboseResponse() && !lastTextChunk.empty()) {
executionContext->apiHandler->appendVerboseRawText(lastTextChunk);
}
if (generationStatus != std::future_status::ready) { // continue
// For RESPONSES endpoint, always call serializeStreamingChunk so that
// output item initialization events are emitted even before the tokenizer produces text.
Expand All @@ -244,6 +247,9 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar
// if streamer::put returned a value, streamer::end() result will not contain it, so we add it manually
if (!executionContext->lastStreamerCallbackOutput.empty()) {
lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput;
if (executionContext->apiHandler->isVerboseResponse()) {
executionContext->apiHandler->appendVerboseRawText(executionContext->lastStreamerCallbackOutput);
}
}
if (legacyExecutionContext->results.finish_reasons.empty()) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in legacy VLM streaming generation result, defaulting to STOP");
Expand Down Expand Up @@ -316,6 +322,10 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr<Ge
return absl::InvalidArgumentError("Unsupported endpoint");
}

if (Config::instance().getServerSettings().verboseResponse) {
vlmExecutionContext->apiHandler->enableVerboseResponse(vlmExecutionContext->inputText);
}

// Below logic is used only for the statistics and debugging purposes and does not affect the model execution.
SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM input text: {}", vlmExecutionContext->inputText);
bool encodeAddSpecialTokens = false; // assuming chat template application added special tokens
Expand Down