Fix server multimodal statistics (#392)
cjpais committed May 7, 2024
1 parent aa8c01a commit a2d159e
Showing 1 changed file with 13 additions and 0 deletions.
llama.cpp/server/server.cpp: 13 additions, 0 deletions
@@ -1374,6 +1374,7 @@ struct llama_server_context
     bool ingest_images(llama_client_slot &slot, int n_batch)
     {
         int image_idx = 0;
+        std::string prompt = "";

         while (image_idx < (int) slot.images.size())
         {
@@ -1435,6 +1436,11 @@ struct llama_server_context
                 slot.params.input_suffix : // no more images, then process suffix prompt
                 (json)(slot.images[image_idx].prefix_prompt);

+            // rebuild the prompt since it was cleared earlier
+            prompt += img.prefix_prompt;
+            prompt += "[img-" + std::to_string(img.id) + "]";
+            prompt += json_prompt;
+
             std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
             for (int i = 0; i < (int) append_tokens.size(); ++i)
             {
@@ -1443,6 +1449,13 @@ struct llama_server_context
             }
         }

+        // There is no prompt caching in multimodal currently
+        slot.num_prompt_tokens = slot.n_past;
+        slot.num_prompt_tokens_processed = slot.n_past;
+
+        // prompt for multimodal is set to empty to avoid processing those tokens here
+        slot.prompt = prompt;
+
         return true;
     }

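For context, here is a minimal standalone sketch (not the actual server.cpp code) of the behaviour the patch restores: once image ingestion finishes, the slot keeps the rebuilt prompt with its "[img-<id>]" placeholders and reports every evaluated position as a processed prompt token, since multimodal requests bypass prompt caching. The names SlotStats and finish_multimodal_ingest below are illustrative, not identifiers from llama.cpp.

// Hypothetical sketch, not server.cpp code: why rebuilding slot.prompt and
// setting the token counters fixes the reported statistics.
#include <cstdio>
#include <string>

struct SlotStats {
    std::string prompt;                    // rebuilt text prompt with "[img-N]" placeholders
    int n_past = 0;                        // positions already evaluated (text tokens + image embeddings)
    int num_prompt_tokens = 0;             // value reported back in the completion statistics
    int num_prompt_tokens_processed = 0;
};

// Mirrors the idea of the patch: with no prompt caching for multimodal
// requests, every evaluated position counts as a processed prompt token,
// and the rebuilt prompt is stored so it is no longer reported as empty.
static void finish_multimodal_ingest(SlotStats &slot, const std::string &rebuilt_prompt) {
    slot.num_prompt_tokens           = slot.n_past;
    slot.num_prompt_tokens_processed = slot.n_past;
    slot.prompt                      = rebuilt_prompt;
}

int main() {
    SlotStats slot;
    slot.n_past = 640;                     // e.g. 576 image-embedding positions + 64 text tokens
    finish_multimodal_ingest(slot, "USER: [img-10]\nWhat is in the image?\nASSISTANT:");
    std::printf("prompt tokens: %d (processed: %d)\nprompt: %s\n",
                slot.num_prompt_tokens, slot.num_prompt_tokens_processed, slot.prompt.c_str());
    return 0;
}

In the real server, n_past presumably already includes the positions consumed by the image embeddings, so reusing it for both counters keeps the reported statistics consistent with what was actually evaluated.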