Fix server multimodal statistics (#392)
cjpais committed May 7, 2024
1 parent aa8c01a commit a2d159e
Showing 1 changed file with 13 additions and 0 deletions.
llama.cpp/server/server.cpp: 13 additions, 0 deletions
@@ -1374,6 +1374,7 @@ struct llama_server_context
     bool ingest_images(llama_client_slot &slot, int n_batch)
     {
         int image_idx = 0;
+        std::string prompt = "";

         while (image_idx < (int) slot.images.size())
         {
@@ -1435,6 +1436,11 @@ struct llama_server_context
                 slot.params.input_suffix : // no more images, then process suffix prompt
                 (json)(slot.images[image_idx].prefix_prompt);

+            // rebuild the prompt since it was cleared earlier
+            prompt += img.prefix_prompt;
+            prompt += "[img-" + std::to_string(img.id) + "]";
+            prompt += json_prompt;
+
             std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
             for (int i = 0; i < (int) append_tokens.size(); ++i)
             {
@@ -1443,6 +1449,13 @@ struct llama_server_context
             }
         }

+        // There is no prompt caching in multimodal currently
+        slot.num_prompt_tokens = slot.n_past;
+        slot.num_prompt_tokens_processed = slot.n_past;
+
+        // prompt for multimodal is set to empty to avoid processing those tokens here
+        slot.prompt = prompt;
+
         return true;
     }

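For context, here is a minimal standalone sketch (not the actual server.cpp code) of the behaviour the patch restores: once image ingestion finishes, the slot keeps the rebuilt prompt with its "[img-<id>]" placeholders and reports every evaluated position as a processed prompt token, since multimodal requests bypass prompt caching. The names SlotStats and finish_multimodal_ingest below are illustrative, not identifiers from llama.cpp.

// Hypothetical sketch, not server.cpp code: why rebuilding slot.prompt and
// setting the token counters fixes the reported statistics.
#include <cstdio>
#include <string>

struct SlotStats {
    std::string prompt;                    // rebuilt text prompt with "[img-N]" placeholders
    int n_past = 0;                        // positions already evaluated (text tokens + image embeddings)
    int num_prompt_tokens = 0;             // value reported back in the completion statistics
    int num_prompt_tokens_processed = 0;
};

// Mirrors the idea of the patch: with no prompt caching for multimodal
// requests, every evaluated position counts as a processed prompt token,
// and the rebuilt prompt is stored so it is no longer reported as empty.
static void finish_multimodal_ingest(SlotStats &slot, const std::string &rebuilt_prompt) {
    slot.num_prompt_tokens           = slot.n_past;
    slot.num_prompt_tokens_processed = slot.n_past;
    slot.prompt                      = rebuilt_prompt;
}

int main() {
    SlotStats slot;
    slot.n_past = 640;                     // e.g. 576 image-embedding positions + 64 text tokens
    finish_multimodal_ingest(slot, "USER: [img-10]\nWhat is in the image?\nASSISTANT:");
    std::printf("prompt tokens: %d (processed: %d)\nprompt: %s\n",
                slot.num_prompt_tokens, slot.num_prompt_tokens_processed, slot.prompt.c_str());
    return 0;
}

In the real server, n_past presumably already includes the positions consumed by the image embeddings, so reusing it for both counters keeps the reported statistics consistent with what was actually evaluated.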