Upgrade to latest llama.cpp code
- There's a new super cool flash attention feature (pass the -fa flag)

- llama.cpp is now able to ask tinyBLAS to use an F16 output type,
  which should help it reduce overall memory requirements.

- llama.cpp will now tell tinyBLAS when it wants a higher precision
  word size, which is useful for models like Phi-2 and Phi-3, where
  using ARM FP16 arithmetic might not be a good idea.

- We're using a new strategy for synchronizing ggml-quants.c, where
  instead of doing runtime dispatching by hand, it is now done with
  generated code. This is good news since it means many quants that
  couldn't be optimized for modern machines before (e.g. IQ quants)
  will now go much faster on AVX2 and AVX512 microprocessors.
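
  A rough sketch of that last idea (illustrative only: the function names,
  the simplified signatures, and the __builtin_cpu_supports probe below are
  stand-ins, not the generated code itself). Each kernel is compiled once per
  ISA level, and a small dispatcher picks a variant at runtime:

    // Sketch: per-ISA copies of one kernel plus a runtime dispatcher.
    #include <cstddef>

    // Stand-in bodies. In the real scheme each variant is the same source
    // compiled with different target flags (see the BUILD.mk hunk below),
    // so the compiler may vectorize each copy for its ISA.
    static void vec_dot_avx(size_t n, float *s, const float *x, const float *y) {
        float acc = 0;
        for (size_t i = 0; i < n; i++) acc += x[i] * y[i];
        *s = acc;
    }
    static void vec_dot_avx2(size_t n, float *s, const float *x, const float *y)   { vec_dot_avx(n, s, x, y); }
    static void vec_dot_avx512(size_t n, float *s, const float *x, const float *y) { vec_dot_avx(n, s, x, y); }

    // Runtime dispatch on x86: choose the best variant the CPU supports.
    // __builtin_cpu_supports is a GCC/Clang builtin.
    void vec_dot(size_t n, float *s, const float *x, const float *y) {
        if (__builtin_cpu_supports("avx512f")) return vec_dot_avx512(n, s, x, y);
        if (__builtin_cpu_supports("avx2"))    return vec_dot_avx2(n, s, x, y);
        return vec_dot_avx(n, s, x, y);
    }
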
jart committed May 1, 2024
1 parent 9540b43 commit 0bdea60
Showing 49 changed files with 10,733 additions and 5,838 deletions.
5 changes: 5 additions & 0 deletions llama.cpp/BUILD.mk
@@ -43,6 +43,11 @@ o/$(MODE)/llama.cpp/ggml-alloc.o \
o/$(MODE)/llama.cpp/common.o: private \
CCFLAGS += -Os

o/$(MODE)/llama.cpp/ggml-quants.o: private CXXFLAGS += -Os
o/$(MODE)/llama.cpp/ggml-quants-amd-avx.o: private TARGET_ARCH += -Xx86_64-mtune=sandybridge
o/$(MODE)/llama.cpp/ggml-quants-amd-avx2.o: private TARGET_ARCH += -Xx86_64-mtune=skylake -Xx86_64-mf16c -Xx86_64-mfma -Xx86_64-mavx2
o/$(MODE)/llama.cpp/ggml-quants-amd-avx512.o: private TARGET_ARCH += -Xx86_64-mtune=cannonlake -Xx86_64-mf16c -Xx86_64-mfma -Xx86_64-mavx2 -Xx86_64-mavx512f

$(LLAMA_CPP_OBJS): llama.cpp/BUILD.mk

.PHONY: o/$(MODE)/llama.cpp
6 changes: 3 additions & 3 deletions llama.cpp/README.llamafile
@@ -9,8 +9,8 @@ LICENSE
ORIGIN

https://github.com/ggerganov/llama.cpp/pull/4406/
46e12c4692a37bdd31a0432fc5153d7d22bc7f72
2024-04-25
a8f9b076316e16aadd0791015b3bfd446fe1e904
2024-04-30

LOCAL MODIFICATIONS

@@ -21,9 +21,9 @@ LOCAL MODIFICATIONS
- Add support to main() programs for Cosmo /zip/.args files
- Introduce pledge() SECCOMP sandboxing to improve security
- Call exit() rather than abort() when GGML_ASSERT() fails
- Clamp bf16/f32 values before passing to K quantizers
- Make GPU logger callback API safer and less generic
- Write log to /dev/null when main.log fails to open
- Use _rand64() rather than time() as default seed
- Make main and llava-cli print timings on ctrl-c
- Avoid bind() conflicts on port 8080 w/ server
- Use runtime dispatching for matmul quants
376 changes: 208 additions & 168 deletions llama.cpp/common.cpp

Large diffs are not rendered by default.

17 changes: 13 additions & 4 deletions llama.cpp/common.h
@@ -36,6 +36,8 @@
tinylog(__func__, ": llamafile version " LLAMAFILE_VERSION_STRING "\n", NULL); \
} while(0)

#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

// build info
extern int LLAMA_BUILD_NUMBER;
extern char const *LLAMA_COMMIT;
@@ -97,7 +99,7 @@ struct gpt_params {
// // sampling parameters
struct llama_sampling_params sparams;

std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model = ""; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download
@@ -138,7 +140,7 @@ struct gpt_params {
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed

bool kl_divergence = false; // compute KL-divergence
bool kl_divergence = false; // compute KL divergence

bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
@@ -153,6 +155,7 @@ struct gpt_params {
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
@@ -166,15 +169,20 @@ struct gpt_params {
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data

std::string cache_type_k = "f16"; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V

// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::string image = ""; // path to an image file
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
};

void gpt_params_handle_model_default(gpt_params & params);

bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);

bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);

bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
@@ -198,6 +206,7 @@ bool validate_file_name(const std::string & filename);
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
std::vector<std::string> string_split(std::string input, char separator);
std::string string_strip(const std::string & str);
std::string sampler_type_to_name_string(llama_sampler_type sampler_type);

//
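
For context, a minimal sketch of how a caller might use the fields and helper
added above. The include path assumes a build inside the llamafile tree;
everything else here is illustrative and not code from this commit:

    #include "llama.cpp/common.h"   // gpt_params and the helpers declared above

    static void configure(gpt_params & params) {
        params.flash_attn    = true;     // new: enable flash attention (the -fa flag)
        params.check_tensors = false;    // new: optionally validate tensor data on load
        params.image.push_back("a.png"); // changed: image is now a vector of paths,
        params.image.push_back("b.png"); // so more than one image can be supplied
        // new helper: derives a default params.model when no path was given
        gpt_params_handle_model_default(params);
    }
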
1 change: 1 addition & 0 deletions llama.cpp/ggml-backend-impl.h
@@ -180,6 +180,7 @@ extern "C" {
bool (*GGML_CALL ggml_backend_buffer_is_host)(ggml_backend_buffer_t);
bool (*GGML_CALL ggml_guid_matches)(ggml_guid_t, ggml_guid_t);
bool (*GGML_CALL ggml_is_empty)(const struct ggml_tensor *);
bool (*GGML_CALL ggml_are_same_shape)(const struct ggml_tensor *, const struct ggml_tensor *);
};

#ifdef __cplusplus
13 changes: 8 additions & 5 deletions llama.cpp/ggml-backend.c
@@ -1790,12 +1790,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {

void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
// reset state for the next run
size_t hash_size = sched->hash_set.size;
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
if (!sched->is_reset) {
size_t hash_size = sched->hash_set.size;
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);

sched->is_reset = true;
sched->is_reset = true;
}
sched->is_alloc = false;
}

@@ -2161,6 +2163,7 @@ static const struct ggml_backend_api kGgmlBackendApi = {
ggml_backend_buffer_is_host,
ggml_guid_matches,
ggml_is_empty,
ggml_are_same_shape,
};

const struct ggml_backend_api *ggml_backend_api(void) {
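
The two backend hunks go together: the function-pointer field added in
ggml-backend-impl.h and the kGgmlBackendApi initializer in ggml-backend.c must
stay in the same order, so the new ggml_are_same_shape entry is appended at
the end of both (appending also leaves the offsets of the existing entries
unchanged). A stripped-down sketch of the pattern, with illustrative names
only rather than the real table:

    struct ggml_tensor;   // opaque here

    // Host-side table of callbacks handed to a separately loaded module,
    // which calls back into ggml through it instead of referencing the
    // host's symbols directly.
    struct backend_api_sketch {
        bool (*is_empty)(const struct ggml_tensor *);
        bool (*are_same_shape)(const struct ggml_tensor *, const struct ggml_tensor *); // appended last
    };

    static bool shapes_match(const backend_api_sketch *api,
                             const ggml_tensor *a, const ggml_tensor *b) {
        return api->are_same_shape(a, b);
    }
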
