Upgrade to latest llama.cpp code
- There's a new super cool flash attention feature (pass the -fa flag)

- llama.cpp is now able to ask tinyBLAS to use an F16 output type,
  which should help it reduce overall memory requirements.

- llama.cpp will now tell tinyBLAS when it wants a higher precision
  word size, which is useful for models like Phi-2 and Phi-3, where
  using ARM FP16 arithmetic might not be a good idea.

- We're using a new strategy for synchronizing ggml-quants.c, where
  instead of doing runtime dispatching by hand, it is now done with
  generated code. This is good news since it means many quants that
  couldn't be optimized for modern machines before (e.g. IQ quants)
  will now go much faster on AVX2 and AVX512 microprocessors.
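
  A rough sketch of that last idea (illustrative only: the function names,
  the simplified signatures, and the __builtin_cpu_supports probe below are
  stand-ins, not the generated code itself). Each kernel is compiled once per
  ISA level, and a small dispatcher picks a variant at runtime:

    // Sketch: per-ISA copies of one kernel plus a runtime dispatcher.
    #include <cstddef>

    // Stand-in bodies. In the real scheme each variant is the same source
    // compiled with different target flags (see the BUILD.mk hunk below),
    // so the compiler may vectorize each copy for its ISA.
    static void vec_dot_avx(size_t n, float *s, const float *x, const float *y) {
        float acc = 0;
        for (size_t i = 0; i < n; i++) acc += x[i] * y[i];
        *s = acc;
    }
    static void vec_dot_avx2(size_t n, float *s, const float *x, const float *y)   { vec_dot_avx(n, s, x, y); }
    static void vec_dot_avx512(size_t n, float *s, const float *x, const float *y) { vec_dot_avx(n, s, x, y); }

    // Runtime dispatch on x86: choose the best variant the CPU supports.
    // __builtin_cpu_supports is a GCC/Clang builtin.
    void vec_dot(size_t n, float *s, const float *x, const float *y) {
        if (__builtin_cpu_supports("avx512f")) return vec_dot_avx512(n, s, x, y);
        if (__builtin_cpu_supports("avx2"))    return vec_dot_avx2(n, s, x, y);
        return vec_dot_avx(n, s, x, y);
    }
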
jart committed May 1, 2024
1 parent 9540b43 commit 0bdea60
Showing 49 changed files with 10,733 additions and 5,838 deletions.
5 changes: 5 additions & 0 deletions llama.cpp/BUILD.mk
@@ -43,6 +43,11 @@ o/$(MODE)/llama.cpp/ggml-alloc.o \
o/$(MODE)/llama.cpp/common.o: private \
CCFLAGS += -Os

o/$(MODE)/llama.cpp/ggml-quants.o: private CXXFLAGS += -Os
o/$(MODE)/llama.cpp/ggml-quants-amd-avx.o: private TARGET_ARCH += -Xx86_64-mtune=sandybridge
o/$(MODE)/llama.cpp/ggml-quants-amd-avx2.o: private TARGET_ARCH += -Xx86_64-mtune=skylake -Xx86_64-mf16c -Xx86_64-mfma -Xx86_64-mavx2
o/$(MODE)/llama.cpp/ggml-quants-amd-avx512.o: private TARGET_ARCH += -Xx86_64-mtune=cannonlake -Xx86_64-mf16c -Xx86_64-mfma -Xx86_64-mavx2 -Xx86_64-mavx512f

$(LLAMA_CPP_OBJS): llama.cpp/BUILD.mk

.PHONY: o/$(MODE)/llama.cpp
6 changes: 3 additions & 3 deletions llama.cpp/README.llamafile
@@ -9,8 +9,8 @@ LICENSE
ORIGIN

https://github.com/ggerganov/llama.cpp/pull/4406/
46e12c4692a37bdd31a0432fc5153d7d22bc7f72
2024-04-25
a8f9b076316e16aadd0791015b3bfd446fe1e904
2024-04-30

LOCAL MODIFICATIONS

@@ -21,9 +21,9 @@ LOCAL MODIFICATIONS
- Add support to main() programs for Cosmo /zip/.args files
- Introduce pledge() SECCOMP sandboxing to improve security
- Call exit() rather than abort() when GGML_ASSERT() fails
- Clamp bf16/f32 values before passing to K quantizers
- Make GPU logger callback API safer and less generic
- Write log to /dev/null when main.log fails to open
- Use _rand64() rather than time() as default seed
- Make main and llava-cli print timings on ctrl-c
- Avoid bind() conflicts on port 8080 w/ server
- Use runtime dispatching for matmul quants
376 changes: 208 additions & 168 deletions llama.cpp/common.cpp

Large diffs are not rendered by default.

17 changes: 13 additions & 4 deletions llama.cpp/common.h
@@ -36,6 +36,8 @@
tinylog(__func__, ": llamafile version " LLAMAFILE_VERSION_STRING "\n", NULL); \
} while(0)

#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

// build info
extern int LLAMA_BUILD_NUMBER;
extern char const *LLAMA_COMMIT;
@@ -97,7 +99,7 @@ struct gpt_params {
// // sampling parameters
struct llama_sampling_params sparams;

std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model = ""; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download
@@ -138,7 +140,7 @@ struct gpt_params {
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed

bool kl_divergence = false; // compute KL-divergence
bool kl_divergence = false; // compute KL divergence

bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
@@ -153,6 +155,7 @@ struct gpt_params {
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
@@ -166,15 +169,20 @@ struct gpt_params {
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data

std::string cache_type_k = "f16"; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V

// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::string image = ""; // path to an image file
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
};

void gpt_params_handle_model_default(gpt_params & params);

bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);

bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);

bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
@@ -198,6 +206,7 @@ bool validate_file_name(const std::string & filename);
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
std::vector<std::string> string_split(std::string input, char separator);
std::string string_strip(const std::string & str);
std::string sampler_type_to_name_string(llama_sampler_type sampler_type);

//
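
For context, a minimal sketch of how a caller might use the fields and helper
added above. The include path assumes a build inside the llamafile tree;
everything else here is illustrative and not code from this commit:

    #include "llama.cpp/common.h"   // gpt_params and the helpers declared above

    static void configure(gpt_params & params) {
        params.flash_attn    = true;     // new: enable flash attention (the -fa flag)
        params.check_tensors = false;    // new: optionally validate tensor data on load
        params.image.push_back("a.png"); // changed: image is now a vector of paths,
        params.image.push_back("b.png"); // so more than one image can be supplied
        // new helper: derives a default params.model when no path was given
        gpt_params_handle_model_default(params);
    }
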
1 change: 1 addition & 0 deletions llama.cpp/ggml-backend-impl.h
@@ -180,6 +180,7 @@ extern "C" {
bool (*GGML_CALL ggml_backend_buffer_is_host)(ggml_backend_buffer_t);
bool (*GGML_CALL ggml_guid_matches)(ggml_guid_t, ggml_guid_t);
bool (*GGML_CALL ggml_is_empty)(const struct ggml_tensor *);
bool (*GGML_CALL ggml_are_same_shape)(const struct ggml_tensor *, const struct ggml_tensor *);
};

#ifdef __cplusplus
13 changes: 8 additions & 5 deletions llama.cpp/ggml-backend.c
@@ -1790,12 +1790,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {

void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
// reset state for the next run
size_t hash_size = sched->hash_set.size;
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
if (!sched->is_reset) {
size_t hash_size = sched->hash_set.size;
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);

sched->is_reset = true;
sched->is_reset = true;
}
sched->is_alloc = false;
}

@@ -2161,6 +2163,7 @@ static const struct ggml_backend_api kGgmlBackendApi = {
ggml_backend_buffer_is_host,
ggml_guid_matches,
ggml_is_empty,
ggml_are_same_shape,
};

const struct ggml_backend_api *ggml_backend_api(void) {
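
The two backend hunks go together: the function-pointer field added in
ggml-backend-impl.h and the kGgmlBackendApi initializer in ggml-backend.c must
stay in the same order, so the new ggml_are_same_shape entry is appended at
the end of both (appending also leaves the offsets of the existing entries
unchanged). A stripped-down sketch of the pattern, with illustrative names
only rather than the real table:

    struct ggml_tensor;   // opaque here

    // Host-side table of callbacks handed to a separately loaded module,
    // which calls back into ggml through it instead of referencing the
    // host's symbols directly.
    struct backend_api_sketch {
        bool (*is_empty)(const struct ggml_tensor *);
        bool (*are_same_shape)(const struct ggml_tensor *, const struct ggml_tensor *); // appended last
    };

    static bool shapes_match(const backend_api_sketch *api,
                             const ggml_tensor *a, const ggml_tensor *b) {
        return api->are_same_shape(a, b);
    }
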
