feat: add flash_attn support
sammcj committed May 5, 2024
1 parent b7a87a2 commit e400b1a
Showing 4 changed files with 26 additions and 4 deletions.
2 changes: 2 additions & 0 deletions api/types.go
@@ -151,6 +151,7 @@ type Runner struct {
UseMMap bool `json:"use_mmap,omitempty"`
UseMLock bool `json:"use_mlock,omitempty"`
NumThread int `json:"num_thread,omitempty"`
FlashAttn bool `json:"flash_attn,omitempty"`

// Unused: RopeFrequencyBase is ignored. Instead the value in the model will be used
RopeFrequencyBase float32 `json:"rope_frequency_base,omitempty"`
@@ -428,6 +429,7 @@ func DefaultOptions() Options {
UseMLock: false,
UseMMap: true,
UseNUMA: false,
FlashAttn: false, // for CPU only compatibility
},
}
}
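The new `FlashAttn` field is serialized as `flash_attn` and defaults to false, so flash attention stays opt-in per request. Below is a minimal sketch (not part of this commit) of enabling it from a client through the options map; the Go API package path `github.com/ollama/ollama/api` and the model name are assumptions for illustration.

```go
// A minimal sketch: enabling the new flash_attn option from a Go client.
// The package path and model name are assumptions, not part of this commit.
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.GenerateRequest{
		Model:  "llama3", // illustrative model name
		Prompt: "Why is the sky blue?",
		// Request options are overlaid on the defaults, which now include
		// FlashAttn: false, so flash attention only turns on when asked for.
		Options: map[string]interface{}{
			"flash_attn": true,
		},
	}

	err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		fmt.Print(resp.Response)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```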
17 changes: 14 additions & 3 deletions llm/ext_server/server.cpp
@@ -2106,6 +2106,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
printf(" -spf FNAME, --system-prompt-file FNAME\n");
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
printf(" -ctk TYPE, --cache-type-k TYPE\n");
@@ -2507,7 +2508,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
{
params.use_mmap = false;
}
else if (arg == "--numa") {
else if (arg == "--numa")
{
if (++i >= argc) {
invalid_param = true;
break;
@@ -2527,6 +2529,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
{
params.cont_batching = true;
}
else if (arg == "-fa" || arg == "--flash-attn")
{
params.flash_attn = true;
}
else if (arg == "-np" || arg == "--parallel")
{
if (++i >= argc)
@@ -2535,15 +2541,17 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
break;
}
params.n_parallel = std::stoi(argv[i]);
} else if (arg == "-n" || arg == "--n-predict")
}
else if (arg == "-n" || arg == "--n-predict")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.n_predict = std::stoi(argv[i]);
} else if (arg == "-spf" || arg == "--system-prompt-file")
}
else if (arg == "-spf" || arg == "--system-prompt-file")
{
if (++i >= argc)
{
@@ -2678,6 +2686,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
exit(1);
}
}

gpt_params_handle_model_default(params);

if (!params.kv_overrides.empty()) {
params.kv_overrides.emplace_back();
params.kv_overrides.back().key[0] = 0;
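On the C++ side, `server_params_parse` now recognizes `-fa` / `--flash-attn` as a boolean toggle that sets `params.flash_attn` before the model is loaded. The sketch below shows how a launcher process could pass the new flag on the command line; the binary name, model path, and port are hypothetical placeholders rather than values from this commit.

```go
// A hedged sketch of launching the ext server with the new flag. The binary
// name, model path, and port are placeholders, not values from this commit.
package main

import (
	"log"
	"os/exec"
)

func main() {
	args := []string{
		"--model", "/models/example.gguf", // placeholder model path
		"--port", "8080", // placeholder port
		"--flash-attn", // new flag added by this commit (short form: -fa)
	}

	cmd := exec.Command("./ollama_llama_server", args...) // placeholder binary name
	if err := cmd.Start(); err != nil {
		log.Fatalf("failed to start ext server: %v", err)
	}
	log.Printf("started %s with args %v", cmd.Path, args)
}
```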
2 changes: 1 addition & 1 deletion llm/llama.cpp
Submodule llama.cpp updated 94 files
+15 −1 .flake8
+13 −1 .github/workflows/bench.yml
+1 −1 .github/workflows/close-issue.yml
+1 −2 .github/workflows/python-lint.yml
+3 −2 .pre-commit-config.yaml
+1 −2 Makefile
+10 −36 README.md
+6 −2 ci/run.sh
+8 −1 common/common.cpp
+2 −1 common/common.h
+2 −2 common/log.h
+97 −75 convert-hf-to-gguf-update.py
+128 −151 convert-hf-to-gguf.py
+27 −24 convert-llama-ggml-to-gguf.py
+17 −15 convert-lora-to-ggml.py
+16 −12 convert-persimmon-to-gguf.py
+36 −24 convert.py
+17 −11 examples/batched-bench/batched-bench.cpp
+15 −4 examples/gguf-split/gguf-split.cpp
+7 −7 examples/gguf-split/tests.sh
+27 −3 examples/llama-bench/llama-bench.cpp
+1 −1 examples/main/main.cpp
+115 −3 examples/perplexity/README.md
+175 −55 examples/perplexity/perplexity.cpp
+1 −0 examples/server/bench/bench.py
+5 −1 examples/server/server.cpp
+56 −32 examples/server/tests/features/results.feature
+100 −48 examples/server/tests/features/steps/steps.py
+6 −0 ggml-cuda.cu
+62 −15 ggml-cuda/common.cuh
+944 −0 ggml-cuda/fattn.cu
+3 −0 ggml-cuda/fattn.cuh
+36 −10 ggml-cuda/softmax.cu
+7 −0 ggml-kompute.cpp
+376 −176 ggml-metal.m
+654 −18 ggml-metal.metal
+5 −1 ggml-sycl.cpp
+5 −0 ggml-vulkan.cpp
+361 −16 ggml.c
+20 −0 ggml.h
+9 −3 ggml_vk_generate_shaders.py
+10 −8 gguf-py/examples/reader.py
+1 −3 gguf-py/gguf/constants.py
+3 −1 gguf-py/gguf/gguf_reader.py
+4 −1 gguf-py/gguf/gguf_writer.py
+13 −29 gguf-py/gguf/vocab.py
+58 −36 gguf-py/scripts/gguf-convert-endian.py
+20 −9 gguf-py/scripts/gguf-dump.py
+18 −13 gguf-py/scripts/gguf-set-metadata.py
+387 −204 llama.cpp
+6 −3 llama.h
+4 −0 models/ggml-vocab-bert-bge.gguf.inp
+2 −0 models/ggml-vocab-bert-bge.gguf.out
+ models/ggml-vocab-command-r.gguf
+106 −0 models/ggml-vocab-command-r.gguf.inp
+43 −0 models/ggml-vocab-command-r.gguf.out
+4 −0 models/ggml-vocab-deepseek-coder.gguf.inp
+2 −0 models/ggml-vocab-deepseek-coder.gguf.out
+4 −0 models/ggml-vocab-deepseek-llm.gguf.inp
+2 −0 models/ggml-vocab-deepseek-llm.gguf.out
+4 −0 models/ggml-vocab-falcon.gguf.inp
+2 −0 models/ggml-vocab-falcon.gguf.out
+4 −0 models/ggml-vocab-gpt-2.gguf.inp
+2 −0 models/ggml-vocab-gpt-2.gguf.out
+4 −0 models/ggml-vocab-llama-bpe.gguf.inp
+2 −0 models/ggml-vocab-llama-bpe.gguf.out
+4 −0 models/ggml-vocab-llama-spm.gguf.inp
+2 −0 models/ggml-vocab-llama-spm.gguf.out
+4 −0 models/ggml-vocab-mpt.gguf.inp
+2 −0 models/ggml-vocab-mpt.gguf.out
+ models/ggml-vocab-phi-3.gguf
+4 −0 models/ggml-vocab-phi-3.gguf.inp
+2 −0 models/ggml-vocab-phi-3.gguf.out
+ models/ggml-vocab-refact.gguf
+106 −0 models/ggml-vocab-refact.gguf.inp
+43 −0 models/ggml-vocab-refact.gguf.out
+4 −0 models/ggml-vocab-starcoder.gguf.inp
+2 −0 models/ggml-vocab-starcoder.gguf.out
+1 −1 requirements/requirements-convert.txt
+16 −24 scripts/compare-llama-bench.py
+66 −0 scripts/gen-unicode-data.py
+10 −4 scripts/run-with-preset.py
+8 −5 scripts/verify-checksum-models.py
+5 −2 tests/CMakeLists.txt
+48 −4 tests/test-backend-ops.cpp
+0 −117 tests/test-tokenizer-0-bpe.py
+0 −114 tests/test-tokenizer-0-spm.py
+30 −9 tests/test-tokenizer-0.cpp
+46 −0 tests/test-tokenizer-0.py
+34 −0 tests/test-tokenizer-0.sh
+458 −416 unicode-data.cpp
+1 −1 unicode-data.h
+11 −11 unicode.cpp
+1 −1 unicode.h
9 changes: 9 additions & 0 deletions llm/server.go
@@ -193,6 +193,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--numa")
}

if opts.FlashAttn {
flashAttnSupported := (gpus[0].Library == "cuda" && gpus[0].Major >= 7 || gpus[0].Library == "metal")
if flashAttnSupported {
params = append(params, "--flash-attn")
} else {
slog.Warn("flash attention is not supported on your current hardware configuration, it is now disabled")
}
}

// "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
numParallel := 1
if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
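The gate above only forwards `--flash-attn` when the first GPU is either a CUDA device with compute capability 7 or newer, or a Metal device; otherwise the option is dropped with a warning. A standalone restatement of that check is sketched below, using a simplified stand-in for `gpu.GpuInfoList` that carries only the `Library` and `Major` fields referenced by the diff.

```go
// A simplified, standalone restatement of the support check above. GpuInfo
// here is a stand-in with only the fields the diff references, not the real
// gpu.GpuInfoList type.
package main

import (
	"fmt"
	"log/slog"
)

type GpuInfo struct {
	Library string // e.g. "cuda", "metal", "rocm", "cpu"
	Major   int    // CUDA compute capability, major version
}

// flashAttnSupported mirrors the condition in NewLlamaServer: flash attention
// is only enabled on CUDA devices with compute capability >= 7 or on Metal.
func flashAttnSupported(gpus []GpuInfo) bool {
	if len(gpus) == 0 {
		return false
	}
	g := gpus[0]
	return (g.Library == "cuda" && g.Major >= 7) || g.Library == "metal"
}

func main() {
	params := []string{"--model", "/models/example.gguf"} // placeholder args

	gpus := []GpuInfo{{Library: "cuda", Major: 8}} // e.g. an Ampere-class GPU
	if flashAttnSupported(gpus) {
		params = append(params, "--flash-attn")
	} else {
		slog.Warn("flash attention is not supported on the current hardware; leaving it disabled")
	}

	fmt.Println(params)
}
```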
