
Commit

feat: add flash_attn support
sammcj committed May 5, 2024
1 parent 5631fe7 commit 34136d4
Showing 5 changed files with 18 additions and 19 deletions.
2 changes: 2 additions & 0 deletions api/types.go
@@ -151,6 +151,7 @@ type Runner struct {
UseMMap bool `json:"use_mmap,omitempty"`
UseMLock bool `json:"use_mlock,omitempty"`
NumThread int `json:"num_thread,omitempty"`
+FlashAttn bool `json:"flash_attn,omitempty"`

// Unused: RopeFrequencyBase is ignored. Instead the value in the model will be used
RopeFrequencyBase float32 `json:"rope_frequency_base,omitempty"`
@@ -428,6 +429,7 @@ func DefaultOptions() Options {
UseMLock: false,
UseMMap: true,
UseNUMA: false,
+FlashAttn: false, // for CPU only compatibility
},
}
}
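
For reference, the field added above serializes as a `flash_attn` key in the request options JSON. The sketch below uses a local stand-in struct (not the real `api.Options` type) to show what a client would send; because the default is `false` and the tag is `omitempty`, the key is simply dropped when the option is unset.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// flashAttnOpt is a stand-in mirroring only the new Runner field above;
// the real definition lives in api/types.go.
type flashAttnOpt struct {
	FlashAttn bool `json:"flash_attn,omitempty"`
}

func main() {
	b, _ := json.Marshal(flashAttnOpt{FlashAttn: true})
	fmt.Println(string(b)) // prints {"flash_attn":true}

	b, _ = json.Marshal(flashAttnOpt{})
	fmt.Println(string(b)) // prints {}: an unset option adds nothing to the request
}
```
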
11 changes: 0 additions & 11 deletions docs/faq.md
@@ -232,14 +232,3 @@ curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": 0
Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.

If you wish to override the `OLLAMA_KEEP_ALIVE` setting, use the `keep_alive` API parameter with the `/api/generate` or `/api/chat` API endpoints.

-## Passing additional parameters to llama.cpp
-
-You can pass additional parameters to the `llama.cpp` binary by setting the `OLLAMA_LLAMA_EXTRA_ARGS` environment variable. This can be useful for debugging or performance testing.
-
-Example - enabling flash_attn and continuous batching:
-
-```shell
-export OLLAMA_LLAMA_EXTRA_ARGS="-fa,-cb"
-ollama serve
-```
14 changes: 11 additions & 3 deletions llm/ext_server/server.cpp
@@ -2107,6 +2107,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
printf(" --flash-attn-disable disable Flash Attention\n");
printf(" -spf FNAME, --system-prompt-file FNAME\n");
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
printf(" -ctk TYPE, --cache-type-k TYPE\n");
@@ -2508,7 +2509,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
{
params.use_mmap = false;
}
else if (arg == "--numa") {
else if (arg == "--numa")
{
if (++i >= argc) {
invalid_param = true;
break;
@@ -2532,6 +2534,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
{
params.flash_attn = true;
}
else if (arg == "--flash-attn-disable")
{
params.flash_attn = false;
}
else if (arg == "-np" || arg == "--parallel")
{
if (++i >= argc)
@@ -2540,15 +2546,17 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
break;
}
params.n_parallel = std::stoi(argv[i]);
} else if (arg == "-n" || arg == "--n-predict")
}
else if (arg == "-n" || arg == "--n-predict")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.n_predict = std::stoi(argv[i]);
} else if (arg == "-spf" || arg == "--system-prompt-file")
}
else if (arg == "-spf" || arg == "--system-prompt-file")
{
if (++i >= argc)
{
2 changes: 1 addition & 1 deletion llm/llama.cpp
Submodule llama.cpp updated 94 files
+15 −1 .flake8
+13 −1 .github/workflows/bench.yml
+1 −1 .github/workflows/close-issue.yml
+1 −2 .github/workflows/python-lint.yml
+3 −2 .pre-commit-config.yaml
+1 −2 Makefile
+10 −36 README.md
+6 −2 ci/run.sh
+8 −1 common/common.cpp
+2 −1 common/common.h
+2 −2 common/log.h
+97 −75 convert-hf-to-gguf-update.py
+128 −151 convert-hf-to-gguf.py
+27 −24 convert-llama-ggml-to-gguf.py
+17 −15 convert-lora-to-ggml.py
+16 −12 convert-persimmon-to-gguf.py
+36 −24 convert.py
+17 −11 examples/batched-bench/batched-bench.cpp
+15 −4 examples/gguf-split/gguf-split.cpp
+7 −7 examples/gguf-split/tests.sh
+27 −3 examples/llama-bench/llama-bench.cpp
+1 −1 examples/main/main.cpp
+115 −3 examples/perplexity/README.md
+175 −55 examples/perplexity/perplexity.cpp
+1 −0 examples/server/bench/bench.py
+5 −1 examples/server/server.cpp
+56 −32 examples/server/tests/features/results.feature
+100 −48 examples/server/tests/features/steps/steps.py
+6 −0 ggml-cuda.cu
+62 −15 ggml-cuda/common.cuh
+944 −0 ggml-cuda/fattn.cu
+3 −0 ggml-cuda/fattn.cuh
+36 −10 ggml-cuda/softmax.cu
+7 −0 ggml-kompute.cpp
+376 −176 ggml-metal.m
+654 −18 ggml-metal.metal
+5 −1 ggml-sycl.cpp
+5 −0 ggml-vulkan.cpp
+361 −16 ggml.c
+20 −0 ggml.h
+9 −3 ggml_vk_generate_shaders.py
+10 −8 gguf-py/examples/reader.py
+1 −3 gguf-py/gguf/constants.py
+3 −1 gguf-py/gguf/gguf_reader.py
+4 −1 gguf-py/gguf/gguf_writer.py
+13 −29 gguf-py/gguf/vocab.py
+58 −36 gguf-py/scripts/gguf-convert-endian.py
+20 −9 gguf-py/scripts/gguf-dump.py
+18 −13 gguf-py/scripts/gguf-set-metadata.py
+387 −204 llama.cpp
+6 −3 llama.h
+4 −0 models/ggml-vocab-bert-bge.gguf.inp
+2 −0 models/ggml-vocab-bert-bge.gguf.out
+ models/ggml-vocab-command-r.gguf
+106 −0 models/ggml-vocab-command-r.gguf.inp
+43 −0 models/ggml-vocab-command-r.gguf.out
+4 −0 models/ggml-vocab-deepseek-coder.gguf.inp
+2 −0 models/ggml-vocab-deepseek-coder.gguf.out
+4 −0 models/ggml-vocab-deepseek-llm.gguf.inp
+2 −0 models/ggml-vocab-deepseek-llm.gguf.out
+4 −0 models/ggml-vocab-falcon.gguf.inp
+2 −0 models/ggml-vocab-falcon.gguf.out
+4 −0 models/ggml-vocab-gpt-2.gguf.inp
+2 −0 models/ggml-vocab-gpt-2.gguf.out
+4 −0 models/ggml-vocab-llama-bpe.gguf.inp
+2 −0 models/ggml-vocab-llama-bpe.gguf.out
+4 −0 models/ggml-vocab-llama-spm.gguf.inp
+2 −0 models/ggml-vocab-llama-spm.gguf.out
+4 −0 models/ggml-vocab-mpt.gguf.inp
+2 −0 models/ggml-vocab-mpt.gguf.out
+ models/ggml-vocab-phi-3.gguf
+4 −0 models/ggml-vocab-phi-3.gguf.inp
+2 −0 models/ggml-vocab-phi-3.gguf.out
+ models/ggml-vocab-refact.gguf
+106 −0 models/ggml-vocab-refact.gguf.inp
+43 −0 models/ggml-vocab-refact.gguf.out
+4 −0 models/ggml-vocab-starcoder.gguf.inp
+2 −0 models/ggml-vocab-starcoder.gguf.out
+1 −1 requirements/requirements-convert.txt
+16 −24 scripts/compare-llama-bench.py
+66 −0 scripts/gen-unicode-data.py
+10 −4 scripts/run-with-preset.py
+8 −5 scripts/verify-checksum-models.py
+5 −2 tests/CMakeLists.txt
+48 −4 tests/test-backend-ops.cpp
+0 −117 tests/test-tokenizer-0-bpe.py
+0 −114 tests/test-tokenizer-0-spm.py
+30 −9 tests/test-tokenizer-0.cpp
+46 −0 tests/test-tokenizer-0.py
+34 −0 tests/test-tokenizer-0.sh
+458 −416 unicode-data.cpp
+1 −1 unicode-data.h
+11 −11 unicode.cpp
+1 −1 unicode.h
8 changes: 4 additions & 4 deletions llm/server.go
@@ -193,6 +193,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--numa")
}

+if gpus[0].Library == "cuda" || gpus[0].Library == "metal" || opts.FlashAttn {
+params = append(params, "--flash-attn")
+}

// "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
numParallel := 1
if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
@@ -205,10 +209,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))

-if other_args := os.Getenv("OLLAMA_LLAMA_EXTRA_ARGS"); other_args != "" {
-params = append(params, strings.Split(other_args, ",")...)
-}

for i := 0; i < len(servers); i++ {
dir := availableServers[servers[i]]
if dir == "" {
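
Taken together with the API change, the gating added above means the `--flash-attn` flag is passed to the llama.cpp server automatically for CUDA and Metal backends, and can be forced on elsewhere by setting the `flash_attn` option. A minimal sketch of that condition, using a hypothetical helper name (the real check operates on `gpus[0].Library` inline in `NewLlamaServer`):

```go
package main

import "fmt"

// shouldEnableFlashAttn is a hypothetical helper mirroring the condition in
// NewLlamaServer: Flash Attention is enabled for the "cuda" and "metal" GPU
// libraries, or whenever the user sets the flash_attn option explicitly.
func shouldEnableFlashAttn(gpuLibrary string, flashAttnOpt bool) bool {
	return gpuLibrary == "cuda" || gpuLibrary == "metal" || flashAttnOpt
}

func main() {
	fmt.Println(shouldEnableFlashAttn("cuda", false)) // true: on by default for CUDA
	fmt.Println(shouldEnableFlashAttn("cpu", false))  // false: off for CPU-only runs
	fmt.Println(shouldEnableFlashAttn("cpu", true))   // true: user opted in via flash_attn
}
```

The CUDA and Metal defaults rely on the Flash Attention support pulled in by the llama.cpp submodule update above (for example `ggml-cuda/fattn.cu` and the Metal shader changes).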
