Commit 59af4df

**Add llama-cli example to llama.cpp snippets (#1889)**

Add `llama-cli` command alongside `llama-server` for all installation methods.

## Changes

- Add `llama-cli` snippet for direct terminal inference
- Update the `llama-server` comment to mention the web UI
- Build from source now compiles both targets

Parent commit: 3f8ca9a
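
At the API surface, the change means each llama.cpp `LocalAppSnippet` now carries two command blocks in its `content` field instead of one. A minimal sketch of the resulting value for a conversational GGUF model, with the strings copied verbatim from the updated test expectations below:

```ts
// Sketch: the llama.cpp snippet `content` after this commit.
// Both strings are taken from the spec expectations in this diff.
const content: string[] = [
	"# Start a local OpenAI-compatible server with a web UI:\n" +
		"llama-server -hf bartowski/Llama-3.2-3B-Instruct-GGUF:{{QUANT_TAG}}",
	"# Run inference directly in the terminal:\n" +
		"llama-cli -hf bartowski/Llama-3.2-3B-Instruct-GGUF:{{QUANT_TAG}}",
];
```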

2 files changed: +26 −11 lines

**packages/tasks/src/local-apps.spec.ts** (12 additions, 4 deletions)

```diff
@@ -12,8 +12,12 @@ describe("local-apps", () => {
 		};
 		const snippet = snippetFunc(model);
 
-		expect(snippet[0].content).toEqual(`# Load and run the model:
-llama-server -hf bartowski/Llama-3.2-3B-Instruct-GGUF:{{QUANT_TAG}}`);
+		expect(snippet[0].content).toEqual([
+			`# Start a local OpenAI-compatible server with a web UI:
+llama-server -hf bartowski/Llama-3.2-3B-Instruct-GGUF:{{QUANT_TAG}}`,
+			`# Run inference directly in the terminal:
+llama-cli -hf bartowski/Llama-3.2-3B-Instruct-GGUF:{{QUANT_TAG}}`,
+		]);
 	});
 
 	it("llama.cpp non-conversational", async () => {
@@ -25,8 +29,12 @@ llama-server -hf bartowski/Llama-3.2-3B-Instruct-GGUF:{{QUANT_TAG}}`);
 		};
 		const snippet = snippetFunc(model);
 
-		expect(snippet[0].content).toEqual(`# Load and run the model:
-llama-server -hf mlabonne/gemma-2b-GGUF:{{QUANT_TAG}}`);
+		expect(snippet[0].content).toEqual([
+			`# Start a local OpenAI-compatible server with a web UI:
+llama-server -hf mlabonne/gemma-2b-GGUF:{{QUANT_TAG}}`,
+			`# Run inference directly in the terminal:
+llama-cli -hf mlabonne/gemma-2b-GGUF:{{QUANT_TAG}}`,
+		]);
 	});
 
 	it("vLLM conversational llm", async () => {
```

**packages/tasks/src/local-apps.ts** (14 additions, 7 deletions)

```diff
@@ -110,20 +110,27 @@ function getQuantTag(filepath?: string): string {
 }
 
 const snippetLlamacpp = (model: ModelData, filepath?: string): LocalAppSnippet[] => {
-	const command = (binary: string) => {
-		const snippet = ["# Load and run the model:", `${binary} -hf ${model.id}${getQuantTag(filepath)}`];
+	const serverCommand = (binary: string) => {
+		const snippet = [
+			"# Start a local OpenAI-compatible server with a web UI:",
+			`${binary} -hf ${model.id}${getQuantTag(filepath)}`,
+		];
+		return snippet.join("\n");
+	};
+	const cliCommand = (binary: string) => {
+		const snippet = ["# Run inference directly in the terminal:", `${binary} -hf ${model.id}${getQuantTag(filepath)}`];
 		return snippet.join("\n");
 	};
 	return [
 		{
 			title: "Install from brew",
 			setup: "brew install llama.cpp",
-			content: command("llama-server"),
+			content: [serverCommand("llama-server"), cliCommand("llama-cli")],
 		},
 		{
 			title: "Install from WinGet (Windows)",
 			setup: "winget install llama.cpp",
-			content: command("llama-server"),
+			content: [serverCommand("llama-server"), cliCommand("llama-cli")],
 		},
 		{
 			title: "Use pre-built binary",
@@ -132,17 +139,17 @@ const snippetLlamacpp = (model: ModelData, filepath?: string): LocalAppSnippet[]
 				"# Download pre-built binary from:",
 				"# https://github.com/ggerganov/llama.cpp/releases",
 			].join("\n"),
-			content: command("./llama-server"),
+			content: [serverCommand("./llama-server"), cliCommand("./llama-cli")],
 		},
 		{
 			title: "Build from source code",
 			setup: [
 				"git clone https://github.com/ggerganov/llama.cpp.git",
 				"cd llama.cpp",
 				"cmake -B build",
-				"cmake --build build -j --target llama-server",
+				"cmake --build build -j --target llama-server llama-cli",
 			].join("\n"),
-			content: command("./build/bin/llama-server"),
+			content: [serverCommand("./build/bin/llama-server"), cliCommand("./build/bin/llama-cli")],
 		},
 	];
 };
```
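
One practical consequence for consumers: `content` on a `LocalAppSnippet` can now be an array of command blocks rather than a single string, so rendering code has to normalize both shapes. A hedged helper sketch (this function is hypothetical, not part of the commit; it only assumes `content` is typed `string | string[]`):

```ts
import type { LocalAppSnippet } from "./local-apps";

// Hypothetical helper: normalize `content` into an array of command
// blocks so each one can be rendered as its own copyable code box.
function contentBlocks(snippet: LocalAppSnippet): string[] {
	return Array.isArray(snippet.content) ? snippet.content : [snippet.content];
}

// Usage sketch: print the server and CLI blocks separated by a blank line.
// contentBlocks(someSnippet).forEach((block) => console.log(block + "\n"));
```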
