#!/bin/bash
#
# Helper script for deploying llama.cpp server with a single Bash command
#
# - Works on Linux and macOS
# - Supports: CPU, CUDA, Metal, OpenCL
# - Can run GGUF models hosted on HuggingFace
# - Can serve requests in parallel
# - Always builds the latest llama.cpp from GitHub
#
# Limitations
#
# - Chat templates are poorly supported (base models recommended)
# - Might be unstable!
#
# Usage:
#   ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
#
#   --port:       port number, default is 8888
#   --repo:       path to a repo containing GGUF model files
#   --wtype:      weights type (f16, q8_0, q4_0, q4_1), selected interactively if not specified
#   --backend:    cpu, cuda, metal, or opencl (default depends on the OS)
#   --gpu-id:     GPU id (used with the CUDA backend), default is 0
#   --n-parallel: number of parallel requests, default is 8
#   --n-kv:       KV cache size in tokens, default is 4096
#   --verbose:    verbose output
#
# Example:
#
#   bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
#
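# Or, with explicit options (the repo and values below are only illustrative):
#
#   ./server-llm.sh --port 8889 --repo https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF --backend cuda --n-parallel 4
#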

set -e

# required utils: curl, git, make
if ! command -v curl &> /dev/null; then
    printf "[-] curl not found\n"
    exit 1
fi
if ! command -v git &> /dev/null; then
    printf "[-] git not found\n"
    exit 1
fi
if ! command -v make &> /dev/null; then
    printf "[-] make not found\n"
    exit 1
fi

# parse arguments
port=8888
repo=""
wtype=""
backend="cpu"

# default backend: Metal on macOS, CUDA if nvcc is available, CPU otherwise
if [[ "$OSTYPE" == "darwin"* ]]; then
    backend="metal"
elif command -v nvcc &> /dev/null; then
    backend="cuda"
fi

gpu_id=0
n_parallel=8
n_kv=4096
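# note: the server splits the KV cache across the parallel slots, so each
# request gets roughly n_kv / n_parallel tokens of context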
verbose=0

function print_usage {
    printf "Usage:\n"
    printf "  ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n"
    printf "  --port:       port number, default is 8888\n"
    printf "  --repo:       path to a repo containing GGUF model files\n"
    printf "  --wtype:      weights type (f16, q8_0, q4_0, q4_1), selected interactively if not specified\n"
    printf "  --backend:    cpu, cuda, metal, or opencl (default depends on the OS)\n"
    printf "  --gpu-id:     GPU id (used with the CUDA backend), default is 0\n"
    printf "  --n-parallel: number of parallel requests, default is 8\n"
    printf "  --n-kv:       KV cache size in tokens, default is 4096\n"
    printf "  --verbose:    verbose output\n\n"
    printf "Example:\n\n"
    printf '  bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
}

while [[ $# -gt 0 ]]; do
    key="$1"
    case $key in
        --port)
            port="$2"
            shift
            shift
            ;;
        --repo)
            repo="$2"
            shift
            shift
            ;;
        --wtype)
            wtype="$2"
            shift
            shift
            ;;
        --backend)
            backend="$2"
            shift
            shift
            ;;
        --gpu-id)
            gpu_id="$2"
            shift
            shift
            ;;
        --n-parallel)
            n_parallel="$2"
            shift
            shift
            ;;
        --n-kv)
            n_kv="$2"
            shift
            shift
            ;;
        --verbose)
            verbose=1
            shift
            ;;
        --help)
            print_usage
            exit 0
            ;;
        *)
            printf "[-] Unknown argument: %s\n" "$key"
            print_usage
            exit 1
            ;;
    esac
done

# available weight types
wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")

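# wfiles[i] will hold the model file (if any) found in the repo for wtypes[i]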
wfiles=()
for wt in "${wtypes[@]}"; do
    wfiles+=("")
done

# sample repos
repos=(
    "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
    "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
    "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
    "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
    "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
    "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
    "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
)

printf "\n"
printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
printf "    Based on the options that follow, the script might download a model file\n"
printf "    from the internet, which can be a few GBs in size. The script will also\n"
printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n"
printf "\n"
printf "    Upon success, an HTTP server will be started and it will serve the selected\n"
printf "    model using llama.cpp for demonstration purposes.\n"
printf "\n"
printf "    Please note:\n"
printf "\n"
printf "    - All new data will be stored in the current folder\n"
printf "    - The server will be listening on all network interfaces\n"
printf "    - The server will run with default settings which are not always optimal\n"
printf "    - Do not judge the quality of a model based on the results from this script\n"
printf "    - Do not use this script to benchmark llama.cpp\n"
printf "    - Do not use this script in production\n"
printf "    - This script is only for demonstration purposes\n"
printf "\n"
printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n"
printf "\n"
printf "    Press Enter to continue ...\n\n"

read

if [[ -z "$repo" ]]; then
    printf "[+] No repo provided from the command line\n"
    printf "    Please select a number from the list below or enter a URL:\n\n"

    is=0
    for r in "${repos[@]}"; do
        printf "    %2d) %s\n" $is "$r"
        is=$((is+1))
    done

    # keep asking until a valid sample-repo index or a URL is provided
    while [[ -z "$repo" ]]; do
        printf "\n    Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
        read -p "[+] Select repo: " repo

        # check if the input is a number
        if [[ "$repo" =~ ^[0-9]+$ ]]; then
            if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
                repo="${repos[$repo]}"
            else
                printf "[-] Invalid repo index: %s\n" "$repo"
                repo=""
            fi
        elif [[ "$repo" =~ ^https?:// ]]; then
            : # the input is already a URL - use it as-is
        else
            printf "[-] Invalid repo: %s\n" "$repo"
            repo=""
        fi
    done
fi

# remove the trailing /tree/main suffix, if present
repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')

printf "[+] Checking for GGUF model files in %s\n" "$repo"

# find GGUF files in the source
# TODO: better logic
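# note: this scrapes the HTML of the repo's tree/main page, so it may break
#       if HuggingFace changes the page markup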
model_tree="${repo%/}/tree/main"
model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')

# list the GGUF model files found in the repo
printf "[+] Model files:\n\n"
for file in $model_files; do
    # determine the weight-type index (iw) by matching the uppercased filename against wtypes
    ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')

    iw=-1
    is=0
    for wt in "${wtypes[@]}"; do
        if [[ "$ufile" =~ "$wt" ]]; then
            iw=$is
            break
        fi
        is=$((is+1))
    done

    if [[ $iw -eq -1 ]]; then
        continue
    fi

    wfiles[$iw]="$file"

    have=" "
    if [[ -f "$file" ]]; then
        have="*"
    fi

    printf "    %2d) %s %s\n" $iw "$have" "$file"
done

# ask for a weight type until a valid, available one is selected
# (accepts an index from the list above or a type name such as q4_0)
wfile=""
while [[ -z "$wfile" ]]; do
    if [[ -z "$wtype" ]]; then
        printf "\n"
        read -p "[+] Select weight type: " wtype
    fi

    # map a type name (e.g. q4_0) to its index in wtypes
    uwtype=$(echo "$wtype" | tr '[:lower:]' '[:upper:]')
    for i in "${!wtypes[@]}"; do
        if [[ "${wtypes[$i]}" == "$uwtype" ]]; then
            wtype=$i
        fi
    done
    if [[ "$wtype" =~ ^[0-9]+$ ]]; then
        wfile="${wfiles[$wtype]}"
    fi

    if [[ -z "$wfile" ]]; then
        printf "[-] Invalid weight type: %s\n" "$wtype"
        wtype=""
    fi
done

printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"

url="${repo%/}/resolve/main/$wfile"

# check file - used to determine whether the model has already been downloaded
chk="$wfile.chk"

# check if we should download the file:
# - if $wfile does not exist
# - if $wfile exists but $chk does not exist
# - if $wfile exists and $chk exists but $wfile is newer than $chk
# TODO: better logic using git lfs info

do_download=0

if [[ ! -f "$wfile" ]]; then
    do_download=1
elif [[ ! -f "$chk" ]]; then
    do_download=1
elif [[ "$wfile" -nt "$chk" ]]; then
    do_download=1
fi

if [[ $do_download -eq 1 ]]; then
    printf "[+] Downloading weights from %s\n" "$url"

    # download the weights file
    curl -o "$wfile" -# -L "$url"

    # create a check file if successful
    if [[ $? -eq 0 ]]; then
        printf "[+] Creating check file %s\n" "$chk"
        touch "$chk"
    fi
else
    printf "[+] Using cached weights %s\n" "$wfile"
fi

# get latest llama.cpp and build

printf "[+] Downloading latest llama.cpp\n"

llama_cpp_dir="__llama_cpp_port_${port}__"
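# note: the port is embedded in the directory name so that multiple servers
# (one per port) can be deployed from the same working folder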

if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
    # the directory exists but does not contain the "__ggml_script__" marker,
    # so it was not created by this script - abort
    printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
    printf "[-] Please remove it and try again\n"
    exit 1
elif [[ -d "$llama_cpp_dir" ]]; then
    printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
    printf "[+] Using cached llama.cpp\n"

    cd "$llama_cpp_dir"
    git reset --hard
    git fetch
    git checkout origin/master

    cd ..
else
    printf "[+] Cloning llama.cpp\n"

    git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
fi

# mark that the directory was created by this script
touch "$llama_cpp_dir/__ggml_script__"

if [[ $verbose -eq 1 ]]; then
    set -x
fi

# build
cd "$llama_cpp_dir"

make clean

log="--silent"
if [[ $verbose -eq 1 ]]; then
    log=""
fi
if [[ "$backend" == "cuda" ]]; then
    printf "[+] Building with CUDA backend\n"
    LLAMA_CUBLAS=1 make -j server $log
elif [[ "$backend" == "cpu" ]]; then
    printf "[+] Building with CPU backend\n"
    make -j server $log
elif [[ "$backend" == "metal" ]]; then
    printf "[+] Building with Metal backend\n"
    make -j server $log
elif [[ "$backend" == "opencl" ]]; then
    printf "[+] Building with OpenCL backend\n"
    LLAMA_CLBLAST=1 make -j server $log
else
    printf "[-] Unknown backend: %s\n" "$backend"
    exit 1
fi

# run the server

printf "[+] Running server\n"

args=""
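# -ngl sets how many model layers are offloaded to the GPU
# (999 is simply "more layers than any model has", i.e. offload everything)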
if [[ "$backend" == "cuda" ]]; then
    export CUDA_VISIBLE_DEVICES=$gpu_id
    args="-ngl 999"
elif [[ "$backend" == "cpu" ]]; then
    args="-ngl 0"
elif [[ "$backend" == "metal" ]]; then
    args="-ngl 999"
elif [[ "$backend" == "opencl" ]]; then
    args="-ngl 999"
else
    printf "[-] Unknown backend: %s\n" "$backend"
    exit 1
fi

if [[ $verbose -eq 1 ]]; then
    args="$args --verbose"
fi

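# once the server is running, it can be tested from another shell, e.g.
# (assuming the default port 8888):
#
#   curl http://127.0.0.1:8888/completion -d '{"prompt": "Hello, my name is", "n_predict": 32}'
#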
./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c "$n_kv" -np "$n_parallel" $args

exit 0