This change adds support for Intel Gaudi HPUs. #7275

Open · wants to merge 6 commits into base: main
71 changes: 64 additions & 7 deletions README.md
@@ -405,13 +405,15 @@ huggingface-cli login
| peft | 0.11.1 | 0.12.0 |
| trl | 0.8.6 | 0.9.6 |

| Optional | Minimum | Recommend |
| -------------- | ------- | --------- |
| CUDA | 11.6 | 12.2 |
| deepspeed | 0.10.0 | 0.16.4 |
| bitsandbytes | 0.39.0 | 0.43.1 |
| vllm | 0.4.3 | 0.7.3 |
| flash-attn | 2.3.0 | 2.7.2 |
| habana-* | 1.19.0 | 1.20.0 |
| optimum-habana | 1.15 | 1.16.0 |

### Hardware Requirement

@@ -543,6 +545,16 @@ pip install .

</details>

<details><summary>For Gaudi HPU users</summary>

Please follow the [Intel Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) to set up your environment, including the `$PYTHON` environment variable. The guide walks you through configuring your system to run on Gaudi HPU devices.
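
As a quick sanity check after installation, something like the following should succeed. This is a minimal sketch, assuming the Gaudi PyTorch bridge from the guide is installed; the interpreter path is illustrative, so substitute the one your Gaudi software release requires.

```bash
# Illustrative path: point $PYTHON at the interpreter required by your Gaudi software release.
export PYTHON=/usr/bin/python3.10

# Verify that the Habana PyTorch bridge is importable and can see the HPUs.
$PYTHON -c "import habana_frameworks.torch.hpu as hthpu; print(hthpu.is_available(), hthpu.device_count())"
```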

To run `llamafactory-cli`, use the Docker setup described in the [Docker Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Docker_Installation.html) and follow its instructions to install LLaMA-Factory within the Docker environment. Compatible `optimum-habana` releases are listed above. Most LLaMA-Factory features and optimizations, including inference, training (SFT, DPO, etc.), LoRA fine-tuning, and distributed training with DeepSpeed and DDP, are supported on Gaudi HPU devices.

The `examples` directory contains various Gaudi-specific YAML and JSON configuration files, including DeepSpeed setups, covering both training and inference. These files transparently leverage Gaudi's performance enhancements and can be identified by `gaudi` in their filenames. When creating custom configurations, start from or modify these files to get better performance on Gaudi HPU devices.
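
For instance, once inside the container, a Gaudi LoRA fine-tuning run could be launched roughly as follows. The YAML path is illustrative; use whichever `gaudi` file under `examples/` matches your task.

```bash
# Illustrative file name: substitute any Gaudi-specific YAML under examples/ that matches your task.
llamafactory-cli train examples/train_lora/llama3_lora_sft_gaudi.yaml
```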

</details>

### Data Preparation

Please refer to [data/README.md](data/README.md) for details on the format of dataset files. You can either use datasets from the HuggingFace / ModelScope / Modelers hubs or load datasets from local disk.
@@ -583,6 +595,14 @@ docker compose up -d
docker compose exec llamafactory bash
```

For HPU users:

```bash
cd docker/docker-hpu/
docker compose up -d
docker compose exec llamafactory bash
```

For Ascend NPU users:

```bash
@@ -627,6 +647,43 @@ docker run -dit --gpus=all \
docker exec -it llamafactory bash
```

For HPU users:

```bash
docker build -f ./docker/docker-hpu/Dockerfile \
--build-arg INSTALL_BNB=false \
--build-arg INSTALL_VLLM=false \
--build-arg INSTALL_DEEPSPEED=false \
--build-arg INSTALL_FLASHATTN=false \
--build-arg PIP_INDEX=https://pypi.org/simple \
--build-arg http_proxy=${http_proxy} \
--build-arg https_proxy=${https_proxy} \
--build-arg ftp_proxy=${ftp_proxy} \
--build-arg no_proxy=${no_proxy:-localhost} \
-t llamafactory:latest .

docker run -dit --runtime=habana \
--cap-add=sys_nice --cap-drop=net_raw --ipc=host \
-e http_proxy=${http_proxy} \
-e https_proxy=${https_proxy} \
-e ftp_proxy=${ftp_proxy} \
-e no_proxy=${no_proxy:-localhost} \
-e HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES:-all} \
-e OMPI_MCA_btl_vader_single_copy_mechanism=${OMPI_MCA_btl_vader_single_copy_mechanism:-none} \
-v ./hf_cache:/root/.cache/huggingface \
-v ./ms_cache:/root/.cache/modelscope \
-v ./om_cache:/root/.cache/openmind \
-v ./data:/app/data \
-v ./output:/app/output \
-p 7860:7860 \
-p 8000:8000 \
--shm-size 16G \
--name llamafactory \
llamafactory:latest

docker exec -it llamafactory bash
```
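
To confirm the container can actually see the Gaudi devices, you can run Habana's `hl-smi` utility inside it (assuming the base image provides it, as the official Gaudi PyTorch images do):

```bash
# Optional sanity check: hl-smi ships with the Gaudi driver/runtime stack and lists the HPUs
# visible to the container.
docker exec -it llamafactory hl-smi
```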

For Ascend NPU users:

```bash
71 changes: 64 additions & 7 deletions README_zh.md
@@ -407,13 +407,15 @@ huggingface-cli login
| peft | 0.11.1 | 0.12.0 |
| trl | 0.8.6 | 0.9.6 |

| Optional       | Minimum | Recommend |
| -------------- | ------- | --------- |
| CUDA | 11.6 | 12.2 |
| deepspeed | 0.10.0 | 0.16.4 |
| bitsandbytes | 0.39.0 | 0.43.1 |
| vllm | 0.4.3 | 0.7.3 |
| flash-attn | 2.3.0 | 2.7.2 |
| habana-* | 1.19.0 | 1.20.0 |
| optimum-habana | 1.15 | 1.16.0 |

### Hardware Requirement

@@ -546,6 +548,16 @@ pip install .

</details>

<details><summary>For Gaudi HPU users</summary>

Please follow the [Intel Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) to set up your environment, including the `$PYTHON` environment variable. The guide walks you through configuring your system to run on Gaudi HPU devices.

To run `llamafactory-cli`, use the Docker setup described in the [Docker Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Docker_Installation.html) and follow its instructions to install LLaMA-Factory within the Docker environment. Compatible `optimum-habana` releases are listed above. Most LLaMA-Factory features and optimizations, including inference, training (SFT, DPO, etc.), LoRA fine-tuning, and distributed training with DeepSpeed and DDP, are supported on Gaudi HPU devices.

The `examples` directory contains various Gaudi-specific YAML and JSON configuration files, including DeepSpeed setups, covering both training and inference. These files transparently leverage Gaudi's performance enhancements and can be identified by `gaudi` in their filenames. When creating custom configurations, start from or modify these files to get better performance on Gaudi HPU devices.

</details>

### Data Preparation

Please refer to [data/README_zh.md](data/README_zh.md) for details on the format of dataset files. You can either use datasets from the HuggingFace / ModelScope / Modelers hubs or load datasets from local disk.
@@ -586,6 +598,14 @@ docker compose up -d
docker compose exec llamafactory bash
```

For HPU users:

```bash
cd docker/docker-hpu/
docker compose up -d
docker compose exec llamafactory bash
```

For Ascend NPU users:

```bash
@@ -630,6 +650,43 @@ docker run -dit --gpus=all \
docker exec -it llamafactory bash
```

For HPU users:

```bash
docker build -f ./docker/docker-hpu/Dockerfile \
--build-arg INSTALL_BNB=false \
--build-arg INSTALL_VLLM=false \
--build-arg INSTALL_DEEPSPEED=false \
--build-arg INSTALL_FLASHATTN=false \
--build-arg PIP_INDEX=https://pypi.org/simple \
--build-arg http_proxy=${http_proxy} \
--build-arg https_proxy=${https_proxy} \
--build-arg ftp_proxy=${ftp_proxy} \
--build-arg no_proxy=${no_proxy:-localhost} \
-t llamafactory:latest .

docker run -dit --runtime=habana \
--cap-add=sys_nice --cap-drop=net_raw --ipc=host \
-e http_proxy=${http_proxy} \
-e https_proxy=${https_proxy} \
-e ftp_proxy=${ftp_proxy} \
-e no_proxy=${no_proxy:-localhost} \
-e HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES:-all} \
-e OMPI_MCA_btl_vader_single_copy_mechanism=${OMPI_MCA_btl_vader_single_copy_mechanism:-none} \
-v ./hf_cache:/root/.cache/huggingface \
-v ./ms_cache:/root/.cache/modelscope \
-v ./om_cache:/root/.cache/openmind \
-v ./data:/app/data \
-v ./output:/app/output \
-p 7860:7860 \
-p 8000:8000 \
--shm-size 16G \
--name llamafactory \
llamafactory:latest

docker exec -it llamafactory bash
```

For Ascend NPU users:

```bash
3 changes: 3 additions & 0 deletions data/dataset_info.json
@@ -5,6 +5,9 @@
"alpaca_en_demo": {
"file_name": "alpaca_en_demo.json"
},
"tatsu_lab_alpaca": {
"hf_hub_url": "tatsu-lab/alpaca"
},
"alpaca_zh_demo": {
"file_name": "alpaca_zh_demo.json"
},
100 changes: 100 additions & 0 deletions docker/docker-hpu/Dockerfile
@@ -0,0 +1,100 @@
# Use the Gaudi v1.19.1 official image with PyTorch 2.5.1
FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest

# Define environments
ENV MAX_JOBS=4
ENV FLASH_ATTENTION_FORCE_BUILD=TRUE
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn

# Define installation arguments
ARG INSTALL_BNB=false
ARG INSTALL_VLLM=false
ARG INSTALL_DEEPSPEED=false
ARG INSTALL_FLASHATTN=false
ARG INSTALL_LIGER_KERNEL=false
ARG INSTALL_HQQ=false
ARG INSTALL_EETQ=false
ARG PIP_INDEX=https://pypi.org/simple
ARG http_proxy=
ARG https_proxy=
ARG ftp_proxy=
ARG no_proxy="localhost"

# Set the working directory
WORKDIR /app

# Set http proxy
RUN if [ -n "$HTTP_PROXY" ]; then \
echo "Configuring proxy..."; \
export http_proxy=$HTTP_PROXY; \
export https_proxy=$HTTP_PROXY; \
export no_proxy=${no_proxy:-localhost}; \
elif [ -n "$http_proxy" ]; then \
echo "Configuring proxy..."; \
export http_proxy=$http_proxy; \
export https_proxy=$http_proxy; \
export ftp_proxy=$ftp_proxy; \
export no_proxy=${no_proxy:-localhost}; \
fi

# Install the requirements
COPY requirements-gaudi.txt /app
RUN pip config set global.index-url "$PIP_INDEX" && \
pip config set global.extra-index-url "$PIP_INDEX" && \
python -m pip install --upgrade pip && \
if [ -n "$HTTP_PROXY" ]; then \
python -m pip install --proxy=$HTTP_PROXY -r requirements-gaudi.txt; \
elif [ -n "$http_proxy" ]; then \
python -m pip install --proxy=$http_proxy -r requirements-gaudi.txt; \
else \
python -m pip install -r requirements-gaudi.txt; \
fi

# Copy the rest of the application into the image
COPY . /app

# Install the LLaMA Factory
RUN EXTRA_PACKAGES="metrics"; \
if [ "$INSTALL_BNB" == "true" ]; then \
EXTRA_PACKAGES="${EXTRA_PACKAGES},bitsandbytes"; \
fi; \
if [ "$INSTALL_VLLM" == "true" ]; then \
EXTRA_PACKAGES="${EXTRA_PACKAGES},vllm"; \
fi; \
if [ "$INSTALL_DEEPSPEED" == "true" ]; then \
EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \
fi; \
if [ -n "$HTTP_PROXY" ]; then \
pip install --proxy=$HTTP_PROXY -e ".[$EXTRA_PACKAGES]"; \
elif [ -n "$http_proxy" ]; then \
pip install --proxy=$http_proxy -e ".[$EXTRA_PACKAGES]"; \
else \
pip install -e ".[$EXTRA_PACKAGES]"; \
fi

# Rebuild flash attention
RUN pip uninstall -y transformer-engine flash-attn && \
if [ "$INSTALL_FLASHATTN" == "true" ]; then \
pip uninstall -y ninja && \
if [ -n "$HTTP_PROXY" ]; then \
pip install --proxy=$HTTP_PROXY ninja && \
pip install --proxy=$HTTP_PROXY --no-cache-dir flash-attn --no-build-isolation; \
elif [ -n "$http_proxy" ]; then \
pip install --proxy=$http_proxy ninja && \
pip install --proxy=$http_proxy --no-cache-dir flash-attn --no-build-isolation; \
else \
pip install ninja && \
pip install --no-cache-dir flash-attn --no-build-isolation; \
fi; \
fi

# Set up volumes
VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ]

# Expose port 7860 for the LLaMA Board
ENV GRADIO_SERVER_PORT 7860
EXPOSE 7860

# Expose port 8000 for the API service
ENV API_PORT 8000
EXPOSE 8000
37 changes: 37 additions & 0 deletions docker/docker-hpu/docker-compose.yml
@@ -0,0 +1,37 @@
services:
  llamafactory:
    build:
      dockerfile: ./docker/docker-hpu/Dockerfile
      context: ../..
      args:
        INSTALL_BNB: false
        INSTALL_VLLM: false
        INSTALL_DEEPSPEED: false
        INSTALL_FLASHATTN: false
        INSTALL_LIGER_KERNEL: false
        INSTALL_HQQ: false
        INSTALL_EETQ: false
        PIP_INDEX: https://pypi.org/simple
    container_name: llamafactory
    environment:
      - HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES:-all}
      - OMPI_MCA_btl_vader_single_copy_mechanism=${OMPI_MCA_btl_vader_single_copy_mechanism:-none}
    volumes:
      - ../../hf_cache:/root/.cache/huggingface
      - ../../ms_cache:/root/.cache/modelscope
      - ../../om_cache:/root/.cache/openmind
      - ../../data:/app/data
      - ../../output:/app/output
    cap_add:
      - SYS_NICE
    cap_drop:
      - NET_RAW
    ports:
      - "7860:7860"
      - "8000:8000"
    ipc: host
    tty: true
    shm_size: '16gb'
    stdin_open: true
    command: bash
    restart: unless-stopped
26 changes: 26 additions & 0 deletions examples/deepspeed/ds_z1_config_gaudi.json
@@ -0,0 +1,26 @@
{
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "zero_allow_untested_optimizer": true,
  "bf16": {
    "enabled": "auto"
  },
  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": true,
    "allgather_bucket_size": 5e8,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 5e8,
    "contiguous_gradients": false,
    "round_robin_gradients": true
  },
  "timers": {
    "throughput": {
      "enabled": true,
      "synchronized": false
    }
  }
}