Skip to content

Commit 9ab94fa

Browse files
authored
[CI] Add distribute-stable (#72396)
1 parent e1d0215 commit 9ab94fa

8 files changed

+1062
-2
lines changed

.github/workflows/CI.yml

+5
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,8 @@ jobs:
4949
name: Linux-NPU
5050
uses: ./.github/workflows/_Linux-NPU.yml
5151
needs: cpu
52+
53+
# Calls the reusable Distribute-stable workflow after the `clone` job.
distribute:
  name: Distribute-stable
  needs: clone
  uses: ./.github/workflows/_Distribute-stable.yml
.github/workflows/_Distribute-stable.yml

+336
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,336 @@
1+
# Reusable workflow (triggered only through `workflow_call`).
name: Distribute-stable

on:
  workflow_call:

# Workflow-level environment, visible to every step of every job.
env:
  # Pull-request coordinates taken from the triggering event.
  PR_ID: ${{ github.event.pull_request.number }}
  COMMIT_ID: ${{ github.event.pull_request.head.sha }}
  BRANCH: ${{ github.event.pull_request.base.ref }}
  TASK: paddle-CI-${{ github.event.pull_request.number }}-distribute
  CI_name: "distribute"

  # Container image used by `docker run` in both jobs.
  dockerfile: "Dockerfile.cuda123_cudnn9_gcc122_ubuntu20"
  # Quoted so the image ID can never be re-typed as a number by a YAML parser.
  docker_image: "4b95dcca2173"

  # Paths as seen from inside the container (the workspace is mounted at /paddle).
  work_dir: "/paddle"
  PADDLE_ROOT: "/paddle"
  ci_scripts: "/paddle/ci"

  # Hosts that must bypass the proxy.
  no_proxy: "bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn,paddlepaddle.org.cn"

defaults:
  run:
    shell: bash
22+
23+
jobs:
24+
build:
25+
name: Build
26+
runs-on:
27+
group: GZ_BD-CPU
28+
29+
steps:
30+
# Starts the detached build container; every variable below is forwarded with `-e`.
- name: Check docker image and run container
  env:
    # Build feature switches.
    WITH_GPU: "ON"
    WITH_AVX: "ON"
    WITH_MKL: "OFF"
    WITH_PYTHON: "ON"
    WITH_DISTRIBUTE: "ON"
    WITH_PSCORE: "ON"
    WITH_PSLIB: "OFF"
    WITH_HETERPS: "ON"
    WITH_TESTING: "ON"
    WITH_INFERENCE_API_TEST: "OFF"
    WITH_TENSORRT: "OFF"
    WITH_SHARED_PHI: "ON"
    WITH_CUDNN_FRONTEND: "ON"
    # Toolchain and target selection.
    GENERATOR: "Ninja"
    PY_VERSION: "3.10"
    PADDLE_VERSION: "0.0.0"
    CUDA_ARCH_NAME: "Ampere"
    CUDA_VISIBLE_DEVICES: "0,1"
    # Runtime flags and test settings.
    FLAGS_fraction_of_gpu_memory_to_use: "0.15"
    FLAGS_enable_cudnn_frontend: "1"
    CTEST_OUTPUT_ON_FAILURE: "1"
    CTEST_PARALLEL_LEVEL: "4"
    COVERALLS_UPLOAD: "ON"
    GIT_PR_ID: ${{ github.event.pull_request.number }}
    # Cache and install locations (under the mounted /root/.cache and /root/.ccache).
    GPUBOX_DEMO_INSTALL_DIR: "/root/.cache/build"
    INFERENCE_DEMO_INSTALL_DIR: "/root/.cache/python35"
    CACHE_DIR: "/root/.cache/build"
    CCACHE_DIR: "/root/.ccache/gpubox"
    CCACHE_MAXSIZE: "150G"
    CCACHE_LIMIT_MULTIPLE: "0.8"
    CCACHE_STATSLOG: "/paddle/build/.stats.log"
    CCACHE_SLOPPINESS: "clang_index_store,time_macros,include_file_mtime"
  run: |
    # Timestamped container name, exported so the following steps can address it.
    container_name=${TASK}-build-$(date +%Y%m%d-%H%M%S)
    echo "container_name=${container_name}" >> ${{ github.env }}
    # The workspace is mounted at /paddle; the directory three levels above it
    # is mounted at its own path so the scripts stored there stay reachable.
    docker run -d -t --name ${container_name} \
      -v "/home/data/cfs:/home/data/cfs" \
      -v "/home/data/cfs/.cache/:/root/.cache" \
      -v "/home/data/cfs/.ccache:/root/.ccache" \
      -v "/dev/shm:/dev/shm" \
      -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
      -v ${{ github.workspace }}:/paddle \
      -e BRANCH \
      -e PR_ID \
      -e COMMIT_ID \
      -e work_dir \
      -e PADDLE_ROOT \
      -e ci_scripts \
      -e CI_name \
      -e no_proxy \
      -e WITH_GPU \
      -e WITH_AVX \
      -e WITH_MKL \
      -e WITH_PYTHON \
      -e WITH_DISTRIBUTE \
      -e WITH_PSCORE \
      -e WITH_PSLIB \
      -e WITH_HETERPS \
      -e WITH_TESTING \
      -e WITH_INFERENCE_API_TEST \
      -e WITH_TENSORRT \
      -e WITH_SHARED_PHI \
      -e WITH_CUDNN_FRONTEND \
      -e GENERATOR \
      -e PY_VERSION \
      -e PADDLE_VERSION \
      -e CUDA_ARCH_NAME \
      -e CUDA_VISIBLE_DEVICES \
      -e FLAGS_fraction_of_gpu_memory_to_use \
      -e FLAGS_enable_cudnn_frontend \
      -e CTEST_OUTPUT_ON_FAILURE \
      -e CTEST_PARALLEL_LEVEL \
      -e COVERALLS_UPLOAD \
      -e GIT_PR_ID \
      -e GPUBOX_DEMO_INSTALL_DIR \
      -e INFERENCE_DEMO_INSTALL_DIR \
      -e CACHE_DIR \
      -e CCACHE_DIR \
      -e CCACHE_MAXSIZE \
      -e CCACHE_LIMIT_MULTIPLE \
      -e CCACHE_STATSLOG \
      -e CCACHE_SLOPPINESS \
      -w /paddle --network host ${docker_image}
115+
116+
# Fetches the PR source tarball into the container and merges the PR base branch into it.
- name: Download paddle.tar.gz and merge target branch
  env:
    # NOTE(review): this override is only visible to the runner-side shell; the
    # script below runs inside the container through `docker exec`, which does
    # not forward it. Confirm whether it is still needed.
    work_dir: ${{ github.workspace }}
  run: |
    docker exec -t ${{ env.container_name }} /bin/bash -c '
    mkdir -p /root/.cache/build
    mkdir -p /root/.ccache/gpubox
    # Start from an empty working directory (regular and dot files).
    rm -rf * .[^.]*
    # From here on any failing command aborts the step.
    set -e
    echo "Downloading Paddle.tar.gz"
    wget -q --no-proxy https://paddle-github-action.bj.bcebos.com/PR/Paddle/${PR_ID}/${COMMIT_ID}/Paddle.tar.gz --no-check-certificate
    echo "Extracting Paddle.tar.gz"
    tar xf Paddle.tar.gz --strip-components=1
    rm Paddle.tar.gz
    git remote add upstream https://github.com/PaddlePaddle/Paddle.git
    source ${{ github.workspace }}/../../../proxy
    git checkout test
    # Log the branch that is actually pulled; it is the PR base, not always develop.
    echo "Pull upstream $BRANCH"
    git pull upstream $BRANCH --no-edit
    '
136+
137+
# Runs the repository-local check-bypass action for the `distribute` workflow.
# NOTE(review): no later step visible in this job reads the outputs of
# `steps.check-bypass`; confirm whether the result is meant to gate anything.
- name: Check bypass
  id: check-bypass
  uses: ./.github/actions/check-bypass
  with:
    workflow-name: distribute
    github-token: ${{ secrets.GITHUB_TOKEN }}
143+
144+
# Looks for a prebuilt flashattn archive and records whether one has to be published later.
- name: Download flashattn cache
  run: |
    docker exec -t ${{ env.container_name }} /bin/bash -c '
    # Best effort: a missing cache archive must not fail the step.
    set +e
    # The cache key is the first field of the flashattn submodule status line, with every "-" removed.
    flashattn_version=$(git submodule status | grep flashattn | awk "{print \$1}" | sed "s#-##g")
    echo "flashattn_version=${flashattn_version}" >> ${{ github.env }}
    # When the download fails, flag that the packaging and upload steps should publish the libraries.
    # NOTE(review): this requests a .tar.gz object while the upload step publishes a .tar file - confirm the names are meant to differ.
    if ! wget -q --no-proxy https://paddle-github-action.bj.bcebos.com/PR/gpups/flashattn_cache/flashattn_libs_${flashattn_version}.tar.gz --no-check-certificate; then
      echo "flashattn_cached_package=true" >> ${{ github.env }}
    fi
    '
155+
156+
# Builds the wheel inside the container.
- name: Build
  run: |
    docker exec -t ${{ env.container_name }} /bin/bash -c '
    # Source the proxy script kept three levels above the workspace, then run the bdist_wheel build.
    . ${{ github.workspace }}/../../../proxy
    bash ${ci_scripts}/run_setup.sh bdist_wheel
    '
162+
163+
# Optionally bundles the flashattn libraries, prunes the build tree and archives /paddle.
- name: Packaging of products
  run: |
    docker exec -t ${{ env.container_name }} /bin/bash -c '
    # Only bundle the flashattn libraries when the cache download step flagged them as missing.
    if [ "${{ env.flashattn_cached_package }}" == "true" ]; then
      cd ${work_dir}/build/third_party/install/flashattn/lib
      mkdir flashattn_libs_${{ env.flashattn_version }} && cd flashattn_libs_${{ env.flashattn_version }}
      mkdir fa_libs && cp ../lib*.so fa_libs && tar -zcf fa_libs.tar ./fa_libs && rm -rf ./fa_libs
      md5sum fa_libs.tar |awk "{print \$1}" >MD5.txt
      cd .. && tar -zcf flashattn_libs_${{ env.flashattn_version }}.tar ./flashattn_libs_${{ env.flashattn_version }}
    fi
    cd ${work_dir}/build
    # Remove static libraries and object files in a single pass. Letting find
    # invoke rm avoids word-splitting of paths and the argument-length limit
    # that an unquoted command substitution can hit on a large build tree.
    find . \( -name "*.a" -o -name "*.o" \) -prune -exec rm -rf {} +
    # In third_party, keep dlpack, install, eigen3 and gflags untouched; elsewhere delete every regular file that is not a shared object.
    cd ${work_dir}/build/third_party && find `ls | grep -v "dlpack"| grep -v "install"|grep -v "eigen3"|grep -v "gflags" ` -type f ! -name "*.so" -a ! -name "libdnnl.so*" -delete
    cd ${work_dir}/..
    # The archive is zstd-compressed (pzstd) despite the .tar.gz name; the test job unpacks it with pzstd as well.
    tar --use-compress-program="pzstd -1" --warning=no-file-changed -cf Paddle.tar.gz paddle
    '
180+
181+
# Uploads the source archive, the wheel and (when flagged) the flashattn bundle to BOS.
- name: Upload product to bos
  env:
    home_path: ${{ github.workspace }}/..
    bos_file: ${{ github.workspace }}/../bos/BosClient.py
    paddle_whl: paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
  run: |
    docker exec -t ${{ env.container_name }} /bin/bash -c '
    set -x
    # NOTE(review): AK/SK are hard-coded here; if these are real credentials they belong in repository secrets.
    export AK=paddle
    export SK=paddle
    source ${{ github.workspace }}/../../../proxy
    echo "::group::Install bce-python-sdk"
    python -m pip install bce-python-sdk==0.8.74
    echo "::endgroup::"
    # Fetch the BOS client once; it lives next to the workspace and is reused when present.
    if [ ! -f "${{ env.bos_file }}" ]; then
      wget -q --no-proxy -O ${{ env.home_path }}/bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
      # -p: an existing directory without BosClient.py must not break the re-download.
      mkdir -p ${{ env.home_path }}/bos
      tar xf ${{ env.home_path }}/bos_new.tar.gz -C ${{ env.home_path }}/bos
    fi
    cd ..
    source ${{ github.workspace }}/../../../unproxy
    echo "Uploading Paddle.tar.gz to bos"
    python ${{ env.bos_file }} Paddle.tar.gz paddle-github-action/PR/gpups/${{ env.PR_ID }}/${{ env.COMMIT_ID }}
    echo "Uploading whl to bos"
    mv ${work_dir}/dist/${{ env.paddle_whl }} .
    python ${{ env.bos_file }} ${{ env.paddle_whl }} paddle-github-action/PR/gpups/${{ env.PR_ID }}/${{ env.COMMIT_ID }}
    if [ "${{ env.flashattn_cached_package }}" == "true" ]; then
      # flashattn_version is not a shell variable inside the container, so the
      # workflow expression is used here; the name matches the .tar file built
      # by the packaging step.
      echo "Uploading flashattn_libs_${{ env.flashattn_version }}.tar to bos"
      mv ${work_dir}/build/third_party/install/flashattn/lib/flashattn_libs_${{ env.flashattn_version }}.tar .
      python ${{ env.bos_file }} flashattn_libs_${{ env.flashattn_version }}.tar paddle-github-action/PR/gpups/flashattn_cache
    fi
    # Remove the uploaded artifacts, including the real flashattn archive name.
    rm -rf Paddle.tar.gz ${{ env.paddle_whl }} flashattn_libs_${{ env.flashattn_version }}.tar
    '
214+
215+
# Always runs: empties the container working directory, then stops and removes the container.
- name: Terminate and delete the container
  if: ${{ always() }}
  run: |
    # Cleanup is best effort; keep going when an individual command fails.
    set +e
    docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
    docker stop ${{ env.container_name }}
    docker rm ${{ env.container_name }}
222+
223+
test:
224+
name: Test
225+
needs: build
226+
runs-on:
227+
group: Distribute
228+
steps:
229+
# Starts the detached, privileged test container with the host NVIDIA libraries and devices mounted.
- name: Check docker image and run container
  env:
    # Build feature switches.
    WITH_GPU: "ON"
    WITH_AVX: "ON"
    WITH_DISTRIBUTE: "ON"
    WITH_HETERPS: "ON"
    WITH_TESTING: "ON"
    WITH_COVERAGE: "OFF"
    WITH_INCREMENTAL_COVERAGE: "OFF"
    WITH_UNITY_BUILD: "ON"
    WITH_ONNXRUNTIME: "OFF"
    WITH_CUDNN_FRONTEND: "ON"
    # Toolchain and target selection.
    CMAKE_BUILD_TYPE: "Release"
    PY_VERSION: "3.10"
    PADDLE_VERSION: "0.0.0"
    CUDA_ARCH_NAME: "Auto"
    # Pull-request identity.
    AGILE_COMPILE_BRANCH: ${{ github.event.pull_request.base.ref }}
    AGILE_REVISION: ${{ github.event.pull_request.head.sha }}
    GIT_PR_ID: ${{ github.event.pull_request.number }}
    # Runtime flags and test settings.
    FLAGS_fraction_of_gpu_memory_to_use: "0.15"
    PADDLE_FRACTION_GPU_MEMORY_TO_USE: "0.15"
    FLAGS_enable_cudnn_frontend: "1"
    CTEST_OUTPUT_ON_FAILURE: "1"
    CTEST_PARALLEL_LEVEL: "4"
    PRECISION_TEST: "OFF"
    COVERALLS_UPLOAD: "ON"
    # Cache locations.
    CACHE_DIR: "/root/.cache/build"
    CCACHE_DIR: "/root/.ccache/gpubox"
  run: |
    # Turn the host CUDA/NVIDIA libraries, device nodes and nvidia-smi into `-v` mount arguments.
    export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo "-v {}:{}") $(\ls /usr/lib64/libnvidia* | xargs -I{} echo "-v {}:{}")"
    export DEVICES="$(\ls -d /dev/nvidia* | xargs -I{} echo "-v {}:{}") $(\ls /dev/nvidia-caps/* | xargs -I{} echo "-v {}:{}")"
    export SMI="-v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi"
    # Timestamped container name, exported so the following steps can address it.
    container_name=${TASK}-test-$(date +%Y%m%d-%H%M%S)
    echo "container_name=${container_name}" >> ${{ github.env }}
    docker run --privileged -d -t --name ${container_name} ${CUDA_SO} ${DEVICES} ${SMI} --shm-size=32G \
      -v "/home/data/cfs:/home/data/cfs" \
      -v "/home/data/cfs/.cache/:/root/.cache" \
      -v "/home/data/cfs/.ccache:/root/.ccache" \
      -v "/ssd1/root:/root" \
      -v "/dev/shm:/dev/shm" \
      -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
      -v ${{ github.workspace }}:/paddle \
      -e BRANCH \
      -e PR_ID \
      -e COMMIT_ID \
      -e work_dir \
      -e PADDLE_ROOT \
      -e ci_scripts \
      -e CI_name \
      -e no_proxy \
      -e WITH_GPU \
      -e WITH_AVX \
      -e WITH_DISTRIBUTE \
      -e WITH_HETERPS \
      -e WITH_TESTING \
      -e WITH_COVERAGE \
      -e WITH_INCREMENTAL_COVERAGE \
      -e WITH_UNITY_BUILD \
      -e WITH_ONNXRUNTIME \
      -e WITH_CUDNN_FRONTEND \
      -e CMAKE_BUILD_TYPE \
      -e PY_VERSION \
      -e PADDLE_VERSION \
      -e CUDA_ARCH_NAME \
      -e AGILE_COMPILE_BRANCH \
      -e AGILE_REVISION \
      -e GIT_PR_ID \
      -e FLAGS_fraction_of_gpu_memory_to_use \
      -e PADDLE_FRACTION_GPU_MEMORY_TO_USE \
      -e FLAGS_enable_cudnn_frontend \
      -e CTEST_OUTPUT_ON_FAILURE \
      -e CTEST_PARALLEL_LEVEL \
      -e PRECISION_TEST \
      -e COVERALLS_UPLOAD \
      -e CACHE_DIR \
      -e CCACHE_DIR \
      -w /paddle --network host ${docker_image}
306+
307+
# Fetches the archive published by the build job and merges the PR base branch into it.
- name: Download paddle.tar.gz and merge target branch
  run: |
    docker exec -t ${{ env.container_name }} /bin/bash -c '
    # Start from an empty working directory (regular and dot files).
    rm -rf * .[^.]*
    # Fail fast, as the build job does: a failed download or extraction should
    # stop here instead of surfacing later as an unrelated git error.
    set -e
    echo "Downloading Paddle.tar.gz"
    wget -q --no-proxy https://paddle-github-action.bj.bcebos.com/PR/gpups/${{ env.PR_ID }}/${{ env.COMMIT_ID }}/Paddle.tar.gz --no-check-certificate
    echo "Extracting Paddle.tar.gz"
    # The archive is zstd-compressed by the build job despite its .tar.gz name.
    tar --use-compress-program="pzstd" -xf Paddle.tar.gz --strip-components=1
    rm Paddle.tar.gz
    git checkout test
    echo "Pull upstream $BRANCH"
    source ${{ github.workspace }}/../../../proxy
    git pull upstream $BRANCH --no-edit
    '
322+
323+
# Runs the distribute test script inside the container.
- name: Test
  run: |
    docker exec -t ${{ env.container_name }} /bin/bash -c '
    # Source the proxy script kept three levels above the workspace, then run the tests.
    . ${{ github.workspace }}/../../../proxy
    bash ${ci_scripts}/distribute_test.sh
    '
329+
330+
# Always runs: empties the container working directory, then stops and removes the container.
- name: Terminate and delete the container
  if: ${{ always() }}
  run: |
    # Cleanup is best effort; keep going when an individual command fails.
    set +e
    docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
    docker stop ${{ env.container_name }}
    docker rm ${{ env.container_name }}

0 commit comments

Comments
 (0)