# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. # name: Build and test on: workflow_call: inputs: java: required: false type: string default: 17 branch: description: Branch to run the build against required: false type: string # Change 'master' to 'branch-4.0' in branch-4.0 branch after cutting it. default: master hadoop: description: Hadoop version to run with. HADOOP_PROFILE environment variable should accept it. required: false type: string default: hadoop3 envs: description: Additional environment variables to set when running the tests. Should be in JSON format. required: false type: string default: '{"PYSPARK_IMAGE_TO_TEST": "python-311", "PYTHON_TO_TEST": "python3.11"}' jobs: description: >- Jobs to run, and should be in JSON format. The values should be matched with the job's key defined in this file, e.g., build. See precondition job below. required: false type: string default: '' secrets: codecov_token: description: The upload token of codecov. required: false jobs: precondition: name: Check changes runs-on: ubuntu-latest env: GITHUB_PREV_SHA: ${{ github.event.before }} outputs: required: ${{ steps.set-outputs.outputs.required }} image_url: ${{ steps.infra-image-outputs.outputs.image_url }} image_docs_url: ${{ steps.infra-image-docs-outputs.outputs.image_docs_url }} image_docs_url_link: ${{ steps.infra-image-link.outputs.image_docs_url_link }} image_lint_url: ${{ steps.infra-image-lint-outputs.outputs.image_lint_url }} image_lint_url_link: ${{ steps.infra-image-link.outputs.image_lint_url_link }} image_sparkr_url: ${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }} image_sparkr_url_link: ${{ steps.infra-image-link.outputs.image_sparkr_url_link }} image_pyspark_url: ${{ steps.infra-image-pyspark-outputs.outputs.image_pyspark_url }} image_pyspark_url_link: ${{ steps.infra-image-link.outputs.image_pyspark_url_link }} steps: - name: Checkout Spark repository uses: actions/checkout@v4 with: fetch-depth: 0 repository: apache/spark ref: ${{ inputs.branch }} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - name: Check all modules id: set-outputs run: | if [ -z "${{ inputs.jobs }}" ]; then pyspark_modules=`cd dev && python -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark') and not m.name.startswith('pyspark-pandas')))"` pyspark_pandas_modules=`cd dev && python -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark-pandas')))"` pyspark=`./dev/is-changed.py -m $pyspark_modules` pandas=`./dev/is-changed.py -m $pyspark_pandas_modules` if [[ "${{ github.repository }}" != 'apache/spark' ]]; then yarn=`./dev/is-changed.py -m yarn` kubernetes=`./dev/is-changed.py -m kubernetes` sparkr=`./dev/is-changed.py -m sparkr` tpcds=`./dev/is-changed.py -m sql` docker=`./dev/is-changed.py -m docker-integration-tests` buf=true ui=true docs=true else pandas=false yarn=false kubernetes=false sparkr=false tpcds=false docker=false buf=false ui=false docs=false fi build=`./dev/is-changed.py -m "core,unsafe,kvstore,avro,utils,network-common,network-shuffle,repl,launcher,examples,sketch,variant,api,catalyst,hive-thriftserver,mllib-local,mllib,graphx,streaming,sql-kafka-0-10,streaming-kafka-0-10,streaming-kinesis-asl,kubernetes,hadoop-cloud,spark-ganglia-lgpl,profiler,protobuf,yarn,connect,sql,hive,pipelines"` precondition=" { \"build\": \"$build\", \"pyspark\": \"$pyspark\", \"pyspark-pandas\": \"$pandas\", \"sparkr\": \"$sparkr\", \"tpcds-1g\": \"$tpcds\", \"docker-integration-tests\": \"$docker\", \"lint\" : \"true\", \"java17\" : \"true\", \"java25\" : \"true\", \"docs\" : \"$docs\", \"yarn\" : \"$yarn\", \"k8s-integration-tests\" : \"$kubernetes\", \"buf\" : \"$buf\", \"ui\" : \"$ui\", }" echo $precondition # For debugging # Remove `\n` to avoid "Invalid format" error precondition="${precondition//$'\n'/}}" echo "required=$precondition" >> $GITHUB_OUTPUT else # This is usually set by scheduled jobs. precondition='${{ inputs.jobs }}' echo $precondition # For debugging precondition="${precondition//$'\n'/}" echo "required=$precondition" >> $GITHUB_OUTPUT fi - name: Check envs id: check-envs if: inputs.branch != 'branch-3.5' env: ${{ fromJSON(inputs.envs) }} run: | if [[ "${{ fromJson(steps.set-outputs.outputs.required).pyspark }}" == 'true' || "${{ fromJson(steps.set-outputs.outputs.required).pyspark-pandas }}" == 'true' ]]; then if [[ "${{ env.PYSPARK_IMAGE_TO_TEST }}" == "" ]]; then echo "PYSPARK_IMAGE_TO_TEST is required when pyspark is enabled." exit 1 fi PYSPARK_IMAGE_PATH="dev/spark-test-image/${{ env.PYSPARK_IMAGE_TO_TEST }}/Dockerfile" if [ -f $PYSPARK_IMAGE_PATH ]; then echo "Dockerfile $PYSPARK_IMAGE_PATH exists." else echo "Dockerfile $PYSPARK_IMAGE_PATH does NOT exist." exit 1 fi if [[ "${{ env.PYTHON_TO_TEST }}" == "" ]]; then echo "PYTHON_TO_TEST is required when pyspark is enabled." exit 1 fi fi - name: Generate infra image URL id: infra-image-outputs run: | # Convert to lowercase to meet Docker repo name requirement REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') IMG_NAME="apache-spark-ci-image:${{ inputs.branch }}-${{ github.run_id }}" IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME" echo "image_url=$IMG_URL" >> $GITHUB_OUTPUT - name: Generate infra image URL (Documentation) id: infra-image-docs-outputs run: | # Convert to lowercase to meet Docker repo name requirement REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') IMG_NAME="apache-spark-ci-image-docs:${{ inputs.branch }}-${{ github.run_id }}" IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME" echo "image_docs_url=$IMG_URL" >> $GITHUB_OUTPUT - name: Generate infra image URL (Linter) id: infra-image-lint-outputs run: | # Convert to lowercase to meet Docker repo name requirement REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') IMG_NAME="apache-spark-ci-image-lint:${{ inputs.branch }}-${{ github.run_id }}" IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME" echo "image_lint_url=$IMG_URL" >> $GITHUB_OUTPUT - name: Generate infra image URL (SparkR) id: infra-image-sparkr-outputs run: | # Convert to lowercase to meet Docker repo name requirement REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') IMG_NAME="apache-spark-ci-image-sparkr:${{ inputs.branch }}-${{ github.run_id }}" IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME" echo "image_sparkr_url=$IMG_URL" >> $GITHUB_OUTPUT - name: Generate infra image URL (PySpark ${{ env.PYSPARK_IMAGE_TO_TEST }}) id: infra-image-pyspark-outputs if: ${{ env.PYSPARK_IMAGE_TO_TEST }} env: ${{ fromJSON(inputs.envs) }} run: | # Convert to lowercase to meet Docker repo name requirement REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') IMG_NAME="apache-spark-ci-image-pyspark-${{ env.PYSPARK_IMAGE_TO_TEST }}:${{ inputs.branch }}-${{ github.run_id }}" IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME" echo "image_pyspark_url=$IMG_URL" >> $GITHUB_OUTPUT - name: Link the docker images id: infra-image-link env: ${{ fromJSON(inputs.envs) }} run: | # Set the image URL for job "docs" # Should delete the link and directly use image_docs_url after SPARK 3.x EOL if [[ "${{ inputs.branch }}" == 'branch-3.5' ]]; then echo "image_docs_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT echo "image_lint_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT echo "image_sparkr_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT echo "image_pyspark_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT else echo "image_docs_url_link=${{ steps.infra-image-docs-outputs.outputs.image_docs_url }}" >> $GITHUB_OUTPUT echo "image_lint_url_link=${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}" >> $GITHUB_OUTPUT echo "image_sparkr_url_link=${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}" >> $GITHUB_OUTPUT echo "image_pyspark_url_link=${{ steps.infra-image-pyspark-outputs.outputs.image_pyspark_url }}" >> $GITHUB_OUTPUT fi # Build: build Spark and run the tests for specified modules. build: name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }}" needs: precondition if: fromJson(needs.precondition.outputs.required).build == 'true' runs-on: ubuntu-latest timeout-minutes: 120 strategy: fail-fast: false matrix: java: - ${{ inputs.java }} hadoop: - ${{ inputs.hadoop }} hive: - hive2.3 # Note that the modules below are from sparktestsupport/modules.py. modules: - >- core, unsafe, kvstore, avro, utils, network-common, network-shuffle, repl, launcher, examples, sketch, variant - >- api, catalyst, hive-thriftserver - >- mllib-local, mllib, graphx, profiler, pipelines - >- streaming, sql-kafka-0-10, streaming-kafka-0-10, streaming-kinesis-asl, kubernetes, hadoop-cloud, spark-ganglia-lgpl, protobuf, connect - yarn # Here, we split Hive and SQL tests into some of slow ones and the rest of them. included-tags: [""] excluded-tags: [""] comment: [""] include: # Hive tests - modules: hive java: ${{ inputs.java }} hadoop: ${{ inputs.hadoop }} hive: hive2.3 included-tags: org.apache.spark.tags.SlowHiveTest comment: "- slow tests" - modules: hive java: ${{ inputs.java }} hadoop: ${{ inputs.hadoop }} hive: hive2.3 excluded-tags: org.apache.spark.tags.SlowHiveTest comment: "- other tests" # SQL tests - modules: sql java: ${{ inputs.java }} hadoop: ${{ inputs.hadoop }} hive: hive2.3 included-tags: org.apache.spark.tags.ExtendedSQLTest comment: "- extended tests" - modules: sql java: ${{ inputs.java }} hadoop: ${{ inputs.hadoop }} hive: hive2.3 included-tags: org.apache.spark.tags.SlowSQLTest comment: "- slow tests" - modules: sql java: ${{ inputs.java }} hadoop: ${{ inputs.hadoop }} hive: hive2.3 excluded-tags: org.apache.spark.tags.ExtendedSQLTest,org.apache.spark.tags.SlowSQLTest comment: "- other tests" exclude: # Always run if yarn == 'true', even infra-image is skip (such as non-master job) # In practice, the build will run in individual PR, but not against the individual commit # in Apache Spark repository. - modules: ${{ fromJson(needs.precondition.outputs.required).yarn != 'true' && 'yarn' }} env: MODULES_TO_TEST: ${{ matrix.modules }} EXCLUDED_TAGS: ${{ matrix.excluded-tags }} INCLUDED_TAGS: ${{ matrix.included-tags }} HADOOP_PROFILE: ${{ matrix.hadoop }} HIVE_PROFILE: ${{ matrix.hive }} GITHUB_PREV_SHA: ${{ github.event.before }} SPARK_LOCAL_IP: localhost NOLINT_ON_COMPILE: true SKIP_UNIDOC: true SKIP_MIMA: true SKIP_PACKAGING: true steps: - name: Checkout Spark repository uses: actions/checkout@v4 # In order to fetch changed files with: fetch-depth: 0 repository: apache/spark ref: ${{ inputs.branch }} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty # Cache local repositories. Note that GitHub Actions cache has a 10G limit. - name: Cache SBT and Maven uses: actions/cache@v4 with: path: | build/apache-maven-* build/*.jar ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} restore-keys: | build- - name: Cache Coursier local repository uses: actions/cache@v4 with: path: ~/.cache/coursier key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} restore-keys: | ${{ matrix.java }}-${{ matrix.hadoop }}-coursier- - name: Free up disk space run: | if [ -f ./dev/free_disk_space ]; then ./dev/free_disk_space fi - name: Install Java ${{ matrix.java }} uses: actions/setup-java@v4 with: distribution: zulu java-version: ${{ matrix.java }} - name: Install Python 3.11 uses: actions/setup-python@v5 # We should install one Python that is higher than 3+ for SQL and Yarn because: # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils. # - Yarn has a Python specific test too, for example, YarnClusterSuite. if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect') with: python-version: '3.11' architecture: x64 - name: Install Python packages (Python 3.11) if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect') || contains(matrix.modules, 'yarn') run: | python3.11 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.1' python3.11 -m pip list # Run the tests. - name: Run tests env: ${{ fromJSON(inputs.envs) }} shell: 'script -q -e -c "bash {0}"' run: | # Fix for TTY related issues when launching the Ammonite REPL in tests. export TERM=vt100 # Hive "other tests" test needs larger metaspace size based on experiment. if [[ "$MODULES_TO_TEST" == "hive" ]] && [[ "$EXCLUDED_TAGS" == "org.apache.spark.tags.SlowHiveTest" ]]; then export METASPACE_SIZE=2g; fi # SPARK-46283: should delete the following env replacement after SPARK 3.x EOL if [[ "$MODULES_TO_TEST" == *"streaming-kinesis-asl"* ]] && [[ "${{ inputs.branch }}" =~ ^branch-3 ]]; then MODULES_TO_TEST=${MODULES_TO_TEST//streaming-kinesis-asl, /} fi export SERIAL_SBT_TESTS=1 ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS" - name: Upload test results to report if: always() uses: actions/upload-artifact@v4 with: name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} path: "**/target/test-reports/*.xml" - name: Upload unit tests log files if: ${{ !success() }} uses: actions/upload-artifact@v4 with: name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} path: "**/target/*.log" - name: Upload yarn app log files if: ${{ !success() && contains(matrix.modules, 'yarn') }} uses: actions/upload-artifact@v4 with: name: yarn-app-log-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} path: "**/target/test/data/" infra-image: name: "Base image build" needs: precondition if: >- fromJson(needs.precondition.outputs.required).pyspark == 'true' || fromJson(needs.precondition.outputs.required).pyspark-pandas == 'true' || fromJson(needs.precondition.outputs.required).lint == 'true' || fromJson(needs.precondition.outputs.required).docs == 'true' || fromJson(needs.precondition.outputs.required).sparkr == 'true' runs-on: ubuntu-latest permissions: packages: write steps: - name: Login to GitHub Container Registry uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Checkout Spark repository uses: actions/checkout@v4 # In order to fetch changed files with: fetch-depth: 0 repository: apache/spark ref: ${{ inputs.branch }} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - name: Set up QEMU uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Build and push for branch-3.5 if: inputs.branch == 'branch-3.5' id: docker_build uses: docker/build-push-action@v6 with: context: ./dev/infra/ push: true tags: | ${{ needs.precondition.outputs.image_url }} # Use the infra image cache to speed up cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ inputs.branch }} - name: Build and push (Documentation) if: ${{ inputs.branch != 'branch-3.5' && fromJson(needs.precondition.outputs.required).docs == 'true' && hashFiles('dev/spark-test-image/docs/Dockerfile') != '' }} id: docker_build_docs uses: docker/build-push-action@v6 with: context: ./dev/spark-test-image/docs/ push: true tags: | ${{ needs.precondition.outputs.image_docs_url }} # Use the infra image cache to speed up cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-docs-cache:${{ inputs.branch }} - name: Build and push (Linter) if: ${{ inputs.branch != 'branch-3.5' && fromJson(needs.precondition.outputs.required).lint == 'true' && hashFiles('dev/spark-test-image/lint/Dockerfile') != '' }} id: docker_build_lint uses: docker/build-push-action@v6 with: context: ./dev/spark-test-image/lint/ push: true tags: | ${{ needs.precondition.outputs.image_lint_url }} # Use the infra image cache to speed up cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ inputs.branch }} - name: Build and push (SparkR) if: ${{ inputs.branch != 'branch-3.5' && fromJson(needs.precondition.outputs.required).sparkr == 'true' && hashFiles('dev/spark-test-image/sparkr/Dockerfile') != '' }} id: docker_build_sparkr uses: docker/build-push-action@v6 with: context: ./dev/spark-test-image/sparkr/ push: true tags: | ${{ needs.precondition.outputs.image_sparkr_url }} # Use the infra image cache to speed up cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ inputs.branch }} - name: Build and push (PySpark with ${{ env.PYSPARK_IMAGE_TO_TEST }}) if: ${{ inputs.branch != 'branch-3.5' && (fromJson(needs.precondition.outputs.required).pyspark == 'true' || fromJson(needs.precondition.outputs.required).pyspark-pandas == 'true') && env.PYSPARK_IMAGE_TO_TEST != '' }} id: docker_build_pyspark env: ${{ fromJSON(inputs.envs) }} uses: docker/build-push-action@v6 with: context: ./dev/spark-test-image/${{ env.PYSPARK_IMAGE_TO_TEST }}/ push: true tags: | ${{ needs.precondition.outputs.image_pyspark_url }} # Use the infra image cache to speed up cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-${{ env.PYSPARK_IMAGE_TO_TEST }}-cache:${{ inputs.branch }} pyspark: needs: [precondition, infra-image] # always run if pyspark == 'true', even infra-image is skip (such as non-master job) if: (!cancelled()) && (fromJson(needs.precondition.outputs.required).pyspark == 'true' || fromJson(needs.precondition.outputs.required).pyspark-pandas == 'true') name: "Build modules: ${{ matrix.modules }}" runs-on: ubuntu-latest timeout-minutes: 120 container: image: ${{ needs.precondition.outputs.image_pyspark_url_link }} strategy: fail-fast: false matrix: java: - ${{ inputs.java }} modules: - >- pyspark-sql, pyspark-resource, pyspark-testing - >- pyspark-core, pyspark-errors, pyspark-streaming, pyspark-logger - >- pyspark-mllib, pyspark-ml, pyspark-ml-connect, pyspark-pipelines - >- pyspark-connect - >- pyspark-pandas - >- pyspark-pandas-slow - >- pyspark-pandas-connect-part0 - >- pyspark-pandas-connect-part1 - >- pyspark-pandas-connect-part2 - >- pyspark-pandas-connect-part3 exclude: # Always run if pyspark == 'true', even infra-image is skip (such as non-master job) # In practice, the build will run in individual PR, but not against the individual commit # in Apache Spark repository. - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark != 'true' && 'pyspark-sql, pyspark-resource, pyspark-testing' }} - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark != 'true' && 'pyspark-core, pyspark-errors, pyspark-streaming, pyspark-logger' }} - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark != 'true' && 'pyspark-mllib, pyspark-ml, pyspark-ml-connect' }} - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark != 'true' && 'pyspark-connect' }} # Always run if pyspark-pandas == 'true', even infra-image is skip (such as non-master job) # In practice, the build will run in individual PR, but not against the individual commit # in Apache Spark repository. - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas' }} - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-slow' }} - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part0' }} - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part1' }} - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part2' }} - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part3' }} env: MODULES_TO_TEST: ${{ matrix.modules }} HADOOP_PROFILE: ${{ inputs.hadoop }} HIVE_PROFILE: hive2.3 GITHUB_PREV_SHA: ${{ github.event.before }} SPARK_LOCAL_IP: localhost SKIP_UNIDOC: true SKIP_MIMA: true SKIP_PACKAGING: true METASPACE_SIZE: 1g BRANCH: ${{ inputs.branch }} steps: - name: Checkout Spark repository uses: actions/checkout@v4 # In order to fetch changed files with: fetch-depth: 0 repository: apache/spark ref: ${{ inputs.branch }} - name: Add GITHUB_WORKSPACE to git trust safe.directory run: | git config --global --add safe.directory ${GITHUB_WORKSPACE} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty # Cache local repositories. Note that GitHub Actions cache has a 10G limit. - name: Cache SBT and Maven uses: actions/cache@v4 with: path: | build/apache-maven-* build/*.jar ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} restore-keys: | build- - name: Cache Coursier local repository uses: actions/cache@v4 with: path: ~/.cache/coursier key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} restore-keys: | pyspark-coursier- - name: Free up disk space shell: 'script -q -e -c "bash {0}"' run: ./dev/free_disk_space_container - name: Install Java ${{ matrix.java }} uses: actions/setup-java@v4 with: distribution: zulu java-version: ${{ matrix.java }} - name: List Python packages (${{ env.PYTHON_TO_TEST }}) if: ${{ env.PYTHON_TO_TEST != '' }} env: ${{ fromJSON(inputs.envs) }} shell: 'script -q -e -c "bash {0}"' run: | for py in $(echo $PYTHON_TO_TEST | tr "," "\n") do echo $py $py -m pip list done - name: Install Conda for pip packaging test if: contains(matrix.modules, 'pyspark-errors') uses: conda-incubator/setup-miniconda@v3 with: miniforge-version: latest # Run the tests. - name: Run tests env: ${{ fromJSON(inputs.envs) }} shell: 'script -q -e -c "bash {0}"' run: | if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then export PATH=$CONDA/bin:$PATH export SKIP_PACKAGING=false echo "Python Packaging Tests Enabled!" fi if [ ! -z "$PYTHON_TO_TEST" ]; then ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --python-executables "$PYTHON_TO_TEST" else # For branch-3.5 and below, it uses the default Python versions. ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" fi - name: Upload coverage to Codecov if: fromJSON(inputs.envs).PYSPARK_CODECOV == 'true' uses: codecov/codecov-action@v5 env: CODECOV_TOKEN: ${{ secrets.codecov_token }} with: files: ./python/coverage.xml flags: unittests name: PySpark verbose: true - name: Upload test results to report env: ${{ fromJSON(inputs.envs) }} if: always() uses: actions/upload-artifact@v4 with: name: test-results-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }} path: "**/target/test-reports/*.xml" - name: Upload unit tests log files env: ${{ fromJSON(inputs.envs) }} if: ${{ !success() }} uses: actions/upload-artifact@v4 with: name: unit-tests-log-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }} path: "**/target/unit-tests.log" sparkr: needs: [precondition, infra-image] # always run if sparkr == 'true', even infra-image is skip (such as non-master job) if: (!cancelled()) && fromJson(needs.precondition.outputs.required).sparkr == 'true' name: "Build modules: sparkr" runs-on: ubuntu-latest timeout-minutes: 120 container: image: ${{ needs.precondition.outputs.image_sparkr_url_link }} env: HADOOP_PROFILE: ${{ inputs.hadoop }} HIVE_PROFILE: hive2.3 GITHUB_PREV_SHA: ${{ github.event.before }} SPARK_LOCAL_IP: localhost SKIP_UNIDOC: true SKIP_MIMA: true SKIP_PACKAGING: true steps: - name: Checkout Spark repository uses: actions/checkout@v4 # In order to fetch changed files with: fetch-depth: 0 repository: apache/spark ref: ${{ inputs.branch }} - name: Add GITHUB_WORKSPACE to git trust safe.directory run: | git config --global --add safe.directory ${GITHUB_WORKSPACE} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty # Cache local repositories. Note that GitHub Actions cache has a 10G limit. - name: Cache SBT and Maven uses: actions/cache@v4 with: path: | build/apache-maven-* build/*.jar ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} restore-keys: | build- - name: Cache Coursier local repository uses: actions/cache@v4 with: path: ~/.cache/coursier key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} restore-keys: | sparkr-coursier- - name: Free up disk space run: ./dev/free_disk_space_container - name: Install Java ${{ inputs.java }} uses: actions/setup-java@v4 with: distribution: zulu java-version: ${{ inputs.java }} - name: Run tests env: ${{ fromJSON(inputs.envs) }} run: | # The followings are also used by `r-lib/actions/setup-r` to avoid # R issues at docker environment export TZ=UTC export _R_CHECK_SYSTEM_CLOCK_=FALSE ./dev/run-tests --parallelism 1 --modules sparkr - name: Upload test results to report if: always() uses: actions/upload-artifact@v4 with: name: test-results-sparkr--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 path: "**/target/test-reports/*.xml" buf: needs: [precondition] if: (!cancelled()) && fromJson(needs.precondition.outputs.required).buf == 'true' name: Protobuf breaking change detection and Python CodeGen check runs-on: ubuntu-latest steps: - name: Checkout Spark repository uses: actions/checkout@v4 with: fetch-depth: 0 repository: apache/spark ref: ${{ inputs.branch }} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - name: Install Buf uses: bufbuild/buf-setup-action@v1 with: github_token: ${{ secrets.GITHUB_TOKEN }} - name: Protocol Buffers Linter uses: bufbuild/buf-lint-action@v1 with: input: core/src/main/protobuf - name: Breaking change detection against branch-4.0 uses: bufbuild/buf-breaking-action@v1 with: input: sql/connect/common/src/main against: 'https://github.com/apache/spark.git#branch=branch-4.0,subdir=sql/connect/common/src/main' - name: Install Python 3.11 uses: actions/setup-python@v5 with: python-version: '3.11' - name: Install dependencies for Python CodeGen check run: | python3.11 -m pip install 'black==23.12.1' 'protobuf==5.29.1' 'mypy==1.8.0' 'mypy-protobuf==3.3.0' python3.11 -m pip list - name: Python CodeGen check for branch-3.5 if: inputs.branch == 'branch-3.5' run: ./dev/connect-check-protos.py - name: Python CodeGen check if: inputs.branch != 'branch-3.5' run: ./dev/check-protos.py # Static analysis lint: needs: [precondition, infra-image] # always run if lint == 'true', even infra-image is skip (such as non-master job) if: (!cancelled()) && fromJson(needs.precondition.outputs.required).lint == 'true' name: Linters, licenses, and dependencies runs-on: ubuntu-latest timeout-minutes: 120 env: LC_ALL: C.UTF-8 LANG: C.UTF-8 NOLINT_ON_COMPILE: false GITHUB_PREV_SHA: ${{ github.event.before }} container: image: ${{ needs.precondition.outputs.image_lint_url_link }} steps: - name: Checkout Spark repository uses: actions/checkout@v4 with: fetch-depth: 0 repository: apache/spark ref: ${{ inputs.branch }} - name: Add GITHUB_WORKSPACE to git trust safe.directory run: | git config --global --add safe.directory ${GITHUB_WORKSPACE} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty # Cache local repositories. Note that GitHub Actions cache has a 10G limit. - name: Cache SBT and Maven uses: actions/cache@v4 with: path: | build/apache-maven-* build/*.jar ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} restore-keys: | build- - name: Cache Coursier local repository uses: actions/cache@v4 with: path: ~/.cache/coursier key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} restore-keys: | docs-coursier- - name: Cache Maven local repository uses: actions/cache@v4 with: path: ~/.m2/repository key: docs-maven-${{ hashFiles('**/pom.xml') }} restore-keys: | docs-maven- - name: Free up disk space run: ./dev/free_disk_space_container - name: Install Java ${{ inputs.java }} uses: actions/setup-java@v4 with: distribution: zulu java-version: ${{ inputs.java }} - name: License test run: ./dev/check-license - name: Dependencies test run: ./dev/test-dependencies.sh - name: MIMA test run: ./dev/mima - name: Scala linter run: ./dev/lint-scala - name: Scala structured logging check for branch-3.5 and branch-4.0 if: inputs.branch == 'branch-3.5' || inputs.branch == 'branch-4.0' run: | if [ -f ./dev/structured_logging_style.py ]; then python3.9 ./dev/structured_logging_style.py fi - name: Scala structured logging check if: inputs.branch != 'branch-3.5' && inputs.branch != 'branch-4.0' run: | if [ -f ./dev/structured_logging_style.py ]; then python3.11 ./dev/structured_logging_style.py fi - name: Java linter run: ./dev/lint-java - name: Spark connect jvm client mima check run: ./dev/connect-jvm-client-mima-check - name: Install Python linter dependencies for branch-3.5 if: inputs.branch == 'branch-3.5' run: | # SPARK-45212: Copy from https://github.com/apache/spark/blob/555c8def51e5951c7bf5165a332795e9e330ec9d/.github/workflows/build_and_test.yml#L631-L638 # Should delete this section after SPARK 3.5 EOL. python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0' python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.56.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' - name: List Python packages for branch-3.5 and branch-4.0 if: inputs.branch == 'branch-3.5' || inputs.branch == 'branch-4.0' run: python3.9 -m pip list - name: List Python packages if: inputs.branch != 'branch-3.5' && inputs.branch != 'branch-4.0' run: python3.11 -m pip list - name: Python linter for branch-3.5 and branch-4.0 if: inputs.branch == 'branch-3.5' || inputs.branch == 'branch-4.0' run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python - name: Python linter if: inputs.branch != 'branch-3.5' && inputs.branch != 'branch-4.0' run: PYTHON_EXECUTABLE=python3.11 ./dev/lint-python # Should delete this section after SPARK 3.5 EOL. - name: Install dependencies for Python code generation check for branch-3.5 if: inputs.branch == 'branch-3.5' run: | # See more in "Installation" https://docs.buf.build/installation#tarball curl -LO https://github.com/bufbuild/buf/releases/download/v1.28.1/buf-Linux-x86_64.tar.gz mkdir -p $HOME/buf tar -xvzf buf-Linux-x86_64.tar.gz -C $HOME/buf --strip-components 1 rm buf-Linux-x86_64.tar.gz python3.9 -m pip install 'protobuf==4.25.1' 'mypy-protobuf==3.3.0' # Should delete this section after SPARK 3.5 EOL. - name: Python code generation check for branch-3.5 if: inputs.branch == 'branch-3.5' run: if test -f ./dev/connect-check-protos.py; then PATH=$PATH:$HOME/buf/bin PYTHON_EXECUTABLE=python3.9 ./dev/connect-check-protos.py; fi # Should delete this section after SPARK 3.5 EOL. - name: Install JavaScript linter dependencies for branch-3.5 if: inputs.branch == 'branch-3.5' run: | apt update apt-get install -y nodejs npm - name: JS linter run: ./dev/lint-js # Should delete this section after SPARK 3.5 EOL. - name: Install R linter dependencies for branch-3.5 if: inputs.branch == 'branch-3.5' run: | apt update apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev \ libfontconfig1-dev libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev \ libtiff5-dev libjpeg-dev Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')" Rscript -e "devtools::install_version('lintr', version='2.0.1', repos='https://cloud.r-project.org')" - name: Install R linter dependencies and SparkR run: ./R/install-dev.sh - name: R linter run: ./dev/lint-r java17: needs: [precondition] if: fromJson(needs.precondition.outputs.required).java17 == 'true' name: Java 17 build with Maven runs-on: ubuntu-latest timeout-minutes: 120 steps: - uses: actions/checkout@v4 - uses: actions/setup-java@v4 with: distribution: zulu java-version: 17 - name: Build with Maven run: | export MAVEN_OPTS="-Xss64m -Xmx4g -Xms4g -XX:ReservedCodeCacheSize=128m -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN" export MAVEN_CLI_OPTS="--no-transfer-progress" ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pjvm-profiler -Pspark-ganglia-lgpl -Pkinesis-asl clean install java25: needs: [precondition] if: fromJson(needs.precondition.outputs.required).java25 == 'true' name: Java 25 build with Maven runs-on: ubuntu-latest timeout-minutes: 120 steps: - uses: actions/checkout@v4 - uses: actions/setup-java@v4 with: distribution: zulu java-version: 25-ea - name: Build with Maven run: | export MAVEN_OPTS="-Xss64m -Xmx4g -Xms4g -XX:ReservedCodeCacheSize=128m -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN" export MAVEN_CLI_OPTS="--no-transfer-progress" ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pjvm-profiler -Pspark-ganglia-lgpl -Pkinesis-asl clean install # Documentation build docs: needs: [precondition, infra-image] # always run if lint == 'true', even infra-image is skip (such as non-master job) if: (!cancelled()) && fromJson(needs.precondition.outputs.required).docs == 'true' name: Documentation generation runs-on: ubuntu-latest timeout-minutes: 120 env: LC_ALL: C.UTF-8 LANG: C.UTF-8 NOLINT_ON_COMPILE: false PYSPARK_DRIVER_PYTHON: python3.9 PYSPARK_PYTHON: python3.9 GITHUB_PREV_SHA: ${{ github.event.before }} container: image: ${{ needs.precondition.outputs.image_docs_url_link }} steps: - name: Checkout Spark repository uses: actions/checkout@v4 with: fetch-depth: 0 repository: apache/spark ref: ${{ inputs.branch }} - name: Add GITHUB_WORKSPACE to git trust safe.directory run: | git config --global --add safe.directory ${GITHUB_WORKSPACE} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty # Cache local repositories. Note that GitHub Actions cache has a 10G limit. - name: Cache SBT and Maven uses: actions/cache@v4 with: path: | build/apache-maven-* build/*.jar ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} restore-keys: | build- - name: Cache Coursier local repository uses: actions/cache@v4 with: path: ~/.cache/coursier key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} restore-keys: | docs-coursier- - name: Cache Maven local repository uses: actions/cache@v4 with: path: ~/.m2/repository key: docs-maven-${{ hashFiles('**/pom.xml') }} restore-keys: | docs-maven- - name: Free up disk space run: ./dev/free_disk_space_container - name: Install Java ${{ inputs.java }} uses: actions/setup-java@v4 with: distribution: zulu java-version: ${{ inputs.java }} - name: Install dependencies for documentation generation for branch-3.5 if: inputs.branch == 'branch-3.5' run: | # pandoc is required to generate PySpark APIs as well in nbsphinx. apt-get update -y apt-get install -y libcurl4-openssl-dev pandoc apt-get install -y ruby ruby-dev Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'markdown', 'e1071', 'roxygen2', 'ggplot2', 'mvtnorm', 'statmod'), repos='https://cloud.r-project.org/')" Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" # Should unpin 'sphinxcontrib-*' after upgrading sphinx>5 python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' python3.9 -m pip install ipython_genutils # See SPARK-38517 python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly<6.0.0' python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421 - name: List Python packages for branch-3.5 and branch-4.0 if: inputs.branch == 'branch-3.5' || inputs.branch == 'branch-4.0' run: python3.9 -m pip list - name: List Python packages if: inputs.branch != 'branch-3.5' && inputs.branch != 'branch-4.0' run: python3.11 -m pip list - name: Install dependencies for documentation generation run: | # Keep the version of Bundler here in sync with the following locations: # - dev/create-release/spark-rm/Dockerfile # - docs/README.md gem install bundler -v 2.4.22 cd docs bundle install --retry=100 - name: Run documentation build for branch-3.5 and branch-4.0 if: inputs.branch == 'branch-3.5' || inputs.branch == 'branch-4.0' run: | # We need this link to make sure `python3` points to `python3.9` which contains the prerequisite packages. ln -s "$(which python3.9)" "/usr/local/bin/python3" # Build docs first with SKIP_API to ensure they are buildable without requiring any # language docs to be built beforehand. cd docs; SKIP_ERRORDOC=1 SKIP_API=1 bundle exec jekyll build; cd .. if [ -f "./dev/is-changed.py" ]; then # Skip PySpark and SparkR docs while keeping Scala/Java/SQL docs pyspark_modules=`cd dev && python3.9 -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"` if [ `./dev/is-changed.py -m $pyspark_modules` = false ]; then export SKIP_PYTHONDOC=1; fi if [ `./dev/is-changed.py -m sparkr` = false ]; then export SKIP_RDOC=1; fi fi # Print the values of environment variables `SKIP_ERRORDOC`, `SKIP_SCALADOC`, `SKIP_PYTHONDOC`, `SKIP_RDOC` and `SKIP_SQLDOC` echo "SKIP_ERRORDOC: $SKIP_ERRORDOC" echo "SKIP_SCALADOC: $SKIP_SCALADOC" echo "SKIP_PYTHONDOC: $SKIP_PYTHONDOC" echo "SKIP_RDOC: $SKIP_RDOC" echo "SKIP_SQLDOC: $SKIP_SQLDOC" cd docs bundle exec jekyll build - name: Run documentation build if: inputs.branch != 'branch-3.5' && inputs.branch != 'branch-4.0' run: | # We need this link to make sure `python3` points to `python3.11` which contains the prerequisite packages. ln -s "$(which python3.11)" "/usr/local/bin/python3" # Build docs first with SKIP_API to ensure they are buildable without requiring any # language docs to be built beforehand. cd docs; SKIP_ERRORDOC=1 SKIP_API=1 bundle exec jekyll build; cd .. if [ -f "./dev/is-changed.py" ]; then # Skip PySpark and SparkR docs while keeping Scala/Java/SQL docs pyspark_modules=`cd dev && python3.11 -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"` if [ `./dev/is-changed.py -m $pyspark_modules` = false ]; then export SKIP_PYTHONDOC=1; fi if [ `./dev/is-changed.py -m sparkr` = false ]; then export SKIP_RDOC=1; fi fi export PYSPARK_DRIVER_PYTHON=python3.11 export PYSPARK_PYTHON=python3.11 # Print the values of environment variables `SKIP_ERRORDOC`, `SKIP_SCALADOC`, `SKIP_PYTHONDOC`, `SKIP_RDOC` and `SKIP_SQLDOC` echo "SKIP_ERRORDOC: $SKIP_ERRORDOC" echo "SKIP_SCALADOC: $SKIP_SCALADOC" echo "SKIP_PYTHONDOC: $SKIP_PYTHONDOC" echo "SKIP_RDOC: $SKIP_RDOC" echo "SKIP_SQLDOC: $SKIP_SQLDOC" cd docs bundle exec jekyll build - name: Tar documentation if: github.repository != 'apache/spark' run: tar cjf site.tar.bz2 docs/_site - name: Upload documentation if: github.repository != 'apache/spark' uses: actions/upload-artifact@v4 with: name: site path: site.tar.bz2 retention-days: 1 # Any TPC-DS related updates on this job need to be applied to tpcds-1g-gen job of benchmark.yml as well tpcds-1g: needs: precondition if: fromJson(needs.precondition.outputs.required).tpcds-1g == 'true' name: Run TPC-DS queries with SF=1 runs-on: ubuntu-latest timeout-minutes: 120 env: SPARK_LOCAL_IP: localhost steps: - name: Checkout Spark repository uses: actions/checkout@v4 with: fetch-depth: 0 repository: apache/spark ref: ${{ inputs.branch }} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - name: Cache SBT and Maven uses: actions/cache@v4 with: path: | build/apache-maven-* build/*.jar ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} restore-keys: | build- - name: Cache Coursier local repository uses: actions/cache@v4 with: path: ~/.cache/coursier key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} restore-keys: | tpcds-coursier- - name: Install Java ${{ inputs.java }} uses: actions/setup-java@v4 with: distribution: zulu java-version: ${{ inputs.java }} - name: Cache TPC-DS generated data id: cache-tpcds-sf-1 uses: actions/cache@v4 with: path: ./tpcds-sf-1 key: tpcds-${{ hashFiles('.github/workflows/build_and_test.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }} - name: Checkout tpcds-kit repository if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' uses: actions/checkout@v4 with: repository: databricks/tpcds-kit ref: 1b7fb7529edae091684201fab142d956d6afd881 path: ./tpcds-kit - name: Build tpcds-kit if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' run: cd tpcds-kit/tools && make OS=LINUX - name: Generate TPC-DS (SF=1) table data if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' run: build/sbt "sql/Test/runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite" - name: Run TPC-DS queries (Sort merge join) run: | SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" env: SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }} SPARK_TPCDS_JOIN_CONF: | spark.sql.autoBroadcastJoinThreshold=-1 spark.sql.join.preferSortMergeJoin=true - name: Run TPC-DS queries (Broadcast hash join) run: | SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" env: SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }} SPARK_TPCDS_JOIN_CONF: | spark.sql.autoBroadcastJoinThreshold=10485760 - name: Run TPC-DS queries (Shuffled hash join) run: | SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" env: SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }} SPARK_TPCDS_JOIN_CONF: | spark.sql.autoBroadcastJoinThreshold=-1 spark.sql.join.forceApplyShuffledHashJoin=true - name: Run TPC-DS queries on collated data if: inputs.branch != 'branch-3.5' run: | SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSCollationQueryTestSuite" - name: Upload test results to report if: always() uses: actions/upload-artifact@v4 with: name: test-results-tpcds--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 path: "**/target/test-reports/*.xml" - name: Upload unit tests log files if: ${{ !success() }} uses: actions/upload-artifact@v4 with: name: unit-tests-log-tpcds--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 path: "**/target/unit-tests.log" docker-integration-tests: needs: precondition if: fromJson(needs.precondition.outputs.required).docker-integration-tests == 'true' name: Run Docker integration tests runs-on: ubuntu-latest timeout-minutes: 120 env: HADOOP_PROFILE: ${{ inputs.hadoop }} HIVE_PROFILE: hive2.3 GITHUB_PREV_SHA: ${{ github.event.before }} SPARK_LOCAL_IP: localhost SKIP_UNIDOC: true SKIP_MIMA: true SKIP_PACKAGING: true steps: - name: Checkout Spark repository uses: actions/checkout@v4 with: fetch-depth: 0 repository: apache/spark ref: ${{ inputs.branch }} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - name: Cache SBT and Maven uses: actions/cache@v4 with: path: | build/apache-maven-* build/*.jar ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} restore-keys: | build- - name: Cache Coursier local repository uses: actions/cache@v4 with: path: ~/.cache/coursier key: docker-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} restore-keys: | docker-integration-coursier- - name: Install Java ${{ inputs.java }} uses: actions/setup-java@v4 with: distribution: zulu java-version: ${{ inputs.java }} - name: Run tests env: ${{ fromJSON(inputs.envs) }} run: | ./dev/run-tests --parallelism 1 --modules docker-integration-tests --included-tags org.apache.spark.tags.DockerTest - name: Upload test results to report if: always() uses: actions/upload-artifact@v4 with: name: test-results-docker-integration--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 path: "**/target/test-reports/*.xml" - name: Upload unit tests log files if: ${{ !success() }} uses: actions/upload-artifact@v4 with: name: unit-tests-log-docker-integration--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 path: "**/target/unit-tests.log" k8s-integration-tests: needs: precondition if: fromJson(needs.precondition.outputs.required).k8s-integration-tests == 'true' name: Run Spark on Kubernetes Integration test runs-on: ubuntu-latest timeout-minutes: 120 steps: - name: Checkout Spark repository uses: actions/checkout@v4 with: fetch-depth: 0 repository: apache/spark ref: ${{ inputs.branch }} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - name: Cache SBT and Maven uses: actions/cache@v4 with: path: | build/apache-maven-* build/*.jar ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} restore-keys: | build- - name: Cache Coursier local repository uses: actions/cache@v4 with: path: ~/.cache/coursier key: k8s-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} restore-keys: | k8s-integration-coursier- - name: Install Java ${{ inputs.java }} uses: actions/setup-java@v4 with: distribution: zulu java-version: ${{ inputs.java }} - name: Install R run: | sudo apt update sudo apt-get install r-base - name: Start Minikube uses: medyagh/setup-minikube@v0.0.19 with: kubernetes-version: "1.33.0" # Github Action limit cpu:2, memory: 6947MB, limit to 2U6G for better resource statistic cpus: 2 memory: 6144m - name: Print K8S pods and nodes info run: | kubectl get pods -A kubectl describe node - name: Run Spark on K8S integration test run: | # Prepare PV test PVC_TMP_DIR=$(mktemp -d) export PVC_TESTS_HOST_PATH=$PVC_TMP_DIR export PVC_TESTS_VM_PATH=$PVC_TMP_DIR minikube mount ${PVC_TESTS_HOST_PATH}:${PVC_TESTS_VM_PATH} --gid=0 --uid=185 & kubectl create clusterrolebinding serviceaccounts-cluster-admin --clusterrole=cluster-admin --group=system:serviceaccounts || true if [[ "${{ inputs.branch }}" == 'branch-3.5' ]]; then kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.7.0/installer/volcano-development.yaml || true elif [[ "${{ inputs.branch }}" == 'branch-4.0' ]]; then kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.11.0/installer/volcano-development.yaml || true else kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.12.1/installer/volcano-development.yaml || true fi eval $(minikube docker-env) build/sbt -Phadoop-3 -Psparkr -Pkubernetes -Pvolcano -Pkubernetes-integration-tests -Dspark.kubernetes.test.volcanoMaxConcurrencyJobNum=1 -Dtest.exclude.tags=local "kubernetes-integration-tests/test" - name: Upload Spark on K8S integration tests log files if: ${{ !success() }} uses: actions/upload-artifact@v4 with: name: spark-on-kubernetes-it-log path: "**/target/integration-tests.log" ui: needs: [precondition] if: fromJson(needs.precondition.outputs.required).ui == 'true' name: Run Spark UI tests runs-on: ubuntu-latest timeout-minutes: 120 steps: - uses: actions/checkout@v4 - name: Use Node.js uses: actions/setup-node@v4 with: node-version: 20 cache: 'npm' cache-dependency-path: ui-test/package-lock.json - run: | cd ui-test npm install --save-dev node --experimental-vm-modules node_modules/.bin/jest