diff --git a/.codecov.yml b/.codecov.yml deleted file mode 100644 index 046261603d..0000000000 --- a/.codecov.yml +++ /dev/null @@ -1,2 +0,0 @@ -ignore: - - "**/tensorflow/tensorflow_serving/.*" diff --git a/.coveragerc b/.coveragerc deleted file mode 100644 index 8ed7382211..0000000000 --- a/.coveragerc +++ /dev/null @@ -1,4 +0,0 @@ -[run] -concurrency = threading -omit = sagemaker/tests/* -timid = True diff --git a/.dictionary b/.dictionary deleted file mode 100644 index 8907f4ff9b..0000000000 --- a/.dictionary +++ /dev/null @@ -1,38 +0,0 @@ -args -arn -autoscaling -aws -bool -boolean -boto -botocore -clienterror -cloudwatch -cron -config -dataset -datasets -datetime -desc -docstring -entrypoint -env -iam -hyperparameter -hyperparameters -jupyter -kms -kwargs -neo -noqa -rc -runtime -sagemaker -stdout -str -subdirectories -subnet -subnets -unexpectedstatusexception -uri -vpc diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 51ecee6eee..0000000000 --- a/.flake8 +++ /dev/null @@ -1,5 +0,0 @@ -[flake8] -application_import_names = sagemaker, tests -import-order-style = google -per-file-ignores = - tests/unit/test_tuner.py: F405 diff --git a/.githooks/pre-push b/.githooks/pre-push deleted file mode 100755 index a6d1f6ad6b..0000000000 --- a/.githooks/pre-push +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/sh -# this pre-push hook runs style checks and unit tests in python 3.6, 3.7, and 3.8 using tox. - -set -e - -TOX_PARALLEL_NO_SPINNER=1, -PY_COLORS=0 -start_time=`date +%s` -tox -e flake8,pylint,docstyle,black-check,twine --parallel all -./ci-scripts/displaytime.sh 'flake8,pylint,docstyle,black-check,twine' $start_time -start_time=`date +%s` -tox -e sphinx,doc8 --parallel all -./ci-scripts/displaytime.sh 'sphinx,doc8' $start_time -start_time=`date +%s` -tox -e py36,py37,py38 --parallel all -- tests/unit -./ci-scripts/displaytime.sh 'py36,py37,py38 unit' $start_time diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 036f6a3e9e..746d95a2aa 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -7,11 +7,16 @@ assignees: '' --- +**PySDK Version** +- [ ] PySDK V2 (2.x) +- [ ] PySDK V3 (3.x) + **Describe the bug** A clear and concise description of what the bug is. **To reproduce** A clear, step-by-step set of instructions to reproduce the bug. +The provided code need to be **complete** and **runnable**, if additional data is needed, please include them in the issue. **Expected behavior** A clear and concise description of what you expected to happen. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md deleted file mode 100644 index 664bf50754..0000000000 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ /dev/null @@ -1,26 +0,0 @@ -*Issue #, if available:* - -*Description of changes:* - -*Testing done:* - -## Merge Checklist - -_Put an `x` in the boxes that apply. You can also fill these out after creating the PR. If you're unsure about any of them, don't hesitate to ask. We're here to help! 
This is simply a reminder of what we are going to look for before merging your pull request._ - -#### General - -- [ ] I have read the [CONTRIBUTING](https://github.com/aws/sagemaker-python-sdk/blob/master/CONTRIBUTING.md) doc -- [ ] I certify that the changes I am introducing will be backword compatible, and I have discussed concerns about this, if any, with the Python SDK team -- [ ] I used the commit message format described in [CONTRIBUTING](https://github.com/aws/sagemaker-python-sdk/blob/master/CONTRIBUTING.md#committing-your-change) -- [ ] I have passed the region in to all S3 and STS clients that I've initialized as part of this change. -- [ ] I have updated any necessary documentation, including [READMEs](https://github.com/aws/sagemaker-python-sdk/blob/master/README.rst) and [API docs](https://github.com/aws/sagemaker-python-sdk/tree/master/doc) (if appropriate) - -#### Tests - -- [ ] I have added tests that prove my fix is effective or that my feature works (if appropriate) -- [ ] I have added unit and/or integration tests as appropriate to ensure backward compatibility of the changes -- [ ] I have checked that my tests are not configured for a specific region or account (if appropriate) -- [ ] I have used [`unique_name_from_base`](https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/utils.py#L77) to create resource names in integ tests (if appropriate) - -By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. diff --git a/.github/workflows/_conda-forge-package-release.yml b/.github/workflows/_conda-forge-package-release.yml new file mode 100644 index 0000000000..347489124e --- /dev/null +++ b/.github/workflows/_conda-forge-package-release.yml @@ -0,0 +1,99 @@ +name: _Conda Forge Package Release + +on: + workflow_call: + inputs: + package: + description: 'Package being released (e.g. sagemaker-train)' + required: true + type: string + feedstock: + description: 'Feedstock repo (e.g. conda-forge/sagemaker-train-feedstock)' + required: true + type: string + pr_search: + description: 'PR title search string (e.g. sagemaker-train v1.6.0)' + required: true + type: string + version: + description: 'Version of this package being released (e.g. 1.6.0)' + required: true + type: string + dep_package: + description: 'Conda dependency to wait for before retrying CI (e.g. sagemaker-core)' + required: true + type: string + dep_version: + description: 'Version of the dependency to wait for (e.g. 2.6.0)' + required: true + type: string + poll_interval: + required: true + type: string + max_attempts: + required: true + type: string + secrets: + token: + required: true + +jobs: + release: + runs-on: ubuntu-latest + env: + GH_TOKEN: ${{ secrets.token }} + steps: + - name: Wait for dependency (${{ inputs.dep_package }}==${{ inputs.dep_version }}) on conda-forge + run: | + PACKAGE="${{ inputs.dep_package }}" + VERSION="${{ inputs.dep_version }}" + echo "Waiting for ${PACKAGE}==${VERSION} on conda-forge..." + for i in $(seq 1 ${{ inputs.max_attempts }}); do + RESULT=$(conda search -c conda-forge --override-channels \ + "${PACKAGE}==${VERSION}" --json 2>/dev/null \ + | python3 -c "import sys,json; d=json.load(sys.stdin); print('found' if d.get('$PACKAGE') else 'not_found')" \ + 2>/dev/null || echo "not_found") + echo "Attempt $i: ${RESULT}" + [ "$RESULT" = "found" ] && echo "${PACKAGE}==${VERSION} is available." && exit 0 + sleep ${{ inputs.poll_interval }} + done + echo "Timed out waiting for ${PACKAGE}==${VERSION}." 
&& exit 1 + + - name: Merge and wait for ${{ inputs.package }} feedstock PR + run: | + REPO="${{ inputs.feedstock }}" + SEARCH="${{ inputs.pr_search }}" + for i in $(seq 1 ${{ inputs.max_attempts }}); do + STATE=$(gh pr list --repo "$REPO" --state all \ + --search "$SEARCH" --json state -q '.[0].state // "NOT_FOUND"') + echo "Attempt $i: ${STATE}" + [ "$STATE" = "MERGED" ] && exit 0 + PR=$(gh pr list --repo "$REPO" --state open \ + --search "$SEARCH" --json number -q '.[0].number') + if [ -n "$PR" ]; then + CI_STATUS=$(gh pr view "$PR" --repo "$REPO" \ + --json statusCheckRollup -q ' + .statusCheckRollup | map( + if .__typename == "CheckRun" then .conclusion + elif .__typename == "StatusContext" then .state + else null end + ) | if length == 0 then "pending" + elif any(. == null or . == "" or . == "IN_PROGRESS" or . == "QUEUED" or . == "WAITING" or . == "PENDING") then "pending" + elif all(. == "SUCCESS") then "success" + else "failure" end') + echo "CI status: ${CI_STATUS}" + if [ "$CI_STATUS" = "success" ]; then + echo "CI passed, merging PR #${PR}..." + gh pr merge "$PR" --repo "$REPO" --merge 2>/dev/null || true + elif [ "$CI_STATUS" = "failure" ]; then + echo "CI failed, retriggering..." + BRANCH=$(gh pr view "$PR" --repo "$REPO" --json headRefName -q .headRefName) + RUN_ID=$(gh run list --repo "$REPO" --branch "$BRANCH" \ + --json databaseId,conclusion -q \ + '[.[] | select(.conclusion=="failure")][0].databaseId') + [ -n "$RUN_ID" ] && gh run rerun "$RUN_ID" --repo "$REPO" --failed || true + fi + fi + sleep ${{ inputs.poll_interval }} + done + echo "Timed out waiting for ${{ inputs.package }} PR to merge." && exit 1 diff --git a/.github/workflows/ci-health.yml b/.github/workflows/ci-health.yml new file mode 100644 index 0000000000..db94ce084a --- /dev/null +++ b/.github/workflows/ci-health.yml @@ -0,0 +1,38 @@ +name: CI Health +on: + schedule: + - cron: "0 */3 * * *" + workflow_dispatch: + +permissions: + id-token: write # This is required for requesting the JWT + +jobs: + canaries-v3: + runs-on: ubuntu-latest + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + - name: Run Canaries V3 + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: sagemaker-python-sdk-ci-health-canaries-v3 + source-version: refs/heads/master + canaries-v2: + runs-on: ubuntu-latest + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + - name: Run Canaries V2 + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: sagemaker-python-sdk-ci-health-canaries-v2 + source-version: refs/heads/master-v2 diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000000..a01c842acf --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,33 @@ +name: "CodeQL" +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + schedule: + - cron: '30 15 * * *' +jobs: + analyze: + name: Analyze (${{ matrix.language }}) + runs-on: ${{ 'ubuntu-latest' }} + permissions: + security-events: write + packages: read + + strategy: + matrix: + include: + - language: python + build-mode: none + steps: + - name: Checkout repository + uses: actions/checkout@6ccd57f4c5d15bdc2fef309bd9fb6cc9db2ef1c6 + - name: Initialize CodeQL + uses: 
github/codeql-action/init@4b1d7da102ff94aca014c0245062b1a463356d72 + with: + languages: ${{ matrix.language }} + build-mode: ${{ matrix.build-mode }} + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@4b1d7da102ff94aca014c0245062b1a463356d72 + with: + category: "/language:${{matrix.language}}" \ No newline at end of file diff --git a/.github/workflows/conda-forge-release-chain.yml b/.github/workflows/conda-forge-release-chain.yml new file mode 100644 index 0000000000..8c49014b74 --- /dev/null +++ b/.github/workflows/conda-forge-release-chain.yml @@ -0,0 +1,122 @@ +name: Conda Forge Release Chain + +on: + workflow_dispatch: + inputs: + poll_interval_seconds: + description: 'Seconds between polls (default: 300)' + required: false + default: '300' + timeout_attempts: + description: 'Max poll attempts per package before failing (default: 20 = 100min at 300s)' + required: false + default: '20' + +jobs: + read-versions: + runs-on: ubuntu-latest + outputs: + core: ${{ steps.v.outputs.core }} + train: ${{ steps.v.outputs.train }} + serve: ${{ steps.v.outputs.serve }} + mlops: ${{ steps.v.outputs.mlops }} + pysdk: ${{ steps.v.outputs.pysdk }} + meta: ${{ steps.v.outputs.meta }} + steps: + - uses: actions/checkout@v4 + - name: Read versions + id: v + run: | + echo "core=$(cat sagemaker-core/VERSION)" >> $GITHUB_OUTPUT + echo "train=$(cat sagemaker-train/VERSION)" >> $GITHUB_OUTPUT + echo "serve=$(cat sagemaker-serve/VERSION)" >> $GITHUB_OUTPUT + echo "mlops=$(cat sagemaker-mlops/VERSION)" >> $GITHUB_OUTPUT + echo "pysdk=$(cat VERSION)" >> $GITHUB_OUTPUT + echo "meta=$(cat VERSION)" >> $GITHUB_OUTPUT + echo "Versions:" + echo " sagemaker-core: $(cat sagemaker-core/VERSION)" + echo " sagemaker-train: $(cat sagemaker-train/VERSION)" + echo " sagemaker-serve: $(cat sagemaker-serve/VERSION)" + echo " sagemaker-mlops: $(cat sagemaker-mlops/VERSION)" + echo " sagemaker-python-sdk: $(cat VERSION)" + echo " sagemaker (meta): $(cat VERSION)" + + # sagemaker-train waits for sagemaker-core + release-sagemaker-train: + needs: read-versions + uses: ./.github/workflows/_conda-forge-package-release.yml + with: + package: sagemaker-train + feedstock: conda-forge/sagemaker-train-feedstock + pr_search: "sagemaker-train v${{ needs.read-versions.outputs.train }}" + version: ${{ needs.read-versions.outputs.train }} + dep_package: sagemaker-core + dep_version: ${{ needs.read-versions.outputs.core }} + poll_interval: ${{ github.event.inputs.poll_interval_seconds }} + max_attempts: ${{ github.event.inputs.timeout_attempts }} + secrets: + token: ${{ secrets.CONDA_FORGE_RELEASE }} + + # sagemaker-serve waits for sagemaker-train + release-sagemaker-serve: + needs: [read-versions, release-sagemaker-train] + uses: ./.github/workflows/_conda-forge-package-release.yml + with: + package: sagemaker-serve + feedstock: conda-forge/sagemaker-serve-feedstock + pr_search: "sagemaker-serve v${{ needs.read-versions.outputs.serve }}" + version: ${{ needs.read-versions.outputs.serve }} + dep_package: sagemaker-train + dep_version: ${{ needs.read-versions.outputs.train }} + poll_interval: ${{ github.event.inputs.poll_interval_seconds }} + max_attempts: ${{ github.event.inputs.timeout_attempts }} + secrets: + token: ${{ secrets.CONDA_FORGE_RELEASE }} + + # sagemaker-mlops waits for sagemaker-serve + release-sagemaker-mlops: + needs: [read-versions, release-sagemaker-serve] + uses: ./.github/workflows/_conda-forge-package-release.yml + with: + package: sagemaker-mlops + feedstock: conda-forge/sagemaker-mlops-feedstock + 
pr_search: "sagemaker-mlops v${{ needs.read-versions.outputs.mlops }}" + version: ${{ needs.read-versions.outputs.mlops }} + dep_package: sagemaker-serve + dep_version: ${{ needs.read-versions.outputs.serve }} + poll_interval: ${{ github.event.inputs.poll_interval_seconds }} + max_attempts: ${{ github.event.inputs.timeout_attempts }} + secrets: + token: ${{ secrets.CONDA_FORGE_RELEASE }} + + # sagemaker-python-sdk waits for sagemaker-mlops + release-sagemaker-python-sdk: + needs: [read-versions, release-sagemaker-mlops] + uses: ./.github/workflows/_conda-forge-package-release.yml + with: + package: sagemaker-python-sdk + feedstock: conda-forge/sagemaker-python-sdk-feedstock + pr_search: "[bot-automerge] sagemaker-python-sdk v${{ needs.read-versions.outputs.pysdk }}" + version: ${{ needs.read-versions.outputs.pysdk }} + dep_package: sagemaker-mlops + dep_version: ${{ needs.read-versions.outputs.mlops }} + poll_interval: ${{ github.event.inputs.poll_interval_seconds }} + max_attempts: ${{ github.event.inputs.timeout_attempts }} + secrets: + token: ${{ secrets.CONDA_FORGE_RELEASE }} + + # sagemaker (meta) waits for sagemaker-python-sdk + release-sagemaker: + needs: [read-versions, release-sagemaker-python-sdk] + uses: ./.github/workflows/_conda-forge-package-release.yml + with: + package: sagemaker + feedstock: conda-forge/sagemaker-feedstock + pr_search: "[bot-automerge] sagemaker v${{ needs.read-versions.outputs.meta }}" + version: ${{ needs.read-versions.outputs.meta }} + dep_package: sagemaker-python-sdk + dep_version: ${{ needs.read-versions.outputs.pysdk }} + poll_interval: ${{ github.event.inputs.poll_interval_seconds }} + max_attempts: ${{ github.event.inputs.timeout_attempts }} + secrets: + token: ${{ secrets.CONDA_FORGE_RELEASE }} diff --git a/.github/workflows/fortress-scan.yml b/.github/workflows/fortress-scan.yml new file mode 100644 index 0000000000..d2b6af54d7 --- /dev/null +++ b/.github/workflows/fortress-scan.yml @@ -0,0 +1,62 @@ +name: Fortress Security Scan +on: + pull_request_target: + branches: + - "master" + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.head_ref }} + cancel-in-progress: true + +permissions: + id-token: write + +jobs: + collab-check: + runs-on: ubuntu-latest + outputs: + approval-env: ${{ steps.collab-check.outputs.result }} + steps: + - name: Collaborator Check + uses: actions/github-script@v7 + id: collab-check + with: + github-token: ${{ secrets.COLLAB_CHECK_TOKEN }} + result-encoding: string + script: | + try { + const res = await github.rest.repos.checkCollaborator({ + owner: context.repo.owner, + repo: context.repo.repo, + username: "${{ github.event.pull_request.user.login }}", + }); + console.log("Verified ${{ github.event.pull_request.user.login }} is a repo collaborator. Auto Approving.") + return res.status == "204" ? "auto-approve" : "manual-approval" + } catch (error) { + console.log("${{ github.event.pull_request.user.login }} is not a collaborator. Requiring Manual Approval.") + return "manual-approval" + } + + wait-for-approval: + runs-on: ubuntu-latest + needs: [collab-check] + environment: ${{ needs.collab-check.outputs.approval-env }} + steps: + - run: echo "Workflow Approved! Starting Fortress Security Scan." 
+ + fortress-scan: + runs-on: ubuntu-latest + needs: [wait-for-approval] + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + + - name: Run Fortress Security Scan + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: ${{ github.event.repository.name }}-ci-fortress-scan + source-version-override: 'refs/pull/${{ github.event.pull_request.number }}/head^{${{ github.event.pull_request.head.sha }}}' diff --git a/.github/workflows/pr-checks-master-v2.yml b/.github/workflows/pr-checks-master-v2.yml new file mode 100644 index 0000000000..2004af5ea5 --- /dev/null +++ b/.github/workflows/pr-checks-master-v2.yml @@ -0,0 +1,97 @@ +name: Sagemaker PR Checks (Master-v2) +on: + pull_request_target: + branches: + - "master-v2" + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.head_ref }} + cancel-in-progress: true + +permissions: + id-token: write + +jobs: + collab-check: + runs-on: ubuntu-latest + outputs: + approval-env: ${{ steps.collab-check.outputs.result }} + steps: + - name: Collaborator Check + uses: actions/github-script@v7 + id: collab-check + with: + github-token: ${{ secrets.COLLAB_CHECK_TOKEN }} + result-encoding: string + script: | + try { + const res = await github.rest.repos.checkCollaborator({ + owner: context.repo.owner, + repo: context.repo.repo, + username: "${{ github.event.pull_request.user.login }}", + }); + console.log("Verifed ${{ github.event.pull_request.user.login }} is a repo collaborator. Auto Approving PR Checks.") + return res.status == "204" ? "auto-approve" : "manual-approval" + } catch (error) { + console.log("${{ github.event.pull_request.user.login }} is not a collaborator. Requiring Manual Approval to run PR Checks.") + return "manual-approval" + } + wait-for-approval: + runs-on: ubuntu-latest + needs: [collab-check] + environment: ${{ needs.collab-check.outputs.approval-env }} + steps: + - run: echo "Workflow Approved! Starting PR Checks." 
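+
+  # Once the approval gate clears, the jobs below run the codestyle/doc checks, the
+  # unit-test matrix (py39-py312, passed to CodeBuild via PY_VERSION), and the integ
+  # tests, each against the PR head SHA through aws-codebuild-run-build.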
+ codestyle-doc-tests: + runs-on: ubuntu-latest + needs: [wait-for-approval] + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + - name: Run Codestyle & Doc Tests + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: ${{ github.event.repository.name }}-ci-codestyle-doc-tests + source-version-override: 'refs/pull/${{ github.event.pull_request.number }}/head^{${{ github.event.pull_request.head.sha }}}' + unit-tests: + runs-on: ubuntu-latest + needs: [wait-for-approval] + strategy: + fail-fast: false + matrix: + python-version: ["py39","py310","py311","py312"] + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + - name: Run Unit Tests + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: ${{ github.event.repository.name }}-ci-unit-tests + source-version-override: 'refs/pull/${{ github.event.pull_request.number }}/head^{${{ github.event.pull_request.head.sha }}}' + env-vars-for-codebuild: | + PY_VERSION + env: + PY_VERSION: ${{ matrix.python-version }} + integ-tests: + runs-on: ubuntu-latest + needs: [wait-for-approval] + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + - name: Run Integ Tests + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: ${{ github.event.repository.name }}-ci-integ-tests + source-version-override: 'refs/pull/${{ github.event.pull_request.number }}/head^{${{ github.event.pull_request.head.sha }}}' diff --git a/.github/workflows/pr-checks-master.yml b/.github/workflows/pr-checks-master.yml new file mode 100644 index 0000000000..4f63ad0b9a --- /dev/null +++ b/.github/workflows/pr-checks-master.yml @@ -0,0 +1,217 @@ +name: Sagemaker PR Checks (Master) +on: + pull_request_target: + branches: + - "master" + paths: + - 'sagemaker-train/**' + - 'sagemaker-serve/**' + - 'sagemaker-mlops/**' + - 'sagemaker-core/**' + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.head_ref }} + cancel-in-progress: true + +permissions: + id-token: write + +jobs: + collab-check: + runs-on: ubuntu-latest + outputs: + approval-env: ${{ steps.collab-check.outputs.result }} + steps: + - name: Collaborator Check + uses: actions/github-script@v7 + id: collab-check + with: + github-token: ${{ secrets.COLLAB_CHECK_TOKEN }} + result-encoding: string + script: | + try { + const res = await github.rest.repos.checkCollaborator({ + owner: context.repo.owner, + repo: context.repo.repo, + username: "${{ github.event.pull_request.user.login }}", + }); + console.log("Verifed ${{ github.event.pull_request.user.login }} is a repo collaborator. Auto Approving PR Checks.") + return res.status == "204" ? "auto-approve" : "manual-approval" + } catch (error) { + console.log("${{ github.event.pull_request.user.login }} is not a collaborator. Requiring Manual Approval to run PR Checks.") + return "manual-approval" + } + wait-for-approval: + runs-on: ubuntu-latest + needs: [ collab-check ] + environment: ${{ needs.collab-check.outputs.approval-env }} + steps: + - run: echo "Workflow Approved! Starting PR Checks." 
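+
+  # detect-changes below diffs the PR head against the target branch, maps changed paths
+  # to sagemaker-core/train/serve/mlops, recursively adds every submodule that depends on
+  # a changed one (dependencies are read from each module's pyproject.toml), and emits the
+  # result as a JSON array used as the matrix for the test jobs that follow.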
+ detect-changes: + runs-on: ubuntu-latest + needs: [wait-for-approval] + outputs: + submodules: ${{ steps.check-changes.outputs.submodules }} + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + token: ${{ secrets.GH_PAT }} + ref: ${{ github.event.pull_request.base.ref }} + - name: Detect Changes + id: check-changes + run: | + set -e + + echo "Target Branch: ${{ github.event.pull_request.base.ref }}" + echo "Current Target SHA: $(git rev-parse HEAD)" + echo "PR Number: ${{ github.event.pull_request.number }}" + echo "PR Latest SHA: ${{ github.event.pull_request.head.sha }}" + + git fetch origin pull/${{ github.event.pull_request.number }}/head + CHANGES=$(git diff --name-only HEAD FETCH_HEAD) + + echo "Changed files:" + echo "$CHANGES" + + # Function to extract dependencies from pyproject.toml + get_dependencies() { + local module=$1 + grep "sagemaker-" "$module/pyproject.toml" | grep -o 'sagemaker-[a-z]*' | sort -u + } + + # Function to find all modules that depend on a given module (recursively) + find_dependents() { + local target=$1 + local all_modules=("sagemaker-core" "sagemaker-train" "sagemaker-serve" "sagemaker-mlops") + local dependents=() + + for module in "${all_modules[@]}"; do + if [ "$module" != "$target" ]; then + if get_dependencies "$module" | grep -q "^$target$"; then + dependents+=("$module") + fi + fi + done + + echo "${dependents[@]}" + } + + # Initialize set of submodules to test (using associative array) + declare -A SUBMODULES_SET + + # Function to recursively add module and all its dependents + add_module_and_dependents() { + local module=$1 + + if [ -z "${SUBMODULES_SET[$module]}" ]; then + SUBMODULES_SET["$module"]=1 + echo "Adding $module to test set" + + # Find all modules that depend on this one and add them recursively + local dependents=$(find_dependents "$module") + for dependent in $dependents; do + add_module_and_dependents "$dependent" + done + fi + } + + # Check which submodules changed and add them plus their dependents + if echo "$CHANGES" | grep -q "^sagemaker-core/"; then + echo "sagemaker-core changed - will add core and all dependents" + add_module_and_dependents "sagemaker-core" + fi + + if echo "$CHANGES" | grep -q "^sagemaker-train/"; then + echo "sagemaker-train changed - will add train and all dependents" + add_module_and_dependents "sagemaker-train" + fi + + if echo "$CHANGES" | grep -q "^sagemaker-serve/"; then + echo "sagemaker-serve changed - will add serve and all dependents" + add_module_and_dependents "sagemaker-serve" + fi + + if echo "$CHANGES" | grep -q "^sagemaker-mlops/"; then + echo "sagemaker-mlops changed - will add mlops" + add_module_and_dependents "sagemaker-mlops" + fi + + # Convert associative array to JSON array + SUBMODULES='[]' + for submodule in "${!SUBMODULES_SET[@]}"; do + if [ "$SUBMODULES" = '[]' ]; then + SUBMODULES="[\"$submodule\"]" + else + SUBMODULES=$(echo $SUBMODULES | sed "s/\]$/,\"$submodule\"\]/") + fi + done + + echo "Final SUBMODULES: $SUBMODULES" + echo "submodules=$SUBMODULES" >> $GITHUB_OUTPUT + + codestyle-doc-tests: + runs-on: ubuntu-latest + needs: [detect-changes] + if: needs.detect-changes.outputs.submodules != '[]' + strategy: + fail-fast: false + matrix: + submodule: ${{ fromJson(needs.detect-changes.outputs.submodules) }} + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + + - name: Run CodeBuild for ${{ matrix.submodule }} 
+ uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: ${{ github.event.repository.name }}-ci-${{ matrix.submodule }}-codestyle-doc-tests + source-version-override: 'refs/pull/${{ github.event.pull_request.number }}/head^{${{ github.event.pull_request.head.sha }}}' + + unit-tests: + runs-on: ubuntu-latest + needs: [detect-changes] + if: needs.detect-changes.outputs.submodules != '[]' + strategy: + fail-fast: false + matrix: + submodule: ${{ fromJson(needs.detect-changes.outputs.submodules) }} + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + + - name: Run Unit Tests for ${{ matrix.submodule }} + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: ${{ github.event.repository.name }}-ci-${{ matrix.submodule }}-unit-tests + source-version-override: 'refs/pull/${{ github.event.pull_request.number }}/head^{${{ github.event.pull_request.head.sha }}}' + + integ-tests: + runs-on: ubuntu-latest + needs: [detect-changes] + if: needs.detect-changes.outputs.submodules != '[]' + strategy: + fail-fast: false + matrix: + submodule: ${{ fromJson(needs.detect-changes.outputs.submodules) }} + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + + - name: Run Integ Tests for ${{ matrix.submodule }} + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: ${{ github.event.repository.name }}-ci-${{ matrix.submodule }}-integ-tests + source-version-override: 'refs/pull/${{ github.event.pull_request.number }}/head^{${{ github.event.pull_request.head.sha }}}' diff --git a/.github/workflows/security-monitoring.yml b/.github/workflows/security-monitoring.yml new file mode 100644 index 0000000000..8e44b426c2 --- /dev/null +++ b/.github/workflows/security-monitoring.yml @@ -0,0 +1,121 @@ +name: Security Monitoring + +on: + schedule: + - cron: '0 16 * * *' + +concurrency: + group: ${{ github.workflow }}-${{ github.run_id }} + cancel-in-progress: true + +permissions: + id-token: write + +jobs: + check-code-scanning-alerts: + runs-on: ubuntu-latest + outputs: + code_scanning_alert_status: ${{ steps.check-code-scanning-alerts.outputs.code_scanning_alert_status }} + steps: + - name: Check for security alerts + id: check-code-scanning-alerts + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea + with: + github-token: ${{ secrets.GH_PAT }} + script: | + async function checkAlerts() { + const owner = '${{ github.repository_owner }}'; + const repo = '${{ github.event.repository.name }}'; + const ref = 'refs/heads/master'; + + const codeScanningAlerts = await github.rest.codeScanning.listAlertsForRepo({ + owner, + repo, + ref: ref + }); + const activeCodeScanningAlerts = codeScanningAlerts.data.filter(alert => alert.state === 'open'); + core.setOutput('code_scanning_alert_status', activeCodeScanningAlerts.length > 0 ? 
'1': '0'); + } + await checkAlerts(); + + check-dependabot-alerts: + runs-on: ubuntu-latest + outputs: + dependabot_alert_status: ${{ steps.check-dependabot-alerts.outputs.dependabot_alert_status }} + steps: + - name: Check for dependabot alerts + id: check-dependabot-alerts + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea + with: + github-token: ${{ secrets.GH_PAT }} + script: | + async function checkAlerts() { + const owner = '${{ github.repository_owner }}'; + const repo = '${{ github.event.repository.name }}'; + + const dependabotAlerts = await github.rest.dependabot.listAlertsForRepo({ + owner, + repo, + headers: { + 'accept': 'applications/vnd.github+json' + } + }); + const activeDependabotAlerts = dependabotAlerts.data.filter(alert => alert.state === 'open'); + core.setOutput('dependabot_alert_status', activeDependabotAlerts.length > 0 ? '1': '0'); + } + await checkAlerts(); + + check-secret-scanning-alerts: + runs-on: ubuntu-latest + outputs: + secret_scanning_alert_status: ${{ steps.check-secret-scanning-alerts.outputs.secret_scanning_alert_status }} + steps: + - name: Check for secret scanning alerts + id: check-secret-scanning-alerts + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea + with: + github-token: ${{ secrets.GH_PAT }} + script: | + async function checkAlerts() { + const owner = '${{ github.repository_owner }}'; + const repo = '${{ github.event.repository.name }}'; + + const secretScanningAlerts = await github.rest.secretScanning.listAlertsForRepo({ + owner, + repo, + }); + const activeSecretScanningAlerts = secretScanningAlerts.data.filter(alert => alert.state === 'open'); + core.setOutput('secret_scanning_alert_status', activeSecretScanningAlerts.length > 0 ? '1': '0'); + } + await checkAlerts(); + + put-metric-data: + runs-on: ubuntu-latest + needs: [check-code-scanning-alerts, check-dependabot-alerts, check-secret-scanning-alerts] + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@12e3392609eaaceb7ae6191b3f54bbcb85b5002b + with: + role-to-assume: ${{ secrets.MONITORING_ROLE_ARN }} + aws-region: us-west-2 + - name: Put Code Scanning Alert Metric Data + run: | + if [ "${{ needs.check-code-scanning-alerts.outputs.code_scanning_alert_status }}" == "1" ]; then + aws cloudwatch put-metric-data --metric-name CodeScanningAlert --namespace SecurityMonitoringMetrics --value 1 --unit Count --dimensions ProjectName=sagemaker-python-sdk + else + aws cloudwatch put-metric-data --metric-name CodeScanningAlert --namespace SecurityMonitoringMetrics --value 0 --unit Count --dimensions ProjectName=sagemaker-python-sdk + fi + - name: Put Dependabot Alert Metric Data + run: | + if [ "${{ needs.check-dependabot-alerts.outputs.dependabot_alert_status }}" == "1" ]; then + aws cloudwatch put-metric-data --metric-name DependabotAlert --namespace SecurityMonitoringMetrics --value 1 --unit Count --dimensions ProjectName=sagemaker-python-sdk + else + aws cloudwatch put-metric-data --metric-name DependabotAlert --namespace SecurityMonitoringMetrics --value 0 --unit Count --dimensions ProjectName=sagemaker-python-sdk + fi + - name: Put Secret Scanning Alert Metric Data + run: | + if [ "${{ needs.check-secret-scanning-alerts.outputs.secret_scanning_alert_status }}" == "1" ]; then + aws cloudwatch put-metric-data --metric-name SecretScanningAlert --namespace SecurityMonitoringMetrics --value 1 --unit Count --dimensions ProjectName=sagemaker-python-sdk + else + aws cloudwatch put-metric-data --metric-name 
SecretScanningAlert --namespace SecurityMonitoringMetrics --value 0 --unit Count --dimensions ProjectName=sagemaker-python-sdk + fi \ No newline at end of file diff --git a/.gitignore b/.gitignore index 5b496055e9..b4f0c0d6c6 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ build src/*.egg-info .cache -.coverage +.coverage* sagemaker_venv* *.egg-info .tox @@ -13,15 +13,20 @@ dist/ **/*.pyc **.pyc scratch*.py +scratch/ .eggs *.egg examples/tensorflow/distributed_mnist/data *.iml + +# Sphinx documentation +docs/_build/ doc/_build doc/_static doc/_templates **/.DS_Store venv/ +.venv/ *~ .pytest_cache/ *.swp @@ -29,4 +34,12 @@ venv/ env/ .vscode/ **/tmp -.python-version \ No newline at end of file +.python-version +*.html +**/_repack_script_launcher.sh +sagemaker_train/src/**/container_drivers/sm_train.sh +sagemaker_train/src/**/container_drivers/sourcecode.json +sagemaker_train/src/**/container_drivers/distributed.json +.kiro +docs/api/generated/ +.hypothesis diff --git a/.pydocstylerc b/.pydocstylerc deleted file mode 100644 index a5083c0d63..0000000000 --- a/.pydocstylerc +++ /dev/null @@ -1,4 +0,0 @@ -[pydocstyle] -inherit = false -ignore = D104,D107,D202,D203,D213,D214,D400,D401,D404,D406,D407,D411,D413,D414,D415,D417 -match = (?!record_pb2).*\.py diff --git a/.pylintrc b/.pylintrc index 9c16afcc22..223580f4d3 100644 --- a/.pylintrc +++ b/.pylintrc @@ -42,7 +42,7 @@ unsafe-load-any-extension=no # A comma-separated list of package or module names from where C extensions may # be loaded. Extensions are loading into the active Python interpreter and may # run arbitrary code -extension-pkg-whitelist=numpy +extension-pkg-allow-list=numpy,math,_struct,_hashlib # Allow optimization of some AST trees. This will activate a peephole AST # optimizer, which will apply various small optimizations. For instance, it can @@ -94,6 +94,24 @@ disable= useless-object-inheritance, # TODO: Enable this check and fix code once Python 2 is no longer supported. super-with-arguments, raise-missing-from, + C0116, # Missing function or method docstring + C0209, # Use f-string instead of format + E0015, # Unrecognized option found in config + E0702, # Raising a string instead of an exception + E1101, # Module has no member (likely dynamic attr) + E1136, # Value assigned to something inferred as None + R0022, # Useless option value in config + R1710, # Inconsistent return statements + R1714, # Consider using `in` with comparisons + R1729, # Use a generator + R1732, + R1735, # Consider using a dict or list literal + W0237, # Argument renamed in override + W0613, # Unused argument + W0621, # Redefining name from outer scope + W0719 + W1404, # Implicit string concatenation + W1514, # `open()` used without encoding [REPORTS] # Set the output format. Available formats are text, parseable, colorized, msvs @@ -309,7 +327,7 @@ ignore-mixin-members=yes # (useful for modules/projects where namespaces are manipulated during runtime # and thus existing member attributes cannot be deduced by static analysis. It # supports qualified module names, as well as Unix pattern matching. -ignored-modules=distutils +ignored-modules= # List of class names for which member attributes should not be checked (useful # for classes with dynamically set attributes). This supports the use of @@ -383,7 +401,7 @@ max-returns=6 max-branches=12 # Maximum number of statements in function / method body -max-statements=100 +max-statements=105 # Maximum number of parents for a class (see R0901). 
max-parents=7 @@ -435,4 +453,4 @@ analyse-fallback-blocks=no # Exceptions that will emit a warning when being caught. Defaults to # "Exception" -overgeneral-exceptions=Exception +overgeneral-exceptions=builtins.Exception diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000000..528ee18626 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,30 @@ +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.10" + +sphinx: + configuration: docs/conf.py + +formats: [] + +python: + install: + - requirements: docs/requirements.txt + - method: pip + path: ./sagemaker-core + - method: pip + path: ./sagemaker-train + - method: pip + path: ./sagemaker-serve + - method: pip + path: ./sagemaker-mlops + - method: pip + path: . + extra_requirements: + - docs \ No newline at end of file diff --git a/.readthedocs.yml b/.readthedocs.yml deleted file mode 100644 index b84167fa1c..0000000000 --- a/.readthedocs.yml +++ /dev/null @@ -1,16 +0,0 @@ -# ReadTheDocs environment customization to allow us to use conda to install -# libraries which have C dependencies for the doc build. See: -# https://docs.readthedocs.io/en/latest/config-file/v2.html - -version: 2 - -python: - version: 3.6 - install: - - method: pip - path: . - - requirements: doc/requirements.txt - -sphinx: - configuration: doc/conf.py - fail_on_warning: true # http://www.sphinx-doc.org/en/master/man/sphinx-build.html#id6 diff --git a/07-ml-model-development(1).ipynb b/07-ml-model-development(1).ipynb new file mode 100644 index 0000000000..471b642cee --- /dev/null +++ b/07-ml-model-development(1).ipynb @@ -0,0 +1,1260 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "95cfbd26-7fa9-4dd7-9367-c27e3c9e03bb", + "metadata": {}, + "source": [ + "# Leveraging Lakehouse data with Amazon SageMaker XGBoost and AutoML\n", + "_**Supervised learning with MLFlow logging of experiments**_\n", + "\n", + "---\n", + "\n", + "---\n", + "\n", + "## Contents\n", + "\n", + "1. [Background](#Background)\n", + "1. [Prepration](#Preparation)\n", + "1. [Data Preparation](#DataPreparation)\n", + "1. [Training XGBoost](#XGBoost)\n", + "1. [Training AutoML](#AutoML)\n", + "1. [Deployment and inference test](#Deployment_and_inference_test)\n", + "1. [Evaluation](#Evaluation)\n", + "\n", + "---\n", + "\n", + "## Background\n", + "One of the key advantages of the new SageMaker AI Unified Studio is its ability to integrate data from multiple sources. In this notebook, we'll walk through an example of bringing data from a Lakehouse to train models using XGBoost and AutoML. We'll also leverage the power of MLFlow servers to capture and analyze the training data.\n", + "\n", + "This notebook demonstrates how to predict a customer's purchase potential based on a set of features. 
We'll go through the following steps:\n", + "\n", + "* Setting up your Amazon SageMaker AI notebook\n", + "* Querying data sources using Athena\n", + "* Transforming the data to feed into Amazon SageMaker algorithms\n", + "* Training a model using the Gradient Boosting algorithm (XGBoost)\n", + "* Launching an AutoML task to target the same feature\n", + "* Utilizing MLFlow to capture and visualize experiment data\n", + "\n", + "---\n", + "\n", + "## Preparation\n", + "\n", + "Let's start by bringing in the Python libraries that we'll use throughout the notebook:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d777a36-e56e-467a-a69e-817c57fee926", + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "import pandas as pd\n", + "import numpy as np\n", + "import logging\n", + "import sagemaker\n", + "import mlflow\n", + "import os\n", + "from datetime import datetime, timezone\n", + "from sagemaker.modules import Session\n", + "from sagemaker_studio import Project" + ] + }, + { + "cell_type": "markdown", + "id": "1aa126ff-222b-4796-b14f-e8044a6e361d", + "metadata": {}, + "source": [ + "Now, let's set up our logging and specify the necessary configurations:\n", + "\n", + "1. Configure the logging we'll use, including the ARN of the MLFlow server we've set up in the prerequisite\n", + "2. Specify the S3 bucket and prefix for storing training and model data\n", + "3. Set up the IAM role ARN to provide necessary permissions for training and hosting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "daba65bb-43b8-4e54-92e9-9c7e33073559", + "metadata": {}, + "outputs": [], + "source": [ + "logging.basicConfig(level=logging.INFO)\n", + "logger = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "markdown", + "id": "bd639c27-1a78-4507-b98f-cdb418ab87fe", + "metadata": {}, + "source": [ + "### Copy MLFlow Tracking Server ARN\n", + "\n", + "Copy/Paste the ARN of your Project MLFlow Tracking Server. You can find it by navigating to the Project->Compute page, then selecting \"MLFlow Tracking Server\" tab. 
Select the `Copy ARN` button" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff4d6000-5e19-4c0c-a6ab-7a5455317b61", + "metadata": {}, + "outputs": [], + "source": [ + "project = Project()\n", + "#mlflow_arn = project.mlflow_tracking_server_arn\n", + "\n", + "# Cut/Paste the ARN from the Tracking Server instance\n", + "## mlflow_arn = \"arn:aws:sagemaker:us-west-2:767398116961:mlflow-tracking-server/tracking-server-blogxwjruvstqo-cv0wvz63pbj11s-dev\"\n", + "mlflow_arn = \"COPY_TRACKING_SERVER_ARN_HERE\"\n", + "print(f\"ARN: {mlflow_arn}\")\n", + "\n", + "mlflow.set_tracking_uri(mlflow_arn)" + ] + }, + { + "cell_type": "markdown", + "id": "b96facd5", + "metadata": {}, + "source": [ + "One of the added benefit of SageMaker Unified Studio is the use of Project to bring resources" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff00e488-7591-4446-9444-788d9df49f66", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize AWS session\n", + "session = boto3.Session()\n", + "bucket_root = project.s3.root\n", + "role = project.iam_role\n", + "\n", + "# Parse the S3 URI\n", + "s3_parts = bucket_root.replace(\"s3://\", \"\").split(\"/\")\n", + "bucket = s3_parts[0]\n", + "prefix = \"/\".join(s3_parts[1:])\n", + "\n", + "## If you prefer NOT using the new SageMaker AI Project framework, here is an alternative\n", + "#session = sagemaker.Session()\n", + "#bucket = session.default_bucket()\n", + "#from sagemaker import get_execution_role\n", + "#role = get_execution_role()\n", + "#sagemaker_client = boto3.Session().client(service_name='sagemaker',region_name=region)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7310961-92a5-4bef-a3c8-57a5cd11a1ee", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Using Bucket: {bucket}\")\n", + "print(f\"Using prefix: {prefix}\")\n", + "print(f\"Using Role: {role}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4a51b428", + "metadata": {}, + "source": [ + "Now, let's retrieve the name of the project's database through the default catalog:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea29364f-1c0d-4ddd-8f24-5068c06f615b", + "metadata": {}, + "outputs": [], + "source": [ + "# A good example of the Project class is getting the name of the project's database through the default catalog\n", + "catalog = project.connection().catalog()\n", + "project_database = catalog.databases[0].name\n", + "project_database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf429cae-55ac-4db9-819e-272462ef217d", + "metadata": {}, + "outputs": [], + "source": [ + "# Note: If your account has more than one Catalog, use this code to lookup names\n", + "id = 0\n", + "for db in catalog.databases:\n", + " print(f\"Index {id}: {db}\")\n", + " id += 1" + ] + }, + { + "cell_type": "markdown", + "id": "0eccfa13-9ec6-445a-85a6-f153bcb3ab0d", + "metadata": {}, + "source": [ + "### Data Preparation\n", + "\n", + "First, we need to upload the data file named \"5000-sales-records.csv\". If you are running this notebook from the **Sagemaker Unified Studio Workshop**, this file can be downloaded from the instructions page. Next, we can upload the file using the S3 Browser on the Project->Data page. Once the file is successfully uploaded, open the S3 Console, and locate the file by navigating to the folder prefix where you uploaded it. 
(Note: file uploads can normally be found under the `local-uploads` prefix).\n", + "\n", + "From the S3 console, select the file \"5000-sales-records.csv\" and hit \"Copy S3 URI\" button. Then paste the URI within the quotes in the read_csv() call below. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc19b755-4880-4dba-8f23-fd7b490db2db", + "metadata": {}, + "outputs": [], + "source": [ + "# Using pandas to read CSV directly from S3 URI\n", + "\n", + "# Example:\n", + "# data = pd.read_csv(\"s3://csv-file-store-72f9fec0/dzd_d22v67c8i2tzv4/blogxwjruvstqo/dev/local-uploads/1756921917470/5000-sales-records.csv\")\n", + "data = pd.read_csv(\"COPY_S3_URI_HERE\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d866bb71-4f54-49fc-ad67-023f3668f32f", + "metadata": {}, + "outputs": [], + "source": [ + "# Rename columns to match Spark Dataframe infer\n", + "data.rename(columns={\n", + " \"Region\": \"region\",\n", + " \"Country\": \"country\",\n", + " \"Item Type\": \"item type\",\n", + " \"Sales Channel\": \"sales channel\",\n", + " \"Order Priority\": \"order priority\",\n", + " \"Order Date\": \"order date\",\n", + " \"Order ID\": \"order id\",\n", + " \"Ship Date\": \"ship date\",\n", + " \"Units Sold\": \"units sold\",\n", + " \"Unit Price\": \"unit price\",\n", + " \"Unit Cost\": \"unit cost\",\n", + " \"Total Revenue\": \"total revenue\",\n", + " \"Total Cost\": \"total cost\",\n", + " \"Total Profit\": \"total profit\",\n", + " }, \n", + " inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "605c2be3-077b-4b8c-8882-6deac8a398da", + "metadata": {}, + "outputs": [], + "source": [ + "# Dump Dataframe metadata\n", + "logger.info(f\"DataFrame shape: {data.shape}\")\n", + "logger.info(\"\\nDataFrame info:\")\n", + "logger.info(data.info())" + ] + }, + { + "cell_type": "markdown", + "id": "c43a504a-3657-4b56-b2f5-62c7a93d636f", + "metadata": {}, + "source": [ + "Now that we have our data queried and available, let's prepare it for our machine learning models. We'll perform the following steps:\n", + "\n", + "1. Split the data into features (X) and target variable (y)\n", + "2. Handle any missing values\n", + "3. Encode categorical variables\n", + "4. Scale numerical features\n", + "5. Split the data into training and testing sets\n", + "\n", + "Let's start by preparing our feature matrix and target variable:" + ] + }, + { + "cell_type": "markdown", + "id": "9e4367eb-6c81-4b70-ada8-edb8136dd71f", + "metadata": {}, + "source": [ + "Amazon SageMaker's XGBoost container expects data in the libSVM or CSV data format. For this example, we'll stick to CSV. Note that the first column must be the target variable and the CSV should not include headers. Also, notice that although repetitive it's easiest to do this after the train|validation|test split rather than before. This avoids any misalignment issues due to random reordering." 
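As a quick, minimal sketch of that layout (assuming the `X_train`/`y_train` frames produced by the split in the next cell; the `save_for_sagemaker_xgboost` helper used later in this notebook additionally converts boolean columns and fills missing values):

```python
import pandas as pd

# Target column first, features after, no header and no index --
# the CSV layout the SageMaker XGBoost container expects.
train_csv = pd.concat(
    [y_train.reset_index(drop=True), X_train.reset_index(drop=True)], axis=1
)
train_csv.to_csv("train.csv", header=False, index=False)
```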
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1c68276-84cc-42f2-bc80-24fc1305797a", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "def process_data(data: pd.DataFrame):\n", + " \"\"\"\n", + " Process and prepare data for modeling\n", + " \"\"\"\n", + " # Create copy to avoid modifying original\n", + " df = data.copy()\n", + " \n", + " # Drop 'order id' column\n", + " df = df.drop('order id', axis=1)\n", + " \n", + " # Convert date columns to datetime and extract features\n", + " date_columns = ['order date', 'ship date']\n", + " for col in date_columns:\n", + " df[col] = pd.to_datetime(df[col])\n", + " df[f'{col}_year'] = df[col].dt.year\n", + " df[f'{col}_month'] = df[col].dt.month\n", + " df[f'{col}_quarter'] = df[col].dt.quarter\n", + " \n", + " # Drop original date columns\n", + " df = df.drop(columns=date_columns)\n", + " \n", + " # Create lag features for 'total revenue'\n", + " for i in range(1, 4):\n", + " df[f'revenue_lag_{i}'] = df.groupby(['item type', 'sales channel'])['total revenue'].shift(i)\n", + " \n", + " # Drop rows with NaN values\n", + " df = df.dropna()\n", + " \n", + " # Convert categorical variables to dummy variables\n", + " categorical_columns = ['region', 'country', 'item type', 'sales channel', 'order priority']\n", + " df_encoded = pd.get_dummies(df, columns=categorical_columns)\n", + " \n", + " # Prepare features and target\n", + " target_column = 'total profit' # Assuming 'total profit' is the target variable\n", + " numeric_columns = ['units sold', 'unit price', 'unit cost', 'total revenue', 'total cost']\n", + " feature_columns = [col for col in df_encoded.columns if col != target_column]\n", + " X = df_encoded[feature_columns]\n", + " y = df_encoded[target_column].astype(float)\n", + " \n", + " # Train-test-validation split\n", + " X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=1729)\n", + " X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.33, random_state=1729)\n", + " \n", + " # Scale numeric features\n", + " scaler = StandardScaler()\n", + " numeric_features = [col for col in X_train.columns if col in numeric_columns + \n", + " ['order date_year', 'order date_month', 'order date_quarter',\n", + " 'ship date_year', 'ship date_month', 'ship date_quarter'] +\n", + " [f'revenue_lag_{i}' for i in range(1, 4)]]\n", + " \n", + " X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])\n", + " X_val[numeric_features] = scaler.transform(X_val[numeric_features])\n", + " X_test[numeric_features] = scaler.transform(X_test[numeric_features])\n", + " \n", + " return X_train, X_val, X_test, y_train, y_val, y_test, feature_columns, scaler\n", + "\n", + "# Process the data\n", + "X_train, X_val, X_test, y_train, y_val, y_test, feature_columns, scaler = process_data(data)\n", + "\n", + "# Print some information about the processed data\n", + "print(\"\\nProcessed data shape:\", X_train.shape)\n", + "print(\"\\nFirst few rows of processed data:\")\n", + "print(X_train.head())\n", + "print(X_train.shape)\n", + "print(X_train.info())\n", + "print(\"\\nColumn names:\")\n", + "print(X_train.columns.tolist())\n", + "\n", + "# Verify target variable\n", + "print(\"\\nSummary statistics of the target variable:\")\n", + "print(y_train.describe())" + ] + }, + { + "cell_type": "markdown", + 
"id": "d4dd59f1-27b1-4d25-8f55-8c8f3f422e51", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Training XGBoost\n", + "\n", + "### Option 1: Using the SageMaker Decorator\n", + "Now we know that most of our features have skewed distributions, some are highly correlated with one another, and some appear to have non-linear relationships with our target variable. Also, for targeting future prospects, good predictive accuracy is preferred to being able to explain why that prospect was targeted. Taken together, these aspects make gradient boosted trees a good candidate algorithm.\n", + "\n", + "There are several intricacies to understanding the algorithm, but at a high level, gradient boosted trees works by combining predictions from many simple models, each of which tries to address the weaknesses of the previous models. By doing this the collection of simple models can actually outperform large, complex models. Other Amazon SageMaker notebooks elaborate on gradient boosting trees further and how they differ from similar algorithms.\n", + "\n", + "`xgboost` is an extremely popular, open-source package for gradient boosted trees. It is computationally powerful, fully featured, and has been successfully used in many machine learning competitions. \n", + "\n", + "Let's train a first version of XGBoost using this open-source library and using SageMaker's @remote decorator. You can use the @remote decorator to annotate a function. SageMaker AI will transform the code inside the decorator into a SageMaker training job. \n", + "\n", + "Note how we log various parameters, metrics, tags, and artifacts to MLflow. When the training is finished, don't forget to open up MLflow to take a look at the experiment results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "569c1c16", + "metadata": {}, + "outputs": [], + "source": [ + "import xgboost as xgb\n", + "import os\n", + "import joblib\n", + "from sagemaker.remote_function import remote\n", + "\n", + "\n", + "def train_model(X_train, y_train, X_val, y_val):\n", + " \"\"\"\n", + " Train XGBoost model\n", + " \"\"\"\n", + " # Initialize model\n", + " model = xgb.XGBRegressor(\n", + " n_estimators=100,\n", + " learning_rate=0.1,\n", + " max_depth=5,\n", + " random_state=42\n", + " )\n", + " \n", + " # Train model\n", + " model.fit(\n", + " X_train, \n", + " y_train,\n", + " eval_set=[(X_val, y_val)],\n", + " verbose=False\n", + " )\n", + " \n", + " return model\n", + "\n", + "\n", + "@remote(job_name_prefix=\"xgboost-sales-forecast\", \n", + " instance_type=\"ml.m5.large\", \n", + " keep_alive_period_in_seconds=600,)\n", + "def model_train(X_train, y_train, X_val, y_val, mlflow_arn):\n", + " \"\"\"\n", + " Main function to orchestrate the model training process\n", + " \"\"\"\n", + " mlflow.set_tracking_uri(mlflow_arn)\n", + " mlflow.set_experiment(\"XG-Boost\")\n", + " \n", + " with mlflow.start_run(run_name=f\"xgboost-decorator-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}\"):\n", + " # Log information about the data\n", + " mlflow.log_param(\"train_samples\", len(X_train))\n", + " mlflow.log_param(\"val_samples\", len(X_val))\n", + " mlflow.log_param(\"features\", X_train.shape[1])\n", + " \n", + " # Train model\n", + " model = train_model(X_train, y_train, X_val, y_val)\n", + " \n", + " # Log model parameters\n", + " params = model.get_params()\n", + " mlflow.log_params(params)\n", + " \n", + " # Log validation results\n", + " results = model.evals_result()\n", + " for epoch, rmse_value in 
enumerate(results['validation_0']['rmse']):\n", + " mlflow.log_metric('train_rmse', rmse_value, step=epoch)\n", + "\n", + " # Log final metrics\n", + " final_rmse = results['validation_0']['rmse'][-1]\n", + " best_rmse = min(results['validation_0']['rmse'])\n", + " best_epoch = results['validation_0']['rmse'].index(best_rmse)\n", + " mlflow.log_metrics({\n", + " 'final_rmse': final_rmse,\n", + " 'best_rmse': best_rmse,\n", + " 'best_epoch': best_epoch\n", + " })\n", + "\n", + " # Set tags for the run\n", + " mlflow.set_tag(\"model_type\", \"XGBoost\")\n", + " mlflow.set_tag(\"framework\", \"OSS\")\n", + " \n", + " # Infer model signature and register model\n", + " predictions = model.predict(X_val)\n", + " signature = mlflow.models.infer_signature(X_train, predictions)\n", + " mlflow.xgboost.log_model(model, \"model\", registered_model_name=\"xgboost-lib-regression\", signature=signature)\n", + " \n", + " # Save model\n", + " path = \"/opt/ml/model\"\n", + " joblib.dump(model, os.path.join(path, 'revenue_forecast_model.joblib'))\n", + " return model, predictions\n", + "\n", + "# Run the training\n", + "xgb_model, output = model_train(X_train, y_train, X_val, y_val, mlflow_arn)" + ] + }, + { + "cell_type": "markdown", + "id": "0b9f781a", + "metadata": {}, + "source": [ + "### Option 2: Using SageMaker's built-in algorithm\n", + "\n", + "Amazon SageMaker also has a managed, distributed [training framework for XGBoost](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html). This section shows how you can use train this version of XGBoost. Instead of using the @remote decorator, this section shows how to use the [ModelTrainer](https://sagemaker.readthedocs.io/en/stable/api/training/model_trainer.html) SDK to create a training job.\n", + "\n", + "First we must adjust the data to be suitable for this version of XGBoost." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b9f7f37", + "metadata": {}, + "outputs": [], + "source": [ + "def save_for_sagemaker_xgboost(X, y, filename):\n", + " \"\"\"\n", + " Save data in a format compatible with SageMaker's XGBoost algorithm.\n", + " \"\"\"\n", + " # Combine target and features\n", + " data = pd.concat([y.reset_index(drop=True), X.reset_index(drop=True)], axis=1)\n", + " \n", + " # Convert boolean columns to int\n", + " bool_columns = data.select_dtypes(include=['bool']).columns\n", + " data[bool_columns] = data[bool_columns].astype(int)\n", + " \n", + " # Ensure all data is numeric\n", + " data = data.apply(pd.to_numeric, errors='coerce')\n", + " \n", + " # Replace any remaining non-numeric values with 0\n", + " data = data.fillna(0)\n", + " \n", + " # Save to csv without header and index\n", + " data.to_csv(filename, header=False, index=False)\n", + " print(f\"Data saved to {filename}\")\n", + "\n", + "# Combine train and validation sets\n", + "X_train_full = pd.concat([X_train, X_val])\n", + "y_train_full = pd.concat([y_train, y_val])\n", + "\n", + "# Save training data (including validation data)\n", + "save_for_sagemaker_xgboost(X_train_full, y_train_full, 'train.csv')\n", + "\n", + "# Save test data\n", + "save_for_sagemaker_xgboost(X_test, y_test, 'test.csv')\n", + "\n", + "# Print some information about the saved files\n", + "print(\"\\nTrain file info:\")\n", + "print(pd.read_csv('train.csv', header=None).info())\n", + "\n", + "print(\"\\nTest file info:\")\n", + "print(pd.read_csv('test.csv', header=None).info())\n", + "\n", + "# Verify first few rows of each file\n", + "print(\"\\nFirst few rows of train.csv:\")\n", + "print(pd.read_csv('train.csv', header=None).head())\n", + "\n", + "print(\"\\nFirst few rows of test.csv:\")\n", + "print(pd.read_csv('test.csv', header=None).head())\n" + ] + }, + { + "cell_type": "markdown", + "id": "b6b83856-154e-4789-add8-db58b27e41a6", + "metadata": {}, + "source": [ + "In the cell above, we performed preprocessing on our dataset and produced output files \"train.csv\" and \"test.csv\". Next, we'll upload these local files to our S3 location." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da67f22f-ff96-4609-a197-68abc2a18877", + "metadata": {}, + "outputs": [], + "source": [ + "session.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train_xgboost/train.csv')).upload_file('train.csv')\n", + "session.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'test_xgboost/test.csv')).upload_file('test.csv')\n", + "session.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation_xgboost/test.csv')).upload_file('test.csv')" + ] + }, + { + "cell_type": "markdown", + "id": "aa0e05af-81dc-472f-9f57-3d0fb85f663b", + "metadata": {}, + "source": [ + "We'll need to specify the ECR container location for Amazon SageMaker's implementation of XGBoost." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43fd0ef4-388a-4354-94b6-396cecbe34cb", + "metadata": {}, + "outputs": [], + "source": [ + "container = sagemaker.image_uris.retrieve(region=boto3.Session().region_name, framework='xgboost', version='latest')" + ] + }, + { + "cell_type": "markdown", + "id": "9e92e1f5-905c-4202-993f-b9d07f24f33c", + "metadata": {}, + "source": [ + "Then, because we're training with the CSV file format, we'll create `TrainingInput` objects that our training function can use as a pointer to the files in S3, which also specify that the content type is CSV." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e64769cd-f2ff-4c1c-94f5-e488f8c649fa", + "metadata": {}, + "outputs": [], + "source": [ + "s3_input_train = sagemaker.inputs.TrainingInput(\n", + " s3_data='s3://{}/{}/train_xgboost/'.format(bucket, prefix),\n", + " content_type='csv'\n", + ")\n", + "s3_input_test = sagemaker.inputs.TrainingInput(\n", + " s3_data='s3://{}/{}/test_xgboost/'.format(bucket, prefix),\n", + " content_type='csv'\n", + ")\n", + "s3_input_validation = sagemaker.inputs.TrainingInput(\n", + " s3_data='s3://{}/{}/validation_xgboost/'.format(bucket, prefix),\n", + " content_type='csv'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "7e4db3d1-a6e8-46ea-9a6e-35339e4b9ef5", + "metadata": {}, + "source": [ + "First we'll need to specify training parameters to the estimator. This includes:\n", + "1. The `xgboost` algorithm container\n", + "1. The IAM role to use\n", + "1. Training instance type and count\n", + "1. S3 location for output data\n", + "1. Algorithm hyperparameters\n", + "\n", + "And then a `.fit()` function which specifies the input data. In this case we have both a training and validation set which are passed in." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "003300a9-c6c3-4459-bedc-c9697da03940", + "metadata": {}, + "outputs": [], + "source": [ + "from mlflow.models import infer_signature\n", + "\n", + "sm_session = sagemaker.Session()\n", + "xgb_estimator = sagemaker.estimator.Estimator(container,\n", + " role, \n", + " instance_count=1, \n", + " instance_type='ml.m5.xlarge',\n", + " output_path=f's3://{bucket}/{prefix}/output',\n", + " sagemaker_session=sm_session)\n", + "\n", + "hyperparameters = {\n", + " \"max_depth\": 6,\n", + " \"eta\": 0.2,\n", + " \"gamma\": 4,\n", + " \"min_child_weight\": 8,\n", + " \"subsample\": 0.6,\n", + " \"verbosity\": 0,\n", + " \"objective\": \"reg:linear\",\n", + " \"num_round\": 75,\n", + "}\n", + "xgb_estimator.set_hyperparameters(**hyperparameters)\n", + "\n", + "mlflow.set_experiment(\"XG-Boost\")\n", + "with mlflow.start_run(run_name=f\"xgboost-builtin-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}\"):\n", + " # Log the hyperparameters\n", + " mlflow.log_params(hyperparameters)\n", + "\n", + " # Fit the model and capture training metrics\n", + " xgb_estimator.fit({'train': s3_input_train, 'validation': s3_input_test})\n", + " \n", + " # Get the training job name\n", + " job_name = xgb_estimator.latest_training_job.job_name\n", + " \n", + " # Get the training job description\n", + " client = sm_session.boto_session.client('sagemaker')\n", + " training_job_description = client.describe_training_job(TrainingJobName=job_name)\n", + " \n", + " # Extract and log metrics\n", + " for metric in training_job_description['FinalMetricDataList']:\n", + " metric_name = metric['MetricName']\n", + " metric_value = metric['Value']\n", + " mlflow.log_metric(metric_name, metric_value)\n", + "\n", + " # Set tags for the run\n", + " mlflow.set_tag(\"model_type\", \"XGBoost\")\n", + " mlflow.set_tag(\"framework\", \"SageMaker\")\n", + "\n", + " # Register the model\n", + " mlflow.register_model(f\"runs:/{mlflow.active_run().info.run_id}/model\", \"xgboost-sm-regression\")\n", + "\n", + " print(f\"Model saved in run {mlflow.active_run().info.run_uuid}\")" + ] + }, + { + "cell_type": "markdown", + "id": "62bfccde-d940-424e-903c-bf1122c38714", + "metadata": {}, + "source": [ + "Now that you have successfully completed the training of the XGBoost model and SageMaker Autopilot job 
on the dataset, you can deploy xgboost and create a model from any of the candidates by using [Inference Pipelines](https://docs.aws.amazon.com/sagemaker/latest/dg/inference-pipelines.html)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "693f1424-7cac-411f-82f0-836630ba6bb4", + "metadata": {}, + "outputs": [], + "source": [ + "xgb_predictor = xgb_estimator.deploy(initial_instance_count=1,\n", + " instance_type='ml.m5.xlarge')" + ] + }, + { + "cell_type": "markdown", + "id": "022d9db7-594f-48f1-84e8-b5123d4aede3", + "metadata": {}, + "source": [ + "First we'll need to determine how we pass data into and receive data from our endpoint. Our data is currently stored as NumPy arrays in memory of our notebook instance. To send it in an HTTP POST request, we'll serialize it as a CSV string and then decode the resulting CSV.\n", + "\n", + "*Note: For inference with CSV format, SageMaker XGBoost requires that the data does NOT include the target variable.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "225ae8c7-e079-48e6-9672-f5252182a4c4", + "metadata": {}, + "outputs": [], + "source": [ + "xgb_predictor.serializer = sagemaker.serializers.CSVSerializer()" + ] + }, + { + "cell_type": "markdown", + "id": "c89f8de3-4bb4-4c93-9c03-414506a4366f", + "metadata": { + "execution": { + "iopub.execute_input": "2025-01-06T19:29:20.445680Z", + "iopub.status.busy": "2025-01-06T19:29:20.444866Z", + "iopub.status.idle": "2025-01-06T19:29:20.487983Z", + "shell.execute_reply": "2025-01-06T19:29:20.486542Z", + "shell.execute_reply.started": "2025-01-06T19:29:20.445642Z" + } + }, + "source": [ + "Now, we'll use a simple function to:\n", + "1. Loop over our test dataset\n", + "1. Split it into mini-batches of rows \n", + "1. Convert those mini-batches to CSV string payloads (notice, we drop the target variable from our dataset first)\n", + "1. Retrieve mini-batch predictions by invoking the XGBoost endpoint\n", + "1. Collect predictions and convert from the CSV output our model provides into a NumPy array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9cb267f7-8ca8-4d33-afac-52aebf6d2fbc", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the CSV file\n", + "test_data = pd.read_csv('test.csv')\n", + "\n", + "def predict(data, predictor, rows=500):\n", + " split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))\n", + " predictions = []\n", + " \n", + " for array in split_array:\n", + " # Convert numpy array to CSV string\n", + " csv = '\\n'.join([','.join(map(str, row)) for row in array])\n", + " \n", + " # Get predictions\n", + " prediction = predictor.predict(csv)\n", + " \n", + " # Decode and convert to numpy array\n", + " prediction_array = np.fromstring(prediction.decode('utf-8'), sep=',')\n", + " \n", + " predictions.append(prediction_array)\n", + " \n", + " # Concatenate all predictions\n", + " return np.concatenate(predictions)\n", + "\n", + "# Print column names and data info for debugging\n", + "print(\"Original columns:\", test_data.columns)\n", + "print(test_data.info())\n", + "\n", + "# Remove the first column (target variable)\n", + "X_test = test_data.iloc[:, 1:]\n", + "\n", + "# Print shape to confirm\n", + "print(\"Shape of X_test:\", X_test.shape)\n", + "print(\"Columns of X_test:\", X_test.columns)\n", + "\n", + "# Ensure we have the correct number of features\n", + "if X_test.shape[1] != 224:\n", + " print(f\"Warning: You have {X_test.shape[1]} features. 
The model expects 224.\")\n", + " print(\"Current features:\", X_test.columns.tolist())\n", + " # If needed, you can manually select the expected 224 feature columns:\n", + " # X_test = X_test[['feature1', 'feature2', ..., 'feature224']]\n", + "\n", + "# Now you can use X_test in your predict function\n", + "predictions = predict(X_test.values, xgb_predictor)\n" + ] + }, + { + "cell_type": "markdown", + "id": "c5868b0c-991d-4519-b899-ab43f30a7b3c", + "metadata": {}, + "source": [ + "Now we'll output a score for the predictions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3717de0-f579-4051-b83f-d7c94623af6f", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n", + "\n", + "# Extract the actual values (first column)\n", + "y_true = test_data.iloc[:, 0]\n", + "\n", + "# Ensure predictions are in the same format as y_true\n", + "y_pred = predictions # This should be the predictions from your previous cell\n", + "\n", + "# Make sure y_true and y_pred have the same length\n", + "assert len(y_true) == len(y_pred), \"Mismatch in length between actual and predicted values\"\n", + "\n", + "# Calculate evaluation metrics\n", + "mse = mean_squared_error(y_true, y_pred)\n", + "rmse = np.sqrt(mse)\n", + "mae = mean_absolute_error(y_true, y_pred)\n", + "r2 = r2_score(y_true, y_pred)\n", + "\n", + "print(f\"Mean Squared Error: {mse:.4f}\")\n", + "print(f\"Root Mean Squared Error: {rmse:.4f}\")\n", + "print(f\"Mean Absolute Error: {mae:.4f}\")\n", + "print(f\"R-squared Score: {r2:.4f}\")\n", + "\n", + "# If you want to see a sample of the actual vs predicted values\n", + "comparison_df = pd.DataFrame({'Actual': y_true, 'Predicted': y_pred})\n", + "print(\"\\nSample of Actual vs Predicted values:\")\n", + "print(comparison_df.head(10))\n", + "\n", + "# If you want to plot the actual vs predicted values\n", + "import matplotlib.pyplot as plt\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "plt.scatter(y_true, y_pred, alpha=0.5)\n", + "plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)\n", + "plt.xlabel('Actual Values')\n", + "plt.ylabel('Predicted Values')\n", + "plt.title('Actual vs Predicted Values')\n", + "plt.show()\n", + "\n", + "# Calculate and print additional statistics\n", + "print(\"\\nAdditional Statistics:\")\n", + "print(f\"Mean of Actual Values: {y_true.mean():.4f}\")\n", + "print(f\"Mean of Predicted Values: {y_pred.mean():.4f}\")\n", + "print(f\"Standard Deviation of Actual Values: {y_true.std():.4f}\")\n", + "print(f\"Standard Deviation of Predicted Values: {y_pred.std():.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "69a4699d-d734-447e-9b87-23d0c33df7b3", + "metadata": {}, + "source": [ + "#### Optional: Hyperparameter Tuning Job\n", + "\n", + "We can optionally run a Hyperparameter Optimization (HPO) Job to improve our results. This will run multiple training jobs with different values for the hyperparameters based on the ranges specified below. The HPO job automatically chooses new parameter values based on the results of previous jobs, eventually leading to better results."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4dc4dd4b-440c-4e2b-9a14-1d3b929fc70e", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.tuner import HyperparameterTuner\n", + "from sagemaker.parameter import ContinuousParameter, IntegerParameter\n", + "\n", + "\n", + "hyperparameter_ranges = {\n", + " \"max_depth\": IntegerParameter(5, 10),\n", + " \"eta\": ContinuousParameter(0.001, 0.3),\n", + " \"min_child_weight\": IntegerParameter(5, 10),\n", + " \"subsample\": ContinuousParameter(0.3, 0.8),\n", + " \"num_round\": IntegerParameter(50, 100),\n", + "}\n", + "\n", + "objective_metric_name = \"validation:rmse\"\n", + "\n", + "tuner = HyperparameterTuner(\n", + " xgb_estimator,\n", + " objective_metric_name,\n", + " hyperparameter_ranges,\n", + " objective_type=\"Minimize\",\n", + " strategy=\"Bayesian\",\n", + " max_jobs=10,\n", + " max_parallel_jobs=5,\n", + ") " + ] + }, + { + "cell_type": "markdown", + "id": "2b4075b5-df7b-4666-86f8-3f44f42aeadc", + "metadata": {}, + "source": [ + "When we run the HPO job, we can log the results of the hyperparameter tuning job and each child training job into MLflow. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f90281c8-a5b7-47f7-bf1d-884eaf742045", + "metadata": {}, + "outputs": [], + "source": [ + "def format_param_range(param_range):\n", + " formatted_param_range = {\n", + " f\"{param_range['Name']}_min_value\": param_range['MinValue'],\n", + " f\"{param_range['Name']}_max_value\": param_range['MaxValue']\n", + " }\n", + " return formatted_param_range\n", + "\n", + "\n", + "mlflow.set_experiment(\"XG-Boost-HPO\")\n", + "with mlflow.start_run(run_name=f\"xgboost-hpo-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}\"):\n", + " tuner.fit({\"train\": s3_input_train, \"validation\": s3_input_validation})\n", + " tuner_descr = tuner.describe()\n", + " \n", + " # Log parameters relevant to overall tuning job\n", + " mlflow.log_params(tuner_descr['HyperParameterTuningJobConfig']['ResourceLimits'])\n", + " mlflow.log_param('Strategy', tuner_descr['HyperParameterTuningJobConfig']['Strategy'])\n", + " mlflow.log_params(tuner_descr['TrainingJobDefinition']['StaticHyperParameters'])\n", + " for integer_param in tuner_descr['HyperParameterTuningJobConfig']['ParameterRanges']['IntegerParameterRanges']:\n", + " mlflow.log_params(format_param_range(integer_param))\n", + " for continuous_param in tuner_descr['HyperParameterTuningJobConfig']['ParameterRanges']['ContinuousParameterRanges']:\n", + " mlflow.log_params(format_param_range(continuous_param))\n", + " for categorical_param in tuner_descr['HyperParameterTuningJobConfig']['ParameterRanges']['CategoricalParameterRanges']:\n", + " mlflow.log_params(format_param_range(categorical_param))\n", + " mlflow.log_param('BestTrainingJobName', tuner_descr['BestTrainingJob']['TrainingJobName'])\n", + "\n", + " # Set tags for the run\n", + " mlflow.set_tag(\"model_type\", \"XGBoost\")\n", + " mlflow.set_tag(\"framework\", \"SageMaker\")\n", + "\n", + " # Log parameters and metrics for each training job\n", + " train_summaries = tuner.analytics().training_job_summaries()\n", + " for train_results in train_summaries:\n", + " with mlflow.start_run(run_name=train_results['TrainingJobName'], nested=True):\n", + " mlflow.log_params(train_results['TunedHyperParameters'])\n", + " mlflow.log_metric(train_results['FinalHyperParameterTuningJobObjectiveMetric']['MetricName'],\n", + " train_results['FinalHyperParameterTuningJobObjectiveMetric']['Value'])\n", + " 
mlflow.set_tag(\"model_type\", \"XGBoost\")\n", + " mlflow.set_tag(\"framework\", \"SageMaker\")" + ] + }, + { + "cell_type": "markdown", + "id": "eb14d426-4ba2-4525-ae60-25124ca22a43", + "metadata": {}, + "source": [ + "It is also easy to deploy the best model from the hyperparameter tuning job. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c816a536-bc81-4c2f-a166-d450e0f270dd", + "metadata": {}, + "outputs": [], + "source": [ + "tuner.deploy(\n", + " initial_instance_count=1, \n", + " instance_type='ml.m5.xlarge'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "af854c3c-7b28-440d-bfb9-601307e11a6d", + "metadata": {}, + "source": [ + "---\n", + "## AutoML Training\n", + "\n", + "Amazon SageMaker Autopilot is an automated machine learning (commonly referred to as AutoML) solution for tabular datasets. You can use SageMaker Autopilot in different ways: on autopilot (hence the name) or with human guidance, without code through SageMaker Studio, or using the AWS SDKs. This notebook, as a first glimpse, will use the AWS SDKs to simply create and deploy a machine learning model.\n", + "\n", + "This part of the notebook demonstrates how you can use Autopilot on this dataset to get the most accurate ML pipeline through exploring a number of potential options, or \"candidates\". Each candidate generated by Autopilot consists of two steps. The first step performs automated feature engineering on the dataset and the second step trains and tunes an algorithm to produce a model. When you deploy this model, it follows similar steps: feature engineering followed by inference to produce a prediction. The notebook contains instructions on how to train the model as well as how to deploy it to generate predictions on new data. Where possible, the notebook uses the Amazon SageMaker Python SDK, a high-level SDK, to simplify the way you interact with Amazon SageMaker." + ] + }, + { + "cell_type": "markdown", + "id": "9f8d2871-85a2-42cc-b07b-ed1837ce96a6", + "metadata": {}, + "source": [ + "First, we need to upload the entire dataset to S3.\n", + "\n", + "Caution: Before running the cell below, you must upload the data file \"5000-sales-records.csv\" to the local directory!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5866a843-88d1-4206-b05c-11ebca7972b1", + "metadata": {}, + "outputs": [], + "source": [ + "# Upload data for AutoML Job\n", + "\n", + "session.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train_automl/train.csv')).upload_file('5000-sales-records.csv')\n", + "session.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'test_automl/test.csv')).upload_file('5000-sales-records.csv')" + ] + }, + { + "cell_type": "markdown", + "id": "a687e5c8-c205-4884-bd9e-785045258cb5", + "metadata": {}, + "source": [ + "### AutoML Configuration\n", + "\n", + "You can specify the type of problem you want to solve with your dataset (`Regression, MulticlassClassification, BinaryClassification`). In case you are not sure, SageMaker Autopilot will infer the problem type based on statistics of the target column (the column you want to predict). \n", + "\n", + "You have the option to limit the running time of a SageMaker Autopilot job by providing either the maximum number of pipeline evaluations or candidates (one pipeline evaluation is called a `Candidate` because it generates a candidate model) or providing the total time allocated for the overall Autopilot job. 
Under default settings, this job takes about four hours to run. This varies between runs because of the nature of the exploratory process Autopilot uses to find optimal training parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63c0bed7-21d6-48cb-9584-3e9f5c899304", + "metadata": {}, + "outputs": [], + "source": [ + "from time import gmtime, strftime, sleep\n", + "import json\n", + "import mlflow\n", + "import boto3\n", + "\n", + "# Set up MLflow experiment\n", + "mlflow.set_experiment(\"AutoML-Job\")\n", + "\n", + "# Start MLflow run\n", + "with mlflow.start_run(run_name=\"AutoML-Job-Run\"):\n", + "\n", + " input_data_config = [{\n", + " 'DataSource': {\n", + " 'S3DataSource': {\n", + " 'S3DataType': 'S3Prefix',\n", + " 'S3Uri': f's3://{bucket}/{prefix}/train_automl'\n", + " }\n", + " },\n", + " 'ContentType': 'text/csv;header=present',\n", + " 'TargetAttributeName': 'Total Profit'\n", + " }]\n", + "\n", + " output_data_config = {\n", + " 'S3OutputPath': f's3://{bucket}/{prefix}/output_automl'\n", + " }\n", + "\n", + " auto_ml_job_config = {\n", + " 'CompletionCriteria': {\n", + " 'MaxCandidates': 5\n", + " }\n", + " }\n", + "\n", + " autoMLJobObjective = {\n", + " \"MetricName\": \"MSE\" \n", + " }\n", + "\n", + " # Log configurations to MLflow\n", + " mlflow.log_dict(input_data_config, \"input_data_config.json\")\n", + " mlflow.log_dict(output_data_config, \"output_data_config.json\")\n", + " mlflow.log_dict(auto_ml_job_config, \"auto_ml_job_config.json\")\n", + " mlflow.log_dict(autoMLJobObjective, \"autoMLJobObjective.json\")\n", + "\n", + " # Configuration\n", + " timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())\n", + " auto_ml_job_name = 'demo' + timestamp_suffix\n", + " print('AutoMLJobName: ' + auto_ml_job_name)\n", + " mlflow.log_param(\"AutoMLJobName\", auto_ml_job_name)\n", + "\n", + " # Create AutoML job\n", + " sm = boto3.client('sagemaker')\n", + " sm.create_auto_ml_job(\n", + " AutoMLJobName=auto_ml_job_name,\n", + " InputDataConfig=input_data_config,\n", + " OutputDataConfig=output_data_config,\n", + " AutoMLJobConfig=auto_ml_job_config,\n", + " AutoMLJobObjective=autoMLJobObjective,\n", + " ProblemType=\"Regression\", \n", + " RoleArn=role \n", + " )\n", + "\n", + " # Wait for the AutoML job to complete\n", + " while True:\n", + " response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n", + " status = response['AutoMLJobStatus']\n", + " if status in ['Completed', 'Failed', 'Stopped']:\n", + " break\n", + " print(f\"AutoML job status: {status}\")\n", + " sleep(60)\n", + "\n", + " # Log final job status\n", + " mlflow.log_param(\"FinalJobStatus\", status)\n", + "\n", + " if status == 'Completed':\n", + " # Log best candidate info\n", + " best_candidate = response['BestCandidate']\n", + " mlflow.log_dict(best_candidate, \"best_candidate.json\")\n", + " \n", + " # Log objective metric\n", + " objective_metric = best_candidate['FinalAutoMLJobObjectiveMetric']\n", + " mlflow.log_metric(objective_metric['MetricName'], objective_metric['Value'])\n", + "\n", + " # Log other metrics if available\n", + " if 'CandidateProperties' in best_candidate:\n", + " for metric in best_candidate['CandidateProperties'].get('Metrics', []):\n", + " mlflow.log_metric(metric['MetricName'], metric['Value'])\n", + "\n", + " print(f\"AutoML job {auto_ml_job_name} finished with status: {status}\")\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69c7bac7-f750-4263-aea7-73a8e2d17a25", + "metadata": {}, + "outputs": [], + 
"source": [ + "sagemaker_client = boto3.client('sagemaker')\n", + "\n", + "best_candidate = sagemaker_client.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['BestCandidate']\n", + "best_candidate_name = best_candidate['CandidateName']\n", + "print(best_candidate)\n", + "print('\\n')\n", + "\n", + "print(\"CandidateName: \" + best_candidate_name)\n", + "print(\"FinalAutoMLJobObjectiveMetricName: \" + best_candidate['FinalAutoMLJobObjectiveMetric']['MetricName'])\n", + "print(\"FinalAutoMLJobObjectiveMetricValue: \" + str(best_candidate['FinalAutoMLJobObjectiveMetric']['Value']))" + ] + }, + { + "cell_type": "markdown", + "id": "956b70d5", + "metadata": {}, + "source": [ + "### Create Model for best candidate\n", + "\n", + "When the AutoML job has finished running, you can easily create a SageMaker Model object using the best candidate." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82f69a99-7d29-426b-a147-db6222ebbfda", + "metadata": {}, + "outputs": [], + "source": [ + "from time import gmtime, strftime\n", + "\n", + "timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())\n", + "\n", + "model_name = 'demo-' + timestamp_suffix\n", + "print(f\"Model name: {model_name}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1167ba0b-5f3c-4b70-8692-ba5a567c9da8", + "metadata": {}, + "outputs": [], + "source": [ + "# Create Model\n", + "model = sagemaker_client.create_model(\n", + " Containers=best_candidate['InferenceContainers'],\n", + " ModelName=model_name,\n", + " ExecutionRoleArn=role\n", + ")\n", + "\n", + "print('Model ARN corresponding to the best candidate is : {}'.format(model['ModelArn']))" + ] + }, + { + "cell_type": "markdown", + "id": "27b99203-019b-4c16-aca5-856e6751c192", + "metadata": {}, + "source": [ + "### View other candidates\n", + "You can view all the candidates (pipeline evaluations with different hyperparameter combinations) that were explored by SageMaker Autopilot and sort them by their final performance metric." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a8d4b63-4ef1-4f53-899e-69882a98ca07", + "metadata": {}, + "outputs": [], + "source": [ + "candidates = sagemaker_client.list_candidates_for_auto_ml_job(\n", + " AutoMLJobName=auto_ml_job_name, SortBy='FinalObjectiveMetricValue')['Candidates']\n", + "\n", + "index = 1\n", + "for candidate in candidates:\n", + " print (str(index) + \" \" + candidate['CandidateName'] + \" \" + str(candidate['FinalAutoMLJobObjectiveMetric']['Value']))\n", + " index += 1" + ] + }, + { + "cell_type": "markdown", + "id": "cac7ef2d-0bb3-4a59-a3e7-bfaef07757ec", + "metadata": {}, + "source": [ + "## Cleanup\n", + "\n", + "The Autopilot job creates many underlying artifacts such as dataset splits, preprocessing scripts, or preprocessed data, etc. This code, when un-commented, deletes them. This operation deletes all the generated models and the auto-generated notebooks as well. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08ba43ea-6c41-4a60-8730-ff4327e35428", + "metadata": {}, + "outputs": [], + "source": [ + "#s3 = boto3.resource('s3')\n", + "#bucket = s3.Bucket(bucket)\n", + "\n", + "#job_outputs_prefix = '{}/output/{}'.format(prefix,auto_ml_job_name)\n", + "#bucket.objects.filter(Prefix=job_outputs_prefix).delete()\n", + "# xgb_predictor.delete_endpoint(delete_endpoint_config=True)\n", + "# tuner.delete_endpoint(delete_endpoint_config=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/CHANGELOG.md b/CHANGELOG.md index d211500561..90c7b7330e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,3422 @@ # Changelog +## v3.9.0 (2026-04-23) + +### New Features +- **Train**: Add `wait_timeout` parameter to `train()` for SFT, DPO, RLAIF, RLVR, and BaseTrainer +- **Evaluate**: Add MLflow experiment link to eval output +- **JumpStart**: Allow `SAGEMAKER_HUB_NAME` environment variable to override the `HUB_NAME` constant + +### Bug Fixes +- **HyperparameterTuner**: Pass through full `OutputDataConfig` from `ModelTrainer` so `kms_key_id`, `compression_type`, and other fields are preserved +- **HyperparameterTuner / ModelTrainer**: Propagate environment variables that were previously dropped +- **sagemaker-core**: Improve error messages for waiter timeouts +- **ModelBuilder**: Stop overwriting user-provided `HF_MODEL_ID` for DJL Serving +- **ModelBuilder**: Keep `/opt/ml/model` writable when using `source_code` with DJL LMI +- **Evaluate**: Skip `None` hyperparameters in `to_dict` instead of converting them to the string `"None"` +- **Nova**: Add `us-west-2` to Nova supported regions +- **DJL LMI**: Update ISO account mappings + +## v3.8.0 (2026-04-16) + +### New Features +- **Feature Group Manager**: Feature Group Manager support +- **Image Upgrades**: Image upgrades + +### Bug Fixes +- **ModelBuilder**: Add MLFlowConfig to Base Model +- **Docker**: Support for docker compose > v2 +- **HuggingFace**: Improve SDK v3 Hugging Face support +- **Dependencies**: Remove Pytorch hard dependency + +## v3.7.1 (2026-03-31) + +### Features +- **Telemetry**: Added telemetry emitter to `ScriptProcessor` and `FrameworkProcessor`, enabling SDK usage tracking for processing jobs via the telemetry attribution module (new `PROCESSING` feature enum added to telemetry constants) + +### Fixes +- **ModelBuilder**: Fixed `accept_eula` handling in ModelBuilder's LoRA deployment path — previously hardcoded to `True`, now respects the user-provided value and raises a `ValueError` if not explicitly set to `True` +- **Evaluate**: Fixed Lambda handler name derivation in the Evaluator — hardcoded the handler to `lambda_function.lambda_handler` instead of deriving it from the source filename, which caused invocation failures when the source file had a non-default name + +## v3.7.0 (2026-03-25) + +### Fixes +- **ModelBuilder**: Sync Nova hosting configs with AGISageMakerInference (#5664) +- **Evaluate**: Remove GPT OSS model evaluation restriction (#5658) + +### Features +- **AWS Batch**: Add support for Quota Management job submission and job priority update (#5659) +- 
**AWS Batch**: Extend list_jobs_by_share for quota_share_name (#5669) +- **Evaluate**: Support IAM role for BaseEvaluator (#5671) +- **Telemetry**: Add telemetry attribution module for SDK usage provenance (#5661) +- **MLflow**: Metrics visualization, enhanced wait UI, and eval job links (#5662) + +### Chores +- Updated SDK to use latest LMIv22 image for v3.x (#5640) +- Migration guide update (#5655) +- AWS Batch integ test resources are now uniquely named by test run (#5666) + +## v3.6.0 (2026-03-19) + +### Fixes +- **HyperparameterTuner**: Include sm_drivers channel in HyperparameterTuner jobs (#5516) +- **Pipeline**: Fix handling of training step dependencies to allow successful pipeline creation +- **ModelBuilder**: Fix the bug in deploy from LORA finetuning job + +### Features +- **Feature Processor**: Port feature processor to v3 +- **Jumpstart**: Add EUSC region config for JumpStart + +## v3.5.0 (2026-03-02) + +### Features +- **Feature Store v3**: New version of Feature Store functionality +- **Batch job listing by share identifier**: Added support for listing Batch jobs filtered by share identifier +- **Stop condition for model customization trainers**: Added stopping condition support to model customization trainers +- **EMRStep smart output**: Enhanced EMR step output handling with smart output capabilities +- **Transform AMI version support**: Added support for specifying AMI version in SageMaker transform jobs + +### Enhancements +- **Inference pipeline notebook example**: Added example notebook demonstrating inference pipeline usage +- **Migration documentation**: Added migration documentation + +### Bug Fixes +- **Model Customization bugs**: Fixed multiple issues in Model Customization functionality +- **Default stopping condition removal**: Removed default stopping condition for MC trainer to prevent conflicts +- **Instance groups parameter handling**: Fixed issue where default instance_type/instance_count were incorrectly applied when instance_groups was set +- **JumpStart alt config resolution**: Resolved alternative configuration resolution for JumpStart models +- **Inference processor naming**: Updated inference processor identifier from 'inf2' to 'neuronx' +- **HuggingFace Neuronx PyTorch version**: Corrected the PyTorch version for HuggingFace Neuronx +- **License additions**: Added license to sagemaker-mlops and sagemaker-serve packages + +## v3.4.1 (2026-02-10) + +### Fixes +- **Pipelines**: Correct Tag class usage in pipeline creation (#5526) +- **ModelTrainer**: Support PipelineVariables in hyperparameters (#5519) +- **HyperparameterTuner**: Include ModelTrainer internal channels (#5516) +- **Experiments**: Don't apply default experiment config for pipelines in non-Eureka GA +regions (#5500) + +### Features +- **JumpStart**: Added ISO regions support (#5505) +- **JumpStart**: Added version 1.4 and 1.5 (#5538) + +### Chores +- Added unit and integration tests for JumpStart search functionality (#5544) +- Removed java-kotlin from CodeQL workflow (#5517) + +## v3.4.0 (2026-01-22) + +### Features + - feat: add emr-serverless step for SageMaker Pipelines + +### Bug fixes and Other Changes + - Add Nova recipe training support in ModelTrainer + - Add Partner-app Auth provider + - Add sagemaker dependency for remote function by default V3 + + +## v3.3.1 (2026-01-12) +### Bug fixes and Other Changes + * ProcessingJob fix - Remove tags in Processor while Job creation + * Telemetry Updates + * sagemaker-mlops bug fix - Correct source code 'dependencies' parameter to 
'requirements' + * aws_batch bug fix - remove experiment config parameter as it Estimator is deprecated. + + +## v3.3.0 (2025-12-19) + +### Features + * AWS_Batch: queueing of training jobs with ModelTrainer +### Bug fixes and Other Changes + * Fixes for model registry with ModelBuilder + +## v3.2.0 (2025-12-18) + +### Features + * Evaluator handshake with trainer + * Datasets Format validation +### Bug fixes and Other Changes + * Add xgboost 3.0-5 to release + * Fix get_child_process_ids parsing issue + +## v3.1.1 (2025-12-10) + +### Bug fixes and Other Changes +* Add validation to bedrock reward models +* Hyperparameter issue fixes, Add validation s3 output path +* Fix the recipe selection for multiple recipe scenario +* Train wait() timeout exception handling +* Update example notebooks to reflect recent code changes +* Update `model_package_group_name` param to `model_package_group` in finetuning interfaces +* remove `dataset` param for benchmark evaluator + +## v3.1.0 (2025-12-03) + +### Features + +* Fine-tuning SDK: SFT, RLVR, and RLAIF techniques with standardized parameter design +* AIRegistry Integration: Added CRUD operations for datasets and evaluators +* Enhanced Training Experience: Implemented MLFlow metrics tracking and deployment workflows + +## v3.0.1 (2025-11-19) + +* Update project dependencies to include submodules: sagemaker-core, sagemaker-train, sagemaker-serve, sagemaker-mlops + +## v3.0.0 (2025-11-19) + +### Major Version Release + +#### ⚠️ Breaking Changes + +#### Important: Please review these breaking changes before upgrading. + +* Version 3.0.0 represents a significant milestone in our product's evolution. This major release introduces a modernized architecture, enhanced performance, and powerful new features while maintaining our commitment to user experience and reliability. +* Older interfaces such as Estimator, Model, Predictor and all their subclasses will not be supported in V3. +* Please review documentation of interfaces for parameters support (especially ModelBuilder) + +## v2.254.1 (2025-10-31) + +### Bug Fixes and Other Changes + + * update get_execution_role to directly return the ExecutionRoleArn if it presents in the resource metadata file + * [hf] HF PT Training DLCs + +## v2.254.0 (2025-10-29) + +### Features + + * Triton v25.09 DLC + +### Bug Fixes and Other Changes + + * Add Numpy 2.0 support + * add HF Optimum Neuron DLCs + * [Hugging Face][Pytorch] Inference DLC 4.51.3 + * [hf] HF Inference TGI + +## v2.253.1 (2025-10-14) + +### Bug Fixes and Other Changes + + * Update instance type regex to also include hyphens + * Revert the change "Add Numpy 2.0 support" + * [hf-tei] add image uri to utils + * add TEI 1.8.2 + +## v2.253.0 (2025-10-10) + +### Features + + * Added condition to allow eval recipe. 
+ * add model_type hyperparameter support for Nova recipes + +### Bug Fixes and Other Changes + + * Fix for a failed slow test: numpy fix + * Add numpy 2.0 support + * chore: domain support for eu-isoe-west-1 + * Adding default identity implementations to InferenceSpec + * djl regions fixes #5273 + * Fix flaky integ test + +## v2.252.0 (2025-09-29) + +### Features + + * change S3 endpoint env name + * add eval custom lambda arn to hyperparameters + +### Bug Fixes and Other Changes + + * merge rba without the iso region changes + * handle trial component status message longer than API supports + * Add nova custom lambda in hyperparameter from estimator + * add retryable option to emr step in SageMaker Pipelines + * Feature/js mlops telemetry + * latest tgi + +## v2.251.1 (2025-08-29) + +### Bug Fixes and Other Changes + + * chore: onboard tei 1.8.0 + +## v2.251.0 (2025-08-21) + +### Features + + * support pipeline versioning + +### Bug Fixes and Other Changes + + * GPT OSS Hotfix + * dockerfile stuck on interactive shell + * add sleep for model deployment + +## v2.250.0 (2025-08-08) + +### Features + + * Add support for InstancePlacementConfig in Estimator for training jobs running on ultraserver capacity + +### Bug Fixes and Other Changes + + * Add more constraints to test requirements + +## v2.249.0 (2025-07-31) + +### Features + + * AWS Batch for SageMaker Training jobs + +### Bug Fixes and Other Changes + + * Directly use customer-provided endpoint name for ModelBuilder deployment. + * update image_uri_configs 07-23-2025 07:18:25 PST + +## v2.248.2 (2025-07-22) + +### Bug Fixes and Other Changes + + * Relax boto3 version requirement + * update image_uri_configs 07-22-2025 07:18:25 PST + * update image_uri_configs 07-18-2025 07:18:28 PST + * add hard dependency on sagemaker-core pypi lib + * When rootlessDocker is enabled, return a fixed SageMaker IP + +## v2.248.1 (2025-07-16) + +### Bug Fixes and Other Changes + + * Nova training support + +## v2.248.0 (2025-07-15) + +### Features + + * integrate amtviz for visualization of tuning jobs + +### Bug Fixes and Other Changes + + * build(deps): bump requests in /tests/data/serve_resources/mlflow/pytorch + * build(deps): bump protobuf from 4.25.5 to 4.25.8 in /requirements/extras + * build(deps): bump mlflow in /tests/data/serve_resources/mlflow/xgboost + * build(deps): bump torch in /tests/data/modules/script_mode + * sanitize git clone repo input url + * Adding Hyperpod feature to enable hyperpod telemetry + * Adding Hyperpod feature to enable hyperpod telemetry + * Bump SMD version to enable custom workflow deployment. 
+ * Update TF DLC python version to py312 + * update image_uri_configs 07-04-2025 07:18:27 PST + * update image_uri_configs 06-26-2025 07:18:35 PST + * relax protobuf to <6.32 + +## v2.247.1 (2025-06-23) + +### Bug Fixes and Other Changes + + * update image_uri_configs 06-19-2025 07:18:34 PST + +## v2.247.0 (2025-06-13) + +### Features + + * Add support for MetricDefinitions in ModelTrainer + +### Bug Fixes and Other Changes + + * update jumpstart region_config, update image_uri_configs 06-12-2025 07:18:12 PST + * Add ignore_patterns in ModelTrainer to ignore specific files/folders + * Allow import failure for internal _hashlib module + +## v2.246.0 (2025-06-04) + +### Features + + * Triton v25.04 DLC + +### Bug Fixes and Other Changes + + * Update Attrs version to widen support + * update estimator documentation regarding hyperparameters for source_dir + +## v2.245.0 (2025-05-28) + +### Features + + * Correct mypy type checking through PEP 561 + +### Bug Fixes and Other Changes + + * MLFLow update for dependabot + * addWaiterTimeoutHandling + * merge method inputs with class inputs + * update image_uri_configs 05-20-2025 07:18:17 PST + +## v2.244.2 (2025-05-19) + +### Bug Fixes and Other Changes + + * include model channel for gated uncompressed models + * clarify model monitor one time schedule bug + * update jumpstart region_config 05-15-2025 07:18:15 PST + * update image_uri_configs 05-14-2025 07:18:16 PST + * Add image configs and region config for TPE (ap-east-2) + * Improve defaults handling in ModelTrainer + +## v2.244.1 (2025-05-15) + +### Bug Fixes and Other Changes + + * Fix Flask-Limiter version + * Fix test_huggingface_tei_uris() + * huggingface-llm-neuronx dlc + * huggingface-neuronx dlc image_uri + * huggingface-tei dlc image_uri + * Fix test_deploy_with_update_endpoint() + * add AG v1.3 + * parameter mismatch in update_endpoint + * remove --strip-component for untar source tar.gz + * Fix type annotations + * chore: Allow omegaconf >=2.2,<3 + * honor json serialization of HPs + * Map llama models to correct script + * pin test dependency + * fix bad initialization script error message + * Improve error logging and documentation for issue 4007 + * build(deps): bump scikit-learn + * build(deps): bump mlflow + * build(deps): bump mlflow in /tests/data/serve_resources/mlflow/pytorch + * chore: Add tei 1.6.0 image + +## v2.244.0 (2025-05-02) + +### Features + + * support custom workflow deployment in ModelBuilder using SMD image. 
+ +### Bug Fixes and Other Changes + + * Add Owner ID check for bucket with path when prefix is provided + * Add model server timeout + * pin mamba version to 24.11.3-2 to avoid inconsistent test runs + * Update ModelTrainer to support s3 uri and tar.gz file as source_dir + * chore: add huggingface images + +## v2.243.3 (2025-04-23) + +### Bug Fixes and Other Changes + + * update readme to reflect py312 upgrade + * Revert the PR changes 5122 + * Py312 upgrade step 2: Update dependencies, integ tests and unit tests + * update pr test to deprecate py38 and add py312 + * update image_uri_configs 04-16-2025 07:18:18 PST + * update image_uri_configs 04-15-2025 07:18:10 PST + * update image_uri_configs 04-11-2025 07:18:19 PST + +## v2.243.2 (2025-04-16) + +### Bug Fixes and Other Changes + + * tgi image uri unit tests + * Fix deepdiff dependencies + +## v2.243.1 (2025-04-11) + +### Bug Fixes and Other Changes + + * Added handler for pipeline variable while creating process job + * Fix issue #4856 by copying environment variables + * remove historical job_name caching which causes long job name + * Update instance gpu info + * Master + * Add mlflow tracking arn telemetry + * chore: fix semantic versioning for wildcard identifier + * flaky test + +### Documentation Changes + + * update pipelines step caching examples to include more steps + * update ModelStep data dependency info + +## v2.243.0 (2025-03-27) + +### Features + + * Enabled update_endpoint through model_builder + +### Bug Fixes and Other Changes + + * Update for PT 2.5.1, SMP 2.8.0 + * chore: move jumpstart region definitions to json file + * fix flaky clarify model monitor test + * fix flaky spark processor integ + * use temp file in unit tests + * Update transformers version + * Aligned disable_output_compression for @remote with Estimator + * Update Jinja version + * update image_uri_configs 03-26-2025 07:18:16 PST + * chore: fix integ tests to use latest version of model + * update image_uri_configs 03-25-2025 07:18:13 PST + * Skip tests failed due to deprecated instance type + * update image_uri_configs 03-21-2025 07:17:55 PST + * factor in set instance type when building JumpStart models in ModelBuilder. 
+ * ADD Documentation to ReadtheDocs for Upgrading torch versions + * add new regions to JUMPSTART_LAUNCHED_REGIONS + +## v2.242.0 (2025-03-14) + +### Features + + * add integ tests for training JumpStart models in private hub + +### Bug Fixes and Other Changes + + * Torch upgrade + * Prevent RunContext overlap between test_run tests + * remove s3 output location requirement from hub class init + * Fixing Pytorch training python version in tests + * update image_uri_configs 03-11-2025 07:18:09 PST + * resolve infinite loop in _find_config on Windows systems + * pipeline definition function doc update + +## v2.241.0 (2025-03-06) + +### Features + + * Make DistributedConfig Extensible + * support training for JumpStart model references as part of Curated Hub Phase 2 + * Allow ModelTrainer to accept hyperparameters file + +### Bug Fixes and Other Changes + + * Skip tests with deprecated instance type + * Ensure Model.is_repack() returns a boolean + * Fix error when there is no session to call _create_model_request() + * Use sagemaker session's s3_resource in download_folder + * Added check for the presence of model package group before creating one + * Fix key error in _send_metrics() + +## v2.240.0 (2025-02-25) + +### Features + + * Add support for TGI Neuronx 0.0.27 and HF PT 2.3.0 image in PySDK + +### Bug Fixes and Other Changes + + * Remove main function entrypoint in ModelBuilder dependency manager. + * forbid extras in Configs + * altconfig hubcontent and reenable integ test + * Merge branch 'master-rba' into local_merge + * py_version doc fixes + * Add backward compatbility for RecordSerializer and RecordDeserializer + * update image_uri_configs 02-21-2025 06:18:10 PST + * update image_uri_configs 02-20-2025 06:18:08 PST + +### Documentation Changes + + * Removed a line about python version requirements of training script which can misguide users. 
+ +## v2.239.3 (2025-02-19) + +### Bug Fixes and Other Changes + + * added ap-southeast-7 and mx-central-1 for Jumpstart + * update image_uri_configs 02-19-2025 06:18:15 PST + +## v2.239.2 (2025-02-18) + +### Bug Fixes and Other Changes + + * Add warning about not supporting torch.nn.SyncBatchNorm + * pass in inference_ami_version to model_based endpoint type + * Fix hyperparameter strategy docs + * Add framework_version to all TensorFlowModel examples + * Move RecordSerializer and RecordDeserializer to sagemaker.serializers and sagemaker.deserialzers + +## v2.239.1 (2025-02-14) + +### Bug Fixes and Other Changes + + * keep sagemaker_session from being overridden to None + * Fix all type hint and docstrings for callable + * Fix the workshop link for Step Functions + * Fix Tensorflow doc link + * Fix FeatureGroup docstring + * Add type hint for ProcessingOutput + * Fix sourcedir.tar.gz filenames in docstrings + * Fix documentation for local mode + * bug in get latest version was getting the max sorted alphabetically + * Add cleanup logic to model builder integ tests for endpoints + * Fixed pagination failing while listing collections + * fix ValueError when updating a data quality monitoring schedule + * Add docstring for image_uris.retrieve + * Create GitHub action to trigger canaries + * update image_uri_configs 02-04-2025 06:18:00 PST + +## v2.239.0 (2025-02-01) + +### Features + + * Add support for deepseek recipes + +### Bug Fixes and Other Changes + + * mpirun protocol - distributed training with @remote decorator + * Allow telemetry only in supported regions + * Fix ssh host policy + +## v2.238.0 (2025-01-29) + +### Features + + * use jumpstart deployment config image as default optimization image + +### Bug Fixes and Other Changes + + * chore: add new images for HF TGI + * update image_uri_configs 01-29-2025 06:18:08 PST + * skip TF tests for unsupported versions + * Merge branch 'master-rba' into local_merge + * Add missing attributes to local resourceconfig + * update image_uri_configs 01-27-2025 06:18:13 PST + * update image_uri_configs 01-24-2025 06:18:11 PST + * add missing schema definition in docs + * Omegaconf upgrade + * SageMaker @remote function: Added multi-node functionality + * remove option + * fix typo + * fix tests + * Add an option for user to remove inputs and container artifacts when using local model trainer + +## v2.237.3 (2025-01-09) + +### Bug Fixes and Other Changes + + * pin metadata-version to 2.3 + * model server might have already done a serialization. honor that by not decoding the request again if it is not already bytes or bytestream + * Disable jumpstart tests missing clean up logic + * Jumpstart ap southeast 5 + * add autogluon 1.2 + * updated inference script to cover context + * security update -> use sha256 instead of md5 for file hashing + * Fix Flake8 Violations + * Added parsing string support for situations where custom code might be used (ie. 
mlflow) + * Updating Inference Optimization Validations + +## v2.237.2 (2024-12-17) + +### Bug Fixes and Other Changes + + * update image_uri_configs 12-13-2024 17:07:12 PST + * Cloudpickle upgrade + +## v2.237.1 (2024-12-12) + +### Bug Fixes and Other Changes + + * chore: remove support for ecr spec fallbacks for jumpstart models + * Cloudpickle Revert + * Cloudpickle update + * Numpy update + * Protobuf update + * Update to fetch latest Cloudpickle version + +## v2.237.0 (2024-12-05) + +### Features + + * Support SageMakerTrainingPlan for training jobs + * AMI support for BRM + * Adding Bedrock Store model support for HubService + +### Bug Fixes and Other Changes + + * Fix unit tests + * update boto3 and sagemaker-core version + * fix gpu_image uri + * Hotfix to construct rubik uri correctly + * fix codestyles + * fix merge artifact + * fix merge artifact + * fix test_requiremenets.txt + * chore: Merge from main + +## v2.236.0 (2024-12-04) + +### Features + + * Partner App Auth Provider for SDK support + * add pre-processing and post-processing logic to inference_spec + * add utility function to capture local snapshot + * support script mode with local train.sh + +### Bug Fixes and Other Changes + + * Add graphene to doc requirements + * Add graphne to the doc requirements + * Enable the Recipe tests marked with @pytest.mark.skip(reason="Hyperpod recipe code unavailable" + * Add model trainer documentation + * Usage docs for training recipes + * Neuron URIs update + * Update URIs to public for training recipes + * Changes for SMP v2.7.0 + * Change default source directory to current, add option to specify source dir + * Remove default values for fields in recipe_overrides and fix recipe path. + * Update MANIFEST.in so that wheel builds correctly + * fix the file uploading signature verification error + * remove example notebooks artifacts + * Morpheus tests + * Integ tests for local mode model trainer + * Update hyperpod recipe uris + * Add interface units for ModelTrainer + * Model Trainer Bucket improvements + * Update ModelTrainer Interface Parameters + * add in-process mode definition to docs + * Intelligent defaults for Model Trainer + * Fix tests and codestyle + * add integ test for base_model_builder_deploy and remove print statement + * Revert image builder + * pin xgboost dlc to 1.7.1 to fix test + * Skip JS model mapping with env vars or image URI provided + * Use sagemaker core Session + * Integration tests for Model Builder Handshake + * [Updated] Add telemetry to ModelTrainer, Estimator and ModelBuilder + * Update kandinsky in ModelTrainer and allow setting requirements + * add modelID support to model builder InProcess model + * Add Rich Logging to Model Builder + * Notebooks update for Bugbash + * Add bugbash bootstrapping + * add inference morpheus nbs + * Update ModelTrainer Notebooks + * Bug fixes + * Single container local training + * update notebooks + * update notebooks + * Add recipes examples + * Unified Deployment interface in Model Builder + * Use exact python path in trainer template + * Support building image from Dockerfile + * Add Support for Training Recipes + * Trainer handshake + * Pass hyperparameters as CLI args + * Add in_process mode support for DJL and TorchServe servers + * Remove ignored files + * Simplify Config Class Names and DistributedRunner structures + * Fix bug in script mode setup ModelTrainer + * Mask Sensitive Env Logs in Container + * Add path to set Additional Settings in ModelTrainer + * Add Distributed Training Support Model Trainer 
+ * Cleanup ModelTrainer code + * Latest Container Image + * General image builder + * Cleanup ModelTrainer + * Revert Image Spec + * Support intelligent parameters + * Add enviornment variable bootstrapping script + * Add example notebook + * Add unit tests for ModelTrainer + * Image Spec refactoring and updates + * Base model trainer + +## v2.235.2 (2024-11-22) + +## v2.235.1 (2024-11-20) + +### Bug Fixes and Other Changes + + * Update sagemaker-core dep + * update image_uri_configs 11-20-2024 06:17:41 PST + +## v2.235.0 (2024-11-19) + +### Features + + * Optimize() validations across TRT, VLLM, Neuron container optimizations + +### Bug Fixes and Other Changes + + * update image_uri_configs 11-19-2024 06:17:58 PST + +## v2.234.0 (2024-11-19) + +### Features + + * optimization technique related validations. + +### Bug Fixes and Other Changes + + * Revert "change: add TGI 2.4.0 image uri (#4922)" + * pin testing deps + * add TGI 2.4.0 image uri + * add jumpstart ap-southeast-5 + * Move sagemaker-mlflow to extras + +## v2.233.0 (2024-11-04) + +### Features + + * triton v24.09 + * Marketplace model support in HubService + +### Bug Fixes and Other Changes + + * Fixing JumpStart Tests + * bumping smp version from 2.6.0 to 2.6.1 + * Updates for DJL 0.30.0 release + +## v2.232.3 (2024-10-30) + +### Bug Fixes and Other Changes + + * update image_uri_configs 10-29-2024 07:17:56 PST + * Skip pytorch tests incompatible with latest version 2.4.0 + * adding eu-central-2 bucket info to JS constants + * update image_uri_configs 10-23-2024 11:26:03 PST + * update image_uri_configs 10-17-2024 07:17:55 PST + * update image_uri_configs 10-03-2024 07:17:59 PST + * update image_uri_configs 09-27-2024 07:18:01 PST + * modified pull request template + * fixing typo in dependecy setup + * release: huggingface tgi neuronx 0.0.25 image + * Revert "update cloudpickle version to >=2.2.1 in pyproject.toml (#4899)" + * update cloudpickle version to >=2.2.1 in pyproject.toml + * update cloudpickle version to >=2.2.1 + * chore(deps): bump pyspark from 3.3.1 to 3.3.2 in /requirements/extras + * changes for PT 2.4 currency upgrade + * chore: add lmi image config in me-central-1 + * tests: Implement integration tests covering JumpStart PrivateHub workflows + * Use Miniforge to replace MambaForge + +## v2.232.2 (2024-10-03) + +### Bug Fixes and Other Changes + + * Pass kwargs to HuggingFaceModel.deploy() + * improve logging and exception messages + * remove deprecated distutils + * update image_uri_configs 09-24-2024 07:18:00 PST + +## v2.232.1 (2024-09-19) + +### Bug Fixes and Other Changes + + * update image_uri_configs 09-17-2024 07:17:54 PST + * support latest container version in image_uris and DJLModel for lmi c… + +## v2.232.0 (2024-09-12) + +### Features + + * add deployment config name in modelbuilder telemetry + * add Clarify image URIs for us-isof + +### Bug Fixes and Other Changes + + * chore: add flaky test markers & skip region with low P3 instance capacity + * update image_uri_configs 09-11-2024 11:54:11 PST + * update image_uri_configs 09-10-2024 07:18:01 PST + * [change] add us-gov and cn region repo accounts to djl and hugging face image metadata + * update image_uri_configs 09-06-2024 07:17:55 PST + * add us-gov region repo accounts to djl image metadata + * pass name from modelbuilder constructor to created model + +## v2.231.0 (2024-08-30) + +### Features + + * Add SageMaker Core to the dependency + +### Bug Fixes and Other Changes + + * Disable test_mnist_async + * SMP v2.5 + * update image_uri_configs 
08-29-2024 07:17:59 PST + +## v2.230.0 (2024-08-28) + +### Features + + * FastAPI integration for In_Process Mode (2/2) + +### Bug Fixes and Other Changes + + * chore: add HF LLM neuronx 0.0.24 image + * TF-2.16 test modification and handling + * fix test fail + * Add troubleshooting links to exceptions + * cross account private hub model fine-tuning + * chore: cleanup jumpstart factory + * disable failing integration tests + +## v2.229.0 (2024-08-15) + +### Features + + * Support for ModelBuilder In_Process Mode (1/2) + * Pulling in dependencies (in_process mode) using conda environment + * Add optional CodeArtifact login to FrameworkProcessing job script + * implemented security-monitoring to send metrics to CW #1510 + +### Bug Fixes and Other Changes + + * alt configs model deployment and training issues + * fix keras extension in integ test + * update image_uri_configs 08-13-2024 07:17:54 PST + * trn1 instance family does not support volume size + * Update model.py + * removed log statement + * update image_uri_configs 08-09-2024 07:18:00 PST + * Added torchrun compatibility for distributet training across multiple GPUs in a single node (single instance) + * BiasConfig type hint + * add model monitor image accounts for ap-southeast-5 and eu-central-2 + * aligned UTC times with PST + * ensure hpt jobs inherit tags from config + * add JumpStart PDT and OSU regions + * chore(deps): bump certifi in /src/sagemaker/serve/utils + * Updates for DJL 0.29.0 release + * chore(deps): bump apache-airflow from 2.9.2 to 2.9.3 in /requirements/extras + * chore(deps): bump torch from 2.0.1 to 2.2.0 in /tests/data/serve_resources/mlflow/pytorch + * avoided printing stack trace and escaped input + * removing kwargs as this is breaking predictor_cls param for mode… + +## v2.228.0 (2024-08-06) + +### Features + + * triton v24.05 + +### Bug Fixes and Other Changes + + * chore: telemetry for deployment configs + * censoring sensitive values from being logged + * update image_uri_configs 08-05-2024 07:17:38 PST + * enable uncompressed model artifacts upload to S3 for SAGEMAKER_ENDPOINT overwrite for TGI, TEI, MMS model servers + * ModelReference deployment for Alt Configs models + * Add optional typecheck for nullable parameters + * Update package metadata + * release TEI 1.4.0 + +## v2.227.0 (2024-07-30) + +### Features + + * added code scanning through CodeQL + +### Bug Fixes and Other Changes + + * Fixed cpu isntance type for the estimator register test + * update image_uri_configs 07-29-2024 11:28:28 PST + * avoid AccessDenied error for a while on SageMaker Studio wtih do… + * SMP PT 2.3 Fix + * chore: pin framework version in serverless inference tests + * image uri in TGI 2.2.0 image + * explicitly access enum member values to avoid Python version related regression + * chore: add huggingface TGI 2.2.0 config + * update image_uri_configs 07-22-2024 11:53:54 PST + * update image_uri_configs 07-17-2024 07:17:38 PST + * update image_uri_configs 07-16-2024 07:17:45 PST + * add support for new regions + +## v2.226.1 (2024-07-17) + +## v2.226.0 (2024-07-12) + +### Features + + * Curated hub improvements + * InferenceSpec support for MMS and testing + +### Bug Fixes and Other Changes + + * ModelBuilder not passing HF_TOKEN to model. 
+ * update image_uri_configs 07-10-2024 07:18:04 PST + +## v2.225.0 (2024-07-10) + +### Features + + * model optimization + +### Bug Fixes and Other Changes + + * fix integ test + * update uris for v1.1.1 + * update image_uri_configs 07-04-2024 07:17:24 PST + +## v2.224.4 (2024-07-04) + +### Bug Fixes and Other Changes + + * allow for inf spec and server override to be passed + +## v2.224.3 (2024-07-03) + +### Bug Fixes and Other Changes + + * Upgrade local dependencies + * Improve docstrings for estimator tags + +## v2.224.2 (2024-06-27) + +### Bug Fixes and Other Changes + + * Update DJLModel class for latest container releases + * list_models() for python3.8 + +## v2.224.1 (2024-06-21) + +### Bug Fixes and Other Changes + + * JumpStart CuratedHub Launch + * Update README.rst to show conda-forge version of SageMaker SDK + * Update tox.ini + * chore(deps): bump apache-airflow from 2.9.1 to 2.9.2 in /requirements/extras + * Model server override logic + +## v2.224.0 (2024-06-19) + +### Features + + * JumpStartModel attach + +### Bug Fixes and Other Changes + + * feat(sagemaker-mlflow): New features for SageMaker MLflow + * Upgrading to PT 2.3 for release + * chore: use ml.g5.2xlarge for integ test + * Enable telemetry logging for Remote function + * Fix Dependabot Issues - MLFlow Version + +## v2.223.0 (2024-06-13) + +### Features + + * add 'ModelCard' property to Register step + +### Bug Fixes and Other Changes + + * Fix Sniping bug fix + * Implement custom telemetry logging in SDK + * Fix ci unit-tests + * update image_uri_configs 06-12-2024 07:17:03 PST + +## v2.222.1 (2024-06-12) + +### Bug Fixes and Other Changes + + * First changes + * estimator.deploy not respecting instance type + +## v2.222.0 (2024-06-07) + +### Features + + * jumpstart telemetry + +### Bug Fixes and Other Changes + + * update image_uri_configs 06-06-2024 07:17:31 PST + * bump requests from 2.31.0 to 2.32.2 in /requirements/extras + * chore: add HF LLM neuronx 0.0.23 image + * Updates for DJL 0.28.0 release + * chore(deps): bump mlflow from 2.11.1 to 2.12.1 in /tests/data/serve_resources/mlflow/tensorflow + * chore(deps): bump mlflow from 2.11.1 to 2.12.1 in /tests/data/serve_resources/mlflow/xgboost + * chore(deps): bump mlflow from 2.10.2 to 2.12.1 in /tests/data/serve_resources/mlflow/pytorch + * chore(deps): bump apache-airflow from 2.9.0 to 2.9.1 in /requirements/extras + * chore(deps): bump requests from 2.31.0 to 2.32.2 in /tests/data/serve_resources/mlflow/pytorch + * Fix ci unit-tests + * Making project name in workflow files dynamic + * update image_uri_configs 05-29-2024 07:17:35 PST + * Update: SM Endpoint Routing Strategy Support. + +## v2.221.1 (2024-05-22) + +### Bug Fixes and Other Changes + + * Convert pytorchddp distribution to smdistributed distribution + * Add tei cpu image + +## v2.221.0 (2024-05-20) + +### Features + + * onboard tei image config to pysdk + +### Bug Fixes and Other Changes + + * JS Model with non-TGI/non-DJL deployment failure + * cover tei with image_uris.retrieve API + * Add more debuging + * model builder limited container support for endpoint mode. 
+ * Image URI should take precedence for HF models + +## v2.220.0 (2024-05-15) + +### Features + + * AutoGluon 1.1.0 image_uris update + * add new images for HF TGI release + * Add telemetry support for mlflow models + +### Bug Fixes and Other Changes + + * add debug logs to workflow container dist creation + * model builder race condition on sagemaker session + * Add tensorflow_serving support for mlflow models and enable lineage tracking for mlflow models + * update image_uri_configs 05-09-2024 07:17:41 PST + * skip flakey tests pending investigation + +## v2.219.0 (2024-05-08) + +### Features + + * allow choosing js payload by alias in private method + +### Bug Fixes and Other Changes + + * chore(deps): bump jinja2 from 3.1.3 to 3.1.4 in /requirements/extras + * chore(deps): bump tqdm from 4.66.2 to 4.66.3 in /tests/data/serve_resources/mlflow/pytorch + * chore(deps): bump jinja2 from 3.1.3 to 3.1.4 in /doc + * Updates for SMP v2.3.1 + +## v2.218.1 (2024-05-03) + +### Bug Fixes and Other Changes + + * Fix UserAgent logging in Python SDK + * chore: release tgi 2.0.1 + * chore: update skipped flaky tests + +## v2.218.0 (2024-05-01) + +### Features + + * set default allow_pickle param to False + +### Bug Fixes and Other Changes + + * properly close files in lineage queries and tests + +## v2.217.0 (2024-04-24) + +### Features + + * support session tag chaining for training job + +### Bug Fixes and Other Changes + + * Add Triton v24.03 URI + * mainline alt config parsing + * Fix tox installs + * Add PT 2.2 Graviton Inference DLC + +## v2.216.1 (2024-04-22) + +### Bug Fixes and Other Changes + + * add DXB and CGK to Jumpstart regions + * chore(deps): bump apache-airflow from 2.8.4 to 2.9.0 in /requirements/extras + * bump djl-inference 0.27.0 neuronx sdk to 2.18.1 + * chore: release TGI 2.0.0 + +## v2.216.0 (2024-04-17) + +### Features + + * optimum 0.0.21 + * Add TF 2.14 Graviton Inference support + * JumpStart alternative config parsing + * TGI 1.4.5 + +### Bug Fixes and Other Changes + + * chore(deps): bump black from 22.3.0 to 24.3.0 in /requirements/extras + * Add back serialization for automatic speech recognition + * bump apache-airflow version to 2.8.4 + * remove trailing slash when uploading to S3 with dataset_builder.to_csv_file + * Update Collaborator Check workflow to check for users which are part of collaborator team + * forward network_isolation parameter to Estimators when False + * Flaky slow test + * Revert "Test SM PySDK Variations" + +### Documentation Changes + + * Add supported task types to schema builder omission + +## v2.215.0 (2024-04-12) + +### Features + + * JumpStart Gated Model Support in ModelBuilder Local Modes + * Changes to support remote schema retrieval for task types (question-answering, fill-mask) and added e2e tests for both local and remote hf schema logic. 
+ * Upgrade smp to version 2.3.0 + +### Bug Fixes and Other Changes + + * disable modelbuilder mlflow local integ tests + * add integ-tests to codebuild-ci.yml + * [Feat] Support MLflow Model Format Through ModelBuilder + * Test SM PySDK Variations + * typo in jumpstart manifest and refine tests + * add kix to launched regions + * Remove space specific business logic from Python SDK function to fetch execution role + * Remove notebook tests from CI health check and the script + +## v2.214.3 (2024-04-04) + +### Bug Fixes and Other Changes + + * [Fix] Switch to subprocess in ModelBuilder when capturing dependencies + * chore: skip flaky test + +## v2.214.2 (2024-04-01) + +### Bug Fixes and Other Changes + + * Skip JS Tune integration test + * bump apache-airflow version to 2.8.3 + * bump onnx version to >=1.15.0 + * Updates for DJL 0.27.0 release + * Tune (local mode) support for Jumpstart Models + * attach jumpstart estimator for gated model + +## v2.214.1 (2024-03-27) + +### Bug Fixes and Other Changes + + * Update schema dependency version + * remove failing deprecated tests from suite + * update readme, trigger p311 tests + * JumpStart list models flaky tests + * fix badge in README + +## v2.214.0 (2024-03-22) + +### Features + + * add support to ``clarify.py`` for time series explainability jobs + +### Bug Fixes and Other Changes + + * remove pytorch test for deprecated function + * skip test_experiment_analytics to unblock release + * Create workflow module scoped sagemaker_session to resolve test race condition + * Simplify how we process test dependencies, which are supposed to include all extras. + * skip failing feature store search integ test + * skip failing pt test + * list jumpstart models with invalid version strings + * urge customers to install latest version + +## v2.213.0 (2024-03-15) + +### Features + + * Add support for Streaming Inference + * tgi optimum 0.0.19, 0.0.20 releases + * support JumpStart proprietary models + * Add ModelDataSource and SourceUri support for model package and while registering + * Accept user-defined env variables for the entry-point + * Add overriding logic in ModelBuilder when task is provided + +### Bug Fixes and Other Changes + + * Improvement of the tuner documentation + * Skip of tests which are long running and causing the ResourceLimitInUse exception + * Add AutoML -> AutoMLV2 mapper + * add ci-health checks + * split coverage out from testenv in tox.ini + * add PT 2.2 support for smdistributed, pytorchddp, and torch_distributed distributions + * sagemaker session region not being used + * chore: emit warning when no instance specific gated training env var is available, and raise exception when accept_eula flag is not supplied + * enable github actions for PRs + * Move sagemaker pysdk version check after bootstrap in remote job + * make unit tests compatible with pytest-xdist + * Update tblib constraint + +## v2.212.0 (2024-03-06) + +### Features + + * Update SM Python SDK for PT 2.2.0 SM DLC + +### Bug Fixes and Other Changes + + * Create custom tarfile extractall util to fix backward compatibility issue + * Upgrade smp to version 2.2 + * Enhance model builder selection logic to include model size + +## v2.211.0 (2024-03-05) + +### Features + + * pin dll version to support python3.11 to the sdk + * instance specific jumpstart host requirements + * Add TensorFlow 2.14 image configs + * Add AutoMLV2 support + * Support selective pipeline execution between function step and regular step + * Add new Triton DLC URIs + +### Bug Fixes and 
Other Changes + + * Skip No Canvas regions for test_deploy_best_candidate + * make sure gpus are found in local_gpu run + * Bump Apache Airflow version to 2.8.2 + * properly close sagemaker config file after loading config + * remove enable_network_isolation from the python doc + +### Documentation Changes + + * Add doc for new feature processor APIs and classes + +## v2.210.0 (2024-02-28) + +### Features + + * Prepend SageMaker Studio App Type to boto3 User Agent string + * TGI optimum 0.0.18 (general+llm) + * TGI 1.4.2 + +### Bug Fixes and Other Changes + + * tolerate vulnerable old model for integ test and temporarily skip test_list_jumpstart_models_script_filter + * add missing regions to pytorch config + * Add validation for sagemaker version on remote job + * fixed implementation of fail_on_violation for transform with monitoring + +## v2.209.0 (2024-02-24) + +### Features + + * ModelBuilder to fetch local schema when no SchemaBuilder present. + * AutoGluon 1.0.0 image_uris update + +### Bug Fixes and Other Changes + + * skip pytorch training compiler integ test + * add fixes for tarfile extractall functionality PEP-721 + * Fix telemetry image uri option logic for ModelBuilder + * Add telemetry metrics on usage of default images for ModelBuilder + * Fix error message typo + * Add "distribution" parameter into record_set + +## v2.208.0 (2024-02-15) + +### Features + + * Telemetry metrics + * TGI 1.4.0 + * Support collection type and target store for feature store ingestion. + +### Bug Fixes and Other Changes + + * bump jinja2 to 3.1.3 in doc/requirments.txt + * chore(deps): bump jinja2 from 3.0.3 to 3.1.3 in /requirements/extras + * Fix dependabot alert in transformers package + * Bump Apache Airflow version to 2.8.0 + * skip failing mxnet tests + +### Documentation Changes + + * change order of pipelines topics + * Explain the ClarifyCheckStep and QualityCheckStep parameters + * fix the ClarifyCheckStep documentation to mention PDP + +## v2.207.1 (2024-02-06) + +### Bug Fixes and Other Changes + + * Add PT 2.1 as a supported framework for the smdistributed distribution + * Enable private docker registry support for ModelBuilder + * HF PT 2.1 Image Configs + +### Documentation Changes + + * add setup commands for documentation generation + +## v2.207.0 (2024-02-05) + +### Features + + * Introduce HF Transformers to ModelBuilder + * retrieve jumpstart estimator and predictor without specifying model id (infer from tags) + +### Bug Fixes and Other Changes + + * SMP PT upgrade to 2.1 + * Fetch HF metadata only when explicit type is not selected + * relax upper bound for urllib dependency + +## v2.206.0 (2024-01-31) + +### Features + + * Logic to detect hardware GPU count and aggregate GPU memory size in MiB + +### Bug Fixes and Other Changes + + * fixed create monitoring schedule failing after validation error + * Support PipelineVariable for ModelQualityCheckConfig attributes + * TGI NeuronX 0.0.17 + +## v2.205.0 (2024-01-25) + +### Features + + * Support selective pipeline execution for function step + +### Bug Fixes and Other Changes + + * remove fastapi and uvicorn dependencies + * Support using PipelineDefinitionConfig in local mode + * update get_execution_role_arn from metadata file if present + * update image_uri_configs 01-24-2024 06:17:33 PST + * Add validation for empty ParameterString value in start local pipeline + +## v2.204.0 (2024-01-23) + +### Features + + * add throughput management support for feature group + * Support custom repack model settings + * parallelize 
notebook search utils, add new operators + +### Bug Fixes and Other Changes + + * Enable galactus integ tests + * JumpStart - TLV region launch + * add warning message for job-prefixed pipeline steps when no job name is provided + * TGI NeuronX + * Updates for DJL 0.26.0 release + * update sphinx version + * Add PyTorch 2.1.0 SM Training DLC to UNSUPPORTED_DLC_IMAGE_FOR_SM_PARALLELISM list + * Huggingface glue failing tests + * change ConditionNot incorrect property Expression to Condition + +## v2.203.1 (2024-01-09) + +### Bug Fixes and Other Changes + + * TGI 1.3.3 + * skip failing integs + * query hf api for model md + * update image_uri_configs 12-29-2023 06:17:34 PST + +## v2.203.0 (2023-12-28) + +### Features + + * support local mode in SageMaker Studio (#1300) + * Supporting tbac in load_run + +### Bug Fixes and Other Changes + + * update image_uri_configs 12-25-2023 06:17:33 PST + * Disable failed test in IR + * Raise Exception for debug + * create role if needed in `get_execution_role` + +## v2.202.1 (2023-12-22) + +### Bug Fixes and Other Changes + + * update image_uri_configs 12-22-2023 06:17:35 PST + * update model path in local mode + * Using logging instead of prints + +### Documentation Changes + + * update issue template. + +## v2.202.0 (2023-12-21) + +### Features + + * support remote debug for sagemaker training job + +### Bug Fixes and Other Changes + + * update image_uri_configs 12-21-2023 08:32:41 PST + * Update tblib constraint + +## v2.201.0 (2023-12-20) + +### Features + + * Use specific images for SMP v2 jobs + * Added update for model package + +### Bug Fixes and Other Changes + + * Add write permission to job output dirs for remote and step decorator running on non-root job user + * Move func and args serialization of function step to step level + +### Documentation Changes + + * SMP v2 doc updates (#1423) + * fix ModelBuilder sample notebook links + +## v2.200.1 (2023-12-14) + +### Bug Fixes and Other Changes + + * Merge branch 'master-rba' into local_merge + * Fix user agent tag issue + * update image_uri_configs 12-13-2023 14:04:54 PST + * update image_uri_configs 12-13-2023 12:23:06 PST + +## v2.200.0 (2023-12-13) + +### Deprecations and Removals + + * remove explicit `partitions` key requirement on pysdk side. 
+ +### Features + + * add non-repeating config logger + * Add SageMaker Geospatial verison 1.x images + * TGI 1.2.0 Image Uri + * support model.register() with triton model + * Enable retrieving function step results for local mode + +### Bug Fixes and Other Changes + + * TGI 1.3.1 + * excessive jumpstart instance type logging + * Support local mode for remote function + * `Session.download_data` can not download nested objects + * Fix writing into non-closed file with git clone command + * mitigation of xgboost container incompatibility with new version + * update image and hardware validation with inf and graviton + * remove two setuptools deprecations + * minor jumpstart dev ex improvements + * save utils fix + * Correct DJL neuronx regions + * delete unused file inference-experience-dev-tester.sh + * Fix Experiment Run integ test w.r.t unexpected boto3 version + * Bump test dependencies versions + * fast follow on js uncompressed support - ModelBuilder + * Modify Region List for Neuron Images (HF neuron/neuronx, PT Neuron) + +### Documentation Changes + + * Mention for custom Docker Image + +## v2.199.0 (2023-11-30) + +### Features + + * Update boto3 version to 1.33.3 + * Goldfinch InferenceComponent integration + * Add Pipeline step decorator, NotebookJobStep, and scheduler + * ModelBuilder for simplified model testing and deployment + +### Bug Fixes and Other Changes + + * Skip failing integ tests + * chore: add jumpstart gated model integration tests + * disable integration tests for Inference Component based endpoint for non supported regions + * remove unnecessary whl file - Galactus + * refactor endpoint type enums, comments, docstrings, method names… + * Address SA feedback regarding deployment straight to Endpoint Mode - Galactus + * fix HuggingFace GEN2 model deployment arguments + * when customer role does not have permission to read logs from CW, default to standard logging - Galactus + * Add pagination for ListInferenceComponent API responses, address non-blocking comments + * Create CSVSerializerWrapper - Galactus + * Model builder Final Fixes + * remote function include_local_workdir default value + * use copy instead of move in bootstrap script + * WaiterError on failed pipeline execution. 
results() + * Add a unit test for consistency between step and remote decorator + * merge WorkdirConfig and custom_filter_filter parameters + * Add back mixed step type integ test + * do not delete temp folder generated by sdist + * make collect_parameters decorator as private + * HMAC signing for ModelBuilder Triton python backend + +### Documentation Changes + + * update docstring for Generation two endpoint and validation message + * galactus doc updates + * CustomFileFilter doc updates + * NotebookJobSteps class parameter severity update + +## v2.198.0 (2023-11-27) + +### Features + + * re:Invent 2023 keynote1 + +## v2.197.0 (2023-11-07) + +### Features + + * PT2.1 SM Training/Inference DLC Release + +### Bug Fixes and Other Changes + + * Release HuggingFace PT Neuronx training image 1.13.1 + * HuggingFace PT Neuronx release in SDK + +## v2.196.0 (2023-10-27) + +### Features + + * inference instance type conditioned on training instance type + +### Bug Fixes and Other Changes + + * improved jumpstart tagging + +## v2.195.1 (2023-10-26) + +### Bug Fixes and Other Changes + + * Allow either instance_type or instance_group to be defined in… + * enhance image_uris unit tests + +## v2.195.0 (2023-10-25) + +### Features + + * jumpstart gated model artifacts + * jumpstart extract generated text from response + * jumpstart construct payload utility + +### Bug Fixes and Other Changes + + * relax upper bound on urllib in local mode requirements + * bump urllib3 version + * allow smdistributed to be enabled with torch_distributed. + * fix URL links + +### Documentation Changes + + * remove python 2 reference + * update framework version links + +## v2.194.0 (2023-10-19) + +### Features + + * Added register step in Jumpstart model + * jumpstart instance specific metric definitions + +### Bug Fixes and Other Changes + + * Updates for DJL 0.24.0 Release + * use getter for resource-metadata dict + * add method to Model class to check if repack is needed + +## v2.193.0 (2023-10-18) + +### Features + + * jumpstart model artifact instance type variants + * jumpstart instance specific hyperparameters + * Feature Processor event based triggers (#1132) + * Support job checkpoint in remote function + * jumpstart model package arn instance type variants + +### Bug Fixes and Other Changes + + * Fix hyperlinks in feature_processor.scheduler parameter descriptions + * add image_uris_unit_test pytest mark + * bump apache-airflow to `v2.7.2` + * clone distribution in validate_distribution + * fix flaky Inference Recommender integration tests + +### Documentation Changes + + * Update PipelineModel.register documentation + * specify that input_shape is no longer required for torch 2.0 mod… + +## v2.192.1 (2023-10-13) + +### Bug Fixes and Other Changes + + * update local mode schema + * import error in unsupported js regions + * Update Ec2 instance type to g5.4xlarge in test_huggingface_torch_distributed.py + +## v2.192.0 (2023-10-11) + +### Features + + * jumpstart estimator enable infra check flag + * jumpstart default payloads + * allow non-python files in job dependencies + * allow configuring docker container in local mode + +### Bug Fixes and Other Changes + + * js tagging s3 prefix + * Batch transform: Add support for split_type == "None" in local mode + * use correct line endings and s3 uris on windows + * Fixed bug in _create_training_details + * DJL Neuronx 0.24.0 + +### Documentation Changes + + * Include FeatureGroup's load_feature_definitions API documentation + +## v2.191.0 (2023-10-05) + +### Features 
+ + * Selective Step Execution milestone 2 features + * feature-processor extra data sources support + +## v2.190.0 (2023-10-04) + +### Features + + * Add support for in-memory feature groups and collection type features in Feature Store. + +### Bug Fixes and Other Changes + + * chore: xfail resource in use failure for specific test + * Add missing API docs for processors + +### Documentation Changes + + * Bring back (de)serializers documentation + * Add missing AirFlow operators + link to airflow documentation + +## v2.189.0 (2023-10-03) + +### Features + + * add feature processor APIs to public doc + * s3 prefix model data for JumpStartModel + * Model Package support for updating approval + +### Bug Fixes and Other Changes + + * Add bucket owner check + * transform step unit test + * Release TGI 1.1.0 Image + +## v2.188.0 (2023-09-26) + +### Features + + * jumpstart instance type variants + * New methods to ingest and create Feature Groups + +### Bug Fixes and Other Changes + + * auto ml integ tests and add flaky test markers + * Enhance unit-tests to automatically consume image URIs config registries from config JSONs + +## v2.187.0 (2023-09-19) + +### Features + + * add HealthCheckConfig support + * SkipModelValidation in modelRegistry + +### Bug Fixes and Other Changes + + * Update fw_utils.py - support 2.0.1 container for DDP and Torch distri… + * bump apache-airflow to v2.7.1 + +## v2.186.0 (2023-09-14) + +### Features + + * TGI 1.0.3 Image URI Config + +## v2.185.0 (2023-09-12) + +### Features + + * Local Mode - Add Support for Docker Compose V2 + +### Bug Fixes and Other Changes + + * handle bad jumpstart default session + * Add Data Wrangler TLV and version 3.x images + +## v2.184.0.post0 (2023-09-11) + +### Documentation Changes + + * add interactive apps rst file + +## v2.184.0 (2023-09-07) + +### Features + + * Enable notebook instances to get presigned url + +### Bug Fixes and Other Changes + + * update scikit-learn, scipy, and apache-airflow deps for dependabot + * log message when sdk defaults not applied + +## v2.183.0 (2023-09-05) + +### Deprecations and Removals + + * remove support for py37 + +### Features + + * Neo service GA in TLV + +### Bug Fixes and Other Changes + + * Update pytorch.json with 2.0.1 for inference and training + * get python version dynamically for remote function tests + * HuggingFaceProcessor parameterized instance_type when image_uri is absent + +## v2.182.0 (2023-08-29) + +### Features + + * image url for modelmonitor in TLV region + * Enable spot training on remote decorator and executor + +## v2.181.0 (2023-08-28) + +### Features + + * StabilityAI DLC Image URIs + +### Bug Fixes and Other Changes + + * temporarily skip kmeans notebook + +## v2.180.0 (2023-08-24) + +### Features + + * Add presigned URLs for interactive apps + * Add detail profiler V2 options and tests + +## v2.179.0 (2023-08-21) + +### Features + + * attach method for jumpstart estimator + +### Bug Fixes and Other Changes + + * pipeline upsert failed to pass parallelism_config to update + +## v2.178.0 (2023-08-17) + +### Features + + * Support to get latest monitoring execution processing logs + +### Bug Fixes and Other Changes + + * Add context to predict_fn example + * gated models unsupported region + * jumpstart cache using sagemaker session s3 client + * add TFS 2.13 Graviton SM images + * pipeline variable kms key + * integration test for gated jumpstart training model + * tags for jumpstart model package models + +## v2.177.1 (2023-08-14) + +### Bug Fixes and Other 
Changes + + * chore: excessive jumpstart bucket logging + +## v2.177.0 (2023-08-11) + +### Features + + * Add TLV accounts for 1P Algorithms + +## v2.176.0 (2023-08-10) + +### Features + + * Add TF 2.13 Training and Inference SM images + +### Bug Fixes and Other Changes + + * revert-PR_3903 + * skip tensorflow local mode notebook test + * change instance type for huggingface test to ml.g5.8xlarge + +## v2.175.0 (2023-08-05) + +### Features + + * Add huggingface-llm 0.9.3 dlc images + +### Bug Fixes and Other Changes + + * Upgrade default version for djl to v0.23.0 + * Pass kms_key to _upload_analysis_config when provided + +## v2.174.0 (2023-08-02) + +### Features + + * meta llama fine tuning + * support online store ttl for records + * Deploy uncompressed ML model from S3 to SageMaker Hosting endpoints + * AutoGluon 0.8.2 image_uris update + +### Bug Fixes and Other Changes + + * [Feature] Propagate tags to lineage resources + * excessive jumpstart logging + * chore: jumpstart deprecation messages + * build(deps): bump pygments from 2.11.2 to 2.15.0 in /requirements/tox + * Remove deleted notebook tests from test confg + * chore: add jumpstart llama 2 tests + +### Documentation Changes + + * add smp class for supporting flash attn + +## v2.173.0 (2023-07-15) + +### Features + + * jumpstart EULA models + +### Bug Fixes and Other Changes + + * Update the apache airflow constraints + * Update apache airflow version + * bump up djl inference image uri versions + +## v2.172.0 (2023-07-13) + +### Features + + * Add check for if TrialComponent is already associated with a Trial in Run + * Add features_to_explain to shap config + +### Bug Fixes and Other Changes + + * Support protobuf4 + * Remove unnecessary get caller identity call + * Missing JumpStart estimator args + * Add volume to partition djl_inference + +### Documentation Changes + + * Correct runtime param + * fix wait_for_endpoint docstring + +## v2.171.0 (2023-07-06) + +### Features + + * Add PipelineDefinitionConfig to pipelines to toggle custom job … + +### Bug Fixes and Other Changes + + * Upgrade DJL deepspeed versions + * Remove unused dependency `protobuf3-to-dict` + * skip intelligent volume_size allocation based on instance type if it is a pipeline parameter + +## v2.170.0 (2023-07-05) + +### Features + + * Enable customizing artifact output path + +### Bug Fixes and Other Changes + + * Add il-central-1 support for all SM DLCs + * jumpstart async inference config predictor support + * Update CreateEdgePackagingJob resourceKey with type string + +## v2.169.0 (2023-06-29) + +### Features + + * Add support for tags in to_pipeline API for feature processor + * model registry integration to model cards to support model packages + * SDK Defaults - DebugHookConfig defaults in TrainingJob API + * Add segment config for Clarify + +### Bug Fixes and Other Changes + + * Neuronx image retrieval missing sdk information + +### Documentation Changes + + * Doc updates for SDK defaults - S3 Params, Env Variables, Disable Profiler, and DebugHookConfig + +## v2.168.0 (2023-06-22) + +### Features + + * Support uncompressed model upload + * Add optional monitoring_config_override parameter in suggest_baseline API + * SDK defaults add disable profiler to createTrainingJob + +### Bug Fixes and Other Changes + + * Enable spark processing container in KIX + * Fix key prefix preventing jumpstart model repack + +## v2.167.0 (2023-06-21) + +### Features + + * add SageMaker FeatureStore feature processing + +### Bug Fixes and Other Changes + + * 
Chore/reset cache if js model not found + +## v2.166.0 (2023-06-19) + +### Features + + * Add `inf2` support to `HuggingFaceModel` + * adding resourcekey and tags for api in config for SDK defaults + +### Bug Fixes and Other Changes + + * Remove deprecated option.s3url in favor of option.model_id + * Use sagemaker config keyword + * SDK Defaults Config - Handle config injection for None Sessions + * Fix HPO Grid Search comparison and name + +## v2.165.0 (2023-06-13) + +### Features + + * Add support for Deployment Recommendation ID in model.deploy(). No tagging support + +### Bug Fixes and Other Changes + + * maketplace integs + * Add tagging assert to inference recommender integ tests + * breaking deviations in _create_sagemaker_model call + +### Documentation Changes + + * Add missing quotation mark + +## v2.164.0 (2023-06-08) + +### Features + + * SDK Defaults - Environment Variables + * Update Transformers 4.28 - PyTorch 2.0.0 Training and Inference Image URI + +### Bug Fixes and Other Changes + + * tag more integs as flaky for auto-retry + * Remove docker-compose from local requirements + * enable neo framework version support on ml_inf2 and ml_trn1 + +## v2.163.0 (2023-06-07) + +### Features + + * Add huggingface-llm 0.8.2 dlc images + +### Bug Fixes and Other Changes + + * Update to more actionable error message + * Loosen local reqs for PyYAML + +## v2.162.0 (2023-06-06) + +### Features + + * Add tagging support for create ir job + * Selective Step Execution feature in Pipelines + * Add Neuronx Image uri - Transformers 4.28 - PyTorch 1.13 + +### Bug Fixes and Other Changes + + * skip pipelines abalone notebook test + * Update neo multiversion support to include edge devices + +### Documentation Changes + + * JumpStart Utility Doc Update + +## v2.161.0 (2023-06-01) + +### Features + + * Add huggingface-llm 0.6.0 dlc images + * Add autotune for HyperparameterTuner + +### Bug Fixes and Other Changes + + * Remove release tag from non-global test + * SDK defaults for volume size, JS Estimator image uri region, Predictor str method + +## v2.160.0 (2023-05-31) + +### Features + + * PyTorch 2.0 release + * Add TFS 2.12.1 Graviton image + +### Bug Fixes and Other Changes + + * Fix failing integ test + * remove unnecessary log messages for loading existing experiment runs + * build(deps): bump requests from 2.27.1 to 2.31.0 in /requirements/extras + * SDK Defaults - switch from config printing to logging + +## v2.159.0 (2023-05-23) + +### Features + + * Add TF Serving 2.12.1 images to the SM PySDK + +### Bug Fixes and Other Changes + + * Update the list of extension packages pylint is allowed to load + +## v2.158.0 (2023-05-22) + +### Features + + * Enable default role for Spark processors + * SDK Defaults - S3 Params for Session + * Bump up images for DJL transformers Neuronx DLCs + +### Bug Fixes and Other Changes + + * Relax local-mode PyPI requirements on urllib3 + +### Documentation Changes + + * Fix Tensorflow and PyTorch supported version in HuggingFaceProcessor + * Update doc for model_server_workers param in PyTorchModel + +## v2.157.0 (2023-05-18) + +### Features + + * Handle use case where endpoint is created outside of python … + +### Bug Fixes and Other Changes + + * Make type annotation of UploadedCode consistent + * Add SELinux label to local docker volumes + +## v2.156.0 (2023-05-17) + +### Features + + * Partition support for DJLModel using SM Training job + * Update run-notebook-test to consider skips failures + +### Bug Fixes and Other Changes + + * Update apache airflow 
and update test requirements + * Perform integrity checks for remote function execution + * Add p2 instances to integ tests + * Fix typo in logging message within ir mixin + * double Run create on load_run + * Update dtype logic for huggingface backend for new containers + +### Documentation Changes + + * Update container version for SKLearn + * Add description for parameters in TransformInput + +## v2.155.0 (2023-05-15) + +### Features + + * Add support for SageMaker Serverless inference Provisioned Concurrency feature + +### Bug Fixes and Other Changes + + * Revert "fix: make RemoteExecutor context manager non-blocking on pend… + * Add BOM to no No P2 Availability region list + +## v2.154.0 (2023-05-11) + +### Features + + * Add integ tests for remote_function, auto_capture functionality + * jumpstart model estimator classes + +### Bug Fixes and Other Changes + + * integs - pytorch transformer deps and add test retry + * adding .lower() so new Pandas dtypes will match the type lookup. + * Pass KMS value to create processing job + +## v2.153.0 (2023-05-09) + +### Features + + * Support npz archives in NumpyDeserializer + * Add FasterTransformer DJL support + * support for Sample Weights for SageMaker Autopilot + +### Bug Fixes and Other Changes + + * retry is_run assertion + * Avoid 'AttributeError' for endpoint_name, if deploy() is not yet called + * Fix LambdaStep Creation + * Fix error when instance_count>1 in remote_function + * Remove deprecated update_endpoint from deploy() args in TensorFlowModel + * Update DJL deepspeed and fastertransformer DLC image uris + * remote_function python version mismatch issue + +## v2.152.0 (2023-05-04) + +### Features + + * add support for lineage visualization using pyvis + * Expose Experiment class publicly + * PyTorch 1.13 release + +### Bug Fixes and Other Changes + + * Change data_type argument to dtype to keep consistent with D… + * Skip edge test + * make RemoteExecutor context manager non-blocking on pending futures + * Add inferentia2 DLC images for djl framework + * Fix typo in using_pytorch.rst + * Unable to attach estimator to training job when KeepAlivePeriodInSeconds specified + * update LMI container image + * Update Clarify SHAPConfig baseline to allow JSON structures + +### Documentation Changes + + * Fix broken link in DJL SageMaker docs + * currency update for the SageMaker data parallelism lib + * SM model parallel library v1.15.0 release note + +## v2.151.0 (2023-04-27) + +### Features + + * Update Transformers 4.26 - TensorFlow 2.11.0 Image URI + * Add Extra Parameters to Lambda Function Wrapper + +### Bug Fixes and Other Changes + + * Add kms key support for Model registration + * Enable inference recommender slow tests + * Pass sagemaker session to downstream s3 calls + * Add ap-south-1 to no p3 regions + * skip test for p2 instance for TF2.12 and above + +### Documentation Changes + + * Fix minor misses from the remote function doc release + +## v2.150.0 (2023-04-26) + +### Features + + * Introduce TensorBoard app class + +### Bug Fixes and Other Changes + + * Update data wrangler images + +## v2.149.0 (2023-04-25) + +### Features + + * Support TF2.12 SageMaker DLC + +### Bug Fixes and Other Changes + + * update the doc for Join function + * change s3UploadMode of sagemaker clarify processing output for computer vision jobs. 
+ +### Documentation Changes + + * Add Remote Function updates + +## v2.148.0 (2023-04-20) + +### Features + + * [huggingface] Add `torch.distributed` support for Trainium and `torchrun` + * Add PyTorch 2.0 to SDK + +### Bug Fixes and Other Changes + + * updating batch transform job in monitoring schedule + +## v2.147.0 (2023-04-18) + +### Features + + * support different types of deletion mode + +## v2.146.1 (2023-04-17) + +### Bug Fixes and Other Changes + + * skip failing tests temporarily + * Added ml.p4d and ml.p4de as supported instances for DeepSpeed + +### Documentation Changes + + * Add Model Registry Model Collection + +## v2.146.0 (2023-04-13) + +### Features + + * Add support for JSON model inputs for Clarify Processor + +### Bug Fixes and Other Changes + + * Feature/list collection + * improve reliability of Run integration test + * Add a comment that smdataparallel lib excludes tf 2.12 support + +### Documentation Changes + + * Update reference to load run method in documentation + +## v2.145.0 (2023-04-06) + +### Features + + * add support for async inline error notifications + * Add methods for feature group to list feature metadata parameters and tags + * Support huggingface hub model_id for DJL Models + +### Bug Fixes and Other Changes + + * load_sagemaker_config should lazy initialize a default S3 resource + +## v2.144.0 (2023-04-05) + +### Features + + * support create Clarify explainer enabled endpoint for Clarify Online Explainability + * Combined inference and training script artifact + * jumpstart instance types + * Deprecation warning for framework profiling for TF 2.12 and on, PT 2.0 and on + +### Bug Fixes and Other Changes + + * always delete temporary directory even during exception + * Fixes the completion_criteria_config dict in the to_input_req method + * Update CHANGELOG.md + +### Documentation Changes + + * Update SageMaker Debugger doc + +## v2.143.0 (2023-03-29) + +### Features + + * Support for SageMaker SDK Defaults + +### Bug Fixes and Other Changes + + * update feature store offline s3 path used in tests + +## v2.142.0 (2023-03-27) + +### Features + + * combined model + script artifact + +## v2.141.0 (2023-03-24) + +### Features + + * AutoGluon 0.7.0 image_uris update + * Add DJL FasterTransformer image uris + * EMR step runtime role support + * locations for EMR configuration and Spark dependencies + * Adding support for 1P Algorithms in ZAZ, ZRH, HYD, MEL + +### Documentation Changes + + * Update FeatureGroup kms key id documentation + +## v2.140.1 (2023-03-21) + +### Bug Fixes and Other Changes + + * Fix cross account register model + * Handle instance support for Hugging Face tests + * Upgrade apache-airflow-providers-amazon version + * build(deps): bump apache-airflow from 2.4.1 to 2.5.1 + * Mark test_create_model_package test for xfail + * Disable module-not-measured warnings to avoid clutter in build logs + +## v2.140.0 (2023-03-17) + +### Features + + * SDK changes for TRCOMP support + +### Bug Fixes and Other Changes + + * [Feature - Hugging Face] Update Transformers 4.26 - PyTorch 1.13.1 Image uri + +## v2.139.0 (2023-03-15) + +### Features + + * Add XGBoost framework 1.7-1 version + +### Bug Fixes and Other Changes + + * Fix image_uris.retrieve() function to return ValueError when framework is not allowed for an instance_type + +## v2.138.0 (2023-03-13) + +### Features + + * Jumpstart training metrics + +### Bug Fixes and Other Changes + + * Add new region support for MX, PT, TF on SM Training + +## v2.137.0 (2023-03-10) + +### Features + 
+ * support JSON for input dataset and model output + +### Bug Fixes and Other Changes + + * Wait on describe for tag propagation + * Extracted profile_name directly from sagemaker.Session if None + * Avoid double encoding to JSON in InferenceRecommenderMixin + * RepackStep must use the same KMS key as the Model + +## v2.136.0 (2023-03-09) + +### Features + + * with_feature_group [feature_store] + * Djl Large Model Support + * Decouple model.right_size() from model registry + +### Bug Fixes and Other Changes + + * Fix integration test error in test_default_right_size_and_deploy_unregistered_base_model + * Add djl 0.21.0 dlc images + +### Documentation Changes + + * Torchrun gpu support documentation change + +## v2.135.1.post0 (2023-03-02) + +### Documentation Changes + + * update feature store dataset builder docs + +## v2.135.1 (2023-03-01) + +### Bug Fixes and Other Changes + + * Revert back to stable apache-airflow-providers-amazon from 7.2.1 to 4.0.0. + * Typo in graviton algos + * build(deps): bump apache-airflow-providers-amazon from 4.0.0 to 7.2.1 in /requirements/extras + * Support cloning private repo using ssh key + * Create a default SageMaker Session inside FeatureGroup class + +### Documentation Changes + + * fix typo in README + +## v2.135.0 (2023-02-23) + +### Features + + * Add DLC accounts for MEL Region + * allow use of short lived creds for local container + +### Bug Fixes and Other Changes + + * update lambda function when function arn is provided + +## v2.134.1 (2023-02-22) + +### Bug Fixes and Other Changes + + * local mode deletion of temp files on job end + * Cron expression resetting on update monitor + * added support to update arguments in create_monitoring_schedule + +## v2.134.0 (2023-02-22) + +### Features + + * Add python 3.9 and spark 3.2 support for spark processor + * Adding support for Multi Worker Mirrored Strategy in TF estimator + +### Bug Fixes and Other Changes + + * tag permission issue - remove describe before create + +## v2.133.0 (2023-02-18) + +### Features + + * feature store with_feature_group functionality changes + * Adding support for SageMaker Training Compiler PyTorch 1.13 + * support of the intelligent stopping in the tuner + * AutoGluon 0.6.2 image_uris update + * Support for flexible instance types in the HPO + * Add business details and hyper parameters fields and update test_model_card.py + +### Bug Fixes and Other Changes + + * disable the tuner test + * Skip test_run_from_transform_job integ test to unblock python-sdk code pipeline + * Revert "feature: feature store with_feature_group functionality changes" + * advanced inference recommendation jobs parameters check + * make model_config optional when predicted labels are provided for bias detection + +## v2.132.0 (2023-02-07) + +### Features + + * support cluster lifecycle management for Sagemaker EMR step + * Inference recommendation id deployment support + +## v2.131.1 (2023-02-03) + +### Bug Fixes and Other Changes + + * test dub gpu integs with p3 + * fix(experiments/run.py): Stop duplication of RUN_TC_TAG on Consecutive Experiment Runs + * Enable load_run without name args in Transform env + * Remove confusing log line emitted during feature group ingestion + * Enable Experiment integ test on beta clients + * Make test_processor_with_role_as_pipeline_parameter more concrete + +### Documentation Changes + + * add security note for the estimator hyperparameter arg + * SageMaker distributed - model parallelism library release note + * Add a deprecation note for 
DetailedProfileConfig + +## v2.131.0 (2023-01-31) + +### Features + + * Display file diff on black-check + * Support for environment variables in the HPO + * Support role as PipelineParameter in Processor class + * Add TrainingImageConfig support for SageMaker training jobs + +### Bug Fixes and Other Changes + + * use FeatureGroup's Session in nonconcurrency ingestion + * Update feature_group.py ingest() description + * Do not use print function. User logger instead + * Add batch_get_record and search API for FeatureStore + * hashing problem for framework processors with identical source dirs + +## v2.130.0 (2023-01-26) + +### Features + + * Add PyTorch 1.13.1 to SDK + * Adding image_uri config for DJL containers + * Support specifying env-vars when creating model from model package + * local download dir for Model and Estimator classes + +### Bug Fixes and Other Changes + + * increase creation time slack minutes + * Enable load_run auto pass in experiment config + * Add us-isob-east-1 accounts and configs + * Clean up Pipeline unit tests + +## v2.129.0 (2023-01-19) + +### Features + + * add p2 deprecation for PT>=1.13 + * TF2.11 Update to PySDK + +### Bug Fixes and Other Changes + + * Improve Pipeline integ tests and fix resource leak + * Update TF version to 2.8.4 + +## v2.128.0 (2023-01-10) + +### Features + + * right_size() for inference recommender + +### Bug Fixes and Other Changes + + * tf 2.9.3 release images + * Retry ValueError for airflow tests + +## v2.127.0 (2023-01-03) + +### Features + + * tensorflow inference 2.10.1 release + +## v2.126.0 (2022-12-22) + +### Features + + * AutoGluon 0.6.1 image_uris + +### Bug Fixes and Other Changes + + * Fix broken link in doc + * Do not specify S3 path for disabled profiler + +### Documentation Changes + + * fix the incorrect property reference + +## v2.125.0 (2022-12-19) + +### Features + + * add RandomSeed to support reproducible HPO + +### Bug Fixes and Other Changes + + * Correct SageMaker Clarify API docstrings by changing JSONPath to JMESPath + +## v2.124.0 (2022-12-16) + +### Features + + * Doc update for TableFormatEnum + * Add p4de to smddp supported instance types + * Add disable_profiler field in config and propagate changes + * Added doc update for dataset builder + +### Bug Fixes and Other Changes + + * Use Async Inference Config when available for endpoint update + +### Documentation Changes + + * smdistributed libraries release notes + +## v2.123.0 (2022-12-15) + +### Features + + * Add support for TF2.9.2 training images + * Add SageMaker Experiment + +## v2.122.0 (2022-12-14) + +### Features + + * Feature Store dataset builder, delete_record, get_record, list_feature_group + * Add OSU region to frameworks for DLC + +### Bug Fixes and Other Changes + + * the Hyperband support fix for the HPO + * unpin packaging version + * Remove content type image/jpg from analysis configuration schema + +## v2.121.2 (2022-12-12) + +### Bug Fixes and Other Changes + + * Update for Tensorflow Serving 2.11 inference DLCs + * Revert "fix: type hint of PySparkProcessor __init__" + * Skip Bad Transform Test + +## v2.121.1 (2022-12-09) + +### Bug Fixes and Other Changes + + * Pop out ModelPackageName from pipeline definition + * Fix failing jumpstart cache unit tests + +## v2.121.0 (2022-12-08) + +### Features + + * Algorithms Region Expansion OSU/DXB + +### Bug Fixes and Other Changes + + * FrameworkProcessor S3 uploads + * Add constraints file for apache-airflow + +## v2.120.0 (2022-12-07) + +### Features + + * Add Neo image uri config for 
Pytorch 1.12 + * Adding support for SageMaker Training Compiler in PyTorch estimator starting 1.12 + * Update registries with new region account number mappings. + * Add DXB region to frameworks by DLC + +### Bug Fixes and Other Changes + + * support idempotency for framework and spark processors + +## v2.119.0 (2022-12-03) + +### Features + + * Add Code Owners file + * Added transform with monitoring pipeline step in transformer + * Update TF 2.9 and TF 2.10 inference DLCs + * make estimator accept json file as modelparallel config + * SageMaker Training Compiler does not support p4de instances + * Add support for SparkML v3.3 + +### Bug Fixes and Other Changes + + * Fix bug forcing uploaded tar to be named sourcedir + * Update local_requirements.txt PyYAML version + * refactoring : using with statement + * Allow Py 3.7 for MMS Test Docker env + * fix PySparkProcessor __init__ params type + * type hint of PySparkProcessor __init__ + * Return ARM XGB/SKLearn tags if `image_scope` is `inference_graviton` + * Update scipy to 1.7.3 to support M1 development envs + * Fixing type hints for Spark processor that has instance type/count params in reverse order + * Add DeepAR ap-northeast-3 repository. + * Fix AsyncInferenceConfig documentation typo + * fix ml_inf to ml_inf1 in Neo multi-version support + * Fix type annotations + * add neo mvp region accounts + +## v2.118.0 (2022-12-01) + +### Features + + * Update boto3 version to 1.26.20 + * support table format option for create feature group. + * Support Amazon SageMaker Model Cards + * support monitoring alerts api + * Support Amazon SageMaker AutoMLStep + +### Bug Fixes and Other Changes + + * integration test in anticipate of ProfilerConfig API changes + * Add more integ test logic for AutoMLStep + * update get_execution_role_arn to use role from DefaultSpaceSettings + * bug on AutoMLInput to allow PipelineVariable + * FinalMetricDataList is missing from the training job search resu… + * add integration tests for Model Card + * update AutoMLStep with cache improvement + +### Documentation Changes + + * automlstep doc update + +## v2.117.0 (2022-11-15) + +### Features + + * add support for PT1.12.1 + +## v2.116.0 (2022-10-28) + +### Features + + * support customized timeout for model data download and inference container startup health check for Hosting Endpoints + * Trainium Neuron support for PyTorch + * Pipelines cache keys update + * Caching Improvements for SM Pipeline Workflows + +## v2.115.0 (2022-10-27) + +### Features + + * Add support for TF 2.10 training + * Disable profiler for Trainium instance type + * support the Hyperband strategy with the StrategyConfig + * support the GridSearch strategy for hyperparameter optimization + +### Bug Fixes and Other Changes + + * Update Graviton supported instance families + +## v2.114.0 (2022-10-26) + +### Features + + * Graviton support for XGB and SKLearn frameworks + * Graviton support for PyTorch and Tensorflow frameworks + * do not expand estimator role when it is pipeline parameter + * added support for batch transform with model monitoring + +### Bug Fixes and Other Changes + + * regex in tuning integs + * remove debugger environment var set up + * adjacent slash in s3 key + * Fix Repack step auto install behavior + * Add retry for airflow ParsingError + +### Documentation Changes + + * doc fix + +## v2.113.0 (2022-10-21) + +### Features + + * support torch_distributed distribution for Trainium instances + +### Bug Fixes and Other Changes + + * bump apache-airflow from 2.4.0 to 2.4.1 in 
/requirements/extras + +### Documentation Changes + + * fix kwargs and descriptions of the smdmp checkpoint function + * add the doc for the MonitorBatchTransformStep + +## v2.112.2 (2022-10-11) + +### Bug Fixes and Other Changes + + * Update Neo-TF2.x versions to TF2.9(.2) + +### Documentation Changes + + * fix typo in PR template + +## v2.112.1 (2022-10-10) + +### Bug Fixes and Other Changes + + * fix(local-mode): loosen docker requirement to allow 6.0.0 + * CreateModelPackage API error for Scikit-learn and XGBoost frameworkss + +## v2.112.0 (2022-10-09) + +### Features + + * added monitor batch transform step (pipeline) + +### Bug Fixes and Other Changes + + * Add PipelineVariable annotation to framework estimators + +## v2.111.0 (2022-10-05) + +### Features + + * Edit test file for supporting TF 2.10 training + +### Bug Fixes and Other Changes + + * support kms key in processor pack local code + * security issue by bumping apache-airflow from 2.3.4 to 2.4.0 + * instance count retrieval logic + * Add regex for short-form sagemaker-xgboost tags + * Upgrade attrs>=20.3.0,<23 + * Add PipelineVariable annotation to Amazon estimators + +### Documentation Changes + + * add context for pytorch + +## v2.110.0 (2022-09-27) + +### Features + + * Support KeepAlivePeriodInSeconds for Training APIs + * added ANALYSIS_CONFIG_SCHEMA_V1_0 in clarify + * add model monitor image accounts for ap-southeast-3 + +### Bug Fixes and Other Changes + + * huggingface release test + * Fixing the logic to return instanceCount for heterogeneousClusters + * Disable type hints in doc signature and add PipelineVariable annotations in docstring + * estimator hyperparameters in script mode + +### Documentation Changes + + * Added link to example notebook for Pipelines local mode + +## v2.109.0 (2022-09-09) + +### Features + + * add search filters + +### Bug Fixes and Other Changes + + * local pipeline step argument parsing bug + * support fail_on_violation flag for check steps + * fix links per app security scan + * Add PipelineVariable annotation for all processor subclasses + +### Documentation Changes + + * the SageMaker model parallel library 1.11.0 release + +## v2.108.0 (2022-09-02) + +### Features + + * Adding support in HuggingFace estimator for Training Compiler enhanced PyTorch 1.11 + +### Bug Fixes and Other Changes + + * add sagemaker clarify image account for cgk region + * set PYTHONHASHSEED env variable to fixed value to fix intermittent failures in release pipeline + * trcomp fixtures to override default fixtures for integ tests + +### Documentation Changes + + * add more info about volume_size + +## v2.107.0 (2022-08-29) + +### Features + + * support python 3.10, update airflow dependency + +### Bug Fixes and Other Changes + + * Add retry in session.py to check if training is finished + +### Documentation Changes + + * remove Other tab in Built-in algorithms section and mi… + +## v2.106.0 (2022-08-24) + +### Features + + * Implement Kendra Search in RTD website + +### Bug Fixes and Other Changes + + * Add primitive_or_expr() back to conditions + * remove specifying env-vars when creating model from model package + * Add CGK in config for Spark Image + +## v2.105.0 (2022-08-19) + +### Features + + * Added endpoint_name to clarify.ModelConfig + * adding workgroup functionality to athena query + +### Bug Fixes and Other Changes + + * disable debugger/profiler in cgk region + * using unique name for lineage test to unblock PR checks + +### Documentation Changes + + * update first-party algorithms and 
structural updates + +## v2.104.0 (2022-08-17) + +### Features + + * local mode executor implementation + * Pipelines local mode setup + * Add PT 1.12 support + * added _AnalysisConfigGenerator for clarify + +### Bug Fixes and Other Changes + + * yaml safe_load sagemaker config + * pipelines local mode minor bug fixes + * add local mode integ tests + * implement local JsonGet function + * Add Pipeline annotation in model base class and tensorflow estimator + * Allow users to customize trial component display names for pipeline launched jobs + * Update localmode code to decode urllib response as UTF8 + +### Documentation Changes + + * New content for Pipelines local mode + * Correct documentation error + +## v2.103.0 (2022-08-05) + +### Features + + * AutoGluon 0.4.3 and 0.5.2 image_uris + +### Bug Fixes and Other Changes + + * Revert "change: add a check to prevent launching a modelparallel job on CPU only instances" + * Add gpu capability to local + * Link PyTorch 1.11 to 1.11.0 + +## v2.102.0 (2022-08-04) + +### Features + + * add warnings for xgboost specific rules in debugger rules + * Add PyTorch DDP distribution support + * Add test for profiler enablement with debugger_hook false + +### Bug Fixes and Other Changes + + * Two letter language code must be supported + * add a check to prevent launching a modelparallel job on CPU only instances + * Allow StepCollection added in ConditionStep to be depended on + * Add PipelineVariable annotation in framework models + * skip managed spot training mxnet nb + +### Documentation Changes + + * smdistributed libraries currency updates + +## v2.101.1 (2022-07-28) + +### Bug Fixes and Other Changes + + * added more ml frameworks supported by SageMaker Workflows + * test: Vspecinteg2 + * Add PipelineVariable annotation in amazon models + +## v2.101.0 (2022-07-27) + +### Features + + * Algorithms region launch on CGK + * enhance-bucket-override-support + * infer framework and version + * support clarify bias detection when facets not included + * Add CGK region to frameworks by DLC + +### Bug Fixes and Other Changes + + * Make repack step output path align with model repack path + * Support parameterized source code input for TrainingStep + +### Documentation Changes + + * heterogeneous cluster api doc fix + * smdmp v1.10 release note + +## v2.100.0 (2022-07-18) + +### Features + + * upgrade to support python 3.10 + * Add target_model to support multi-model endpoints + * Added support for feature group schema change and feature parameters + +### Bug Fixes and Other Changes + + * enable model.register without 'inference' & 'transform' instances + * rename RegisterModel inner steps to prevent duplicate step names + * remove primitive_or_expr() from conditions + * support pipeline variables for spark processors run arguments + * make 'ModelInput' field optional for inference recommendation + * Fix processing image uri param + * fix: neo inferentia as compilation target not using framework ver + +### Documentation Changes + + * SageMaker model parallel library v1.10.0 documentation + * add detail & links to clarify docstrings + +## v2.99.0 (2022-07-08) + +### Features + + * heterogeneous cluster set up in distribution config + * support heterogeneous cluster for training + * include fields to work with inference recommender + +### Bug Fixes and Other Changes + + * Moving the newly added field instance_group to the end of method + * image_uri does not need to be specified with instance_groups + * Loosen version of attrs dependency + * Add 
PipelineVariable annotation in estimator, processing, tuner, transformer base classes + * model table link + +### Documentation Changes + + * documentation for heterogeneous cluster + +## v2.98.0 (2022-07-05) + +### Features + + * Adding deepar image + +### Documentation Changes + + * edit to clarify how to use inference.py + +## v2.97.0 (2022-06-28) + +### Deprecations and Removals + + * remove support for python 3.6 + +### Features + + * update prebuilt models documentation + +### Bug Fixes and Other Changes + + * Skipping test_candidate_estimator_default_rerun_and_deploy + * Update model name from 'compiled.pt' to 'model.pth' for neo + * update pytest, skip hf integ temp + * Add override_pipeline_parameter_var decorator to give grace period to update invalid pipeline var args + +## v2.96.0 (2022-06-20) + +### Features + + * Add helper method to generate pipeline adjacency list + +### Bug Fixes and Other Changes + + * changing trcomp integ tests to be able to run in all regions + +## v2.95.0 (2022-06-16) + +### Features + + * Adding Training Compiler support for TensorFlow estimator starting TF 2.9 + * Add support for TF 2.9 training + +### Bug Fixes and Other Changes + + * integs fallback from p3 to p2 instance + * bucket exists check for session.default_bucket + * make instance type fields as optional + +### Documentation Changes + + * improvements on the docstring of ModelStep + * Add XGBoostProcessor + +## v2.94.0 (2022-06-07) + +### Features + + * AutoGluon 0.4.2 image_uris support + +## v2.93.1 (2022-06-06) + +### Bug Fixes and Other Changes + + * add input parameterization tests for workflow job steps + * add parameterized tests to transformer + +## v2.93.0 (2022-06-03) + +### Features + + * MxNet 1.9 support + +### Bug Fixes and Other Changes + + * bump importlib-metadata version upperbound to support TF2.9 + * fix pipeline doc code example where process.run only accepts argument + * Fix Tensorflow default model_dir generation when output_path is pipeline variable + * Support transformer data parameterization + +## v2.92.2 (2022-05-31) + +### Bug Fixes and Other Changes + + * turn off Pipeline Parameter inheritance from python primitives + * Add more validations for pipeline step new interfaces + * Changed method description per AWS request + +## v2.92.1 (2022-05-26) + +### Bug Fixes and Other Changes + + * pin protobuf to < 4.0 to fix breaking change + +## v2.92.0 (2022-05-26) + +### Features + + * add 'Domain' property to RegisterModel step + +### Bug Fixes and Other Changes + + * support estimator output path parameterization + * Add back Prevent passing PipelineVariable object into image_uris.retrieve + * jumpstart amt tracking + * fix missing register method params for framework models + * fix docstring for decorated functions + * Documents: add sagemaker model building pipeline readthedocs + +## v2.91.1 (2022-05-19) + +### Bug Fixes and Other Changes + + * Revert Prevent passing PipelineVariable object into image_uris.retrieve + +## v2.91.0 (2022-05-19) + +### Features + + * Support Properties for StepCollection + +### Bug Fixes and Other Changes + + * Prevent passing PipelineVariable object into image_uris.retrieve + * support image_uri being property ref for model + * ResourceConflictException from AWS Lambda on pipeline upsert + +### Documentation Changes + + * release notes for SMDDP 1.4.1 and SMDMP 1.9.0 + +## v2.90.0 (2022-05-16) + +### Features + + * Add ModelStep for SageMaker Model Building Pipeline + +### Bug Fixes and Other Changes + + * update setup.py to add 
minimum python requirement of 3.6 + +## v2.89.0 (2022-05-11) + +### Features + + * Add PT 1.11 support + * add validation specification + +### Bug Fixes and Other Changes + + * repack model locally when local_code local mode + +### Documentation Changes + + * smdmp 1.8.1 release note + +## v2.88.3 (2022-05-06) + +### Bug Fixes and Other Changes + + * deprecate: Remove deprecated argument s3_data_distribution_type + * Feat/jumpstart model table update + +## v2.88.2 (2022-05-02) + +### Bug Fixes and Other Changes + + * Automl integ describe job check + * Implement subclass compatibility for workflow pipeline job steps + +## v2.88.1 (2022-04-27) + +### Bug Fixes and Other Changes + + * Add encryption setting to tar_and_upload_dir method + +## v2.88.0 (2022-04-26) + +### Features + + * jumpstart notebook utils -- list model ids, scripts, tasks, frameworks + +### Bug Fixes and Other Changes + + * local mode printing of credentials during docker login closes #2180 + * disable endpoint context test + +### Documentation Changes + + * sm model parallel 1.8.0 release notes + +## v2.87.0 (2022-04-20) + +### Features + + * Add Jumpstart example notebooks + * add Tensorflow and Pytorch version for SM Training Compiler and expand to regular regions + +### Bug Fixes and Other Changes + + * integs for training compiler in non-PDX regions + * TrainingStep cache misses due to timestamp based job name + * retry context delete + * Add more logging when unexpected number of artifacts found + +## v2.86.2 (2022-04-14) + +### Bug Fixes and Other Changes + + * #using uuid to randomize, otherwise system timestamp is used + +## v2.86.1 (2022-04-13) + +### Bug Fixes and Other Changes + + * xgboost, sklearn network isolation for jumpstart + +### Documentation Changes + + * fix minor typo + +## v2.86.0 (2022-04-12) + +### Features + + * Adds Spark Processing Notebook to Notebook Tests + +## v2.85.0 (2022-04-11) + +### Features + + * update lambda code on pipeline create/update/upsert for Lamb… + * jumpstart model url + * add serverless inference image_uri retrieve support + +### Bug Fixes and Other Changes + + * Add back the Fix for Pipeline variables related customer issues + * Support file URIs in ProcessingStep's code parameter + +## v2.84.0 (2022-04-07) + +### Features + + * dependabot integ - move all deps to requirements.txt + * add xgboost framework version 1.5-1 + +## v2.83.0 (2022-04-04) + +### Features + + * Hugging Face Transformers 4.17 for TF 2.6 + +### Bug Fixes and Other Changes + + * IOC image version select issue + +## v2.82.2 (2022-04-01) + +### Bug Fixes and Other Changes + + * Revert "fix: Fix Pipeline variables related customer issues (#2959)" + * Refactor repack_model script injection, fixes tar.gz error + +## v2.82.1 (2022-03-31) + +### Bug Fixes and Other Changes + + * Update Inferentia Image URI Config + * Fix Pipeline variables related customer issues + * more logging info for static pipeline test data setup + +## v2.82.0 (2022-03-30) + +### Features + + * pluggable instance fallback mechanism, add CapacityError + * support passing Env Vars to local mode training + +## v2.81.1 (2022-03-29) + +### Bug Fixes and Other Changes + + * Update black-check version, add support for Spark 3.1 Processing + +## v2.81.0 (2022-03-26) + +### Features + + * Retrieve data configuration + * enable EnableInterContainerTrafficEncryption for model monitoring + * Hugging Face Transformers 4.17 for PT 1.10 + +### Bug Fixes and Other Changes + + * remove `new` from serverless + * temporarily skip tests impacted by 
data inconsistency + * Implement override solution for pipeline variables + +### Documentation Changes + + * add documentation for image_uri serverless use case + * minor fixes for smddp 1.4.0 doc + +## v2.80.0 (2022-03-18) + +### Features + + * Add support for TF2.7 + * Add support for TF 2.8 + * TF242 ioc support + * Add support for TF 2.6.3 + * Support for remote docker host + * AutoGluon 0.3.2 and 0.4.0 image_uris + +### Bug Fixes and Other Changes + + * Align max_wait definitions in EstimaorBase and Estimator + * Add JumpStart model table build notification + * gpu integs CapacityError - fallback to available compute + * gpu integs CapacityError - fallback to available compute + * jumpstart docs network isolation + +### Documentation Changes + + * sagemaker distributed model parallel 1.7.0 doc + +## v2.79.0 (2022-03-16) + +### Features + + * Inferentia Neuron support for HuggingFace + * custom base job name for jumpstart models/estimators + * Python 3.9 for readthedocs + +### Bug Fixes and Other Changes + + * container env generation for S3 URI and add test for the same + +### Documentation Changes + + * the SageMaker distributed data parallel v1.4.0 release + * update sagemaker training compiler docstring + * smddp doc update + +## v2.78.0 (2022-03-07) + +### Features + + * TensorFlow 2.4 for Neo + * Data Serializer + +### Bug Fixes and Other Changes + + * Style update in DataSerializer + * Remove sagemaker_job_name from hyperparameters in TrainingStep + * reorganize test files for workflow + * update code to get commit_id in codepipeline ## v2.77.1 (2022-02-25) @@ -11,7 +3429,7 @@ ### Features * override jumpstart content bucket - * jumpstart model id suggestions + * jumpstart model ID suggestions * adding customer metadata support to registermodel step ### Bug Fixes and Other Changes diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md deleted file mode 100644 index 3b64466870..0000000000 --- a/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,4 +0,0 @@ -## Code of Conduct -This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). -For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact -opensource-codeofconduct@amazon.com with any additional questions or comments. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c4007305c9..6a78a25c21 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,6 +16,7 @@ information to effectively respond to your bug report or contribution. * [Run the Unit Tests](#run-the-unit-tests) * [Run the Integration Tests](#run-the-integration-tests) * [Make and Test Your Change](#make-and-test-your-change) + * [Lint Your Change](#lint-your-change) * [Commit Your Change](#commit-your-change) * [Send a Pull Request](#send-a-pull-request) * [Documentation Guidelines](#documentation-guidelines) @@ -61,6 +62,10 @@ Before sending us a pull request, please ensure that: 1. Follow the instructions at [Modifying an EBS Volume Using Elastic Volumes (Console)](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/requesting-ebs-volume-modifications.html#modify-ebs-volume) to increase the EBS volume size associated with the newly created EC2 instance. 1. Wait 5-10min for the new EBS volume increase to finalize. 1. Allow EC2 to claim the additional space by stopping and then starting your EC2 host. +2. Set up a venv to manage dependencies: + 1. `python -m venv ~/.venv/myproject-env` to create the venv + 2. `source ~/.venv/myproject-env/bin/activate` to activate the venv + 3. 
`deactivate` to exit the venv ### Pull Down the Code @@ -74,13 +79,13 @@ Before sending us a pull request, please ensure that: ### Run the Unit Tests 1. Install tox using `pip install tox` -1. Install coverage using `pip install .[test]` -1. cd into the sagemaker-python-sdk folder: `cd sagemaker-python-sdk` or `cd /environment/sagemaker-python-sdk` +1. cd into the GitHub project sagemaker-python-sdk folder: `cd sagemaker-python-sdk` or `cd /environment/sagemaker-python-sdk` +1. Install coverage using `pip install '.[test]'` 1. Run the following tox command and verify that all code checks and unit tests pass: `tox tests/unit` - -You can also run a single test with the following command: `tox -e py36 -- -s -vv ::` +1. You can also run a single test with the following command: `tox -e py310 -- -s -vv ::` +1. You can run coverage via the runcoverage env: `tox -e runcoverage -- tests/unit` or `tox -e py310 -- tests/unit --cov=sagemaker --cov-append --cov-report xml` * Note that the coverage test will fail if you only run a single test, so make sure to surround the command with `export IGNORE_COVERAGE=-` and `unset IGNORE_COVERAGE` - * Example: `export IGNORE_COVERAGE=- ; tox -e py36 -- -s -vv tests/unit/test_estimator.py::test_sagemaker_model_s3_uri_invalid ; unset IGNORE_COVERAGE` + * Example: `export IGNORE_COVERAGE=- ; tox -e py310 -- -s -vv tests/unit/test_estimator.py::test_sagemaker_model_s3_uri_invalid ; unset IGNORE_COVERAGE` ### Run the Integration Tests @@ -89,9 +94,9 @@ Our CI system runs integration tests (the ones in the `tests/integ` directory), You should only worry about manually running any new integration tests that you write, or integration tests that test an area of code that you've modified. 1. Follow the instructions at [Set Up the AWS Command Line Interface (AWS CLI)](https://docs.aws.amazon.com/polly/latest/dg/setup-aws-cli.html). -1. To run a test, specify the test file and method you want to run per the following command: `tox -e py36 -- -s -vv ::` +1. To run a test, specify the test file and method you want to run per the following command: `tox -e py310 -- -s -vv ::` * Note that the coverage test will fail if you only run a single test, so make sure to surround the command with `export IGNORE_COVERAGE=-` and `unset IGNORE_COVERAGE` - * Example: `export IGNORE_COVERAGE=- ; tox -e py36 -- -s -vv tests/integ/test_tf_script_mode.py::test_mnist ; unset IGNORE_COVERAGE` + * Example: `export IGNORE_COVERAGE=- ; tox -e py310 -- -s -vv tests/integ/test_tf_script_mode.py::test_mnist ; unset IGNORE_COVERAGE` If you are writing or modifying a test that creates a SageMaker job (training, tuner, or transform) or endpoint, it's important to assign a concurrency-friendly `job_name` (or `endpoint_name`), or your tests may fail randomly due to name collisions. We have a helper method `sagemaker.utils.unique_name_from_base(base, max_length)` that makes test-friendly names. You can find examples of how to use it [here](https://github.com/aws/sagemaker-python-sdk/blob/3816a5658d3737c9767e01bc8d37fc3ed5551593/tests/integ/test_tfs.py#L37) and [here](https://github.com/aws/sagemaker-python-sdk/blob/3816a5658d3737c9767e01bc8d37fc3ed5551593/tests/integ/test_tuner.py#L616), or by searching for "unique\_name\_from\_base" in our test code. @@ -113,6 +118,13 @@ If you are writing or modifying a test that creates a SageMaker job (training, t 1. If your changes include documentation changes, please see the [Documentation Guidelines](#documentation-guidelines). 1.
If you include integration tests, do not mark them as canaries if they will not run in all regions. +### Lint Your Change + +Before submitting, ensure your code meets our quality and style guidelines. Run: +```shell +tox -e flake8,pylint,docstyle,black-check,twine --parallel all +``` +Address any errors or warnings before opening a pull request. ### Commit Your Change @@ -227,6 +239,12 @@ For example, see the [Processing API reference](https://github.com/aws/sagemaker To build the Sphinx docs, run the following command in the `doc/` directory: +```shell +# Initial setup, only required for the first run +pip install -r requirements.txt +pip install -e ../ +``` + ```shell make html ``` diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000..f49a4e16e6 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/LICENSE.txt b/LICENSE.txt deleted file mode 100644 index a1ce8c3b5e..0000000000 --- a/LICENSE.txt +++ /dev/null @@ -1,175 +0,0 @@ - Apache License - Version 2.0, January 2004 - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 3cf0e93c89..0000000000 --- a/MANIFEST.in +++ /dev/null @@ -1,12 +0,0 @@ -recursive-include src/sagemaker *.py - -include src/sagemaker/image_uri_config/*.json - -include VERSION -include LICENSE.txt -include README.rst - -prune tests - -recursive-exclude * __pycache__ -recursive-exclude * *.py[co] diff --git a/NOTICE.txt b/NOTICE.txt deleted file mode 100644 index 46da7e5caa..0000000000 --- a/NOTICE.txt +++ /dev/null @@ -1,2 +0,0 @@ -Amazon SageMaker Python SDK -Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/README.rst b/README.rst index ab62eddad0..e39f955deb 100644 --- a/README.rst +++ b/README.rst @@ -22,52 +22,185 @@ SageMaker Python SDK :target: https://sagemaker.readthedocs.io/en/stable/ :alt: Documentation Status +.. image:: https://github.com/aws/sagemaker-python-sdk/actions/workflows/codebuild-ci-health.yml/badge.svg + :target: https://github.com/aws/sagemaker-python-sdk/actions/workflows/codebuild-ci-health.yml + :alt: CI Health + SageMaker Python SDK is an open source library for training and deploying machine learning models on Amazon SageMaker. -With the SDK, you can train and deploy models using popular deep learning frameworks **Apache MXNet** and **TensorFlow**. +With the SDK, you can train and deploy models using popular deep learning frameworks **Apache MXNet** and **PyTorch**. You can also train and deploy models with **Amazon algorithms**, which are scalable implementations of core machine learning algorithms that are optimized for SageMaker and GPU training. If you have **your own algorithms** built into SageMaker compatible Docker containers, you can train and host models using these as well. -For detailed documentation, including the API reference, see `Read the Docs `_. +To install SageMaker Python SDK, see `Installing SageMaker Python SDK <#installing-the-sagemaker-python-sdk>`_. + +❗🔥 SageMaker V3 Release +------------------------- + +Version 3.0.0 represents a significant milestone in our product's evolution. 
This major release introduces a modernized architecture, enhanced performance, and powerful new features while maintaining our commitment to user experience and reliability. + +**Important: Please review these breaking changes before upgrading.** + +* Older interfaces such as Estimator, Model, Predictor and all their subclasses will not be supported in V3. +* Please see our `V3 examples folder `__ for example notebooks and usage patterns. + + +Migrating to V3 +---------------- + +**Upgrading to 3.x** + +To upgrade to the latest version of SageMaker Python SDK 3.x: + +:: + + pip install --upgrade sagemaker + +If you prefer to downgrade to the 2.x version: + +:: + + pip install sagemaker==2.* + +See `SageMaker V2 Examples <#sagemaker-v2-examples>`__ for V2 documentation and examples. + +**Key Benefits of 3.x** + +* **Modular Architecture**: Separate PyPI packages for core, training, and serving capabilities + + * `sagemaker-core `__ + * `sagemaker-train `__ + * `sagemaker-serve `__ + * `sagemaker-mlops `__ + +* **Unified Training & Inference**: Single classes (ModelTrainer, ModelBuilder) replace multiple framework-specific classes +* **Object-Oriented API**: Structured interface with auto-generated configs aligned with AWS APIs +* **Simplified Workflows**: Reduced boilerplate and more intuitive interfaces + +**Training Experience** + +V3 introduces the unified ModelTrainer class to reduce complexity of initial setup and deployment for model training. This replaces the V2 Estimator class and framework-specific classes (PyTorchEstimator, SKLearnEstimator, etc.). + +This example shows how to train a model using a custom training container with training data from S3. + +*SageMaker Python SDK 2.x:* + +.. code:: python + + from sagemaker.estimator import Estimator + estimator = Estimator( + image_uri="my-training-image", + role="arn:aws:iam::123456789012:role/SageMakerRole", + instance_count=1, + instance_type="ml.m5.xlarge", + output_path="s3://my-bucket/output" + ) + estimator.fit({"training": "s3://my-bucket/train"}) + +*SageMaker Python SDK 3.x:* + +.. code:: python + + from sagemaker.train import ModelTrainer + from sagemaker.train.configs import InputData + + trainer = ModelTrainer( + training_image="my-training-image", + role="arn:aws:iam::123456789012:role/SageMakerRole" + ) + + train_data = InputData( + channel_name="training", + data_source="s3://my-bucket/train" + ) + + trainer.train(input_data_config=[train_data]) + +**See more examples:** `SageMaker V3 Examples <#sagemaker-v3-examples>`__ + +**Inference Experience** + +V3 introduces the unified ModelBuilder class for model deployment and inference. This replaces the V2 Model class and framework-specific classes (PyTorchModel, TensorFlowModel, SKLearnModel, XGBoostModel, etc.). + +This example shows how to deploy a trained model for real-time inference. + +*SageMaker Python SDK 2.x:* + +.. code:: python + + from sagemaker.model import Model + from sagemaker.predictor import Predictor + model = Model( + image_uri="my-inference-image", + model_data="s3://my-bucket/model.tar.gz", + role="arn:aws:iam::123456789012:role/SageMakerRole" + ) + predictor = model.deploy( + initial_instance_count=1, + instance_type="ml.m5.xlarge" + ) + result = predictor.predict(data) + +*SageMaker Python SDK 3.x:* + +.. code:: python + + from sagemaker.serve import ModelBuilder + model_builder = ModelBuilder( + model="my-model", + model_path="s3://my-bucket/model.tar.gz" + ) + endpoint = model_builder.build() + result = endpoint.invoke(...) 
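The ``invoke`` arguments are elided above. As a minimal, hypothetical sketch (the payload shape and the bare ``invoke(payload)`` call are illustrative assumptions, not a documented signature), a request against the built endpoint might look like this:

.. code:: python

    # Hypothetical sketch: the payload shape and invoke() arguments below are
    # assumptions for illustration, not a documented API.
    payload = {"inputs": "What is Amazon SageMaker?"}
    result = endpoint.invoke(payload)
    print(result)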
+ +**See more examples:** `SageMaker V3 Examples <#sagemaker-v3-examples>`__ + +SageMaker V3 Examples +--------------------- + +**Training Examples** + +#. `Custom Distributed Training Example `__ +#. `Distributed Local Training Example `__ +#. `Hyperparameter Training Example `__ +#. `JumpStart Training Example `__ +#. `Local Training Example `__ + +**Inference Examples** + +#. `HuggingFace Example `__ +#. `In-Process Mode Example `__ +#. `Inference Spec Example `__ +#. `JumpStart E2E Training Example `__ +#. `JumpStart Example `__ +#. `Local Mode Example `__ +#. `Optimize Example `__ +#. `Train Inference E2E Example `__ + +**ML Ops Examples** + +#. `V3 Hyperparameter Tuning Example `__ +#. `V3 Hyperparameter Tuning Pipeline `__ +#. `V3 Model Registry Example `__ +#. `V3 PyTorch Processing Example `__ +#. `V3 Pipeline Train Create Registry `__ +#. `V3 Processing Job Sklearn `__ +#. `V3 SageMaker Clarify `__ +#. `V3 Transform Job Example `__ + +**Looking for V2 Examples?** See `SageMaker V2 Examples <#sagemaker-v2-examples>`__ below. -Table of Contents ------------------ -#. `Installing SageMaker Python SDK <#installing-the-sagemaker-python-sdk>`__ -#. `Using the SageMaker Python SDK `__ -#. `Using MXNet `__ -#. `Using TensorFlow `__ -#. `Using Chainer `__ -#. `Using PyTorch `__ -#. `Using Scikit-learn `__ -#. `Using XGBoost `__ -#. `SageMaker Reinforcement Learning Estimators `__ -#. `SageMaker SparkML Serving <#sagemaker-sparkml-serving>`__ -#. `Amazon SageMaker Built-in Algorithm Estimators `__ -#. `Using SageMaker AlgorithmEstimators `__ -#. `Consuming SageMaker Model Packages `__ -#. `BYO Docker Containers with SageMaker Estimators `__ -#. `SageMaker Automatic Model Tuning `__ -#. `SageMaker Batch Transform `__ -#. `Secure Training and Inference with VPC `__ -#. `BYO Model `__ -#. `Inference Pipelines `__ -#. `Amazon SageMaker Operators in Apache Airflow `__ -#. `SageMaker Autopilot `__ -#. `Model Monitoring `__ -#. `SageMaker Debugger `__ -#. `SageMaker Processing `__ Installing the SageMaker Python SDK ----------------------------------- -The SageMaker Python SDK is built to PyPI and can be installed with pip as follows: - +The SageMaker Python SDK is built to PyPI and the latest version of the SageMaker Python SDK can be installed with pip as follows :: - pip install sagemaker + pip install sagemaker== You can install from source by cloning this repository and running a pip install command in the root directory of the repository: @@ -87,10 +220,17 @@ Supported Python Versions SageMaker Python SDK is tested on: -- Python 3.6 -- Python 3.7 -- Python 3.8 - Python 3.9 +- Python 3.10 +- Python 3.11 +- Python 3.12 + +Telemetry +~~~~~~~~~~~~~~~ + +The ``sagemaker`` library has telemetry enabled to help us better understand user needs, diagnose issues, and deliver new features. This telemetry tracks the usage of various SageMaker functions. + +If you prefer to opt out of telemetry, you can easily do so by setting the ``TelemetryOptOut`` parameter to ``true`` in the SDK defaults configuration. For detailed instructions, please visit `Configuring and using defaults with the SageMaker Python SDK `__. AWS Permissions ~~~~~~~~~~~~~~~ @@ -126,13 +266,15 @@ To run the unit tests with tox, run: tox tests/unit -**Integrations tests** +**Integration tests** To run the integration tests, the following prerequisites must be met 1. AWS account credentials are available in the environment for the boto3 client to use. 2. The AWS account has an IAM role named :code:`SageMakerRole`. 
It should have the AmazonSageMakerFullAccess policy attached as well as a policy with `the necessary permissions to use Elastic Inference `__. +3. To run remote_function tests, dummy ecr repo should be created. It can be created by running - + :code:`aws ecr create-repository --repository-name remote-function-dummy-container` We recommend selectively running just those integration tests you'd like to run. You can filter by individual test function names with: @@ -175,9 +317,9 @@ Setup a Python environment, and install the dependencies listed in ``doc/require :: # conda - conda create -n sagemaker python=3.7 + conda create -n sagemaker python=3.12 conda activate sagemaker - conda install sphinx=3.1.1 sphinx_rtd_theme=0.5.0 + conda install sphinx=5.1.1 sphinx_rtd_theme=0.5.0 # pip pip install -r doc/requirements.txt @@ -214,7 +356,7 @@ In order to host a SparkML model in SageMaker, it should be serialized with ``ML For more information on MLeap, see https://github.com/combust/mleap . -Supported major version of Spark: 2.4 (MLeap version - 0.9.6) +Supported major version of Spark: 3.3 (MLeap version - 0.20.0) Here is an example on how to create an instance of ``SparkMLModel`` class and use ``deploy()`` method to create an endpoint which can be used to perform prediction against your trained SparkML Model. @@ -238,3 +380,77 @@ For more information about the different ``content-type`` and ``Accept`` formats ``schema`` that SageMaker SparkML Serving recognizes, please see `SageMaker SparkML Serving Container`_. .. _SageMaker SparkML Serving Container: https://github.com/aws/sagemaker-sparkml-serving-container + + +SageMaker V2 Examples +--------------------- + +#. `Using the SageMaker Python SDK `__ +#. `Using MXNet `__ +#. `Using TensorFlow `__ +#. `Using Chainer `__ +#. `Using PyTorch `__ +#. `Using Scikit-learn `__ +#. `Using XGBoost `__ +#. `SageMaker Reinforcement Learning Estimators `__ +#. `SageMaker SparkML Serving <#sagemaker-sparkml-serving>`__ +#. `Amazon SageMaker Built-in Algorithm Estimators `__ +#. `Using SageMaker AlgorithmEstimators `__ +#. `Consuming SageMaker Model Packages `__ +#. `BYO Docker Containers with SageMaker Estimators `__ +#. `SageMaker Automatic Model Tuning `__ +#. `SageMaker Batch Transform `__ +#. `Secure Training and Inference with VPC `__ +#. `BYO Model `__ +#. `Inference Pipelines `__ +#. `Amazon SageMaker Operators in Apache Airflow `__ +#. `SageMaker Autopilot `__ +#. `Model Monitoring `__ +#. `SageMaker Debugger `__ +#. `SageMaker Processing `__ + +🚀 Model Fine-Tuning Support Now Available in V3 +------------------------------------------------- + +We're excited to announce model fine-tuning capabilities in SageMaker Python SDK V3! + +**What's New** + +Four new trainer classes for fine-tuning foundation models: + +* SFTTrainer - Supervised fine-tuning +* DPOTrainer - Direct preference optimization +* RLAIFTrainer - RL from AI feedback +* RLVRTrainer - RL from verifiable rewards + +**Quick Example** + +.. code:: python + + from sagemaker.train import SFTTrainer + from sagemaker.train.common import TrainingType + + trainer = SFTTrainer( + model="meta-llama/Llama-2-7b-hf", + training_type=TrainingType.LORA, + model_package_group_name="my-models", + training_dataset="s3://bucket/train.jsonl" + ) + + training_job = trainer.train() + +**Key Features** + +* ✨ LoRA & full fine-tuning +* 📊 MLflow integration with real-time metrics +* 🚀 Deploy to SageMaker or Bedrock +* 📈 Built-in evaluation (11 benchmarks) +* ☁️ Serverless training + +**Get Started** + +.. 
code:: python + + pip install sagemaker>=3.1.0 + +`📓 Example notebooks `__ \ No newline at end of file diff --git a/VERSION b/VERSION index 1facf61117..a5c4c76339 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.77.2.dev0 +3.9.0 diff --git a/bedrock-modelbuilder-deployment-nova.ipynb b/bedrock-modelbuilder-deployment-nova.ipynb new file mode 100644 index 0000000000..ba50e67ef0 --- /dev/null +++ b/bedrock-modelbuilder-deployment-nova.ipynb @@ -0,0 +1,568 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Bedrock ModelBuilder Example\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2026/03/16 14:33:32 Refreshing aws credentials for default\n", + "2026/03/16 14:33:33 Successfully refreshed aws credentials for default\n" + ] + } + ], + "source": [ + "# Configure AWS credentials and region\n", + "! ada credentials update --provider=isengard --account=099324990371 --role=Admin --profile=default --once\n", + "! aws configure set region us-east-1" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[03/16/26 14:33:37] INFO     Found credentials in shared credentials file: ~/.aws/credentials   credentials.py:1392\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[03/16/26 14:33:37]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Found credentials in shared credentials file: ~\u001b[38;2;225;0;225m/.aws/\u001b[0m\u001b[38;2;225;0;225mcredentials\u001b[0m \u001b]8;id=778733;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=885530;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/botocore/credentials.py#1392\u001b\\\u001b[2m1392\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml\n", + "sagemaker.config INFO - Not applying SDK defaults from location: /Users/twillit/Library/Application Support/sagemaker/config.yaml\n" + ] + } + ], + "source": [ + "# Setup\n", + "import boto3\n", + "import json\n", + "import time\n", + "import random\n", + "from sagemaker.core.resources import TrainingJob\n", + "from sagemaker.serve.bedrock_model_builder import BedrockModelBuilder" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration\n", + "TRAINING_JOB_NAME = 'my-lora-run-tpnld-1773683343850'\n", + "ROLE_ARN = \"arn:aws:iam::099324990371:role/AmazonSageMaker-ExecutionRole-20260219T233135\"\n", + "REGION = 'us-east-1'\n", + "BUCKET = 'sagemaker-us-east-1-099324990371'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[03/16/26 14:33:39] WARNING  No region provided. Using default region.                                 utils.py:356\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[03/16/26 14:33:39]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;215;175;0mWARNING \u001b[0m No region provided. Using default region. \u001b]8;id=432135;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/sagemaker/core/utils/utils.py\u001b\\\u001b[2mutils.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=278513;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/sagemaker/core/utils/utils.py#356\u001b\\\u001b[2m356\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    INFO     Runs on sagemaker prod, region:us-east-1                                  utils.py:370\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Runs on sagemaker prod, region:us-east-\u001b[1;36m1\u001b[0m \u001b]8;id=134958;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/sagemaker/core/utils/utils.py\u001b\\\u001b[2mutils.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=705239;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/sagemaker/core/utils/utils.py#370\u001b\\\u001b[2m370\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    INFO     Found credentials in shared credentials file: ~/.aws/credentials   credentials.py:1392\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Found credentials in shared credentials file: ~\u001b[38;2;225;0;225m/.aws/\u001b[0m\u001b[38;2;225;0;225mcredentials\u001b[0m \u001b]8;id=665312;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=477402;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/botocore/credentials.py#1392\u001b\\\u001b[2m1392\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training job status: Completed\n", + "Using HF model path: s3://sagemaker-us-east-1-099324990371/model-customization/output-artifacts/my-lora-run-tpnld-1773683343850/output/model/checkpoints/hf_merged/\n" + ] + } + ], + "source": [ + "# Step 1: Get training job and prepare model path\n", + "training_job = TrainingJob.get(training_job_name=TRAINING_JOB_NAME)\n", + "print(f\"Training job status: {training_job.training_job_status}\")\n", + "\n", + "# Use the hf_merged directory which has complete HuggingFace format\n", + "base_s3_path = training_job.model_artifacts.s3_model_artifacts\n", + "hf_model_path = base_s3_path.rstrip('/') + '/checkpoints/hf_merged/'\n", + "print(f\"Using HF model path: {hf_model_path}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[03/16/26 14:33:40] INFO     Found credentials in shared credentials file: ~/.aws/credentials   credentials.py:1392\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[03/16/26 14:33:40]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Found credentials in shared credentials file: ~\u001b[38;2;225;0;225m/.aws/\u001b[0m\u001b[38;2;225;0;225mcredentials\u001b[0m \u001b]8;id=44052;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=307088;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/botocore/credentials.py#1392\u001b\\\u001b[2m1392\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking required files:\n", + "❌ config.json - MISSING\n", + "❌ tokenizer.json - MISSING\n", + "❌ tokenizer_config.json - MISSING\n", + "❌ model.safetensors - MISSING\n" + ] + } + ], + "source": [ + "# Step 2: Verify required files exist\n", + "s3_client = boto3.client('s3', region_name=REGION)\n", + "\n", + "required_files = ['config.json', 'tokenizer.json', 'tokenizer_config.json', 'model.safetensors']\n", + "model_prefix = hf_model_path.replace(f's3://{BUCKET}/', '')\n", + "\n", + "print(\"Checking required files:\")\n", + "for file in required_files:\n", + " try:\n", + " s3_client.head_object(Bucket=BUCKET, Key=model_prefix + file)\n", + " print(f\"✅ {file}\")\n", + " except:\n", + " print(f\"❌ {file} - MISSING\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ added_tokens.json exists\n" + ] + } + ], + "source": [ + "# Step 3: Create missing tokenizer files if needed\n", + "def ensure_tokenizer_files():\n", + " # Create added_tokens.json (usually empty for Llama)\n", + " try:\n", + " s3_client.head_object(Bucket=BUCKET, Key=model_prefix + 'added_tokens.json')\n", + " print(\"✅ added_tokens.json exists\")\n", + " except:\n", + " s3_client.put_object(\n", + " Bucket=BUCKET,\n", + " Key=model_prefix + 'added_tokens.json',\n", + " Body=json.dumps({}),\n", + " ContentType='application/json'\n", + " )\n", + " print(\"✅ Created added_tokens.json\")\n", + "\n", + "ensure_tokenizer_files()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking S3 structure...\n", + "Base prefix: model-customization/output-artifacts/my-lora-run-tpnld-1773683343850/output/model\n", + "Contents:\n", + "\n", + "Checking hf_merged path: model-customization/output-artifacts/my-lora-run-tpnld-1773683343850/output/model/checkpoints/hf_merged/\n", + "Files in hf_merged:\n", + " added_tokens.json\n", + "✅ Copied added_tokens.json\n" + ] + } + ], + "source": [ + "# Debug: Check what's actually in the S3 bucket\n", + "print(\"Checking S3 structure...\")\n", + "base_prefix = base_s3_path.replace(f's3://{BUCKET}/', '')\n", + "print(f\"Base prefix: {base_prefix}\")\n", + "\n", + "# List files to see the actual structure\n", + "response = s3_client.list_objects_v2(\n", + " Bucket=BUCKET,\n", + " Prefix=base_prefix,\n", + " Delimiter='/'\n", + ")\n", + "\n", + "print(\"Contents:\")\n", + "if 'Contents' in response:\n", + " for obj in response['Contents'][:10]: # Show first 10 files\n", + " print(f\" {obj['Key']}\")\n", + "\n", + "# Check specifically for hf_merged directory\n", + "hf_merged_prefix = base_prefix.rstrip('/') + 
'/checkpoints/hf_merged/'\n", + "print(f\"\\nChecking hf_merged path: {hf_merged_prefix}\")\n", + "\n", + "try:\n", + " response = s3_client.list_objects_v2(Bucket=BUCKET, Prefix=hf_merged_prefix)\n", + " if 'Contents' in response:\n", + " print(\"Files in hf_merged:\")\n", + " for obj in response['Contents']:\n", + " file_name = obj['Key'].replace(hf_merged_prefix, '')\n", + " print(f\" {file_name}\")\n", + " \n", + " # Now copy with correct paths\n", + " for obj in response['Contents']:\n", + " source_key = obj['Key']\n", + " file_name = source_key.replace(hf_merged_prefix, '')\n", + " dest_key = base_prefix.rstrip('/') + '/' + file_name\n", + " \n", + " try:\n", + " s3_client.copy_object(\n", + " Bucket=BUCKET,\n", + " CopySource={'Bucket': BUCKET, 'Key': source_key},\n", + " Key=dest_key\n", + " )\n", + " print(f\"✅ Copied {file_name}\")\n", + " except Exception as e:\n", + " print(f\"❌ Failed to copy {file_name}: {e}\")\n", + " else:\n", + " print(\"No files found in hf_merged directory\")\n", + "except Exception as e:\n", + " print(f\"Error: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Job name: bedrock-nova-import-4982\n" + ] + }, + { + "data": { + "text/html": [ + "
[03/16/26 14:33:41] INFO     S3 artifacts path:                                        bedrock_model_builder.py:212\n",
+       "                             s3://sagemaker-us-east-1-099324990371/model-customization                             \n",
+       "                             /output-artifacts/my-lora-run-tpnld-1773683343850/output/                             \n",
+       "                             model                                                                                 \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[03/16/26 14:33:41]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m S3 artifacts path: \u001b]8;id=917035;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/sagemaker/serve/bedrock_model_builder.py\u001b\\\u001b[2mbedrock_model_builder.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=848667;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/sagemaker/serve/bedrock_model_builder.py#212\u001b\\\u001b[2m212\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m s3:\u001b[38;2;225;0;225m/\u001b[0m\u001b[38;2;225;0;225m/sagemaker-us-east-1-099324990371/model-customization\u001b[0m \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m \u001b[38;2;225;0;225m/output-artifacts/my-lora-run-tpnld-1773683343850/output/\u001b[0m \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m \u001b[38;2;225;0;225mmodel\u001b[0m \u001b[2m \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    INFO     Manifest path:                                            bedrock_model_builder.py:219\n",
+       "                             s3://sagemaker-us-east-1-099324990371/model-customization                             \n",
+       "                             /output-artifacts/my-lora-run-tpnld-1773683343850/output/                             \n",
+       "                             output/manifest.json                                                                  \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Manifest path: \u001b]8;id=352000;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/sagemaker/serve/bedrock_model_builder.py\u001b\\\u001b[2mbedrock_model_builder.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=280360;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/sagemaker/serve/bedrock_model_builder.py#219\u001b\\\u001b[2m219\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m s3:\u001b[38;2;225;0;225m/\u001b[0m\u001b[38;2;225;0;225m/sagemaker-us-east-1-099324990371/model-customization\u001b[0m \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m \u001b[38;2;225;0;225m/output-artifacts/my-lora-run-tpnld-1773683343850/output/\u001b[0m \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m \u001b[38;2;225;0;225moutput/\u001b[0m\u001b[38;2;225;0;225mmanifest.json\u001b[0m \u001b[2m \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    INFO     Looking for manifest at                                   bedrock_model_builder.py:226\n",
+       "                             s3://sagemaker-us-east-1-099324990371/model-customization                             \n",
+       "                             /output-artifacts/my-lora-run-tpnld-1773683343850/output/                             \n",
+       "                             output/manifest.json                                                                  \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Looking for manifest at \u001b]8;id=681920;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/sagemaker/serve/bedrock_model_builder.py\u001b\\\u001b[2mbedrock_model_builder.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=157754;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/sagemaker/serve/bedrock_model_builder.py#226\u001b\\\u001b[2m226\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m s3:\u001b[38;2;225;0;225m/\u001b[0m\u001b[38;2;225;0;225m/sagemaker-us-east-1-099324990371/model-customization\u001b[0m \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m \u001b[38;2;225;0;225m/output-artifacts/my-lora-run-tpnld-1773683343850/output/\u001b[0m \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m \u001b[38;2;225;0;225moutput/\u001b[0m\u001b[38;2;225;0;225mmanifest.json\u001b[0m \u001b[2m \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[03/16/26 14:33:42] INFO     Manifest content: {'checkpoint_s3_bucket':                bedrock_model_builder.py:232\n",
+       "                             's3://customer-escrow-099324990371-smtj-cc62fd20/my-lora-                             \n",
+       "                             run-tpnld-1773683343850/896'}                                                         \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[03/16/26 14:33:42]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Manifest content: \u001b[1m{\u001b[0m\u001b[38;2;0;135;0m'checkpoint_s3_bucket'\u001b[0m: \u001b]8;id=350064;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/sagemaker/serve/bedrock_model_builder.py\u001b\\\u001b[2mbedrock_model_builder.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=173941;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/sagemaker/serve/bedrock_model_builder.py#232\u001b\\\u001b[2m232\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m \u001b[38;2;0;135;0m's3://customer-escrow-099324990371-smtj-cc62fd20/my-lora-\u001b[0m \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m \u001b[38;2;0;135;0mrun-tpnld-1773683343850/896'\u001b[0m\u001b[1m}\u001b[0m \u001b[2m \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    INFO     Checkpoint URI:                                           bedrock_model_builder.py:239\n",
+       "                             s3://customer-escrow-099324990371-smtj-cc62fd20/my-lora-r                             \n",
+       "                             un-tpnld-1773683343850/896                                                            \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Checkpoint URI: \u001b]8;id=265208;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/sagemaker/serve/bedrock_model_builder.py\u001b\\\u001b[2mbedrock_model_builder.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=611184;file:///Users/twillit/.local/share/mise/installs/python/3.12.6/lib/python3.12/site-packages/sagemaker/serve/bedrock_model_builder.py#239\u001b\\\u001b[2m239\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m s3:\u001b[38;2;225;0;225m/\u001b[0m\u001b[38;2;225;0;225m/customer-escrow-099324990371-smtj-cc62fd20/my-lora-r\u001b[0m \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m \u001b[38;2;225;0;225mun-tpnld-1773683343850/\u001b[0m\u001b[38;2;225;0;225m896\u001b[0m \u001b[2m \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "arn:aws:bedrock:us-east-1:099324990371:custom-model/imported/fyjoelpl3jra\n" + ] + } + ], + "source": [ + "# Step 4: Create Bedrock model builder and deploy\n", + "job_name = f\"bedrock-nova-import-{random.randint(1000, 9999)}\"\n", + "print(f\"Job name: {job_name}\")\n", + "\n", + "# Create builder with correct model path\n", + "bedrock_builder = BedrockModelBuilder(\n", + " model=training_job\n", + ")\n", + "\n", + "# Deploy to Bedrock\n", + "deployment_result = bedrock_builder.deploy(\n", + " job_name=job_name,\n", + " imported_model_name=job_name,\n", + " role_arn=ROLE_ARN,\n", + " custom_model_name=job_name\n", + ")\n", + "\n", + "model_arn = deployment_result['modelArn']\n", + "print(model_arn)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "model_arn = deployment_result['modelArn']" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model status: Creating\n", + "Model status: Creating\n", + "Model status: Creating\n", + "Model status: Creating\n", + "Model status: Creating\n", + "Model status: Creating\n", + "Model status: Creating\n", + "Model status: Creating\n", + "Model status: Creating\n", + "Model status: Active\n" + ] + } + ], + "source": [ + "# Create the custom model deployment\n", + "from uuid import uuid4\n", + "bedrock_client = boto3.client('bedrock', region_name=REGION)\n", + "\n", + "# Wait for model to be Active before deploying\n", + "while True:\n", + " status = bedrock_client.get_custom_model(modelIdentifier=model_arn).get(\"modelStatus\")\n", + " print(f\"Model status: {status}\")\n", + " if status == \"Active\":\n", + " break\n", + " if status == \"Failed\":\n", + " raise RuntimeError(\"Model creation failed\")\n", + " time.sleep(60)\n", + "\n", + "# Now safe to create deployment\n", + "deploy_resp = bedrock_client.create_custom_model_deployment(\n", + " modelDeploymentName=f\"deployment-{job_name}\",\n", + " modelArn=model_arn,\n", + " clientRequestToken=str(uuid4()),\n", + ")\n", + "deployment_arn = deploy_resp['customModelDeploymentArn']" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Waiting for deployment to complete...\n", + "Deployment status: Creating\n", + "Deployment status: Creating\n", + "Deployment status: Creating\n", + "Deployment status: Creating\n", + "Deployment status: 
Creating\n", + "Deployment status: Creating\n", + "Deployment status: Active\n" + ] + } + ], + "source": [ + "# Step 5: Wait for custom model creation to complete\n", + "\n", + "print(\"Waiting for deployment to complete...\")\n", + "while True:\n", + " status = bedrock_client.get_custom_model_deployment(\n", + " customModelDeploymentIdentifier=deployment_arn\n", + " ).get(\"status\")\n", + " print(f\"Deployment status: {status}\")\n", + " if status == \"Active\":\n", + " break\n", + " if status == \"Failed\":\n", + " raise RuntimeError(\"Deployment failed\")\n", + " time.sleep(30)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Inference Message: What is the capital of France?\n", + "Model Response: The capital of France is Paris. Paris is not only the political center of France but also a major cultural, historical, and economic hub. It is situated in the northern part of the country, along the Seine River. Paris is renowned for its iconic landmarks such as the Eiffel Tower, the Louvre Museum, Notre-Dame Cathedral, and the Champs-Élysées. The city is also famous for its influence on art, fashion, cuisine, and philosophy.\n" + ] + } + ], + "source": [ + "# Step 6: Test inference with correct format\n", + "bedrock_runtime = boto3.client(\"bedrock-runtime\", region_name=\"us-east-1\")\n", + "message = \"What is the capital of France?\"\n", + "print(f\"Model Inference Message: {message}\")\n", + "resp = bedrock_runtime.converse(\n", + " modelId=deployment_arn,\n", + " messages=[{\"role\": \"user\", \"content\": [{\"text\": message}]}],\n", + " inferenceConfig={\"maxTokens\": 100, \"temperature\": 0.7},\n", + ")\n", + "\n", + "response_str = resp[\"output\"][\"message\"][\"content\"][0][\"text\"]\n", + "print(f\"Model Response: {response_str}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/bin/README b/bin/README deleted file mode 100644 index 6accc729fc..0000000000 --- a/bin/README +++ /dev/null @@ -1,5 +0,0 @@ -Put your python scripts into this directory. - -Any script that has a shebang line with python in it and is executable -will be automatically included in your package. All others must be -declared explicitly in the setup.py file. 
diff --git a/boto3_deployment_notebook.ipynb b/boto3_deployment_notebook.ipynb new file mode 100644 index 0000000000..b3f9cab141 --- /dev/null +++ b/boto3_deployment_notebook.ipynb @@ -0,0 +1,1202 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 24, + "id": "d71538e8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sagemaker: 3.5.0\n", + "boto3: 1.42.58\n", + "botocore: 1.42.58\n" + ] + } + ], + "source": [ + "from importlib.metadata import version\n", + "print(f\"sagemaker: {version('sagemaker')}\")\n", + "print(f\"boto3: {version('boto3')}\")\n", + "print(f\"botocore: {version('botocore')}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "e8d62358", + "metadata": {}, + "outputs": [], + "source": [ + "# initialize constants\n", + "REGION = \"us-west-2\"\n", + "TRAINING_JOB_NAME = \"test-lora-training-1-1773273846617\"\n", + "INSTANCE_TYPE = \"ml.g5.8xlarge\"\n", + "ROLE=\"arn:aws:iam::099324990371:role/service-role/AmazonSageMaker-ExecutionRole-20260219T233135\"\n", + "\n", + "LMI_IMAGE_URI = \"763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.34.0-lmi16.0.0-cu128\"\n", + "LMI_IMAGE_URI_31 = f\"763104351884.dkr.ecr.{REGION}.amazonaws.com/djl-inference:0.31.0-lmi13.0.0-cu124\"\n", + "\n", + "BASE_MODEL_S3_URI = f\"s3://jumpstart-private-cache-prod-{REGION}/meta-textgeneration/meta-textgeneration-llama-3-2-1b-instruct/artifacts/inference-prepack/v1.0.0/\"\n", + "\n", + "import random\n", + "name_suffix = random.randint(100, 10000)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "3e654dac", + "metadata": {}, + "outputs": [], + "source": [ + "# initialize clients\n", + "import boto3\n", + "sm = boto3.client(\"sagemaker\", region_name=REGION)\n", + "sm_runtime = boto3.client(\"sagemaker-runtime\", region_name=REGION)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "3d0d573f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Adapter weights: s3://sagemaker-us-west-2-099324990371/model-customization/output-artifacts/test-lora-training-1-1773273846617/output/model/checkpoints/hf/\n" + ] + } + ], + "source": [ + "# get s3 artifact location\n", + "response = sm.describe_training_job(TrainingJobName=TRAINING_JOB_NAME)\n", + "model_s3_uri = response[\"ModelArtifacts\"][\"S3ModelArtifacts\"]\n", + "adapter_s3_uri = f\"{model_s3_uri}/checkpoints/hf/\"\n", + "print(f\" Adapter weights: {adapter_s3_uri}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be45afc0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "model: {'ModelArn': 'arn:aws:sagemaker:us-west-2:099324990371:model/model-1977', 'ResponseMetadata': {'RequestId': '510a2671-03c1-464a-81d1-1ad7f512a72d', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '510a2671-03c1-464a-81d1-1ad7f512a72d', 'strict-transport-security': 'max-age=47304000; includeSubDomains', 'x-frame-options': 'DENY', 'content-security-policy': \"frame-ancestors 'none'\", 'cache-control': 'no-cache, no-store, must-revalidate', 'x-content-type-options': 'nosniff', 'content-type': 'application/x-amz-json-1.1', 'content-length': '72', 'date': 'Thu, 12 Mar 2026 20:09:45 GMT'}, 'RetryAttempts': 0}}\n" + ] + } + ], + "source": [ + "# create model in SageMaker using boto3\n", + "model_name = f\"model-{name_suffix}\"\n", + "model = sm.create_model(\n", + " ModelName=model_name,\n", + " 
ExecutionRoleArn=ROLE,\n", + " PrimaryContainer={\n", + " \"Image\": LMI_IMAGE_URI,\n", + " \"Environment\": {\n", + " # Use lmi-dist for rolling batch — NOT \"disable\" (which requires the vllm entrypoint on 0.34.0+)\n", + " \"OPTION_ROLLING_BATCH\": \"lmi-dist\",\n", + " \"OPTION_ENABLE_LORA\": \"true\",\n", + " \"OPTION_MAX_LORAS\": \"8\",\n", + " \"OPTION_MAX_CPU_LORAS\": \"64\",\n", + " \"OPTION_MAX_LORA_RANK\": \"128\",\n", + " \"OPTION_MAX_ROLLING_BATCH_SIZE\": \"8\",\n", + " # Must match GPU count on the instance: \"1\" for g5.2xlarge, \"8\" for g6e.48xlarge\n", + " \"OPTION_TENSOR_PARALLEL_DEGREE\": \"1\",\n", + " \"OPTION_DTYPE\": \"fp16\",\n", + " \"OPTION_MAX_MODEL_LEN\": \"4096\",\n", + " },\n", + " # Load base model from JumpStart S3 cache — avoids needing HF_TOKEN for gated models\n", + " \"ModelDataSource\": {\n", + " \"S3DataSource\": {\n", + " \"S3Uri\": BASE_MODEL_S3_URI,\n", + " \"S3DataType\": \"S3Prefix\",\n", + " \"CompressionType\": \"None\",\n", + " \"ModelAccessConfig\": {\"AcceptEula\": True},\n", + " }\n", + " },\n", + " },\n", + ")\n", + "print(f\"model: {model}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e7da772", + "metadata": {}, + "outputs": [], + "source": [ + "# create model in SageMaker using ModelBuilder\n", + "from sagemaker.core.resources import TrainingJob\n", + "from sagemaker.serve import ModelBuilder\n", + "\n", + "training_job = TrainingJob.get(training_job_name=TRAINING_JOB_NAME)\n", + "print(f\"model package arn: {training_job.output_model_package_arn}\")\n", + "\n", + "model_name = f\"model-{name_suffix}\"\n", + "model_builder = ModelBuilder(model=training_job, role_arn=ROLE, instance_type=INSTANCE_TYPE)\n", + "model = model_builder.build(model_name=model_name)\n", + "print(f\"model arn: {model.model_arn}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "13eefb5c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "endpoint config: {'EndpointConfigArn': 'arn:aws:sagemaker:us-west-2:099324990371:endpoint-config/e2e-1977', 'ResponseMetadata': {'RequestId': 'ded5b4e4-1b3e-49c2-97c2-25bb7a11b2a4', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'ded5b4e4-1b3e-49c2-97c2-25bb7a11b2a4', 'strict-transport-security': 'max-age=47304000; includeSubDomains', 'x-frame-options': 'DENY', 'content-security-policy': \"frame-ancestors 'none'\", 'cache-control': 'no-cache, no-store, must-revalidate', 'x-content-type-options': 'nosniff', 'content-type': 'application/x-amz-json-1.1', 'content-length': '89', 'date': 'Thu, 12 Mar 2026 20:10:22 GMT'}, 'RetryAttempts': 0}}\n", + "endpoint: {'EndpointArn': 'arn:aws:sagemaker:us-west-2:099324990371:endpoint/e2e-1977', 'ResponseMetadata': {'RequestId': 'ca82b575-df5e-48f7-ba9b-74de1137d1ef', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'ca82b575-df5e-48f7-ba9b-74de1137d1ef', 'strict-transport-security': 'max-age=47304000; includeSubDomains', 'x-frame-options': 'DENY', 'content-security-policy': \"frame-ancestors 'none'\", 'cache-control': 'no-cache, no-store, must-revalidate', 'x-content-type-options': 'nosniff', 'content-type': 'application/x-amz-json-1.1', 'content-length': '76', 'date': 'Thu, 12 Mar 2026 20:10:23 GMT'}, 'RetryAttempts': 0}}\n" + ] + } + ], + "source": [ + "# create endpoint\n", + "endpoint_name = f\"e2e-{name_suffix}\"\n", + "ep_config = sm.create_endpoint_config(\n", + " EndpointConfigName=endpoint_name,\n", + " ExecutionRoleArn=ROLE,\n", + " ProductionVariants=[\n", + " 
{\n", + " \"VariantName\": \"AllTraffic\",\n", + " \"InstanceType\": INSTANCE_TYPE,\n", + " \"InitialInstanceCount\": 1,\n", + " }\n", + " ],\n", + ")\n", + "print(f\"endpoint config: {ep_config}\")\n", + "\n", + "endpoint = sm.create_endpoint(EndpointName=endpoint_name, EndpointConfigName=endpoint_name)\n", + "print(f\"endpoint: {endpoint}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "dbf2a262", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "base inference component: {'InferenceComponentArn': 'arn:aws:sagemaker:us-west-2:099324990371:inference-component/e2e-1977-inference-component', 'ResponseMetadata': {'RequestId': 'c24a105b-762b-4aa1-b17c-ae71adfd0df2', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'c24a105b-762b-4aa1-b17c-ae71adfd0df2', 'strict-transport-security': 'max-age=47304000; includeSubDomains', 'x-frame-options': 'DENY', 'content-security-policy': \"frame-ancestors 'none'\", 'cache-control': 'no-cache, no-store, must-revalidate', 'x-content-type-options': 'nosniff', 'content-type': 'application/x-amz-json-1.1', 'content-length': '117', 'date': 'Thu, 12 Mar 2026 20:10:28 GMT'}, 'RetryAttempts': 0}}\n" + ] + } + ], + "source": [ + "# create base model inference component\n", + "base_ic_name = f\"{endpoint_name}-inference-component\"\n", + "base_inference_component = sm.create_inference_component(\n", + " InferenceComponentName=base_ic_name,\n", + " EndpointName=endpoint_name,\n", + " VariantName=\"AllTraffic\",\n", + " Specification={\n", + " \"ModelName\": model_name,\n", + " \"ComputeResourceRequirements\": {\n", + " \"MinMemoryRequiredInMb\": 4096,\n", + " \"NumberOfAcceleratorDevicesRequired\": 1,\n", + " },\n", + " },\n", + " RuntimeConfig={\"CopyCount\": 1},\n", + ")\n", + "print(f\"base inference component: {base_inference_component}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "df74cb13", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "adapter inference component: {'InferenceComponentArn': 'arn:aws:sagemaker:us-west-2:099324990371:inference-component/e2e-1977-adapter', 'ResponseMetadata': {'RequestId': 'fd3e04bf-1b48-4211-bd92-16d3d67cc4df', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'fd3e04bf-1b48-4211-bd92-16d3d67cc4df', 'strict-transport-security': 'max-age=47304000; includeSubDomains', 'x-frame-options': 'DENY', 'content-security-policy': \"frame-ancestors 'none'\", 'cache-control': 'no-cache, no-store, must-revalidate', 'x-content-type-options': 'nosniff', 'content-type': 'application/x-amz-json-1.1', 'content-length': '105', 'date': 'Thu, 12 Mar 2026 20:39:33 GMT'}, 'RetryAttempts': 0}}\n" + ] + } + ], + "source": [ + "# create adapter inference component\n", + "endpoint_name = \"e2e-1977\"\n", + "base_ic_name = \"e2e-1977-inference-component\"\n", + "\n", + "adapter_ic_name = f\"{endpoint_name}-adapter\"\n", + "adapter_inference_component = sm.create_inference_component(\n", + " InferenceComponentName=adapter_ic_name,\n", + " EndpointName=endpoint_name,\n", + " Specification={\n", + " \"BaseInferenceComponentName\": base_ic_name,\n", + " \"Container\": {\"ArtifactUrl\": adapter_s3_uri},\n", + " },\n", + ")\n", + "print(f\"adapter inference component: {adapter_inference_component}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "e0823def", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Response: 
{'generated_text': ' Paris. The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of France is Paris. The capital of France is Paris'}\n" + ] + } + ], + "source": [ + "# test inference on base model inference component\n", + "import json\n", + "payload = json.dumps({\"inputs\": \"What is the capital of France?\", \"parameters\": {\"max_new_tokens\": 50}})\n", + "base_model_response = sm_runtime.invoke_endpoint(\n", + " EndpointName=endpoint_name,\n", + " InferenceComponentName=base_ic_name,\n", + " Body=payload,\n", + " ContentType=\"application/json\",\n", + ")\n", + "result = json.loads(base_model_response[\"Body\"].read().decode())\n", + "print(f\"Response: {result}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "79b05339", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'CreationTime': datetime.datetime(2026, 3, 12, 9, 50, 14, 970000, tzinfo=tzlocal()),\n", + " 'EndpointArn': 'arn:aws:sagemaker:us-west-2:099324990371:endpoint/e2e-5429',\n", + " 'EndpointName': 'e2e-5429',\n", + " 'InferenceComponentArn': 'arn:aws:sagemaker:us-west-2:099324990371:inference-component/e2e-5429-inference-component',\n", + " 'InferenceComponentName': 'e2e-5429-inference-component',\n", + " 'InferenceComponentStatus': 'InService',\n", + " 'LastModifiedTime': datetime.datetime(2026, 3, 12, 9, 55, 36, 65000, tzinfo=tzlocal()),\n", + " 'ResponseMetadata': {'HTTPHeaders': {'cache-control': 'no-cache, no-store, '\n", + " 'must-revalidate',\n", + " 'content-length': '973',\n", + " 'content-security-policy': 'frame-ancestors '\n", + " \"'none'\",\n", + " 'content-type': 'application/x-amz-json-1.1',\n", + " 'date': 'Thu, 12 Mar 2026 21:02:01 GMT',\n", + " 'strict-transport-security': 'max-age=47304000; '\n", + " 'includeSubDomains',\n", + " 'x-amzn-requestid': '36537af2-0b32-4c7b-a283-8d0f825d5bd1',\n", + " 'x-content-type-options': 'nosniff',\n", + " 'x-frame-options': 'DENY'},\n", + " 'HTTPStatusCode': 200,\n", + " 'RequestId': '36537af2-0b32-4c7b-a283-8d0f825d5bd1',\n", + " 'RetryAttempts': 0},\n", + " 'RuntimeConfig': {'CurrentCopyCount': 1, 'DesiredCopyCount': 1},\n", + " 'Specification': {'ComputeResourceRequirements': {'MinMemoryRequiredInMb': 4096,\n", + " 'NumberOfAcceleratorDevicesRequired': 1.0,\n", + " 'NumberOfCpuCoresRequired': 8.0},\n", + " 'Container': {'DeployedImage': {'ResolutionTime': datetime.datetime(2026, 3, 12, 9, 50, 15, 707000, tzinfo=tzlocal()),\n", + " 'ResolvedImage': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference@sha256:4979ff55ba85b9b525333016fde63fa3d709567d1bbf02c486e963bdc0d48b7b',\n", + " 'SpecifiedImage': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.34.0-lmi16.0.0-cu128'}},\n", + " 'DataCacheConfig': {'EnableCaching': True}},\n", + " 'VariantName': 'e2e-5429'}\n" + ] + } + ], + "source": [ + "from pprint import pprint\n", + "\n", + "# resp = sm.describe_inference_component(InferenceComponentName=adapter_ic_name)\n", + "resp = sm.describe_inference_component(InferenceComponentName=\"e2e-5429-inference-component\")\n", + "# resp = sm.describe_endpoint(EndpointName=\"e2e-5429\")\n", + "\n", + "# status = resp[\"InferenceComponentStatus\"]\n", + "# print(f\"Status: {status}\")\n", + "pprint(resp)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df316e99", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", 
+ "text": [ + " Response: {'generated_text': ' Paris.\\nWhat is the capital of the United States? Washington, D.C.\\nWhat is the capital of the United Kingdom? London.\\nWhat is the capital of Australia? Canberra.\\nWhat is the capital of Canada? Ottawa.\\nWhat is the capital of'}\n" + ] + } + ], + "source": [ + "# test inference on adapter inference component\n", + "import json\n", + "payload = json.dumps({\"inputs\": \"What is the capital of France?\", \"parameters\": {\"max_new_tokens\": 50}})\n", + "adapter_response = sm_runtime.invoke_endpoint(\n", + " EndpointName=\"e2e-5429\", #endpoint_name,\n", + " InferenceComponentName=\"e2e-5429-inference-component\", #adapter_ic_name,\n", + " Body=payload,\n", + " ContentType=\"application/json\",\n", + ")\n", + "result = json.loads(adapter_response[\"Body\"].read().decode())\n", + "print(f\" Response: {result}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "df2b7dfa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "meta-textgeneration-llama-3-2-1b-instruct\n", + "{'Capabilities': ['TRAINING', 'FINE_TUNING', 'VALIDATION', 'CUSTOMIZATION'],\n", + " 'ContainerStartupHealthCheckTimeout': 1200,\n", + " 'ContextualHelp': {'HubDefaultTrainData': [\"Dataset: [OpenAssistant's TOP-1 \"\n", + " 'Conversation '\n", + " 'Threads](https://huggingface.co/datasets/OpenAssistant/oasst_top1_2023-08-25)',\n", + " \"OpenAssistant's TOP-1 \"\n", + " 'Conversation Threads dataset '\n", + " 'contains roughly 13,000 samples '\n", + " 'of conversations between the '\n", + " 'Assistant and the user.',\n", + " 'License: [Apache '\n", + " '2.0](https://jumpstart-cache-prod-us-east-2.s3-us-east-2.amazonaws.com/licenses/Apache-License/LICENSE-2.0.txt)'],\n", + " 'HubFormatTrainData': ['A train and an optional validation '\n", + " 'directories. Each directory '\n", + " 'contains a jsonl. 
',\n", + " ' [Learn how to setup an AWS S3 '\n", + " 'bucket.](https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingBucket.html)']},\n", + " 'DataType': 'text',\n", + " 'DefaultInferenceInstanceType': 'ml.g6.xlarge',\n", + " 'DefaultPayloads': {'emojisBeijing': {'Body': {'inputs': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\\n'\n", + " '\\n'\n", + " 'Always answer with '\n", + " 'emojis<|eot_id|><|start_header_id|>user<|end_header_id|>\\n'\n", + " '\\n'\n", + " 'How to go from '\n", + " 'Beijing to '\n", + " 'NY?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n'\n", + " '\\n',\n", + " 'parameters': {'max_new_tokens': 256,\n", + " 'temperature': 0.6,\n", + " 'top_p': 0.9}},\n", + " 'ContentType': 'application/json',\n", + " 'OutputKeys': {'generated_text': 'generated_text'},\n", + " 'PromptKey': 'inputs'},\n", + " 'mayonnaise': {'Body': {'inputs': '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\\n'\n", + " '\\n'\n", + " 'what is the recipe of '\n", + " 'mayonnaise?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n'\n", + " '\\n',\n", + " 'parameters': {'details': True,\n", + " 'max_new_tokens': 256,\n", + " 'temperature': 0.6,\n", + " 'top_p': 0.9}},\n", + " 'ContentType': 'application/json',\n", + " 'OutputKeys': {'generated_text': 'generated_text'},\n", + " 'PromptKey': 'inputs'},\n", + " 'parisHaiku': {'Body': {'inputs': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\\n'\n", + " '\\n'\n", + " 'Always answer with '\n", + " 'Haiku<|eot_id|><|start_header_id|>user<|end_header_id|>\\n'\n", + " '\\n'\n", + " 'I am going to Paris, '\n", + " 'what should I '\n", + " 'see?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n'\n", + " '\\n',\n", + " 'parameters': {'max_new_tokens': 256,\n", + " 'temperature': 0.6,\n", + " 'top_p': 0.9}},\n", + " 'ContentType': 'application/json',\n", + " 'OutputKeys': {'generated_text': 'generated_text'},\n", + " 'PromptKey': 'inputs'},\n", + " 'parisTrip': {'Body': {'inputs': '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\\n'\n", + " '\\n'\n", + " 'I am going to Paris, '\n", + " 'what should I '\n", + " 'see?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n'\n", + " '\\n'\n", + " 'Paris, the capital of '\n", + " 'France, is known for '\n", + " 'its stunning '\n", + " 'architecture, art '\n", + " 'museums, historical '\n", + " 'landmarks, and romantic '\n", + " 'atmosphere. Here are '\n", + " 'some of the top '\n", + " 'attractions to see in '\n", + " 'Paris:\\n'\n", + " '\\n'\n", + " '1. The Eiffel Tower: '\n", + " 'The iconic Eiffel Tower '\n", + " 'is one of the most '\n", + " 'recognizable landmarks '\n", + " 'in the world and offers '\n", + " 'breathtaking views of '\n", + " 'the city.\\n'\n", + " '2. The Louvre Museum: '\n", + " 'The Louvre is one of '\n", + " \"the world's largest and \"\n", + " 'most famous museums, '\n", + " 'housing an impressive '\n", + " 'collection of art and '\n", + " 'artifacts, including '\n", + " 'the Mona Lisa.\\n'\n", + " '3. Notre-Dame '\n", + " 'Cathedral: This '\n", + " 'beautiful cathedral is '\n", + " 'one of the most famous '\n", + " 'landmarks in Paris and '\n", + " 'is known for its Gothic '\n", + " 'architecture and '\n", + " 'stunning stained glass '\n", + " 'windows.\\n'\n", + " '\\n'\n", + " 'These are just a few of '\n", + " 'the many attractions '\n", + " 'that Paris has to '\n", + " 'offer. 
With so much to '\n", + " \"see and do, it's no \"\n", + " 'wonder that Paris is '\n", + " 'one of the most popular '\n", + " 'tourist destinations in '\n", + " 'the '\n", + " 'world.<|eot_id|><|start_header_id|>user<|end_header_id|>\\n'\n", + " '\\n'\n", + " 'What is so great about '\n", + " '#1?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n'\n", + " '\\n',\n", + " 'parameters': {'max_new_tokens': 256,\n", + " 'temperature': 0.6,\n", + " 'top_p': 0.9}},\n", + " 'ContentType': 'application/json',\n", + " 'OutputKeys': {'generated_text': 'generated_text'},\n", + " 'PromptKey': 'inputs'}},\n", + " 'DefaultTrainingDatasetUri': 's3://jumpstart-cache-prod-us-west-2/training-datasets/oasst_top/train/',\n", + " 'DefaultTrainingInstanceType': 'ml.g5.2xlarge',\n", + " 'Dependencies': [],\n", + " 'DisableOutputCompression': True,\n", + " 'DynamicContainerDeploymentSupported': True,\n", + " 'EncryptInterContainerTraffic': True,\n", + " 'FineTuningSupported': True,\n", + " 'Framework': 'meta',\n", + " 'GatedBucket': True,\n", + " 'HostingArtifactCompressionType': 'None',\n", + " 'HostingArtifactS3DataType': 'S3Prefix',\n", + " 'HostingArtifactUri': 's3://jumpstart-private-cache-prod-us-west-2/meta-textgeneration/meta-textgeneration-llama-3-2-1b-instruct/artifacts/inference-prepack/v1.0.0/',\n", + " 'HostingEcrSpecs': {'Framework': 'djl-lmi-18',\n", + " 'FrameworkVersion': '0.36.0',\n", + " 'PyVersion': 'py310'},\n", + " 'HostingEcrUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.36.0-lmi18.0.0-cu128',\n", + " 'HostingEulaUri': 's3://jumpstart-cache-prod-us-west-2/fmhMetadata/eula/llama3_2Eula.txt',\n", + " 'HostingInstanceTypeVariants': {'Variants': {'g4dn': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.36.0-lmi18.0.0-cu128'}},\n", + " 'g5': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.36.0-lmi18.0.0-cu128'}},\n", + " 'g6': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.36.0-lmi18.0.0-cu128'}},\n", + " 'g6e': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.36.0-lmi18.0.0-cu128'}},\n", + " 'local_gpu': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.36.0-lmi18.0.0-cu128'}},\n", + " 'ml.g5.12xlarge': {'Properties': {'ResourceRequirements': {'MinMemoryMb': 98304,\n", + " 'NumAccelerators': 4}}},\n", + " 'ml.g5.16xlarge': {'Properties': {'ResourceRequirements': {'MinMemoryMb': 131072,\n", + " 'NumAccelerators': 1}}},\n", + " 'ml.g5.24xlarge': {'Properties': {'ResourceRequirements': {'MinMemoryMb': 196608,\n", + " 'NumAccelerators': 4}}},\n", + " 'ml.g5.2xlarge': {'Properties': {'ResourceRequirements': {'MinMemoryMb': 16384,\n", + " 'NumAccelerators': 1}}},\n", + " 'ml.g5.48xlarge': {'Properties': {'ResourceRequirements': {'MinMemoryMb': 393216,\n", + " 'NumAccelerators': 8}}},\n", + " 'ml.g5.4xlarge': {'Properties': {'ResourceRequirements': {'MinMemoryMb': 32768,\n", + " 'NumAccelerators': 1}}},\n", + " 'ml.g5.8xlarge': {'Properties': {'ResourceRequirements': {'MinMemoryMb': 65536,\n", + " 'NumAccelerators': 1}}},\n", + " 'ml.g5.xlarge': {'Properties': {'ResourceRequirements': {'MinMemoryMb': 8192,\n", + " 'NumAccelerators': 1}}},\n", + " 'ml.g6.12xlarge': {'Properties': {'ResourceRequirements': {'MinMemoryMb': 98304,\n", + " 'NumAccelerators': 4}}},\n", + " 'ml.g6.16xlarge': {'Properties': {'ResourceRequirements': {'MinMemoryMb': 131072,\n", + " 
'NumAccelerators': 1}}},\n", + " 'ml.g6.24xlarge': {'Properties': {'ResourceRequirements': {'MinMemoryMb': 196608,\n", + " 'NumAccelerators': 4}}},\n", + " 'ml.g6.2xlarge': {'Properties': {'ResourceRequirements': {'MinMemoryMb': 16384,\n", + " 'NumAccelerators': 1}}},\n", + " 'ml.g6.48xlarge': {'Properties': {'ResourceRequirements': {'MinMemoryMb': 393216,\n", + " 'NumAccelerators': 8}}},\n", + " 'ml.g6.4xlarge': {'Properties': {'ResourceRequirements': {'MinMemoryMb': 32768,\n", + " 'NumAccelerators': 1}}},\n", + " 'ml.g6.8xlarge': {'Properties': {'ResourceRequirements': {'MinMemoryMb': 65536,\n", + " 'NumAccelerators': 1}}},\n", + " 'ml.g6.xlarge': {'Properties': {'ResourceRequirements': {'MinMemoryMb': 8192,\n", + " 'NumAccelerators': 1}}},\n", + " 'ml.p4d.24xlarge': {'Properties': {'ResourceRequirements': {'MinMemoryMb': 589824,\n", + " 'NumAccelerators': 8}}},\n", + " 'ml.p5.48xlarge': {'Properties': {'ResourceRequirements': {'MinMemoryMb': 1048576,\n", + " 'NumAccelerators': 8}}},\n", + " 'p2': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.36.0-lmi18.0.0-cu128'}},\n", + " 'p3': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.36.0-lmi18.0.0-cu128'}},\n", + " 'p3dn': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.36.0-lmi18.0.0-cu128'}},\n", + " 'p4d': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.36.0-lmi18.0.0-cu128'}},\n", + " 'p4de': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.36.0-lmi18.0.0-cu128'}},\n", + " 'p5': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.36.0-lmi18.0.0-cu128'}},\n", + " 'p5e': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.36.0-lmi18.0.0-cu128'}},\n", + " 'p5en': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.36.0-lmi18.0.0-cu128'}},\n", + " 'p6': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.36.0-lmi18.0.0-cu128'}},\n", + " 'p6e': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.36.0-lmi18.0.0-cu128'}}}},\n", + " 'HostingResourceRequirements': {'MinMemoryMb': 8192, 'NumAccelerators': 1},\n", + " 'HostingScriptUri': 's3://jumpstart-cache-prod-us-west-2/source-directory-tarballs/meta/inference/textgeneration/v1.2.3/sourcedir.tar.gz',\n", + " 'HostingUseScriptUri': False,\n", + " 'Hyperparameters': [{'Default': 'False',\n", + " 'Name': 'int8_quantization',\n", + " 'Options': ['True', 'False'],\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'text'},\n", + " {'Default': 'True',\n", + " 'Name': 'enable_fsdp',\n", + " 'Options': ['True', 'False'],\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'text'},\n", + " {'Default': 1,\n", + " 'Max': 1000,\n", + " 'Min': 1,\n", + " 'Name': 'epoch',\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'int'},\n", + " {'Default': 0.0001,\n", + " 'Max': 1,\n", + " 'Min': 1e-08,\n", + " 'Name': 'learning_rate',\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'float'},\n", + " {'Default': 8,\n", + " 'Min': 1,\n", + " 'Name': 'lora_r',\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'int'},\n", + " {'Default': 32,\n", + " 'Min': 1,\n", + " 'Name': 'lora_alpha',\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'int'},\n", + " {'Default': 'q_proj,v_proj',\n", + " 'Name': 'target_modules',\n", + " 'Scope': 'algorithm',\n", + 
" 'Type': 'text'},\n", + " {'Default': 0.05,\n", + " 'Max': 1,\n", + " 'Min': 0,\n", + " 'Name': 'lora_dropout',\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'float'},\n", + " {'Default': 'False',\n", + " 'Name': 'instruction_tuned',\n", + " 'Options': ['True', 'False'],\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'text'},\n", + " {'Default': 'True',\n", + " 'Name': 'chat_dataset',\n", + " 'Options': ['True', 'False'],\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'text'},\n", + " {'Default': 'True',\n", + " 'Name': 'add_input_output_demarcation_key',\n", + " 'Options': ['True', 'False'],\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'text'},\n", + " {'Default': 1,\n", + " 'Max': 1000,\n", + " 'Min': 1,\n", + " 'Name': 'per_device_train_batch_size',\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'int'},\n", + " {'Default': 1,\n", + " 'Max': 1000,\n", + " 'Min': 1,\n", + " 'Name': 'per_device_eval_batch_size',\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'int'},\n", + " {'Default': -1,\n", + " 'Min': -1,\n", + " 'Name': 'max_train_samples',\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'int'},\n", + " {'Default': -1,\n", + " 'Min': -1,\n", + " 'Name': 'max_val_samples',\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'int'},\n", + " {'Default': 10,\n", + " 'Max': 1000,\n", + " 'Min': 1,\n", + " 'Name': 'seed',\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'int'},\n", + " {'Default': -1,\n", + " 'Min': -1,\n", + " 'Name': 'max_input_length',\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'int'},\n", + " {'Default': 0.2,\n", + " 'Max': 1,\n", + " 'Min': 0,\n", + " 'Name': 'validation_split_ratio',\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'float'},\n", + " {'Default': 0,\n", + " 'Min': 0,\n", + " 'Name': 'train_data_split_seed',\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'int'},\n", + " {'Default': 'None',\n", + " 'Name': 'preprocessing_num_workers',\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'text'},\n", + " {'Default': 'Llama3.1',\n", + " 'Name': 'chat_template',\n", + " 'Scope': 'algorithm',\n", + " 'Type': 'text'},\n", + " {'Default': '/opt/ml/input/data/code/sourcedir.tar.gz',\n", + " 'Name': 'sagemaker_submit_directory',\n", + " 'Scope': 'container',\n", + " 'Type': 'text'},\n", + " {'Default': 'transfer_learning.py',\n", + " 'Name': 'sagemaker_program',\n", + " 'Scope': 'container',\n", + " 'Type': 'text'},\n", + " {'Default': '20',\n", + " 'Name': 'sagemaker_container_log_level',\n", + " 'Scope': 'container',\n", + " 'Type': 'text'}],\n", + " 'IncrementalTrainingSupported': False,\n", + " 'InferenceDependencies': [],\n", + " 'InferenceEnableNetworkIsolation': True,\n", + " 'InferenceEnvironmentVariables': [{'Default': 3600,\n", + " 'Name': 'ENDPOINT_SERVER_TIMEOUT',\n", + " 'RequiredForModelClass': True,\n", + " 'Scope': 'container',\n", + " 'Type': 'int'},\n", + " {'Default': '/opt/ml/model',\n", + " 'Name': 'HF_MODEL_ID',\n", + " 'RequiredForModelClass': True,\n", + " 'Scope': 'container',\n", + " 'Type': 'text'},\n", + " {'Default': '/opt/ml/model',\n", + " 'Name': 'MODEL_CACHE_ROOT',\n", + " 'RequiredForModelClass': True,\n", + " 'Scope': 'container',\n", + " 'Type': 'text'},\n", + " {'Default': 'true',\n", + " 'Name': 'OPTION_ENABLE_CHUNKED_PREFILL',\n", + " 'RequiredForModelClass': True,\n", + " 'Scope': 'container',\n", + " 'Type': 'text'},\n", + " {'Default': '20',\n", + " 'Name': 'SAGEMAKER_CONTAINER_LOG_LEVEL',\n", + " 'RequiredForModelClass': False,\n", + " 'Scope': 'container',\n", + " 'Type': 'text'},\n", + " {'Default': '1',\n", + " 'Name': 'SAGEMAKER_ENV',\n", 
+ " 'RequiredForModelClass': True,\n", + " 'Scope': 'container',\n", + " 'Type': 'text'},\n", + " {'Default': '3600',\n", + " 'Name': 'SAGEMAKER_MODEL_SERVER_TIMEOUT',\n", + " 'RequiredForModelClass': True,\n", + " 'Scope': 'container',\n", + " 'Type': 'text'},\n", + " {'Default': 1,\n", + " 'Name': 'SAGEMAKER_MODEL_SERVER_WORKERS',\n", + " 'RequiredForModelClass': True,\n", + " 'Scope': 'container',\n", + " 'Type': 'int'},\n", + " {'Default': 'inference.py',\n", + " 'Name': 'SAGEMAKER_PROGRAM',\n", + " 'RequiredForModelClass': True,\n", + " 'Scope': 'container',\n", + " 'Type': 'text'},\n", + " {'Default': '/opt/ml/model/code',\n", + " 'Name': 'SAGEMAKER_SUBMIT_DIRECTORY',\n", + " 'RequiredForModelClass': False,\n", + " 'Scope': 'container',\n", + " 'Type': 'text'}],\n", + " 'InferenceVolumeSize': 256,\n", + " 'MaxRuntimeInSeconds': 360000,\n", + " 'MinSdkVersion': '2.225.0',\n", + " 'ModelDataDownloadTimeout': 1200,\n", + " 'ModelTypes': ['OPEN_WEIGHTS'],\n", + " 'NotebookLocations': {'DemoNotebook': 's3://jumpstart-cache-prod-us-west-2/pmm-notebooks/pmm-notebook-model-hub-text-generation-deploy.ipynb',\n", + " 'DemoNotebooks': [{'IsDefault': True,\n", + " 'S3Uri': 's3://jumpstart-cache-prod-us-west-2/pmm-notebooks/pmm-notebook-model-hub-text-generation-deploy.ipynb',\n", + " 'Title': 'Deploy'},\n", + " {'IsDefault': False,\n", + " 'S3Uri': 's3://jumpstart-cache-prod-us-west-2/pmm-notebooks/pmm-notebook-model-hub-text-generation-instruction-tuning-llama.ipynb',\n", + " 'Title': 'Fine-Tune: Instruction '\n", + " 'Tuning'},\n", + " {'IsDefault': False,\n", + " 'S3Uri': 's3://jumpstart-cache-prod-us-west-2/open-source-notebooks/sm-studio-oss-training-job-sample-notebook.ipynb',\n", + " 'Title': 'Deploy End-to-End Model '\n", + " 'Customization'}]},\n", + " 'Provider': 'meta',\n", + " 'RecipeCollection': [{'CustomizationTechnique': 'RLAIF',\n", + " 'DisplayName': 'Llama 3.2 1B GRPO RLAIF Fine-Tuning '\n", + " 'with LoRA',\n", + " 'Hardware': 'GPU',\n", + " 'HostingConfigs': [{'ComputeResourceRequirements': {'MinMemoryRequiredInMb': 32768,\n", + " 'NumberOfAcceleratorDevicesRequired': 1,\n", + " 'NumberOfCpuCoresRequired': 12},\n", + " 'EcrAddress': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.34.0-lmi16.0.0-cu128',\n", + " 'Environment': {'OPTION_ASYNC_MODE': 'true',\n", + " 'OPTION_ENABLE_LORA': 'true',\n", + " 'OPTION_ENTRYPOINT': 'djl_python.lmi_vllm.vllm_async_service',\n", + " 'OPTION_MAX_CPU_LORAS': '16',\n", + " 'OPTION_MAX_LORAS': '8',\n", + " 'OPTION_MAX_LORA_RANK': '128',\n", + " 'OPTION_MAX_ROLLING_BATCH_SIZE': '8',\n", + " 'OPTION_ROLLING_BATCH': 'disable',\n", + " 'OPTION_TENSOR_PARALLEL_DEGREE': '1',\n", + " 'SAGEMAKER_ENABLE_LOAD_AWARE': '1',\n", + " 'SAGEMAKER_MAX_NUMBER_OF_ADAPTERS_IN_MEMORY': '32'},\n", + " 'InstanceType': 'ml.g6.4xlarge',\n", + " 'Profile': 'Default'},\n", + " {'ComputeResourceRequirements': {'MinMemoryRequiredInMb': 32768,\n", + " 'NumberOfAcceleratorDevicesRequired': 1,\n", + " 'NumberOfCpuCoresRequired': 12},\n", + " 'EcrAddress': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.34.0-lmi16.0.0-cu128',\n", + " 'Environment': {'OPTION_ASYNC_MODE': 'true',\n", + " 'OPTION_ENABLE_LORA': 'true',\n", + " 'OPTION_ENTRYPOINT': 'djl_python.lmi_vllm.vllm_async_service',\n", + " 'OPTION_MAX_CPU_LORAS': '16',\n", + " 'OPTION_MAX_LORAS': '8',\n", + " 'OPTION_MAX_LORA_RANK': '128',\n", + " 'OPTION_MAX_ROLLING_BATCH_SIZE': '8',\n", + " 'OPTION_ROLLING_BATCH': 'disable',\n", + " 'OPTION_TENSOR_PARALLEL_DEGREE': '1',\n", + " 
'SAGEMAKER_ENABLE_LOAD_AWARE': '1',\n", + " 'SAGEMAKER_MAX_NUMBER_OF_ADAPTERS_IN_MEMORY': '32'},\n", + " 'InstanceType': 'ml.g5.4xlarge'}],\n", + " 'HpEksOverrideParamsS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/verl-grpo-rlaif-llama-3-dot-2-1b-instruct-lora_override_params_k8s_v2.0.0.json',\n", + " 'HpEksPayloadTemplateS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/verl-grpo-rlaif-llama-3-dot-2-1b-instruct-lora_payload_template_k8s_v2.0.0.yaml',\n", + " 'InstanceCount': 1,\n", + " 'Name': 'verl-grpo-rlaif-llama-3-dot-2-1b-instruct-lora',\n", + " 'Peft': 'LORA',\n", + " 'RecipeFilePath': 'recipes/fine-tuning/llama/verl-grpo-rlaif-llama-3-dot-2-1b-instruct-lora.yaml',\n", + " 'SequenceLength': '2K',\n", + " 'ServerlessMeteringType': 'Hourly',\n", + " 'SmtjImageUri': '920498770698.dkr.ecr.us-west-2.amazonaws.com/hyperpod-recipes:verl-v1.0.0-smtj',\n", + " 'SmtjOverrideParamsS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/verl-grpo-rlaif-llama-3-dot-2-1b-instruct-lora_override_params_sm_jobs_v2.0.0.json',\n", + " 'SmtjRecipeTemplateS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/verl-grpo-rlaif-llama-3-dot-2-1b-instruct-lora_payload_template_sm_jobs_v2.0.0.yaml',\n", + " 'SupportedInstanceTypes': ['ml.p5.48xlarge',\n", + " 'ml.p4de.24xlarge',\n", + " 'ml.p4d.24xlarge'],\n", + " 'Type': 'FineTuning',\n", + " 'Versions': ['1.0.0']},\n", + " {'CustomizationTechnique': 'RLVR',\n", + " 'DisplayName': 'Llama 3.2 1B GRPO RLVR Fine-Tuning',\n", + " 'Hardware': 'GPU',\n", + " 'HostingConfigs': [{'ComputeResourceRequirements': {'MinMemoryRequiredInMb': 32768,\n", + " 'NumberOfAcceleratorDevicesRequired': 1,\n", + " 'NumberOfCpuCoresRequired': 12},\n", + " 'EcrAddress': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.34.0-lmi16.0.0-cu128',\n", + " 'Environment': {'OPTION_ASYNC_MODE': 'true',\n", + " 'OPTION_ENABLE_LORA': 'true',\n", + " 'OPTION_ENTRYPOINT': 'djl_python.lmi_vllm.vllm_async_service',\n", + " 'OPTION_MAX_CPU_LORAS': '16',\n", + " 'OPTION_MAX_LORAS': '8',\n", + " 'OPTION_MAX_LORA_RANK': '128',\n", + " 'OPTION_MAX_ROLLING_BATCH_SIZE': '8',\n", + " 'OPTION_ROLLING_BATCH': 'disable',\n", + " 'OPTION_TENSOR_PARALLEL_DEGREE': '1',\n", + " 'SAGEMAKER_ENABLE_LOAD_AWARE': '1',\n", + " 'SAGEMAKER_MAX_NUMBER_OF_ADAPTERS_IN_MEMORY': '32'},\n", + " 'InstanceType': 'ml.g6.4xlarge',\n", + " 'Profile': 'Default'},\n", + " {'ComputeResourceRequirements': {'MinMemoryRequiredInMb': 32768,\n", + " 'NumberOfAcceleratorDevicesRequired': 1,\n", + " 'NumberOfCpuCoresRequired': 12},\n", + " 'EcrAddress': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.34.0-lmi16.0.0-cu128',\n", + " 'Environment': {'OPTION_ASYNC_MODE': 'true',\n", + " 'OPTION_ENABLE_LORA': 'true',\n", + " 'OPTION_ENTRYPOINT': 'djl_python.lmi_vllm.vllm_async_service',\n", + " 'OPTION_MAX_CPU_LORAS': '16',\n", + " 'OPTION_MAX_LORAS': '8',\n", + " 'OPTION_MAX_LORA_RANK': '128',\n", + " 'OPTION_MAX_ROLLING_BATCH_SIZE': '8',\n", + " 'OPTION_ROLLING_BATCH': 'disable',\n", + " 'OPTION_TENSOR_PARALLEL_DEGREE': '1',\n", + " 'SAGEMAKER_ENABLE_LOAD_AWARE': '1',\n", + " 'SAGEMAKER_MAX_NUMBER_OF_ADAPTERS_IN_MEMORY': '32'},\n", + " 'InstanceType': 'ml.g5.4xlarge'}],\n", + " 'HpEksOverrideParamsS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/verl-grpo-rlvr-llama-3-dot-2-1b-instruct-lora_override_params_k8s_v2.0.0.json',\n", + " 'HpEksPayloadTemplateS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/verl-grpo-rlvr-llama-3-dot-2-1b-instruct-lora_payload_template_k8s_v2.0.0.yaml',\n", + " 'InstanceCount': 
1,\n", + " 'Name': 'verl-grpo-rlvr-llama-3-dot-2-1b-instruct-lora',\n", + " 'Peft': 'LORA',\n", + " 'RecipeFilePath': 'recipes/fine-tuning/llama/verl-grpo-rlvr-llama-3-dot-2-1b-instruct-lora.yaml',\n", + " 'SequenceLength': '1K',\n", + " 'ServerlessMeteringType': 'Hourly',\n", + " 'SmtjImageUri': '920498770698.dkr.ecr.us-west-2.amazonaws.com/hyperpod-recipes:verl-v1.0.0-smtj',\n", + " 'SmtjOverrideParamsS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/verl-grpo-rlvr-llama-3-dot-2-1b-instruct-lora_override_params_sm_jobs_v2.0.0.json',\n", + " 'SmtjRecipeTemplateS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/verl-grpo-rlvr-llama-3-dot-2-1b-instruct-lora_payload_template_sm_jobs_v2.0.0.yaml',\n", + " 'SupportedInstanceTypes': ['ml.p5.48xlarge',\n", + " 'ml.p4de.24xlarge',\n", + " 'ml.p4d.24xlarge'],\n", + " 'Type': 'FineTuning',\n", + " 'Versions': ['1.1.0']},\n", + " {'CustomizationTechnique': 'SFT',\n", + " 'DisplayName': 'Llama 3.2 1B Simple Fine Tuning with '\n", + " 'Lora on GPU, 4K sequence length',\n", + " 'Hardware': 'GPU',\n", + " 'HostingConfigs': [{'ComputeResourceRequirements': {'MinMemoryRequiredInMb': 32768,\n", + " 'NumberOfAcceleratorDevicesRequired': 1,\n", + " 'NumberOfCpuCoresRequired': 12},\n", + " 'EcrAddress': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.34.0-lmi16.0.0-cu128',\n", + " 'Environment': {'OPTION_ASYNC_MODE': 'true',\n", + " 'OPTION_ENABLE_LORA': 'true',\n", + " 'OPTION_ENTRYPOINT': 'djl_python.lmi_vllm.vllm_async_service',\n", + " 'OPTION_MAX_CPU_LORAS': '16',\n", + " 'OPTION_MAX_LORAS': '8',\n", + " 'OPTION_MAX_LORA_RANK': '128',\n", + " 'OPTION_MAX_ROLLING_BATCH_SIZE': '8',\n", + " 'OPTION_ROLLING_BATCH': 'disable',\n", + " 'OPTION_TENSOR_PARALLEL_DEGREE': '1',\n", + " 'SAGEMAKER_ENABLE_LOAD_AWARE': '1',\n", + " 'SAGEMAKER_MAX_NUMBER_OF_ADAPTERS_IN_MEMORY': '32'},\n", + " 'InstanceType': 'ml.g6.4xlarge',\n", + " 'Profile': 'Default'},\n", + " {'ComputeResourceRequirements': {'MinMemoryRequiredInMb': 32768,\n", + " 'NumberOfAcceleratorDevicesRequired': 1,\n", + " 'NumberOfCpuCoresRequired': 12},\n", + " 'EcrAddress': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.34.0-lmi16.0.0-cu128',\n", + " 'Environment': {'OPTION_ASYNC_MODE': 'true',\n", + " 'OPTION_ENABLE_LORA': 'true',\n", + " 'OPTION_ENTRYPOINT': 'djl_python.lmi_vllm.vllm_async_service',\n", + " 'OPTION_MAX_CPU_LORAS': '16',\n", + " 'OPTION_MAX_LORAS': '8',\n", + " 'OPTION_MAX_LORA_RANK': '128',\n", + " 'OPTION_MAX_ROLLING_BATCH_SIZE': '8',\n", + " 'OPTION_ROLLING_BATCH': 'disable',\n", + " 'OPTION_TENSOR_PARALLEL_DEGREE': '1',\n", + " 'SAGEMAKER_ENABLE_LOAD_AWARE': '1',\n", + " 'SAGEMAKER_MAX_NUMBER_OF_ADAPTERS_IN_MEMORY': '32'},\n", + " 'InstanceType': 'ml.g5.4xlarge'}],\n", + " 'HpEksOverrideParamsS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/llmft_llama3_2_1b_instruct_seq4k_gpu_sft_lora_override_params_k8s_v2.0.0.json',\n", + " 'HpEksPayloadTemplateS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/llmft_llama3_2_1b_instruct_seq4k_gpu_sft_lora_payload_template_k8s_v2.0.0.yaml',\n", + " 'InstanceCount': 1,\n", + " 'Name': 'llmft_llama3_2_1b_instruct_seq4k_gpu_sft_lora',\n", + " 'Peft': 'LORA',\n", + " 'RecipeFilePath': 'recipes/fine-tuning/llama/llmft_llama3_2_1b_instruct_seq4k_gpu_sft_lora.yaml',\n", + " 'SequenceLength': '4K',\n", + " 'ServerlessMeteringType': 'Token-based',\n", + " 'SmtjImageUri': '920498770698.dkr.ecr.us-west-2.amazonaws.com/hyperpod-recipes:llmft-v1.0.0',\n", + " 'SmtjOverrideParamsS3Uri': 
's3://jumpstart-cache-prod-us-west-2/recipes/llmft_llama3_2_1b_instruct_seq4k_gpu_sft_lora_override_params_sm_jobs_v2.0.0.json',\n", + " 'SmtjRecipeTemplateS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/llmft_llama3_2_1b_instruct_seq4k_gpu_sft_lora_payload_template_sm_jobs_v2.0.0.yaml',\n", + " 'SupportedInstanceTypes': ['ml.p4de.24xlarge',\n", + " 'ml.p4d.24xlarge',\n", + " 'ml.p5.48xlarge',\n", + " 'ml.g5.48xlarge',\n", + " 'ml.g5.12xlarge'],\n", + " 'Type': 'FineTuning',\n", + " 'Versions': ['1.1.0']},\n", + " {'CustomizationTechnique': 'DPO',\n", + " 'DisplayName': 'Llama 3.2 1B Direct Preference '\n", + " 'Optimization on GPU, 4K sequence length',\n", + " 'Hardware': 'GPU',\n", + " 'HostingConfigs': [{'ComputeResourceRequirements': {'MinMemoryRequiredInMb': 32768,\n", + " 'NumberOfAcceleratorDevicesRequired': 1,\n", + " 'NumberOfCpuCoresRequired': 12},\n", + " 'EcrAddress': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.34.0-lmi16.0.0-cu128',\n", + " 'Environment': {'OPTION_ASYNC_MODE': 'true',\n", + " 'OPTION_ENABLE_LORA': 'true',\n", + " 'OPTION_ENTRYPOINT': 'djl_python.lmi_vllm.vllm_async_service',\n", + " 'OPTION_MAX_CPU_LORAS': '16',\n", + " 'OPTION_MAX_LORAS': '8',\n", + " 'OPTION_MAX_LORA_RANK': '128',\n", + " 'OPTION_MAX_ROLLING_BATCH_SIZE': '8',\n", + " 'OPTION_ROLLING_BATCH': 'disable',\n", + " 'OPTION_TENSOR_PARALLEL_DEGREE': '1',\n", + " 'SAGEMAKER_ENABLE_LOAD_AWARE': '1',\n", + " 'SAGEMAKER_MAX_NUMBER_OF_ADAPTERS_IN_MEMORY': '32'},\n", + " 'InstanceType': 'ml.g6.4xlarge',\n", + " 'Profile': 'Default'},\n", + " {'ComputeResourceRequirements': {'MinMemoryRequiredInMb': 32768,\n", + " 'NumberOfAcceleratorDevicesRequired': 1,\n", + " 'NumberOfCpuCoresRequired': 12},\n", + " 'EcrAddress': '763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.34.0-lmi16.0.0-cu128',\n", + " 'Environment': {'OPTION_ASYNC_MODE': 'true',\n", + " 'OPTION_ENABLE_LORA': 'true',\n", + " 'OPTION_ENTRYPOINT': 'djl_python.lmi_vllm.vllm_async_service',\n", + " 'OPTION_MAX_CPU_LORAS': '16',\n", + " 'OPTION_MAX_LORAS': '8',\n", + " 'OPTION_MAX_LORA_RANK': '128',\n", + " 'OPTION_MAX_ROLLING_BATCH_SIZE': '8',\n", + " 'OPTION_ROLLING_BATCH': 'disable',\n", + " 'OPTION_TENSOR_PARALLEL_DEGREE': '1',\n", + " 'SAGEMAKER_ENABLE_LOAD_AWARE': '1',\n", + " 'SAGEMAKER_MAX_NUMBER_OF_ADAPTERS_IN_MEMORY': '32'},\n", + " 'InstanceType': 'ml.g5.4xlarge'}],\n", + " 'HpEksOverrideParamsS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/llmft_llama3_2_1b_instruct_seq4k_gpu_dpo_override_params_k8s_v2.0.0.json',\n", + " 'HpEksPayloadTemplateS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/llmft_llama3_2_1b_instruct_seq4k_gpu_dpo_payload_template_k8s_v2.0.0.yaml',\n", + " 'InstanceCount': 1,\n", + " 'Name': 'llmft_llama3_2_1b_instruct_seq4k_gpu_dpo',\n", + " 'Peft': 'LORA',\n", + " 'RecipeFilePath': 'recipes/fine-tuning/llama/llmft_llama3_2_1b_instruct_seq4k_gpu_dpo.yaml',\n", + " 'SequenceLength': '4K',\n", + " 'ServerlessMeteringType': 'Token-based',\n", + " 'SmtjImageUri': '920498770698.dkr.ecr.us-west-2.amazonaws.com/hyperpod-recipes:llmft-v1.0.0',\n", + " 'SmtjOverrideParamsS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/llmft_llama3_2_1b_instruct_seq4k_gpu_dpo_override_params_sm_jobs_v2.0.0.json',\n", + " 'SmtjRecipeTemplateS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/llmft_llama3_2_1b_instruct_seq4k_gpu_dpo_payload_template_sm_jobs_v2.0.0.yaml',\n", + " 'SupportedInstanceTypes': ['ml.p4de.24xlarge',\n", + " 'ml.p4d.24xlarge',\n", + " 'ml.p5.48xlarge',\n", + " 
'ml.g5.48xlarge'],\n", + " 'Type': 'FineTuning',\n", + " 'Versions': ['1.1.0']},\n", + " {'DisplayName': 'Open Source Evaluation Evaluation on '\n", + " 'GPU - Meta Textgeneration Llama 3 2 1B '\n", + " 'Instruct',\n", + " 'EvaluationType': 'DeterministicEvaluation',\n", + " 'Hardware': 'GPU',\n", + " 'HpEksOverrideParamsS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/meta_textgeneration_llama_3_2_1b_instruct_override_params_k8s_v2.0.0.json',\n", + " 'HpEksPayloadTemplateS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/meta_textgeneration_llama_3_2_1b_instruct_payload_template_k8s_v2.0.0.yaml',\n", + " 'Name': 'open-source-eval-meta-textgeneration-llama-3-2-1b-instruct-deterministic',\n", + " 'SmtjImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/sagemaker-evaluation:latest',\n", + " 'SmtjOverrideParamsS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/open-source-eval-meta-textgeneration-llama-3-2-1b-instruct-deterministic_override_params_sm_jobs_v2.0.0.json',\n", + " 'SmtjRecipeTemplateS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/open-source-eval-meta-textgeneration-llama-3-2-1b-instruct-deterministic_payload_template_sm_jobs_v2.0.0.yaml',\n", + " 'SupportedInstanceTypes': ['ml.g5.12xlarge',\n", + " 'ml.g5.16xlarge',\n", + " 'ml.p4d.24xlarge',\n", + " 'ml.p5.48xlarge'],\n", + " 'Type': 'Evaluation',\n", + " 'Versions': ['1.0']},\n", + " {'DisplayName': 'Open Source Evaluation Evaluation on '\n", + " 'GPU - Meta Textgeneration Llama 3 2 1B '\n", + " 'Instruct',\n", + " 'EvaluationType': 'LLMAJEvaluation',\n", + " 'Hardware': 'CPU',\n", + " 'HpEksOverrideParamsS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/meta_textgeneration_llama_3_2_1b_instruct_override_params_k8s_v2.0.0.json',\n", + " 'HpEksPayloadTemplateS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/meta_textgeneration_llama_3_2_1b_instruct_payload_template_k8s_v2.0.0.yaml',\n", + " 'Name': 'open-source-eval-meta-textgeneration-llama-3-2-1b-instruct-llmaj',\n", + " 'SmtjImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/sagemaker-evaluation:latest',\n", + " 'SmtjOverrideParamsS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/open-source-eval-meta-textgeneration-llama-3-2-1b-instruct-llmaj_override_params_sm_jobs_v2.0.0.json',\n", + " 'SmtjRecipeTemplateS3Uri': 's3://jumpstart-cache-prod-us-west-2/recipes/open-source-eval-meta-textgeneration-llama-3-2-1b-instruct-llmaj_payload_template_sm_jobs_v2.0.0.yaml',\n", + " 'SupportedInstanceTypes': ['ml.t3.large'],\n", + " 'Type': 'Evaluation',\n", + " 'Versions': ['1.0']}],\n", + " 'ResourceNameBase': 'llama-3-2-1b-instruct',\n", + " 'SageMakerSdkPredictorSpecifications': {'DefaultAcceptType': 'application/json',\n", + " 'DefaultContentType': 'application/json',\n", + " 'SupportedAcceptTypes': ['application/json'],\n", + " 'SupportedContentTypes': ['application/json']},\n", + " 'SupportedInferenceInstanceTypes': ['ml.g5.12xlarge',\n", + " 'ml.g5.16xlarge',\n", + " 'ml.g5.24xlarge',\n", + " 'ml.g5.2xlarge',\n", + " 'ml.g5.48xlarge',\n", + " 'ml.g5.4xlarge',\n", + " 'ml.g5.8xlarge',\n", + " 'ml.g5.xlarge',\n", + " 'ml.g6.12xlarge',\n", + " 'ml.g6.16xlarge',\n", + " 'ml.g6.24xlarge',\n", + " 'ml.g6.2xlarge',\n", + " 'ml.g6.48xlarge',\n", + " 'ml.g6.4xlarge',\n", + " 'ml.g6.8xlarge',\n", + " 'ml.g6.xlarge',\n", + " 'ml.p4d.24xlarge',\n", + " 'ml.p5.48xlarge'],\n", + " 'SupportedTrainingInstanceTypes': ['ml.g4dn.12xlarge',\n", + " 'ml.g5.12xlarge',\n", + " 'ml.g5.2xlarge',\n", + " 'ml.g5.4xlarge',\n", + " 'ml.g5.8xlarge',\n", + " 
'ml.p3dn.24xlarge',\n", + " 'ml.p5.48xlarge'],\n", + " 'Task': 'Text Generation',\n", + " 'TrainingArtifactUri': 's3://jumpstart-private-cache-prod-us-west-2/meta-training/train-meta-textgeneration-llama-3-2-1b-instruct.tar.gz',\n", + " 'TrainingDependencies': ['accelerate==0.33.0',\n", + " 'bitsandbytes==0.39.1',\n", + " 'black==23.7.0',\n", + " 'brotli==1.0.9',\n", + " 'datasets==2.14.1',\n", + " 'docstring-parser==0.16',\n", + " 'fire==0.5.0',\n", + " 'huggingface-hub==0.24.2',\n", + " 'inflate64==0.3.1',\n", + " 'loralib==0.1.1',\n", + " 'multivolumefile==0.2.3',\n", + " 'mypy-extensions==1.0.0',\n", + " 'nvidia-cublas-cu12==12.1.3.1',\n", + " 'nvidia-cuda-cupti-cu12==12.1.105',\n", + " 'nvidia-cuda-nvrtc-cu12==12.1.105',\n", + " 'nvidia-cuda-runtime-cu12==12.1.105',\n", + " 'nvidia-cudnn-cu12==8.9.2.26',\n", + " 'nvidia-cufft-cu12==11.0.2.54',\n", + " 'nvidia-curand-cu12==10.3.2.106',\n", + " 'nvidia-cusolver-cu12==11.4.5.107',\n", + " 'nvidia-cusparse-cu12==12.1.0.106',\n", + " 'nvidia-nccl-cu12==2.19.3',\n", + " 'nvidia-nvjitlink-cu12==12.3.101',\n", + " 'nvidia-nvtx-cu12==12.1.105',\n", + " 'pathspec==0.11.1',\n", + " 'peft==0.4.0',\n", + " 'py7zr==0.20.5',\n", + " 'pybcj==1.0.1',\n", + " 'pycryptodomex==3.18.0',\n", + " 'pyppmd==1.0.0',\n", + " 'pyzstd==0.15.9',\n", + " 'safetensors==0.4.2',\n", + " 'sagemaker_jumpstart_huggingface_script_utilities==1.2.7',\n", + " 'sagemaker_jumpstart_script_utilities==1.1.9',\n", + " 'scipy==1.11.1',\n", + " 'shtab==1.7.1',\n", + " 'termcolor==2.3.0',\n", + " 'texttable==1.6.7',\n", + " 'tokenize-rt==5.1.0',\n", + " 'tokenizers==0.19.1',\n", + " 'torch==2.2.0',\n", + " 'transformers==4.43.1',\n", + " 'triton==2.2.0',\n", + " 'trl==0.8.1',\n", + " 'typing-extensions==4.8.0',\n", + " 'tyro==0.7.3'],\n", + " 'TrainingEcrUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04',\n", + " 'TrainingEnableNetworkIsolation': True,\n", + " 'TrainingInstanceTypeVariants': {'Variants': {'g4dn': {'Properties': {'GatedModelEnvVarUri': 's3://jumpstart-private-cache-prod-us-west-2/meta-training/g4dn/v1.0.0/train-meta-textgeneration-llama-3-2-1b-instruct.tar.gz',\n", + " 'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04'}},\n", + " 'g5': {'Properties': {'GatedModelEnvVarUri': 's3://jumpstart-private-cache-prod-us-west-2/meta-training/g5/v1.0.0/train-meta-textgeneration-llama-3-2-1b-instruct.tar.gz',\n", + " 'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04'}},\n", + " 'g6': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04'}},\n", + " 'g6e': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04'}},\n", + " 'g7e': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04'}},\n", + " 'local_gpu': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04'}},\n", + " 'p2': {'Properties': {'ImageUri': 
'763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04'}},\n", + " 'p3': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04'}},\n", + " 'p3dn': {'Properties': {'GatedModelEnvVarUri': 's3://jumpstart-private-cache-prod-us-west-2/meta-training/p3dn/v1.0.0/train-meta-textgeneration-llama-3-2-1b-instruct.tar.gz',\n", + " 'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04'}},\n", + " 'p4d': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04'}},\n", + " 'p4de': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04'}},\n", + " 'p5': {'Properties': {'GatedModelEnvVarUri': 's3://jumpstart-private-cache-prod-us-west-2/meta-training/p5/v1.0.0/train-meta-textgeneration-llama-3-2-1b-instruct.tar.gz',\n", + " 'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04'}},\n", + " 'p5e': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04'}},\n", + " 'p5en': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04'}},\n", + " 'p6': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04'}},\n", + " 'p6e': {'Properties': {'ImageUri': '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04'}}}},\n", + " 'TrainingMetrics': [{'Name': 'huggingface-textgeneration:eval-loss',\n", + " 'Regex': 'eval_epoch_loss=tensor\\\\(([0-9\\\\.]+)'},\n", + " {'Name': 'huggingface-textgeneration:eval-ppl',\n", + " 'Regex': 'eval_ppl=tensor\\\\(([0-9\\\\.]+)'},\n", + " {'Name': 'huggingface-textgeneration:train-loss',\n", + " 'Regex': 'train_epoch_loss=([0-9\\\\.]+)'}],\n", + " 'TrainingScriptUri': 's3://jumpstart-cache-prod-us-west-2/source-directory-tarballs/training/meta-textgeneration/prepack/inference-meta-textgeneration/v1.2.0/sourcedir.tar.gz',\n", + " 'TrainingSupported': True,\n", + " 'TrainingVolumeSize': 256,\n", + " 'Url': 'https://ai.meta.com/resources/models-and-libraries/llama-downloads/',\n", + " 'ValidationSupported': True}\n" + ] + }, + { + "data": { + "text/html": [ + "
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+       " in <module>:12                                                                                   \n",
+       "                                                                                                  \n",
+       "    9 )                                                                                           \n",
+       "   10 hub_doc = json.loads(hub_resp[\"HubContentDocument\"])                                        \n",
+       "   11 pprint(hub_doc)                                                                             \n",
+       " 12 config = hub_doc[\"InferenceConfigComponents\"][\"lmi\"]                                        \n",
+       "   13 base_model_s3_uri = config[\"HostingArtifactUri\"]                                            \n",
+       "   14 instance_family = INSTANCE_TYPE.split(\".\")[1]                                               \n",
+       "   15 lmi_image_uri = config[\"HostingInstanceTypeVariants\"][\"Variants\"][instance_family][\"Prop    \n",
+       "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "KeyError: 'InferenceConfigComponents'\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;255;0;0m╭─\u001b[0m\u001b[38;2;255;0;0m──────────────────────────────\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[1;38;2;255;0;0mTraceback \u001b[0m\u001b[1;2;38;2;255;0;0m(most recent call last)\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[38;2;255;0;0m───────────────────────────────\u001b[0m\u001b[38;2;255;0;0m─╮\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m in \u001b[92m\u001b[0m:\u001b[94m12\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 9 \u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m10 \u001b[0mhub_doc = json.loads(hub_resp[\u001b[33m\"\u001b[0m\u001b[33mHubContentDocument\u001b[0m\u001b[33m\"\u001b[0m]) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m11 \u001b[0mpprint(hub_doc) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m12 config = \u001b[1;4mhub_doc[\u001b[0m\u001b[1;4;33m\"\u001b[0m\u001b[1;4;33mInferenceConfigComponents\u001b[0m\u001b[1;4;33m\"\u001b[0m\u001b[1;4m]\u001b[0m[\u001b[33m\"\u001b[0m\u001b[33mlmi\u001b[0m\u001b[33m\"\u001b[0m] \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m13 \u001b[0mbase_model_s3_uri = config[\u001b[33m\"\u001b[0m\u001b[33mHostingArtifactUri\u001b[0m\u001b[33m\"\u001b[0m] \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m14 \u001b[0minstance_family = INSTANCE_TYPE.split(\u001b[33m\"\u001b[0m\u001b[33m.\u001b[0m\u001b[33m\"\u001b[0m)[\u001b[94m1\u001b[0m] \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m15 \u001b[0mlmi_image_uri = config[\u001b[33m\"\u001b[0m\u001b[33mHostingInstanceTypeVariants\u001b[0m\u001b[33m\"\u001b[0m][\u001b[33m\"\u001b[0m\u001b[33mVariants\u001b[0m\u001b[33m\"\u001b[0m][instance_family][\u001b[33m\"\u001b[0m\u001b[33mProp\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", + "\u001b[1;91mKeyError: \u001b[0m\u001b[38;2;0;135;0m'InferenceConfigComponents'\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from pprint import pprint\n", + "\n", + "tags = sm.list_tags(ResourceArn=f\"arn:aws:sagemaker:{REGION}:099324990371:training-job/{TRAINING_JOB_NAME}\")\n", + "jumpstart_model_id = next(t[\"Value\"] for t in tags[\"Tags\"] if t[\"Key\"] == \"sagemaker-studio:jumpstart-model-id\")\n", + "print(jumpstart_model_id)\n", + "\n", + "hub_resp = sm.describe_hub_content(\n", + " HubName=\"SageMakerPublicHub\", HubContentType=\"Model\", HubContentName=jumpstart_model_id\n", + ")\n", + "hub_doc = json.loads(hub_resp[\"HubContentDocument\"])\n", + "pprint(hub_doc)\n", + "config = hub_doc[\"InferenceConfigComponents\"][\"lmi\"]\n", + "base_model_s3_uri = config[\"HostingArtifactUri\"]\n", + "instance_family = INSTANCE_TYPE.split(\".\")[1]\n", + "lmi_image_uri = config[\"HostingInstanceTypeVariants\"][\"Variants\"][instance_family][\"Properties\"][\"ImageUri\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da843726", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ci-scripts/displaytime.sh b/ci-scripts/displaytime.sh deleted file mode 100755 index 6e1e474b4a..0000000000 --- a/ci-scripts/displaytime.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. - -set -euo pipefail - -echo =================== $1 execution time =================== - -start_time=$2 -end_time=`date +%s` -total_time=$(expr $end_time - $start_time + 1) -hours=$((total_time/60/60%24)) -minutes=$((total_time/60%60)) -secs=$((total_time%60)) - -(( $hours > 0 )) && printf '%d hours ' $hours -(( $minutes > 0 )) && printf '%d minutes ' $minutes -(( $hours > 0 || $minutes > 0 )) && printf 'and ' -printf '%d seconds\n\n' $secs diff --git a/ci-scripts/queue_build.py b/ci-scripts/queue_build.py deleted file mode 100644 index fcff0b9a9b..0000000000 --- a/ci-scripts/queue_build.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. 
-from __future__ import absolute_import - -import os -import re -import time - -import boto3 - -account = boto3.client( - "sts", region_name="us-west-2", endpoint_url="https://sts.us-west-2.amazonaws.com" -).get_caller_identity()["Account"] -bucket_name = "sagemaker-us-west-2-%s" % account - -MAX_IN_PROGRESS_BUILDS = 3 -INTERVAL_BETWEEN_CONCURRENT_RUNS = 15 # minutes -CLEAN_UP_TICKETS_OLDER_THAN = 8 # hours - - -def queue_build(): - ticket_number = int(1000 * time.time()) - files = _list_tickets() - _cleanup_tickets_older_than(files) - _wait_for_other_builds(ticket_number) - - -def _build_info_from_file(file): - filename = file.key.split("/")[2] - ticket_number, build_id, source_version = filename.split("_") - return int(ticket_number), build_id, source_version - - -def _wait_for_other_builds(ticket_number): - sorted_files = _list_tickets() - - print("build queue status:") - print() - - for order, file in enumerate(sorted_files): - file_ticket_number, build_id, source_version = _build_info_from_file(file) - print( - "%s -> %s %s, ticket number: %s status: %s" - % (order, build_id, source_version, file_ticket_number, file.key.split("/")[1]) - ) - print() - build_id = re.sub("[_/]", "-", os.environ.get("CODEBUILD_BUILD_ID", "CODEBUILD-BUILD-ID")) - source_version = re.sub( - "[_/]", - "-", - os.environ.get("CODEBUILD_SOURCE_VERSION", "CODEBUILD-SOURCE-VERSION"), - ) - filename = "%s_%s_%s" % (ticket_number, build_id, source_version) - s3_file_obj = _write_ticket(filename, status="waiting") - print("Build %s waiting to be scheduled" % filename) - - while True: - _cleanup_tickets_with_terminal_states() - waiting_tickets = _list_tickets("waiting") - if waiting_tickets: - first_waiting_ticket_number, _, _ = _build_info_from_file(_list_tickets("waiting")[0]) - else: - first_waiting_ticket_number = ticket_number - - if ( - len(_list_tickets(status="in-progress")) < 3 - and last_in_progress_elapsed_time_check() - and first_waiting_ticket_number == ticket_number - ): - # put the build in progress - print("Scheduling build %s for running.." % filename) - s3_file_obj.delete() - _write_ticket(filename, status="in-progress") - break - else: - # wait - time.sleep(30) - - -def last_in_progress_elapsed_time_check(): - in_progress_tickets = _list_tickets("in-progress") - if not in_progress_tickets: - return True - last_in_progress_ticket, _, _ = _build_info_from_file(_list_tickets("in-progress")[-1]) - _elapsed_time = int(1000 * time.time()) - last_in_progress_ticket - last_in_progress_elapsed_time = int(_elapsed_time / (1000 * 60)) # in minutes - return last_in_progress_elapsed_time > INTERVAL_BETWEEN_CONCURRENT_RUNS - - -def _cleanup_tickets_with_terminal_states(): - files = _list_tickets() - build_ids = [] - for file in files: - _, build_id, _ = _build_info_from_file(file) - build_ids.append(build_id) - - client = boto3.client("codebuild") - response = client.batch_get_builds(ids=build_ids) - - for file, build_details in zip(files, response["builds"]): - _, _build_id_from_file, _ = _build_info_from_file(file) - build_status = build_details["buildStatus"] - - if build_status != "IN_PROGRESS" and _build_id_from_file == build_details["id"]: - print( - "Build %s in terminal state: %s, deleting lock" - % (_build_id_from_file, build_status) - ) - file.delete() - - -def _cleanup_tickets_older_than(files): - oldfiles = list(filter(_file_older_than, files)) - for file in oldfiles: - print("object %s older than 8 hours. 
Deleting" % file.key) - file.delete() - return files - - -def _list_tickets(status=None): - s3 = boto3.resource("s3") - bucket = s3.Bucket(bucket_name) - prefix = "ci-integ-queue/{}/".format(status) if status else "ci-integ-queue/" - objects = [file for file in bucket.objects.filter(Prefix=prefix)] - files = list(filter(lambda x: x != prefix, objects)) - sorted_files = list(sorted(files, key=lambda y: y.key)) - return sorted_files - - -def _file_older_than(file): - timelimit = 1000 * 60 * 60 * CLEAN_UP_TICKETS_OLDER_THAN - file_ticket_number, build_id, source_version = _build_info_from_file(file) - return int(1000 * time.time()) - file_ticket_number > timelimit - - -def _write_ticket(filename, status="waiting"): - file_path = "ci-integ-queue/{}".format(status) - if not os.path.exists(file_path): - os.makedirs(file_path) - - file_full_path = file_path + "/" + filename - with open(file_full_path, "w") as file: - file.write(filename) - s3_file_obj = boto3.Session().resource("s3").Object(bucket_name, file_full_path) - s3_file_obj.upload_file(file_full_path) - print("Build %s is now in state %s" % (filename, status)) - return s3_file_obj - - -if __name__ == "__main__": - queue_build() diff --git a/doc/Makefile b/doc/Makefile deleted file mode 100644 index af378c2e0f..0000000000 --- a/doc/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -W -SPHINXBUILD = python -msphinx -SPHINXPROJ = sagemaker -SOURCEDIR = . -BUILDDIR = _build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/doc/_static/js/analytics.js b/doc/_static/js/analytics.js deleted file mode 100644 index 2160956952..0000000000 --- a/doc/_static/js/analytics.js +++ /dev/null @@ -1,2 +0,0 @@ -console.log("Starting analytics..."); -var s_code=s.t();if(s_code)document.write(s_code) diff --git a/doc/_static/theme_overrides.css b/doc/_static/theme_overrides.css deleted file mode 100644 index 7c1a520223..0000000000 --- a/doc/_static/theme_overrides.css +++ /dev/null @@ -1,10 +0,0 @@ -/* override table width restrictions */ -.wy-table-responsive table td, .wy-table-responsive table th { - white-space: normal; -} - -.wy-table-responsive { - margin-bottom: 24px; - max-width: 100%; - overflow: visible; -} diff --git a/doc/algorithms/factorization_machines.rst b/doc/algorithms/factorization_machines.rst deleted file mode 100644 index e6a509d167..0000000000 --- a/doc/algorithms/factorization_machines.rst +++ /dev/null @@ -1,22 +0,0 @@ -FactorizationMachines -------------------------- - -The Amazon SageMaker Factorization Machines algorithm. - -.. 
autoclass:: sagemaker.FactorizationMachines - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - :exclude-members: image_uri, num_factors, predictor_type, epochs, clip_gradient, mini_batch_size, feature_dim, eps, rescale_grad, bias_lr, linear_lr, factors_lr, bias_wd, linear_wd, factors_wd, bias_init_method, bias_init_scale, bias_init_sigma, bias_init_value, linear_init_method, linear_init_scale, linear_init_sigma, linear_init_value, factors_init_method, factors_init_scale, factors_init_sigma, factors_init_value - - -.. autoclass:: sagemaker.FactorizationMachinesModel - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: sagemaker.FactorizationMachinesPredictor - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/algorithms/index.rst b/doc/algorithms/index.rst deleted file mode 100644 index 45235a3bfe..0000000000 --- a/doc/algorithms/index.rst +++ /dev/null @@ -1,20 +0,0 @@ -###################### -First-Party Algorithms -###################### - -Amazon SageMaker provides implementations of some common machine learning algorithms optimized for GPU architecture and massive datasets. - -.. toctree:: - :maxdepth: 2 - - sagemaker.amazon.amazon_estimator - factorization_machines - ipinsights - kmeans - knn - lda - linear_learner - ntm - object2vec - pca - randomcutforest diff --git a/doc/algorithms/ipinsights.rst b/doc/algorithms/ipinsights.rst deleted file mode 100644 index f3aa8ac437..0000000000 --- a/doc/algorithms/ipinsights.rst +++ /dev/null @@ -1,22 +0,0 @@ -IP Insights ------------ - -The Amazon SageMaker IP Insights algorithm. - -.. autoclass:: sagemaker.IPInsights - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - :exclude-members: image_uri, num_entity_vectors, vector_dim, batch_metrics_publish_interval, epochs, learning_rate, - num_ip_encoder_layers, random_negative_sampling_rate, shuffled_negative_sampling_rate, weight_decay - -.. autoclass:: sagemaker.IPInsightsModel - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: sagemaker.IPInsightsPredictor - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/algorithms/kmeans.rst b/doc/algorithms/kmeans.rst deleted file mode 100644 index bcf2b221bf..0000000000 --- a/doc/algorithms/kmeans.rst +++ /dev/null @@ -1,21 +0,0 @@ -K-means --------------------- - -The Amazon SageMaker K-means algorithm. - -.. autoclass:: sagemaker.KMeans - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - :exclude-members: image_uri, k, init_method, max_iterations, tol, num_trials, local_init_method, half_life_time_size, epochs, center_factor, mini_batch_size, feature_dim, MAX_DEFAULT_BATCH_SIZE - -.. autoclass:: sagemaker.KMeansModel - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: sagemaker.KMeansPredictor - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/algorithms/knn.rst b/doc/algorithms/knn.rst deleted file mode 100644 index 89180ce3d8..0000000000 --- a/doc/algorithms/knn.rst +++ /dev/null @@ -1,22 +0,0 @@ -K-Nearest Neighbors -------------------- - -The Amazon SageMaker K-Nearest Neighbors (k-NN) algorithm. - -.. autoclass:: sagemaker.KNN - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - :exclude-members: image_uri, k, sample_size, predictor_type, dimension_reduction_target, dimension_reduction_type, - index_metric, index_type, faiss_index_ivf_nlists, faiss_index_pq_m - -.. autoclass:: sagemaker.KNNModel - :members: - :undoc-members: - :show-inheritance: - -.. 
autoclass:: sagemaker.KNNPredictor - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/algorithms/lda.rst b/doc/algorithms/lda.rst deleted file mode 100644 index 394503b90b..0000000000 --- a/doc/algorithms/lda.rst +++ /dev/null @@ -1,22 +0,0 @@ -LDA --------------------- - -The Amazon SageMaker LDA algorithm. - -.. autoclass:: sagemaker.LDA - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - :exclude-members: image_uri, num_topics, alpha0, max_restarts, max_iterations, mini_batch_size, feature_dim, tol - - -.. autoclass:: sagemaker.LDAModel - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: sagemaker.LDAPredictor - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/algorithms/linear_learner.rst b/doc/algorithms/linear_learner.rst deleted file mode 100644 index db9685d24f..0000000000 --- a/doc/algorithms/linear_learner.rst +++ /dev/null @@ -1,21 +0,0 @@ -LinearLearner --------------------- - -The Amazon SageMaker LinearLearner algorithm. - -.. autoclass:: sagemaker.LinearLearner - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - :exclude-members: image_uri, instance_count, instance_type, predictor_type, binary_classifier_model_selection_criteria, target_recall, target_precision, positive_example_weight_mult, epochs, use_bias, num_models, parameter, num_calibration_samples, calibration, init_method, init_scale, init_sigma, init_bias, optimizer, loss, wd, l1, momentum, learning_rate, beta_1, beta_2, bias_lr_mult, use_lr_scheduler, lr_scheduler_step, lr_scheduler_factor, lr_scheduler_minimum_lr, lr_scheduler_minimum_lr, mini_batch_size, feature_dim, bias_wd_mult, MAX_DEFAULT_BATCH_SIZE - -.. autoclass:: sagemaker.LinearLearnerModel - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: sagemaker.LinearLearnerPredictor - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/algorithms/ntm.rst b/doc/algorithms/ntm.rst deleted file mode 100644 index 2d72cda2b2..0000000000 --- a/doc/algorithms/ntm.rst +++ /dev/null @@ -1,23 +0,0 @@ -NTM ---- - -The Amazon SageMaker NTM algorithm. - -.. autoclass:: sagemaker.NTM - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - :exclude-members: image_uri, num_topics, encoder_layers, epochs, encoder_layers_activation, optimizer, tolerance, - num_patience_epochs, batch_norm, rescale_gradient, clip_gradient, weight_decay, learning_rate - - -.. autoclass:: sagemaker.NTMModel - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: sagemaker.NTMPredictor - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/algorithms/object2vec.rst b/doc/algorithms/object2vec.rst deleted file mode 100644 index 102db5ee5d..0000000000 --- a/doc/algorithms/object2vec.rst +++ /dev/null @@ -1,21 +0,0 @@ -Object2Vec ----------- - -The Amazon SageMaker Object2Vec algorithm. - -.. autoclass:: sagemaker.Object2Vec - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - :exclude-members: image_uri, enc_dim, mini_batch_size, epochs, early_stopping_patience, early_stopping_tolerance, - dropout, weight_decay, bucket_width, num_classes, mlp_layers, mlp_dim, mlp_activation, - output_layer, optimizer, learning_rate, enc0_network, enc1_network, enc0_cnn_filter_width, - enc1_cnn_filter_width, enc0_max_seq_len, enc1_max_seq_len, enc0_token_embedding_dim, - enc1_token_embedding_dim, enc0_vocab_size, enc1_vocab_size, enc0_layers, enc1_layers, - enc0_freeze_pretrained_embedding, enc1_freeze_pretrained_embedding - -.. 
autoclass:: sagemaker.Object2VecModel - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/algorithms/pca.rst b/doc/algorithms/pca.rst deleted file mode 100644 index 68a5db29f3..0000000000 --- a/doc/algorithms/pca.rst +++ /dev/null @@ -1,22 +0,0 @@ -PCA --------------------- - -The Amazon SageMaker PCA algorithm. - -.. autoclass:: sagemaker.PCA - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - :exclude-members: image_uri, num_components, algorithm_mode, subtract_mean, extra_components, mini_batch_size, feature_dim, MAX_DEFAULT_BATCH_SIZE - - -.. autoclass:: sagemaker.PCAModel - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: sagemaker.PCAPredictor - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/algorithms/randomcutforest.rst b/doc/algorithms/randomcutforest.rst deleted file mode 100644 index b723319a52..0000000000 --- a/doc/algorithms/randomcutforest.rst +++ /dev/null @@ -1,22 +0,0 @@ -Random Cut Forest ------------------ - -The Amazon SageMaker Random Cut Forest algorithm. - -.. autoclass:: sagemaker.RandomCutForest - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - :exclude-members: image_uri, num_trees, num_samples_per_tree, eval_metrics, feature_dim, MINI_BATCH_SIZE - - -.. autoclass:: sagemaker.RandomCutForestModel - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: sagemaker.RandomCutForestPredictor - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/algorithms/sagemaker.amazon.amazon_estimator.rst b/doc/algorithms/sagemaker.amazon.amazon_estimator.rst deleted file mode 100644 index bfc0b4a0be..0000000000 --- a/doc/algorithms/sagemaker.amazon.amazon_estimator.rst +++ /dev/null @@ -1,9 +0,0 @@ -Amazon Estimators --------------------- - -Base class for Amazon Estimator implementations - -.. autoclass:: sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/amazon_sagemaker_debugger.rst b/doc/amazon_sagemaker_debugger.rst deleted file mode 100644 index 23081138b1..0000000000 --- a/doc/amazon_sagemaker_debugger.rst +++ /dev/null @@ -1,414 +0,0 @@ - -######################### -Amazon SageMaker Debugger -######################### - - -Amazon SageMaker Debugger allows you to detect anomalies while training your machine learning model by emitting relevant data during training, storing the data and then analyzing it. - -.. contents:: - -Background -========== - -Amazon SageMaker provides every developer and data scientist with the ability to build, train, and deploy machine learning models quickly. Amazon SageMaker is a fully-managed service that encompasses the entire machine learning workflow. You can label and prepare your data, choose an algorithm, train a model, and then tune and optimize it for deployment. You can deploy your models to production with Amazon SageMaker to make predictions at lower costs than was previously possible. - -`SageMaker Debugger `__ provides a way to hook into the training process and emit debug artifacts (a.k.a. "tensors") that represent the training state at each point in the training lifecycle. Debugger then stores the data in real time and uses rules that encapsulate logic to analyze tensors and react to anomalies. Debugger provides built-in rules and allows you to write custom rules for analysis. - -Setup -===== - -To get started, you must satisfy the following prerequisites: - -* Specify an AWS Region where you'll train your model. 
-* Give Amazon SageMaker the access to your data in Amazon Simple Storage Service (Amazon S3) needed to train your model by creating an IAM role ARN. See the `AWS IAM documentation `__ for how to fine tune the permissions needed. - -Capture real-time debugging data during model training in Amazon SageMaker -========================================================================== - -To enable data emission for debugging model training, Amazon SageMaker initializes a "hook" which attaches itself to the training process and emits data necessary for debugging, i.e. tensors. To provide the hook's configuration, specify the option called ``DebuggerHookConfig`` when training your model. For more about the ``DebuggerHookConfig`` object, see the `API documentation `__. - -The ``DebuggerHookConfig`` accepts one or more objects of type ``CollectionConfig``, which defines the configuration around the tensor collection you intend to emit and save during model training. The concept of a "collection" helps group tensors for easier handling. - -.. code:: python - - from sagemaker.debugger import CollectionConfig, DebuggerHookConfig - - collection_config = CollectionConfig( - name='collection_name', - parameters={ - 'key': 'value' - } - ) - - debugger_hook_config = DebuggerHookConfig( - s3_output_path='s3://path/for/data/emission', - container_local_output_path='/local/path/for/data/emission', - hook_parameters={ - 'key': 'value' - }, - collection_configs=[ - collection_config - ] - ) - - estimator = TensorFlow( - role=role, - instance_count=1, - instance_type=instance_type, - debugger_hook_config=debugger_hook_config - ) - -Specifying configurations for collections ------------------------------------------ - -Collection Name -~~~~~~~~~~~~~~~ - -``name`` in ``CollectionConfig`` is used to specify the name of the tensor collection you wish to emit and store. This name is used by SageMaker Debugger to refer to all the tensors in this collection. You can supply any valid string for the collection name. In addition, there are "built-in" collections, whose names are recognized by the hook, that you can emit simply by specifying their names. Examples of these collections are "gradients", "weights", "biases", etc. A full list is available at `SageMaker Debugger Built-in Collections `__. - -To emit and store one of the built-in collections: - -.. code:: python - - collection_config_biases = CollectionConfig(name='biases') - -Collection Parameters -~~~~~~~~~~~~~~~~~~~~~ - -To specify additional configuration for a particular collection, use ``parameters`` in the ``CollectionConfig``. This parameter provides a mapping that defines what group of tensors are saved and how frequently they are to be saved. - -For instance, suppose you want to save a collection of tensors with the following properties: - -========================================================= ========= -**Desired Property** **Value** ---------------------------------------------------------- --------- -regex of tensors which should be saved ``relu`` -step frequency at which the said tensors should be saved 20 -starting at step 5 -ending at step 100 -========================================================= ========= - -You should configure the ``CollectionConfig`` as: - -.. 
code:: python - - collection_config_for_relu = CollectionConfig( - name='custom_relu_collection', - parameters={ - 'include_regex': 'relu', - 'save_interval': '20', - 'start_step': '5', - 'end_step': '100' - } - ) - -The possible values of ``parameters`` in ``CollectionConfig`` can be viewed at `CollectionParameters `__. - -Hook Parameters -~~~~~~~~~~~~~~~ - -To apply properties across all collections, use ``hook_parameters`` within the ``DebuggerHookConfig`` object. For example, to apply a value of ``10`` for ``save_interval`` across all collections: - -.. code:: python - - from sagemaker.debugger import CollectionConfig, DebuggerHookConfig - - collection_config_1 = CollectionConfig( - name='collection_name_1', - parameters={ - 'include_regex': '.*' - } - ) - collection_config_2 = CollectionConfig( - name='collection_name_2', - parameters={ - 'include_regex': '.*' - } - ) - - debugger_hook_config = DebuggerHookConfig( - s3_output_path='s3://path/for/data/emission', - container_local_output_path='/local/path/for/data/emission', - hook_parameters={ - 'save_interval': '10' - }, - collection_configs=[ - collection_config_1, collection_config_2 - ] - ) - -In the above sample code, the ``save_interval`` of ``10`` will be applied for storing both collections. - -Note that the ``save_interval`` value set in the ``collection_parameters`` will override the value for ``save_interval`` in the ``hook_parameters``. For example, in the above sample code, if ``collection_config_2`` had a ``save_interval`` value set to ``20``, then the tensors for that collection would be saved with step interval ``20`` while those for ``collection_config_1`` would still be saved with a step interval of ``10``. This holds true for any parameters common in ``hook_parameters`` and ``parameters`` in ``CollectionConfig``. - -The possible values of ``hook_parameters`` in ``DebuggerHookConfig`` can be viewed at `SageMaker Debugger Hook `__. - -Begin model training --------------------- - -To create a training job that initializes the debugging hook with the value of the ``DebuggerHookConfig`` object, call ``fit()`` on the ``estimator``. The hook starts emitting the relevant debugging data, i.e. the tensor collections, in real time and stores the data locally in the local path provided in ``DebuggerHookConfig``. This data is then uploaded in near real time to an S3 path derived from the path provided in the hook configuration. - -.. code:: - - s3://{destination-bucket-prefix}/{training-job-name}/debug-output/ - -The path is derived from the value of ``s3_output_path``, and not used verbatim, to ensure that artifacts from different training jobs are placed in different Amazon S3 paths. To enable correct analyses of different training jobs, it is essential to keep the debug artifacts from these jobs separate. - -To access the above Amazon S3 path through the estimator object, you can use the following command: - -.. code:: python - - tensors_s3_output_path = estimator.latest_job_debugger_artifacts_path() - -You can use the ``S3Downloader`` utility to view and download the debugging data emitted during training in Amazon S3. (Note that data is stored in a streaming fashion so the data you download locally through ``S3Downloader`` will be a snapshot of the data generated until that time.) Here is the code: - -.. 
code:: python - - from sagemaker.s3 import S3Downloader - - # Start the training by calling fit - # Setting the wait to `False` would make the fit asynchronous - estimator.fit(wait=False) - - # Get a list of S3 URIs - S3Downloader.list(estimator.latest_job_debugger_artifacts_path()) - -Continuous analyses through rules -================================= - -In addition to collecting the debugging data, Amazon SageMaker Debugger provides the capability for you to analyze it in a streaming fashion using "rules". A SageMaker Debugger "rule" is a piece of code which encapsulates the logic for analyzing debugging data. - -SageMaker Debugger provides a set of built-in rules curated by data scientists and engineers at Amazon to identify common problems while training machine learning models. There is also support for using custom rule source code for evaluation. In the following sections, you'll learn how to use both the built-in and custom rules while training your model. - -Relationship between debugger hook and rules --------------------------------------------- - -Using SageMaker Debugger is, broadly, a two-pronged approach. On one hand you have the production of debugging data, which is done through the Debugger Hook, and on the other hand you have the consumption of this data, which can be done with rules (for continuous analyses) or by using the SageMaker Debugger SDK (for interactive analyses). - -The production and consumption of data are defined independently. For example, you could configure the debugging hook to store only the collection "gradients" and then configure the rules to operate on some other collection, say, "weights". While this is possible, it gives you no meaningful insight into the training process, because the rule will do nothing in this scenario: it will wait for tensors in the collection "gradients", which are never emitted. - -For more useful and efficient debugging, configure your debugging hook to produce and store the debugging data that you care about and employ rules that operate on that particular data. This way, you ensure that the Debugger is utilized to its maximum potential in detecting anomalies. In this sense, there is a loose binding between the hook and the rules. - -Normally, you'd achieve this binding for a training job by providing values for both ``debugger_hook_config`` and ``rules`` in your estimator. However, SageMaker Debugger simplifies this by allowing you to specify the collection configuration within the ``Rule`` object itself. This way, you don't have to specify the ``debugger_hook_config`` in your estimator separately. - -Using built-in rules -------------------- - -SageMaker Debugger comes with a set of built-in rules which can be used to identify common problems in model training, for example vanishing gradients or exploding tensors. You can choose to evaluate one or more of these rules while training your model to obtain meaningful insight into the training process. To learn more about these built-in rules, see `SageMaker Debugger Built-in Rules `__. - -Pre-defined debugger hook configuration for built-in rules -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -As mentioned earlier, for efficient analyses, it's important that the debugging data that is emitted by the hook is relevant to the rules used to operate and analyze the data.
For example, if the hook is configured to emit the collection "weights", you should evaluate a rule that operates on this collection and not some other collection. - -Determining the types of data to emit for debugging with the built-in rules during the model training can be tricky. To guide you in this choice, Amazon SageMaker provides you with predefined collection configurations best suited for each of the built-in rules. So when you use built-in Debugger rules, you just need to specify the names of the built-in rule and SageMaker Debugger configures the collection(s) to emit that the rules need to operate on. To learn more about the mapping of each rule to the appropriate collection configuration, see `Amazon SageMaker Debugger Rules Config `__. - -Sample Usages -~~~~~~~~~~~~~ - -**Example 1**: Using a single built-in rule without any customization. - -.. code:: python - - from sagemaker.debugger import Rule - from smdebug_rulesconfig import vanishing_gradient - - estimator = TensorFlow( - role=role, - instance_count=1, - instance_type=instance_type, - rules=[Rule.sagemaker(vanishing_gradient())] - ) - - -In the example above, Amazon SageMaker pulls the collection configuration best suited for the built-in rule Vanishing Gradient from `SageMaker Debugger Rules Config `__ and configures the debugging data to be stored in the manner specified in the configuration. - -**Example 2**: Using more than one built-in rules without any customization. - -.. code:: python - - from sagemaker.debugger import Rule - from smdebug_rulesconfig import vanishing_gradient, weight_update_ratio - - estimator = TensorFlow( - role=role, - instance_count=1, - instance_type=instance_type, - rules=[Rule.sagemaker(vanishing_gradient()), Rule.sagemaker(weight_update_ratio())] - ) - -In the example above, Amazon SageMaker pulls the hook configurations for Vanishing Gradient and Weight Update Ratio rules from `SageMaker Debugger Rules Config `__ and configures the collections to be stored in the manner specified in each configuration. - -**Example 3**: Using a built-in rule with no customization and another built-in rule with customization. - -Here we modify the ``weight_update_ratio`` rule to store a custom collection rather than "weights" which it would normally do if the behavior is not overridden. - - -.. code:: python - - from sagemaker.debugger import Rule - from smdebug_rulesconfig import vanishing_gradient, weight_update_ratio - - wur_with_customization = Rule.sagemaker( - base_config=weight_update_ratio(), - name="custom_wup_rule_name", - rule_parameters={ - 'key1': 'value1', - 'key2': 'value2' - }, - collections_to_save=[ - CollectionConfig( - name="custom_collection_name", - parameters= { - 'key1': 'value1', - 'key2': 'value2' - } - ) - ] - ) - - estimator = TensorFlow( - role=role, - instance_count=1, - instance_type=instance_type, - rules=[ - Rule.sagemaker(vanishing_gradient()), - wur_with_customization - ] - ) - - -In the example above, the collection configuration for Vanishing Gradient is pulled from `SageMaker Debugger Rules Config `__ and the user supplied configuration is used for the Weight Update Ratio rule. - -Using custom rules ------------------- - -SageMaker Debugger also allows the users to create custom rules and have those evaluated against the debugging data. To use custom rules, you must provide two items: - -* Custom rule source file and its local or S3 location. 
You can learn more about how to write custom rules at `How to Write Custom Debugger Rules `__ -* Rule evaluator image for the corresponding region available from `Amazon SageMaker Debugger Custom Rule Images `__ - -To learn more about how to write your custom rules and use them see `SageMaker Debugger Custom Rules `__. - -Sample Usage -~~~~~~~~~~~~ - -For this example, we evaluate an altered version of the Vanishing Gradient rule against our model training. The rule checks the gradients and asserts that the mean value of the gradients at any step is always above a certain threshold. The source code for the rule is available `here `__ and is assumed to be in the relative directory path ``rules/custom_gradient_rule.py``. - -To evaluate the custom rule against the training: - -.. code:: python - - from sagemaker.debugger import Rule - - region = 'us-east-1' # the AWS region of the training job - custom_gradient_rule = Rule.custom( - name='MyCustomRule', - image_uri='864354269164.dkr.ecr.{}.amazonaws.com/sagemaker-debugger-rule-evaluator:latest'.format(region), - instance_type='ml.t3.medium', # instance type to run the rule evaluation on - source='rules/custom_gradient_rule.py', # path to the rule source file - rule_to_invoke='CustomGradientRule', # name of the class to invoke in the rule source file - volume_size_in_gb=30, # EBS volume size required to be attached to the rule evaluation instance - collections_to_save=[CollectionConfig("gradients")], # collections to be analyzed by the rule - rule_parameters={ - 'threshold': '20.0' # this will be used to initialize 'threshold' param in your rule constructor - } - ) - - estimator = TensorFlow( - role=role, - instance_count=1, - instance_type=instance_type, - rules=[ - custom_gradient_rule - ] - ) - -While initializing the custom rule through ``Rules.custom()``, you must specify a valid S3 location for rule source location as the value of ``source``. - -Capture real-time TensorBoard data from the debugging hook -========================================================== - -In addition to emitting and storing the debugging data useful for analyses, the debugging hook is also capable of emitting `TensorBoard `__ data for you to point your TensorBoard application at and to visualize. - -To enable the debugging hook to emit TensorBoard data, you need to specify the new option ``TensorBoardOutputConfig`` as follows: - -.. code:: python - - from sagemaker.debugger import TensorBoardOutputConfig - - tensorboard_output_config = TensorBoardOutputConfig( - s3_output_path='s3://path/for/tensorboard/data/emission', - container_local_output_path='/local/path/for/tensorboard/data/emission' - ) - - estimator = TensorFlow( - role=role, - instance_count=1, - instance_type=instance_type, - tensorboard_output_config=tensorboard_output_config - ) - -To create a training job where the debugging hook emits and stores TensorBoard data using the configuration specified in the ``TensorBoardOutputConfig`` object, call ``fit()`` on the ``estimator``. The debugging hook uploads the generated TensorBoard data in near real-time to an S3 path derived from the value of ``s3_output_path`` provided in the configuration: - -.. code:: - - s3://{destination-bucket-prefix}/{training-job-name}/tensorboard-output/ - -To access the S3 path where the tensorboard data is stored, you can do: - -.. 
code:: python - - tensorboard_s3_output_path = estimator.latest_job_tensorboard_artifacts_path() - -The reason for deriving the path from the value supplied to ``s3_output_path`` is the same as in the ``DebuggerHookConfig`` case - the directory for TensorBoard artifact storage needs to be different for each training job. - -Note that having the TensorBoard data emitted from the hook in addition to the tensors will incur a cost to the training and may slow it down. - -Interactive analysis using SageMaker Debugger SDK and visualizations -==================================================================== - -`Amazon SageMaker Debugger SDK `__ also allows you to do interactive analyses on the debugging data produced from a training job run and to render visualizations of it. After calling ``fit()`` on the estimator, you can use the SDK to load the saved data in a SageMaker Debugger ``trial`` and do an analysis on the data: - -.. code:: python - - from smdebug.trials import create_trial - - s3_output_path = estimator.latest_job_debugger_artifacts_path() - trial = create_trial(s3_output_path) - -To learn more about the programming model for analysis using the SageMaker Debugger SDK, see `SageMaker Debugger Analysis `__. - -For a tutorial on what you can do after creating the trial and how to visualize the results, see `SageMaker Debugger - Visualizing Debugging Results `__. - -Default behavior and opting out -=============================== - -For ``TensorFlow``, ``Keras``, ``MXNet``, ``PyTorch`` and ``XGBoost`` estimators, the ``DebuggerHookConfig`` is always initialized when you initialize the estimator, whether or not you specify it. This is done to minimize the code changes needed to get useful debugging information. - -To disable the hook initialization, specify ``False`` for the value of ``debugger_hook_config`` in your framework estimator's initialization: - -.. code:: python - - estimator = TensorFlow( - role=role, - instance_count=1, - instance_type=instance_type, - debugger_hook_config=False - ) - -Learn More -========== - -Further documentation --------------------- - -* API documentation: https://sagemaker.readthedocs.io/en/stable/debugger.html -* AWS documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html -* SageMaker Debugger SDK: https://github.com/awslabs/sagemaker-debugger -* ``S3Downloader``: https://sagemaker.readthedocs.io/en/stable/s3.html#sagemaker.s3.S3Downloader - -Notebook examples ----------------- - -Consult our notebook examples for in-depth tutorials: https://github.com/awslabs/amazon-sagemaker-examples/tree/master/sagemaker-debugger diff --git a/doc/amazon_sagemaker_featurestore.rst b/doc/amazon_sagemaker_featurestore.rst deleted file mode 100644 index 0388e9218d..0000000000 --- a/doc/amazon_sagemaker_featurestore.rst +++ /dev/null @@ -1,397 +0,0 @@ -############################## -Amazon SageMaker Feature Store -############################## - -.. rubric:: **Create Feature Groups** - :name: bCe9CAXalwH - -This guide will show you how to create and use -`Amazon SageMaker Feature Store `__. -The example code in this guide covers using the SageMaker Python SDK. The -underlying APIs are available for developers using other languages. - -.. rubric:: Features - :name: bCe9CAtWHPP - -Prior to using a feature store you will typically load your dataset, run -transformations, and set up your features for ingestion. This step has a -lot of variation and is highly dependent on your data.
The example code -in the following code blocks will often make reference to an example -notebook, \ `Fraud Detection with Amazon SageMaker Feature Store -`__. -It is recommended that you run this notebook -in SageMaker Studio and use the code from there, as the code in this -guide is conceptual and not fully functional if copied. - -.. rubric:: Feature store data types and schema - :name: bCe9CAr4kIT - -Feature Store supported types are ``String``, ``Fractional``, and -``Integral``. The default type is set to ``String``. This means that, if -a column in your dataset is not a ``float`` or ``long`` type, it will -default to ``String`` in your feature store. - - -You may use a schema to describe your data’s columns and data types. You -pass this schema into FeatureDefinitions, a required parameter for a -FeatureGroup. However, for Python developers, the SageMaker Python SDK -has automatic data type detection when you use the -``load_feature_definitions`` function. - -.. rubric:: Feature store setup - :name: bCe9CAgy6IH - -To start using Feature Store, first create a SageMaker session, boto3 -session, and a Feature Store session. Also, set up the bucket you will -use for your features; this is your Offline Store. The following will -use the SageMaker default bucket and add a custom prefix to it. - -.. note:: - - The role that you use requires these managed - policies: ``AmazonSageMakerFullAccess`` and ``AmazonSageMakerFeatureStoreAccess``. - - -.. code:: python - - import boto3 - import sagemaker - from sagemaker.session import Session - - sagemaker_session = sagemaker.Session() - region = sagemaker_session.boto_region_name - boto_session = boto3.Session(region_name=region) - role = sagemaker.get_execution_role() - default_bucket = sagemaker_session.default_bucket() - prefix = 'sagemaker-featurestore' - offline_feature_store_bucket = 's3://{}/{}'.format(default_bucket, prefix) - - sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region) - featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region) - s3_client = boto_session.client(service_name='s3', region_name=region)  # used below to download the example datasets - - feature_store_session = Session( - boto_session=boto_session, - sagemaker_client=sagemaker_client, - sagemaker_featurestore_runtime_client=featurestore_runtime - ) - -.. rubric:: Load datasets and partition data into feature groups - :name: bCe9CA31y9f - -You will load your data into data frames for each of your features. You -will use these data frames after you set up the feature group. In the -fraud detection example, you can see these steps in the following code. - -.. code:: python - - import numpy as np - import pandas as pd - import matplotlib.pyplot as plt - import io - - fraud_detection_bucket_name = 'sagemaker-featurestore-fraud-detection' - identity_file_key = 'sampled_identity.csv' - transaction_file_key = 'sampled_transactions.csv' - - identity_data_object = s3_client.get_object(Bucket=fraud_detection_bucket_name, Key=identity_file_key) - transaction_data_object = s3_client.get_object(Bucket=fraud_detection_bucket_name, Key=transaction_file_key) - - identity_data = pd.read_csv(io.BytesIO(identity_data_object['Body'].read())) - transaction_data = pd.read_csv(io.BytesIO(transaction_data_object['Body'].read())) - - identity_data = identity_data.round(5) - transaction_data = transaction_data.round(5) - - identity_data = identity_data.fillna(0) - transaction_data = transaction_data.fillna(0) - - # Feature transformations for this dataset are applied before ingestion into FeatureStore. 
- # One hot encode card4, card6 - encoded_card_bank = pd.get_dummies(transaction_data['card4'], prefix = 'card_bank') - encoded_card_type = pd.get_dummies(transaction_data['card6'], prefix = 'card_type') - - transformed_transaction_data = pd.concat([transaction_data, encoded_card_type, encoded_card_bank], axis=1) - transformed_transaction_data = transformed_transaction_data.rename(columns={"card_bank_american express": "card_bank_american_express"}) - -.. rubric:: Feature group setup - :name: bCe9CARx8h9 - -Name your feature groups and customize the feature names with a unique -name, and setup each feature group with the ``FeatureGroup`` class. - -.. code:: python - - from sagemaker.feature_store.feature_group import FeatureGroup - feature_group_name = "some string for a name" - feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session) - -For example, in the fraud detection example, the two feature groups are -“identity” and “transaction”. In the following code you can see how the -names are customized with a timestamp, then each group is setup by -passing in the name and the session. - -.. code:: python - - import time - from time import gmtime, strftime, sleep - from sagemaker.feature_store.feature_group import FeatureGroup - - identity_feature_group_name = 'identity-feature-group-' + strftime('%d-%H-%M-%S', gmtime()) - transaction_feature_group_name = 'transaction-feature-group-' + strftime('%d-%H-%M-%S', gmtime()) - - identity_feature_group = FeatureGroup(name=identity_feature_group_name, sagemaker_session=feature_store_session) - transaction_feature_group = FeatureGroup(name=transaction_feature_group_name, sagemaker_session=feature_store_session) - -.. rubric:: Record identifier and event time feature - :name: bCe9CA17VV7 - -Next, you will need a record identifier name and an event time feature -name. This will match the column of the corresponding features in your -data. For example, in the fraud detection example, the column of -interest is “TransactionID”. “EventTime” can be appended to your data -when no timestamp is available. In the following code, you can see how -these variables are set, and then ``EventTime`` is appended to both -feature’s data. - -.. code:: python - - record_identifier_name = "TransactionID" - event_time_feature_name = "EventTime" - current_time_sec = int(round(time.time())) - identity_data[event_time_feature_name] = pd.Series([current_time_sec]*len(identity_data), dtype="float64") - transformed_transaction_data[event_time_feature_name] = pd.Series([current_time_sec]*len(transaction_data), dtype="float64") - -.. rubric:: Feature definitions - :name: bCe9CA4yUcO - -You can now load the feature definitions by passing a data frame -containing the feature data. In the following code for the fraud -detection example, the identity feature and transaction feature are each -loaded by using ``load_feature_definitions``, and this function -automatically detects the data type of each column of data. For -developers using a schema rather than automatic detection, refer to the -`Creating Feature Groups with Data Wrangler example `__ for -code that shows loading the schema, mapping it and adding as a -``FeatureDefinition`` that is used when you create the ``FeatureGroup``. -This example also covers a boto3 implementation, instead of using the -SageMaker Python SDK. - -.. 
code:: python - - identity_feature_group.load_feature_definitions(data_frame=identity_data); # output is suppressed - transaction_feature_group.load_feature_definitions(data_frame=transformed_transaction_data); # output is suppressed - -.. rubric:: Create a feature group - :name: bCe9CAwMEgY - -The last step for creating the feature group is to use the -``create`` function. The following code shows all of the available -parameters. The online store is not created by default, so you must set -this as \ ``True`` if you want to enable it. The ``s3_uri`` is the -location of your offline store. - -.. code:: python - - # create a FeatureGroup - feature_group.create( -     description = "Some info about the feature group", -     feature_group_name = feature_group_name, -     record_identifier_name = record_identifier_name, -     event_time_feature_name = event_time_feature_name, -     feature_definitions = feature_definitions, -     role_arn = role, -     s3_uri = offline_feature_store_bucket, -     enable_online_store = True, -     online_store_kms_key_id = None, -     offline_store_kms_key_id = None, -     disable_glue_table_creation = False, -     data_catalog_config = None, -     tags = ["tag1","tag2"]) - -The following code from the fraud detection example shows a minimal -``create`` call for each of the two features groups being created. - -.. code:: python - - identity_feature_group.create( -     s3_uri=offline_feature_store_bucket, -     record_identifier_name=record_identifier_name, -     event_time_feature_name=event_time_feature_name, -     role_arn=role, -     enable_online_store=True - ) - - transaction_feature_group.create( -     s3_uri=offline_feature_store_bucket, -     record_identifier_name=record_identifier_name, -     event_time_feature_name=event_time_feature_name, -     role_arn=role, -     enable_online_store=True - ) - -Creating a feature group takes time as the data is loaded. You will need -to wait until it is created before you can use it. You can check status -using the following method. - -.. code:: python - -  status = feature_group.describe().get("FeatureGroupStatus") - -While it is creating you will get a ``Creating`` as a response. When -this has finished successfully the response will be ``Created``. The -other possible statuses are ``CreateFailed``, ``Deleting``, or -``DeleteFailed``. - -.. rubric:: Describe a feature group - :name: bCe9CA2TNON - -You can retrieve information about your feature group with the -``describe`` function. - -.. code:: python - - feature_group.describe() - -.. rubric:: List feature groups - :name: bCe9CA2wPF2 - -You can list all of your feature groups with the -``list_feature_groups`` function. - -.. code:: python - - sagemaker_client.list_feature_groups() - -.. rubric:: Put records in a feature group - :name: bCe9CAymRdA - -You can use the ``ingest`` function to load your feature data. You pass -in a data frame of feature data, set the number of workers, and choose -to wait for it to return or not. The following example demonstrates -using the ``ingest`` function. - -.. code:: python - - feature_group.ingest( -     data_frame=feature_data, max_workers=3, wait=True - ) - -For each feature group you have, run the ``ingest`` function on the -feature data you want to load. - -.. rubric:: Get records from a feature group - :name: bCe9CA25xj5 - -You can use the ``get_record`` function to retrieve the data for a -specific feature by its record identifier. The following example uses an -example identifier to retrieve the record. - -.. 
code:: python - - record_identifier_value = str(2990130) - featurestore_runtime.get_record(FeatureGroupName=transaction_feature_group_name, RecordIdentifierValueAsString=record_identifier_value) - -You can use the ``batch_get_record`` function to retrieve multiple records simultaneously from your feature store. The following example uses this API to retrieve a batch of records. - -.. code:: python - - record_identifier_values = ["573291", "109382", "828400", "124013"] - featurestore_runtime.batch_get_record(Identifiers=[{"FeatureGroupName": transaction_feature_group_name, "RecordIdentifiersValueAsString": record_identifier_values}]) - -An example response from the fraud detection example: - -.. code:: python - - ... - 'Record': [{'FeatureName': 'TransactionID', 'ValueAsString': '2990130'}, -   {'FeatureName': 'isFraud', 'ValueAsString': '0'}, -   {'FeatureName': 'TransactionDT', 'ValueAsString': '152647'}, -   {'FeatureName': 'TransactionAmt', 'ValueAsString': '75.0'}, -   {'FeatureName': 'ProductCD', 'ValueAsString': 'H'}, -   {'FeatureName': 'card1', 'ValueAsString': '4577'}, - ... - -.. rubric:: Hive DDL commands - :name: bCe9CA30nHn - -The SageMaker Python SDK’s FeatureStore class also provides the -functionality to generate Hive DDL commands. The schema of the table is -generated based on the feature definitions. Columns are named after the -feature names, and data types are inferred based on the feature types. - -.. code:: python - - print(feature_group.as_hive_ddl()) - -An example output: - -.. code:: python - - CREATE EXTERNAL TABLE IF NOT EXISTS sagemaker_featurestore.identity-feature-group-27-19-33-00 ( -   TransactionID INT -   id_01 FLOAT -   id_02 FLOAT -   id_03 FLOAT -   id_04 FLOAT -  ... - -.. rubric:: Build a Training Dataset - :name: bCe9CAVnDLV - -Feature Store automatically builds an AWS Glue Data Catalog when -Feature Groups are created; this can optionally be turned off. The -following shows how to create a single training dataset with feature -values from both identity and transaction feature groups created above. -It also shows how to run an Amazon Athena query to join data -stored in the Offline Store from both identity and transaction feature -groups. - - -To start, create an Athena query using\ ``athena_query()``\ for both -identity and transaction feature groups. The ``table_name`` is the Glue -table that is auto-generated by Feature Store. - -.. code:: python - - identity_query = identity_feature_group.athena_query() - transaction_query = transaction_feature_group.athena_query() - - identity_table = identity_query.table_name - transaction_table = transaction_query.table_name - -.. rubric:: Writing and Executing your Athena Query - :name: bCe9CArSR5J - -Write your query in SQL against these feature groups, then -execute it with the ``.run()`` method, specifying the S3 bucket -location where the data set should be saved. - -.. code:: python - - # Athena query - query_string = 'SELECT * FROM "'+transaction_table+'" LEFT JOIN "'+identity_table+'" ON "'+transaction_table+'".transactionid = "'+identity_table+'".transactionid' - - # run Athena query. The output is loaded to a Pandas dataframe. - dataset = pd.DataFrame() - identity_query.run(query_string=query_string, output_location='s3://'+default_s3_bucket_name+'/query_results/') - identity_query.wait() - dataset = identity_query.as_dataframe() - -From here you can train a model using this data set and then perform -inference.
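The original example stops at the query result. As a purely illustrative next step (not part of the fraud detection notebook), the following sketch shows one way the returned dataframe could feed a SageMaker training job. The ``isFraud`` label column comes from the records shown above; the XGBoost container version, hyperparameters, and the ``fraud-train`` S3 prefix are assumptions.

.. code:: python

    import sagemaker
    from sagemaker.estimator import Estimator
    from sagemaker.inputs import TrainingInput

    # Assumption: move the "isFraud" label into the first column and drop the
    # header/index, as expected by the built-in XGBoost algorithm's CSV format.
    train_df = pd.concat([dataset["isFraud"], dataset.drop(columns=["isFraud"])], axis=1)
    train_df.to_csv("train.csv", header=False, index=False)

    # Upload the training CSV next to the query results (hypothetical prefix).
    train_s3_uri = sagemaker.s3.S3Uploader.upload(
        "train.csv", "s3://" + default_s3_bucket_name + "/fraud-train"
    )

    xgb = Estimator(
        image_uri=sagemaker.image_uris.retrieve(
            "xgboost", feature_store_session.boto_region_name, version="1.2-1"  # assumed container version
        ),
        role=role,
        instance_count=1,
        instance_type="ml.m5.xlarge",
        sagemaker_session=feature_store_session,
    )
    xgb.set_hyperparameters(objective="binary:logistic", num_round=100)  # assumed hyperparameters
    xgb.fit({"train": TrainingInput(train_s3_uri, content_type="text/csv")})

- -.. 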
rubric:: Delete a feature group - :name: bCe9CA61b78 - -You can delete a feature group with the ``delete`` function. - -.. code:: python - - feature_group.delete() - -The following code example is from the fraud detection example. - -.. code:: python - - identity_feature_group.delete() - transaction_feature_group.delete() diff --git a/doc/amazon_sagemaker_model_monitoring.rst b/doc/amazon_sagemaker_model_monitoring.rst deleted file mode 100644 index e493c24e2f..0000000000 --- a/doc/amazon_sagemaker_model_monitoring.rst +++ /dev/null @@ -1,204 +0,0 @@ - -############################## -Amazon SageMaker Model Monitor -############################## - - -Amazon SageMaker Model Monitor allows you to create a set of baseline statistics and constraints using the data with which your model was trained, then set up a schedule to monitor the predictions made on your endpoint. - -.. contents:: - -Background -========== - -Amazon SageMaker provides every developer and data scientist with the ability to build, train, and deploy machine learning models quickly. Amazon SageMaker is a fully-managed service that encompasses the entire machine learning workflow. You can label and prepare your data, choose an algorithm, train a model, and then tune and optimize it for deployment. You can deploy your models to production with Amazon SageMaker to make predictions at lower costs than was previously possible. - -Amazon SageMaker Model Monitor enables you to capture the input, output and metadata for the invocations of the models that you deploy. It also enables you to analyze the data and monitor its quality. In this notebook, you learn how Amazon SageMaker enables these capabilities. - -Setup -===== - -To get started, you must satisfy the following prerequisites: - -* Specify an AWS Region to host your model. -* Create an IAM role ARN that is used to give Amazon SageMaker access to your data in Amazon Simple Storage Service (Amazon S3). See the `AWS IAM documentation `__ for how to fine tune the permissions needed. -* Create an S3 bucket used to store the data used to train your model, any additional model data, and the data captured from model invocations. You can use the same bucket for these, or use separate buckets (e.g. if you want different security policies). - -Capture real-time inference data from Amazon SageMaker endpoints -================================================================ - -To enable data capture for monitoring the model data quality, specify the new capture option called ``DataCaptureConfig`` when deploying to an endpoint. You can choose to capture the request payload, the response payload or both with this configuration. The capture config applies to all variants. For more about the ``DataCaptureConfig`` object, see the `API documentation `__. - -.. code:: python - - from sagemaker.model_monitor import DataCaptureConfig - - data_capture_config = DataCaptureConfig( - enable_capture=True, - sampling_percentage=100, - destination_s3_uri='s3://path/for/data/capture' - ) - - predictor = model.deploy( - initial_instance_count=1, - instance_type='ml.m4.xlarge', - data_capture_config=data_capture_config - ) - -When you invoke the endpoint, the request and response payload, along with some additional metadata, is saved in the Amazon S3 location that you have specified in the ``DataCaptureConfig``. You should expect to see different files from different time periods, organized based on the hour in which the invocation occurred. The format of the Amazon S3 path is: - -.. 
code:: - - s3://{destination-bucket-prefix}/{endpoint-name}/{variant-name}/yyyy/mm/dd/hh/filename.jsonl - -You can use the ``S3Downloader`` utility to view and download the captured data in Amazon S3: - -.. code:: python - - from sagemaker.s3 import S3Downloader - - # Invoke the endpoint - predictor.predict(data) - - # Get a list of S3 URIs - S3Downloader.list('s3://path/for/data/capture') - - # Read a specific file - S3Downloader.read_file('s3://path/for/data/capture/endpoint-name/variant-name/2020/01/01/00/filename.jsonl') - -The contents of the single captured file should be all the data captured in an Amazon SageMaker-specific JSON-line formatted file. Each inference request is captured in a single line in the jsonl file. The line contains both the input and output merged together. - -Baselining and continuous monitoring -==================================== - -In addition to collecting the data, Amazon SageMaker provides the capability for you to monitor and evaluate the data observed by the endpoints. Two tasks are needed for this: - -* Create a baseline with which you compare the realtime traffic. -* Setup a schedule to continuously evaluate and compare against the baseline after it has been created. - -Constraint suggestion with baseline/training dataset ----------------------------------------------------- - -You can ask Amazon SageMaker to suggest a set of baseline constraints and generate descriptive statistics that characterize the data in a training dataset stored in Amazon S3. ``DefaultModelMonitor.suggest_baseline()`` starts a Processing Job using a Model Monitor container provided by Amazon SageMaker to generate the constraints. You can read more about ``suggest_baseline()`` in the `API documentation `__. - -.. code:: python - - from sagemaker.model_monitor import DefaultModelMonitor - from sagemaker.model_monitor.dataset_format import DatasetFormat - - my_monitor = DefaultModelMonitor( - role=role, - instance_count=1, - instance_type='ml.m5.xlarge', - volume_size_in_gb=20, - max_runtime_in_seconds=3600, - ) - - my_monitor.suggest_baseline( - baseline_dataset='s3://path/to/training-dataset-with-header.csv', - dataset_format=DatasetFormat.csv(header=True), - ) - -With the monitor object, you can also explore the generated constraints and statistics: - -.. code:: python - - import pandas as pd - - baseline_job = my_monitor.latest_baselining_job - schema_df = pd.io.json.json_normalize(baseline_job.baseline_statistics().body_dict["features"]) - schema_df.head(10) - - constraints_df = pd.io.json.json_normalize(baseline_job.suggested_constraints().body_dict["features"]) - constraints_df.head(10) - -Analyze the data collected for data quality issues --------------------------------------------------- - -You can also analyze and monitor the data with Monitoring Schedules. - -Using ``DefaultMonitor.create_monitoring_schedule()``, you can create a model monitoring schedule for an endpoint that compares the baseline resources (constraints and statistics) against the realtime traffic. For more about this method, see the `API documentation `__. - -.. code:: python - - from sagemaker.model_monitor import CronExpressionGenerator - - my_monitor.create_monitoring_schedule( - monitor_schedule_name='my-monitoring-schedule', - endpoint_input=predictor.endpoint_name, - statistics=my_monitor.baseline_statistics(), - constraints=my_monitor.suggested_constraints(), - schedule_cron_expression=CronExpressionGenerator.hourly(), - ) - -The schedule starts jobs at the specified interval. - -.. 
note:: - - Even for an hourly schedule, Amazon SageMaker has a buffer period of 20 minutes to schedule your execution. This is expected and done for load balancing on the backend. - -Once the executions have started, you can use ``list_executions()`` to view them: - -.. code:: python - - executions = my_monitor.list_executions() - -You can also view the status of a specific execution: - -.. code:: python - - latest_execution = executions[-1] - - latest_execution.describe()['ProcessingJobStatus'] - latest_execution.describe()['ExitMessage'] - -Here are the possible terminal states and what each of them means: - -* ``Completed`` - This means the monitoring execution completed and no issues were found in the violations report. -* ``CompletedWithViolations`` - This means the execution completed, but constraint violations were detected. -* ``Failed`` - The monitoring execution failed, maybe due to client error (perhaps incorrect role premissions) or infrastructure issues. Further examination of the FailureReason and ExitMessage is necessary to identify what exactly happened. -* ``Stopped`` - job exceeded the max runtime or was manually stopped. - -You can also get the S3 URI for the output with ``latest_execution.output.destination`` and analyze the results. - -Visualize results -================= - -You can use the monitor object to gather reports for visualization: - -.. code:: python - - suggested_constraints = my_monitor.suggested_constraints() - baseline_statistics = my_monitor.baseline_statistics() - - latest_monitoring_violations = my_monitor.latest_monitoring_constraint_violations() - latest_monitoring_statistics = my_monitor.latest_monitoring_statistics() - -For a tutorial on how to visualize the results, see `SageMaker Model Monitor - visualizing monitoring results `__. - -Delete the resources -==================== - -When deleting an endpoint, you need to first delete the monitoring schedule: - -.. code:: python - - my_monitor.delete_monitoring_schedule() - - predictor.delete_endpoint() - predictor.delete_model() - -Learn More -========== - -Further documentation ---------------------- - -* API documentation: https://sagemaker.readthedocs.io/en/stable/model_monitor.html -* AWS documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor.html -* ``S3Downloader``: https://sagemaker.readthedocs.io/en/stable/s3.html#sagemaker.s3.S3Downloader - -Notebook examples ------------------ - -Consult our notebook examples for in-depth tutorials: https://github.com/awslabs/amazon-sagemaker-examples/tree/master/sagemaker_model_monitor diff --git a/doc/amazon_sagemaker_processing.rst b/doc/amazon_sagemaker_processing.rst deleted file mode 100644 index ac168578cd..0000000000 --- a/doc/amazon_sagemaker_processing.rst +++ /dev/null @@ -1,247 +0,0 @@ -########################### -Amazon SageMaker Processing -########################### - - -Amazon SageMaker Processing allows you to run steps for data pre- or post-processing, feature engineering, data validation, or model evaluation workloads on Amazon SageMaker. - -.. contents:: - -Background -========== - -Amazon SageMaker lets developers and data scientists train and deploy machine learning models. With Amazon SageMaker Processing, you can run processing jobs for data processing steps in your machine learning pipeline. Processing jobs accept data from Amazon S3 as input and store data into Amazon S3 as output. - -.. 
image:: ./amazon_sagemaker_processing_image1.png - -Setup -===== - -The fastest way to get started with Amazon SageMaker Processing is by running a Jupyter notebook. You can follow the `Getting Started with Amazon SageMaker`_ guide to start running notebooks on Amazon SageMaker. - -.. _Getting Started with Amazon SageMaker: https://docs.aws.amazon.com/sagemaker/latest/dg/gs.html - -You can run notebooks on Amazon SageMaker that demonstrate end-to-end examples of using processing jobs to perform data pre-processing, feature engineering, and model evaluation steps. See `Learn More`_ at the bottom of this page for more in-depth information. - - -Data Pre-Processing and Model Evaluation with scikit-learn -========================================================== - -You can run a scikit-learn script to do data processing on SageMaker using the :class:`sagemaker.sklearn.processing.SKLearnProcessor` class. - -You first create an ``SKLearnProcessor``: - -.. code:: python - - from sagemaker.sklearn.processing import SKLearnProcessor - - sklearn_processor = SKLearnProcessor( - framework_version="0.20.0", - role="[Your SageMaker-compatible IAM role]", - instance_type="ml.m5.xlarge", - instance_count=1, - ) - -Then you can run a scikit-learn script ``preprocessing.py`` in a processing job. In this example, our script takes one input from S3 and one command-line argument, processes the data, then splits the data into two datasets for output. When the job is finished, we can retrieve the output from S3. - -.. code:: python - - from sagemaker.processing import ProcessingInput, ProcessingOutput - - sklearn_processor.run( - code="preprocessing.py", - inputs=[ - ProcessingInput(source="s3://your-bucket/path/to/your/data", destination="/opt/ml/processing/input"), - ], - outputs=[ - ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"), - ProcessingOutput(output_name="test_data", source="/opt/ml/processing/test"), - ], - arguments=["--train-test-split-ratio", "0.2"], - ) - - preprocessing_job_description = sklearn_processor.jobs[-1].describe() - -For an in-depth look, please see the `Scikit-learn Data Processing and Model Evaluation`_ example notebook. - -.. _Scikit-learn Data Processing and Model Evaluation: https://github.com/awslabs/amazon-sagemaker-examples/blob/master/sagemaker_processing/scikit_learn_data_processing_and_model_evaluation/scikit_learn_data_processing_and_model_evaluation.ipynb
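The example above notes that the output can be retrieved from S3 once the job finishes, but does not show how. The following sketch is an illustrative addition (assuming the ``describe()`` response follows the ``DescribeProcessingJob`` API shape) that looks up the output S3 URIs from the job description and downloads the ``train_data`` split to a hypothetical local path:

.. code:: python

    from sagemaker.s3 import S3Downloader

    # Map each declared output name to the S3 URI reported by the job description.
    outputs = preprocessing_job_description["ProcessingOutputConfig"]["Outputs"]
    output_uris = {o["OutputName"]: o["S3Output"]["S3Uri"] for o in outputs}

    # Download the "train_data" output declared in the run() call above.
    S3Downloader.download(output_uris["train_data"], local_path="./train_data")
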
Data Processing with Spark -============================================ -SageMaker provides two classes for customers to run Spark applications: :class:`sagemaker.spark.processing.PySparkProcessor` and :class:`sagemaker.spark.processing.SparkJarProcessor`. - - -PySparkProcessor ---------------------- - -You can use the :class:`sagemaker.spark.processing.PySparkProcessor` class to run PySpark scripts as processing jobs. - -This example shows how you can take an existing PySpark script and run a processing job with the :class:`sagemaker.spark.processing.PySparkProcessor` class and the pre-built SageMaker Spark container. - -First you need to create a :class:`PySparkProcessor` object: - -.. code:: python - - from sagemaker.processing import ProcessingInput - from sagemaker.spark.processing import PySparkProcessor - - spark_processor = PySparkProcessor( - base_job_name="sm-spark", - framework_version="2.4", - py_version="py37", - container_version="1", - role="[Your SageMaker-compatible IAM role]", - instance_count=2, - instance_type="ml.c5.xlarge", - max_runtime_in_seconds=1200, - image_uri="your-image-uri" - ) - -The ``framework_version`` is the Spark version on which your script will run. -``py_version`` and ``container_version`` are two new parameters you can specify in the constructor. They give you more flexibility to select the container version, to avoid backward incompatibilities and unnecessary dependency upgrades. - -If you just specify the ``framework_version``, SageMaker defaults to a Python version and the latest container version. To pin to an exact version of the SageMaker Spark container, you need to specify all three parameters: ``framework_version``, ``py_version`` and ``container_version``. - -You can also specify the ``image_uri``, which overrides all three parameters. - -Note that the ``command`` option is not supported on either :class:`PySparkProcessor` or :class:`SparkJarProcessor`. If you want to run the script on your own container, please use :class:`ScriptProcessor` instead. - -Then you can run your existing Spark script ``preprocess.py`` in a processing job. - -.. code:: python - - spark_processor.run( - submit_app="preprocess.py", - submit_py_files=["module.py", "lib.egg"], - submit_jars=["lib.jar", "lib2.jar"], - submit_files=["file.txt", "file2.csv"], - arguments=["s3_input_bucket", bucket, - "s3_input_key_prefix", input_prefix, - "s3_output_bucket", bucket, - "s3_output_key_prefix", input_preprocessed_prefix], - spark_event_logs_s3_uri="s3://your-bucket/your-prefix/store-spark-events" - ) - -``submit_app`` is the local relative path or S3 path of your Python script; in this case it is ``preprocess.py``. - -You can also specify any Python or jar dependencies or files that your script depends on with ``submit_py_files``, ``submit_jars`` and ``submit_files``. - -``submit_py_files`` is a list of .zip, .egg, or .py files to place on the PYTHONPATH for Python apps. ``submit_jars`` is a list of jars to include on the driver and executor classpaths. ``submit_files`` is a list of files to be placed in the working directory of each executor. File paths of these files in executors can be accessed via ``SparkFiles.get(fileName)``. - -Each item in the list can be either an S3 path or a local path, and if you have dependencies stored both in S3 and locally, you can put all of them in ``submit_py_files``, ``submit_jars``, and ``submit_files``. - -Just like using the ScriptProcessor, you can pass any arguments to your script by specifying the ``arguments`` parameter. In this example, four arguments are passed to the script to get and upload data from/to S3. - -To support the Spark history server, you can specify the ``spark_event_logs_s3_uri`` parameter when you invoke the ``run()`` method to continuously upload Spark events to S3. Note that performance is slightly impacted if you decide to publish Spark events to S3. - -Spark History Server ---------------------- - -While the script is running, or after it has run, you can view the Spark UI by running the history server locally or in the notebook. By default, the S3 URI you provided in the previous ``run()`` method is used as the Spark event source, but you can also specify a different URI. 
Finally, you can terminate the history server with ``terminate_history_server()``. Note that only one history server process will be running at a time. - -Here's an example of starting and terminating the history server: - -.. code:: python - - spark_processor.start_history_server() - spark_processor.terminate_history_server() - -You don't always have to run the script first to start the history server; you can also specify the S3 URI where the Spark event logs are stored. For example: - -.. code:: python - - spark_processor.start_history_server(spark_event_logs_s3_uri="s3://your-bucket/your-prefix/store-spark-events") - -To successfully run the history server, first you need to make sure ``docker`` is installed on your machine. Then you need to configure your AWS credentials with S3 read permission. Finally, you need to either invoke the ``run()`` method with ``spark_event_logs_s3_uri`` first, or specify ``spark_event_logs_s3_uri`` in the ``start_history_server()`` method; otherwise, it will fail. - -SparkJarProcessor ---------------------- - -Suppose that you have the jar file "preprocessing.jar" stored in the current directory, and the Java class is ``com.path.to.your.class.PreProcessing``. -Here's an example of using :class:`SparkJarProcessor`. - -.. code:: python - - spark = SparkJarProcessor( - base_job_name="sm-spark-java", - image_uri=beta_image_uri, - role=role, - instance_count=2, - instance_type="ml.c5.xlarge", - max_runtime_in_seconds=1200, - ) - - spark.run( - submit_app="preprocessing.jar", - submit_class="com.amazonaws.sagemaker.spark.test.HelloJavaSparkApp", - arguments=["--input", input_s3_uri, "--output", output_s3_uri] - ) - -:class:`SparkJarProcessor` is very similar to :class:`PySparkProcessor`, except that the ``run()`` method takes only a jar file path, configured by the ``submit_app`` parameter, and a ``submit_class`` parameter, which is equivalent to the "--class" option of the "spark-submit" command. - -Configuration Override ---------------------- - -Overriding Spark configuration is crucial for a number of tasks, such as tuning your Spark application or configuring the Hive metastore. Using our Python SDK, you can easily override Spark/Hive/Hadoop configuration. - -An example usage would be overriding Spark executor memory/cores as demonstrated in the following code snippet: - -.. code:: python - - spark_processor = PySparkProcessor( - base_job_name="sm-spark", - image_uri=beta_image_uri, - role=role, - instance_count=2, - instance_type="ml.c5.xlarge", - max_runtime_in_seconds=1200, - ) - - configuration = [{ - "Classification": "spark-defaults", - "Properties": {"spark.executor.memory": "2g", "spark.executor.cores": "1"}, - }] - - spark_processor.run( - submit_app="./code/preprocess.py", - arguments=["s3_input_bucket", bucket, - "s3_input_key_prefix", input_prefix_abalone, - "s3_output_bucket", bucket, - "s3_output_key_prefix", input_preprocessed_prefix_abalone], - configuration=configuration, - logs=False - ) - -For an in-depth look at how to write your configuration, please see the `Amazon EMR Configuring Applications`_ document. - -.. _Amazon EMR Configuring Applications: https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-configure-apps.html - -For an in-depth look, please see the `Feature Transformation with Spark`_ example notebook. - -.. 
_Feature Transformation with Spark: https://github.com/awslabs/amazon-sagemaker-examples/blob/master/sagemaker_processing/feature_transformation_with_sagemaker_processing/feature_transformation_with_sagemaker_processing.ipynb - - -Learn More -========== - -Processing class documentation ------------------------------- - -- :class:`sagemaker.processing.Processor` -- :class:`sagemaker.processing.ScriptProcessor` -- :class:`sagemaker.sklearn.processing.SKLearnProcessor` -- :class:`sagemaker.spark.processing.PySparkProcessor` -- :class:`sagemaker.spark.processing.SparkJarProcessor` -- :class:`sagemaker.processing.ProcessingInput` -- :class:`sagemaker.processing.ProcessingOutput` -- :class:`sagemaker.processing.ProcessingJob` - - -Further documentation ---------------------- - -- `Processing class documentation `_ -- `AWS Documentation `_ -- `AWS Notebook examples `_ -- `Processing API documentation `_ -- `Processing container specification `_ diff --git a/doc/amazon_sagemaker_processing_image1.png b/doc/amazon_sagemaker_processing_image1.png deleted file mode 100644 index 1cd4ab5e28..0000000000 Binary files a/doc/amazon_sagemaker_processing_image1.png and /dev/null differ diff --git a/doc/api/index.rst b/doc/api/index.rst deleted file mode 100644 index 46dcd5e53d..0000000000 --- a/doc/api/index.rst +++ /dev/null @@ -1,14 +0,0 @@ -#### -APIs -#### - -The SageMaker Python SDK consists of a variety classes for preparing data, training, inference and general utility: - -.. toctree:: - :maxdepth: 2 - - prep_data/feature_store - training/index - training/distributed - inference/index - utility/index diff --git a/doc/api/inference/async_inference.rst b/doc/api/inference/async_inference.rst deleted file mode 100644 index 493cab40eb..0000000000 --- a/doc/api/inference/async_inference.rst +++ /dev/null @@ -1,19 +0,0 @@ -Async Inference ------------------ - -This module contains classes related to Amazon Sagemaker Async Inference - -.. automodule:: sagemaker.async_inference.async_inference_config - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: sagemaker.async_inference.async_inference_response - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: sagemaker.async_inference.waiter_config - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/inference/deserializers.rst b/doc/api/inference/deserializers.rst deleted file mode 100644 index f19ed91e8d..0000000000 --- a/doc/api/inference/deserializers.rst +++ /dev/null @@ -1,7 +0,0 @@ -Deserializers -------------- - -.. automodule:: sagemaker.deserializers - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/inference/index.rst b/doc/api/inference/index.rst deleted file mode 100644 index 1efdabea54..0000000000 --- a/doc/api/inference/index.rst +++ /dev/null @@ -1,9 +0,0 @@ -############## -Inference APIs -############## - -.. toctree:: - :maxdepth: 1 - :glob: - - * diff --git a/doc/api/inference/model.rst b/doc/api/inference/model.rst deleted file mode 100644 index d6cb0b5003..0000000000 --- a/doc/api/inference/model.rst +++ /dev/null @@ -1,18 +0,0 @@ -Model ------ - -.. autoclass:: sagemaker.model.Model - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: sagemaker.model.FrameworkModel - :members: - :undoc-members: - :show-inheritance: - -.. 
autoclass:: sagemaker.model.ModelPackage - :members: - :undoc-members: - :show-inheritance: - diff --git a/doc/api/inference/model_monitor.rst b/doc/api/inference/model_monitor.rst deleted file mode 100644 index 946e376502..0000000000 --- a/doc/api/inference/model_monitor.rst +++ /dev/null @@ -1,43 +0,0 @@ -Model Monitor -------------- - - -.. automodule:: sagemaker.model_monitor.model_monitoring - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: sagemaker.model_monitor.monitoring_files - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: sagemaker.model_monitor.dataset_format - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: sagemaker.model_monitor.data_capture_config - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: sagemaker.model_monitor.cron_expression_generator - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: sagemaker.model_monitor.clarify_model_monitoring - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: sagemaker.model_metrics - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: sagemaker.drift_check_baselines - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/inference/multi_data_model.rst b/doc/api/inference/multi_data_model.rst deleted file mode 100644 index fec64a172e..0000000000 --- a/doc/api/inference/multi_data_model.rst +++ /dev/null @@ -1,7 +0,0 @@ -MultiDataModel --------------- - -.. automodule:: sagemaker.multidatamodel - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/inference/pipeline.rst b/doc/api/inference/pipeline.rst deleted file mode 100644 index ea4bdb5937..0000000000 --- a/doc/api/inference/pipeline.rst +++ /dev/null @@ -1,7 +0,0 @@ -PipelineModel -------------- - -.. autoclass:: sagemaker.pipeline.PipelineModel - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/inference/predictor_async.rst b/doc/api/inference/predictor_async.rst deleted file mode 100644 index e5b83e2324..0000000000 --- a/doc/api/inference/predictor_async.rst +++ /dev/null @@ -1,9 +0,0 @@ -AsyncPredictor --------------------- - -Make async predictions against SageMaker endpoints with Python objects - -.. autoclass:: sagemaker.predictor_async.AsyncPredictor - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/inference/predictors.rst b/doc/api/inference/predictors.rst deleted file mode 100644 index 6a9243f329..0000000000 --- a/doc/api/inference/predictors.rst +++ /dev/null @@ -1,9 +0,0 @@ -Predictors --------------------- - -Make real-time predictions against SageMaker endpoints with Python objects - -.. autoclass:: sagemaker.predictor.Predictor - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/inference/serializers.rst b/doc/api/inference/serializers.rst deleted file mode 100644 index 6bd22ca4dc..0000000000 --- a/doc/api/inference/serializers.rst +++ /dev/null @@ -1,7 +0,0 @@ -Serializers ------------ - -.. automodule:: sagemaker.serializers - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/inference/serverless.rst b/doc/api/inference/serverless.rst deleted file mode 100644 index d338efd7be..0000000000 --- a/doc/api/inference/serverless.rst +++ /dev/null @@ -1,9 +0,0 @@ -Serverless Inference ---------------------- - -This module contains classes related to Amazon Sagemaker Serverless Inference - -.. 
automodule:: sagemaker.serverless.serverless_inference_config - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/inference/transformer.rst b/doc/api/inference/transformer.rst deleted file mode 100644 index 1c49ac9945..0000000000 --- a/doc/api/inference/transformer.rst +++ /dev/null @@ -1,7 +0,0 @@ -Transformer ------------ - -.. autoclass:: sagemaker.transformer.Transformer - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/prep_data/feature_store.rst b/doc/api/prep_data/feature_store.rst deleted file mode 100644 index 1980a0b069..0000000000 --- a/doc/api/prep_data/feature_store.rst +++ /dev/null @@ -1,74 +0,0 @@ -Feature Store APIs ------------------- - -Feature group -************* - -.. autoclass:: sagemaker.feature_store.feature_group.FeatureGroup - :members: - :exclude-members: load_feature_definitions - :show-inheritance: - -.. autoclass:: sagemaker.feature_store.feature_group.AthenaQuery - :members: - :show-inheritance: - -.. autoclass:: sagemaker.feature_store.feature_group.IngestionManagerPandas - :members: - :show-inheritance: - - -Feature definition -****************** - -.. autoclass:: sagemaker.feature_store.feature_definition.FeatureDefinition - :members: - :show-inheritance: - -.. autoclass:: sagemaker.feature_store.feature_definition.FractionalFeatureDefinition - :members: - :show-inheritance: - -.. autoclass:: sagemaker.feature_store.feature_definition.IntegralFeatureDefinition - :members: - :show-inheritance: - -.. autoclass:: sagemaker.feature_store.feature_definition.StringFeatureDefinition - :members: - :show-inheritance: - -.. autoclass:: sagemaker.feature_store.feature_definition.FeatureTypeEnum - :members: - :show-inheritance: - - -Inputs -****** - -.. autoclass:: sagemaker.feature_store.inputs.Config - :members: - :show-inheritance: - -.. autoclass:: sagemaker.feature_store.inputs.DataCatalogConfig - :members: - :show-inheritance: - -.. autoclass:: sagemaker.feature_store.inputs.OfflineStoreConfig - :members: - :show-inheritance: - -.. autoclass:: sagemaker.feature_store.inputs.OnlineStoreConfig - :members: - :show-inheritance: - -.. autoclass:: sagemaker.feature_store.inputs.OnlineStoreSecurityConfig - :members: - :show-inheritance: - -.. autoclass:: sagemaker.feature_store.inputs.S3StorageConfig - :members: - :show-inheritance: - -.. autoclass:: sagemaker.feature_store.inputs.FeatureValue - :members: - :show-inheritance: diff --git a/doc/api/training/algorithm.rst b/doc/api/training/algorithm.rst deleted file mode 100644 index 778738a192..0000000000 --- a/doc/api/training/algorithm.rst +++ /dev/null @@ -1,7 +0,0 @@ -Algorithm Estimator -------------------- - -.. automodule:: sagemaker.algorithm - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/training/analytics.rst b/doc/api/training/analytics.rst deleted file mode 100644 index e60d6f21f8..0000000000 --- a/doc/api/training/analytics.rst +++ /dev/null @@ -1,22 +0,0 @@ -Analytics ---------- - -.. autoclass:: sagemaker.analytics.AnalyticsMetricsBase - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: sagemaker.analytics.HyperparameterTuningJobAnalytics - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: sagemaker.analytics.TrainingJobAnalytics - :members: - :undoc-members: - :show-inheritance: - -.. 
autoclass:: sagemaker.analytics.ExperimentAnalytics - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/training/automl.rst b/doc/api/training/automl.rst deleted file mode 100644 index 1d7907dc2b..0000000000 --- a/doc/api/training/automl.rst +++ /dev/null @@ -1,12 +0,0 @@ -AutoML ------- - -.. automodule:: sagemaker.automl.automl - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: sagemaker.automl.candidate_estimator - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/training/debugger.rst b/doc/api/training/debugger.rst deleted file mode 100644 index 0fd1f1d3cd..0000000000 --- a/doc/api/training/debugger.rst +++ /dev/null @@ -1,79 +0,0 @@ -Debugger --------- - -Amazon SageMaker Debugger provides full visibility -into training jobs of state-of-the-art machine learning models. -This SageMaker Debugger module provides high-level methods -to set up Debugger configurations to -monitor, profile, and debug your training job. -Configure the Debugger-specific parameters when constructing -a SageMaker estimator to gain visibility and insights -into your training job. - -.. currentmodule:: sagemaker.debugger - -.. autoclass:: get_rule_container_image_uri - :show-inheritance: - -.. autoclass:: get_default_profiler_rule - :show-inheritance: - -.. class:: sagemaker.debugger.rule_configs - - A helper module to configure the SageMaker Debugger built-in rules with - the :class:`~sagemaker.debugger.Rule` classmethods and - and the :class:`~sagemaker.debugger.ProfilerRule` classmethods. - - For a full list of built-in rules, see - `List of Debugger Built-in Rules - `_. - - This module is imported from the Debugger client library for rule configuration. - For more information, see - `Amazon SageMaker Debugger RulesConfig - `_. - -.. autoclass:: RuleBase - :show-inheritance: - -.. autoclass:: Rule - :show-inheritance: - :inherited-members: - -.. autoclass:: ProfilerRule - :show-inheritance: - :inherited-members: - -.. autoclass:: CollectionConfig - :show-inheritance: - -.. autoclass:: DebuggerHookConfig - :show-inheritance: - -.. autoclass:: TensorBoardOutputConfig - :show-inheritance: - -.. autoclass:: ProfilerConfig - :show-inheritance: - -.. autoclass:: FrameworkProfile - :show-inheritance: - -.. autoclass:: DetailedProfilingConfig - :show-inheritance: - -.. autoclass:: DataloaderProfilingConfig - :show-inheritance: - -.. autoclass:: PythonProfilingConfig - :show-inheritance: - -.. autoclass:: PythonProfiler - :show-inheritance: - -.. autoclass:: cProfileTimer - :show-inheritance: - -.. automodule:: sagemaker.debugger.metrics_config - :members: StepRange, TimeRange - :undoc-members: diff --git a/doc/api/training/distributed.rst b/doc/api/training/distributed.rst deleted file mode 100644 index 7376978873..0000000000 --- a/doc/api/training/distributed.rst +++ /dev/null @@ -1,11 +0,0 @@ -Distributed Training APIs -------------------------- -SageMaker distributed training libraries offer both data parallel and model parallel training strategies. -They combine software and hardware technologies to improve inter-GPU and inter-node communications. -They extend SageMaker’s training capabilities with built-in options that require only small code changes to your training scripts. - -.. 
toctree:: - :maxdepth: 3 - - smd_data_parallel - smd_model_parallel diff --git a/doc/api/training/estimators.rst b/doc/api/training/estimators.rst deleted file mode 100644 index 93e094f502..0000000000 --- a/doc/api/training/estimators.rst +++ /dev/null @@ -1,22 +0,0 @@ -Estimators ----------- - -A high level interface for SageMaker training - -.. autoclass:: sagemaker.estimator.EstimatorBase - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - -.. autoclass:: sagemaker.estimator.Estimator - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - -.. autoclass:: sagemaker.estimator.Framework - :members: - :undoc-members: - :show-inheritance: - :inherited-members: diff --git a/doc/api/training/index.rst b/doc/api/training/index.rst deleted file mode 100644 index 6446b59770..0000000000 --- a/doc/api/training/index.rst +++ /dev/null @@ -1,15 +0,0 @@ -############# -Training APIs -############# - -.. toctree:: - :maxdepth: 4 - - analytics - automl - debugger - estimators - algorithm - tuner - parameter - processing diff --git a/doc/api/training/parameter.rst b/doc/api/training/parameter.rst deleted file mode 100644 index 1893a8e889..0000000000 --- a/doc/api/training/parameter.rst +++ /dev/null @@ -1,7 +0,0 @@ -Parameters ----------- - -.. automodule:: sagemaker.parameter - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/training/processing.rst b/doc/api/training/processing.rst deleted file mode 100644 index 7d572aace2..0000000000 --- a/doc/api/training/processing.rst +++ /dev/null @@ -1,17 +0,0 @@ -Processing ----------- - -.. automodule:: sagemaker.processing - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: sagemaker.spark.processing - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: sagemaker.clarify - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/training/sdp_versions/latest.rst b/doc/api/training/sdp_versions/latest.rst deleted file mode 100644 index 4b770cd5e1..0000000000 --- a/doc/api/training/sdp_versions/latest.rst +++ /dev/null @@ -1,9 +0,0 @@ - -Version 1.2.x (Latest) -====================== - -.. toctree:: - :maxdepth: 1 - - latest/smd_data_parallel_pytorch.rst - latest/smd_data_parallel_tensorflow.rst diff --git a/doc/api/training/sdp_versions/latest/smd_data_parallel_pytorch.rst b/doc/api/training/sdp_versions/latest/smd_data_parallel_pytorch.rst deleted file mode 100644 index 85c9594e73..0000000000 --- a/doc/api/training/sdp_versions/latest/smd_data_parallel_pytorch.rst +++ /dev/null @@ -1,550 +0,0 @@ -############################################################## -PyTorch Guide to SageMaker's distributed data parallel library -############################################################## - -Use this guide to learn about the SageMaker distributed -data parallel library API for PyTorch. - -.. contents:: Topics - :depth: 3 - :local: - -.. _pytorch-sdp-modify: - -Modify a PyTorch training script to use SageMaker data parallel -====================================================================== - -The following steps show you how to convert a PyTorch training script to -utilize SageMaker's distributed data parallel library. - -The distributed data parallel library APIs are designed to be close to PyTorch Distributed Data -Parallel (DDP) APIs. -See `SageMaker distributed data parallel PyTorch examples `__ for additional details on how to implement the data parallel library -API offered for PyTorch. 
- - -- First import the distributed data parallel library’s PyTorch client and initialize it. You also import - the distributed data parallel library module for distributed training. - - .. code:: python - - import smdistributed.dataparallel.torch.distributed as dist - - from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as DDP - - dist.init_process_group() - - -- Pin each GPU to a single distributed data parallel library process with ``local_rank`` - this - refers to the relative rank of the process within a given node. - ``smdistributed.dataparallel.torch.get_local_rank()`` API provides - you the local rank of the device. The leader node will be rank 0, and - the worker nodes will be rank 1, 2, 3, and so on. This is invoked in - the next code block as ``dist.get_local_rank()``. - - .. code:: python - - torch.cuda.set_device(dist.get_local_rank()) - - -- Then wrap the PyTorch model with the distributed data parallel library’s DDP. - - .. code:: python - - model = ... - # Wrap model with SageMaker's DistributedDataParallel - model = DDP(model) - - -- Modify the ``torch.utils.data.distributed.DistributedSampler`` to - include the cluster’s information. Set ``num_replicas`` to the - total number of GPUs participating in training across all the nodes - in the cluster. This is called ``world_size``. You can get - ``world_size`` with - ``smdistributed.dataparallel.torch.get_world_size()`` API. This is - invoked in the following code as ``dist.get_world_size()``. Also - supply the node rank using - ``smdistributed.dataparallel.torch.get_rank()``. This is invoked as - ``dist.get_rank()``. - - .. code:: python - - train_sampler = DistributedSampler(train_dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank()) - - -- Finally, modify your script to save checkpoints only on the leader - node. The leader node will have a synchronized model. This also - avoids worker nodes overwriting the checkpoints and possibly - corrupting the checkpoints. - -.. code:: python - - if dist.get_rank() == 0: - torch.save(...) - - -All put together, the following is an example PyTorch training script -you will have for distributed training with the distributed data parallel library: - -.. code:: python - - # Import distributed data parallel library PyTorch API - import smdistributed.dataparallel.torch.distributed as dist - - # Import distributed data parallel library PyTorch DDP - from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as DDP - - # Initialize distributed data parallel library - dist.init_process_group() - - class Net(nn.Module): -     ... -     # Define model - - def train(...): -     ... -     # Model training - - def test(...): -     ... -     # Model evaluation - - def main(): - -     # Scale batch size by world size -     batch_size //= dist.get_world_size() -     batch_size = max(batch_size, 1) - -     # Prepare dataset -     train_dataset = torchvision.datasets.MNIST(...) - -     # Set num_replicas and rank in DistributedSampler -     train_sampler = torch.utils.data.distributed.DistributedSampler( -             train_dataset, -             num_replicas=dist.get_world_size(), -             rank=dist.get_rank()) - -     train_loader = torch.utils.data.DataLoader(..) - -     # Wrap the PyTorch model with distributed data parallel library’s DDP -     model = DDP(Net().to(device)) - -     # Pin each GPU to a single distributed data parallel library process. 
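-     # Note (added for clarity; not in the original snippet): `local_rank` and
-     # `rank` below are assumed to come from the library's rank helpers.
-     local_rank = dist.get_local_rank()
-     rank = dist.get_rank()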
-     torch.cuda.set_device(local_rank) -     model.cuda(local_rank) - -     # Train -     optimizer = optim.Adadelta(...) -     scheduler = StepLR(...) -     for epoch in range(1, args.epochs + 1): -         train(...) -         if rank == 0: -             test(...) -         scheduler.step() - -     # Save model on master node. -     if dist.get_rank() == 0: -         torch.save(...) - - if __name__ == '__main__': -     main() - - -.. _pytorch-sdp-api: - -PyTorch API -=========== - -.. class:: smdistributed.dataparallel.torch.parallel.DistributedDataParallel(module, device_ids=None, output_device=None, broadcast_buffers=True, process_group=None, bucket_cap_mb=None) - - ``smdistributed.dataparallel``'s implementation of distributed data - parallelism for PyTorch. In most cases, wrapping your PyTorch Module - with ``smdistributed.dataparallel``'s ``DistributedDataParallel`` (DDP) is - all you need to do to use ``smdistributed.dataparallel``. - - Creation of this DDP class requires ``smdistributed.dataparallel`` - already initialized - with ``smdistributed.dataparallel.torch.distributed.init_process_group()``. - - This container parallelizes the application of the given module by - splitting the input across the specified devices by chunking in the - batch dimension. The module is replicated on each machine and each - device, and each such replica handles a portion of the input. During the - backwards pass, gradients from each node are averaged. - - The batch size should be larger than the number of GPUs used locally. - ​ - Example usage - of ``smdistributed.dataparallel.torch.parallel.DistributedDataParallel``: - - .. code:: python - - import torch - import smdistributed.dataparallel.torch.distributed as dist - from smdistributed.dataparallel.torch.parallel import DistributedDataParallel as DDP - - dist.init_process_group() - - # Pin GPU to be used to process local rank (one GPU per process) - torch.cuda.set_device(dist.get_local_rank()) - - # Build model and optimizer - model = ... - optimizer = torch.optim.SGD(model.parameters(), -                             lr=1e-3 * dist.get_world_size()) - # Wrap model with smdistributed.dataparallel's DistributedDataParallel - model = DDP(model) - - **Parameters:** - - - ``module (torch.nn.Module)(required):`` PyTorch NN Module to be - parallelized - - ``device_ids (list[int])(optional):`` CUDA devices. This should only - be provided when the input module resides on a single CUDA device. - For single-device modules, - the ``ith module replica is placed on device_ids[i]``. For - multi-device modules and CPU modules, device_ids must be None or an - empty list, and input data for the forward pass must be placed on the - correct device. Defaults to ``None``. - - ``output_device (int)(optional):`` Device location of output for - single-device CUDA modules. For multi-device modules and CPU modules, - it must be None, and the module itself dictates the output location. - (default: device_ids[0] for single-device modules).  Defaults - to ``None``. - - ``broadcast_buffers (bool)(optional):`` Flag that enables syncing - (broadcasting) buffers of the module at beginning of the forward - function. ``smdistributed.dataparallel`` does not support broadcast - buffer yet. Please set this to ``False``. - - ``process_group(smdistributed.dataparallel.torch.distributed.group)(optional):`` Process - group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with torch.distributed only. 
Only - supported value is - ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` Defaults - to ``None.`` - - ``bucket_cap_mb (int)(optional):`` DistributedDataParallel will - bucket parameters into multiple buckets so that gradient reduction of - each bucket can potentially overlap with backward - computation. ``bucket_cap_mb`` controls the bucket size in - MegaBytes (MB) (default: 25). - - .. note:: - - This module assumes all parameters are registered in the model by the - time it is created. No parameters should be added nor removed later. - - .. note:: - - This module assumes all parameters are registered in the model of - each distributed processes are in the same order. The module itself - will conduct gradient all-reduction following the reverse order of - the registered parameters of the model. In other words, it is users’ - responsibility to ensure that each distributed process has the exact - same model and thus the exact same parameter registration order. - - .. note:: - - You should never change the set of your model’s parameters after - wrapping up your model with DistributedDataParallel. In other words, - when wrapping up your model with DistributedDataParallel, the - constructor of DistributedDataParallel will register the additional - gradient reduction functions on all the parameters of the model - itself at the time of construction. If you change the model’s - parameters after the DistributedDataParallel construction, this is - not supported and unexpected behaviors can happen, since some - parameters’ gradient reduction functions might not get called. - - .. method:: no_sync() - - ``smdistributed.dataparallel`` supports the `PyTorch DDP no_sync() `_ - context manager. It enables gradient accumulation by skipping AllReduce - during training iterations inside the context. - - .. note:: - - The ``no_sync()`` context manager is available from smdistributed-dataparallel v1.2.2. - To find the release note, see :ref:`sdp_1.2.2_release_note`. - - **Example:** - - .. code:: python - - # Gradients are accumulated while inside no_sync context - with model.no_sync(): - ... - loss.backward() - - # First iteration upon exiting context - # Incoming gradients are added to the accumulated gradients and then synchronized via AllReduce - ... - loss.backward() - - # Update weights and reset gradients to zero after accumulation is finished - optimizer.step() - optimizer.zero_grad() - - -.. function:: smdistributed.dataparallel.torch.distributed.is_available() - - Check if script started as a distributed job. For local runs user can - check that is_available returns False and run the training script - without calls to ``smdistributed.dataparallel``. - - **Inputs:** - - - ``None`` - - **Returns:** - - - ``True`` if started as a distributed job, ``False`` otherwise - - -.. function:: smdistributed.dataparallel.torch.distributed.init_process_group(*args, **kwargs) - - Initialize ``smdistributed.dataparallel``. Must be called at the - beginning of the training script, before calling any other methods. - ​ - Process group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with ``torch.distributed`` only. Only - supported value is - ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - ​ - After this - call, ``smdistributed.dataparallel.torch.distributed.is_initialized()`` will - return ``True``. - ​ - - **Inputs:** - - - ``None`` - - **Returns:** - - - ``None`` - - -.. 
function:: smdistributed.dataparallel.torch.distributed.is_initialized() - - Checks if the default process group has been initialized. - - **Inputs:** - - - ``None`` - - **Returns:** - - - ``True`` if initialized, else ``False``. - - -.. function:: smdistributed.dataparallel.torch.distributed.get_world_size(group=smdistributed.dataparallel.torch.distributed.group.WORLD) - - The total number of GPUs across all the nodes in the cluster. For - example, in a 8 node cluster with 8 GPU each, size will be equal to 64. - - **Inputs:** - - - ``group (smdistributed.dataparallel.torch.distributed.group) (optional):`` Process - group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with torch.distributed only. Only - supported value is - ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - **Returns:** - - - An integer scalar containing the total number of GPUs in the training - job, across all nodes in the cluster. - - -.. function:: smdistributed.dataparallel.torch.distributed.get_rank(group=smdistributed.dataparallel.torch.distributed.group.WORLD) - - The rank of the node in the cluster. The rank ranges from 0 to number of - nodes - 1. This is similar to MPI's World Rank. - - - **Inputs:** - - - ``group (smdistributed.dataparallel.torch.distributed.group) (optional):`` Process - group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with torch.distributed only. Only - supported value is - ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - **Returns:** - - - An integer scalar containing the rank of the worker node. - - -.. function:: smdistributed.dataparallel.torch.distributed.get_local_rank() - - Local rank refers to the relative rank of - the ``smdistributed.dataparallel`` process within the node the current - process is running on. For example, if a node contains 8 GPUs, it has - 8 ``smdistributed.dataparallel`` processes. Each process has - a ``local_rank`` ranging from 0 to 7. - - **Inputs:** - - - ``None`` - - **Returns:** - - - An integer scalar containing the rank of the GPU and - its ``smdistributed.dataparallel`` process. - - -.. function:: smdistributed.dataparallel.torch.distributed.all_reduce(tensor, op=smdistributed.dataparallel.torch.distributed.ReduceOp.SUM, group=smdistributed.dataparallel.torch.distributed.group.WORLD, async_op=False) - - Performs an all-reduce operation on a tensor (torch.tensor) across - all ``smdistributed.dataparallel`` workers - - ``smdistributed.dataparallel`` AllReduce API can be used for all - reducing gradient tensors or any other tensors.  By - default, ``smdistributed.dataparallel`` AllReduce reduces the tensor - data across all ``smdistributed.dataparallel`` workers in such a way - that all get the final result. - - After the call ``tensor`` is going to be bitwise identical in all - processes. - - **Inputs:** - - - ``tensor (torch.tensor) (required):`` Input and output of the collective. The function operates in-place. - - - ``op (smdistributed.dataparallel.torch.distributed.ReduceOp) (optional)``: The reduction operation to combine tensors across different ranks.  Defaults to ``SUM`` if None is given. - - * Supported ops: ``AVERAGE``, ``SUM``, ``MIN``, ``MAX`` - - - ``group (smdistributed.dataparallel.torch.distributed.group) (optional):`` Process group is not supported in ``smdistributed.dataparallel``. This parameter exists for API parity with torch.distributed only. 
- - * Only supported value is ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - - ``async_op (bool) (optional):`` Whether this op should be an async op. Defaults to ``False``. - - **Returns:** - - - Async op work handle, if async_op is set to True. ``None``, - otherwise. - - .. rubric:: Notes - - ``smdistributed.dataparallel.torch.distributed.allreduce``, in most - cases, is ~2X slower than all-reducing - with ``smdistributed.dataparallel.torch.parallel.distributed.DistributedDataParallel`` and - hence, it is not recommended to be used for performing gradient - reduction during the training - process. ``smdistributed.dataparallel.torch.distributed.allreduce`` internally - uses NCCL AllReduce with ``ncclSum`` as the reduction operation. - - -.. function:: smdistributed.dataparallel.torch.distributed.broadcast(tensor, src=0, group=smdistributed.dataparallel.torch.distributed.group.WORLD, async_op=False) - - Broadcasts the tensor (torch.tensor) to the whole group. - - ``tensor`` must have the same number of elements as GPUs in the - cluster. - - **Inputs:** - - - ``tensor (torch.tensor)(required)`` - - - ``src (int)(optional)`` - - - ``group (smdistributed.dataparallel.torch.distributed.group)(optional):`` Process group is not supported in ``smdistributed.dataparallel``. This parameter exists for API parity with ``torch.distributed`` only. - - * Only supported value is ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - - ``async_op (bool)(optional):`` Whether this op should be an async op. Defaults to ``False``. - - **Returns:** - - - Async op work handle, if async_op is set to True. ``None``, otherwise. - - -.. function:: smdistributed.dataparallel.torch.distributed.all_gather(tensor_list, tensor, group=smdistributed.dataparallel.torch.distributed.group.WORLD, async_op=False) - - Gathers tensors from the whole group in a list. - - - **Inputs:** - - - ``tensor_list (list[torch.tensor])(required):`` Output list. It - should contain correctly-sized tensors to be used for output of the - collective. - - ``tensor (torch.tensor)(required):`` Tensor to be broadcast from - current process. - - ``group (smdistributed.dataparallel.torch.distributed.group)(optional):`` Process - group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with torch.distributed only. Only - supported value is - ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - ``async_op (bool)(optional):`` Whether this op should be an async op. - Defaults to ``False``. - - **Returns:** - - - Async op work handle, if async_op is set to True. ``None``, - otherwise. - - -.. function:: smdistributed.dataparallel.torch.distributed.all_to_all_single(output_t, input_t, output_split_sizes=None, input_split_sizes=None, group=group.WORLD, async_op=False) - - Each process scatters input tensor to all processes in a group and return gathered tensor in output. - - **Inputs:** - - - output_t - - input_t - - output_split_sizes - - input_split_sizes - - ``group (smdistributed.dataparallel.torch.distributed.group)(optional):`` Process - group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with torch.distributed only. Only - supported value is - ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - ``async_op (bool)(optional):`` Whether this op should be an async op. - Defaults to ``False``. - - **Returns:** - - - Async op work handle, if async_op is set to True. ``None``, - otherwise. - - -.. 
function:: smdistributed.dataparallel.torch.distributed.barrier(group=smdistributed.dataparallel.torch.distributed.group.WORLD, async_op=False) - - Synchronizes all ``smdistributed.dataparallel`` processes. - - **Inputs:** - - - tensor (torch.tensor)(required): Data to be sent if src is the rank of current process, and tensor to be used to save received data otherwise. - - - src (int)(optional): Source rank. - - - ``group (smdistributed.dataparallel.torch.distributed.group)(optional):`` Process - group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with torch.distributed only. - - * Only supported value is ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - - ``async_op (bool)(optional):`` Whether this op should be an async op. - Defaults to ``False``. - - **Returns:** - - - Async op work handle, if async_op is set to True. ``None``, - otherwise. - - -.. class:: smdistributed.dataparallel.torch.distributed.ReduceOp - - An enum-like class for supported reduction operations - in ``smdistributed.dataparallel``. - - The values of this class can be accessed as attributes, for - example, ``ReduceOp.SUM``. They are used in specifying strategies for - reduction collectives such as -  ``smdistributed.dataparallel.torch.distributed.all_reduce(...)``. - - - ``AVERAGE`` - - ``SUM`` - - ``MIN`` - - ``MAX`` diff --git a/doc/api/training/sdp_versions/latest/smd_data_parallel_tensorflow.rst b/doc/api/training/sdp_versions/latest/smd_data_parallel_tensorflow.rst deleted file mode 100644 index c615ad67aa..0000000000 --- a/doc/api/training/sdp_versions/latest/smd_data_parallel_tensorflow.rst +++ /dev/null @@ -1,534 +0,0 @@ -################################################################# -TensorFlow Guide to SageMaker's distributed data parallel library -################################################################# - -.. admonition:: Contents - - - :ref:`tensorflow-sdp-modify` - - :ref:`tensorflow-sdp-api` - -.. _tensorflow-sdp-modify: - -Modify a TensorFlow 2.x training script to use SageMaker data parallel -====================================================================== - -The following steps show you how to convert a TensorFlow 2.x training -script to utilize the distributed data parallel library. - -The distributed data parallel library APIs are designed to be close to Horovod APIs. -See `SageMaker distributed data parallel TensorFlow examples -`__ -for additional details on how to implement the data parallel library. - -- First import the distributed data parallel library’s TensorFlow client and initialize it: - - .. code:: python - - import smdistributed.dataparallel.tensorflow as sdp - sdp.init() - - -- Pin each GPU to a single smdistributed.dataparallel process - with ``local_rank`` - this refers to the relative rank of the - process within a given node. ``sdp.tensorflow.local_rank()`` API - provides you the local rank of the device. The leader node will be - rank 0, and the worker nodes will be rank 1, 2, 3, and so on. This is - invoked in the next code block as ``sdp.local_rank()``. - ``set_memory_growth`` is not directly related to SMD, but must be set - for distributed training with TensorFlow. - - .. code:: python - - gpus = tf.config.experimental.list_physical_devices('GPU') - for gpu in gpus: -     tf.config.experimental.set_memory_growth(gpu, True) - if gpus: -     tf.config.experimental.set_visible_devices(gpus[sdp.local_rank()], 'GPU') - - -- Scale the learning rate by the number of workers. 
- ``sdp.tensorflow.size()`` API provides you number of workers in the - cluster. This is invoked in the next code block as ``sdp.size()``. - - .. code:: python - - learning_rate = learning_rate * sdp.size() - - -- Use the library’s ``DistributedGradientTape`` to optimize AllReduce - operations during training. This wraps ``tf.GradientTape``. - - .. code:: python - - with tf.GradientTape() as tape: -       output = model(input) -       loss_value = loss(label, output) - - # Wrap tf.GradientTape with the library's DistributedGradientTape - tape = sdp.DistributedGradientTape(tape) - - -- Broadcast initial model variables from the leader node (rank 0) to - all the worker nodes (ranks 1 through n). This is needed to ensure a - consistent initialization across all the worker ranks. For this, you - use ``sdp.tensorflow.broadcast_variables`` API after the - model and optimizer variables are initialized. This is invoked in the - next code block as ``sdp.broadcast_variables()``. - - .. code:: python - - sdp.broadcast_variables(model.variables, root_rank=0) - sdp.broadcast_variables(opt.variables(), root_rank=0) - - -- Finally, modify your script to save checkpoints only on the leader - node. The leader node will have a synchronized model. This also - avoids worker nodes overwriting the checkpoints and possibly - corrupting the checkpoints. - - .. code:: python - - if sdp.rank() == 0: -     checkpoint.save(checkpoint_dir) - - -All put together, the following is an example TensorFlow2 training -script you will have for distributed training with the library. - -.. code:: python - - import tensorflow as tf - - # Import the library's TF API - import smdistributed.dataparallel.tensorflow as sdp - - # Initialize the library - sdp.init() - - gpus = tf.config.experimental.list_physical_devices('GPU') - for gpu in gpus: -     tf.config.experimental.set_memory_growth(gpu, True) - if gpus: -     # Pin GPUs to a single process -     tf.config.experimental.set_visible_devices(gpus[sdp.local_rank()], 'GPU') - - # Prepare Dataset - dataset = tf.data.Dataset.from_tensor_slices(...) - - # Define Model - mnist_model = tf.keras.Sequential(...) - loss = tf.losses.SparseCategoricalCrossentropy() - - # Scale Learning Rate - # LR for 8 node run : 0.000125 - # LR for single node run : 0.001 - opt = tf.optimizers.Adam(0.000125 * sdp.size()) - - @tf.function - def training_step(images, labels, first_batch): -     with tf.GradientTape() as tape: -         probs = mnist_model(images, training=True) -         loss_value = loss(labels, probs) - -     # Wrap tf.GradientTape with the library's DistributedGradientTape -     tape = sdp.DistributedGradientTape(tape) - -     grads = tape.gradient(loss_value, mnist_model.trainable_variables) -     opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) - -     if first_batch: -        # Broadcast model and optimizer variables -        sdp.broadcast_variables(mnist_model.variables, root_rank=0) -        sdp.broadcast_variables(opt.variables(), root_rank=0) - -     return loss_value - - ... - - # Save checkpoints only from master node. - if sdp.rank() == 0: -     checkpoint.save(checkpoint_dir) - - -.. _tensorflow-sdp-api: - -TensorFlow API -============== - -.. function:: smdistributed.dataparallel.tensorflow.init() - - Initialize ``smdistributed.dataparallel``. Must be called at the - beginning of the training script. - - - **Inputs:** - - - ``None`` - - **Returns:** - - - ``None`` - - - .. rubric:: Notes - - ``init()`` needs to be called only once. 
It will throw an error if - called more than once: - - ``init() called more than once. smdistributed.dataparallel is already initialized.`` - - -.. function:: smdistributed.dataparallel.tensorflow.size() - - The total number of GPUs across all the nodes in the cluster. For - example, in a 8 node cluster with 8 GPUs each, ``size`` will be equal - to 64. - - - **Inputs:** - - - ``None`` - - **Returns:** - - - An integer scalar containing the total number of GPUs, across all - nodes in the cluster. - - -.. function:: smdistributed.dataparallel.tensorflow.local_size() - - The total number of GPUs on a node. For example, on a node with 8 - GPUs, ``local_size`` will be equal to 8. - - **Inputs:** - - - ``None`` - - **Returns:** - - - An integer scalar containing the total number of GPUs on itself. - - -.. function:: smdistributed.dataparallel.tensorflow.rank() - - The rank of the node in the cluster. The rank ranges from 0 to number of - nodes - 1. This is similar to MPI's World Rank. - - **Inputs:** - - - ``None`` - - **Returns:** - - - An integer scalar containing the rank of the node. - - -.. function:: smdistributed.dataparallel.tensorflow.local_rank() - - Local rank refers to the relative rank of the - GPUs’ ``smdistributed.dataparallel`` processes within the node. For - example, if a node contains 8 GPUs, it has - 8 ``smdistributed.dataparallel`` processes, then each process will - get a local rank ranging from 0 to 7. - - **Inputs:** - - - ``None`` - - **Returns:** - - - An integer scalar containing the rank of the GPU and - its ``smdistributed.dataparallel`` process. - - -.. function:: smdistributed.dataparallel.tensorflow.allreduce(tensor, param_index, num_params, compression=Compression.none, op=ReduceOp.AVERAGE) - - Performs an all-reduce operation on a tensor (``tf.Tensor``). - - ``smdistributed.dataparallel`` AllReduce API can be used for all - reducing gradient tensors or any other tensors. By - default, ``smdistributed.dataparallel`` AllReduce averages the - tensors across the participating workers. - ​ - **Inputs:** - - - ``tensor (tf.Tensor)(required)``: The tensor to be all-reduced. The shape of the input must be identical across all ranks. - - ``param_index (int)(required):`` 0 if you are reducing a single tensor. Index of the tensor if you are reducing a list of tensors. - - ``num_params (int)(required):`` len(tensor). - - ``compression (smdistributed.dataparallel.tensorflow.Compression)(optional)``: Compression algorithm used to reduce the amount of data sent and received by each worker node. Defaults to not using compression. - - *  Supported compression types - ``none``, ``fp16`` - - - ``op (optional)(smdistributed.dataparallel.tensorflow.ReduceOp)``: The reduction operation to combine tensors across different ranks. Defaults to ``Average`` if None is given. - - * Supported ops: ``SUM``, ``MIN``, ``MAX``, ``AVERAGE`` - - **Returns:** - - - A tensor of the same shape and type as input ``tensor``, all-reduced across all the processes. - - -.. function:: smdistributed.dataparallel.tensorflow.broadcast_global_variables(root_rank) - - Broadcasts all global variables from root rank to all other processes. - - **Inputs:** - - - ``root_rank (int)(required):`` Rank of the process from which global - variables will be broadcasted to all other processes. - - **Returns:** - - - ``None`` - - -.. function:: smdistributed.dataparallel.tensorflow.broadcast_variables(variables, root_rank) - - Applicable for TensorFlow 2.x only. 
- - Broadcasts variables from root rank to all other processes. - - With TensorFlow 2.x, ``broadcast_variables`` is used to - broadcast ``model.variables`` and ``optimizer.variables`` after - initialization from the leader node to all the worker nodes. This - ensures a consistent initialization across all the worker ranks. - - **Inputs:** - - - ``variables (tf.Variable)(required):`` Variables to be broadcasted. - - ``root_rank (int)(required):`` Rank of the process from which - variables will be broadcasted to all other processes. - - **Returns:** - - - ``None`` - - -.. function:: smdistributed.dataparallel.tensorflow.oob_allreduce(tensor, compression=Compression.none, op=ReduceOp.AVERAGE) - - OutOfBand (oob) AllReduce is a simplified AllReduce function for use cases - such as calculating the total loss across all the GPUs during training. - oob_allreduce averages the tensors, as the reduction operation, across the - worker nodes. - - **Inputs:** - - - ``tensor (tf.Tensor)(required)``: The tensor to be all-reduced. The shape of the input must be identical across all worker nodes. - - ``compression`` (optional): Compression algorithm used to reduce the amount of data sent and received by each worker node. Defaults to not using compression. - - * Supported compression types - ``none``, ``fp16`` - - - ``op (smdistributed.dataparallel.tensorflow.ReduceOp)(optional)``: The reduction operation to combine tensors across different worker nodes. Defaults to ``Average`` if None is given. - - * Supported ops: ``AVERAGE`` - - **Returns:** - - - ``None`` - - .. rubric:: Notes - - ``smdistributed.dataparallel.tensorflow.oob_allreduce``, in most - cases, is ~2x slower - than ``smdistributed.dataparallel.tensorflow.allreduce``, so it is not - recommended for performing gradient reduction during the - training - process. ``smdistributed.dataparallel.tensorflow.oob_allreduce`` internally - uses NCCL AllReduce with ``ncclSum`` as the reduction operation. - - -.. function:: smdistributed.dataparallel.tensorflow.overlap(tensor) - - This function is applicable only for models compiled with XLA. Use this - function to enable ``smdistributed.dataparallel`` to efficiently - overlap the backward pass with the AllReduce operation. - - Example usage: - - .. code:: python - - layer = tf.nn.dropout(...) # Or any other layer - layer = smdistributed.dataparallel.tensorflow.overlap(layer) - - The overlap operation is inserted into the TF graph as a node. It - behaves as an identity operation, and helps achieve - communication overlap with the backward pass. - - **Inputs:** - - - ``tensor (tf.Tensor)(required):`` The tensor to be all-reduced. - - **Returns:** - - - ``None`` - - .. rubric:: Notes - - This operation helps speed up distributed training, because - the AllReduce operation does not have to wait for all the gradients to - be ready. Backward propagation proceeds sequentially from the output - layer of the network to the input layer. When the gradient computation - for a layer finishes, ``smdistributed.dataparallel`` adds the gradients to a - fusion buffer. As soon as the size of the fusion buffer reaches a - predefined threshold (25 MB), ``smdistributed.dataparallel`` starts - the AllReduce operation. - - -.. function:: smdistributed.dataparallel.tensorflow.broadcast(tensor, root_rank) - - Broadcasts the input tensor on root rank to the same input tensor on all - other ``smdistributed.dataparallel`` processes. 
- ​ - The broadcast will not start until all processes are ready to send and - receive the tensor. - - **Inputs:** - - - ``tensor (tf.Tensor)(required):`` The tensor to be broadcasted. - - ``root_rank (int)(required):`` Rank of the process from which - tensor will be broadcasted to all other processes. - - **Returns:** - - - A tensor of the same shape and type as tensor, with the value - broadcasted from root rank. - - -.. function:: smdistributed.dataparallel.tensorflow.shutdown() - - Shuts down ``smdistributed.dataparallel``. Optional to call at the end - of the training script. - - **Inputs:** - - - ``None`` - - **Returns:** - - - ``None`` - - -.. function:: smdistributed.dataparallel.tensorflow.DistributedOptimizer - - Applicable if you use the ``tf.estimator`` API in TensorFlow 2.x (2.3.1). - ​ - Construct a new ``DistributedOptimizer`` , which uses TensorFlow - optimizer under the hood for computing single-process gradient values - and applying gradient updates after the gradient values have been - combined across all ``smdistributed.dataparallel`` workers. - ​ - Example usage: - - .. code:: python - - opt = ... # existing optimizer from tf.train package or your custom optimizer - opt = smdistributed.dataparallel.tensorflow.DistributedOptimizer(opt) - - - - ``optimizer (tf.train.Optimizer)(required):`` TF Optimizer to use for computing gradients and applying updates. - - - ``name (str)(optional):`` Name prefix for the operations created when applying gradients. Defaults to ``smdistributed.dataparallel`` followed by provided optimizer type. - - - ``use_locking (bool)(optional):`` Whether to use locking when updating variables. Defaults to ``False``. - - - ``device_dense:`` Not supported. Raises not supported error. - - - ``device_sparse:`` Not supported. Raises not supported error. - - - ``compression (smdistributed.dataparallel.tensorflow.Compression)(optional)``: Compression algorithm used to reduce the amount of data sent and received by each worker node. Defaults to not using compression. - - *  Supported compression types - ``none``, ``fp16`` - - - ``sparse_as_dense:`` Treats sparse gradient tensor as dense tensor. Defaults to ``False``. - - - ``op (smdistributed.dataparallel.tensorflow.ReduceOp)(optional)``: The reduction operation to combine tensors across different ranks. Defaults to ``Average`` if None is given. - - * Supported ops: ``AVERAGE`` - - - ``bucket_cap_mb (int)(optional):`` Size of ``smdistributed.dataparallel`` fusion buffer size. Defaults to 25MB that works optimally for most case. If you provide a value, expects the (value * 1024 * 1024) i.e., bytes to be multiple of 128. - - -.. function:: smdistributed.dataparallel.tensorflow.DistributedGradientTape - - Applicable to TensorFlow 2.x only. - - Construct a new ``DistributedGradientTape``, which uses - TensorFlow’s ``GradientTape`` under the hood, using an AllReduce to - combine gradient values before applying gradients to model weights. - ​ - Example Usage: - - .. code:: python - - with tf.GradientTape() as tape: -       output = model(input) -       loss_value = loss(label, output) - - # Wrap in smdistributed.dataparallel's DistributedGradientTape - tape = smdistributed.dataparallel.tensorflow.DistributedGradientTape(tape) - - - - ``gradtape (tf.GradientTape)(required):`` GradientTape to use for computing gradients and applying updates. - - - ``device_dense:`` Not supported. Raises not supported error. - - - ``device_sparse:`` Not supported. Raises not supported error. 
- - - ``compression (smdistributed.dataparallel.tensorflow.Compression)(optional)``: Compression algorithm used to reduce the amount of data sent and received by each worker node. Defaults to not using compression. - - *  Supported compression types - ``none``, ``fp16`` - - - ``sparse_as_dense:`` Treats sparse gradient tensor as dense tensor. Defaults to ``False``. - - - ``op (smdistributed.dataparallel.tensorflow.ReduceOp)(optional)``: The reduction operation to combine tensors across different ranks. Defaults to ``Average`` if None is given. - - * Supported ops: ``AVERAGE`` - - -.. function:: smdistributed.dataparallel.tensorflow.BroadcastGlobalVariablesHook - - Applicable if you use the ``tf.estimator`` API in TensorFlow 2.x (2.3.1). - - - ``SessionRunHook`` that will broadcast all global variables from root - rank to all other processes during initialization. - ​ - This is necessary to ensure consistent initialization of all workers - when training is started with random weights or restored from a - checkpoint. - ​ - Example Usage: - - .. code:: python - - hooks = [smdistributed.dataparallel.tensorflow.BroadcastGlobalVariablesHook(root_rank=0)] - ... - with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, -                                        hooks=hooks, -                                        config=config) as mon_sess: -      ... - - - - ``root_rank (int)(required):`` Rank of the process from which global - variables will be broadcasted to all other processes. - - -.. function:: smdistributed.dataparallel.tensorflow.Compression - - Optional Gradient Compression algorithm that can be used in AllReduce - operation. - - - ``none``: alias for ``NoneCompression``. Do not compression gradient - tensors. - - ``fp16``: alias for ``FP16Compression``. Compress the floating point - gradient tensors to 16-bit (FP16) - - -.. function:: smdistributed.dataparallel.tensorflow.ReduceOp - - Supported reduction operations in ``smdistributed.dataparallel``. - - - ``AVERAGE`` - - ``SUM`` - - ``MIN`` - - ``MAX`` diff --git a/doc/api/training/sdp_versions/v1.0.0/smd_data_parallel_pytorch.rst b/doc/api/training/sdp_versions/v1.0.0/smd_data_parallel_pytorch.rst deleted file mode 100644 index 1e18472ca0..0000000000 --- a/doc/api/training/sdp_versions/v1.0.0/smd_data_parallel_pytorch.rst +++ /dev/null @@ -1,531 +0,0 @@ -############################################################## -PyTorch Guide to SageMaker's distributed data parallel library -############################################################## - -.. admonition:: Contents - - - :ref:`pytorch-sdp-modify-1.0.0` - - :ref:`pytorch-sdp-api-1.0.0` - -.. _pytorch-sdp-modify-1.0.0: - -Modify a PyTorch training script to use SageMaker data parallel -====================================================================== - -The following steps show you how to convert a PyTorch training script to -utilize SageMaker's distributed data parallel library. - -The distributed data parallel library APIs are designed to be close to PyTorch Distributed Data -Parallel (DDP) APIs. -See `SageMaker distributed data parallel PyTorch examples `__ for additional details on how to implement the data parallel library -API offered for PyTorch. - - -- First import the distributed data parallel library’s PyTorch client and initialize it. You also import - the distributed data parallel library module for distributed training. - - .. 
code:: python - - import smdistributed.dataparallel.torch.distributed as dist - - from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as DDP - - dist.init_process_group() - - -- Pin each GPU to a single distributed data parallel library process with ``local_rank`` - this - refers to the relative rank of the process within a given node. - ``smdistributed.dataparallel.torch.get_local_rank()`` API provides - you the local rank of the device. The leader node will be rank 0, and - the worker nodes will be rank 1, 2, 3, and so on. This is invoked in - the next code block as ``dist.get_local_rank()``. - - .. code:: python - - torch.cuda.set_device(dist.get_local_rank()) - - -- Then wrap the PyTorch model with the distributed data parallel library’s DDP. - - .. code:: python - - model = ... - # Wrap model with SageMaker's DistributedDataParallel - model = DDP(model) - - -- Modify the ``torch.utils.data.distributed.DistributedSampler`` to - include the cluster’s information. Set``num_replicas`` to the - total number of GPUs participating in training across all the nodes - in the cluster. This is called ``world_size``. You can get - ``world_size`` with - ``smdistributed.dataparallel.torch.get_world_size()`` API. This is - invoked in the following code as ``dist.get_world_size()``. Also - supply the node rank using - ``smdistributed.dataparallel.torch.get_rank()``. This is invoked as - ``dist.get_rank()``. - - .. code:: python - - train_sampler = DistributedSampler(train_dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank()) - - -- Finally, modify your script to save checkpoints only on the leader - node. The leader node will have a synchronized model. This also - avoids worker nodes overwriting the checkpoints and possibly - corrupting the checkpoints. - -.. code:: python - - if dist.get_rank() == 0: - torch.save(...) - - -All put together, the following is an example PyTorch training script -you will have for distributed training with the distributed data parallel library: - -.. code:: python - - # Import distributed data parallel library PyTorch API - import smdistributed.dataparallel.torch.distributed as dist - - # Import distributed data parallel library PyTorch DDP - from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as DDP - - # Initialize distributed data parallel library - dist.init_process_group() - - class Net(nn.Module): -     ... -     # Define model - - def train(...): -     ... -     # Model training - - def test(...): -     ... -     # Model evaluation - - def main(): - -     # Scale batch size by world size -     batch_size //= dist.get_world_size() // 8 -     batch_size = max(batch_size, 1) - -     # Prepare dataset -     train_dataset = torchvision.datasets.MNIST(...) - -     # Set num_replicas and rank in DistributedSampler -     train_sampler = torch.utils.data.distributed.DistributedSampler( -             train_dataset, -             num_replicas=dist.get_world_size(), -             rank=dist.get_rank()) - -     train_loader = torch.utils.data.DataLoader(..) - -     # Wrap the PyTorch model with distributed data parallel library’s DDP -     model = DDP(Net().to(device)) - -     # Pin each GPU to a single distributed data parallel library process. -     torch.cuda.set_device(local_rank) -     model.cuda(local_rank) - -     # Train -     optimizer = optim.Adadelta(...) -     scheduler = StepLR(...) -     for epoch in range(1, args.epochs + 1): -         train(...) 
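-         # Evaluate on the leader node (rank 0) only, so the evaluation is not duplicated on every worker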
-         if rank == 0: -             test(...) -         scheduler.step() - -     # Save model on master node. -     if dist.get_rank() == 0: -         torch.save(...) - - if __name__ == '__main__': -     main() - - -.. _pytorch-sdp-api-1.0.0: - -PyTorch API -=========== - -.. rubric:: Supported versions - -**PyTorch 1.6.0, 1.7.1** - - -.. function:: smdistributed.dataparallel.torch.distributed.is_available() - :noindex: - - Check if script started as a distributed job. For local runs user can - check that is_available returns False and run the training script - without calls to ``smdistributed.dataparallel``. - - **Inputs:** - - - ``None`` - - **Returns:** - - - ``True`` if started as a distributed job, ``False`` otherwise - - -.. function:: smdistributed.dataparallel.torch.distributed.init_process_group(*args, **kwargs) - :noindex: - - Initialize ``smdistributed.dataparallel``. Must be called at the - beginning of the training script, before calling any other methods. - ​ - Process group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with ``torch.distributed`` only. Only - supported value is - ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - ​ - After this - call, ``smdistributed.dataparallel.torch.distributed.is_initialized()`` will - return ``True``. - ​ - - **Inputs:** - - - ``None`` - - **Returns:** - - - ``None`` - - -.. function:: smdistributed.dataparallel.torch.distributed.is_initialized() - :noindex: - - Checks if the default process group has been initialized. - - **Inputs:** - - - ``None`` - - **Returns:** - - - ``True`` if initialized, else ``False``. - - -.. function:: smdistributed.dataparallel.torch.distributed.get_world_size(group=smdistributed.dataparallel.torch.distributed.group.WORLD) - :noindex: - - The total number of GPUs across all the nodes in the cluster. For - example, in a 8 node cluster with 8 GPU each, size will be equal to 64. - - **Inputs:** - - - ``group (smdistributed.dataparallel.torch.distributed.group) (optional):`` Process - group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with torch.distributed only. Only - supported value is - ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - **Returns:** - - - An integer scalar containing the total number of GPUs in the training - job, across all nodes in the cluster. - - -.. function:: smdistributed.dataparallel.torch.distributed.get_rank(group=smdistributed.dataparallel.torch.distributed.group.WORLD) - :noindex: - - The rank of the node in the cluster. The rank ranges from 0 to number of - nodes - 1. This is similar to MPI's World Rank. - - - **Inputs:** - - - ``group (smdistributed.dataparallel.torch.distributed.group) (optional):`` Process - group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with torch.distributed only. Only - supported value is - ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - **Returns:** - - - An integer scalar containing the rank of the worker node. - - -.. function:: smdistributed.dataparallel.torch.distributed.get_local_rank() - :noindex: - - Local rank refers to the relative rank of - the ``smdistributed.dataparallel`` process within the node the current - process is running on. For example, if a node contains 8 GPUs, it has - 8 ``smdistributed.dataparallel`` processes. Each process has - a ``local_rank`` ranging from 0 to 7. 
- - **Inputs:** - - - ``None`` - - **Returns:** - - - An integer scalar containing the rank of the GPU and - its ``smdistributed.dataparallel`` process. - - -.. function:: smdistributed.dataparallel.torch.distributed.all_reduce(tensor, op=smdistributed.dataparallel.torch.distributed.ReduceOp.SUM, group=smdistributed.dataparallel.torch.distributed.group.WORLD, async_op=False) - :noindex: - - Performs an all-reduce operation on a tensor (torch.tensor) across - all ``smdistributed.dataparallel`` workers - - ``smdistributed.dataparallel`` AllReduce API can be used for all - reducing gradient tensors or any other tensors.  By - default, ``smdistributed.dataparallel`` AllReduce reduces the tensor - data across all ``smdistributed.dataparallel`` workers in such a way - that all get the final result. - - After the call ``tensor`` is going to be bitwise identical in all - processes. - - **Inputs:** - - - ``tensor (torch.tensor) (required):`` Input and output of the collective. The function operates in-place. - - - ``op (smdistributed.dataparallel.torch.distributed.ReduceOp) (optional)``: The reduction operation to combine tensors across different ranks.  Defaults to ``SUM`` if None is given. - - * Supported ops: ``AVERAGE``, ``SUM``, ``MIN``, ``MAX`` - - - ``group (smdistributed.dataparallel.torch.distributed.group) (optional):`` Process group is not supported in ``smdistributed.dataparallel``. This parameter exists for API parity with torch.distributed only. - - * Only supported value is ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - - ``async_op (bool) (optional):`` Whether this op should be an async op. Defaults to ``False``. - - **Returns:** - - - Async op work handle, if async_op is set to True. ``None``, - otherwise. - - .. rubric:: Notes - - ``smdistributed.dataparallel.torch.distributed.allreduce``, in most - cases, is ~2X slower than all-reducing - with ``smdistributed.dataparallel.torch.parallel.distributed.DistributedDataParallel`` and - hence, it is not recommended to be used for performing gradient - reduction during the training - process. ``smdistributed.dataparallel.torch.distributed.allreduce`` internally - uses NCCL AllReduce with ``ncclSum`` as the reduction operation. - - -.. function:: smdistributed.dataparallel.torch.distributed.broadcast(tensor, src=0, group=smdistributed.dataparallel.torch.distributed.group.WORLD, async_op=False) - :noindex: - - Broadcasts the tensor (torch.tensor) to the whole group. - - ``tensor`` must have the same number of elements as GPUs in the - cluster. - - **Inputs:** - - - ``tensor (torch.tensor)(required)`` - - - ``src (int)(optional)`` - - - ``group (smdistributed.dataparallel.torch.distributed.group)(optional):`` Process group is not supported in ``smdistributed.dataparallel``. This parameter exists for API parity with ``torch.distributed`` only. - - * Only supported value is ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - - ``async_op (bool)(optional):`` Whether this op should be an async op. Defaults to ``False``. - - **Returns:** - - - Async op work handle, if async_op is set to True. ``None``, otherwise. - - -.. function:: smdistributed.dataparallel.torch.distributed.all_gather(tensor_list, tensor, group=smdistributed.dataparallel.torch.distributed.group.WORLD, async_op=False) - :noindex: - - Gathers tensors from the whole group in a list. - - - **Inputs:** - - - ``tensor_list (list[torch.tensor])(required):`` Output list. 
It - should contain correctly-sized tensors to be used for output of the - collective. - - ``tensor (torch.tensor)(required):`` Tensor to be broadcast from - current process. - - ``group (smdistributed.dataparallel.torch.distributed.group)(optional):`` Process - group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with torch.distributed only. Only - supported value is - ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - ``async_op (bool)(optional):`` Whether this op should be an async op. - Defaults to ``False``. - - **Returns:** - - - Async op work handle, if async_op is set to True. ``None``, - otherwise. - - -.. function:: smdistributed.dataparallel.torch.distributed.all_to_all_single(output_t, input_t, output_split_sizes=None, input_split_sizes=None, group=group.WORLD, async_op=False) - :noindex: - - Each process scatters the input tensor to all processes in a group and returns the gathered tensor in the output. - - **Inputs:** - - - ``output_t (torch.tensor)(required):`` Output tensor that receives the data gathered from all processes. - - ``input_t (torch.tensor)(required):`` Input tensor to be scattered across the processes. - - ``output_split_sizes (list[int])(optional):`` Output split sizes for uneven gathering. - - ``input_split_sizes (list[int])(optional):`` Input split sizes for uneven scattering. - - ``group (smdistributed.dataparallel.torch.distributed.group)(optional):`` Process - group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with torch.distributed only. Only - supported value is - ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - ``async_op (bool)(optional):`` Whether this op should be an async op. - Defaults to ``False``. - - **Returns:** - - - Async op work handle, if async_op is set to True. ``None``, - otherwise. - - -.. function:: smdistributed.dataparallel.torch.distributed.barrier(group=smdistributed.dataparallel.torch.distributed.group.WORLD, async_op=False) - :noindex: - - Synchronizes all ``smdistributed.dataparallel`` processes. - - **Inputs:** - - - ``group (smdistributed.dataparallel.torch.distributed.group)(optional):`` Process - group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with torch.distributed only. - - * Only supported value is ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - - ``async_op (bool)(optional):`` Whether this op should be an async op. - Defaults to ``False``. - - **Returns:** - - - Async op work handle, if async_op is set to True. ``None``, - otherwise. - - -.. class:: smdistributed.dataparallel.torch.parallel.DistributedDataParallel(module, device_ids=None, output_device=None, broadcast_buffers=True, process_group=None, bucket_cap_mb=None) - :noindex: - - ``smdistributed.dataparallel``'s implementation of distributed data - parallelism for PyTorch. In most cases, wrapping your PyTorch Module - with ``smdistributed.dataparallel``'s ``DistributedDataParallel (DDP)`` is - all you need to do to use ``smdistributed.dataparallel``. - - Creating this DDP class requires ``smdistributed.dataparallel`` to - already be initialized - with ``smdistributed.dataparallel.torch.distributed.init_process_group()``. - - This container parallelizes the application of the given module by - splitting the input across the specified devices by chunking in the - batch dimension. The module is replicated on each machine and each - device, and each such replica handles a portion of the input. During the - backwards pass, gradients from each node are averaged. 
- - The batch size should be larger than the number of GPUs used locally. - ​ - Example usage - of ``smdistributed.dataparallel.torch.parallel.DistributedDataParallel``: - - .. code:: python - - import torch - import smdistributed.dataparallel.torch.distributed as dist - from smdistributed.dataparallel.torch.parallel import DistributedDataParallel as DDP - - dist.init_process_group() - - # Pin GPU to be used to process local rank (one GPU per process) - torch.cuda.set_device(dist.get_local_rank()) - - # Build model and optimizer - model = ... - optimizer = torch.optim.SGD(model.parameters(), -                             lr=1e-3 * dist.get_world_size()) - # Wrap model with smdistributed.dataparallel's DistributedDataParallel - model = DDP(model) - - **Parameters:** - - - ``module (torch.nn.Module)(required):`` PyTorch NN Module to be - parallelized - - ``device_ids (list[int])(optional):`` CUDA devices. This should only - be provided when the input module resides on a single CUDA device. - For single-device modules, - the ``ith module replica is placed on device_ids[i]``. For - multi-device modules and CPU modules, device_ids must be None or an - empty list, and input data for the forward pass must be placed on the - correct device. Defaults to ``None``. - - ``output_device (int)(optional):`` Device location of output for - single-device CUDA modules. For multi-device modules and CPU modules, - it must be None, and the module itself dictates the output location. - (default: device_ids[0] for single-device modules).  Defaults - to ``None``. - - ``broadcast_buffers (bool)(optional):`` Flag that enables syncing - (broadcasting) buffers of the module at beginning of the forward - function. ``smdistributed.dataparallel`` does not support broadcast - buffer yet. Please set this to ``False``. - - ``process_group(smdistributed.dataparallel.torch.distributed.group)(optional):`` Process - group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with torch.distributed only. Only - supported value is - ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` Defaults - to ``None.`` - - ``bucket_cap_mb (int)(optional):`` DistributedDataParallel will - bucket parameters into multiple buckets so that gradient reduction of - each bucket can potentially overlap with backward - computation. ``bucket_cap_mb`` controls the bucket size in - MegaBytes (MB) (default: 25). - - .. rubric:: Notes - - - This module assumes all parameters are registered in the model by the - time it is created. No parameters should be added nor removed later. - - This module assumes all parameters are registered in the model of - each distributed processes are in the same order. The module itself - will conduct gradient all-reduction following the reverse order of - the registered parameters of the model. In other words, it is users’ - responsibility to ensure that each distributed process has the exact - same model and thus the exact same parameter registration order. - - You should never change the set of your model’s parameters after - wrapping up your model with DistributedDataParallel. In other words, - when wrapping up your model with DistributedDataParallel, the - constructor of DistributedDataParallel will register the additional - gradient reduction functions on all the parameters of the model - itself at the time of construction. 
If you change the model’s - parameters after the DistributedDataParallel construction, this is - not supported and unexpected behaviors can happen, since some - parameters’ gradient reduction functions might not get called. - - -.. class:: smdistributed.dataparallel.torch.distributed.ReduceOp - :noindex: - - An enum-like class for supported reduction operations - in ``smdistributed.dataparallel``. - - The values of this class can be accessed as attributes, for - example, ``ReduceOp.SUM``. They are used in specifying strategies for - reduction collectives such as -  ``smdistributed.dataparallel.torch.distributed.all_reduce(...)``. - - - ``AVERAGE`` - - ``SUM`` - - ``MIN`` - - ``MAX`` diff --git a/doc/api/training/sdp_versions/v1.0.0/smd_data_parallel_tensorflow.rst b/doc/api/training/sdp_versions/v1.0.0/smd_data_parallel_tensorflow.rst deleted file mode 100644 index 43e4d8f26d..0000000000 --- a/doc/api/training/sdp_versions/v1.0.0/smd_data_parallel_tensorflow.rst +++ /dev/null @@ -1,555 +0,0 @@ -################################################################# -TensorFlow Guide to SageMaker's distributed data parallel library -################################################################# - -.. admonition:: Contents - - - :ref:`tensorflow-sdp-modify-1.0.0` - - :ref:`tensorflow-sdp-api-1.0.0` - -.. _tensorflow-sdp-modify-1.0.0: - -Modify a TensorFlow 2.x training script to use SageMaker data parallel -====================================================================== - -The following steps show you how to convert a TensorFlow 2.x training -script to utilize the distributed data parallel library. - -The distributed data parallel library APIs are designed to be close to Horovod APIs. -See `SageMaker distributed data parallel TensorFlow examples `__ for additional details on how to implement the data parallel library -API offered for TensorFlow. - -- First import the distributed data parallel library’s TensorFlow client and initialize it: - - .. code:: python - - import smdistributed.dataparallel.tensorflow as sdp - sdp.init() - - -- Pin each GPU to a single smdistributed.dataparallel process - with ``local_rank`` - this refers to the relative rank of the - process within a given node. ``sdp.tensorflow.local_rank()`` API - provides you the local rank of the device. The leader node will be - rank 0, and the worker nodes will be rank 1, 2, 3, and so on. This is - invoked in the next code block as ``sdp.local_rank()``. - ``set_memory_growth`` is not directly related to SMD, but must be set - for distributed training with TensorFlow. - - .. code:: python - - gpus = tf.config.experimental.list_physical_devices('GPU') - for gpu in gpus: -     tf.config.experimental.set_memory_growth(gpu, True) - if gpus: -     tf.config.experimental.set_visible_devices(gpus[sdp.local_rank()], 'GPU') - - -- Scale the learning rate by the number of workers. - ``sdp.tensorflow.size()`` API provides you number of workers in the - cluster. This is invoked in the next code block as ``sdp.size()``. - - .. code:: python - - learning_rate = learning_rate * sdp.size() - - -- Use the library’s ``DistributedGradientTape`` to optimize AllReduce - operations during training. This wraps ``tf.GradientTape``. - - .. 
code:: python - - with tf.GradientTape() as tape: -       output = model(input) -       loss_value = loss(label, output) - - # Wrap tf.GradientTape with the library's DistributedGradientTape - tape = sdp.DistributedGradientTape(tape) - - -- Broadcast initial model variables from the leader node (rank 0) to - all the worker nodes (ranks 1 through n). This is needed to ensure a - consistent initialization across all the worker ranks. For this, you - use ``sdp.tensorflow.broadcast_variables`` API after the - model and optimizer variables are initialized. This is invoked in the - next code block as ``sdp.broadcast_variables()``. - - .. code:: python - - sdp.broadcast_variables(model.variables, root_rank=0) - sdp.broadcast_variables(opt.variables(), root_rank=0) - - -- Finally, modify your script to save checkpoints only on the leader - node. The leader node will have a synchronized model. This also - avoids worker nodes overwriting the checkpoints and possibly - corrupting the checkpoints. - - .. code:: python - - if sdp.rank() == 0: -     checkpoint.save(checkpoint_dir) - - -All put together, the following is an example TensorFlow2 training -script you will have for distributed training with the library. - -.. code:: python - - import tensorflow as tf - - # Import the library's TF API - import smdistributed.dataparallel.tensorflow as sdp - - # Initialize the library - sdp.init() - - gpus = tf.config.experimental.list_physical_devices('GPU') - for gpu in gpus: -     tf.config.experimental.set_memory_growth(gpu, True) - if gpus: -     # Pin GPUs to a single process -     tf.config.experimental.set_visible_devices(gpus[sdp.local_rank()], 'GPU') - - # Prepare Dataset - dataset = tf.data.Dataset.from_tensor_slices(...) - - # Define Model - mnist_model = tf.keras.Sequential(...) - loss = tf.losses.SparseCategoricalCrossentropy() - - # Scale Learning Rate - # LR for 8 node run : 0.000125 - # LR for single node run : 0.001 - opt = tf.optimizers.Adam(0.000125 * sdp.size()) - - @tf.function - def training_step(images, labels, first_batch): -     with tf.GradientTape() as tape: -         probs = mnist_model(images, training=True) -         loss_value = loss(labels, probs) - -     # Wrap tf.GradientTape with the library's DistributedGradientTape -     tape = sdp.DistributedGradientTape(tape) - -     grads = tape.gradient(loss_value, mnist_model.trainable_variables) -     opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) - -     if first_batch: -        # Broadcast model and optimizer variables -        sdp.broadcast_variables(mnist_model.variables, root_rank=0) -        sdp.broadcast_variables(opt.variables(), root_rank=0) - -     return loss_value - - ... - - # Save checkpoints only from master node. - if sdp.rank() == 0: -     checkpoint.save(checkpoint_dir) - - -.. _tensorflow-sdp-api-1.0.0: - -TensorFlow API -============== - -.. rubric:: Supported versions - -**TensorFlow 2.3.x - 2.4.1** - - -.. function:: smdistributed.dataparallel.tensorflow.init() - :noindex: - - Initialize ``smdistributed.dataparallel``. Must be called at the - beginning of the training script. - - - **Inputs:** - - - ``None`` - - **Returns:** - - - ``None`` - - - .. rubric:: Notes - - ``init()`` needs to be called only once. It will throw an error if - called more than once: - - ``init() called more than once. smdistributed.dataparallel is already initialized.`` - - -.. function:: smdistributed.dataparallel.tensorflow.size() - :noindex: - - The total number of GPUs across all the nodes in the cluster. 
For - example, in a 8 node cluster with 8 GPUs each, ``size`` will be equal - to 64. - - - **Inputs:** - - - ``None`` - - **Returns:** - - - An integer scalar containing the total number of GPUs, across all - nodes in the cluster. - - -.. function:: smdistributed.dataparallel.tensorflow.local_size() - :noindex: - - The total number of GPUs on a node. For example, on a node with 8 - GPUs, ``local_size`` will be equal to 8. - - **Inputs:** - - - ``None`` - - **Returns:** - - - An integer scalar containing the total number of GPUs on itself. - - -.. function:: smdistributed.dataparallel.tensorflow.rank() - :noindex: - - The rank of the node in the cluster. The rank ranges from 0 to number of - nodes - 1. This is similar to MPI's World Rank. - - **Inputs:** - - - ``None`` - - **Returns:** - - - An integer scalar containing the rank of the node. - - -.. function:: smdistributed.dataparallel.tensorflow.local_rank() - :noindex: - - Local rank refers to the relative rank of the - GPUs’ ``smdistributed.dataparallel`` processes within the node. For - example, if a node contains 8 GPUs, it has - 8 ``smdistributed.dataparallel`` processes, then each process will - get a local rank ranging from 0 to 7. - - **Inputs:** - - - ``None`` - - **Returns:** - - - An integer scalar containing the rank of the GPU and - its ``smdistributed.dataparallel`` process. - - -.. function:: smdistributed.dataparallel.tensorflow.allreduce(tensor, param_index, num_params, compression=Compression.none, op=ReduceOp.AVERAGE) - :noindex: - - Performs an all-reduce operation on a tensor (``tf.Tensor``). - - ``smdistributed.dataparallel`` AllReduce API can be used for all - reducing gradient tensors or any other tensors. By - default, ``smdistributed.dataparallel`` AllReduce averages the - tensors across the participating workers. - ​ - **Inputs:** - - - ``tensor (tf.Tensor)(required)``: The tensor to be all-reduced. The shape of the input must be identical across all ranks. - - ``param_index (int)(required):`` 0 if you are reducing a single tensor. Index of the tensor if you are reducing a list of tensors. - - ``num_params (int)(required):`` len(tensor). - - ``compression (smdistributed.dataparallel.tensorflow.Compression)(optional)``: Compression algorithm used to reduce the amount of data sent and received by each worker node. Defaults to not using compression. - - *  Supported compression types - ``none``, ``fp16`` - - - ``op (optional)(smdistributed.dataparallel.tensorflow.ReduceOp)``: The reduction operation to combine tensors across different ranks. Defaults to ``Average`` if None is given. - - * Supported ops: ``SUM``, ``MIN``, ``MAX``, ``AVERAGE`` - - **Returns:** - - - A tensor of the same shape and type as input ``tensor``, all-reduced across all the processes. - - -.. function:: smdistributed.dataparallel.tensorflow.broadcast_global_variables(root_rank) - :noindex: - - Broadcasts all global variables from root rank to all other processes. - - **Inputs:** - - - ``root_rank (int)(required):`` Rank of the process from which global - variables will be broadcasted to all other processes. - - **Returns:** - - - ``None`` - - -.. function:: smdistributed.dataparallel.tensorflow.broadcast_variables(variables, root_rank) - :noindex: - - Applicable for TensorFlow 2.x only. - ​ - Broadcasts variables from root rank to all other processes. - ​ - With TensorFlow 2.x, ``broadcast_variables`` is used to - broadcast ``model.variables`` and ``optimizer.variables`` post - initialization from the leader node to all the worker nodes. 
This - ensures a consistent initialization across all the worker ranks. - - **Inputs:** - - - ``variables (tf.Variable)(required):`` Variables to be broadcasted. - - ``root_rank (int)(required):`` Rank of the process from which - variables will be broadcasted to all other processes. - - **Returns:** - - - ``None`` - - -.. function:: smdistributed.dataparallel.tensorflow.oob_allreduce(tensor, compression=Compression.none, op=ReduceOp.AVERAGE) - :noindex: - - OutOfBand (oob) AllReduce is simplified AllReduce function for use cases - such as calculating total loss across all the GPUs in the training. - oob_allreduce average the tensors, as reduction operation, across the - worker nodes. - - **Inputs:** - - - ``tensor (tf.Tensor)(required)``: The tensor to be all-reduced. The shape of the input must be identical across all worker nodes. - - ``compression`` (optional): Compression algorithm used to reduce the amount of data sent and received by each worker node. Defaults to not using compression. - - *  Supported compression types - ``none``, ``fp16`` - - - ``op (smdistributed.dataparallel.tensorflow.ReduceOp)(optional)``: The reduction operation to combine tensors across different worker nodes. Defaults to ``Average`` if None is given. - - * Supported ops: ``AVERAGE`` - - **Returns:** - - - ``None`` - - .. rubric:: Notes - - ``smdistributed.dataparallel.tensorflow.oob_allreduce``, in most - cases, is ~2x slower - than ``smdistributed.dataparallel.tensorflow.allreduce``  so it is not - recommended to be used for performing gradient reduction during the - training - process. ``smdistributed.dataparallel.tensorflow.oob_allreduce`` internally - uses NCCL AllReduce with ``ncclSum`` as the reduction operation. - - -.. function:: smdistributed.dataparallel.tensorflow.overlap(tensor) - :noindex: - - This function is applicable only for models compiled with XLA. Use this - function to enable ``smdistributed.dataparallel`` to efficiently - overlap backward pass with the all reduce operation. - - Example usage: - - .. code:: python - - layer = tf.nn.dropout(...) # Or any other layer - layer = smdistributed.dataparallel.tensorflow.overlap(layer) - - The overlap operation is inserted into the TF graph as a node. It - behaves as an identity operation, and helps in achieving the - communication overlap with backward pass operation. - - **Inputs:** - - - ``tensor (tf.Tensor)(required):`` The tensor to be all-reduced. - - **Returns:** - - - ``None`` - - .. rubric:: Notes - - This operation helps in speeding up distributed training, as - the AllReduce operation does not have to wait for all the gradients to - be ready. Backward propagation proceeds sequentially from the output - layer of the network to the input layer. When the gradient computation - for a layer finishes, ``smdistributed.dataparallel`` adds them to a - fusion buffer. As soon as the size of the fusion buffer reaches a - predefined threshold (25 Mb), ``smdistributed.dataparallel`` starts - the AllReduce operation. - - -.. function:: smdistributed.dataparallel.tensorflow.broadcast(tensor, root_rank) - :noindex: - - Broadcasts the input tensor on root rank to the same input tensor on all - other ``smdistributed.dataparallel`` processes. - ​ - The broadcast will not start until all processes are ready to send and - receive the tensor. - - **Inputs:** - - - ``tensor (tf.Tensor)(required):`` The tensor to be broadcasted. - - ``root_rank (int)(required):`` Rank of the process from which - tensor will be broadcasted to all other processes. 
- - **Returns:** - - - A tensor of the same shape and type as tensor, with the value - broadcasted from root rank. - - -.. function:: smdistributed.dataparallel.tensorflow.shutdown() - :noindex: - - Shuts down ``smdistributed.dataparallel``. Optional to call at the end - of the training script. - - **Inputs:** - - - ``None`` - - **Returns:** - - - ``None`` - - -.. function:: smdistributed.dataparallel.tensorflow.DistributedOptimizer - :noindex: - - Applicable if you use the ``tf.estimator`` API in TensorFlow 2.x (2.3.1). - ​ - Construct a new ``DistributedOptimizer`` , which uses TensorFlow - optimizer under the hood for computing single-process gradient values - and applying gradient updates after the gradient values have been - combined across all ``smdistributed.dataparallel`` workers. - ​ - Example usage: - - .. code:: python - - opt = ... # existing optimizer from tf.train package or your custom optimizer - opt = smdistributed.dataparallel.tensorflow.DistributedOptimizer(opt) - - - - ``optimizer (tf.train.Optimizer)(required):`` TF Optimizer to use for computing gradients and applying updates. - - - ``name (str)(optional):`` Name prefix for the operations created when applying gradients. Defaults to ``smdistributed.dataparallel`` followed by provided optimizer type. - - - ``use_locking (bool)(optional):`` Whether to use locking when updating variables. Defaults to ``False``. - - - ``device_dense:`` Not supported. Raises not supported error. - - - ``device_sparse:`` Not supported. Raises not supported error. - - - ``compression (smdistributed.dataparallel.tensorflow.Compression)(optional)``: Compression algorithm used to reduce the amount of data sent and received by each worker node. Defaults to not using compression. - - *  Supported compression types - ``none``, ``fp16`` - - - ``sparse_as_dense:`` Treats sparse gradient tensor as dense tensor. Defaults to ``False``. - - - ``op (smdistributed.dataparallel.tensorflow.ReduceOp)(optional)``: The reduction operation to combine tensors across different ranks. Defaults to ``Average`` if None is given. - - * Supported ops: ``AVERAGE`` - - - ``bucket_cap_mb (int)(optional):`` Size of ``smdistributed.dataparallel`` fusion buffer size. Defaults to 25MB that works optimally for most case. If you provide a value, expects the (value * 1024 * 1024) i.e., bytes to be multiple of 128. - - -.. function:: smdistributed.dataparallel.tensorflow.DistributedGradientTape - :noindex: - - Applicable to TensorFlow 2.x only. - - Construct a new ``DistributedGradientTape``, which uses - TensorFlow’s ``GradientTape`` under the hood, using an AllReduce to - combine gradient values before applying gradients to model weights. - ​ - Example Usage: - - .. code:: python - - with tf.GradientTape() as tape: -       output = model(input) -       loss_value = loss(label, output) - - # Wrap in smdistributed.dataparallel's DistributedGradientTape - tape = smdistributed.dataparallel.tensorflow.DistributedGradientTape(tape) - - - - ``gradtape (tf.GradientTape)(required):`` GradientTape to use for computing gradients and applying updates. - - - ``device_dense:`` Not supported. Raises not supported error. - - - ``device_sparse:`` Not supported. Raises not supported error. - - - ``compression (smdistributed.dataparallel.tensorflow.Compression)(optional)``: Compression algorithm used to reduce the amount of data sent and received by each worker node. Defaults to not using compression. 
- - *  Supported compression types - ``none``, ``fp16`` - - - ``sparse_as_dense:`` Treats sparse gradient tensor as dense tensor. Defaults to ``False``. - - - ``op (smdistributed.dataparallel.tensorflow.ReduceOp)(optional)``: The reduction operation to combine tensors across different ranks. Defaults to ``Average`` if None is given. - - * Supported ops: ``AVERAGE`` - - -.. function:: smdistributed.dataparallel.tensorflow.BroadcastGlobalVariablesHook - :noindex: - - Applicable if you use the ``tf.estimator`` API in TensorFlow 2.x (2.3.1). - - - ``SessionRunHook`` that will broadcast all global variables from root - rank to all other processes during initialization. - ​ - This is necessary to ensure consistent initialization of all workers - when training is started with random weights or restored from a - checkpoint. - ​ - Example Usage: - - .. code:: python - - hooks = [smdistributed.dataparallel.tensorflow.BroadcastGlobalVariablesHook(root_rank=0)] - ... - with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, -                                        hooks=hooks, -                                        config=config) as mon_sess: -      ... - - - - ``root_rank (int)(required):`` Rank of the process from which global - variables will be broadcasted to all other processes. - - -.. function:: smdistributed.dataparallel.tensorflow.Compression - :noindex: - - Optional Gradient Compression algorithm that can be used in AllReduce - operation. - - - ``none``: alias for ``NoneCompression``. Do not compression gradient - tensors. - - ``fp16``: alias for ``FP16Compression``. Compress the floating point - gradient tensors to 16-bit (FP16) - - -.. function:: smdistributed.dataparallel.tensorflow.ReduceOp - :noindex: - - Supported reduction operations in ``smdistributed.dataparallel``. - - - ``AVERAGE`` - - ``SUM`` - - ``MIN`` - - ``MAX`` diff --git a/doc/api/training/sdp_versions/v1.1.x/smd_data_parallel_pytorch.rst b/doc/api/training/sdp_versions/v1.1.x/smd_data_parallel_pytorch.rst deleted file mode 100644 index d8de621147..0000000000 --- a/doc/api/training/sdp_versions/v1.1.x/smd_data_parallel_pytorch.rst +++ /dev/null @@ -1,533 +0,0 @@ -############################################################## -PyTorch Guide to SageMaker's distributed data parallel library -############################################################## - -.. admonition:: Contents - - - :ref:`pytorch-sdp-modify-11x` - - :ref:`pytorch-sdp-api-11x` - -.. _pytorch-sdp-modify-11x: - - -Modify a PyTorch training script to use SageMaker data parallel -====================================================================== - -The following steps show you how to convert a PyTorch training script to -utilize SageMaker's distributed data parallel library. - -The distributed data parallel library APIs are designed to be close to PyTorch Distributed Data -Parallel (DDP) APIs. -See `SageMaker distributed data parallel PyTorch examples `__ for additional details on how to implement the data parallel library -API offered for PyTorch. - - -- First import the distributed data parallel library’s PyTorch client and initialize it. You also import - the distributed data parallel library module for distributed training. - - .. 
code:: python - - import smdistributed.dataparallel.torch.distributed as dist - - from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as DDP - - dist.init_process_group() - - -- Pin each GPU to a single distributed data parallel library process with ``local_rank`` - this - refers to the relative rank of the process within a given node. - ``smdistributed.dataparallel.torch.get_local_rank()`` API provides - you the local rank of the device. The leader node will be rank 0, and - the worker nodes will be rank 1, 2, 3, and so on. This is invoked in - the next code block as ``dist.get_local_rank()``. - - .. code:: python - - torch.cuda.set_device(dist.get_local_rank()) - - -- Then wrap the PyTorch model with the distributed data parallel library’s DDP. - - .. code:: python - - model = ... - # Wrap model with SageMaker's DistributedDataParallel - model = DDP(model) - - -- Modify the ``torch.utils.data.distributed.DistributedSampler`` to - include the cluster’s information. Set``num_replicas`` to the - total number of GPUs participating in training across all the nodes - in the cluster. This is called ``world_size``. You can get - ``world_size`` with - ``smdistributed.dataparallel.torch.get_world_size()`` API. This is - invoked in the following code as ``dist.get_world_size()``. Also - supply the node rank using - ``smdistributed.dataparallel.torch.get_rank()``. This is invoked as - ``dist.get_rank()``. - - .. code:: python - - train_sampler = DistributedSampler(train_dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank()) - - -- Finally, modify your script to save checkpoints only on the leader - node. The leader node will have a synchronized model. This also - avoids worker nodes overwriting the checkpoints and possibly - corrupting the checkpoints. - -.. code:: python - - if dist.get_rank() == 0: - torch.save(...) - - -All put together, the following is an example PyTorch training script -you will have for distributed training with the distributed data parallel library: - -.. code:: python - - # Import distributed data parallel library PyTorch API - import smdistributed.dataparallel.torch.distributed as dist - - # Import distributed data parallel library PyTorch DDP - from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as DDP - - # Initialize distributed data parallel library - dist.init_process_group() - - class Net(nn.Module): -     ... -     # Define model - - def train(...): -     ... -     # Model training - - def test(...): -     ... -     # Model evaluation - - def main(): - -     # Scale batch size by world size -     batch_size //= dist.get_world_size() // 8 -     batch_size = max(batch_size, 1) - -     # Prepare dataset -     train_dataset = torchvision.datasets.MNIST(...) - -     # Set num_replicas and rank in DistributedSampler -     train_sampler = torch.utils.data.distributed.DistributedSampler( -             train_dataset, -             num_replicas=dist.get_world_size(), -             rank=dist.get_rank()) - -     train_loader = torch.utils.data.DataLoader(..) - -     # Wrap the PyTorch model with distributed data parallel library’s DDP -     model = DDP(Net().to(device)) - -     # Pin each GPU to a single distributed data parallel library process. -     torch.cuda.set_device(local_rank) -     model.cuda(local_rank) - -     # Train -     optimizer = optim.Adadelta(...) -     scheduler = StepLR(...) -     for epoch in range(1, args.epochs + 1): -         train(...) 
-         if rank == 0: -             test(...) -         scheduler.step() - -     # Save model on master node. -     if dist.get_rank() == 0: -         torch.save(...) - - if __name__ == '__main__': -     main() - - -.. _pytorch-sdp-api-11x: - - -PyTorch API -=========== - -.. rubric:: Supported versions - -**PyTorch 1.7.1, 1.8.1** - - -.. function:: smdistributed.dataparallel.torch.distributed.is_available() - :noindex: - - Check if script started as a distributed job. For local runs user can - check that is_available returns False and run the training script - without calls to ``smdistributed.dataparallel``. - - **Inputs:** - - - ``None`` - - **Returns:** - - - ``True`` if started as a distributed job, ``False`` otherwise - - -.. function:: smdistributed.dataparallel.torch.distributed.init_process_group(*args, **kwargs) - :noindex: - - Initialize ``smdistributed.dataparallel``. Must be called at the - beginning of the training script, before calling any other methods. - ​ - Process group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with ``torch.distributed`` only. Only - supported value is - ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - ​ - After this - call, ``smdistributed.dataparallel.torch.distributed.is_initialized()`` will - return ``True``. - ​ - - **Inputs:** - - - ``None`` - - **Returns:** - - - ``None`` - - -.. function:: smdistributed.dataparallel.torch.distributed.is_initialized() - :noindex: - - Checks if the default process group has been initialized. - - **Inputs:** - - - ``None`` - - **Returns:** - - - ``True`` if initialized, else ``False``. - - -.. function:: smdistributed.dataparallel.torch.distributed.get_world_size(group=smdistributed.dataparallel.torch.distributed.group.WORLD) - :noindex: - - The total number of GPUs across all the nodes in the cluster. For - example, in a 8 node cluster with 8 GPU each, size will be equal to 64. - - **Inputs:** - - - ``group (smdistributed.dataparallel.torch.distributed.group) (optional):`` Process - group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with torch.distributed only. Only - supported value is - ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - **Returns:** - - - An integer scalar containing the total number of GPUs in the training - job, across all nodes in the cluster. - - -.. function:: smdistributed.dataparallel.torch.distributed.get_rank(group=smdistributed.dataparallel.torch.distributed.group.WORLD) - :noindex: - - The rank of the node in the cluster. The rank ranges from 0 to number of - nodes - 1. This is similar to MPI's World Rank. - - - **Inputs:** - - - ``group (smdistributed.dataparallel.torch.distributed.group) (optional):`` Process - group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with torch.distributed only. Only - supported value is - ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - **Returns:** - - - An integer scalar containing the rank of the worker node. - - -.. function:: smdistributed.dataparallel.torch.distributed.get_local_rank() - :noindex: - - Local rank refers to the relative rank of - the ``smdistributed.dataparallel`` process within the node the current - process is running on. For example, if a node contains 8 GPUs, it has - 8 ``smdistributed.dataparallel`` processes. Each process has - a ``local_rank`` ranging from 0 to 7. 
- - **Inputs:** - - - ``None`` - - **Returns:** - - - An integer scalar containing the rank of the GPU and - its ``smdistributed.dataparallel`` process. - - -.. function:: smdistributed.dataparallel.torch.distributed.all_reduce(tensor, op=smdistributed.dataparallel.torch.distributed.ReduceOp.SUM, group=smdistributed.dataparallel.torch.distributed.group.WORLD, async_op=False) - :noindex: - - Performs an all-reduce operation on a tensor (torch.tensor) across - all ``smdistributed.dataparallel`` workers - - ``smdistributed.dataparallel`` AllReduce API can be used for all - reducing gradient tensors or any other tensors.  By - default, ``smdistributed.dataparallel`` AllReduce reduces the tensor - data across all ``smdistributed.dataparallel`` workers in such a way - that all get the final result. - - After the call ``tensor`` is going to be bitwise identical in all - processes. - - **Inputs:** - - - ``tensor (torch.tensor) (required):`` Input and output of the collective. The function operates in-place. - - - ``op (smdistributed.dataparallel.torch.distributed.ReduceOp) (optional)``: The reduction operation to combine tensors across different ranks.  Defaults to ``SUM`` if None is given. - - * Supported ops: ``AVERAGE``, ``SUM``, ``MIN``, ``MAX`` - - - ``group (smdistributed.dataparallel.torch.distributed.group) (optional):`` Process group is not supported in ``smdistributed.dataparallel``. This parameter exists for API parity with torch.distributed only. - - * Only supported value is ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - - ``async_op (bool) (optional):`` Whether this op should be an async op. Defaults to ``False``. - - **Returns:** - - - Async op work handle, if async_op is set to True. ``None``, - otherwise. - - .. rubric:: Notes - - ``smdistributed.dataparallel.torch.distributed.allreduce``, in most - cases, is ~2X slower than all-reducing - with ``smdistributed.dataparallel.torch.parallel.distributed.DistributedDataParallel`` and - hence, it is not recommended to be used for performing gradient - reduction during the training - process. ``smdistributed.dataparallel.torch.distributed.allreduce`` internally - uses NCCL AllReduce with ``ncclSum`` as the reduction operation. - - -.. function:: smdistributed.dataparallel.torch.distributed.broadcast(tensor, src=0, group=smdistributed.dataparallel.torch.distributed.group.WORLD, async_op=False) - :noindex: - - Broadcasts the tensor (torch.tensor) to the whole group. - - ``tensor`` must have the same number of elements as GPUs in the - cluster. - - **Inputs:** - - - ``tensor (torch.tensor)(required)`` - - - ``src (int)(optional)`` - - - ``group (smdistributed.dataparallel.torch.distributed.group)(optional):`` Process group is not supported in ``smdistributed.dataparallel``. This parameter exists for API parity with ``torch.distributed`` only. - - * Only supported value is ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - - ``async_op (bool)(optional):`` Whether this op should be an async op. Defaults to ``False``. - - **Returns:** - - - Async op work handle, if async_op is set to True. ``None``, otherwise. - - -.. function:: smdistributed.dataparallel.torch.distributed.all_gather(tensor_list, tensor, group=smdistributed.dataparallel.torch.distributed.group.WORLD, async_op=False) - :noindex: - - Gathers tensors from the whole group in a list. - - - **Inputs:** - - - ``tensor_list (list[torch.tensor])(required):`` Output list. 
It - should contain correctly-sized tensors to be used for output of the - collective. - - ``tensor (torch.tensor)(required):`` Tensor to be broadcast from - current process. - - ``group (smdistributed.dataparallel.torch.distributed.group)(optional):`` Process - group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with torch.distributed only. Only - supported value is - ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - ``async_op (bool)(optional):`` Whether this op should be an async op. - Defaults to ``False``. - - **Returns:** - - - Async op work handle, if async_op is set to True. ``None``, - otherwise. - - -.. function:: smdistributed.dataparallel.torch.distributed.all_to_all_single(output_t, input_t, output_split_sizes=None, input_split_sizes=None, group=group.WORLD, async_op=False) - :noindex: - - Each process scatters the input tensor to all processes in the group and returns the gathered result in the output tensor. - - **Inputs:** - - - output_t - - input_t - - output_split_sizes - - input_split_sizes - - ``group (smdistributed.dataparallel.torch.distributed.group)(optional):`` Process - group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with torch.distributed only. Only - supported value is - ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - ``async_op (bool)(optional):`` Whether this op should be an async op. - Defaults to ``False``. - - **Returns:** - - - Async op work handle, if async_op is set to True. ``None``, - otherwise. - - -.. function:: smdistributed.dataparallel.torch.distributed.barrier(group=smdistributed.dataparallel.torch.distributed.group.WORLD, async_op=False) - :noindex: - - Synchronizes all ``smdistributed.dataparallel`` processes. - - **Inputs:** - - - ``group (smdistributed.dataparallel.torch.distributed.group)(optional):`` Process - group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with torch.distributed only. - - * Only supported value is ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` - - - ``async_op (bool)(optional):`` Whether this op should be an async op. - Defaults to ``False``. - - **Returns:** - - - Async op work handle, if async_op is set to True. ``None``, - otherwise. - - -.. class:: smdistributed.dataparallel.torch.parallel.DistributedDataParallel(module, device_ids=None, output_device=None, broadcast_buffers=True, process_group=None, bucket_cap_mb=None) - :noindex: - - ``smdistributed.dataparallel``'s implementation of distributed data - parallelism for PyTorch. In most cases, wrapping your PyTorch Module - with ``smdistributed.dataparallel``'s ``DistributedDataParallel (DDP)`` is - all you need to do to use ``smdistributed.dataparallel``. - - Creation of this DDP class requires ``smdistributed.dataparallel`` to - already be initialized - with ``smdistributed.dataparallel.torch.distributed.init_process_group()``. - - This container parallelizes the application of the given module by - splitting the input across the specified devices by chunking in the - batch dimension. The module is replicated on each machine and each - device, and each such replica handles a portion of the input. During the - backwards pass, gradients from each node are averaged.
- - The batch size should be larger than the number of GPUs used locally. - ​ - Example usage - of ``smdistributed.dataparallel.torch.parallel.DistributedDataParallel``: - - .. code:: python - - import torch - import smdistributed.dataparallel.torch.distributed as dist - from smdistributed.dataparallel.torch.parallel import DistributedDataParallel as DDP - - dist.init_process_group() - - # Pin GPU to be used to process local rank (one GPU per process) - torch.cuda.set_device(dist.get_local_rank()) - - # Build model and optimizer - model = ... - optimizer = torch.optim.SGD(model.parameters(), -                             lr=1e-3 * dist.get_world_size()) - # Wrap model with smdistributed.dataparallel's DistributedDataParallel - model = DDP(model) - - **Parameters:** - - - ``module (torch.nn.Module)(required):`` PyTorch NN Module to be - parallelized - - ``device_ids (list[int])(optional):`` CUDA devices. This should only - be provided when the input module resides on a single CUDA device. - For single-device modules, - the ``ith module replica is placed on device_ids[i]``. For - multi-device modules and CPU modules, device_ids must be None or an - empty list, and input data for the forward pass must be placed on the - correct device. Defaults to ``None``. - - ``output_device (int)(optional):`` Device location of output for - single-device CUDA modules. For multi-device modules and CPU modules, - it must be None, and the module itself dictates the output location. - (default: device_ids[0] for single-device modules).  Defaults - to ``None``. - - ``broadcast_buffers (bool)(optional):`` Flag that enables syncing - (broadcasting) buffers of the module at beginning of the forward - function. ``smdistributed.dataparallel`` does not support broadcast - buffer yet. Please set this to ``False``. - - ``process_group(smdistributed.dataparallel.torch.distributed.group)(optional):`` Process - group is not supported in ``smdistributed.dataparallel``. This - parameter exists for API parity with torch.distributed only. Only - supported value is - ``smdistributed.dataparallel.torch.distributed.group.WORLD.`` Defaults - to ``None.`` - - ``bucket_cap_mb (int)(optional):`` DistributedDataParallel will - bucket parameters into multiple buckets so that gradient reduction of - each bucket can potentially overlap with backward - computation. ``bucket_cap_mb`` controls the bucket size in - MegaBytes (MB) (default: 25). - - .. rubric:: Notes - - - This module assumes all parameters are registered in the model by the - time it is created. No parameters should be added nor removed later. - - This module assumes all parameters are registered in the model of - each distributed processes are in the same order. The module itself - will conduct gradient all-reduction following the reverse order of - the registered parameters of the model. In other words, it is users’ - responsibility to ensure that each distributed process has the exact - same model and thus the exact same parameter registration order. - - You should never change the set of your model’s parameters after - wrapping up your model with DistributedDataParallel. In other words, - when wrapping up your model with DistributedDataParallel, the - constructor of DistributedDataParallel will register the additional - gradient reduction functions on all the parameters of the model - itself at the time of construction. 
If you change the model’s - parameters after the DistributedDataParallel construction, this is - not supported and unexpected behaviors can happen, since some - parameters’ gradient reduction functions might not get called. - - -.. class:: smdistributed.dataparallel.torch.distributed.ReduceOp - :noindex: - - An enum-like class for supported reduction operations - in ``smdistributed.dataparallel``. - - The values of this class can be accessed as attributes, for - example, ``ReduceOp.SUM``. They are used in specifying strategies for - reduction collectives such as -  ``smdistributed.dataparallel.torch.distributed.all_reduce(...)``. - - - ``AVERAGE`` - - ``SUM`` - - ``MIN`` - - ``MAX`` diff --git a/doc/api/training/sdp_versions/v1.1.x/smd_data_parallel_tensorflow.rst b/doc/api/training/sdp_versions/v1.1.x/smd_data_parallel_tensorflow.rst deleted file mode 100644 index 09120b2ded..0000000000 --- a/doc/api/training/sdp_versions/v1.1.x/smd_data_parallel_tensorflow.rst +++ /dev/null @@ -1,558 +0,0 @@ -################################################################# -TensorFlow Guide to SageMaker's distributed data parallel library -################################################################# - -.. admonition:: Contents - - - :ref:`tensorflow-sdp-modify-11x` - - :ref:`tensorflow-sdp-api-11x` - -.. _tensorflow-sdp-modify-11x: - - -Modify a TensorFlow 2.x training script to use SageMaker data parallel -====================================================================== - -The following steps show you how to convert a TensorFlow 2.x training -script to utilize the distributed data parallel library. - -The distributed data parallel library APIs are designed to be close to Horovod APIs. -See `SageMaker distributed data parallel TensorFlow examples -`__ -for additional details on how to implement the data parallel library. - -- First import the distributed data parallel library’s TensorFlow client and initialize it: - - .. code:: python - - import smdistributed.dataparallel.tensorflow as sdp - sdp.init() - - -- Pin each GPU to a single smdistributed.dataparallel process - with ``local_rank`` - this refers to the relative rank of the - process within a given node. ``sdp.tensorflow.local_rank()`` API - provides you the local rank of the device. The leader node will be - rank 0, and the worker nodes will be rank 1, 2, 3, and so on. This is - invoked in the next code block as ``sdp.local_rank()``. - ``set_memory_growth`` is not directly related to SMD, but must be set - for distributed training with TensorFlow. - - .. code:: python - - gpus = tf.config.experimental.list_physical_devices('GPU') - for gpu in gpus: -     tf.config.experimental.set_memory_growth(gpu, True) - if gpus: -     tf.config.experimental.set_visible_devices(gpus[sdp.local_rank()], 'GPU') - - -- Scale the learning rate by the number of workers. - ``sdp.tensorflow.size()`` API provides you number of workers in the - cluster. This is invoked in the next code block as ``sdp.size()``. - - .. code:: python - - learning_rate = learning_rate * sdp.size() - - -- Use the library’s ``DistributedGradientTape`` to optimize AllReduce - operations during training. This wraps ``tf.GradientTape``. - - .. 
code:: python - - with tf.GradientTape() as tape: -       output = model(input) -       loss_value = loss(label, output) - - # Wrap tf.GradientTape with the library's DistributedGradientTape - tape = sdp.DistributedGradientTape(tape) - - -- Broadcast initial model variables from the leader node (rank 0) to - all the worker nodes (ranks 1 through n). This is needed to ensure a - consistent initialization across all the worker ranks. For this, you - use ``sdp.tensorflow.broadcast_variables`` API after the - model and optimizer variables are initialized. This is invoked in the - next code block as ``sdp.broadcast_variables()``. - - .. code:: python - - sdp.broadcast_variables(model.variables, root_rank=0) - sdp.broadcast_variables(opt.variables(), root_rank=0) - - -- Finally, modify your script to save checkpoints only on the leader - node. The leader node will have a synchronized model. This also - avoids worker nodes overwriting the checkpoints and possibly - corrupting the checkpoints. - - .. code:: python - - if sdp.rank() == 0: -     checkpoint.save(checkpoint_dir) - - -All put together, the following is an example TensorFlow2 training -script you will have for distributed training with the library. - -.. code:: python - - import tensorflow as tf - - # Import the library's TF API - import smdistributed.dataparallel.tensorflow as sdp - - # Initialize the library - sdp.init() - - gpus = tf.config.experimental.list_physical_devices('GPU') - for gpu in gpus: -     tf.config.experimental.set_memory_growth(gpu, True) - if gpus: -     # Pin GPUs to a single process -     tf.config.experimental.set_visible_devices(gpus[sdp.local_rank()], 'GPU') - - # Prepare Dataset - dataset = tf.data.Dataset.from_tensor_slices(...) - - # Define Model - mnist_model = tf.keras.Sequential(...) - loss = tf.losses.SparseCategoricalCrossentropy() - - # Scale Learning Rate - # LR for 8 node run : 0.000125 - # LR for single node run : 0.001 - opt = tf.optimizers.Adam(0.000125 * sdp.size()) - - @tf.function - def training_step(images, labels, first_batch): -     with tf.GradientTape() as tape: -         probs = mnist_model(images, training=True) -         loss_value = loss(labels, probs) - -     # Wrap tf.GradientTape with the library's DistributedGradientTape -     tape = sdp.DistributedGradientTape(tape) - -     grads = tape.gradient(loss_value, mnist_model.trainable_variables) -     opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) - -     if first_batch: -        # Broadcast model and optimizer variables -        sdp.broadcast_variables(mnist_model.variables, root_rank=0) -        sdp.broadcast_variables(opt.variables(), root_rank=0) - -     return loss_value - - ... - - # Save checkpoints only from master node. - if sdp.rank() == 0: -     checkpoint.save(checkpoint_dir) - - -.. _tensorflow-sdp-api-11x: - - -TensorFlow API -============== - -.. rubric:: Supported versions - -Use version 1.0.0 or version 1.2.0 or later of ``smdistributed.dataparallel`` to use this -library with TensorFlow. - -.. function:: smdistributed.dataparallel.tensorflow.init() - :noindex: - - Initialize ``smdistributed.dataparallel``. Must be called at the - beginning of the training script. - - - **Inputs:** - - - ``None`` - - **Returns:** - - - ``None`` - - - .. rubric:: Notes - - ``init()`` needs to be called only once. It will throw an error if - called more than once: - - ``init() called more than once. smdistributed.dataparallel is already initialized.`` - - -.. 
function:: smdistributed.dataparallel.tensorflow.size() - :noindex: - - The total number of GPUs across all the nodes in the cluster. For - example, in a 8 node cluster with 8 GPUs each, ``size`` will be equal - to 64. - - - **Inputs:** - - - ``None`` - - **Returns:** - - - An integer scalar containing the total number of GPUs, across all - nodes in the cluster. - - -.. function:: smdistributed.dataparallel.tensorflow.local_size() - :noindex: - - The total number of GPUs on a node. For example, on a node with 8 - GPUs, ``local_size`` will be equal to 8. - - **Inputs:** - - - ``None`` - - **Returns:** - - - An integer scalar containing the total number of GPUs on itself. - - -.. function:: smdistributed.dataparallel.tensorflow.rank() - :noindex: - - The rank of the node in the cluster. The rank ranges from 0 to number of - nodes - 1. This is similar to MPI's World Rank. - - **Inputs:** - - - ``None`` - - **Returns:** - - - An integer scalar containing the rank of the node. - - -.. function:: smdistributed.dataparallel.tensorflow.local_rank() - :noindex: - - Local rank refers to the relative rank of the - GPUs’ ``smdistributed.dataparallel`` processes within the node. For - example, if a node contains 8 GPUs, it has - 8 ``smdistributed.dataparallel`` processes, then each process will - get a local rank ranging from 0 to 7. - - **Inputs:** - - - ``None`` - - **Returns:** - - - An integer scalar containing the rank of the GPU and - its ``smdistributed.dataparallel`` process. - - -.. function:: smdistributed.dataparallel.tensorflow.allreduce(tensor, param_index, num_params, compression=Compression.none, op=ReduceOp.AVERAGE) - :noindex: - - Performs an all-reduce operation on a tensor (``tf.Tensor``). - - ``smdistributed.dataparallel`` AllReduce API can be used for all - reducing gradient tensors or any other tensors. By - default, ``smdistributed.dataparallel`` AllReduce averages the - tensors across the participating workers. - ​ - **Inputs:** - - - ``tensor (tf.Tensor)(required)``: The tensor to be all-reduced. The shape of the input must be identical across all ranks. - - ``param_index (int)(required):`` 0 if you are reducing a single tensor. Index of the tensor if you are reducing a list of tensors. - - ``num_params (int)(required):`` len(tensor). - - ``compression (smdistributed.dataparallel.tensorflow.Compression)(optional)``: Compression algorithm used to reduce the amount of data sent and received by each worker node. Defaults to not using compression. - - *  Supported compression types - ``none``, ``fp16`` - - - ``op (optional)(smdistributed.dataparallel.tensorflow.ReduceOp)``: The reduction operation to combine tensors across different ranks. Defaults to ``Average`` if None is given. - - * Supported ops: ``SUM``, ``MIN``, ``MAX``, ``AVERAGE`` - - **Returns:** - - - A tensor of the same shape and type as input ``tensor``, all-reduced across all the processes. - - -.. function:: smdistributed.dataparallel.tensorflow.broadcast_global_variables(root_rank) - :noindex: - - Broadcasts all global variables from root rank to all other processes. - - **Inputs:** - - - ``root_rank (int)(required):`` Rank of the process from which global - variables will be broadcasted to all other processes. - - **Returns:** - - - ``None`` - - -.. function:: smdistributed.dataparallel.tensorflow.broadcast_variables(variables, root_rank) - :noindex: - - Applicable for TensorFlow 2.x only. - ​ - Broadcasts variables from root rank to all other processes. 
- - With TensorFlow 2.x, ``broadcast_variables`` is used to - broadcast ``model.variables`` and ``optimizer.variables`` post - initialization from the leader node to all the worker nodes. This - ensures a consistent initialization across all the worker ranks. - - **Inputs:** - - - ``variables (tf.Variable)(required):`` Variables to be broadcasted. - - ``root_rank (int)(required):`` Rank of the process from which - variables will be broadcasted to all other processes. - - **Returns:** - - - ``None`` - - -.. function:: smdistributed.dataparallel.tensorflow.oob_allreduce(tensor, compression=Compression.none, op=ReduceOp.AVERAGE) - :noindex: - - Out-of-band (oob) AllReduce is a simplified AllReduce function for use cases - such as calculating the total loss across all the GPUs during training. - ``oob_allreduce`` averages the tensors across the - worker nodes as its reduction operation. - - **Inputs:** - - - ``tensor (tf.Tensor)(required)``: The tensor to be all-reduced. The shape of the input must be identical across all worker nodes. - - ``compression`` (optional): Compression algorithm used to reduce the amount of data sent and received by each worker node. Defaults to not using compression. - - *  Supported compression types - ``none``, ``fp16`` - - - ``op (smdistributed.dataparallel.tensorflow.ReduceOp)(optional)``: The reduction operation to combine tensors across different worker nodes. Defaults to ``Average`` if None is given. - - * Supported ops: ``AVERAGE`` - - **Returns:** - - - ``None`` - - .. rubric:: Notes - - ``smdistributed.dataparallel.tensorflow.oob_allreduce``, in most - cases, is ~2x slower - than ``smdistributed.dataparallel.tensorflow.allreduce``, so it is not - recommended for performing gradient reduction during the - training - process. ``smdistributed.dataparallel.tensorflow.oob_allreduce`` internally - uses NCCL AllReduce with ``ncclSum`` as the reduction operation. - - -.. function:: smdistributed.dataparallel.tensorflow.overlap(tensor) - :noindex: - - This function is applicable only for models compiled with XLA. Use this - function to enable ``smdistributed.dataparallel`` to efficiently - overlap the backward pass with the AllReduce operation. - - Example usage: - - .. code:: python - - layer = tf.nn.dropout(...) # Or any other layer - layer = smdistributed.dataparallel.tensorflow.overlap(layer) - - The overlap operation is inserted into the TF graph as a node. It - behaves as an identity operation, and helps in achieving the - communication overlap with the backward pass operation. - - **Inputs:** - - - ``tensor (tf.Tensor)(required):`` The tensor to be all-reduced. - - **Returns:** - - - ``None`` - - .. rubric:: Notes - - This operation helps speed up distributed training, as - the AllReduce operation does not have to wait for all the gradients to - be ready. Backward propagation proceeds sequentially from the output - layer of the network to the input layer. When the gradient computation - for a layer finishes, ``smdistributed.dataparallel`` adds the gradients to a - fusion buffer. As soon as the size of the fusion buffer reaches a - predefined threshold (25 MB), ``smdistributed.dataparallel`` starts - the AllReduce operation. - - -.. function:: smdistributed.dataparallel.tensorflow.broadcast(tensor, root_rank) - :noindex: - - Broadcasts the input tensor on the root rank to the same input tensor on all - other ``smdistributed.dataparallel`` processes. - - The broadcast will not start until all processes are ready to send and - receive the tensor.
- - **Inputs:** - - - ``tensor (tf.Tensor)(required):`` The tensor to be broadcasted. - - ``root_rank (int)(required):`` Rank of the process from which - tensor will be broadcasted to all other processes. - - **Returns:** - - - A tensor of the same shape and type as tensor, with the value - broadcasted from root rank. - - -.. function:: smdistributed.dataparallel.tensorflow.shutdown() - :noindex: - - Shuts down ``smdistributed.dataparallel``. Optional to call at the end - of the training script. - - **Inputs:** - - - ``None`` - - **Returns:** - - - ``None`` - - -.. function:: smdistributed.dataparallel.tensorflow.DistributedOptimizer - :noindex: - - Applicable if you use the ``tf.estimator`` API in TensorFlow 2.x (2.3.1). - ​ - Construct a new ``DistributedOptimizer`` , which uses TensorFlow - optimizer under the hood for computing single-process gradient values - and applying gradient updates after the gradient values have been - combined across all ``smdistributed.dataparallel`` workers. - ​ - Example usage: - - .. code:: python - - opt = ... # existing optimizer from tf.train package or your custom optimizer - opt = smdistributed.dataparallel.tensorflow.DistributedOptimizer(opt) - - - - ``optimizer (tf.train.Optimizer)(required):`` TF Optimizer to use for computing gradients and applying updates. - - - ``name (str)(optional):`` Name prefix for the operations created when applying gradients. Defaults to ``smdistributed.dataparallel`` followed by provided optimizer type. - - - ``use_locking (bool)(optional):`` Whether to use locking when updating variables. Defaults to ``False``. - - - ``device_dense:`` Not supported. Raises not supported error. - - - ``device_sparse:`` Not supported. Raises not supported error. - - - ``compression (smdistributed.dataparallel.tensorflow.Compression)(optional)``: Compression algorithm used to reduce the amount of data sent and received by each worker node. Defaults to not using compression. - - *  Supported compression types - ``none``, ``fp16`` - - - ``sparse_as_dense:`` Treats sparse gradient tensor as dense tensor. Defaults to ``False``. - - - ``op (smdistributed.dataparallel.tensorflow.ReduceOp)(optional)``: The reduction operation to combine tensors across different ranks. Defaults to ``Average`` if None is given. - - * Supported ops: ``AVERAGE`` - - - ``bucket_cap_mb (int)(optional):`` Size of ``smdistributed.dataparallel`` fusion buffer size. Defaults to 25MB that works optimally for most case. If you provide a value, expects the (value * 1024 * 1024) i.e., bytes to be multiple of 128. - - -.. function:: smdistributed.dataparallel.tensorflow.DistributedGradientTape - :noindex: - - Applicable to TensorFlow 2.x only. - - Construct a new ``DistributedGradientTape``, which uses - TensorFlow’s ``GradientTape`` under the hood, using an AllReduce to - combine gradient values before applying gradients to model weights. - ​ - Example Usage: - - .. code:: python - - with tf.GradientTape() as tape: -       output = model(input) -       loss_value = loss(label, output) - - # Wrap in smdistributed.dataparallel's DistributedGradientTape - tape = smdistributed.dataparallel.tensorflow.DistributedGradientTape(tape) - - - - ``gradtape (tf.GradientTape)(required):`` GradientTape to use for computing gradients and applying updates. - - - ``device_dense:`` Not supported. Raises not supported error. - - - ``device_sparse:`` Not supported. Raises not supported error. 
- - - ``compression (smdistributed.dataparallel.tensorflow.Compression)(optional)``: Compression algorithm used to reduce the amount of data sent and received by each worker node. Defaults to not using compression. - - *  Supported compression types - ``none``, ``fp16`` - - - ``sparse_as_dense:`` Treats sparse gradient tensor as dense tensor. Defaults to ``False``. - - - ``op (smdistributed.dataparallel.tensorflow.ReduceOp)(optional)``: The reduction operation to combine tensors across different ranks. Defaults to ``Average`` if None is given. - - * Supported ops: ``AVERAGE`` - - -.. function:: smdistributed.dataparallel.tensorflow.BroadcastGlobalVariablesHook - :noindex: - - Applicable if you use the ``tf.estimator`` API in TensorFlow 2.x (2.3.1). - - - ``SessionRunHook`` that will broadcast all global variables from root - rank to all other processes during initialization. - ​ - This is necessary to ensure consistent initialization of all workers - when training is started with random weights or restored from a - checkpoint. - ​ - Example Usage: - - .. code:: python - - hooks = [smdistributed.dataparallel.tensorflow.BroadcastGlobalVariablesHook(root_rank=0)] - ... - with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, -                                        hooks=hooks, -                                        config=config) as mon_sess: -      ... - - - - ``root_rank (int)(required):`` Rank of the process from which global - variables will be broadcasted to all other processes. - - -.. function:: smdistributed.dataparallel.tensorflow.Compression - :noindex: - - Optional Gradient Compression algorithm that can be used in AllReduce - operation. - - - ``none``: alias for ``NoneCompression``. Do not compression gradient - tensors. - - ``fp16``: alias for ``FP16Compression``. Compress the floating point - gradient tensors to 16-bit (FP16) - - -.. function:: smdistributed.dataparallel.tensorflow.ReduceOp - :noindex: - - Supported reduction operations in ``smdistributed.dataparallel``. - - - ``AVERAGE`` - - ``SUM`` - - ``MIN`` - - ``MAX`` diff --git a/doc/api/training/sdp_versions/v1_0_0.rst b/doc/api/training/sdp_versions/v1_0_0.rst deleted file mode 100644 index 291e89921d..0000000000 --- a/doc/api/training/sdp_versions/v1_0_0.rst +++ /dev/null @@ -1,9 +0,0 @@ - -Version 1.0.0 -============= - -.. toctree:: - :maxdepth: 1 - - v1.0.0/smd_data_parallel_pytorch.rst - v1.0.0/smd_data_parallel_tensorflow.rst diff --git a/doc/api/training/sdp_versions/v1_1_x.rst b/doc/api/training/sdp_versions/v1_1_x.rst deleted file mode 100644 index 90c3a984a7..0000000000 --- a/doc/api/training/sdp_versions/v1_1_x.rst +++ /dev/null @@ -1,9 +0,0 @@ - -Version 1.1.x -============= - -.. toctree:: - :maxdepth: 1 - - v1.1.x/smd_data_parallel_pytorch.rst - v1.1.x/smd_data_parallel_tensorflow.rst diff --git a/doc/api/training/smd_data_parallel.rst b/doc/api/training/smd_data_parallel.rst deleted file mode 100644 index 14f70a777f..0000000000 --- a/doc/api/training/smd_data_parallel.rst +++ /dev/null @@ -1,110 +0,0 @@ -############################################### -The SageMaker Distributed Data Parallel Library -############################################### - -SageMaker's distributed data parallel library extends SageMaker’s training -capabilities on deep learning models with near-linear scaling efficiency, -achieving fast time-to-train with minimal code changes. 
- -When training a model on a large amount of data, machine learning practitioners -will often turn to distributed training to reduce the time to train. -In some cases, where time is of the essence, -the business requirement is to finish training as quickly as possible or at -least within a constrained time period. -Then, distributed training is scaled to use a cluster of multiple nodes, -meaning not just multiple GPUs in a computing instance, but multiple instances -with multiple GPUs. However, as the cluster size increases, it is possible to see a significant drop -in performance due to communications overhead between nodes in a cluster. - -SageMaker's distributed data parallel library addresses communications overhead in two ways: - -1. The library performs AllReduce, a key operation during distributed training that is responsible for a - large portion of communication overhead. -2. The library performs optimized node-to-node communication by fully utilizing AWS’s network - infrastructure and Amazon EC2 instance topology. - -To learn more about the core features of this library, see -`Introduction to SageMaker's Distributed Data Parallel Library -`_ -in the SageMaker Developer Guide. - -Use with the SageMaker Python SDK -================================= - -To use the SageMaker distributed data parallel library with the SageMaker Python SDK, you will need the following: - -- A TensorFlow or PyTorch training script that is - adapted to use the distributed data parallel library. The :ref:`sdp_api_docs` includes - framework specific examples of training scripts that are adapted to use this library. -- Your input data must be in an S3 bucket or in FSx in the AWS region - that you will use to launch your training job. If you use the Jupyter - notebooks provided, create a SageMaker notebook instance in the same - region as the bucket that contains your input data. For more - information about storing your training data, refer to - the `SageMaker Python SDK data - inputs `__ documentation. - -When you define -a Pytorch or TensorFlow ``Estimator`` using the SageMaker Python SDK, -you must select ``dataparallel`` as your ``distribution`` strategy: - -.. code:: - - distribution = { "smdistributed": { "dataparallel": { "enabled": True } } } - -We recommend you use one of the example notebooks as your template to launch a training job. When -you use an example notebook you’ll need to swap your training script with the one that came with the -notebook and modify any input functions as necessary. For instructions on how to get started using a -Jupyter Notebook example, see `Distributed Training Jupyter Notebook Examples -`_. - -Once you have launched a training job, you can monitor it using CloudWatch. To learn more, see -`Monitor and Analyze Training Jobs Using Metrics -`_. - - -After you train a model, you can see how to deploy your trained model to an endpoint for inference by -following one of the `example notebooks for deploying a model -`_. -For more information, see `Deploy Models for Inference -`_. - -.. _sdp_api_docs: - -API Documentation -================= - -This section contains the SageMaker distributed data parallel API documentation. If you are a -new user of this library, it is recommended you use this guide alongside -`SageMaker's Distributed Data Parallel Library -`_. - -Select a version to see the API documentation for version. - -.. toctree:: - :maxdepth: 1 - - sdp_versions/latest.rst - sdp_versions/v1_1_x.rst - sdp_versions/v1_0_0.rst - -.. 
important:: - The distributed data parallel library only supports training jobs using CUDA 11. When you define a PyTorch or TensorFlow - ``Estimator`` with ``dataparallel`` parameter ``enabled`` set to ``True``, - it uses CUDA 11. When you extend or customize your own training image - you must use a CUDA 11 base image. See - `SageMaker Python SDK's distributed data parallel library APIs - `_ - for more information. - - -Release Notes -============= - -New features, bug fixes, and improvements are regularly made to the SageMaker -distributed data parallel library. - -.. toctree:: - :maxdepth: 1 - - smd_data_parallel_release_notes/smd_data_parallel_change_log diff --git a/doc/api/training/smd_data_parallel_release_notes/smd_data_parallel_change_log.rst b/doc/api/training/smd_data_parallel_release_notes/smd_data_parallel_change_log.rst deleted file mode 100644 index 8de575a218..0000000000 --- a/doc/api/training/smd_data_parallel_release_notes/smd_data_parallel_change_log.rst +++ /dev/null @@ -1,203 +0,0 @@ -.. _sdp_1.2.2_release_note: - -SageMaker Distributed Data Parallel 1.2.2 Release Notes -======================================================= - -*Date: November. 24. 2021* - -**New Features** - -* Added support for PyTorch 1.10 -* PyTorch ``no_sync`` API support for DistributedDataParallel -* Timeout when training stalls due to allreduce and broadcast collective calls - -**Bug Fixes** - -* Fixed a bug that would impact correctness in the mixed dtype case -* Fixed a bug related to the timeline writer that would cause a crash when SageMaker Profiler is enabled for single node jobs. - -**Improvements** - -* Performance optimizations for small models on small clusters - -**Migration to AWS Deep Learning Containers** - -This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers: - -- PyTorch 1.10 DLC release: `v1.0-pt-sagemaker-1.10.0-py38 `_ - - .. code:: - - 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.10.0-gpu-py38-cu113-ubuntu20.04-sagemaker - ----- - -Release History -=============== - -SageMaker Distributed Data Parallel 1.2.1 Release Notes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -*Date: June. 29. 2021* - -**New Features:** - -- Added support for TensorFlow 2.5.0. - -**Improvements** - -- Improved performance on a single node and small clusters (2-4 nodes). - -**Bug fixes** - -- Enable ``sparse_as_dense`` by default for SageMaker distributed data - parallel library for TensorFlow APIs: ``DistributedGradientTape`` and - ``DistributedOptimizer``. - -**Migration to AWS Deep Learning Containers** - -This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers: - -- TensorFlow 2.5.0 DLC release: `v1.0-tf-2.5.0-tr-py37 - `__ - - .. code:: - - 763104351884.dkr.ecr..amazonaws.com/tensorflow-training:2.5.0-gpu-py37-cu112-ubuntu18.04-v1.0 - - -SageMaker Distributed Data Parallel 1.2.0 Release Notes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- New features -- Bug Fixes - -**New features:** - -- Support of `EFA network - interface `__ for distributed - AllReduce. For best performance, it is recommended you use an - instance type that supports Amazon Elastic Fabric Adapter - (ml.p3dn.24xlarge and ml.p4d.24xlarge) when you train a model using - SageMaker Distributed data parallel. - -**Bug Fixes:** - -- Improved performance on single node and small clusters. 
- ----- - -SageMaker Distributed Data Parallel 1.1.2 Release Notes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- Bug Fixes -- Known Issues - -**Bug Fixes:** - -- Fixed a bug that caused some TensorFlow operations to not work with - certain data types. Operations forwarded from C++ have been extended - to support every dtype supported by NCCL. - -**Known Issues:** - -- SageMaker Distributed data parallel has slower throughput than NCCL - when run using a single node. For the best performance, use - multi-node distributed training with smdistributed.dataparallel. Use - a single node only for experimental runs while preparing your - training pipeline. - ----- - -SageMaker Distributed Data Parallel 1.1.1 Release Notes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- New Features -- Bug Fixes -- Known Issues - -**New Features:** - -- Adds support for PyTorch 1.8.1 - -**Bug Fixes:** - -- Fixes a bug that was causing gradients from one of the worker nodes - to be added twice resulting in incorrect ``all_reduce`` results under - some conditions. - -**Known Issues:** - -- SageMaker distributed data parallel still is not efficient when run - using a single node. For the best performance, use multi-node - distributed training with ``smdistributed.dataparallel``. Use a - single node only for experimental runs while preparing your training - pipeline. - ----- - -SageMaker Distributed Data Parallel 1.1.0 Release Notes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- New Features -- Bug Fixes -- Improvements -- Known Issues - -**New Features:** - -- Adds support for PyTorch 1.8.0 with CUDA 11.1 and CUDNN 8 - -**Bug Fixes:** - -- Fixes crash issue when importing ``smdataparallel`` before PyTorch - -**Improvements:** - -- Update ``smdataparallel`` name in python packages, descriptions, and - log outputs - -**Known Issues:** - -- SageMaker DataParallel is not efficient when run using a single node. - For the best performance, use multi-node distributed training with - ``smdataparallel``. Use a single node only for experimental runs - while preparing your training pipeline. - -Getting Started - -For getting started, refer to SageMaker Distributed Data Parallel Python -SDK Guide -(https://docs.aws.amazon.com/sagemaker/latest/dg/data-parallel-use-api.html#data-parallel-use-python-skd-api). - ----- - -SageMaker Distributed Data Parallel 1.0.0 Release Notes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- First Release -- Getting Started - -First Release -------------- - -SageMaker’s distributed data parallel library extends SageMaker’s -training capabilities on deep learning models with near-linear scaling -efficiency, achieving fast time-to-train with minimal code changes. -SageMaker Distributed Data Parallel: - -- optimizes your training job for AWS network infrastructure and EC2 - instance topology. -- takes advantage of gradient update to communicate between nodes with - a custom AllReduce algorithm. - -The library currently supports TensorFlow v2 and PyTorch via `AWS Deep -Learning -Containers `__. - -Getting Started ---------------- - -For getting started, refer to `SageMaker Distributed Data Parallel -Python SDK -Guide `__. 
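A minimal sketch of such a launch with the SageMaker Python SDK is shown below. It assumes a training script named ``train.py`` that has already been adapted for ``smdistributed.dataparallel``; the IAM role, S3 path, instance settings, and framework versions are placeholders, not values taken from this document, and should be replaced with a combination supported in your account and Region.

.. code:: python

    # Minimal, illustrative launch of a data parallel training job.
    # All names below (entry point, role, bucket, versions) are placeholders.
    from sagemaker.pytorch import PyTorch

    estimator = PyTorch(
        entry_point="train.py",            # script adapted for smdistributed.dataparallel
        role="<your-sagemaker-execution-role>",
        framework_version="1.8.1",         # use a framework version supported by the library
        py_version="py36",
        instance_count=2,                  # the library targets multi-node, multi-GPU jobs
        instance_type="ml.p3dn.24xlarge",  # EFA-capable instance type recommended above
        distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
    )

    estimator.fit("s3://<your-bucket>/<training-data-prefix>")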
diff --git a/doc/api/training/smd_model_parallel.rst b/doc/api/training/smd_model_parallel.rst deleted file mode 100644 index c40bc258fb..0000000000 --- a/doc/api/training/smd_model_parallel.rst +++ /dev/null @@ -1,61 +0,0 @@ -The SageMaker Distributed Model Parallel Library ------------------------------------------------- - -The Amazon SageMaker distributed model parallel library is a model parallelism library for training -large deep learning models that were previously difficult to train due to GPU memory limitations. -The library automatically and efficiently splits a model across multiple GPUs and instances and coordinates model training, -allowing you to increase prediction accuracy by creating larger models with more parameters. - -You can use the library to automatically partition your existing TensorFlow and PyTorch workloads -across multiple GPUs with minimal code changes. The library's API can be accessed through the Amazon SageMaker SDK. - -See the following sections to learn more about the SageMaker model parallel library APIs. - -.. toctree:: - :maxdepth: 3 - - smp_versions/latest - smd_model_parallel_general - - -.. tip:: - - We recommended using this API documentation with the conceptual guide at - `SageMaker's Distributed Model Parallel - `_ - in the *Amazon SageMaker developer guide*. This developer guide documentation includes: - - - An overview of model parallelism, and the library's - `core features `_, - and `extended features for PyTorch `_. - - Instructions on how to modify `TensorFlow - `_ - and `PyTorch - `_ - training scripts. - - Instructions on how to `run a distributed training job using the SageMaker Python SDK - and the SageMaker model parallel library - `_. - - `Configuration tips and pitfalls - `_. - - -.. important:: - The model parallel library only supports training jobs using CUDA 11. When you define a PyTorch or TensorFlow - ``Estimator`` with ``modelparallel`` parameter ``enabled`` set to ``True``, - it uses CUDA 11. When you extend or customize your own training image - you must use a CUDA 11 base image. See - `Extend or Adapt A Docker Container that Contains the Model Parallel Library - `__ - for more information. - -Release Notes -============= - -New features, bug fixes, and improvements are regularly made to the SageMaker -distributed model parallel library. - -.. toctree:: - :maxdepth: 1 - - smd_model_parallel_release_notes/smd_model_parallel_change_log diff --git a/doc/api/training/smd_model_parallel_general.rst b/doc/api/training/smd_model_parallel_general.rst deleted file mode 100644 index 71f9115580..0000000000 --- a/doc/api/training/smd_model_parallel_general.rst +++ /dev/null @@ -1,396 +0,0 @@ -################################# -Use with the SageMaker Python SDK -################################# - -Walk through the following pages to learn about the SageMaker model parallel library's APIs -to configure and enable distributed model parallelism -through an Amazon SageMaker estimator. - -.. _sm-sdk-modelparallel-params: - -Configuration Parameters for ``distribution`` -============================================= - -Amazon SageMaker's TensorFlow and PyTorch estimator objects contain a ``distribution`` parameter, -which you can use to enable and specify parameters for SageMaker distributed training. -The SageMaker model parallel library internally uses MPI. -To use model parallelism, both ``smdistributed`` and MPI must be enabled -through the ``distribution`` parameter. - -.. 
tip:: - - This page provides you a complete list of parameters you can use - when you construct a SageMaker estimator and configure for distributed training. - - To find examples of how to construct a SageMaker estimator with the distributed training parameters, see - `Launch a SageMaker Distributed Model Parallel Training Job `_ - in the `SageMaker's Distributed Model Parallel developer guide `_. - -.. contents:: Table of Contents - :depth: 3 - :local: - -Parameters for ``smdistributed`` ----------------------------------- - -You can use the following parameters to initialize the library -configuring a dictionary for ``modelparallel``, which goes -into the ``smdistributed`` option for the ``distribution`` parameter. - -.. note:: - - ``partitions`` for TensorFlow and ``pipeline_parallel_degree`` for PyTorch are required parameters. - All other parameters in the following - table are optional. - -Common Parameters -~~~~~~~~~~~~~~~~~ - -.. list-table:: - :widths: 10 20 10 60 - :header-rows: 1 - - * - Parameter - - Type / Valid values - - Default - - Description - * - ``partitions`` for TensorFlow and PyTorch with smdistributed-modelparallel=v1.6) - - int - - - - **Required.** The number of partitions to split the model into. - In case of ``pipeline_parallel_degree`` for PyTorch, this is the number of devices - over which pipeline parallelism will be performed. - * - ``microbatches`` - - int - - 1 - - The number of microbatches to perform pipelining over. 1 means no pipelining. - Batch size must be divisible by the number of microbatches. - * - ``pipeline`` - - ``"interleaved"`` or ``"simple"`` - - ``"interleaved"`` - - The pipeline schedule. - * - ``optimize`` - - ``"memory"`` or ``"speed"`` - - ``"memory"`` - - Determines the distribution mechanism of transformer layers. - If optimizing ``speed``, there will be less communication across tensor-parallel ranks - and layer normalization will not be distributed. However, there will be duplicate activations - stored across tensor-parallel ranks. - If optimizing ``memory``, there will be no redundant activations stored, - but this will result in more communication overhead across tensor parallel ranks. - * - ``placement_strategy`` - - ``"cluster"``, ``"spread"``, or a permutation of the string ``D``, ``P``, and ``T``. - - ``"cluster"`` - - Determines the mapping of model partitions onto physical devices. - When hybrid model/data parallelism is used, ``cluster`` places a single model replica in - neighboring device IDs. Contrarily, ``spread`` places a model replica as far as possible. - For more information, see :ref:`ranking-basics`. - - In case of the permutation letters, ``D`` stands for reduced-data parallelism, - ``P`` stands for pipeline parallelism, - and ``T`` stands for tensor parallelism. - ``spread`` is equivalent to ``"TPD"``, and ``cluster`` is equivalent to ``"DPT"``. - For more information, see :ref:`ranking-basics-tensor-parallelism`. - - Note: For TensorFlow, tensor parallelism is not implemented and - available parameter values are only ``"spread"`` and ``"cluster"``. - * - ``auto_partition`` - - bool - - ``True`` - - Enable auto-partitioning. If disabled, ``default_partition`` parameter must be provided. - * - ``default_partition`` - - int - - ``0`` - - **Required** if ``auto_partition`` is false. The partition ID to place operations/modules - that are not placed in any ``smp.partition`` contexts. - -TensorFlow-specific Parameters -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
list-table:: - :widths: 10 20 10 60 - :header-rows: 1 - - * - Parameter - - Type / Valid values - - Default - - Description - * - ``contiguous`` - - bool - - ``True`` - - Whether the model partitions should be contiguous. If true, each partition forms a connected component in the computational graph, unless the graph itself is not connected. - * - ``horovod`` - - bool - - ``False`` - - Must be set to ``True`` if hybrid model/data parallelism is used and the data parallelism (DP) framework is Horovod. - - -PyTorch-specific Parameters -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. list-table:: - :widths: 10 20 10 60 - :header-rows: 1 - - * - Parameter - - Type / Valid values - - Default - - Description - * - ``memory_weight`` - - float [0.0, 1.0] - - ``0.2`` if ``optimize`` is ``"speed"``, else ``0.8`` - - The weight of memory balancing in the auto-partitioni ng objective, as opposed to balancing computational load. If 0.0, the library only tries to balance computation; if 1.0 the library only tries to balance the memory use. Any value in between interpolates between these extremes. - * - ``ddp`` - - bool - - ``False`` - - Must be set to True if hybrid model/data parallelism is used with DistributedDataParallel. DistributedDataParallel is used with NCCL backend, and uses the MASTER_PORT provided by SageMaker. - * - ``active_microbatches`` (**smdistributed-modelparallel**>=v1.3) - - int - - ``partitions`` + 2 - - This is the maximum number of microbatches that are simultaneously in execution during pipelining. Jointly scaling batch size and number of microbatches can often mitigate the pipeline bubble overhead, but that can lead to increased memory usage if too many microbatches are simultaneously in execution. In such cases setting the number of active microbatches to a lower number can help control memory usage. By default this is set to two plus the number of partitions of the model. - * - ``deterministic_server`` (**smdistributed-modelparallel**>=v1.3) - - bool - - ``False`` - - Setting this to true ensures that the execution server for pipelining executes requests in the same order across all data parallel ranks. - * - ``offload_activations`` (**smdistributed-modelparallel**>=v1.6) - - bool - - False - - Enables activation - offloading. To improve GPU memory usage, use activation offloading - only when (1) the ``microbatches`` and ``active_microbatches`` are - greater than 1, and (2) activation checkpointing is enabled for at - least one module in the model. - * - ``activation_loading_horizon`` (**smdistributed-modelparallel**>=v1.6) - - int - - 4 - - Specify the number - of pipeline tasks. This determines how early the activations should - be loaded back to the GPU, expressed in number of pipeline tasks. - Smaller value indicates that activations are loaded closer in time to - when they are needed for backward pass. Setting this value too small - might improve memory usage, but might potentially cause throughput - loss and GPU bottlenecks during the CPU-to-GPU data transfer. - * - ``tensor_parallel_degree`` (**smdistributed-modelparallel**>=v1.6) - - int - - 1 - - The number of devices over which the tensor parallel modules will be distributed. - If ``tensor_parallel_degree`` is greater than 1, then ``ddp`` must be set to ``True``. - * - ``fp16_params`` (**smdistributed-modelparallel**>=v1.6) - - bool - - ``False`` - - If ``True``, the parameters of the distributed modules will be initialized in FP16. 
- * - ``shard_optimizer_state`` (**smdistributed-modelparallel**>=v1.6) - - bool - - ``False`` - - If ``True``, the library shards the optimizer state of all parameters across - the data parallel processes which hold the same parameter. - This optimizer state sharding happens in a balanced manner. - Note that when sharding optimizer state, full optimizer saving is not currently supported. - Please save partial optimizer state. For more information about saving and loading checkpoints with - optimizer state sharding, see `Instructions for Checkpointing with Tensor Parallelism `_. - * - ``prescaled_batch`` (**smdistributed-modelparallel**>=v1.6) - - bool - - ``False`` - - If ``True`` and when ``smp.nn.DistributedTransformerLMHead`` is used - (this is typically used for GPT-2 or GPT-3 models), - the library assumes that the devices in the same tensor parallelism group - receive the same input data. Otherwise, it is assumed that they receive - different examples. To learn more, see :ref:`prescaled-batch`. - * - ``skip_tracing`` (**smdistributed-modelparallel**>=v1.6) - - bool - - False - - Skips the initial tracing step. This can be useful in very large models - where even model tracing at the CPU is not possible due to memory constraints. - - -Parameters for ``mpi`` ----------------------- - -For the ``"mpi"`` key, a dict must be passed which contains: - -* ``"enabled"``: Set to ``True`` to launch the training job with MPI. - -* ``"processes_per_host"``: Specifies the number of processes MPI should launch on each host. - In SageMaker a host is a single Amazon EC2 ml instance. The SageMaker distributed model parallel library maintains - a one-to-one mapping between processes and GPUs across model and data parallelism. - This means that SageMaker schedules each process on a single, separate GPU and no GPU contains more than one process. - If you are using PyTorch, you must restrict each process to its own device using - ``torch.cuda.set_device(smp.local_rank())``. To learn more, see - `Modify a PyTorch Training Script - `_. - - .. important:: - ``process_per_host`` must be less than or equal to the number of GPUs per instance, and typically will be equal to - the number of GPUs per instance. - - For example, if you use one instance with 4-way model parallelism and 2-way data parallelism, - then processes_per_host should be 2 x 4 = 8. Therefore, you must choose an instance that has at least 8 GPUs, - such as an ml.p3.16xlarge. - - The following image illustrates how 2-way data parallelism and 4-way model parallelism is distributed across 8 GPUs: - the model is partitioned across 4 GPUs, and each partition is added to 2 GPUs. - - .. image:: smp_versions/model-data-parallel.png - :width: 650 - :alt: 2-way data parallelism and 4-way model parallelism distributed across 8 GPUs - - -* ``"custom_mpi_options"``: Use this key to pass any custom MPI options you might need. - To avoid Docker warnings from contaminating your training logs, we recommend the following flag. - ```--mca btl_vader_single_copy_mechanism none``` - - -.. _ranking-basics: - -Ranking Basics without Tensor Parallelism -========================================= - -The library maintains a one-to-one mapping between processes and available GPUs: -for each GPU, there is a corresponding CPU process. Each CPU process -maintains a “rank” assigned by MPI, which is a 0-based unique index for -the process. 
For instance, if a training job is launched with 4 -``p3dn.24xlarge`` instances using all of their GPUs, there are 32 processes -across all instances, and the ranks of these processes range from 0 to -31. - -The ``local_rank`` of a process is the rank of the process among the -processes in the same instance. This can range from 0 up to the number -of GPUs in the instance, but can be lower if fewer processes than GPUs are -launched in the instance. For instance, in the preceding -example, ``local_rank``\ s of the processes will range from 0 to 7, -since there are 8 GPUs in a ``p3dn.24xlarge`` instance. - -When model parallelism is used together with data parallelism (Horovod for TensorFlow -and DDP for PyTorch), the library partitions the set of processes into -disjoint \ ``mp_group``\ s. An ``mp_group`` is a subset of all processes -that together hold a single, partitioned model replica. - -For instance, if -a single node job is launched with 8 local processes with -``partitions=2`` (meaning the model will be split into 2), there are -four \ ``mp_group``\ s. The specific sets of processes that form the -``mp_group``\ s can be adjusted by the ``placement_strategy`` option. - -- If ``placement_strategy`` is ``spread``, then the four - ``mp_group``\ s are ``[0, 4], [1, 5], [2, 6], [3, 7]``. The - ``mp_rank`` is the rank of a process within each ``mp_group``. For example, - the ``mp_rank`` is 0 for the processes 0, 1, 2, and 3, and the ``mp_rank`` is 1 for - the processes 4, 5, 6, and 7. - - Analogously, the library defines ``dp_group``\ s as sets of processes that - all hold the same model partition, and perform data parallelism among - each other. If ``placement_strategy`` is ``spread``, there are two ``dp_group``\ s: - ``[0, 1, 2, 3]`` and ``[4, 5, 6, 7]``. - - Each process within a ``dp_group`` holds the same partition of - the model and makes allreduce calls with the other processes in its group. Allreduce for - data parallelism does not take place *across* ``dp_group``\ s. - ``dp_rank`` is defined as the rank of a process within its ``dp_group``. - In the preceding example, the \ ``dp_rank`` of process 6 is 2. - -- If ``placement_strategy`` is ``cluster``, the four ``mp_group``\ s - become ``[0, 1], [2, 3], [4, 5], [6, 7]``, and the two ``dp_group``\ s become - ``[0, 2, 4, 6]`` and ``[1, 3, 5, 7]``. - -.. _ranking-basics-tensor-parallelism: - -Placement Strategy with Tensor Parallelism -========================================== - -In addition to the two placement strategies introduced in the previous section, -the library provides additional placement strategies for extended tensor parallelism features -for PyTorch. The additional placement strategies (parallelism types) are denoted as follows: - -- ``D`` stands for (reduced) data parallelism. -- ``P`` stands for pipeline parallelism. -- ``T`` stands for tensor parallelism. - -Given a permutation of the three letters, the library performs the parallelism type -represented by the right-most letter over neighboring global ranks, in ascending order. -Conversely, the parallelism type represented by the left-most letter is performed -over the ranks that are as distant as possible. 
- -- **Example:** Given 8 devices with ``tp_size() == 2``, - ``pp_size() == 2``, ``rdp_size() == 2`` - - - ``placement_strategy: "DPT"`` gives - - ==== ======== ======= ======= - rank rdp_rank pp_rank tp_rank - ==== ======== ======= ======= - 0 0 0 0 - 1 0 0 1 - 2 0 1 0 - 3 0 1 1 - 4 1 0 0 - 5 1 0 1 - 6 1 1 0 - 7 1 1 1 - ==== ======== ======= ======= - - - ``placement_strategy: "PTD"`` gives - - ==== ======== ======= ======= - rank rdp_rank pp_rank tp_rank - ==== ======== ======= ======= - 0 0 0 0 - 1 1 0 0 - 2 0 0 1 - 3 1 0 1 - 4 0 1 0 - 5 1 1 0 - 6 0 1 1 - 7 1 1 1 - ==== ======== ======= ======= - -Because the neighboring ranks are placed on the same instance with -high-bandwidth NVLinks, it is recommended to place the -parallelism type that has higher bandwidth requirements for your model -on the right-most position in the ``placement_strategy`` string. Because -tensor parallelism often requires frequent communication, placing -``T`` in the right-most position is recommended (as in the default -``"cluster"`` strategy). In many large models, keeping the default of -``"cluster"`` would result in the best performance. - - -.. _prescaled-batch: - -Prescaled Batch -=============== - -``prescaled_batch`` is a configuration parameter that can be useful for -``DistributedTransformerLMHead``, which is used for GPT-2 and GPT-3. - -The way tensor parallelism works is that when a module is distributed, -the inputs to the distributed module in different ``tp_rank``\ s get -shuffled around in a way that is sliced by the hidden dimension and -scaled by the batch dimension. For example, if tensor parallel degree is -8, the inputs to ``DistributedTransformer`` (a tensor with shape -``[B, S, H]`` where ``B``\ =batch size, ``S``\ =sequence length, -``H``\ =hidden width) in different ``tp_rank``\ s will be communicated -around, and the shapes will become ``[8B, S, H/8]``. Each ``tp_rank`` -has the batch from all the peer ``tp_rank``\ s, but only the slice that -interacts with its local partition of the module. - -By default, the library assumes that each ``tp_rank`` gets assigned a -different batch, and performs the communication described above. If -``prescaled_batch`` is true, then the library assumes that the input -batch is already scaled (and is the same across the ``tp_rank``\ s), and -only does the slicing. In the example above, the library assumes that the -input tensor has shape ``[8B, S, H]``, and only converts it into -``[8B, S, H/8]``. So if ``prescaled_batch`` is true, it is the user's -responsibility to feed the same batch to the ``tp_rank``\ s in the same -``TP_GROUP``. This can be done by doing the data sharding based on -``smp.rdp_size()`` and ``smp.rdp_rank()``, instead of ``smp.dp_size()`` -and ``smp.dp_rank()``. When ``prescaled_batch`` is true, the global -batch size is ``smp.rdp_size()`` multiplied by the per-``MP_GROUP`` -batch size. When ``prescaled_batch`` is false, the global batch size is -``smp.dp_size()`` multiplied by the per-``PP_GROUP`` batch size. - -If you use pipeline parallelism degree 1, then you can keep -``prescaled_batch`` false (the default option). If you use a pipeline -parallelism degree greater than 1, it is recommended to use -``prescaled_batch`` true, so that you can increase the per-``MP_GROUP`` -batch size for efficient pipelining, without running into out-of-memory -issues. 
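The following is a minimal configuration sketch showing how the parameters described on this page fit into the ``distribution`` argument of a SageMaker ``PyTorch`` estimator. The entry point, IAM role, and instance settings are placeholders, and the ``parameters`` values are illustrative only; adjust them to your model and cluster.

.. code:: python

    from sagemaker.pytorch import PyTorch

    smp_options = {
        "enabled": True,
        "parameters": {           # keys described in the tables above
            "pipeline_parallel_degree": 2,
            "microbatches": 4,
            "ddp": True,
        },
    }

    mpi_options = {
        "enabled": True,
        "processes_per_host": 8,  # one process per GPU on the instance
        "custom_mpi_options": "--mca btl_vader_single_copy_mechanism none",
    }

    estimator = PyTorch(
        entry_point="train.py",            # placeholder training script
        role="<SageMakerExecutionRole>",   # placeholder IAM role
        instance_type="ml.p3.16xlarge",    # 8 GPUs: 2-way pipeline x 4-way data parallelism
        instance_count=1,
        framework_version="1.8.1",
        py_version="py36",
        distribution={
            "smdistributed": {"modelparallel": smp_options},
            "mpi": mpi_options,
        },
    )

    # estimator.fit("s3://<bucket>/<training-data>")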
diff --git a/doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.rst b/doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.rst deleted file mode 100644 index feed17a101..0000000000 --- a/doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.rst +++ /dev/null @@ -1,454 +0,0 @@ -Sagemaker Distributed Model Parallel 1.6.0 Release Notes -======================================================== - -*Date: December. 20. 2021* - -**New Features** - -- **PyTorch** - - - Added extended memory-saving features for PyTorch 1.8.1: - - - Tensor parallelism - - Optimizer state sharding - - Activation checkpointing - - Activation offloading - - For more information, see the following documentation: - - - `SageMaker distributed model parallel developer guide `_ - - `SageMaker distributed model parallel API documentation for v1.6.0 `_ - -**Migration to AWS Deep Learning Containers** - -This version passed benchmark testing and is migrated to the following -AWS Deep Learning Container(s): - -- Deep Learning Container for PyTorch 1.8.1: - - .. code:: - - 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.8.1-gpu-py36-cu111-ubuntu18.04 - ----- - -Release History -=============== - -Sagemaker Distributed Model Parallel 1.5.0 Release Notes --------------------------------------------------------- - -*Date: November. 03. 2021* - -**New Features** - -- **PyTorch** - - - Currency update for PyTorch 1.10.0 - -**Migration to AWS Deep Learning Containers** - -This version passed benchmark testing and is migrated to the following -AWS Deep Learning Containers: - -- Deep Learning Container for PyTorch 1.10.0: - - .. code:: - - 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.10.0-gpu-py38-cu113-ubuntu20.04-sagemaker - ----- - -Sagemaker Distributed Model Parallel 1.4.0 Release Notes --------------------------------------------------------- - -*Date: June. 29. 2021* - -**New Features** - -- **TensorFlow** - - - Added support for TensorFlow v2.5.0. - - Added support for ``keras.model.fit()``. - -**Migration to AWS Deep Learning Containers** - -This version passed benchmark testing and is migrated to the following -AWS Deep Learning Containers: - -- Deep Learning Container for TensorFlow 2.5.0: - - .. code:: - - 763104351884.dkr.ecr..amazonaws.com/tensorflow-training:2.5.0-gpu-py37-cu112-ubuntu18.04-v1.0 - -- Deep Learning Container for PyTorch 1.9.1: - - .. code:: - - 763104351884.dkr.ecr..amazonaws.com/pytorch-training:1.9.1-gpu-py38-cu111-ubuntu20.04 - ----- - -Sagemaker Distributed Model Parallel 1.3.1 Release Notes --------------------------------------------------------- - -- New Features -- Bug Fixes -- Known Issues - -**New Features** - -- **TensorFlow** - - - Exposes a new decorator ``register_post_partition_hook``. This allows - invoking the decorated methods just after model partition but before - executing the first step. For example loading a checkpoint. Refer to - the `SageMaker distributed model parallel API - documentation `__ - for more information. - -**Bug Fixes** - -- **PyTorch** - - - Improved memory efficiency when using active microbatches by clearing - activations at end of each microbatch. - -- **TensorFlow** - - - Fixed issue that caused hangs when training some models with XLA - enabled. 
- -**Known Issues** - -- **PyTorch** - - - A crash was observed when ``optimizer.step()`` was called for certain - optimizers such as AdaDelta, when the partition on which this method - was called has no local parameters assigned to it after partitioning. - This is due to a bug in PyTorch which `has since been - fixed `__. Until that fix - makes its way to the next release of PyTorch, only call - ``optimizer.step()`` on processes which have at least one local - parameter. This can be checked with - ``len(list(model.local_parameters())) > 0``. - - - A performance regression still exists when training on SMP with - PyTorch 1.7.1 compared to 1.6. The root cause was found to be the - slowdown in performance of ``.grad`` method calls in PyTorch 1.7.1 - compared to 1.6. See the related discussion: - https://github.com/pytorch/pytorch/issues/50636. This issue does not - exist with PyTorch 1.8. - ----- - -Sagemaker Distributed Model Parallel 1.3.0 Release Notes -------------------------------------------------------- - -- New Features -- Bug Fixes -- Known Issues - -.. _new-features-1: - -**New Features** - -.. _pytorch-2: - -- **PyTorch** - - Add support for PyTorch 1.8 - - - Adds a new method to DistributedModel ``register_comm_hook`` (for - PyTorch 1.8 and newer only). This method behaves the same as the - corresponding method with the same name in - ``torch.DistributedDataParallel`` API. Refer to the `SageMaker - distributed model parallel API - documentation `__ - for more information. - -**Improvements** - -- Adds a configuration ``active_microbatches`` to the SageMaker SDK API - for launching jobs, to control the number of active microbatches - during training. This helps limit memory usage in cases where the - number of microbatches is high. Refer to the `SageMaker Python SDK - parameters API - documentation `__ - for more information. - -- Adds a configuration ``deterministic_server`` to the SageMaker SDK - API for launching jobs, which ensures that the execution server for - pipeline parallelism processes requests in a deterministic order - across data parallel ranks. Refer to the `SageMaker Python SDK - parameters API - documentation `__ - for more information. - -- Parameter passing is now supported in ``module.forward`` methods for - DistributedModel and its submodules. This removes the restriction of - having to pass ``nn.Parameter`` to the ``__init__`` call and making - it a member of the module to use it. - -**Bug Fixes** - -.. _pytorch-3: - -- **PyTorch** - - - Fixed a case where training hangs due to a module having computation - which requires grads that is not used by the final output of the - module. Now such a situation raises an error with suggestions on - making such computation compatible. - - - Fixed an issue with buffers which caused the buffers to not be on the - correct device after a model is partitioned, and not be synchronized - across steps (when ``broadcast_buffers`` is True). This could have - caused correctness issues in models with buffers. - -.. _known-issues-1: - -**Known Issues** - -.. _pytorch-4: - -- **PyTorch** - - - ``mp_barrier`` and ``get_mp_process_group`` are wrongly marked as - deprecated methods. Ignore the deprecation warning. - - - A crash was observed when ``optimizer.step()`` was called for certain - optimizers such as AdaDelta, when the partition on which this method - was called has no local parameters assigned to it after partitioning. - This is due to a bug in PyTorch which `has since been - fixed `__. 
Until that fix - makes its way to the next release of PyTorch, only call - ``optimizer.step()`` on processes which have at least one local - parameter. This can be checked with - ``len(list(model.local_parameters())) > 0``. - - - A performance regression still exists when training on SMP with - PyTorch 1.7.1 compared to 1.6. The root cause was found to be the - slowdown in performance of ``.grad`` method calls in PyTorch 1.7.1 - compared to 1.6. See the related discussion: - https://github.com/pytorch/pytorch/issues/50636. This issue does not - exist with PyTorch 1.8. - ----- - -Sagemaker Distributed Model Parallel 1.2.0 Release Notes -------------------------------------------------------- - -- New Features -- Bug Fixes -- Known Issues - -.. _new-features-2: - -**New Features** - -.. _pytorch-5: - -- **PyTorch** - - Add support for PyTorch 1.7.1 - - - Adds support for ``gradient_as_bucket_view`` (PyTorch 1.7.1 only), - ``find_unused_parameters`` (PyTorch 1.7.1 only) and - ``broadcast_buffers`` options to ``smp.DistributedModel``. These - options behave the same as the corresponding options (with the same - names) in ``torch.DistributedDataParallel`` API. Refer to the - `SageMaker distributed model parallel API - documentation `__ - for more information. - - - Adds support for ``join`` (PyTorch 1.7.1 only) context manager, which - is to be used in conjunction with an instance of - ``smp.DistributedModel`` to be able to train with uneven inputs - across participating processes. - - - Adds support for ``_register_comm_hook`` (PyTorch 1.7.1 only) which - will register the callable as a communication hook for DDP. NOTE: - Like in DDP, this is an experimental API and subject to change. - -.. _tensorflow-2: - -- **Tensorflow** - - - Adds support for Tensorflow 2.4.1 - -.. _bug-fixes-1: - -**Bug Fixes** - -.. _pytorch-6: - -- **PyTorch** - - - ``Serialization``: Fix a bug with serialization/flattening where - instances of subclasses of dict/OrderedDicts were - serialized/deserialized or internally flattened/unflattened as - regular dicts. - -.. _tensorflow-3: - -- **Tensorflow** - - - Fix a bug that may cause a hang during evaluation when there is no - model input for one partition. - -.. _known-issues-2: - -**Known Issues** - -.. _pytorch-7: - -- **PyTorch** - - - A performance regression was observed when training on SMP with - PyTorch 1.7.1 compared to 1.6.0. The root cause was found to be the - slowdown in performance of ``.grad`` method calls in PyTorch 1.7.1 - compared to 1.6.0. See the related discussion: - https://github.com/pytorch/pytorch/issues/50636. - ----- - -Sagemaker Distributed Model Parallel 1.1.0 Release Notes -------------------------------------------------------- - -- New Features -- Bug Fixes -- Improvements -- Performance -- Known Issues - -.. _new-features-3: - -**New Features** - -The following sections describe new feature releases that are common -across frameworks and that are framework specific. - -**Common across frameworks** - -- Custom slicing support (``smp_slice`` method) for objects passed to ``smp.step`` decorated functions - - To pass an object to ``smp.step`` that contains tensors that need to be - split across microbatches and is not an instance of list, dict, tuple or - set, you should implement the ``smp_slice`` method for the object. - - Below is an example of how to use this with PyTorch: - - .. 
code-block:: python - - class CustomType: - def __init__(self, tensor): - self.data = tensor - - # SMP will call this to invoke slicing on the object passing in total microbatches (num_mb) - # and the current microbatch index (mb). - def smp_slice(self, num_mb, mb, axis): - dim_size = list(self.data.size())[axis] - - split_size = dim_size // num_mb - sliced_tensor = self.data.narrow(axis, mb * split_size, split_size) - return CustomType(sliced_tensor) - - custom_obj = CustomType(torch.ones(4,)) - - @smp.step() - def step(custom_obj): - loss = model(custom_obj) - model.backward(loss) - return loss - -.. _pytorch-8: - -- **PyTorch** - - - Add support for smp.DistributedModel.cpu() - - ``smp.DistributedModel.cpu()`` - `allgather `__\ s - parameters and buffers across all ``mp_ranks`` and moves them to the - CPU. - - - Add ``trace_memory_usage`` option to ``smp.DistributedModel`` to measure memory usage per module - - Adds ``trace_memory_usage`` option to ``smp.DistributedModel``. This - attempts to measure memory usage per module during tracing. If this is - disabled, memory usage is estimated through the sizes of tensors - returned from the module. This option is disabled by default. - -.. _bug-fixes-2: - -**Bug Fixes** - -.. _pytorch-9: - -- **PyTorch** - - - ``torch.nn.Sequential``: Fix a bug with ``torch.nn.Sequential`` which - causes a failure with the error message: - ``shouldnt go less than 0, there is a bug`` when the inputs to the - first module don't require grads. - - - ``smp.DistributedModel``: Fix a bug with ``DistributedModel`` - execution when a module has multiple parents. The bug surfaces with - the error message: - ``actual_parent should be different than module_execution_stack parent only for torch.nn.ModuleList`` - - - ``apex.optimizers.FusedNovoGrad``: Fix a bug with - ``apex.optimizers.FusedNovoGrad`` which surfaces with the error - message: ``KeyError: 'exp_avg_sq'`` - -**Improvements** - -*Usability* - -.. _pytorch-10: - -- **PyTorch** - - - ``smp.DistributedModel``: Improve the error message when the forward - pass on ``smp.DistributedModel`` is called outside the ``smp.step`` - decorated function. - - - ``smp.load``: Add user friendly error messages when loading - checkpoints with ``smp.load``. - -*Partitioning Algorithm* - -.. _pytorch-11: - -- **PyTorch** - - - Better memory balancing by taking into account the existing modules - already assigned to the parent, while partitioning the children of a - given module. - -**Performance** - -.. _tensorflow-4: - -- **Tensorflow** - - - Addresses long pre-processing times introduced by SMP XLA optimizer - when dealing with large graphs and a large number of microbatches. BERT - (large) preprocessing time goes down from 40 minutes to 6 minutes on - p3.16xlarge. - -.. _known-issues-3: - -**Known Issues** - -.. _pytorch-12: - -- **PyTorch** - - - Serialization for Torch in SMP overwrites instances of dict subclass - to be dict itself, instead of the instances of subclass. One of the - use cases which fails because of this issue is when a user implements - a subclass of OrderedDict with the ``__getitem__`` method. After - serialization/deserialization in SMP, indexing on the object will - lead to errors. A workaround is to use the dict keys to access the - underlying item. diff --git a/doc/api/training/smp_versions/archives.rst b/doc/api/training/smp_versions/archives.rst deleted file mode 100644 index c1b3d55491..0000000000 --- a/doc/api/training/smp_versions/archives.rst +++ /dev/null @@ -1,10 +0,0 @@ -.. 
_smdmp-pt-version-archive: - -.. toctree:: - :maxdepth: 1 - - v1_5_0.rst - v1_4_0.rst - v1_3_0.rst - v1_2_0.rst - v1_1_0.rst diff --git a/doc/api/training/smp_versions/latest.rst b/doc/api/training/smp_versions/latest.rst deleted file mode 100644 index 336fe7df87..0000000000 --- a/doc/api/training/smp_versions/latest.rst +++ /dev/null @@ -1,36 +0,0 @@ -############################################### -Use the Library's API to Adapt Training Scripts -############################################### - -The library provides Common APIs that you can use across frameworks, -as well as framework-specific APIs for TensorFlow and PyTorch. - -Select the latest or one of the previous versions of the API documentation -depending on which version of the library you need to use. -To use the library, reference the -**Common API** documentation alongside the framework specific API documentation. - -Version 1.6.0 (Latest) -====================== - -To use the library, reference the Common API documentation alongside the framework specific API documentation. - -.. toctree:: - :maxdepth: 1 - - latest/smd_model_parallel_common_api - latest/smd_model_parallel_pytorch - latest/smd_model_parallel_pytorch_tensor_parallel - latest/smd_model_parallel_tensorflow - -To find archived API documentation for the previous versions of the library, -see the following link: - - -Documentation Archive -===================== - -.. toctree:: - :maxdepth: 1 - - archives diff --git a/doc/api/training/smp_versions/latest/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/latest/smd_model_parallel_common_api.rst deleted file mode 100644 index d1f6b4d45b..0000000000 --- a/doc/api/training/smp_versions/latest/smd_model_parallel_common_api.rst +++ /dev/null @@ -1,517 +0,0 @@ -Common API -========== - -The following SageMaker distribute model parallel APIs are common across all frameworks. - -.. contents:: Table of Contents - :depth: 3 - :local: - -The Library's Core APIs ------------------------ - -This API document assumes you use the following import statement in your training scripts. - -**TensorFlow** - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -**PyTorch** - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. function:: smp.init( ) - - Initialize the library. Must be called at the beginning of training script. - -.. function:: @smp.step(non_split_inputs, input_split_axes, [*args, **kwargs]) - - A decorator that must be placed over a function that represents a single - forward and backward pass (for training use cases), or a single forward - pass (for evaluation use cases). Any computation that is defined inside - the ``smp.step``-decorated function is executed in a pipelined manner. - - By default, every tensor input to the function is split across its batch - dimension into a number of microbatches specified while launching the - training job. This behavior can be customized through the arguments to - ``smp.step``, described below. The library then orchestrates the execution of - each microbatch across all partitions, based on the chosen pipeline - type. - - In a typical use case, forward pass and back-propagation are executed - inside an \ ``smp.step``-decorated function and gradients, loss, and - other relevant metrics (such as accuracy, etc.) are returned from - ``smp.step``-decorated function. 
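For instance, a typical PyTorch training step with this pattern might look like the following minimal sketch. It assumes the usual ``torch`` imports, a ``train_loader``, and that ``model`` and ``optimizer`` have already been wrapped with ``smp.DistributedModel`` and ``smp.DistributedOptimizer``; the function and variable names are placeholders.

.. code:: python

    import torch.nn.functional as F
    import smdistributed.modelparallel.torch as smp

    @smp.step
    def train_step(model, data, target):
        output = model(data)            # forward pass, executed once per microbatch
        loss = F.nll_loss(output, target)
        model.backward(loss)            # replaces loss.backward()
        return loss                     # returned as a StepOutput outside smp.step

    def train_epoch(model, optimizer, train_loader):
        for data, target in train_loader:
            optimizer.zero_grad()
            loss_mb = train_step(model, data, target)  # StepOutput: one loss per microbatch
            loss = loss_mb.reduce_mean()               # average the loss across microbatches
            optimizer.step()                           # applied outside the smp.step function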
- - Any gradient post-processing operation, such as gradient clipping and - allreduce, as well as ``optimizer.apply_gradients`` calls (for TF) or - ``optimizer.step`` (for PT) should be applied on the gradients returned - from the ``smp.step`` function, and not inside the ``smp.step`` - function. This is because every operation inside ``smp.step`` is - executed once per microbatch, so having these operations inside - ``smp.step`` can either be inefficient (in the case of allreduce), or - lead to wrong results (in the case of ``apply_gradients`` / - ``optimizer.step``). - - If the objects returned from the ``smp.step``-decorated function contain - ``tf.Tensor``\ s / ``torch.Tensor``\ s, they are converted to - ``StepOutput`` objects. A ``StepOutput`` object encapsulates all - versions of the tensor across different microbatches - (see ``StepOutput`` entry for more information). - - The argument to the ``smp.step``-decorated function should either be a tensor - or an instance of list, tuple, dict, or set for it to be split across - microbatches. If your object doesn't fall into this category, you can make - the library split your object by implementing the ``smp_slice`` method. - - Below is an example of how to use it with PyTorch. - - .. code:: python - - class CustomType: - def __init__(self, tensor): - self.data = tensor - - # The library will call this to invoke slicing on the object passing in total microbatches (num_mb) - # and the current microbatch index (mb). - def smp_slice(self, num_mb, mb, axis): - dim_size = list(self.data.size())[axis] - - split_size = dim_size // num_mb - sliced_tensor = self.data.narrow(axis, mb * split_size, split_size) - return CustomType(sliced_tensor) - - custom_obj = CustomType(torch.ones(4,)) - - @smp.step() - def step(custom_obj): - loss = model(custom_obj) - model.backward(loss) - return loss - - - **Important:** ``smp.step`` splits the batch into microbatches, and - executes everything inside the decorated function once per microbatch. - This might affect the behavior of batch normalization, any operation - that explicitly uses the batch size information, or any other Python - code that is expected to run once. - - **TensorFlow-specific behavior** - - ``smp.step`` is a wrapper that - inherits from and extends the behavior of ``tf.function``, and as such, - all the caveats that apply to the use of ``tf.function``\ s also apply - to ``smp.step``. In particular, any operation that is inside - ``smp.step`` executes in graph mode, and not eager mode. - - In the first call, ``smp.step`` performs tracing of the wrapped function every time - one of the tensor arguments changes its shape or dtype, or for every - new value of a Python argument, if there is one. Tracing is expensive, - so such scenarios should be avoided as much as possible or, - alternatively, an ``input_signature`` argument must be provided. For - more information on the usage of ``tf.function``, refer to the - TensorFlow documentation: - - - https://www.tensorflow.org/api_docs/python/tf/function\ - - https://www.tensorflow.org/guide/function\ - - Each ``smp.step`` decorated function must have a return value that depends on the - output of ``smp.DistributedModel``. - - **Common parameters** - - - ``non_split_inputs`` (``list``): The list of arguments to the decorated function - that should not be split along the batch dimension. Should be used - for all input tensors that do not have a batch dimension. 
Should be a - list of argument names as ``str``, as they appear in the signature of - the ``smp.step``-decorated function. By default it is considered an - empty list. - - - ``input_split_axes`` (``dict``): A dict that maps the argument name to its batch - axis. The keys should be the argument names as ``str``, as they - appear in the signature of the ``smp.step``-decorated function.  By - default all batch axes are assumed to be the 0-axis. - - **TensorFlow-only parameters** - - - All arguments of ``tf.function``. Note: - The \ ``experimental_compile`` argument of ``tf.function`` may not - work as expected with ``smp.step``, since it interferes with - pipelining and model partitioning. To enable XLA with the library, you can - instead use \ ``tf.config.optimizer.set_jit(True)``. - - **PyTorch-only parameters** - - - ``detach_outputs`` (``bool``) : If ``True``, calls ``torch.Tensor.detach()`` on - all returned ``torch.Tensor`` outputs. Setting it to ``False`` - increases memory consumption, unless ``detach()`` is manually called - on the returned tensors, because the model graph is not cleared from - memory after the training step. Set to \ ``True`` by default. - - **Returns** - - - The same object(s) returned from the decorated function. All - returned \ ``tf.Tensor``, \ ``tf.Variable``  objects (for TF) or - ``torch.Tensor`` objects (for PT) are wrapped inside - a \ ``StepOutput`` object, even when they are inside a Python - ``list``, ``tuple``, or ``dict``. - - - -.. class:: StepOutput - - - A class that encapsulates all versions of a ``tf.Tensor`` - or \ ``torch.Tensor`` across all microbatches. - - When a particular ``tf.Tensor`` or ``torch.Tensor`` is computed inside - ``smp.step``, different versions of the tensor are computed for each - microbatch. - - When this tensor is returned from ``smp.step`` and is accessed outside - of the decorated function, it appears as a ``StepOutput`` object, which - contains all such versions. For example, - - - In the case of Tensorflow, the gradient for a particular - ``tf.Variable`` is computed on each microbatch individually, and if - this gradient is returned from ``smp.step``, all gradients for this - ``tf.Variable`` become part of the same ``StepOutput`` object. The - ``StepOutput`` class offers the following API for commonly-used - post-processing operations on such tensors. - - In the case of PyTorch, the loss for each microbatch is computed - individually and all the ``torch.Tensor``\ s that represent the loss - for different microbatches become part of same ``StepOutput`` object, - if loss is returned from the ``smp.step`` function. - - - The ``StepOutput`` class offers the following API for commonly-used - post-processing operations on tensors. - - .. data:: StepOutput.outputs - - Returns a list of the underlying tensors, indexed by microbatch. - - .. function:: StepOutput.reduce_mean( ) - - Returns a ``tf.Tensor``, ``torch.Tensor`` that averages the constituent ``tf.Tensor`` s - ``torch.Tensor`` s. This is commonly used for averaging loss and gradients across microbatches. - - .. function:: StepOutput.reduce_sum( ) - - Returns a ``tf.Tensor`` / - ``torch.Tensor`` that sums the constituent - ``tf.Tensor``\ s/\ ``torch.Tensor``\ s. - - .. function:: StepOutput.concat( ) - - Returns a - ``tf.Tensor``/``torch.Tensor`` that concatenates tensors along the - batch dimension using ``tf.concat`` / ``torch.cat``. - - .. 
function:: StepOutput.stack( ) - - Applies ``tf.stack`` / ``torch.stack`` - operation to the list of constituent ``tf.Tensor``\ s / - ``torch.Tensor``\ s. - - **TensorFlow-only methods** - - .. function:: StepOutput.merge( ) - - Returns a ``tf.Tensor`` that - concatenates the constituent ``tf.Tensor``\ s along the batch - dimension. This is commonly used for merging the model predictions - across microbatches. - - .. function:: StepOutput.accumulate(method="variable", var=None) - - Functionally the same as ``StepOutput.reduce_mean()``. However, it is - more memory-efficient, especially for large numbers of microbatches, - since it does not wait for all constituent \ ``tf.Tensor``\ s to be - ready to start averaging them, thereby saving memory. - - In some cases (XLA for example) ``StepOutput.reduce_mean()`` might end - up being more memory-efficient than ``StepOutput.accumulate()``. - - **Parameters** - - - ``method`` (``"add_n"`` or ``"accumulate_n"`` or ``"variable"``): - If ``"add_n"`` or ``"accumulate_n"``, the library uses - ``tf.add_n`` and ``tf.accumulate_n``, respectively, to implement - accumulation. If ``"variable"``, the library uses an internal ``tf.Variable`` - into which to accumulate the tensors. Default is \ ``"variable"``. - Note: Memory usage behavior of these choices can depend on the model - and implementation. - - - ``var``: A ``tf.Variable`` into which, if provided, the library uses to - accumulate the tensors. If \ ``None``, the library internally creates a - variable. If ``method`` is not ``"variable"``, this argument is - ignored. - -.. _mpi_basics: - -MPI Basics ----------- - -The library exposes the following basic MPI primitives to its Python API: - -**Global** - -- ``smp.rank()`` : The global rank of the current process. -- ``smp.size()`` : The total number of processes. -- ``smp.get_world_process_group()`` : - ``torch.distributed.ProcessGroup`` that contains all processes. -- ``smp.CommGroup.WORLD``: The communication group corresponding to all processes. -- ``smp.local_rank()``: The rank among the processes on the current instance. -- ``smp.local_size()``: The total number of processes on the current instance. -- ``smp.get_mp_group()``: The list of ranks over which the current model replica is partitioned. -- ``smp.get_dp_group()``: The list of ranks that hold different replicas of the same model partition. - -**Tensor Parallelism** - -- ``smp.tp_rank()`` : The rank of the process within its - tensor-parallelism group. -- ``smp.tp_size()`` : The size of the tensor-parallelism group. -- ``smp.get_tp_process_group()`` : Equivalent to - ``torch.distributed.ProcessGroup`` that contains the processes in the - current tensor-parallelism group. -- ``smp.CommGroup.TP_GROUP`` : The communication group corresponding to - the current tensor parallelism group. - -**Pipeline Parallelism** - -- ``smp.pp_rank()`` : The rank of the process within its - pipeline-parallelism group. -- ``smp.pp_size()`` : The size of the pipeline-parallelism group. -- ``smp.get_pp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current pipeline-parallelism group. -- ``smp.CommGroup.PP_GROUP`` : The communication group corresponding to - the current pipeline parallelism group. - -**Reduced-Data Parallelism** - -- ``smp.rdp_rank()`` : The rank of the process within its - reduced-data-parallelism group. -- ``smp.rdp_size()`` : The size of the reduced-data-parallelism group. 
-- ``smp.get_rdp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current reduced data parallelism - group. -- ``smp.CommGroup.RDP_GROUP`` : The communication group corresponding - to the current reduced data parallelism group. - -**Model Parallelism** - -- ``smp.mp_rank()`` : The rank of the process within its model-parallelism - group. -- ``smp.mp_size()`` : The size of the model-parallelism group. -- ``smp.get_mp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current model-parallelism group. -- ``smp.CommGroup.MP_GROUP`` : The communication group corresponding to - the current model parallelism group. - -**Data Parallelism** - -- ``smp.dp_rank()`` : The rank of the process within its data-parallelism - group. -- ``smp.dp_size()`` : The size of the data-parallelism group. -- ``smp.get_dp_process_group()`` : ``torch.distributed.ProcessGroup`` - that contains the processes in the current data-parallelism group. -- ``smp.CommGroup.DP_GROUP`` : The communication group corresponding to - the current data-parallelism group. - -.. _communication_api: - -Communication API ------------------ - -The library provides a few communication primitives which can be helpful while -developing the training script. These primitives use the following -``enum`` s as arguments to specify which processes the communication -should involve. -​ - -**Helper structures** - -.. data:: smp.CommGroup - - An ``enum`` that takes the values - ``CommGroup.WORLD``, ``CommGroup.MP_GROUP``, and ``CommGroup.DP_GROUP``. - These values can also be accessed as ``smp.WORLD``, ``smp.MP_GROUP``, - and ``smp.DP_GROUP`` respectively. - - - ``CommGroup.WORLD``: Represents the entire group of processes used in - training - - ``CommGroup.MP_GROUP``: Represents the group of processes that hold - the same model replica as the current process. The processes in a - single ``MP_GROUP`` collectively store an entire replica of the - model. - - ``CommGroup.DP_GROUP``: Represents the group of processes that hold - the same model partition as the current process. The processes in a - single ``DP_GROUP`` perform data parallelism/allreduce among - themselves. - -.. data:: smp.RankType - - An ``enum`` that takes the values - ``RankType.WORLD_RANK``, ``RankType.MP_RANK``, and ``RankType.DP_RANK``. - - - ``RankType.WORLD_RANK``: The associated rank is to be interpreted as - the rank of the process across all processes used in training. - - ``RankType.MP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``MP_GROUP``. - - ``RankType.DP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``DP_GROUP``. - - -**Communication primitives:** - -.. function:: smp.broadcast(obj, group) - - Sends the object to all processes in the - group. The receiving process must call ``smp.recv_from`` to receive the - sent object. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be broadcast. - - - ``group``: A ``CommGroup`` argument that represents to which group of - processes the object will be sent. - - **Notes** - - - When you use ``broadcast`` on the sender process, there needs - to be an accompanying ``smp.recv_from()`` call on the receiver - processes. - - - This is a synchronous call; the ``broadcast`` statement - returns only after all ranks participating in the call have made a - matching ``recv_from`` call. - - **Example** - - .. 
code:: python - - if smp.rank() == 0: -     smp.broadcast(something, group=smp.CommGroup.WORLD) - else: -     smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK) - -.. function:: smp.send(obj, dest_rank, rank_type) - - Sends the object ``obj`` to - ``dest_rank``, which is of a type specified by ``rank_type``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be sent. - - - ``dest_rank`` (``int``): An integer denoting the rank of the receiving process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``dest_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then ``obj`` is sent to process - with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the current - process. - - **Notes** - - - Note: \ This is a synchronous call; the ``send`` statement returns - only after the destination rank has made a matching - ``recv_from`` call. - -.. function:: smp.recv_from(src_rank, rank_type) - - Receive an object from a peer process. Can be used with a matching - ``smp.send`` or a ``smp.broadcast`` call. - - **Inputs** - - - ``src_rank`` (``int``): An integer denoting rank of the sending process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``src_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then the object is received from - the process with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the - current process. - - **Returns** - - Returns the python object that is sent by the peer process. - - **Notes** - - - Note: This is a synchronous call; the ``recv_from`` statement returns - only after the source rank has made a matching ``send`` or - ``broadcast`` call, and the object is received. - -.. function:: smp.allgather(obj, group) - - A collective call that gathers all the - submitted objects across all ranks in the specified ``group``. Returns a - list whose ``i``\ th index contains the object submitted by the - ``i``\ th rank in ``group``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be - allgathered. - - - ``group`` : A ``CommGroup`` argument that represents which group of - processes participate in ``allgather``. - - **Notes** - - - Note: This is a synchronous call; the ``allgather`` statement returns - only after all ranks participating in the call have made a matching - ``allgather`` call, and all the objects are received at the current - rank. - - **Examples** - - .. code:: python - - # assuming mp_size() == 2 - - if smp.mp_rank() == 0: -     out = smp.allgather(obj1, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - else: -     out = smp.allgather(obj2, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - -.. function:: smp.barrier(group=smp.WORLD) - - A statement that hangs until all - processes in the specified group reach the barrier statement, similar to - ``MPI_Barrier()``. - - **Inputs** - - - ``group``: An ``smp.CommGroup`` ``enum`` that specifies the group of - processes participating in the barrier call. Defaults to - ``smp.WORLD``. - - **Examples** - - - Assume there are 8 processes and 2 model partitions, and - therefore 4 \ ``mp_group``\ s, and 2 ``dp_group``\ s. If - the \ ``barrier`` call is passed the value ``smp.MP_GROUP`` for its - group argument, then each process only waits until the other process - of its own ``mp_group`` reaches that point. It does not wait for - processes outside that ``mp_group``. - -.. 
function:: smp.dp_barrier() - - Same as passing ``smp.DP_GROUP``\ to ``smp.barrier()``. - Waits for the processes in the same \ ``dp_group`` as - the current process to reach the same point in execution. - -.. function:: smp.mp_barrier() - - Same as passing ``smp.MP_GROUP`` to - ``smp.barrier()``. Waits for the processes in the same ``mp_group`` as - the current process to reach the same point in execution. diff --git a/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch.rst deleted file mode 100644 index 3ca65c17cb..0000000000 --- a/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch.rst +++ /dev/null @@ -1,639 +0,0 @@ -PyTorch API -=========== - -To use the PyTorch-specific APIs for SageMaker distributed model parallelism, -you need to add the following import statement at the top of your training script. - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. tip:: - - Refer to - `Modify a PyTorch Training Script - `_ - to learn how to use the following API in your PyTorch training script. - -.. py:class:: smp.DistributedModel() - - A sub-class of ``torch.nn.Module`` which specifies the model to be - partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is - the model to be partitioned. The returned ``DistributedModel`` object - internally manages model parallelism and data parallelism. Only one - model in the training script can be wrapped with - ``smp.DistributedModel``. - - **Example:** - - .. code:: python - - model = smp.DistributedModel(model) - - **Important**: The ``__call__`` and ``backward`` method calls on the - ``smp.DistributedModel`` object (in the following example, the object - is \ ``model``) can only be made inside a ``smp.step``-decorated - function. - - Since ``DistributedModel`` is a ``torch.nn.Module``, a forward pass can - be performed by calling the \ ``DistributedModel`` object on the input - tensors. - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - For a backward pass, one needs to call the backward function on - the \ ``DistributedModel`` object, with tensors and gradients as - arguments, replacing the PyTorch operations \ ``torch.Tensor.backward`` - or ``torch.autograd.backward``. - - The API for ``model.backward`` is very similar to - ``torch.autograd.backward``. For example, the following - ``backward`` calls: - - .. code:: python - - torch.autograd.backward(loss) or loss.backward() - - should be replaced with: - - .. code:: python - - model.backward(loss) # loss is a tensor with only one element as its data - - Similarly, for non-scalar tensors, replace the following - ``backward`` call containing incoming gradient arguments: - - .. code:: python - - torch.autograd.backward(outputs, out_grads) - - with the following line: - - .. code:: python - - model.backward(outputs, out_grads) - - In these examples, all ``__call__`` and ``backward`` method calls on - the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside - a ``smp.step``-decorated function. - - **Using DDP** - - If DDP is enabled with the SageMaker model parallel library, do not place a PyTorch - ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because - the ``DistributedModel`` wrapper will also handle data parallelism. 
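For example, with ``"ddp": True`` the wrapping is a single call (a minimal sketch; ``MyModel`` is a placeholder for your own module):

.. code:: python

    model = MyModel()
    model = smp.DistributedModel(model)   # handles both model and data parallelism
    # Do NOT additionally wrap with torch.nn.parallel.DistributedDataParallel.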
- - Unlike the original DDP wrapper, when you use ``DistributedModel``, - model parameters and buffers are not immediately broadcast across - processes when the wrapper is called. Instead, the broadcast is deferred to the first call of the - ``smp.step``-decorated function when the partition is done. - - **Parameters** - - - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism). - - - ``trace_device`` (``"cpu"`` or ``"gpu"``) (default: ``"gpu"``) - Whether to perform the tracing step on the GPU or CPU. The tracing step gathers - information on the order of execution of modules, the shapes of - intermediate outputs, and execution times, to be used by the - partitioning algorithm. If ``trace_device`` is set to GPU, accurate - module execution times can be gathered during tracing for a potentially - improved partitioning decision. However, if the model is too large to - fit in a single GPU, then ``trace_device`` should be set to CPU. - - - ``trace_execution_times`` (``bool``) (default: ``False``): If ``True``, - the library profiles the execution time of each module during tracing, and uses - it in the partitioning decision. This improves the partitioning - decision, but it might make the tracing slower. It may also introduce - some degree of non-determinism in partitioning results, because of the - inherent randomness in module execution times. Must be ``False`` if - ``trace_device`` is ``"cpu"``. - - - ``overlapping_allreduce`` (``bool``) (default: ``True``): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` while launching training). The library uses this flag - to decide whether to do overlapping allreduce whenever a parameter's - gradients are ready. This leads to overlapping of communication and - computation and can improve performance. If this is set to ``False``, - allreduce is performed at the end of the step. - - - ``backward_passes_per_step`` (``int``) (default: 1): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` in config). This parameter indicates the - number of backward passes to perform before calling allreduce on DDP. - This allows accumulating updates over multiple mini-batches before - reducing and applying them. - - - ``average_grads_across_microbatches`` (``bool``) (default: ``True``): - Whether or not the computed gradients should be averaged across - microbatches. If ``False``, the computed gradients will be summed across - microbatches, but not divided by the number of microbatches. In the typical - use case where the computed loss is averaged over the mini-batch, this - should be left as ``True``. If you use a loss function that only sums - the per-sample loss across the batch (and does not divide by the batch size), - then this must be set to ``False`` for correctness. - - - ``bucket_cap_mb`` (default: 25): \ ``DistributedDataParallel`` buckets - parameters into multiple buckets so that gradient reduction of each - bucket can potentially overlap with backward - computation. \ ``bucket_cap_mb`` controls the bucket size in megabytes - (MB). - - - ``trace_memory_usage`` (default: False): When set to True, the library attempts - to measure memory usage per module during tracing. If this is disabled, - memory usage will be estimated through the sizes of tensors returned from - the module. - - - ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``. 
- This parameter is forwarded to the underlying ``DistributedDataParallel`` wrapper. - Please see: `broadcast_buffer `__. - - - ``gradient_as_bucket_view`` (default: False): To be - used with ``ddp=True``. This parameter is forwarded to the underlying - ``DistributedDataParallel`` wrapper. Please see `gradient_as_bucket_view `__. - - **Properties** - - - ``partitioned``: Is ``True`` if the model is partitioned, ``False`` - otherwise. Initialized to ``False`` when ``DistributedModel`` is first - created. It becomes ``True`` during the first call - to the ``smp.step``-decorated function. Once the model is partitioned, the - local parameters or local ``state_dict`` can be fetched using the - following methods. - - **Methods** - - .. function:: backward(tensors, grad_tensors) - - Triggers a distributed backward - pass across model partitions. Example usage provided in the previous - section. The API is very similar - to https://pytorch.org/docs/stable/autograd.html#torch.autograd.backward. - ``retain_grad`` and ``create_graph`` flags are not supported. - - .. function:: local_buffers( ) - - Returns an iterator over buffers for the modules in - the partitioned model that have been assigned to the current process. - - .. function:: local_named_buffers( ) - - Returns an iterator over buffers for the - modules in the partitioned model that have been assigned to the current - process. This yields both the name of the buffer as well as the buffer - itself. - - .. function:: local_parameters( ) - - Returns an iterator over parameters for the - modules in the partitioned model that have been assigned to the current - process. - - .. function:: local_named_parameters( ) - - Returns an iterator over parameters for - the modules in the partitioned model that have been assigned to the - current process. This yields both the name of the parameter as well as - the parameter itself. - - .. function:: local_modules( ) - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. - - .. function:: local_named_modules( ) - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. This - yields both the name of the module as well as the module itself. - - .. function:: local_state_dict( ) - - Returns the ``state_dict`` that contains local - parameters that belong to the current \ ``mp_rank``. This ``state_dict`` - contains a key \ ``_smp_is_partial`` that indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - .. function:: state_dict( ) - - Returns the ``state_dict`` that contains parameters - for the entire model. It first collects the \ ``local_state_dict`` and - gathers and merges the \ ``local_state_dict`` from all ``mp_rank``\ s to - create a full ``state_dict``. Please note that this needs to be called on all ranks with - ``dp_rank()==0`` to ensure the gather happens properly. - If it is called on only a subset of those ranks, it can hang. - - .. function:: load_state_dict( ) - - Same as the ``torch.module.load_state_dict()``, - except: It first gathers and merges the ``state_dict``\ s across - ``mp_rank``\ s, if they are partial. The actual loading happens after the - model partition so that each rank knows its local parameters. - - .. function:: register_post_partition_hook(hook) - - Registers a callable ``hook`` to - be executed after the model is partitioned. 
This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. Returns a ``RemovableHandle`` object ``handle``, - which can be used to remove the hook by calling ``handle.remove()``. - - .. function:: cpu( ) - - Allgathers parameters and buffers across all ``mp_rank``\ s and moves them - to the CPU. - - .. function:: join( ) - - A context manager to be used in conjunction with an instance of - ``smp.DistributedModel`` to be able to train with uneven inputs across - participating processes. This is only supported when ``ddp=True``. This will use the join with the wrapped - ``DistributedDataParallel`` instance. For more information, see: - `join `__ - in the PyTorch documentation. - - .. function:: register_comm_hook( state, callable ) - - **Available for PyTorch 1.8.1 only** - Registers a communication hook which is an enhancement that provides - a flexible hook ``callable`` to users where they can specify how - gradients are aggregated across multiple workers. This method will be called on the wrapped ``DistributedDataParallel`` instance. - - Please note that when you register a comm hook you have full control of how the gradients are processed. - When using only data parallelism with Torch DDP you are expected to average grads across data parallel replicas within the hook. - Similarly, when using DistributedModel you have to averaging grads across data parallel replicas within the hook. - In addition to that, you also have to average grads across microbatches within the hook unless you explicitly desire to not average based on your loss function. - See ``average_grads_across_microbatches`` for more information about averaging grads across microbatches. - - This is only supported when ``ddp=True`` and ``overlapping_allreduce=True`` (default). - For more information, see: - `register_comm_hook `__ - in the PyTorch documentation. - - **Behavior of** ``smp.DistributedModel`` **with Tensor Parallelism** - - When a model is wrapped by ``smp.DistributedModel``, the library - immediately traverses the modules of the model object, and replaces the - modules that are supported for tensor parallelism with their distributed - counterparts. This replacement happens in place. If there are no other - references to the original modules in the script, they are - garbage-collected. The module attributes that previously referred to the - original submodules now refer to the distributed versions of those - submodules. - - **Example:** - - .. code:: python - - # register DistributedSubmodule as the distributed version of Submodule - # (note this is a hypothetical example, smp.nn.DistributedSubmodule does not exist) - smp.tp_register_with_module(Submodule, smp.nn.DistributedSubmodule) - - class MyModule(nn.Module): - def __init__(self): - ... - - self.submodule = Submodule() - ... 
- - # enabling tensor parallelism for the entire model - with smp.tensor_parallelism(): - model = MyModule() - - # here model.submodule is still a Submodule object - assert isinstance(model.submodule, Submodule) - - model = smp.DistributedModel(model) - - # now model.submodule is replaced with an equivalent instance - # of smp.nn.DistributedSubmodule - assert isinstance(model.module.submodule, smp.nn.DistributedSubmodule) - - If ``pipeline_parallel_degree`` (equivalently, ``partitions``) is 1, the - placement of model partitions into GPUs and the initial broadcast of - model parameters and buffers across data-parallel ranks take place - immediately. This is because it does not need to wait for the model - partition when ``smp.DistributedModel`` wrapper is called. For other - cases with ``pipeline_parallel_degree`` greater than 1, the broadcast - and device placement will be deferred until the first call of an - ``smp.step``-decorated function happens. This is because the first - ``smp.step``-decorated function call is when the model partitioning - happens if pipeline parallelism is enabled. - - Because of the module replacement during the ``smp.DistributedModel`` - call, any ``load_state_dict`` calls on the model, as well as any direct - access to model parameters, such as during the optimizer creation, - should be done **after** the ``smp.DistributedModel`` call. - - Since the broadcast of the model parameters and buffers happens - immediately during ``smp.DistributedModel`` call when the degree of - pipeline parallelism is 1, using ``@smp.step`` decorators is not - required when tensor parallelism is used by itself (without pipeline - parallelism). - - For more information about the library's tensor parallelism APIs for PyTorch, - see :ref:`smdmp-pytorch-tensor-parallel`. - - **Additional Methods of** ``smp.DistributedModel`` **for Tensor Parallelism** - - The following are the new methods of ``smp.DistributedModel``, in - addition to the ones listed in the - `documentation `__. - - .. function:: distributed_modules() - - - An iterator that runs over the set of distributed - (tensor-parallelized) modules in the model - - .. function:: is_distributed_parameter(param) - - - Returns ``True`` if the given ``nn.Parameter`` is distributed over - tensor-parallel ranks. - - .. function:: is_distributed_buffer(buf) - - - Returns ``True`` if the given buffer is distributed over - tensor-parallel ranks. - - .. function:: is_scaled_batch_parameter(param) - - - Returns ``True`` if the given ``nn.Parameter`` is operates on the - scaled batch (batch over the entire ``TP_GROUP``, and not only the - local batch). - - .. function:: is_scaled_batch_buffer(buf) - - - Returns ``True`` if the parameter corresponding to the given - buffer operates on the scaled batch (batch over the entire - ``TP_GROUP``, and not only the local batch). - - .. function:: default_reducer_named_parameters() - - - Returns an iterator that runs over ``(name, param)`` tuples, for - ``param`` that is allreduced over the ``DP_GROUP``. - - .. function:: scaled_batch_reducer_named_parameters() - - - Returns an iterator that runs over ``(name, param)`` tuples, for - ``param`` that is allreduced over the ``RDP_GROUP``. - - - -.. class:: smp.DistributedOptimizer - - **Parameters** - - ``optimizer`` - - An optimizer wrapper for saving/loading optimizer states. This wrapper - returns ``optimizer`` with the following methods overridden: - - .. 
function:: state_dict( ) - - Returns the ``state_dict`` that contains optimizer state for the entire model. - It first collects the ``local_state_dict`` and gathers and merges - the ``local_state_dict`` from all ``mp_rank``s to create a full - ``state_dict``. - - .. function:: load_state_dict( ) - - Same as the ``torch.optimizer.load_state_dict()`` , except: - - - It first gathers and merges the local ``state_dict``\ s if they are - partial. - - The actual loading happens after the model partition so that each - rank knows its local parameters. - - .. function:: local_state_dict( ) - - Returns the ``state_dict`` that contains the - local optimizer state that belongs to the current \ ``mp_rank``. This - ``state_dict`` contains a key \ ``_smp_is_partial`` to indicate this is - a partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - ​ -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (int) - The index of the partition. - - A context manager which places all modules defined inside into the - partition with ID ``index``.  The ``index`` argument must be less than - the number of partitions. - - Use ``smp.partition`` to implement manual partitioning. - If ``"auto_partition"`` is ``True``, then the - ``smp.partition`` contexts are ignored. Any module that is not placed in - any ``smp.partition`` context is placed in the - ``default_partition`` defined through the SageMaker Python SDK. - - When ``smp.partition`` contexts are nested, the innermost context - overrides the rest (see the following example). In PyTorch, manual - partitioning should be done inside the module \ ``__init__``, and the - partition assignment applies to the modules that are *created* inside - the ``smp.partition`` context. - - Example: - - .. code:: python - - class Model(torch.nn.Module): -     def __init__(self): -         with smp.partition(1): -             self.child0 = Child0()            # child0 on partition 1 -             with smp.partition(2): -                 self.child1 = Child1()        # child1 on partition 2 -             self.child2 = Child2()            # child2 on partition 1 -         self.child3 = Child3()                # child3 on default_partition - -.. function:: smp.get_world_process_group( ) - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of all - processes, which can be used with the ``torch.distributed`` API. - Requires ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_mp_process_group( ) - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``MP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_dp_process_group( ) - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``DP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.is_initialized( ) - - Returns ``True`` if ``smp.init`` has already been called for the - process, and ``False`` otherwise. - -.. function::smp.is_tracing( ) - - Returns ``True`` if the current process is running the tracing step, and - ``False`` otherwise. - -.. 
data:: smp.nn.FusedLayerNorm - - `Apex Fused Layer Norm `__ is currently not - supported by the library. ``smp.nn.FusedLayerNorm`` replaces ``apex`` - ``FusedLayerNorm`` and provides the same functionality. This requires - ``apex`` to be installed on the system. - -.. data:: smp.optimizers.FusedNovoGrad - - - `Fused Novo Grad optimizer `__ is - currently not supported by the library. ``smp.optimizers.FusedNovoGrad`` replaces ``apex`` ``FusedNovoGrad`` - optimizer and provides the same functionality. This requires ``apex`` to - be installed on the system. - -.. data:: smp.optimizers.FusedLamb - - - `FusedLamb optimizer `__ - currently doesn’t work with the library. ``smp.optimizers.FusedLamb`` replaces - ``apex`` ``FusedLamb`` optimizer and provides the same functionality. - This requires ``apex`` to be installed on the system. - -.. data:: smp.amp.GradScaler - - `Torch AMP Gradscaler `__ - currently doesn’t work with the library. ``smp.amp.GradScaler`` replaces - ``torch.amp.GradScaler`` and provides the same functionality. - -.. _pytorch_saving_loading: - -APIs for Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. function:: smp.save( ) - - Saves an object. This operation is similar to ``torch.save()``, except - it has an additional keyword argument, ``partial``, and accepts only - string type for the argument ``f`` (file). If ``partial=True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an ``mp_rank`` - index to your saved file. - - **Parameters** - - - ``obj`` (dict): A saved object. - - ``f`` (str): A string containing a file name. - - ``partial`` (bool, default= ``True``):  When set to ``True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an - ``mp_rank`` index to the saved file. If you want to be able to load - and further train a model that you save with ``smp.save()``, you must - set ``partial=True``. - - ``pickle_module`` (picklemodule, default = module ``"pickle"`` from ``"/opt/conda/lib/python3.6/pickle.py"``): - A module used for pickling metadata and objects. - - ``pickle_protocol``  (int, default=2): Can be specified to - override the defaultprotocol. - -.. function:: smp.load( ) - - Loads an object saved with ``smp.save()`` from a file. - - Similar to, `torch.load() `__, - except it has an additional keyword argument, ``partial``, and accepts - only string type for the argument ``f`` (file). If \ ``partial=True``, - then each ``mp_rank`` loads a separate checkpoint file. - - **Parameters** - - - ``f`` (string): A string containing a file name. - - ``map_location`` (function): A function - `torch.device `__, - a string, or a dict specifying how to remap storage locations. - - ``pickle_module`` (pickle module): A module used for unpickling - metadata and objects (has to match the \ ``pickle_module``\ used to - serialize file). - - ``pickle_load_args`` (Python 3 only): Optional keyword arguments - passed to ``pickle_module.load()`` and ``pickle_module.Unpickler()``. - - ``partial`` (bool, default= ``True``): When set to ``True``, each - ``mp_rank`` loads the checkpoint corresponding to the ``mp_rank``. - Should be used when loading a model trained with the library. - -.. _pytorch_saving_loading_instructions: - -General Instruction For Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The library can save partial or full checkpoints. - -- For partial checkpoints, each ``mp_rank`` saves its own checkpoint - file with only the parameters that belong to that rank. 
-- For full checkpoints, the library saves a single checkpoint that contains
-  the parameters of the entire model.
-
-When **saving** using ``smp.save()``, each rank only holds its own
-parameters. If you want to save the full model, there will be some
-communication between the ranks to create the full model. If you save
-checkpoints often, you should save partial checkpoints for best performance.
-
-When **loading** using ``smp.load()``, the library can load either partial or
-full checkpoints, or full checkpoints saved by a non-model-parallel model. If
-you want to resume training with a non-model-parallel model or do inference,
-you need a full checkpoint.
-
-The following is an example of how you can save and load a checkpoint:
-
-.. code:: python
-
-    # Original model and optimizer
-    model = MyModel(...)
-    optimizer = MyOpt(...)
-
-    # model parallel wrapper
-    model = smp.DistributedModel(model)
-    optimizer = smp.DistributedOptimizer(optimizer)
-
-    # To save, always save on dp_rank 0 to avoid data racing.
-    # To save the partial model on each mp rank,
-    # the library will create `checkpoint.pt_{mprank}` for each mp rank.
-    if save_partial_model:
-        if smp.dp_rank() == 0:
-            model_dict = model.local_state_dict()  # save the partial model
-            opt_dict = optimizer.local_state_dict()  # save the partial optimizer state
-            smp.save(
-                {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict},
-                "/checkpoint.pt",
-                partial=True,
-            )
-
-    # To save the full model
-    if save_full_model:
-        if smp.dp_rank() == 0:
-            model_dict = model.state_dict()  # save the full model
-            opt_dict = optimizer.state_dict()  # save the full optimizer state
-            smp.save(
-                {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict},
-                "/checkpoint.pt",
-                partial=False,
-            )
-
-    # To load, load on all ranks.
-    # The only difference for partial/full loading is the partial flag in smp.load
-    # Load partial checkpoint
-    if partial_checkpoint:
-        checkpoint = smp.load("/checkpoint.pt", partial=True)
-        model.load_state_dict(checkpoint["model_state_dict"])
-        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
-    # Load full checkpoint
-    if full_checkpoint:
-        checkpoint = smp.load("/checkpoint.pt", partial=False)
-        model.load_state_dict(checkpoint["model_state_dict"])
-        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
diff --git a/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch_tensor_parallel.rst b/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch_tensor_parallel.rst
deleted file mode 100644
index 413fc7cc46..0000000000
--- a/doc/api/training/smp_versions/latest/smd_model_parallel_pytorch_tensor_parallel.rst
+++ /dev/null
@@ -1,835 +0,0 @@
-.. _smdmp-pytorch-tensor-parallel:
-
-PyTorch API for Tensor Parallelism
-==================================
-
-SageMaker distributed tensor parallelism works by replacing specific submodules
-in the model with their distributed implementations. The distributed modules
-have their parameters and optimizer states partitioned across tensor-parallel
-ranks, and are designed to compute the same output as the original modules
-would.
Since tensor parallelism occurs across data-parallel -ranks, a rank might collect slices of the activations corresponding to the -data shards on other devices that are part of the same tensor parallelism group. - -You can enable or disable tensor parallelism for specific parts of the model. -Within the enabled parts, the replacements with distributed modules will take -place on a best-effort basis for those module supported for tensor parallelism. -Alternatively, you can directly import and use the library’s distributed -modules in the model definition. - -Some of the supported modules (such as ``smp.nn.Transformer``) are high-level -blocks that contain many operations. Because custom implementations -(as opposed to the built-in PyTorch modules) are typically used for these -high-level blocks, the library offers an API that you can use to register -specific distributed versions with such custom modules (provided that they -are functionally equivalent). This allows the library to automatically replace -the occurrences of such PyTorch modules with their distributed counterparts -provided by the library. -For more information, see the following topics. - -.. contents:: Topics - :depth: 3 - :local: - -.. _registering-tp-modules: - -Registering Tensor Parallelism Distributed Modules --------------------------------------------------- - -Although PyTorch natively provides some of the commonly used (and -tensor-parallelizable) building blocks such as Transformer, users often -use custom implementations for such higher-level modules. To distribute -such modules with tensor parallelism, you need to register the -distributed modules to the custom module implementation in your class, -so that the library knows how to distribute the custom module. When you -register the distributed modules, make sure the custom module that you -use is functionally equivalent to the distributed module. You can verify -this by taking a look at the equivalent reference implementations in the -:ref:`smdmp-tp-appendix`. -These implementations are functionally equivalent to their distributed -versions in ``smp.nn`` module. - -.. decorator:: @smp.tp_register(dist_module, init_hook=None, forward_hook=None, return_hook=None) - - - A class decorator that registers the ``dist_module`` class with - the module class that it is attached to. The hooks can be used to - adapt to different interfaces used with ``__init__`` and - ``forward`` methods. - - **Arguments:** - - - ``dist_module``: A subclass of ``smp.nn.DistributedModule`` - that implements the distributed version of the module class the - decorator is attached to. Any distributed module class defined - in ``smp.nn`` module can be used. - - ``init_hook``: A callable that translates the arguments of the - original module ``__init__`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``__init__`` method. Must return a tuple, - whose first element is an iterable representing the positional - arguments, and second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``__init__`` method (including argument order and default - values), except it must exclude ``self``. - - ``forward_hook``: A callable that translates the arguments of - the original module ``forward`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``forward`` method. 
Must return a tuple, - whose first element is an iterable representing the positional - arguments, and second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``forward`` method (including argument order and default - values), except it must exclude ``self``. - - ``return_hook``: A callable that translates the object returned - from the distributed module to the return object expected of - the original module. - - - **Example:** - - .. code:: python - - init_hook = lambda config: ((), config.to_dict()) - - # register smp.nn.DistributedTransformer - # as the distributed version of MyTransformer - @smp.tp_register(smp.nn.DistributedTransformer, init_hook=init_hook) - class MyTransformer(nn.Module): - def __init__(self, config): - ... - - def forward(self, hidden_states, attention_mask): - ... - -.. function:: smp.tp_register_with_module(module_cls, dist_module, init_hook=None, forward_hook=None, return_hook=None) - - - When you do not have direct access to model definition code, you - can use this API to similarly register a distributed module with - an existing module class. - - - **Arguments:** - - - ``module_cls``: The existing module class that will be - distributed. - - ``dist_module``: A subclass of ``smp.nn.DistributedModule`` - that implements the distributed version of the module class the - decorator is attached to. Any distributed module class defined - in ``smp.nn`` module can be used. - - ``init_hook``: A callable that translates the arguments of the - original module ``__init__`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``__init__`` method. Must return a tuple, - whose first element is an iterable representing the positional - arguments, and second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``__init__`` method (including argument order and default - values), except it must exclude ``self``. - - ``forward_hook``: A callable that translates the arguments of - the original module ``forward`` method to an ``(args, kwargs)`` - tuple compatible with the arguments of the corresponding - distributed module ``forward`` method. Must return a tuple, - whose first element is an iterable representing the positional - arguments, and second element is a ``dict`` representing the - keyword arguments. The input signature of the ``init_hook`` - must **exactly** match the signature of the original - ``forward`` method (including argument order and default - values), except it must exclude ``self``. - - ``return_hook``: A callable that translates the object returned - from the distributed module to the return object expected of - the original module. - - - **Example:** - - .. code:: python - - from somelibrary import MyTransformer - - init_hook = lambda config: ((), config.to_dict()) - - # register smp.nn.DistributedTransformer as the distributed version of MyTransformer - smp.tp_register_with_module(MyTransformer, - smp.nn.DistributedTransformer, - init_hook=init_hook) - -.. _smdmp-supported-modules-for-tp: - -Supported Modules for Tensor Parallelism ----------------------------------------- - -The following modules are supported for tensor -parallelism. 
- -- ``smp.nn.DistributedLinear`` (implements ``nn.Linear``) -- ``smp.nn.DistributedTransformerLMHead`` -- ``smp.nn.DistributedTransformer`` -- ``smp.nn.DistributedTransformerLayer`` -- ``smp.nn.DistributedAttentionLayer`` -- ``smp.nn.DistributedTransformerOutputLayer`` -- ``smp.nn.DistributedEmbedding`` - -.. contents:: Topics - :depth: 3 - :local: - -.. _tp-module-api: - -Tensor Parallelism Module APIs -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. class:: smp.nn.DistributedLinear(in_features, out_features) - - - Tensor-parallel implementation of the ``nn.Linear`` class. - Functionally equivalent to an ``nn.Linear`` module with the same - ``in_features`` and ``out_features``. In other words, - ``in_features`` and ``out_features`` are the number of *global* - channels across tensor-parallel ranks. - - **Arguments:** - - - ``in_features``: The total number of input channels for the - linear layer across all tensor-parallel ranks. - - ``out_features``: The total number of output channels for the - linear layer across all tensor-parallel ranks. - -.. class:: smp.nn.DistributedTransformerLMHead(num_layers=12, num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, vocab_size=30522, num_positions=1024, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, num_token_types=0, causal_mask_size=None, add_cross_attention=False, add_lm_head=True, initializer_range=0.02, use_normal_initialization=False, pre_layernorm=False, post_layernorm=True) - - - Constructs a distributed transformer model, including embeddings - and a single LM head. A word embedding of size - ``(vocab_size, hidden_size)`` is created, as well as a positional - embedding of size ``(num_positions, hidden_size)``, and the - embeddings are added together. If ``num_token_types`` is larger - than 0, a separate embedding of size - ``(num_token_types, hidden_size)`` is created, and further added - on top. - - The embeddings are fed through a ``DistributedTransformer``, and - if ``add_lm_head`` is ``True``, the output passes through a single - LM head, which is a linear module without bias whose weight is - tied to the word embeddings. - - See ``DistributedTransformerLayer`` for a description of the rest - of the arguments. - - **Methods:** - - - ``forward(self, inputs)`` - - - If ``add_cross_attention`` is ``True``, ``inputs`` must be a - tuple - ``(input_ids, attention_mask, token_type_ids, position_ids, cross_states, cross_states, cross_mask, labels)``. - - Otherwise, ``inputs`` must be a tuple - ``(input_ids, attention_mask, token_type_ids, position_ids, labels)``. - - If ``token_type_ids`` is ``None``, token type embedding will - not be used. - - ``input_ids`` is assumed to be of shape ``[N, S]``, where - ``N`` is the batch size and ``S`` is sequence length. - - ``attention_mask`` is assumed to be a 0-1 tensor of shape - ``[N, S]``, where 1 represents a masked position. - -.. class:: smp.nn.DistributedTransformer(num_layers=12, num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, causal_mask_size=None, add_cross_attention=False, pre_layernorm=False, post_layernorm=True) - - - A sequence of ``smp.nn.DistributedTransformerLayer``\ s, whose - number is given by ``num_layers`` argument. For the other - arguments and methods, refer to - ``smp.nn.DistributedTransformerLayer``. 
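-
-   For orientation, the following is a minimal sketch (not part of the
-   original reference; dimension values are placeholders) of declaring such a
-   stack inside a model definition, following the direct-use pattern shown in
-   :ref:`enabling-tp`:
-
-   .. code:: python
-
-      import torch.nn as nn
-      import smdistributed.modelparallel.torch as smp
-
-      class Encoder(nn.Module):
-          def __init__(self):
-              super().__init__()
-              # num_layers and the attention/hidden dimensions are the global
-              # values across all tensor-parallel ranks.
-              self.transformer = smp.nn.DistributedTransformer(
-                  num_layers=24,
-                  num_attention_heads=16,
-                  attention_head_size=64,
-                  hidden_size=1024,
-                  intermediate_size=4096,
-                  activation="gelu",
-              )
-
-          def forward(self, hidden_states, attention_mask):
-              # The forward call takes and returns a
-              # (hidden_states, attention_mask) tuple, following the
-              # DistributedTransformerLayer contract described below.
-              hidden_states, _ = self.transformer((hidden_states, attention_mask))
-              return hidden_states
-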
- - If both ``pre_layernorm`` and ``post_layernorm`` are ``True``, - layer normalization is applied to both the input and the output of - the ``DistributedTransformer``, in addition to the intermediate - attention and transformer-output layers. - -.. class:: smp.nn.DistributedTransformerLayer(num_attention_heads=32, attention_head_size=32, hidden_size=1024, intermediate_size=4096, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, causal_mask_size=None, add_cross_attention=False, pre_layernorm=False, post_layernorm=True) - - - Tensor-parallel implementation of a single transformer layer. - Number of attention heads, hidden size, and intermediate size - refer to the global quantities across all tensor-parallel ranks. - - **Arguments:** - - - ``num_attention_heads``: The total number of attention heads - across tensor-parallel ranks - - ``attention_head_size``: The number of channels of a single - attention head. - - ``hidden_size``: The hidden dimension of the transformer. The - input tensor ``hidden_states`` is assumed to have its last - dimension size equal to ``hidden_size``. - - ``intermediate_size``: The number of output channels in the - first linear transformation of the transformer output layer. - ``DistributedTransformerOutputLayer`` first maps - ``hidden_size`` dimensions of its input tensor into - ``intermediate_size`` dimensions, and then maps it back into - ``hidden_size`` dimensions. - - ``attention_dropout_prob``: The dropout probability applied to - the attention probabilities. - - ``hidden_dropout_prob``: The dropout probability used in - dropout layers other than the one applied to the attention - probabilities. - - ``activation``: Choice of activation function to use at the - output layer. Must be ``"gelu"`` or ``"relu"``. - - ``layernorm_epsilon``: The epsilon added to the denominator of - layer normalization for numerical stability. - - ``initializer_range``: If ``use_normal_initialization`` is - ``True``, the standard deviation of the normal random variable - to initialize the weights with. - - ``use_normal_initialization``: If ``True``, the weights are - initialized with normal distribution with standard deviation - given by ``initializer_range``. Otherwise, default PyTorch - initialization is used. - - ``causal_mask_size``: If ``None``, no causal mask is used on - attentions. Otherwise, should be set to maximum sequence length - to apply a causal mask to the attention scores. This is used, - for instance, in GPT-2. - - ``add_cross_attention``: If ``True``, a cross-attention layer - will be added after the self-attention block. The - cross-attention layer computes the attention keys and values - based on the ``cross_states`` input (instead of - ``hidden_states`` input, as in self-attention. This is used in - the decoder block of encoder-decoder architectures. For - encoder-only architectures that only use self-attention, this - should be kept ``False``. - - ``pre_layernorm``: If ``True``, inserts layer normalization at - the input. At least one of ``pre_layernorm`` and - ``post_layernorm`` must be ``True``. - - ``post_layernorm``: If ``True``, inserts layer normalization at - the output. At least one of ``pre_layernorm`` and - ``post_layernorm`` must be ``True``. - - - **Methods:** - - - ``forward(self, inputs)``: Forward pass for the transformer - layer. 
- - - **Arguments:** - - - If ``add_cross_attention=False``, ``inputs`` must be a - tuple ``(hidden_states, attention_mask)``, where - ``hidden_states`` is assumed to be a tensor of dimensions - ``[N, S, H]``, where ``N`` is batch size, ``S`` is - sequence length, and ``H`` is ``hidden_size``. - ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S]``, where ``N`` is the batch - size, and ``S`` is the sequence length. - - If ``add_cross_attention=True``, ``inputs`` must be a - tuple - ``(hidden_states, cross_states, attention_mask, cross_mask)``, - where ``hidden_states`` is assumed to be a tensor of - dimensions ``[N, S_1, H]``, where ``N`` is batch size, - ``S_1`` is sequence length, and ``H`` is ``hidden_size``. - ``cross_states`` is assumed to be a tensor of size - ``[N, S_2, H]``, similarly interpreted. - ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S_1]``, where ``N`` is the batch - size, and ``S_1`` is the sequence length, and - ``cross_mask`` is assumed to be a tensor of size - ``[N, 1, 1, S_2]``. Keys and values for the attention - heads in the cross-attention layer (but not the - self-attention layer) are computed using - ``cross_states``, and ``cross_mask`` is applied as the - attention mask in the cross-attention layer (but not the - self-attention layer). - - - **Returns:** - - - If ``add_cross_attention=False``, a tuple - ``(hidden_states, attention_mask)``, where - ``hidden_states`` is the output of the transformer, and - ``attention_mask`` is the same the ``attention_mask`` - argument. - - If ``add_cross_attention=True``, a tuple - ``(hidden_states, cross_states, attention_mask, cross_mask)``, - where ``hidden_states`` is the output of the transformer, - and the next three tensors are the same as the input - arguments. - -.. class:: smp.nn.DistributedAttentionLayer(num_attention_heads=32, attention_head_size=32, hidden_size=1024, attention_dropout_prob=0.1, hidden_dropout_prob=0.1, layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, cross_attention=False, causal_mask_size=None, pre_layernorm=False, post_layernorm=True) - - - A distributed implementation for the attention block. Includes the - computation of the self- or cross-attention (context layer), - followed by a linear mapping and dropout, which is optionally - followed by the residual-connection and layer normalization. - - **Arguments:** - - - See ``DistributedTransformerLayer`` for a description of the - arguments. - - If ``cross_attention`` is ``True``, computes the attentions - with respect to the ``cross_states`` tensor of the ``forward`` - method input tuple. - - - **Methods:** - - - ``forward(self, inputs)``: Forward pass for the attention - layer. - - - **Arguments:** - - - If ``cross_attention=False``, ``inputs`` must be a tuple - ``(hidden_states, attention_mask)``, where - ``hidden_states`` is assumed to be a tensor of dimensions - ``[N, S, H]``, where ``N`` is batch size, ``S`` is - sequence length, and ``H`` is ``hidden_size``. - ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S]``, \***\* where ``N`` is the - batch size, and ``S`` is the sequence length. - - If ``cross_attention=True``, ``inputs`` must be a tuple - ``(hidden_states, cross_states, attention_mask)``, where - ``hidden_states`` is assumed to be a tensor of dimensions - ``[N, S_1, H]``, where ``N`` is batch size, ``S_1`` is - sequence length, and ``H`` is ``hidden_size``. 
- ``cross_states`` is assumed to be a tensor of size - ``[N, S_2, H]``, similarly interpreted. - ``attention_mask`` is assumed to be a tensor of - dimensions ``[N, 1, 1, S_2]``, where ``N`` is the batch - size, and ``S_2`` is the sequence length. Keys and values - for the attention heads are computed using - ``cross_states``. - - - **Returns:** - - - A single tensor that is the output of the attention - layer. - -.. class:: smp.nn.DistributedTransformerOutputLayer(hidden_size=1024, intermediate_size=4096, hidden_dropout_prob=0.1, activation="gelu", layernorm_epsilon=1e-5, initializer_range=0.02, use_normal_initialization=False, pre_layernorm=False, post_layernorm=True) - - - Distributed implementation of a single transformer output layer. A - single ``DistributedTransformerLayer`` with - ``add_cross_attention=False`` consists of a single - ``DistributedAttentionLayer`` immediately followed by a single - ``DistributedTransformerOutputLayer``. The latter linearly maps - the last channel of the input tensor from ``hidden_size`` to - ``intermediate_size``, and then maps it back to ``hidden_size``. - - **Arguments:** - - - See ``DistributedTransformerLayer`` for a description of the - arguments. - -.. class:: smp.nn.DistributedEmbedding(num_embeddings,embedding_dim, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False, _weight=None, initializer_range=0.02, _skip_allgather=False,_skip_scatter_and_merge=False,) - - - Distributed implementation of a single Embedding Layer. Currently - only supports splitting across the embedding_dim. - - **Arguments:** - - - See ``DistributedEmbedding`` for a description of the - arguments. - -.. _enabling-tp: - -Enabling Tensor Parallelism -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -There are two ways tensor parallelism can be enabled. - -First, you can use -the distributed module implementations in ``smp.nn`` module directly in -your model definition. See :ref:`smdmp-supported-modules-for-tp` -for a complete list of built-in distributed modules. Here is an example -of how this can be done: - -.. code:: python - - import torch.nn as nn - import smdistributed.modelparallel.torch as smp - - class TransformerModel: - def __init__(self): - self.embedding = nn.Embedding(vocab_size, hidden_size) - - # directly instantiate smp.nn.DistributedTransformer and use it - self.encoder = smp.nn.DistributedTransformer(num_layers, hidden_size, **kwargs) - - self.pooler = nn.Linear(hidden_size, hidden_size) - - def forward(self, hidden_states): - emb_out = self.embedding(hidden_states) - enc_out = self.encoder(emb_out) - return self.pooler(enc_out) - -Second, you can enable tensor parallelism for specific modules or blocks -of code, which will automatically enable tensor parallelism for the -supported modules within that scope. To do this, you can use the -following API: - -.. decorator:: smp.tensor_parallelism(enabled=True, **kwargs) - - - A context manager that enables or disables tensor parallelism for - any supported module that is created inside. If there are nested - contexts, the innermost will override the rest. If there are - multiple supported modules created within the context, where one - is the submodule of the other, only the outermost module will be - distributed. 
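-
-  The following hypothetical sketch illustrates the outermost-module rule
-  described above; it assumes ``MyTransformer`` has been registered as
-  described in :ref:`registering-tp-modules` (the library's own
-  enabled/disabled example appears below):
-
-  .. code:: python
-
-     class MyTransformer(nn.Module):
-         def __init__(self):
-             super().__init__()
-             # nn.Linear is itself supported for tensor parallelism, but it is
-             # created inside a supported module here.
-             self.proj = nn.Linear(1024, 1024)
-
-     with smp.tensor_parallelism():
-         # MyTransformer is the outermost supported module created in this
-         # context, so only it is distributed; the nn.Linear created inside it
-         # is not separately replaced.
-         block = MyTransformer()
-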
If a supported module shares weights with another - (supported or unsupported) module, or if its hyperparameters do - not support distribution (e.g., not divisible by the tensor - parallelism degree), tensor parallelism will **not** be enabled - for this module even if this API is used. - - **Example:** - - .. code:: python - - with smp.tensor_parallelism(): - self.m0 = nn.Linear(20, 20) # will be distributed - with smp.tensor_parallelism(enabled=False): - self.m1 = nn.Linear(20, 20) # will not be distributed - - - Keyword arguments `kwargs` can be used to modify the configurations of the distributed modules created inside the context. If a keyword argument provided here matches any `__init__` method arguments of a `DistributedModule` that substitutes a module created inside the `smp.tensor_parallelism` context, this keyword will override the value defined in the `init_hook`. - -.. function:: smp.set_tensor_parallelism(module, enabled=True, **kwargs) - - - Enables or disables tensor parallelism for the supported - submodules of ``module``. If enabling, the outermost supported - modules will be distributed. If disabling, tensor parallelism will - be disabled for the entire module subtree of ``module``. Unlike - the context manager, this API can be used after the model creation - (but before wrapping with :class:`smp.DistributedModel`), so direct - access to model definition code is not required. If a supported - module shares weights with another (supported or unsupported) - module, or if its hyperparameters do not support distribution - (e.g., not divisible by the tensor parallelism degree), tensor - parallelism will **not** be enabled for this module. - - Keyword arguments ``kwargs`` can be used to modify the - configurations of the distributed modules created inside the - context. If a keyword argument provided here matches any - ``__init__`` method arguments of a :class:`smp.DistributedModel` that - substitutes a module created inside the ``smp.tensor_parallelism`` - context, this keyword will override the value defined in the - ``init_hook``. - - **Example:** - - .. code:: python - - model = MyModel() - smp.set_tensor_parallelism(model.encoder, True) - smp.set_tensor_parallelism(model.encoder.embedding, True) - - # outermost supported submodules in model.encoder will be distributed, except for - # model.encoder.embedding - model = smp.DistributedModel(model) - optimizer = smp.DistributedOptimizer(optimizer) - -.. _activation-checkpointing-api: - -Activation Checkpointing APIs ------------------------------ - -``smdistributed.modelparallel`` provides three APIs to enable -activation checkpointing: one for checkpointing modules, -one for checkpointing sequential modules, and -one for checkpointing pretrained models. - -For a conceptual guide and examples, see -`Activation Checkpointing `_ -in the *SageMaker's Distributed Model Parallel developer guide*. - -.. class:: smdistributed.modelparallel.torch.patches.checkpoint.checkpoint(module, *args, preserve_rng_state=True) - - - Checkpoints the module passed. Throws error if, during manual - partitioning, all children of module are not on same rank as the - module itself, i.e. the module tree is split across multiple - partitions. During auto-partitioning, if the module is split - across multiple partitions, then this call is ignored(with a - warning). Note that this call applies to the module instance only, - not to the module class. - - - **Arguments:** - - - ``module (Instance of nn.Module)``: The module to be - checkpointed. 
Note that unlike native checkpointing in - PyTorch’s, activation checkpointing in - ``smdistributed.modelparallel`` is at the granularity of a - module. A generic function cannot be passed here. - - ``args``: Tuple containing inputs to the module. - - ``preserve_rng_state (bool, default=True)``: Omit stashing and - restoring the RNG state during each checkpoint. - -.. class:: smdistributed.modelparallel.torch.patches.checkpoint.checkpoint_sequential(sequential_module, input, strategy="each", preserve_rng_state=True, pack_args_as_tuple=False) - - - Checkpoints the modules inside - `nn.Sequential `__. - This can be used even if different layers that are part of the - sequential container lie on different partitions. Each layer part - of the sequential module that is checkpointed must lie completely - within one partition. If this is not the case during manual - partitioning, then an error will be thrown. If this is not the - case during auto partitioning, a warning will be raised and this - module will be run without checkpointing. - - - **Arguments** - - - ``sequential_module (nn.Sequential)``: the sequential module to - be checkpointed. - - ``input (torch.Tensor or a tuple of torch.Tensors)``: input to - the module, which can be a tensor or a tuple of tensors. If a - tuple is passed, then pack_args_as_tuple should be set to True. - - ``strategy (string, default=“each”)`` : Strategy determines how - many layers part of the sequential module need to be grouped - together for one checkpointing call. This determines how much - memory can be reduced. It can take the following values - - - ``each`` : The default is to checkpoint each module inside - the sequential separately. - - ``contiguous``: Groups consecutive layers on the same - partition together. For example, if a sequential consists of - [a, b, c, d] where a,b are on pp_rank0 and c,d are on - pp_rank 1, then this strategy would checkpoint a,b together - and then c,d together. This means effectively, inputs of a, - outputs of b, inputs of c, and outputs of d are in memory; - the reamining activations are recomputed. - - ``group_2, group_3, group_4, etc:`` More generally, - ``group_x`` where x is an integer. This strategy provides - more flexibility in how many layers to group together. - ``group_x`` groups x layers together on a best effort basis. - It can group x layers together if there are x layers - consecutively on the same partition. For example: - [a,b,c,d,e] where a,b are on pp_rank0 and c,d,e are on - pp_rank 1. If the strategy is ``group_3,`` then a,b are - checkpointed together on pp_rank0 and c,d,e are checkpointed - together on pp_rank1. - - - ``preserve_rng_state (bool, default=True)``: Set to ``False`` - to omit stashing and restoring the RNG state during each - checkpoint. - - ``pack_args_as_tuple (bool, default=False)``: To ensure that - backward works correctly, the autograd function has to unpack - any tuples received. If the checkpointed layer takes a tuple as - input, then this needs to be set to True. - -.. class:: smp.set_activation_checkpointing(module, preserve_rng_state=True, pack_args_as_tuple=False, strategy="each") - - - This API is recommended when importing pretrained models from - libraries, such as PyTorch and Hugging Face Transformers. This is - particularly useful when you don’t have access to the model - definition code and not be able to replace a module call with - checkpoint. - - - **Arguments**: - - - ``module (Instance of nn.Module or nn.Sequential)``: The module - to checkpoint. 
- - ``preserve_rng_state (bool, default=True)``: Set to ``False`` - to omit stashing and restoring the RNG state during each - checkpoint. - - ``pack_args_as_tuple (bool, default=False)``: *Can only be - passed when module is a sequential module.* To ensure that - backward works correctly, the autograd function has to unpack - any tuples received. If the layer checkpointed takes a tuple as - input, then this needs to be set to True. - - ``strategy: (string, default=“each”)``: *Can only be passed - when module is a sequential module.* Strategy determines how - many layers part of the sequential module need to be grouped - together for one checkpointing call. - - This determines how much memory can be reduced. It can take the - following values - - - ``each`` : The default is to checkpoint each module inside - the sequential separately. - - ``contiguous``: Groups consecutive layers on the same - partition together. For example if a sequential consists of - ``[a, b, c, d]`` where ``a, b`` are on ``pp_rank0`` and ``c, d`` are on - ``pp_rank 1``, then this strategy would checkpoint a,b together - and then ``c, d`` together. This means effectively, the inputs of - ``a``, outputs of ``b``, inputs of ``c``, and outputs of ``d`` are in - memory, and the rest of the activations are recomputed. - - ``group_2, group_3, group_4, etc:`` More generally, - ``group_x`` where x is an integer. This strategy provides - more flexibility in how many layers to group together. - ``group_x`` groups x number of layers together on a best - effort basis if there are x layers consecutively in the same - partition. **Example**: Assume a module with layers ``[a, b, - c, d, e]``. The layers a and b are on pp_rank0, and ``c``, ``d``, and - ``e`` are on ``pp_rank 1``. If the strategy is ``group_3,`` then ``a``, - ``b`` are checkpointed together on ``pp_rank0``, and ``c``, ``d``, ``e`` are - checkpointed together on ``pp_rank1``. - -.. _smdmp-tp-appendix: - -Appendix: Reference Implementations for Modules ------------------------------------------------ - -The following are reference implementations for transformer-related -modules. Note that this is not the actual ``smdistributed`` source code, -but the distributed implementations provided in the library are the -distributed versions of these reference implementations, and can be used -to determine whether the distributed modules perform the same operations -as the custom modules in your script. - -To keep the implementations simple, we only assume keyword arguments, -and assume the existence of a method ``parse_args(kwargs)``, which -parses the arguments to ``__init__`` methods and sets the relevant -attributes of the module, such as ``hidden_size`` and -``num_attention_heads``. - -``smp.nn.DistributedTransformer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: python - - class Transformer(nn.Module): - def __init__(self, **kwargs): - super(Transformer, self).__init__() - self.parse_args(kwargs) - - self.layers = [] - for l in range(self.num_layers): - self.layers.append(TransformerLayer(**kwargs)) - - self.seq_layers = nn.Sequential(*self.layers) - - def forward(self, inp): - return self.seq_layers(inp) - -``smp.nn.DistributedTransformerLayer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
code:: python - - class TransformerLayer(nn.Module): - def __init__(self, **kwargs): - super(TransformerLayer, self).__init__() - self.parse_args(kwargs) - - self.attention = AttentionLayer(**kwargs) - self.output = TransformerOutputLayer(**kwargs) - - if self.add_cross_attention: - self.cross_attention = AttentionLayer(cross_attention=True, **kwargs) - - def forward(self, inp): - if self.add_cross_attention: - hidden_states, cross_states, attention_mask, cross_mask = inp - else: - hidden_states, attention_mask = inp - - attention_output = self.attention((hidden_states, attention_mask)) - if self.add_cross_attention: - attention_output = self.cross_attention((attention_output, - cross_states, - cross_mask)) - - output = self.output(attention_output) - - if self.add_cross_attention: - return output, cross_states, attention_mask, cross_mask - else: - return output, attention_mask - -``smp.nn.DistributedAttentionLayer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: python - - class AttentionLayer(nn.Module): - def __init__(self, **kwargs): - super(AttentionLayer, self).__init__() - self.parse_args(kwargs) - self.attention_head_size = self.hidden_size // self.num_attention_heads - - self.query = nn.Linear(self.hidden_size, self.hidden_size) - self.key = nn.Linear(self.hidden_size, self.hidden_size) - self.value = nn.Linear(self.hidden_size, self.hidden_size) - self.dense = nn.Linear(self.hidden_size, self.hidden_size) - - self.dropout1 = nn.Dropout(self.attention_dropout_prob) - self.dropout2 = nn.Dropout(self.hidden_dropout_prob) - - if self.pre_layernorm: - self.pre_layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - if self.post_layernorm: - self.layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - def transpose(self, tensor, key=False): - shape = tensor.size()[:-1] + - (self.num_attention_heads, self.attention_head_size) - tensor = torch.reshape(tensor, shape) - if key: - return tensor.permute(0, 2, 3, 1) - else: - return tensor.permute(0, 2, 1, 3) - - def forward(self, inp): - if self.cross_attention: - hidden_states, cross_states, attention_mask = inp - else: - hidden_states, attention_mask = inp - - if self.pre_layernorm: - norm_states = self.pre_layernorm(hidden_states) - else: - norm_states = hidden_states - - query_layer = self.query(norm_states) - - if self.cross_attention: - key_layer = self.key(cross_states) - value_layer = self.value(cross_states) - else: - key_layer = self.key(norm_states) - value_layer = self.value(norm_states) - - query_layer = self.transpose(query_layer) - key_layer = self.transpose(key_layer, key=True) - value_layer = self.transpose(value_layer) - - attention_scores = torch.matmul(query_layer, key_layer) - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - - if not self.cross_attention and self.causal_mask is not None: - attention_scores = self.apply_causal_mask(attention_scores) - - attention_scores = attention_scores + attention_mask - - attention_probs = F.softmax(attention_scores, dim=-1) - attention_probs = self.dropout1(attention_probs) - - context_layer = torch.matmul(attention_probs, value_layer) - context_layer = context_layer.permute(0, 2, 1, 3) - new_context_layer_shape = context_layer.size()[:-2] + \ - (self.local_attention_size,) - context_layer = torch.reshape(context_layer, new_context_layer_shape) - - self_attention = self.dense(context_layer) - self_attention = self.dropout2(self_attention) - - if self.post_layernorm: - return self.layernorm(self_attention + 
hidden_states) - else: - return self_attention - -``smp.nn.DistributedTransformerOutputLayer`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: python - - class TransformerOutputLayer(nn.Module): - def __init__(self, **kwargs): - super(TransformerOutputLayer, self).__init__() - self.parse_args(kwargs) - - self.dense1 = nn.Linear(self.hidden_size, self.intermediate_size) - self.dense2 = nn.Linear(self.intermediate_size, self.hidden_size) - - self.dropout = nn.Dropout(self.attention_dropout_prob) - - if self.pre_layernorm: - self.pre_layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - if self.post_layernorm: - self.layernorm = nn.LayerNorm(self.hidden_size, - eps=self.layernorm_epsilon) - - def forward(self, inp): - if self.pre_layernorm: - norm_inp = self.pre_layernorm(inp) - else: - norm_inp = inp - - dense1_output = self.dense1(norm_inp) - if self.activation == "gelu": - act_output = F.gelu(dense1_output) - else: - act_output = F.relu(dense1_output) - - dense2_output = self.dense2(act_output) - output = self.dropout(dense2_output) - - if self.post_layernorm: - return self.layernorm(inp + output) - else: - return output diff --git a/doc/api/training/smp_versions/latest/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/latest/smd_model_parallel_tensorflow.rst deleted file mode 100644 index 7f21f7a557..0000000000 --- a/doc/api/training/smp_versions/latest/smd_model_parallel_tensorflow.rst +++ /dev/null @@ -1,165 +0,0 @@ -TensorFlow API -============== - -To use the TensorFlow-specific APIs for SageMaker distributed model parallism, -you need to add the following import statement at the top of your training script. - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -.. tip:: - - Refer to - `Modify a TensorFlow Training Script - `_ - to learn how to use the following APIs in your TensorFlow training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of the Keras \ ``Model`` class, which defines the model to - be partitioned. Model definition is done by sub-classing - ``smp.DistributedModel`` class, and implementing the ``call()`` method, - in the same way as the Keras model sub-classing API. Any operation that - is part of the \ ``smp.DistributedModel.call()`` method is subject to - partitioning, meaning that every operation placed inside executes in - exactly one of the devices (the operations outside run on all devices). - - - Similar to the regular Keras API, the forward pass is done by directly - calling the model object on the input tensors. For example: - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - However, ``model()`` calls can only be made inside a - ``smp.step``-decorated function. - - The outputs from a ``smp.DistributedModel`` are available in all ranks, - regardless of which rank computed the last operation. - - **Methods:** - - .. function:: save_model(save_path="/opt/ml/model") - - **Inputs** - - ``save_path`` (``string``): A path to save an unpartitioned model with latest training weights. - - Saves the entire, - unpartitioned model with the latest trained weights to ``save_path`` in - TensorFlow ``SavedModel`` format. Defaults to ``"/opt/ml/model"``, which - SageMaker monitors to upload the model artifacts to Amazon S3. - -.. function:: smp.partition(index) - - **Inputs** - - - ``index`` (``int``): The index of the partition. 
- - A context manager which places all operations defined inside into the - partition whose ID is equal to ``index``. When - ``smp.partition`` contexts are nested, the innermost context overrides - the rest. The ``index`` argument must be smaller than the number of - partitions. - - ``smp.partition`` is used in the manual partitioning API; - if \ ``"auto_partition"`` parameter is set to ``True`` while launching - training, then ``smp.partition`` contexts are ignored. Any operation - that is not placed in any ``smp.partition`` context is placed in the - ``default_partition``, as shown in the following example: - - .. code:: python - - # auto_partition: False - # default_partition: 0 - smp.init() - [...] - x = tf.constant(1.2)                     # placed in partition 0 - with smp.partition(1): -     y = tf.add(x, tf.constant(2.3))      # placed in partition 1 -     with smp.partition(3): -         z = tf.reduce_sum(y)             # placed in partition 3 - - -.. function:: register_post_partition_hook(hook) - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. - - .. code:: python - - @smp.register_post_partition_hook - def test_eager(): - # All statements here will be executed right after partition but before the first forward pass - tf.print("Entered hook through eager context") - -.. class:: smp.CheckpointManager - - - A subclass of TensorFlow - `CheckpointManager `__, - which is used to manage checkpoints. The usage is similar to TensorFlow - ``CheckpointManager``. - - The following returns a ``CheckpointManager`` object. - - .. code:: python - - smp.CheckpointManager(checkpoint, -                       directory="/opt/ml/checkpoints", -                       max_to_keep=None, -                       checkpoint_name="ckpt") - - **Parameters** - - - ``checkpoint``: A `tf.train.Checkpoint - `__ instance - that represents a model checkpoint. - - - ``directory``: (``str``) The path to a directory in which to write - checkpoints. A file named "checkpoint" is also written to this - directory (in a human-readable text format) which contains the state - of the ``CheckpointManager``. Defaults to - ``"/opt/ml/checkpoints"``, which is the directory that SageMaker - monitors for uploading the checkpoints to Amazon S3. - - ``max_to_keep`` (``int``): The number of checkpoints to keep. If - ``None``, all checkpoints are kept. - - ``checkpoint_name`` (``str``): Custom name for the checkpoint file. - Defaults to ``"ckpt"``. - - - **Methods:** - - .. function:: save( ) - - Saves a new checkpoint in the specified directory. Internally uses ``tf.train.CheckpointManager.save()``. - - .. function:: restore( ) - - Restores the latest checkpoint in the specified directory. - Internally uses ``tf.train.CheckpointManager.restore()``. - - - **Examples:** - - .. code:: python - - checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) - ckpt_manager = smp.CheckpointManager(checkpoint, max_to_keep=5)  # use /opt/ml/checkpoints - - for inputs in train_ds: -     loss = train_step(inputs) -     # [...] -     ckpt_manager.save()  # save a new checkpoint in /opt/ml/checkpoints - - .. 
code:: python - - for step, inputs in enumerate(train_ds): -     if step == 0: -         ckpt_manager.restore() -     loss = train_step(inputs) diff --git a/doc/api/training/smp_versions/model-data-parallel.png b/doc/api/training/smp_versions/model-data-parallel.png deleted file mode 100644 index 089b84673a..0000000000 Binary files a/doc/api/training/smp_versions/model-data-parallel.png and /dev/null differ diff --git a/doc/api/training/smp_versions/v1.1.0/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/v1.1.0/smd_model_parallel_common_api.rst deleted file mode 100644 index 8a8e87252e..0000000000 --- a/doc/api/training/smp_versions/v1.1.0/smd_model_parallel_common_api.rst +++ /dev/null @@ -1,485 +0,0 @@ -.. admonition:: Contents - - - :ref:`communication_api` - - :ref:`mpi_basics` - -Common API -========== - -The following SageMaker distribute model parallel APIs are common across all frameworks. - -**Important**: This API document assumes you use the following import statement in your training scripts. - -**TensorFlow** - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -**PyTorch** - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. function:: smp.init( ) - :noindex: - - Initialize the library. Must be called at the beginning of training script. - -.. function:: @smp.step(non_split_inputs, input_split_axes, [*args, **kwargs]) - :noindex: - - A decorator that must be placed over a function that represents a single - forward and backward pass (for training use cases), or a single forward - pass (for evaluation use cases). Any computation that is defined inside - the ``smp.step``-decorated function is executed in a pipelined manner. - - By default, every tensor input to the function is split across its batch - dimension into a number of microbatches specified while launching the - training job. This behavior can be customized through the arguments to - ``smp.step``, described below. The library then orchestrates the execution of - each microbatch across all partitions, based on the chosen pipeline - type. - - In a typical use case, forward pass and back-propagation are executed - inside an \ ``smp.step``-decorated function and gradients, loss, and - other relevant metrics (such as accuracy, etc.) are returned from - ``smp.step``-decorated function. - - Any gradient post-processing operation, such as gradient clipping and - allreduce, as well as ``optimizer.apply_gradients`` calls (for TF) or - ``optimizer.step`` (for PT) should be applied on the gradients returned - from the ``smp.step`` function, and not inside the ``smp.step`` - function. This is because every operation inside ``smp.step`` is - executed once per microbatch, so having these operations inside - ``smp.step`` can either be inefficient (in the case of allreduce), or - lead to wrong results (in the case of ``apply_gradients`` / - ``optimizer.step``). - - If the objects returned from the ``smp.step``-decorated function contain - ``tf.Tensor``\ s / ``torch.Tensor``\ s, they are converted to - ``StepOutput`` objects. A ``StepOutput`` object encapsulates all - versions of the tensor across different microbatches - (see ``StepOutput`` entry for more information). - - The argument to ``smp.step`` decorated function should either be a tensor - or an instance of list, tuple, dict or set for it to be split across - microbatches. If your object doesn't fall into this category, you can make - the library split your object, by implementing ``smp_slice`` method. 
- - Below is an example of how to use it with PyTorch. - - .. code:: python - - class CustomType: - def __init__(self, tensor): - self.data = tensor - - # The library will call this to invoke slicing on the object passing in total microbatches (num_mb) - # and the current microbatch index (mb). - def smp_slice(self, num_mb, mb, axis): - dim_size = list(self.data.size())[axis] - - split_size = dim_size // num_mb - sliced_tensor = self.data.narrow(axis, mb * split_size, split_size) - return CustomType(sliced_tensor, self.other) - - custom_obj = CustomType(torch.ones(4,)) - - @smp.step() - def step(custom_obj): - loss = model(custom_obj) - model.backward(loss) - return loss - - - **Important:** ``smp.step`` splits the batch into microbatches, and - executes everything inside the decorated function once per microbatch. - This might affect the behavior of batch normalization, any operation - that explicitly uses the batch size information, or any other Python - code that is expected to run once. - - **TensorFlow-specific behavior** - - ``smp.step`` is a wrapper that - inherits from and extends the behavior of ``tf.function``, and as such, - all the caveats that apply to the use of ``tf.function``\ s also apply - to ``smp.step``. In particular, any operation that is inside - ``smp.step`` executes in graph mode, and not eager mode. - - In the first call, ``smp.step`` performs tracing of the wrapped function every time - one of the tensor arguments changes their shape or dtype, or for every - new value of a Python argument, if there is one. Tracing is expensive, - so such scenarios should be avoided as much as possible or, - alternatively, an ``input_signature`` argument must be provided. For - more information on the usage of ``tf.function``, refer to the - TensorFlow documentation: - - - https://www.tensorflow.org/api_docs/python/tf/function\ - - https://www.tensorflow.org/guide/function\ - - **Common parameters** - - - ``non_split_inputs`` (``list``): The list of arguments to the decorated function - that should not be split along the batch dimension. Should be used - for all input tensors that do not have a batch dimension. Should be a - list of argument names as ``str``, as they appear in the signature of - the ``smp.step``-decorated function. By default it is considered an - empty list. - - - ``input_split_axes`` (``dict``): A dict that maps the argument name to its batch - axis. The keys should be the argument names as ``str``, as they - appear in the signature of the ``smp.step``-decorated function.  By - default all batch axes are assumed to be the 0-axis. - - **TensorFlow-only parameters** - - - All arguments of ``tf.function``. Note: - The \ ``experimental_compile`` argument of ``tf.function`` may not - work as expected with ``smp.step``, since it interferes with - pipelining and model partitioning. To enable XLA with the library, you can - instead use \ ``tf.config.optimizer.set_jit(True)``. - - **PyTorch-only parameters** - - - ``detach_outputs`` (``bool``) : If ``True``, calls ``torch.Tensor.detach()`` on - all returned ``torch.Tensor`` outputs. Setting it to ``False`` - increases memory consumption, unless ``detach()`` is manually called - on the returned tensors, because the model graph is not cleared from - memory after the training step. Set to \ ``True`` by default. - - **Returns** - - - The same object(s) returned from the decorated function. 
All - returned \ ``tf.Tensor``, \ ``tf.Variable``  objects (for TF) or - ``torch.Tensor`` objects (for PT) are wrapped inside - a \ ``StepOutput`` object, even when they are inside a Python - ``list``, ``tuple``, or ``dict``. - - - -.. class:: StepOutput - :noindex: - - - A class that encapsulates all versions of a ``tf.Tensor`` - or \ ``torch.Tensor`` across all microbatches. - - When a particular ``tf.Tensor`` or ``torch.Tensor`` is computed inside - ``smp.step``, different versions of the tensor are computed for each - microbatch. - - When this tensor is returned from ``smp.step`` and is accessed outside - of the decorated function, it appears as a ``StepOutput`` object, which - contains all such versions. For example, - - - In the case of Tensorflow, the gradient for a particular - ``tf.Variable`` is computed on each microbatch individually, and if - this gradient is returned from ``smp.step``, all gradients for this - ``tf.Variable`` become part of the same ``StepOutput`` object. The - ``StepOutput`` class offers the following API for commonly-used - post-processing operations on such tensors. - - In the case of PyTorch, the loss for each microbatch is computed - individually and all the ``torch.Tensor``\ s that represent the loss - for different microbatches become part of same ``StepOutput`` object, - if loss is returned from the ``smp.step`` function. - - - The ``StepOutput`` class offers the following API for commonly-used - post-processing operations on tensors. - - .. data:: StepOutput.outputs - :noindex: - - Returns a list of the underlying tensors, indexed by microbatch. - - .. function:: StepOutput.reduce_mean( ) - :noindex: - - Returns a ``tf.Tensor``, ``torch.Tensor`` that averages the constituent ``tf.Tensor`` s - ``torch.Tensor`` s. This is commonly used for averaging loss and gradients across microbatches. - - .. function:: StepOutput.reduce_sum( ) - :noindex: - - Returns a ``tf.Tensor`` / - ``torch.Tensor`` that sums the constituent - ``tf.Tensor``\ s/\ ``torch.Tensor``\ s. - - .. function:: StepOutput.concat( ) - :noindex: - - Returns a - ``tf.Tensor``/``torch.Tensor`` that concatenates tensors along the - batch dimension using ``tf.concat`` / ``torch.cat``. - - .. function:: StepOutput.stack( ) - :noindex: - - Applies ``tf.stack`` / ``torch.stack`` - operation to the list of constituent ``tf.Tensor``\ s / - ``torch.Tensor``\ s. - - **TensorFlow-only methods** - - .. function:: StepOutput.merge( ) - :noindex: - - Returns a ``tf.Tensor`` that - concatenates the constituent ``tf.Tensor``\ s along the batch - dimension. This is commonly used for merging the model predictions - across microbatches. - - .. function:: StepOutput.accumulate(method="variable", var=None) - :noindex: - - Functionally the same as ``StepOutput.reduce_mean()``. However, it is - more memory-efficient, especially for large numbers of microbatches, - since it does not wait for all constituent \ ``tf.Tensor``\ s to be - ready to start averaging them, thereby saving memory. - - In some cases (XLA for example) ``StepOutput.reduce_mean()`` might end - up being more memory-efficient than ``StepOutput.accumulate()``. - - **Parameters** - - - ``method`` (``"add_n"`` or ``"accumulate_n"`` or ``"variable"``): - If ``"add_n"`` or ``"accumulate_n"``, the library uses - ``tf.add_n`` and ``tf.accumulate_n``, respectively, to implement - accumulation. If ``"variable"``, the library uses an internal ``tf.Variable`` - into which to accumulate the tensors. Default is \ ``"variable"``. 
- Note: Memory usage behavior of these choices can depend on the model - and implementation. - - - ``var``: A ``tf.Variable`` into which, if provided, the library uses to - accumulate the tensors. If \ ``None``, the library internally creates a - variable. If ``method`` is not ``"variable"``, this argument is - ignored. - -.. _mpi_basics: - :noindex: - -MPI Basics -^^^^^^^^^^ - -The library exposes the following basic MPI primitives to its Python API: - -- ``smp.rank()``: The rank of the current process. -- ``smp.size()``: The total number of processes. -- ``smp.mp_rank()``: The rank of the process among the processes that - hold the current model replica. -- ``smp.dp_rank()``: The rank of the process among the processes that - hold different replicas of the same model partition. -- ``smp.dp_size()``: The total number of model replicas. -- ``smp.local_rank()``: The rank among the processes on the current - instance. -- ``smp.local_size()``: The total number of processes on the current - instance. -- ``smp.get_mp_group()``: The list of ranks over which the current - model replica is partitioned. -- ``smp.get_dp_group()``: The list of ranks that hold different - replicas of the same model partition. - -.. _communication_api: - :noindex: - -Communication API -^^^^^^^^^^^^^^^^^ - -The library provides a few communication primitives which can be helpful while -developing the training script. These primitives use the following -``enum`` s as arguments to specify which processes the communication -should involve. -​ - -**Helper structures** - -.. data:: smp.CommGroup - :noindex: - - An ``enum`` that takes the values - ``CommGroup.WORLD``, ``CommGroup.MP_GROUP``, and ``CommGroup.DP_GROUP``. - These values can also be accessed as ``smp.WORLD``, ``smp.MP_GROUP``, - and ``smp.DP_GROUP`` respectively. - - - ``CommGroup.WORLD``: Represents the entire group of processes used in - training - - ``CommGroup.MP_GROUP``: Represents the group of processes that hold - the same model replica as the current process. The processes in a - single ``MP_GROUP`` collectively store an entire replica of the - model. - - ``CommGroup.DP_GROUP``: Represents the group of processes that hold - the same model partition as the current process. The processes in a - single ``DP_GROUP`` perform data parallelism/allreduce among - themselves. - -.. data:: smp.RankType - :noindex: - - An ``enum`` that takes the values - ``RankType.WORLD_RANK``, ``RankType.MP_RANK``, and ``RankType.DP_RANK``. - - - ``RankType.WORLD_RANK``: The associated rank is to be interpreted as - the rank of the process across all processes used in training. - - ``RankType.MP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``MP_GROUP``. - - ``RankType.DP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``DP_GROUP``. - - -**Communication primitives:** - -.. function:: smp.broadcast(obj, group) - :noindex: - - Sends the object to all processes in the - group. The receiving process must call ``smp.recv_from`` to receive the - sent object. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be broadcast. - - - ``group``: A ``CommGroup`` argument that represents to which group of - processes the object will be sent. - - **Notes** - - - When you use ``broadcast`` on the sender process, there needs - to be an accompanying ``smp.recv_from()`` call on the receiver - processes. 
- - - This is a synchronous call; the ``broadcast`` statement - returns only after all ranks participating in the call have made a - matching ``recv_from`` call. - - **Example** - - .. code:: python - - if smp.rank() == 0: -     smp.broadcast(something, group=smp.CommGroup.WORLD) - else: -     smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK) - -.. function:: smp.send(obj, dest_rank, rank_type) - :noindex: - - Sends the object ``obj`` to - ``dest_rank``, which is of a type specified by ``rank_type``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be sent. - - - ``dest_rank`` (``int``): An integer denoting the rank of the receiving process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``dest_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then ``obj`` is sent to process - with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the current - process. - - **Notes** - - - Note: \ This is a synchronous call; the ``send`` statement returns - only after the destination rank has made a matching - ``recv_from`` call. - -.. function:: smp.recv_from(src_rank, rank_type) - :noindex: - - Receive an object from a peer process. Can be used with a matching - ``smp.send`` or a ``smp.broadcast`` call. - - **Inputs** - - - ``src_rank`` (``int``): An integer denoting rank of the sending process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``src_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then the object is received from - the process with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the - current process. - - **Returns** - - Returns the python object that is sent by the peer process. - - **Notes** - - - Note: This is a synchronous call; the ``recv_from`` statement returns - only after the source rank has made a matching ``send`` or - ``broadcast`` call, and the object is received. - -.. function:: smp.allgather(obj, group) - :noindex: - - A collective call that gathers all the - submitted objects across all ranks in the specified ``group``. Returns a - list whose ``i``\ th index contains the object submitted by the - ``i``\ th rank in ``group``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be - allgathered. - - - ``group`` : A ``CommGroup`` argument that represents which group of - processes participate in ``allgather``. - - **Notes** - - - Note: This is a synchronous call; the ``allgather`` statement returns - only after all ranks participating in the call have made a matching - ``allgather`` call, and all the objects are received at the current - rank. - - **Examples** - - .. code:: python - - # assuming mp_size() == 2 - - if smp.mp_rank() == 0: -     out = smp.allgather(obj1, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - else: -     out = smp.allgather(obj2, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - -.. function:: smp.barrier(group=smp.WORLD) - :noindex: - - A statement that hangs until all - processes in the specified group reach the barrier statement, similar to - ``MPI_Barrier()``. - - **Inputs** - - - ``group``: An ``smp.CommGroup`` ``enum`` that specifies the group of - processes participating in the barrier call. Defaults to - ``smp.WORLD``. - - **Examples** - - - Assume there are 8 processes and 2 model partitions, and - therefore 4 \ ``mp_group``\ s, and 2 ``dp_group``\ s. 
If - the \ ``barrier`` call is passed the value ``smp.MP_GROUP`` for its - group argument, then each process only waits until the other process - of its own ``mp_group`` reaches that point. It does not wait for - processes outside that ``mp_group``. - -.. function:: smp.dp_barrier() - :noindex: - - Same as passing ``smp.DP_GROUP``\ to ``smp.barrier()``. - Waits for the processes in the same \ ``dp_group`` as - the current process to reach the same point in execution. - -.. function:: smp.mp_barrier() - :noindex: - - Same as passing ``smp.MP_GROUP`` to - ``smp.barrier()``. Waits for the processes in the same ``mp_group`` as - the current process to reach the same point in execution. diff --git a/doc/api/training/smp_versions/v1.1.0/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/v1.1.0/smd_model_parallel_pytorch.rst deleted file mode 100644 index 3b822d79e9..0000000000 --- a/doc/api/training/smp_versions/v1.1.0/smd_model_parallel_pytorch.rst +++ /dev/null @@ -1,521 +0,0 @@ -.. admonition:: Contents - - - :ref:`pytorch_saving_loading` - - :ref:`pytorch_saving_loading_instructions` - -PyTorch API -=========== - -**Supported versions: 1.6.0** - -This API document assumes you use the following import statements in your training scripts. - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. tip:: - - Refer to - `Modify a PyTorch Training Script - `_ - to learn how to use the following API in your PyTorch training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of ``torch.nn.Module`` which specifies the model to be - partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is - the model to be partitioned. The returned ``DistributedModel`` object - internally manages model parallelism and data parallelism. Only one - model in the training script can be wrapped with - ``smp.DistributedModel``. - - - **Example:** - - .. code:: python - - model = smp.DistributedModel(model) - - **Important**: The ``__call__`` and  ``backward`` method calls on the - ``smp.DistributedModel`` object (in the following example, the object - is \ ``model``) can only be made inside a ``smp.step``-decorated - function. - - - Since ``DistributedModel``  is a ``torch.nn.Module``, a forward pass can - be performed by calling the \ ``DistributedModel`` object on the input - tensors. - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - For a backward pass, one needs to call the backward function on - the \ ``DistributedModel`` object, with tensors and gradients as - arguments, replacing the PyTorch operations \ ``torch.Tensor.backward`` - or ``torch.autograd.backward``. - - - The API for ``model.backward`` is very similar to - ``torch.autograd.backward``. For example, the following - ``backward`` calls: - - .. code:: python - - torch.autograd.backward(loss) or loss.backward() - - should be replaced with: - - .. code:: python - - model.backward(loss) # loss is a tensor with only one element as its data - - Similarly, for non-scalar tensors, replace the following - ``backward`` call containing incoming gradient arguments: - - .. code:: python - - torch.autograd.backward(outputs, out_grads) - - with the following line: - - .. code:: python - - model.backward(outputs, out_grads) - - In these examples, all ``__call__``  and ``backward`` method calls on - the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside - a ``smp.step``-decorated function. 
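   As a minimal sketch of how these pieces fit together (assuming ``criterion``, ``optimizer``, and ``train_loader`` are placeholder names for objects defined elsewhere in the training script), a training loop might look like the following:

   .. code:: python

      # model has already been wrapped with smp.DistributedModel(model)
      @smp.step()
      def train_step(inputs, labels):
          outputs = model(inputs)              # forward pass; runs once per microbatch
          loss = criterion(outputs, labels)
          model.backward(loss)                 # replaces loss.backward()
          return loss

      for inputs, labels in train_loader:
          optimizer.zero_grad()
          loss_mb = train_step(inputs, labels) # returns a StepOutput
          optimizer.step()                     # apply gradients outside smp.step
          loss = loss_mb.reduce_mean()         # average the loss across microbatches
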
- - **Parameters** - - - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism). - - - ``trace_device`` (``"cpu"`` or ``"gpu"``) (default: ``"gpu"``) - Whether to perform the tracing step on the GPU or CPU. The tracing step gathers - information on the order of execution of modules, the shapes of - intermediate outputs, and execution times, to be used by the - partitioning algorithm. If ``trace_device`` is set to GPU, accurate - module execution times can be gathered during tracing for potentially - improved partitioning decision. However, if the model is too large to - fit in a single GPU, then ``trace_device`` should be set to CPU. - - - ``trace_execution_times`` (``bool``) (default: ``False``): If ``True``, - the library profiles the execution time of each module during tracing, and uses - it in the partitioning decision. This improves the partitioning - decision, but it might make the tracing slower. It may also introduce - some degree of non-determinism in partitioning results, because of the - inherent randomness in module execution times. Must be ``False`` if - ``trace_device`` is ``"cpu"``. - - - ``overlapping_allreduce`` (``bool``) (default: ``True``): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` while launching training). The library uses this flag - to decide whether to do overlapping allreduce whenever a parameter - gradients are ready. This leads to overlapping of communication and - computation and can improve performance. If this is set to ``False`` , - allreduce is performed at the end of the step. - - - ``backward_passes_per_step`` (``int``) (default: 1): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` in config). This parameter indicates the - number of backward passes to perform before calling allreduce on DDP. - This allows accumulating updates over multiple mini-batches before - reducing and applying them. - - - ``average_grads_across_microbatches`` (``bool``) (default: ``True``): - Whether or not the computed gradients should be averaged across - microbatches. If ``False``, the computed gradients will be summed across - microbatches, but not divided by the number of microbatches. In typical - use case where the computed loss is averaged over the mini-batch, this - should be left as ``True``. If you use a loss function that only sums - the per-sample loss across the batch (and not divide by the batch size), - then this must be set to ``False`` for correctness. - - - ``bucket_cap_mb`` (default: 25): \ ``DistributedDataParallel`` buckets - parameters into multiple buckets so that gradient reduction of each - bucket can potentially overlap with backward - computation. \ ``bucket_cap_mb``\ controls the bucket size in MegaBytes - (MB). - - - ``trace_memory_usage`` (default: False): When set to True, the library attempts - to measure memory usage per module during tracing. If this is disabled, - memory usage will be estimated through the sizes of tensors returned from - the module. - - **Properties** - - - ``partitioned``: Is ``True`` if the model is partitioned, ``False`` - otherwise. Initialized to ``False`` when ``DistributedModel`` is first - created. It becomes be ``True`` during the first call - to ``smp.step``-decorated function. Once the model is partitioned, the - local parameters or local ``state_dict`` can be fetched using the - following methods. - - **Methods** - - .. 
function:: backward(tensors, grad_tensors) - :noindex: - - Triggers a distributed backward - pass across model partitions. Example usage provided in the previous - section. The API is very similar - to https://pytorch.org/docs/stable/autograd.html#torch.autograd.backward. - ``retain_grad`` and ``create_graph``  flags are not supported. - - .. function:: local_buffers( ) - :noindex: - - Returns an iterator over buffers for the modules in - the partitioned model that have been assigned to the current process. - - .. function:: local_named_buffers( ) - :noindex: - - Returns an iterator over buffers for the - modules in the partitioned model that have been assigned to the current - process. This yields both the name of the buffer as well as the buffer - itself. - - .. function:: local_parameters( ) - :noindex: - - Returns an iterator over parameters for the - modules in the partitioned model that have been assigned to the current - process. - - .. function:: local_named_parameters( ) - :noindex: - - Returns an iterator over parameters for - the modules in the partitioned model that have been assigned to the - current process. This yields both the name of the parameter as well as - the parameter itself. - - .. function:: local_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. - - .. function:: local_named_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. This - yields both the name of the module as well as the module itself. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains local - parameters that belong to the current \ ``mp_rank``. This ``state_dict`` - contains a key \ ``_smp_is_partial`` to indicate this is a - partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - .. function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains parameters - for the entire model. It first collects the \ ``local_state_dict``  and - gathers and merges the \ ``local_state_dict`` from all ``mp_rank``\ s to - create a full ``state_dict``. - - .. function:: load_state_dict( ) - :noindex: - - Same as the ``torch.module.load_state_dict()`` , - except: It first gathers and merges the ``state_dict``\ s across - ``mp_rank``\ s, if they are partial. The actual loading happens after the - model partition so that each rank knows its local parameters. - - .. function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step`` but before the actual execution of the - first forward pass. Returns a ``RemovableHandle`` object ``handle``, - which can be used to remove the hook by calling ``handle.remove()``. - - .. function:: cpu( ) - :noindex: - - Allgathers parameters and buffers across all ``mp_rank``\ s and moves them - to the CPU. - -.. class:: smp.DistributedOptimizer - :noindex: - - **Parameters** - - ``optimizer`` - - An optimizer wrapper for saving/loading optimizer states. This wrapper - returns ``optimizer`` with the following methods overridden: - - .. 
function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains optimizer state for the entire model. - It first collects the ``local_state_dict`` and gathers and merges - the ``local_state_dict`` from all ``mp_rank``s to create a full - ``state_dict``. Please note that this needs to be called on all ranks with - ``dp_rank()==0`` to ensure the gather happens properly. - If it is only called on all such ranks, it can hang. - - .. function:: load_state_dict( ) - :noindex: - - Same as the ``torch.optimizer.load_state_dict()`` , except: - - - It first gathers and merges the local ``state_dict``\ s if they are - partial. - - The actual loading happens after the model partition so that each - rank knows its local parameters. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains the - local optimizer state that belongs to the current \ ``mp_rank``. This - ``state_dict`` contains a key \ ``_smp_is_partial`` to indicate this is - a partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - ​ -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (int) - The index of the partition. - - A context manager which places all modules defined inside into the - partition with ID ``index``.  The ``index`` argument must be less than - the number of partitions. - - Use ``smp.partition`` to implement manual partitioning. - If ``"auto_partition"`` is ``True``, then the - ``smp.partition`` contexts are ignored. Any module that is not placed in - any ``smp.partition`` context is placed in the - ``default_partition`` defined through the SageMaker Python SDK. - - When ``smp.partition`` contexts are nested, the innermost context - overrides the rest (see the following example). In PyTorch, manual - partitioning should be done inside the module \ ``__init__``, and the - partition assignment applies to the modules that are *created* inside - the ``smp.partition`` context. - - Example: - - .. code:: python - - class Model(torch.nn.Module): -     def __init__(self): -         with smp.partition(1): -             self.child0 = Child0()            # child0 on partition 1 -             with smp.partition(2): -                 self.child1 = Child1()        # child1 on partition 2 -             self.child2 = Child2()            # child2 on partition 1 -         self.child3 = Child3()                # child3 on default_partition - -.. function:: smp.get_world_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of all - processes, which can be used with the ``torch.distributed`` API. - Requires ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_mp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``MP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_dp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``DP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. 
function:: smp.is_initialized( ) - :noindex: - - Returns ``True`` if ``smp.init`` has already been called for the - process, and ``False`` otherwise. - -.. function::smp.is_tracing( ) - :noindex: - - Returns ``True`` if the current process is running the tracing step, and - ``False`` otherwise. - -.. data:: smp.nn.FusedLayerNorm - :noindex: - - `Apex Fused Layer Norm `__ is currently not - supported by the library. ``smp.nn.FusedLayerNorm`` replaces ``apex`` - ``FusedLayerNorm`` and provides the same functionality. This requires - ``apex`` to be installed on the system. - -.. data:: smp.optimizers.FusedNovoGrad - :noindex: - - `Fused Novo Grad optimizer `__ is - currently not supported by the library. ``smp.optimizers.FusedNovoGrad`` replaces ``apex`` ``FusedNovoGrad`` - optimizer and provides the same functionality. This requires ``apex`` to - be installed on the system. - -.. data:: smp.optimizers.FusedLamb - :noindex: - - `FusedLamb optimizer `__ - currently doesn’t work with the library. ``smp.optimizers.FusedLamb`` replaces - ``apex`` ``FusedLamb`` optimizer and provides the same functionality. - This requires ``apex`` to be installed on the system. - -.. data:: smp.amp.GradScaler - :noindex: - - `Torch AMP Gradscaler `__ - currently doesn’t work with the library. ``smp.amp.GradScaler`` replaces - ``torch.amp.GradScaler`` and provides the same functionality. - -.. _pytorch_saving_loading: - :noindex: - -APIs for Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. function:: smp.save( ) - :noindex: - - Saves an object. This operation is similar to ``torch.save()``, except - it has an additional keyword argument, ``partial``, and accepts only - string type for the argument ``f`` (file). If ``partial=True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an ``mp_rank`` - index to your saved file. - - **Parameters** - - - ``obj`` (dict): A saved object. - - ``f`` (str): A string containing a file name. - - ``partial`` (bool, default= ``True``):  When set to ``True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an - ``mp_rank`` index to the saved file. If you want to be able to load - and further train a model that you save with ``smp.save()``, you must - set ``partial=True``. - - ``pickle_module`` (picklemodule, default = module ``"pickle"`` from ``"/opt/conda/lib/python3.6/pickle.py"``): - A module used for pickling metadata and objects. - - ``pickle_protocol``  (int, default=2): Can be specified to - override the defaultprotocol. - -.. function:: smp.load( ) - :noindex: - - Loads an object saved with ``smp.save()`` from a file. - - Similar to, `torch.load() `__, - except it has an additional keyword argument, ``partial``, and accepts - only string type for the argument ``f`` (file). If \ ``partial=True``, - then each ``mp_rank`` loads a separate checkpoint file. - - **Parameters** - - - ``f`` (string): A string containing a file name. - - ``map_location`` (function): A function - `torch.device `__, - a string, or a dict specifying how to remap storage locations. - - ``pickle_module`` (pickle module): A module used for unpickling - metadata and objects (has to match the \ ``pickle_module``\ used to - serialize file). - - ``pickle_load_args`` (Python 3 only): Optional keyword arguments - passed to ``pickle_module.load()`` and ``pickle_module.Unpickler()``. - - ``partial`` (bool, default= ``True``): When set to ``True``, each - ``mp_rank`` loads the checkpoint corresponding to the ``mp_rank``. 
- Should be used when loading a model trained with the library. - -.. _pytorch_saving_loading_instructions: - :noindex: - -General Instruction For Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The library can save partial or full checkpoints. - -- For partial checkpoints, each ``mp_rank`` saves its own checkpoint - file with only the parameters that belong to that rank. -- For full checkpoints, the library saves a single checkpoint that contains - entire model parameters. - -When **saving** using ``smp.save()``, each rank only holds its own -parameters. If you want to save the full model, there will be some -communication between the ranks to create the full model. If you save -checkpoints often, you should save partial checkpoints for best -performance. - -When **loading** using ``smp.load()``, the library can load either partial or | -full checkpoints or full checkpoints saved by a non-model-parallel model. If you -want to resume training with a non-model-parallel model or do inference, you need -a full checkpoint. - -The following is an example of how you can save and load a checkpoint: - -.. code:: python - - # Original model and optimizer - model = MyModel(...) - optimizer = MyOpt(...) - - # model parallel wrapper - model = smp.DistributedModel(model) - optimizer = smp.DistributedOptimizer(optimizer) - - # To save, always save on dp_rank 0 to avoid data racing - if partial: -     # To save the partial model on each mp rank -     # the library will create `checkpoint.pt_{mprank}` for each mp rank -     if save_partial_model: -         if smp.dp_rank() == 0: -             model_dict = model.local_state_dict() # save the partial model -             opt_dict = optimizer.local_state_dict() # save the partial optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 f"/checkpoint.pt", -                 partial=True, -             ) - -     # To save the full model -     if save_full_model: -         if smp.dp_rank() == 0: -             model_dict = model.state_dict() # save the full model -             opt_dict = optimizer.state_dict() # save the full optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 "/checkpoint.pt", -                 partial=False, -             ) - - # To load, load on all ranks. - # The only difference for partial/full loading is the partial flag in smp.load - # Load partial checkpoint - if partial_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=True) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) - # Load full checkpoint - if full_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=False) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) diff --git a/doc/api/training/smp_versions/v1.1.0/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/v1.1.0/smd_model_parallel_tensorflow.rst deleted file mode 100644 index 252c60d16b..0000000000 --- a/doc/api/training/smp_versions/v1.1.0/smd_model_parallel_tensorflow.rst +++ /dev/null @@ -1,164 +0,0 @@ -TensorFlow API -============== - -**Supported version: 2.3.1** - -**Important**: This API document assumes you use the following import statement in your training scripts. - -.. 
code:: python - - import smdistributed.modelparallel.tensorflow as smp - -.. tip:: - - Refer to - `Modify a TensorFlow Training Script - `_ - to learn how to use the following API in your TensorFlow training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of the Keras \ ``Model`` class, which defines the model to - be partitioned. Model definition is done by sub-classing - ``smp.DistributedModel`` class, and implementing the ``call()`` method, - in the same way as the Keras model sub-classing API. Any operation that - is part of the \ ``smp.DistributedModel.call()`` method is subject to - partitioning, meaning that every operation placed inside executes in - exactly one of the devices (the operations outside run on all devices). - - - Similar to the regular Keras API, the forward pass is done by directly - calling the model object on the input tensors. For example: - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - However, ``model()`` calls can only be made inside a - ``smp.step``-decorated function. - - The outputs from a ``smp.DistributedModel`` are available in all ranks, - regardless of which rank computed the last operation. - - **Methods:** - - .. function:: save_model(save_path="/opt/ml/model") - :noindex: - - **Inputs** - - ``save_path`` (``string``): A path to save an unpartitioned model with latest training weights. - - Saves the entire, - unpartitioned model with the latest trained weights to ``save_path`` in - TensorFlow ``SavedModel`` format. Defaults to ``"/opt/ml/model"``, which - SageMaker monitors to upload the model artifacts to Amazon S3. - -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (``int``): The index of the partition. - - A context manager which places all operations defined inside into the - partition whose ID is equal to ``index``. When - ``smp.partition`` contexts are nested, the innermost context overrides - the rest. The ``index`` argument must be smaller than the number of - partitions. - - ``smp.partition`` is used in the manual partitioning API; - if \ ``"auto_partition"`` parameter is set to ``True`` while launching - training, then ``smp.partition`` contexts are ignored. Any operation - that is not placed in any ``smp.partition`` context is placed in the - ``default_partition``, as shown in the following example: - - .. code:: python - - # auto_partition: False - # default_partition: 0 - smp.init() - [...] - x = tf.constant(1.2)                     # placed in partition 0 - with smp.partition(1): -     y = tf.add(x, tf.constant(2.3))      # placed in partition 1 -     with smp.partition(3): -         z = tf.reduce_sum(y)             # placed in partition 3 - - ​ - -.. class:: smp.CheckpointManager - :noindex: - - A subclass of TensorFlow - `CheckpointManager `__, - which is used to manage checkpoints. The usage is similar to TensorFlow - ``CheckpointManager``. - - The following returns a ``CheckpointManager`` object. - - .. code:: python - - smp.CheckpointManager(checkpoint, -                       directory="/opt/ml/checkpoints", -                       max_to_keep=None, -                       checkpoint_name="ckpt") - - - **Important:** ``smp.CheckpointManager.restore()`` must be called after - the first training step. This is because the first call of the - ``smp.step`` function constructs and partitions the model, which must - take place before the checkpoint restore. 
Calling it before the first - ``smp.step`` call might result in hangs or unexpected behavior. - - **Parameters** - - - ``checkpoint``: A `tf.train.Checkpoint - `__ instance - that represents a model checkpoint. - - - ``directory``: (``str``) The path to a directory in which to write - checkpoints. A file named "checkpoint" is also written to this - directory (in a human-readable text format) which contains the state - of the ``CheckpointManager``. Defaults to - ``"/opt/ml/checkpoints"``, which is the directory that SageMaker - monitors for uploading the checkpoints to Amazon S3. - - ``max_to_keep`` (``int``): The number of checkpoints to keep. If - ``None``, all checkpoints are kept. - - ``checkpoint_name`` (``str``): Custom name for the checkpoint file. - Defaults to ``"ckpt"``. - - - **Methods:** - - .. function:: save( ) - :noindex: - - Saves a new checkpoint in the specified directory. Internally uses ``tf.train.CheckpointManager.save()``. - - .. function:: restore( ) - :noindex: - - Restores the latest checkpoint in the specified directory. - Internally uses ``tf.train.CheckpointManager.restore()``. - - - **Examples:** - - .. code:: python - - checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) - ckpt_manager = smp.CheckpointManager(checkpoint, max_to_keep=5)  # use /opt/ml/checkpoints - - for inputs in train_ds: -     loss = train_step(inputs) -     # [...] -     ckpt_manager.save()  # save a new checkpoint in /opt/ml/checkpoints - - .. code:: python - - for step, inputs in enumerate(train_ds): -     if step == 1:                    # NOTE: restore occurs on the second step -         ckpt_manager.restore() -     loss = train_step(inputs) - diff --git a/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_common_api.rst deleted file mode 100644 index 533611ef5e..0000000000 --- a/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_common_api.rst +++ /dev/null @@ -1,487 +0,0 @@ -.. admonition:: Contents - - - :ref:`communication_api` - - :ref:`mpi_basics` - -Common API -========== - -The following SageMaker distribute model parallel APIs are common across all frameworks. - -**Important**: This API document assumes you use the following import statement in your training scripts. - -**TensorFlow** - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -**PyTorch** - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. function:: smp.init( ) - :noindex: - - Initialize the library. Must be called at the beginning of training script. - -.. function:: @smp.step(non_split_inputs, input_split_axes, [*args, **kwargs]) - :noindex: - - A decorator that must be placed over a function that represents a single - forward and backward pass (for training use cases), or a single forward - pass (for evaluation use cases). Any computation that is defined inside - the ``smp.step``-decorated function is executed in a pipelined manner. - - By default, every tensor input to the function is split across its batch - dimension into a number of microbatches specified while launching the - training job. This behavior can be customized through the arguments to - ``smp.step``, described below. The library then orchestrates the execution of - each microbatch across all partitions, based on the chosen pipeline - type. 
- - In a typical use case, forward pass and back-propagation are executed - inside an \ ``smp.step``-decorated function and gradients, loss, and - other relevant metrics (such as accuracy, etc.) are returned from - ``smp.step``-decorated function. - - Any gradient post-processing operation, such as gradient clipping and - allreduce, as well as ``optimizer.apply_gradients`` calls (for TF) or - ``optimizer.step`` (for PT) should be applied on the gradients returned - from the ``smp.step`` function, and not inside the ``smp.step`` - function. This is because every operation inside ``smp.step`` is - executed once per microbatch, so having these operations inside - ``smp.step`` can either be inefficient (in the case of allreduce), or - lead to wrong results (in the case of ``apply_gradients`` / - ``optimizer.step``). - - If the objects returned from the ``smp.step``-decorated function contain - ``tf.Tensor``\ s / ``torch.Tensor``\ s, they are converted to - ``StepOutput`` objects. A ``StepOutput`` object encapsulates all - versions of the tensor across different microbatches - (see ``StepOutput`` entry for more information). - - The argument to ``smp.step`` decorated function should either be a tensor - or an instance of list, tuple, dict or set for it to be split across - microbatches. If your object doesn't fall into this category, you can make - the library split your object, by implementing ``smp_slice`` method. - - Below is an example of how to use it with PyTorch. - - .. code:: python - - class CustomType: - def __init__(self, tensor): - self.data = tensor - - # The library will call this to invoke slicing on the object passing in total microbatches (num_mb) - # and the current microbatch index (mb). - def smp_slice(self, num_mb, mb, axis): - dim_size = list(self.data.size())[axis] - - split_size = dim_size // num_mb - sliced_tensor = self.data.narrow(axis, mb * split_size, split_size) - return CustomType(sliced_tensor, self.other) - - custom_obj = CustomType(torch.ones(4,)) - - @smp.step() - def step(custom_obj): - loss = model(custom_obj) - model.backward(loss) - return loss - - - **Important:** ``smp.step`` splits the batch into microbatches, and - executes everything inside the decorated function once per microbatch. - This might affect the behavior of batch normalization, any operation - that explicitly uses the batch size information, or any other Python - code that is expected to run once. - - **TensorFlow-specific behavior** - - ``smp.step`` is a wrapper that - inherits from and extends the behavior of ``tf.function``, and as such, - all the caveats that apply to the use of ``tf.function``\ s also apply - to ``smp.step``. In particular, any operation that is inside - ``smp.step`` executes in graph mode, and not eager mode. - - In the first call, ``smp.step`` performs tracing of the wrapped function every time - one of the tensor arguments changes their shape or dtype, or for every - new value of a Python argument, if there is one. Tracing is expensive, - so such scenarios should be avoided as much as possible or, - alternatively, an ``input_signature`` argument must be provided. For - more information on the usage of ``tf.function``, refer to the - TensorFlow documentation: - - - https://www.tensorflow.org/api_docs/python/tf/function\ - - https://www.tensorflow.org/guide/function\ - - Each ``smp.step`` decorated function must have a return value that depends on the - output of ``smp.DistributedModel``. 
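   For example, a TensorFlow training step can be sketched as follows; ``model`` is assumed to be an ``smp.DistributedModel``, and ``loss_object`` and ``optimizer`` are placeholder names for objects defined in the training script. The gradients and the loss returned from the ``smp.step``-decorated function depend on the model output and are post-processed outside of it:

   .. code:: python

      @smp.step
      def get_grads(images, labels):
          predictions = model(images, training=True)   # model is a smp.DistributedModel
          loss = loss_object(labels, predictions)
          grads = optimizer.get_gradients(loss, model.trainable_variables)
          return grads, loss

      @tf.function
      def train_step(images, labels):
          gradients, loss = get_grads(images, labels)
          # gradients and loss are StepOutput objects; combine them across microbatches
          gradients = [g.accumulate() for g in gradients]
          optimizer.apply_gradients(zip(gradients, model.trainable_variables))
          return loss.reduce_mean()
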
- - **Common parameters** - - - ``non_split_inputs`` (``list``): The list of arguments to the decorated function - that should not be split along the batch dimension. Should be used - for all input tensors that do not have a batch dimension. Should be a - list of argument names as ``str``, as they appear in the signature of - the ``smp.step``-decorated function. By default it is considered an - empty list. - - - ``input_split_axes`` (``dict``): A dict that maps the argument name to its batch - axis. The keys should be the argument names as ``str``, as they - appear in the signature of the ``smp.step``-decorated function.  By - default all batch axes are assumed to be the 0-axis. - - **TensorFlow-only parameters** - - - All arguments of ``tf.function``. Note: - The \ ``experimental_compile`` argument of ``tf.function`` may not - work as expected with ``smp.step``, since it interferes with - pipelining and model partitioning. To enable XLA with the library, you can - instead use \ ``tf.config.optimizer.set_jit(True)``. - - **PyTorch-only parameters** - - - ``detach_outputs`` (``bool``) : If ``True``, calls ``torch.Tensor.detach()`` on - all returned ``torch.Tensor`` outputs. Setting it to ``False`` - increases memory consumption, unless ``detach()`` is manually called - on the returned tensors, because the model graph is not cleared from - memory after the training step. Set to \ ``True`` by default. - - **Returns** - - - The same object(s) returned from the decorated function. All - returned \ ``tf.Tensor``, \ ``tf.Variable``  objects (for TF) or - ``torch.Tensor`` objects (for PT) are wrapped inside - a \ ``StepOutput`` object, even when they are inside a Python - ``list``, ``tuple``, or ``dict``. - - - -.. class:: StepOutput - :noindex: - - A class that encapsulates all versions of a ``tf.Tensor`` - or \ ``torch.Tensor`` across all microbatches. - - When a particular ``tf.Tensor`` or ``torch.Tensor`` is computed inside - ``smp.step``, different versions of the tensor are computed for each - microbatch. - - When this tensor is returned from ``smp.step`` and is accessed outside - of the decorated function, it appears as a ``StepOutput`` object, which - contains all such versions. For example, - - - In the case of Tensorflow, the gradient for a particular - ``tf.Variable`` is computed on each microbatch individually, and if - this gradient is returned from ``smp.step``, all gradients for this - ``tf.Variable`` become part of the same ``StepOutput`` object. The - ``StepOutput`` class offers the following API for commonly-used - post-processing operations on such tensors. - - In the case of PyTorch, the loss for each microbatch is computed - individually and all the ``torch.Tensor``\ s that represent the loss - for different microbatches become part of same ``StepOutput`` object, - if loss is returned from the ``smp.step`` function. - - - The ``StepOutput`` class offers the following API for commonly-used - post-processing operations on tensors. - - .. data:: StepOutput.outputs - :noindex: - - Returns a list of the underlying tensors, indexed by microbatch. - - .. function:: StepOutput.reduce_mean( ) - :noindex: - - Returns a ``tf.Tensor``, ``torch.Tensor`` that averages the constituent ``tf.Tensor`` s - ``torch.Tensor`` s. This is commonly used for averaging loss and gradients across microbatches. - - .. function:: StepOutput.reduce_sum( ) - :noindex: - - Returns a ``tf.Tensor`` / - ``torch.Tensor`` that sums the constituent - ``tf.Tensor``\ s/\ ``torch.Tensor``\ s. - - .. 
function:: StepOutput.concat( ) - :noindex: - - Returns a - ``tf.Tensor``/``torch.Tensor`` that concatenates tensors along the - batch dimension using ``tf.concat`` / ``torch.cat``. - - .. function:: StepOutput.stack( ) - :noindex: - - Applies ``tf.stack`` / ``torch.stack`` - operation to the list of constituent ``tf.Tensor``\ s / - ``torch.Tensor``\ s. - - **TensorFlow-only methods** - - .. function:: StepOutput.merge( ) - :noindex: - - Returns a ``tf.Tensor`` that - concatenates the constituent ``tf.Tensor``\ s along the batch - dimension. This is commonly used for merging the model predictions - across microbatches. - - .. function:: StepOutput.accumulate(method="variable", var=None) - :noindex: - - Functionally the same as ``StepOutput.reduce_mean()``. However, it is - more memory-efficient, especially for large numbers of microbatches, - since it does not wait for all constituent \ ``tf.Tensor``\ s to be - ready to start averaging them, thereby saving memory. - - In some cases (XLA for example) ``StepOutput.reduce_mean()`` might end - up being more memory-efficient than ``StepOutput.accumulate()``. - - **Parameters** - - - ``method`` (``"add_n"`` or ``"accumulate_n"`` or ``"variable"``): - If ``"add_n"`` or ``"accumulate_n"``, the library uses - ``tf.add_n`` and ``tf.accumulate_n``, respectively, to implement - accumulation. If ``"variable"``, the library uses an internal ``tf.Variable`` - into which to accumulate the tensors. Default is \ ``"variable"``. - Note: Memory usage behavior of these choices can depend on the model - and implementation. - - - ``var``: A ``tf.Variable`` into which, if provided, the library uses to - accumulate the tensors. If \ ``None``, the library internally creates a - variable. If ``method`` is not ``"variable"``, this argument is - ignored. - -.. _mpi_basics: - :noindex: - -MPI Basics -^^^^^^^^^^ - -The library exposes the following basic MPI primitives to its Python API: - -- ``smp.rank()``: The rank of the current process. -- ``smp.size()``: The total number of processes. -- ``smp.mp_rank()``: The rank of the process among the processes that - hold the current model replica. -- ``smp.dp_rank()``: The rank of the process among the processes that - hold different replicas of the same model partition. -- ``smp.dp_size()``: The total number of model replicas. -- ``smp.local_rank()``: The rank among the processes on the current - instance. -- ``smp.local_size()``: The total number of processes on the current - instance. -- ``smp.get_mp_group()``: The list of ranks over which the current - model replica is partitioned. -- ``smp.get_dp_group()``: The list of ranks that hold different - replicas of the same model partition. - -.. _communication_api: - :noindex: - -Communication API -^^^^^^^^^^^^^^^^^ - -The library provides a few communication primitives which can be helpful while -developing the training script. These primitives use the following -``enum`` s as arguments to specify which processes the communication -should involve. -​ - -**Helper structures** - -.. data:: smp.CommGroup - :noindex: - - An ``enum`` that takes the values - ``CommGroup.WORLD``, ``CommGroup.MP_GROUP``, and ``CommGroup.DP_GROUP``. - These values can also be accessed as ``smp.WORLD``, ``smp.MP_GROUP``, - and ``smp.DP_GROUP`` respectively. - - - ``CommGroup.WORLD``: Represents the entire group of processes used in - training - - ``CommGroup.MP_GROUP``: Represents the group of processes that hold - the same model replica as the current process. 
The processes in a - single ``MP_GROUP`` collectively store an entire replica of the - model. - - ``CommGroup.DP_GROUP``: Represents the group of processes that hold - the same model partition as the current process. The processes in a - single ``DP_GROUP`` perform data parallelism/allreduce among - themselves. - -.. data:: smp.RankType - :noindex: - - An ``enum`` that takes the values - ``RankType.WORLD_RANK``, ``RankType.MP_RANK``, and ``RankType.DP_RANK``. - - - ``RankType.WORLD_RANK``: The associated rank is to be interpreted as - the rank of the process across all processes used in training. - - ``RankType.MP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``MP_GROUP``. - - ``RankType.DP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``DP_GROUP``. - - -**Communication primitives:** - -.. function:: smp.broadcast(obj, group) - :noindex: - - Sends the object to all processes in the - group. The receiving process must call ``smp.recv_from`` to receive the - sent object. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be broadcast. - - - ``group``: A ``CommGroup`` argument that represents to which group of - processes the object will be sent. - - **Notes** - - - When you use ``broadcast`` on the sender process, there needs - to be an accompanying ``smp.recv_from()`` call on the receiver - processes. - - - This is a synchronous call; the ``broadcast`` statement - returns only after all ranks participating in the call have made a - matching ``recv_from`` call. - - **Example** - - .. code:: python - - if smp.rank() == 0: -     smp.broadcast(something, group=smp.CommGroup.WORLD) - else: -     smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK) - -.. function:: smp.send(obj, dest_rank, rank_type) - :noindex: - - Sends the object ``obj`` to - ``dest_rank``, which is of a type specified by ``rank_type``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be sent. - - - ``dest_rank`` (``int``): An integer denoting the rank of the receiving process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``dest_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then ``obj`` is sent to process - with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the current - process. - - **Notes** - - - Note: \ This is a synchronous call; the ``send`` statement returns - only after the destination rank has made a matching - ``recv_from`` call. - -.. function:: smp.recv_from(src_rank, rank_type) - :noindex: - - Receive an object from a peer process. Can be used with a matching - ``smp.send`` or a ``smp.broadcast`` call. - - **Inputs** - - - ``src_rank`` (``int``): An integer denoting rank of the sending process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``src_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then the object is received from - the process with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the - current process. - - **Returns** - - Returns the python object that is sent by the peer process. - - **Notes** - - - Note: This is a synchronous call; the ``recv_from`` statement returns - only after the source rank has made a matching ``send`` or - ``broadcast`` call, and the object is received. - -.. 
function:: smp.allgather(obj, group) - :noindex: - - A collective call that gathers all the - submitted objects across all ranks in the specified ``group``. Returns a - list whose ``i``\ th index contains the object submitted by the - ``i``\ th rank in ``group``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be - allgathered. - - - ``group`` : A ``CommGroup`` argument that represents which group of - processes participate in ``allgather``. - - **Notes** - - - Note: This is a synchronous call; the ``allgather`` statement returns - only after all ranks participating in the call have made a matching - ``allgather`` call, and all the objects are received at the current - rank. - - **Examples** - - .. code:: python - - # assuming mp_size() == 2 - - if smp.mp_rank() == 0: -     out = smp.allgather(obj1, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - else: -     out = smp.allgather(obj2, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - -.. function:: smp.barrier(group=smp.WORLD) - :noindex: - - A statement that hangs until all - processes in the specified group reach the barrier statement, similar to - ``MPI_Barrier()``. - - **Inputs** - - - ``group``: An ``smp.CommGroup`` ``enum`` that specifies the group of - processes participating in the barrier call. Defaults to - ``smp.WORLD``. - - **Examples** - - - Assume there are 8 processes and 2 model partitions, and - therefore 4 \ ``mp_group``\ s, and 2 ``dp_group``\ s. If - the \ ``barrier`` call is passed the value ``smp.MP_GROUP`` for its - group argument, then each process only waits until the other process - of its own ``mp_group`` reaches that point. It does not wait for - processes outside that ``mp_group``. - -.. function:: smp.dp_barrier() - :noindex: - - Same as passing ``smp.DP_GROUP``\ to ``smp.barrier()``. - Waits for the processes in the same \ ``dp_group`` as - the current process to reach the same point in execution. - -.. function:: smp.mp_barrier() - :noindex: - - Same as passing ``smp.MP_GROUP`` to - ``smp.barrier()``. Waits for the processes in the same ``mp_group`` as - the current process to reach the same point in execution. diff --git a/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_pytorch.rst deleted file mode 100644 index 7e09d64262..0000000000 --- a/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_pytorch.rst +++ /dev/null @@ -1,553 +0,0 @@ -.. admonition:: Contents - - - :ref:`pytorch_saving_loading` - - :ref:`pytorch_saving_loading_instructions` - -PyTorch API -=========== - -**Supported versions: 1.7.1, 1.6.0** - -This API document assumes you use the following import statements in your training scripts. - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. tip:: - - Refer to - `Modify a PyTorch Training Script - `_ - to learn how to use the following API in your PyTorch training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of ``torch.nn.Module`` which specifies the model to be - partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is - the model to be partitioned. The returned ``DistributedModel`` object - internally manages model parallelism and data parallelism. Only one - model in the training script can be wrapped with - ``smp.DistributedModel``. - - **Example:** - - .. 
code:: python - - model = smp.DistributedModel(model) - - **Important**: The ``__call__`` and  ``backward`` method calls on the - ``smp.DistributedModel`` object (in the following example, the object - is \ ``model``) can only be made inside a ``smp.step``-decorated - function. - - - Since ``DistributedModel``  is a ``torch.nn.Module``, a forward pass can - be performed by calling the \ ``DistributedModel`` object on the input - tensors. - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - For a backward pass, one needs to call the backward function on - the \ ``DistributedModel`` object, with tensors and gradients as - arguments, replacing the PyTorch operations \ ``torch.Tensor.backward`` - or ``torch.autograd.backward``. - - - The API for ``model.backward`` is very similar to - ``torch.autograd.backward``. For example, the following - ``backward`` calls: - - .. code:: python - - torch.autograd.backward(loss) or loss.backward() - - should be replaced with: - - .. code:: python - - model.backward(loss) # loss is a tensor with only one element as its data - - Similarly, for non-scalar tensors, replace the following - ``backward`` call containing incoming gradient arguments: - - .. code:: python - - torch.autograd.backward(outputs, out_grads) - - with the following line: - - .. code:: python - - model.backward(outputs, out_grads) - - In these examples, all ``__call__``  and ``backward`` method calls on - the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside - a ``smp.step``-decorated function. - - **Using DDP** - - If DDP is enabled, do not not place a PyTorch - ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because - the ``DistributedModel`` wrapper will also handle data parallelism. - - Unlike the original DDP wrapper, when you use ``DistributedModel``, - model parameters and buffers are not immediately broadcast across - processes when the wrapper is called. Instead, the broadcast is deferred to the first call of the - ``smp.step``-decorated function when the partition is done. - - **Parameters** - - - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism). - - - ``trace_device`` (``"cpu"`` or ``"gpu"``) (default: ``"gpu"``) - Whether to perform the tracing step on the GPU or CPU. The tracing step gathers - information on the order of execution of modules, the shapes of - intermediate outputs, and execution times, to be used by the - partitioning algorithm. If ``trace_device`` is set to GPU, accurate - module execution times can be gathered during tracing for potentially - improved partitioning decision. However, if the model is too large to - fit in a single GPU, then ``trace_device`` should be set to CPU. - - - ``trace_execution_times`` (``bool``) (default: ``False``): If ``True``, - the library profiles the execution time of each module during tracing, and uses - it in the partitioning decision. This improves the partitioning - decision, but it might make the tracing slower. It may also introduce - some degree of non-determinism in partitioning results, because of the - inherent randomness in module execution times. Must be ``False`` if - ``trace_device`` is ``"cpu"``. - - - ``overlapping_allreduce`` (``bool``) (default: ``True``): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` while launching training). 
The library uses this flag - to decide whether to do overlapping allreduce whenever a parameter - gradients are ready. This leads to overlapping of communication and - computation and can improve performance. If this is set to ``False`` , - allreduce is performed at the end of the step. - - - ``backward_passes_per_step`` (``int``) (default: 1): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` in config). This parameter indicates the - number of backward passes to perform before calling allreduce on DDP. - This allows accumulating updates over multiple mini-batches before - reducing and applying them. - - - ``average_grads_across_microbatches`` (``bool``) (default: ``True``): - Whether or not the computed gradients should be averaged across - microbatches. If ``False``, the computed gradients will be summed across - microbatches, but not divided by the number of microbatches. In typical - use case where the computed loss is averaged over the mini-batch, this - should be left as ``True``. If you use a loss function that only sums - the per-sample loss across the batch (and not divide by the batch size), - then this must be set to ``False`` for correctness. - - - ``bucket_cap_mb`` (default: 25): \ ``DistributedDataParallel`` buckets - parameters into multiple buckets so that gradient reduction of each - bucket can potentially overlap with backward - computation. \ ``bucket_cap_mb``\ controls the bucket size in MegaBytes - (MB). - - - ``trace_memory_usage`` (default: False): When set to True, the library attempts - to measure memory usage per module during tracing. If this is disabled, - memory usage will be estimated through the sizes of tensors returned from - the module. - - - ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``. - This parameter is forwarded to the underlying ``DistributedDataParallel`` wrapper. - Please see: `broadcast_buffer `__. - - - ``gradient_as_bucket_view (PyTorch 1.7.1 only)`` (default: False): To be - used with ``ddp=True``. This parameter is forwarded to the underlying - ``DistributedDataParallel`` wrapper. Please see `gradient_as_bucket_view `__. - - **Properties** - - - ``partitioned``: Is ``True`` if the model is partitioned, ``False`` - otherwise. Initialized to ``False`` when ``DistributedModel`` is first - created. It becomes be ``True`` during the first call - to ``smp.step``-decorated function. Once the model is partitioned, the - local parameters or local ``state_dict`` can be fetched using the - following methods. - - **Methods** - - .. function:: backward(tensors, grad_tensors) - :noindex: - - Triggers a distributed backward - pass across model partitions. Example usage provided in the previous - section. The API is very similar - to https://pytorch.org/docs/stable/autograd.html#torch.autograd.backward. - ``retain_grad`` and ``create_graph``  flags are not supported. - - .. function:: local_buffers( ) - :noindex: - - Returns an iterator over buffers for the modules in - the partitioned model that have been assigned to the current process. - - .. function:: local_named_buffers( ) - :noindex: - - Returns an iterator over buffers for the - modules in the partitioned model that have been assigned to the current - process. This yields both the name of the buffer as well as the buffer - itself. - - .. 
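As a sketch of how the ``partitioned`` property and the local iterators above can be used together, the following assumes that at least one call to the ``smp.step``-decorated function has already been made, so that the model is partitioned; the print format is illustrative only.

.. code:: python

   # Illustrative debugging snippet: once the first smp.step call has
   # triggered partitioning, list the buffers assigned to this process.
   if model.partitioned:
       for name, buf in model.local_named_buffers():
           print(f"mp_rank {smp.mp_rank()}: buffer {name}, shape {tuple(buf.shape)}")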
function:: local_parameters( ) - :noindex: - - Returns an iterator over parameters for the - modules in the partitioned model that have been assigned to the current - process. - - .. function:: local_named_parameters( ) - :noindex: - - Returns an iterator over parameters for - the modules in the partitioned model that have been assigned to the - current process. This yields both the name of the parameter as well as - the parameter itself. - - .. function:: local_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. - - .. function:: local_named_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. This - yields both the name of the module as well as the module itself. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains local - parameters that belong to the current \ ``mp_rank``. This ``state_dict`` - contains a key \ ``_smp_is_partial`` to indicate this is a - partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - .. function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains parameters - for the entire model. It first collects the \ ``local_state_dict``  and - gathers and merges the \ ``local_state_dict`` from all ``mp_rank``\ s to - create a full ``state_dict``. Please note that this needs to be called on all ranks with - ``dp_rank()==0`` to ensure the gather happens properly. - If it is only called on all such ranks, it can hang. - - .. function:: load_state_dict( ) - :noindex: - - Same as the ``torch.module.load_state_dict()`` , - except: It first gathers and merges the ``state_dict``\ s across - ``mp_rank``\ s, if they are partial. The actual loading happens after the - model partition so that each rank knows its local parameters. - - .. function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. Returns a ``RemovableHandle`` object ``handle``, - which can be used to remove the hook by calling ``handle.remove()``. - - .. function:: cpu( ) - :noindex: - - Allgathers parameters and buffers across all ``mp_rank``\ s and moves them - to the CPU. - - .. function:: join( ) - :noindex: - - **Available for PyTorch 1.7.1 only** - - A context manager to be used in conjunction with an instance of - ``smp.DistributedModel`` to be able to train with uneven inputs across - participating processes. This is only supported when ``ddp=True`` for - ``smp.DistributedModel``. This will use the join with the wrapped - ``DistributedDataParallel`` instance. For more information, see: - `join `__ - in the PyTorch documentation. - - -.. class:: smp.DistributedOptimizer - :noindex: - - **Parameters** - - ``optimizer`` - - An optimizer wrapper for saving/loading optimizer states. This wrapper - returns ``optimizer`` with the following methods overridden: - - .. function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains optimizer state for the entire model. 
- It first collects the ``local_state_dict`` and gathers and merges - the ``local_state_dict`` from all ``mp_rank``s to create a full - ``state_dict``. - - .. function:: load_state_dict( ) - :noindex: - - Same as the ``torch.optimizer.load_state_dict()`` , except: - - - It first gathers and merges the local ``state_dict``\ s if they are - partial. - - The actual loading happens after the model partition so that each - rank knows its local parameters. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains the - local optimizer state that belongs to the current \ ``mp_rank``. This - ``state_dict`` contains a key \ ``_smp_is_partial`` to indicate this is - a partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - ​ -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (int) - The index of the partition. - - A context manager which places all modules defined inside into the - partition with ID ``index``.  The ``index`` argument must be less than - the number of partitions. - - Use ``smp.partition`` to implement manual partitioning. - If ``"auto_partition"`` is ``True``, then the - ``smp.partition`` contexts are ignored. Any module that is not placed in - any ``smp.partition`` context is placed in the - ``default_partition`` defined through the SageMaker Python SDK. - - When ``smp.partition`` contexts are nested, the innermost context - overrides the rest (see the following example). In PyTorch, manual - partitioning should be done inside the module \ ``__init__``, and the - partition assignment applies to the modules that are *created* inside - the ``smp.partition`` context. - - Example: - - .. code:: python - - class Model(torch.nn.Module): -     def __init__(self): -         with smp.partition(1): -             self.child0 = Child0()            # child0 on partition 1 -             with smp.partition(2): -                 self.child1 = Child1()        # child1 on partition 2 -             self.child2 = Child2()            # child2 on partition 1 -         self.child3 = Child3()                # child3 on default_partition - -.. function:: smp.get_world_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of all - processes, which can be used with the ``torch.distributed`` API. - Requires ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_mp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``MP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_dp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``DP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.is_initialized( ) - :noindex: - - Returns ``True`` if ``smp.init`` has already been called for the - process, and ``False`` otherwise. - -.. function::smp.is_tracing( ) - :noindex: - - Returns ``True`` if the current process is running the tracing step, and - ``False`` otherwise. - -.. 
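A short sketch of combining the process-group helpers above with the standard ``torch.distributed`` API. It assumes ``"ddp": True`` is set in the SageMaker Python SDK parameters and that ``tensor`` is a floating-point tensor that already exists on the current process; neither is part of the library's API.

.. code:: python

   import torch.distributed as dist

   # Illustrative usage: sum a tensor across the data-parallel group returned
   # by smp.get_dp_process_group(), then divide to obtain the average.
   dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=smp.get_dp_process_group())
   tensor /= smp.dp_size()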
data:: smp.nn.FusedLayerNorm - :noindex: - - `Apex Fused Layer Norm `__ is currently not - supported by the library. ``smp.nn.FusedLayerNorm`` replaces ``apex`` - ``FusedLayerNorm`` and provides the same functionality. This requires - ``apex`` to be installed on the system. - -.. data:: smp.optimizers.FusedNovoGrad - :noindex: - - `Fused Novo Grad optimizer `__ is - currently not supported by the library. ``smp.optimizers.FusedNovoGrad`` replaces ``apex`` ``FusedNovoGrad`` - optimizer and provides the same functionality. This requires ``apex`` to - be installed on the system. - -.. data:: smp.optimizers.FusedLamb - :noindex: - - `FusedLamb optimizer `__ - currently doesn’t work with the library. ``smp.optimizers.FusedLamb`` replaces - ``apex`` ``FusedLamb`` optimizer and provides the same functionality. - This requires ``apex`` to be installed on the system. - -.. data:: smp.amp.GradScaler - :noindex: - - `Torch AMP Gradscaler `__ - currently doesn’t work with the library. ``smp.amp.GradScaler`` replaces - ``torch.amp.GradScaler`` and provides the same functionality. - -.. _pytorch_saving_loading: - :noindex: - -APIs for Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. function:: smp.save( ) - :noindex: - - Saves an object. This operation is similar to ``torch.save()``, except - it has an additional keyword argument, ``partial``, and accepts only - string type for the argument ``f`` (file). If ``partial=True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an ``mp_rank`` - index to your saved file. - - **Parameters** - - - ``obj`` (dict): A saved object. - - ``f`` (str): A string containing a file name. - - ``partial`` (bool, default= ``True``):  When set to ``True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an - ``mp_rank`` index to the saved file. If you want to be able to load - and further train a model that you save with ``smp.save()``, you must - set ``partial=True``. - - ``pickle_module`` (picklemodule, default = module ``"pickle"`` from ``"/opt/conda/lib/python3.6/pickle.py"``): - A module used for pickling metadata and objects. - - ``pickle_protocol``  (int, default=2): Can be specified to - override the defaultprotocol. - -.. function:: smp.load( ) - :noindex: - - Loads an object saved with ``smp.save()`` from a file. - - Similar to, `torch.load() `__, - except it has an additional keyword argument, ``partial``, and accepts - only string type for the argument ``f`` (file). If \ ``partial=True``, - then each ``mp_rank`` loads a separate checkpoint file. - - **Parameters** - - - ``f`` (string): A string containing a file name. - - ``map_location`` (function): A function - `torch.device `__, - a string, or a dict specifying how to remap storage locations. - - ``pickle_module`` (pickle module): A module used for unpickling - metadata and objects (has to match the \ ``pickle_module``\ used to - serialize file). - - ``pickle_load_args`` (Python 3 only): Optional keyword arguments - passed to ``pickle_module.load()`` and ``pickle_module.Unpickler()``. - - ``partial`` (bool, default= ``True``): When set to ``True``, each - ``mp_rank`` loads the checkpoint corresponding to the ``mp_rank``. - Should be used when loading a model trained with the library. - -.. _pytorch_saving_loading_instructions: - :noindex: - -General Instruction For Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The library can save partial or full checkpoints. 
- -- For partial checkpoints, each ``mp_rank`` saves its own checkpoint - file with only the parameters that belong to that rank. -- For full checkpoints, the library saves a single checkpoint that contains - entire model parameters. - -When **saving** using ``smp.save()``, each rank only holds its own -parameters. If you want to save the full model, there will be some -communication between the ranks to create the full model. If you save -checkpoints often, you should save partial checkpoints for best -performance. - -When **loading** using ``smp.load()``, the library can load either partial or | -full checkpoints or full checkpoints saved by a non-model-parallel model. If you -want to resume training with a non-model-parallel model or do inference, you need -a full checkpoint. - -The following is an example of how you can save and load a checkpoint: - -.. code:: python - - # Original model and optimizer - model = MyModel(...) - optimizer = MyOpt(...) - - # model parallel wrapper - model = smp.DistributedModel(model) - optimizer = smp.DistributedOptimizer(optimizer) - - # To save, always save on dp_rank 0 to avoid data racing - if partial: -     # To save the partial model on each mp rank -     # the library will create `checkpoint.pt_{mprank}` for each mp rank -     if save_partial_model: -         if smp.dp_rank() == 0: -             model_dict = model.local_state_dict() # save the partial model -             opt_dict = optimizer.local_state_dict() # save the partial optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 f"/checkpoint.pt", -                 partial=True, -             ) - -     # To save the full model -     if save_full_model: -         if smp.dp_rank() == 0: -             model_dict = model.state_dict() # save the full model -             opt_dict = optimizer.state_dict() # save the full optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 "/checkpoint.pt", -                 partial=False, -             ) - - # To load, load on all ranks. - # The only difference for partial/full loading is the partial flag in smp.load - # Load partial checkpoint - if partial_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=True) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) - # Load full checkpoint - if full_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=False) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) diff --git a/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_tensorflow.rst deleted file mode 100644 index e47d313a4c..0000000000 --- a/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_tensorflow.rst +++ /dev/null @@ -1,164 +0,0 @@ -TensorFlow API -============== - -**Supported version: 2.4.1, 2.3.1** - -**Important**: This API document assumes you use the following import statement in your training scripts. - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -.. tip:: - - Refer to - `Modify a TensorFlow Training Script - `_ - to learn how to use the following API in your TensorFlow training script. - -.. 
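As a quick orientation before the reference entry that follows, the next snippet sketches the sub-classing pattern described below: a model is defined by deriving from ``smp.DistributedModel`` and implementing ``call()``, as in the Keras sub-classing API. The layer choices are illustrative only.

.. code:: python

   import tensorflow as tf

   # Illustrative model definition: every operation inside call() is subject
   # to partitioning across devices.
   class MyModel(smp.DistributedModel):
       def __init__(self):
           super().__init__()
           self.dense1 = tf.keras.layers.Dense(128, activation="relu")
           self.dense2 = tf.keras.layers.Dense(10)

       def call(self, x):
           return self.dense2(self.dense1(x))

   model = MyModel()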
class:: smp.DistributedModel - :noindex: - - A sub-class of the Keras \ ``Model`` class, which defines the model to - be partitioned. Model definition is done by sub-classing - ``smp.DistributedModel`` class, and implementing the ``call()`` method, - in the same way as the Keras model sub-classing API. Any operation that - is part of the \ ``smp.DistributedModel.call()`` method is subject to - partitioning, meaning that every operation placed inside executes in - exactly one of the devices (the operations outside run on all devices). - - - Similar to the regular Keras API, the forward pass is done by directly - calling the model object on the input tensors. For example: - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - However, ``model()`` calls can only be made inside a - ``smp.step``-decorated function. - - The outputs from a ``smp.DistributedModel`` are available in all ranks, - regardless of which rank computed the last operation. - - **Methods:** - - .. function:: save_model(save_path="/opt/ml/model") - :noindex: - - **Inputs** - - ``save_path`` (``string``): A path to save an unpartitioned model with latest training weights. - - Saves the entire, - unpartitioned model with the latest trained weights to ``save_path`` in - TensorFlow ``SavedModel`` format. Defaults to ``"/opt/ml/model"``, which - SageMaker monitors to upload the model artifacts to Amazon S3. - -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (``int``): The index of the partition. - - A context manager which places all operations defined inside into the - partition whose ID is equal to ``index``. When - ``smp.partition`` contexts are nested, the innermost context overrides - the rest. The ``index`` argument must be smaller than the number of - partitions. - - ``smp.partition`` is used in the manual partitioning API; - if \ ``"auto_partition"`` parameter is set to ``True`` while launching - training, then ``smp.partition`` contexts are ignored. Any operation - that is not placed in any ``smp.partition`` context is placed in the - ``default_partition``, as shown in the following example: - - .. code:: python - - # auto_partition: False - # default_partition: 0 - smp.init() - [...] - x = tf.constant(1.2)                     # placed in partition 0 - with smp.partition(1): -     y = tf.add(x, tf.constant(2.3))      # placed in partition 1 -     with smp.partition(3): -         z = tf.reduce_sum(y)             # placed in partition 3 - - ​ - -.. class:: smp.CheckpointManager - :noindex: - - A subclass of TensorFlow - `CheckpointManager `__, - which is used to manage checkpoints. The usage is similar to TensorFlow - ``CheckpointManager``. - - The following returns a ``CheckpointManager`` object. - - .. code:: python - - smp.CheckpointManager(checkpoint, -                       directory="/opt/ml/checkpoints", -                       max_to_keep=None, -                       checkpoint_name="ckpt") - - - **Important:** ``smp.CheckpointManager.restore()`` must be called after - the first training step. This is because the first call of the - ``smp.step`` function constructs and partitions the model, which must - take place before the checkpoint restore. Calling it before the first - ``smp.step`` call might result in hangs or unexpected behavior. - - **Parameters** - - - ``checkpoint``: A `tf.train.Checkpoint - `__ instance - that represents a model checkpoint. - - - ``directory``: (``str``) The path to a directory in which to write - checkpoints. 
A file named "checkpoint" is also written to this - directory (in a human-readable text format) which contains the state - of the ``CheckpointManager``. Defaults to - ``"/opt/ml/checkpoints"``, which is the directory that SageMaker - monitors for uploading the checkpoints to Amazon S3. - - ``max_to_keep`` (``int``): The number of checkpoints to keep. If - ``None``, all checkpoints are kept. - - ``checkpoint_name`` (``str``): Custom name for the checkpoint file. - Defaults to ``"ckpt"``. - - - **Methods:** - - .. function:: save( ) - :noindex: - - Saves a new checkpoint in the specified directory. Internally uses ``tf.train.CheckpointManager.save()``. - - .. function:: restore( ) - :noindex: - - Restores the latest checkpoint in the specified directory. - Internally uses ``tf.train.CheckpointManager.restore()``. - - - **Examples:** - - .. code:: python - - checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) - ckpt_manager = smp.CheckpointManager(checkpoint, max_to_keep=5)  # use /opt/ml/checkpoints - - for inputs in train_ds: -     loss = train_step(inputs) -     # [...] -     ckpt_manager.save()  # save a new checkpoint in /opt/ml/checkpoints - - .. code:: python - - for step, inputs in enumerate(train_ds): -     if step == 1:                    # NOTE: restore occurs on the second step -         ckpt_manager.restore() -     loss = train_step(inputs) - diff --git a/doc/api/training/smp_versions/v1.3.0/add_smd_version.sh b/doc/api/training/smp_versions/v1.3.0/add_smd_version.sh deleted file mode 100755 index 92d99ca43c..0000000000 --- a/doc/api/training/smp_versions/v1.3.0/add_smd_version.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env python -# add_no_index2.py -import fileinput -import sys - -for line in fileinput.input(inplace=True): - if '.. class::' in line or '.. function::' in line or '.. data::' in line or '.. _' in line: - sys.stdout.write(line + ' :noindex:\n') - else: - sys.stdout.write(line) diff --git a/doc/api/training/smp_versions/v1.3.0/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/v1.3.0/smd_model_parallel_common_api.rst deleted file mode 100644 index 625a7fcbf1..0000000000 --- a/doc/api/training/smp_versions/v1.3.0/smd_model_parallel_common_api.rst +++ /dev/null @@ -1,488 +0,0 @@ -.. admonition:: Contents - - - :ref:`communication_api` - - :ref:`mpi_basics` - -Common API -========== - -The following SageMaker distribute model parallel APIs are common across all frameworks. - -**Important**: This API document assumes you use the following import statement in your training scripts. - -**TensorFlow** - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -**PyTorch** - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. function:: smp.init( ) - :noindex: - - Initialize the library. Must be called at the beginning of training script. - -.. function:: @smp.step(non_split_inputs, input_split_axes, [*args, **kwargs]) - :noindex: - - A decorator that must be placed over a function that represents a single - forward and backward pass (for training use cases), or a single forward - pass (for evaluation use cases). Any computation that is defined inside - the ``smp.step``-decorated function is executed in a pipelined manner. - - By default, every tensor input to the function is split across its batch - dimension into a number of microbatches specified while launching the - training job. This behavior can be customized through the arguments to - ``smp.step``, described below. 
The library then orchestrates the execution of - each microbatch across all partitions, based on the chosen pipeline - type. - - In a typical use case, forward pass and back-propagation are executed - inside an \ ``smp.step``-decorated function and gradients, loss, and - other relevant metrics (such as accuracy, etc.) are returned from - ``smp.step``-decorated function. - - Any gradient post-processing operation, such as gradient clipping and - allreduce, as well as ``optimizer.apply_gradients`` calls (for TF) or - ``optimizer.step`` (for PT) should be applied on the gradients returned - from the ``smp.step`` function, and not inside the ``smp.step`` - function. This is because every operation inside ``smp.step`` is - executed once per microbatch, so having these operations inside - ``smp.step`` can either be inefficient (in the case of allreduce), or - lead to wrong results (in the case of ``apply_gradients`` / - ``optimizer.step``). - - If the objects returned from the ``smp.step``-decorated function contain - ``tf.Tensor``\ s / ``torch.Tensor``\ s, they are converted to - ``StepOutput`` objects. A ``StepOutput`` object encapsulates all - versions of the tensor across different microbatches - (see ``StepOutput`` entry for more information). - - The argument to ``smp.step`` decorated function should either be a tensor - or an instance of list, tuple, dict or set for it to be split across - microbatches. If your object doesn't fall into this category, you can make - the library split your object, by implementing ``smp_slice`` method. - - Below is an example of how to use it with PyTorch. - - .. code:: python - - class CustomType: - def __init__(self, tensor): - self.data = tensor - - # The library will call this to invoke slicing on the object passing in total microbatches (num_mb) - # and the current microbatch index (mb). - def smp_slice(self, num_mb, mb, axis): - dim_size = list(self.data.size())[axis] - - split_size = dim_size // num_mb - sliced_tensor = self.data.narrow(axis, mb * split_size, split_size) - return CustomType(sliced_tensor, self.other) - - custom_obj = CustomType(torch.ones(4,)) - - @smp.step() - def step(custom_obj): - loss = model(custom_obj) - model.backward(loss) - return loss - - - **Important:** ``smp.step`` splits the batch into microbatches, and - executes everything inside the decorated function once per microbatch. - This might affect the behavior of batch normalization, any operation - that explicitly uses the batch size information, or any other Python - code that is expected to run once. - - **TensorFlow-specific behavior** - - ``smp.step`` is a wrapper that - inherits from and extends the behavior of ``tf.function``, and as such, - all the caveats that apply to the use of ``tf.function``\ s also apply - to ``smp.step``. In particular, any operation that is inside - ``smp.step`` executes in graph mode, and not eager mode. - - In the first call, ``smp.step`` performs tracing of the wrapped function every time - one of the tensor arguments changes their shape or dtype, or for every - new value of a Python argument, if there is one. Tracing is expensive, - so such scenarios should be avoided as much as possible or, - alternatively, an ``input_signature`` argument must be provided. 
For - more information on the usage of ``tf.function``, refer to the - TensorFlow documentation: - - - https://www.tensorflow.org/api_docs/python/tf/function\ - - https://www.tensorflow.org/guide/function\ - - Each ``smp.step`` decorated function must have a return value that depends on the - output of ``smp.DistributedModel``. - - **Common parameters** - - - ``non_split_inputs`` (``list``): The list of arguments to the decorated function - that should not be split along the batch dimension. Should be used - for all input tensors that do not have a batch dimension. Should be a - list of argument names as ``str``, as they appear in the signature of - the ``smp.step``-decorated function. By default it is considered an - empty list. - - - ``input_split_axes`` (``dict``): A dict that maps the argument name to its batch - axis. The keys should be the argument names as ``str``, as they - appear in the signature of the ``smp.step``-decorated function.  By - default all batch axes are assumed to be the 0-axis. - - **TensorFlow-only parameters** - - - All arguments of ``tf.function``. Note: - The \ ``experimental_compile`` argument of ``tf.function`` may not - work as expected with ``smp.step``, since it interferes with - pipelining and model partitioning. To enable XLA with the library, you can - instead use \ ``tf.config.optimizer.set_jit(True)``. - - **PyTorch-only parameters** - - - ``detach_outputs`` (``bool``) : If ``True``, calls ``torch.Tensor.detach()`` on - all returned ``torch.Tensor`` outputs. Setting it to ``False`` - increases memory consumption, unless ``detach()`` is manually called - on the returned tensors, because the model graph is not cleared from - memory after the training step. Set to \ ``True`` by default. - - **Returns** - - - The same object(s) returned from the decorated function. All - returned \ ``tf.Tensor``, \ ``tf.Variable``  objects (for TF) or - ``torch.Tensor`` objects (for PT) are wrapped inside - a \ ``StepOutput`` object, even when they are inside a Python - ``list``, ``tuple``, or ``dict``. - - - -.. class:: StepOutput - :noindex: - - - A class that encapsulates all versions of a ``tf.Tensor`` - or \ ``torch.Tensor`` across all microbatches. - - When a particular ``tf.Tensor`` or ``torch.Tensor`` is computed inside - ``smp.step``, different versions of the tensor are computed for each - microbatch. - - When this tensor is returned from ``smp.step`` and is accessed outside - of the decorated function, it appears as a ``StepOutput`` object, which - contains all such versions. For example, - - - In the case of Tensorflow, the gradient for a particular - ``tf.Variable`` is computed on each microbatch individually, and if - this gradient is returned from ``smp.step``, all gradients for this - ``tf.Variable`` become part of the same ``StepOutput`` object. The - ``StepOutput`` class offers the following API for commonly-used - post-processing operations on such tensors. - - In the case of PyTorch, the loss for each microbatch is computed - individually and all the ``torch.Tensor``\ s that represent the loss - for different microbatches become part of same ``StepOutput`` object, - if loss is returned from the ``smp.step`` function. - - - The ``StepOutput`` class offers the following API for commonly-used - post-processing operations on tensors. - - .. data:: StepOutput.outputs - :noindex: - - Returns a list of the underlying tensors, indexed by microbatch. - - .. 
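To make the flow concrete, here is a minimal sketch of a PyTorch training step whose returned loss becomes a ``StepOutput`` outside the decorated function. The model, loss function, and manual averaging over ``outputs`` are illustrative; they are not prescribed by the library.

.. code:: python

   import torch
   import torch.nn.functional as F

   @smp.step()
   def train_step(data, target):
       output = model(data)               # model is an smp.DistributedModel
       loss = F.nll_loss(output, target)
       model.backward(loss)
       return loss

   loss_mb = train_step(data, target)     # a StepOutput outside the decorated function
   # loss_mb.outputs is the list of per-microbatch loss tensors
   mean_loss = torch.stack(loss_mb.outputs).mean()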
function:: StepOutput.reduce_mean( ) - :noindex: - - Returns a ``tf.Tensor``, ``torch.Tensor`` that averages the constituent ``tf.Tensor`` s - ``torch.Tensor`` s. This is commonly used for averaging loss and gradients across microbatches. - - .. function:: StepOutput.reduce_sum( ) - :noindex: - - Returns a ``tf.Tensor`` / - ``torch.Tensor`` that sums the constituent - ``tf.Tensor``\ s/\ ``torch.Tensor``\ s. - - .. function:: StepOutput.concat( ) - :noindex: - - Returns a - ``tf.Tensor``/``torch.Tensor`` that concatenates tensors along the - batch dimension using ``tf.concat`` / ``torch.cat``. - - .. function:: StepOutput.stack( ) - :noindex: - - Applies ``tf.stack`` / ``torch.stack`` - operation to the list of constituent ``tf.Tensor``\ s / - ``torch.Tensor``\ s. - - **TensorFlow-only methods** - - .. function:: StepOutput.merge( ) - :noindex: - - Returns a ``tf.Tensor`` that - concatenates the constituent ``tf.Tensor``\ s along the batch - dimension. This is commonly used for merging the model predictions - across microbatches. - - .. function:: StepOutput.accumulate(method="variable", var=None) - :noindex: - - Functionally the same as ``StepOutput.reduce_mean()``. However, it is - more memory-efficient, especially for large numbers of microbatches, - since it does not wait for all constituent \ ``tf.Tensor``\ s to be - ready to start averaging them, thereby saving memory. - - In some cases (XLA for example) ``StepOutput.reduce_mean()`` might end - up being more memory-efficient than ``StepOutput.accumulate()``. - - **Parameters** - - - ``method`` (``"add_n"`` or ``"accumulate_n"`` or ``"variable"``): - If ``"add_n"`` or ``"accumulate_n"``, the library uses - ``tf.add_n`` and ``tf.accumulate_n``, respectively, to implement - accumulation. If ``"variable"``, the library uses an internal ``tf.Variable`` - into which to accumulate the tensors. Default is \ ``"variable"``. - Note: Memory usage behavior of these choices can depend on the model - and implementation. - - - ``var``: A ``tf.Variable`` into which, if provided, the library uses to - accumulate the tensors. If \ ``None``, the library internally creates a - variable. If ``method`` is not ``"variable"``, this argument is - ignored. - -.. _mpi_basics: - :noindex: - -MPI Basics -^^^^^^^^^^ - -The library exposes the following basic MPI primitives to its Python API: - -- ``smp.rank()``: The rank of the current process. -- ``smp.size()``: The total number of processes. -- ``smp.mp_rank()``: The rank of the process among the processes that - hold the current model replica. -- ``smp.dp_rank()``: The rank of the process among the processes that - hold different replicas of the same model partition. -- ``smp.dp_size()``: The total number of model replicas. -- ``smp.local_rank()``: The rank among the processes on the current - instance. -- ``smp.local_size()``: The total number of processes on the current - instance. -- ``smp.get_mp_group()``: The list of ranks over which the current - model replica is partitioned. -- ``smp.get_dp_group()``: The list of ranks that hold different - replicas of the same model partition. - - .. _communication_api: - :noindex: - -Communication API -^^^^^^^^^^^^^^^^^ - -The library provides a few communication primitives which can be helpful while -developing the training script. These primitives use the following -``enum`` s as arguments to specify which processes the communication -should involve. -​ - -**Helper structures** - -.. 
data:: smp.CommGroup - :noindex: - - An ``enum`` that takes the values - ``CommGroup.WORLD``, ``CommGroup.MP_GROUP``, and ``CommGroup.DP_GROUP``. - These values can also be accessed as ``smp.WORLD``, ``smp.MP_GROUP``, - and ``smp.DP_GROUP`` respectively. - - - ``CommGroup.WORLD``: Represents the entire group of processes used in - training - - ``CommGroup.MP_GROUP``: Represents the group of processes that hold - the same model replica as the current process. The processes in a - single ``MP_GROUP`` collectively store an entire replica of the - model. - - ``CommGroup.DP_GROUP``: Represents the group of processes that hold - the same model partition as the current process. The processes in a - single ``DP_GROUP`` perform data parallelism/allreduce among - themselves. - -.. data:: smp.RankType - :noindex: - - An ``enum`` that takes the values - ``RankType.WORLD_RANK``, ``RankType.MP_RANK``, and ``RankType.DP_RANK``. - - - ``RankType.WORLD_RANK``: The associated rank is to be interpreted as - the rank of the process across all processes used in training. - - ``RankType.MP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``MP_GROUP``. - - ``RankType.DP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``DP_GROUP``. - - -**Communication primitives:** - -.. function:: smp.broadcast(obj, group) - :noindex: - - Sends the object to all processes in the - group. The receiving process must call ``smp.recv_from`` to receive the - sent object. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be broadcast. - - - ``group``: A ``CommGroup`` argument that represents to which group of - processes the object will be sent. - - **Notes** - - - When you use ``broadcast`` on the sender process, there needs - to be an accompanying ``smp.recv_from()`` call on the receiver - processes. - - - This is a synchronous call; the ``broadcast`` statement - returns only after all ranks participating in the call have made a - matching ``recv_from`` call. - - **Example** - - .. code:: python - - if smp.rank() == 0: -     smp.broadcast(something, group=smp.CommGroup.WORLD) - else: -     smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK) - -.. function:: smp.send(obj, dest_rank, rank_type) - :noindex: - - Sends the object ``obj`` to - ``dest_rank``, which is of a type specified by ``rank_type``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be sent. - - - ``dest_rank`` (``int``): An integer denoting the rank of the receiving process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``dest_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then ``obj`` is sent to process - with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the current - process. - - **Notes** - - - Note: \ This is a synchronous call; the ``send`` statement returns - only after the destination rank has made a matching - ``recv_from`` call. - -.. function:: smp.recv_from(src_rank, rank_type) - :noindex: - - Receive an object from a peer process. Can be used with a matching - ``smp.send`` or a ``smp.broadcast`` call. - - **Inputs** - - - ``src_rank`` (``int``): An integer denoting rank of the sending process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. 
For example if ``src_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then the object is received from - the process with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the - current process. - - **Returns** - - Returns the python object that is sent by the peer process. - - **Notes** - - - Note: This is a synchronous call; the ``recv_from`` statement returns - only after the source rank has made a matching ``send`` or - ``broadcast`` call, and the object is received. - -.. function:: smp.allgather(obj, group) - :noindex: - - A collective call that gathers all the - submitted objects across all ranks in the specified ``group``. Returns a - list whose ``i``\ th index contains the object submitted by the - ``i``\ th rank in ``group``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be - allgathered. - - - ``group`` : A ``CommGroup`` argument that represents which group of - processes participate in ``allgather``. - - **Notes** - - - Note: This is a synchronous call; the ``allgather`` statement returns - only after all ranks participating in the call have made a matching - ``allgather`` call, and all the objects are received at the current - rank. - - **Examples** - - .. code:: python - - # assuming mp_size() == 2 - - if smp.mp_rank() == 0: -     out = smp.allgather(obj1, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - else: -     out = smp.allgather(obj2, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - -.. function:: smp.barrier(group=smp.WORLD) - :noindex: - - A statement that hangs until all - processes in the specified group reach the barrier statement, similar to - ``MPI_Barrier()``. - - **Inputs** - - - ``group``: An ``smp.CommGroup`` ``enum`` that specifies the group of - processes participating in the barrier call. Defaults to - ``smp.WORLD``. - - **Examples** - - - Assume there are 8 processes and 2 model partitions, and - therefore 4 \ ``mp_group``\ s, and 2 ``dp_group``\ s. If - the \ ``barrier`` call is passed the value ``smp.MP_GROUP`` for its - group argument, then each process only waits until the other process - of its own ``mp_group`` reaches that point. It does not wait for - processes outside that ``mp_group``. - -.. function:: smp.dp_barrier() - :noindex: - - Same as passing ``smp.DP_GROUP``\ to ``smp.barrier()``. - Waits for the processes in the same \ ``dp_group`` as - the current process to reach the same point in execution. - -.. function:: smp.mp_barrier() - :noindex: - - Same as passing ``smp.MP_GROUP`` to - ``smp.barrier()``. Waits for the processes in the same ``mp_group`` as - the current process to reach the same point in execution. diff --git a/doc/api/training/smp_versions/v1.3.0/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/v1.3.0/smd_model_parallel_pytorch.rst deleted file mode 100644 index d2fcb95954..0000000000 --- a/doc/api/training/smp_versions/v1.3.0/smd_model_parallel_pytorch.rst +++ /dev/null @@ -1,572 +0,0 @@ -.. admonition:: Contents - - - :ref:`pytorch_saving_loading` - - :ref:`pytorch_saving_loading_instructions` - -PyTorch API -=========== - -**Supported versions: 1.7.1, 1.8.1** - -This API document assumes you use the following import statements in your training scripts. - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. tip:: - - Refer to - `Modify a PyTorch Training Script - `_ - to learn how to use the following API in your PyTorch training script. - -.. 
class:: smp.DistributedModel - :noindex: - - A sub-class of ``torch.nn.Module`` which specifies the model to be - partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is - the model to be partitioned. The returned ``DistributedModel`` object - internally manages model parallelism and data parallelism. Only one - model in the training script can be wrapped with - ``smp.DistributedModel``. - - **Example:** - - .. code:: python - - model = smp.DistributedModel(model) - - **Important**: The ``__call__`` and  ``backward`` method calls on the - ``smp.DistributedModel`` object (in the following example, the object - is \ ``model``) can only be made inside a ``smp.step``-decorated - function. - - - Since ``DistributedModel``  is a ``torch.nn.Module``, a forward pass can - be performed by calling the \ ``DistributedModel`` object on the input - tensors. - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - For a backward pass, one needs to call the backward function on - the \ ``DistributedModel`` object, with tensors and gradients as - arguments, replacing the PyTorch operations \ ``torch.Tensor.backward`` - or ``torch.autograd.backward``. - - - The API for ``model.backward`` is very similar to - ``torch.autograd.backward``. For example, the following - ``backward`` calls: - - .. code:: python - - torch.autograd.backward(loss) or loss.backward() - - should be replaced with: - - .. code:: python - - model.backward(loss) # loss is a tensor with only one element as its data - - Similarly, for non-scalar tensors, replace the following - ``backward`` call containing incoming gradient arguments: - - .. code:: python - - torch.autograd.backward(outputs, out_grads) - - with the following line: - - .. code:: python - - model.backward(outputs, out_grads) - - In these examples, all ``__call__``  and ``backward`` method calls on - the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside - a ``smp.step``-decorated function. - - **Using DDP** - - If DDP is enabled, do not not place a PyTorch - ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because - the ``DistributedModel`` wrapper will also handle data parallelism. - - Unlike the original DDP wrapper, when you use ``DistributedModel``, - model parameters and buffers are not immediately broadcast across - processes when the wrapper is called. Instead, the broadcast is deferred to the first call of the - ``smp.step``-decorated function when the partition is done. - - **Parameters** - - - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism). - - - ``trace_device`` (``"cpu"`` or ``"gpu"``) (default: ``"gpu"``) - Whether to perform the tracing step on the GPU or CPU. The tracing step gathers - information on the order of execution of modules, the shapes of - intermediate outputs, and execution times, to be used by the - partitioning algorithm. If ``trace_device`` is set to GPU, accurate - module execution times can be gathered during tracing for potentially - improved partitioning decision. However, if the model is too large to - fit in a single GPU, then ``trace_device`` should be set to CPU. - - - ``trace_execution_times`` (``bool``) (default: ``False``): If ``True``, - the library profiles the execution time of each module during tracing, and uses - it in the partitioning decision. This improves the partitioning - decision, but it might make the tracing slower. 
It may also introduce - some degree of non-determinism in partitioning results, because of the - inherent randomness in module execution times. Must be ``False`` if - ``trace_device`` is ``"cpu"``. - - - ``overlapping_allreduce`` (``bool``) (default: ``True``): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` while launching training). The library uses this flag - to decide whether to do overlapping allreduce whenever a parameter - gradients are ready. This leads to overlapping of communication and - computation and can improve performance. If this is set to ``False`` , - allreduce is performed at the end of the step. - - - ``backward_passes_per_step`` (``int``) (default: 1): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` in config). This parameter indicates the - number of backward passes to perform before calling allreduce on DDP. - This allows accumulating updates over multiple mini-batches before - reducing and applying them. - - - ``average_grads_across_microbatches`` (``bool``) (default: ``True``): - Whether or not the computed gradients should be averaged across - microbatches. If ``False``, the computed gradients will be summed across - microbatches, but not divided by the number of microbatches. In typical - use case where the computed loss is averaged over the mini-batch, this - should be left as ``True``. If you use a loss function that only sums - the per-sample loss across the batch (and not divide by the batch size), - then this must be set to ``False`` for correctness. - - - ``bucket_cap_mb`` (default: 25): \ ``DistributedDataParallel`` buckets - parameters into multiple buckets so that gradient reduction of each - bucket can potentially overlap with backward - computation. \ ``bucket_cap_mb``\ controls the bucket size in MegaBytes - (MB). - - - ``trace_memory_usage`` (default: False): When set to True, the library attempts - to measure memory usage per module during tracing. If this is disabled, - memory usage will be estimated through the sizes of tensors returned from - the module. - - - ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``. - This parameter is forwarded to the underlying ``DistributedDataParallel`` wrapper. - Please see: `broadcast_buffer `__. - - - ``gradient_as_bucket_view`` (default: False): To be - used with ``ddp=True``. This parameter is forwarded to the underlying - ``DistributedDataParallel`` wrapper. Please see `gradient_as_bucket_view `__. - - **Properties** - - - ``partitioned``: Is ``True`` if the model is partitioned, ``False`` - otherwise. Initialized to ``False`` when ``DistributedModel`` is first - created. It becomes be ``True`` during the first call - to ``smp.step``-decorated function. Once the model is partitioned, the - local parameters or local ``state_dict`` can be fetched using the - following methods. - - **Methods** - - .. function:: backward(tensors, grad_tensors) - :noindex: - - Triggers a distributed backward - pass across model partitions. Example usage provided in the previous - section. The API is very similar - to https://pytorch.org/docs/stable/autograd.html#torch.autograd.backward. - ``retain_grad`` and ``create_graph``  flags are not supported. - - .. function:: local_buffers( ) - :noindex: - - Returns an iterator over buffers for the modules in - the partitioned model that have been assigned to the current process. - - .. 
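As an illustration of the constructor parameters described above, the following sketch wraps a model while overriding a few of the defaults. The keyword-argument form and the chosen values are assumptions for the example, not recommendations.

.. code:: python

   # Illustrative configuration: trace on CPU for a model too large for a
   # single GPU, and accumulate gradients over two backward passes before
   # the DDP allreduce (requires ddp=True when launching training).
   model = smp.DistributedModel(
       model,
       trace_device="cpu",
       overlapping_allreduce=True,
       backward_passes_per_step=2,
       average_grads_across_microbatches=True,
   )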
function:: local_named_buffers( ) - :noindex: - - Returns an iterator over buffers for the - modules in the partitioned model that have been assigned to the current - process. This yields both the name of the buffer as well as the buffer - itself. - - .. function:: local_parameters( ) - :noindex: - - Returns an iterator over parameters for the - modules in the partitioned model that have been assigned to the current - process. - - .. function:: local_named_parameters( ) - :noindex: - - Returns an iterator over parameters for - the modules in the partitioned model that have been assigned to the - current process. This yields both the name of the parameter as well as - the parameter itself. - - .. function:: local_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. - - .. function:: local_named_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. This - yields both the name of the module as well as the module itself. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains local - parameters that belong to the current \ ``mp_rank``. This ``state_dict`` - contains a key \ ``_smp_is_partial`` to indicate this is a - partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - .. function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains parameters - for the entire model. It first collects the \ ``local_state_dict``  and - gathers and merges the \ ``local_state_dict`` from all ``mp_rank``\ s to - create a full ``state_dict``. Please note that this needs to be called on all ranks with - ``dp_rank()==0`` to ensure the gather happens properly. - If it is only called on all such ranks, it can hang. - - .. function:: load_state_dict( ) - :noindex: - - Same as the ``torch.module.load_state_dict()`` , - except: It first gathers and merges the ``state_dict``\ s across - ``mp_rank``\ s, if they are partial. The actual loading happens after the - model partition so that each rank knows its local parameters. - - .. function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. Returns a ``RemovableHandle`` object ``handle``, - which can be used to remove the hook by calling ``handle.remove()``. - - .. function:: cpu( ) - :noindex: - - Allgathers parameters and buffers across all ``mp_rank``\ s and moves them - to the CPU. - - .. function:: join( ) - :noindex: - - A context manager to be used in conjunction with an instance of - ``smp.DistributedModel`` to be able to train with uneven inputs across - participating processes. This is only supported when ``ddp=True``. This will use the join with the wrapped - ``DistributedDataParallel`` instance. For more information, see: - `join `__ - in the PyTorch documentation. - - .. 
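A sketch of training with uneven inputs using the ``join()`` context manager documented above. It assumes ``ddp=True``, that ``train_step`` is an ``smp.step``-decorated function, and that the context manager is entered directly on the ``DistributedModel`` instance, mirroring the underlying DDP ``join()``.

.. code:: python

   # Illustrative usage: per-rank dataloaders may produce different numbers
   # of batches; join() lets the wrapped DistributedDataParallel instance
   # account for ranks that finish early.
   with model.join():
       for data, target in local_dataloader:
           loss = train_step(data, target)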
.. function:: register_comm_hook( state, callable )
   :noindex:

   **Available for PyTorch 1.8.1 only**

   Registers a communication hook, which provides a flexible ``callable`` through
   which you can specify how gradients are aggregated across multiple workers.
   This method is called on the wrapped ``DistributedDataParallel`` instance.

   Note that when you register a comm hook, you have full control of how the
   gradients are processed. When using only data parallelism with Torch DDP, you
   are expected to average gradients across data parallel replicas within the
   hook. Similarly, when using ``DistributedModel``, you have to average
   gradients across data parallel replicas within the hook. In addition, you
   also have to average gradients across microbatches within the hook, unless
   you explicitly do not want them averaged based on your loss function. See
   ``average_grads_across_microbatches`` for more information about averaging
   gradients across microbatches.

   This is only supported when ``ddp=True`` and ``overlapping_allreduce=True``
   (default). For more information, see:
   `register_comm_hook `__
   in the PyTorch documentation.

.. class:: smp.DistributedOptimizer
   :noindex:

   **Parameters**

   - ``optimizer``

   An optimizer wrapper for saving/loading optimizer states. This wrapper
   returns ``optimizer`` with the following methods overridden:

   .. function:: state_dict( )
      :noindex:

      Returns the ``state_dict`` that contains optimizer state for the entire
      model. It first collects the ``local_state_dict`` and gathers and merges
      the ``local_state_dict`` from all ``mp_rank``\ s to create a full
      ``state_dict``.

   .. function:: load_state_dict( )
      :noindex:

      Same as ``torch.optimizer.load_state_dict()``, except:

      - It first gathers and merges the local ``state_dict``\ s if they are
        partial.
      - The actual loading happens after the model partition so that each
        rank knows its local parameters.

   .. function:: local_state_dict( )
      :noindex:

      Returns the ``state_dict`` that contains the local optimizer state that
      belongs to the current ``mp_rank``. This ``state_dict`` contains a
      key ``_smp_is_partial`` to indicate it is a partial ``state_dict``, which
      indicates whether the ``state_dict`` contains elements corresponding to
      only the current partition, or to the entire model.

.. function:: smp.partition(index)
   :noindex:

   **Inputs**

   - ``index`` (int) - The index of the partition.

   A context manager which places all modules defined inside into the
   partition with ID ``index``. The ``index`` argument must be less than the
   number of partitions.

   Use ``smp.partition`` to implement manual partitioning. If
   ``"auto_partition"`` is ``True``, then the ``smp.partition`` contexts are
   ignored. Any module that is not placed in any ``smp.partition`` context is
   placed in the ``default_partition`` defined through the SageMaker Python SDK.

   When ``smp.partition`` contexts are nested, the innermost context overrides
   the rest (see the following example). In PyTorch, manual partitioning should
   be done inside the module ``__init__``, and the partition assignment applies
   to the modules that are *created* inside the ``smp.partition`` context.

   Example:

   ..
code:: python - - class Model(torch.nn.Module): -     def __init__(self): -         with smp.partition(1): -             self.child0 = Child0()            # child0 on partition 1 -             with smp.partition(2): -                 self.child1 = Child1()        # child1 on partition 2 -             self.child2 = Child2()            # child2 on partition 1 -         self.child3 = Child3()                # child3 on default_partition - -.. function:: smp.get_world_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of all - processes, which can be used with the ``torch.distributed`` API. - Requires ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_mp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``MP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_dp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``DP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.is_initialized( ) - :noindex: - - Returns ``True`` if ``smp.init`` has already been called for the - process, and ``False`` otherwise. - -.. function::smp.is_tracing( ) - :noindex: - - Returns ``True`` if the current process is running the tracing step, and - ``False`` otherwise. - -.. data:: smp.nn.FusedLayerNorm - :noindex: - - `Apex Fused Layer Norm `__ is currently not - supported by the library. ``smp.nn.FusedLayerNorm`` replaces ``apex`` - ``FusedLayerNorm`` and provides the same functionality. This requires - ``apex`` to be installed on the system. - -.. data:: smp.optimizers.FusedNovoGrad - :noindex: - - - `Fused Novo Grad optimizer `__ is - currently not supported by the library. ``smp.optimizers.FusedNovoGrad`` replaces ``apex`` ``FusedNovoGrad`` - optimizer and provides the same functionality. This requires ``apex`` to - be installed on the system. - -.. data:: smp.optimizers.FusedLamb - :noindex: - - - `FusedLamb optimizer `__ - currently doesn’t work with the library. ``smp.optimizers.FusedLamb`` replaces - ``apex`` ``FusedLamb`` optimizer and provides the same functionality. - This requires ``apex`` to be installed on the system. - -.. data:: smp.amp.GradScaler - :noindex: - - `Torch AMP Gradscaler `__ - currently doesn’t work with the library. ``smp.amp.GradScaler`` replaces - ``torch.amp.GradScaler`` and provides the same functionality. - -.. _pytorch_saving_loading: - :noindex: - -APIs for Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. function:: smp.save( ) - :noindex: - - Saves an object. This operation is similar to ``torch.save()``, except - it has an additional keyword argument, ``partial``, and accepts only - string type for the argument ``f`` (file). If ``partial=True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an ``mp_rank`` - index to your saved file. - - **Parameters** - - - ``obj`` (dict): A saved object. - - ``f`` (str): A string containing a file name. - - ``partial`` (bool, default= ``True``):  When set to ``True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an - ``mp_rank`` index to the saved file. 
If you want to be able to load - and further train a model that you save with ``smp.save()``, you must - set ``partial=True``. - - ``pickle_module`` (picklemodule, default = module ``"pickle"`` from ``"/opt/conda/lib/python3.6/pickle.py"``): - A module used for pickling metadata and objects. - - ``pickle_protocol``  (int, default=2): Can be specified to - override the default protocol. - -.. function:: smp.load( ) - :noindex: - - Loads an object saved with ``smp.save()`` from a file. - - Similar to `torch.load() `__, - except it has an additional keyword argument, ``partial``, and accepts - only string type for the argument ``f`` (file). If \ ``partial=True``, - then each ``mp_rank`` loads a separate checkpoint file. - - **Parameters** - - - ``f`` (string): A string containing a file name. - - ``map_location`` (function): A function, - `torch.device `__, - a string, or a dict specifying how to remap storage locations. - - ``pickle_module`` (pickle module): A module used for unpickling - metadata and objects (has to match the \ ``pickle_module``\ used to - serialize the file). - - ``pickle_load_args`` (Python 3 only): Optional keyword arguments - passed to ``pickle_module.load()`` and ``pickle_module.Unpickler()``. - - ``partial`` (bool, default= ``True``): When set to ``True``, each - ``mp_rank`` loads the checkpoint corresponding to the ``mp_rank``. - Should be used when loading a model trained with the library. - -.. _pytorch_saving_loading_instructions: - :noindex: - -General Instruction For Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The library can save partial or full checkpoints. - -- For partial checkpoints, each ``mp_rank`` saves its own checkpoint - file with only the parameters that belong to that rank. -- For full checkpoints, the library saves a single checkpoint that contains - the entire model's parameters. - -When **saving** using ``smp.save()``, each rank only holds its own -parameters. If you want to save the full model, there will be some -communication between the ranks to create the full model. If you save -checkpoints often, you should save partial checkpoints for best -performance. - -When **loading** using ``smp.load()``, the library can load either partial -or full checkpoints, or full checkpoints saved by a non-model-parallel model. If you -want to resume training with a non-model-parallel model or do inference, you need -a full checkpoint. - -The following is an example of how you can save and load a checkpoint: - -.. code:: python - - # Original model and optimizer - model = MyModel(...) - optimizer = MyOpt(...)
- - # model parallel wrapper - model = smp.DistributedModel(model) - optimizer = smp.DistributedOptimizer(optimizer) - - # To save, always save on dp_rank 0 to avoid data racing - if partial: -     # To save the partial model on each mp rank -     # the library will create `checkpoint.pt_{mprank}` for each mp rank -     if save_partial_model: -         if smp.dp_rank() == 0: -             model_dict = model.local_state_dict() # save the partial model -             opt_dict = optimizer.local_state_dict() # save the partial optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 f"/checkpoint.pt", -                 partial=True, -             ) - -     # To save the full model -     if save_full_model: -         if smp.dp_rank() == 0: -             model_dict = model.state_dict() # save the full model -             opt_dict = optimizer.state_dict() # save the full optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 "/checkpoint.pt", -                 partial=False, -             ) - - # To load, load on all ranks. - # The only difference for partial/full loading is the partial flag in smp.load - # Load partial checkpoint - if partial_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=True) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) - # Load full checkpoint - if full_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=False) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) diff --git a/doc/api/training/smp_versions/v1.3.0/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/v1.3.0/smd_model_parallel_tensorflow.rst deleted file mode 100644 index 8dc0b56b1f..0000000000 --- a/doc/api/training/smp_versions/v1.3.0/smd_model_parallel_tensorflow.rst +++ /dev/null @@ -1,172 +0,0 @@ -TensorFlow API -============== - -**Supported version: 2.3.1, 2.4.1** - -**Important**: This API document assumes you use the following import statement in your training scripts. - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -.. tip:: - - Refer to - `Modify a TensorFlow Training Script - `_ - to learn how to use the following API in your TensorFlow training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of the Keras \ ``Model`` class, which defines the model to - be partitioned. Model definition is done by sub-classing - ``smp.DistributedModel`` class, and implementing the ``call()`` method, - in the same way as the Keras model sub-classing API. Any operation that - is part of the \ ``smp.DistributedModel.call()`` method is subject to - partitioning, meaning that every operation placed inside executes in - exactly one of the devices (the operations outside run on all devices). - - - Similar to the regular Keras API, the forward pass is done by directly - calling the model object on the input tensors. For example: - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - However, ``model()`` calls can only be made inside a - ``smp.step``-decorated function. - - The outputs from a ``smp.DistributedModel`` are available in all ranks, - regardless of which rank computed the last operation. - - **Methods:** - - .. 
function:: save_model(save_path="/opt/ml/model") - :noindex: - - **Inputs** - - ``save_path`` (``string``): A path to save an unpartitioned model with latest training weights. - - Saves the entire, - unpartitioned model with the latest trained weights to ``save_path`` in - TensorFlow ``SavedModel`` format. Defaults to ``"/opt/ml/model"``, which - SageMaker monitors to upload the model artifacts to Amazon S3. - -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (``int``): The index of the partition. - - A context manager which places all operations defined inside into the - partition whose ID is equal to ``index``. When - ``smp.partition`` contexts are nested, the innermost context overrides - the rest. The ``index`` argument must be smaller than the number of - partitions. - - ``smp.partition`` is used in the manual partitioning API; - if \ ``"auto_partition"`` parameter is set to ``True`` while launching - training, then ``smp.partition`` contexts are ignored. Any operation - that is not placed in any ``smp.partition`` context is placed in the - ``default_partition``, as shown in the following example: - - .. code:: python - - # auto_partition: False - # default_partition: 0 - smp.init() - [...] - x = tf.constant(1.2)                     # placed in partition 0 - with smp.partition(1): -     y = tf.add(x, tf.constant(2.3))      # placed in partition 1 -     with smp.partition(3): -         z = tf.reduce_sum(y)             # placed in partition 3 - - -.. function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. - - .. code:: python - - @smp.register_post_partition_hook - def test_eager(): - # All statements here will be executed right after partition but before the first forward pass - tf.print("Entered hook through eager context") - -.. class:: smp.CheckpointManager - :noindex: - - - A subclass of TensorFlow - `CheckpointManager `__, - which is used to manage checkpoints. The usage is similar to TensorFlow - ``CheckpointManager``. - - The following returns a ``CheckpointManager`` object. - - .. code:: python - - smp.CheckpointManager(checkpoint, -                       directory="/opt/ml/checkpoints", -                       max_to_keep=None, -                       checkpoint_name="ckpt") - - **Parameters** - - - ``checkpoint``: A `tf.train.Checkpoint - `__ instance - that represents a model checkpoint. - - - ``directory``: (``str``) The path to a directory in which to write - checkpoints. A file named "checkpoint" is also written to this - directory (in a human-readable text format) which contains the state - of the ``CheckpointManager``. Defaults to - ``"/opt/ml/checkpoints"``, which is the directory that SageMaker - monitors for uploading the checkpoints to Amazon S3. - - ``max_to_keep`` (``int``): The number of checkpoints to keep. If - ``None``, all checkpoints are kept. - - ``checkpoint_name`` (``str``): Custom name for the checkpoint file. - Defaults to ``"ckpt"``. - - - **Methods:** - - .. function:: save( ) - :noindex: - - Saves a new checkpoint in the specified directory. Internally uses ``tf.train.CheckpointManager.save()``. - - .. function:: restore( ) - :noindex: - - Restores the latest checkpoint in the specified directory. 
- Internally uses ``tf.train.CheckpointManager.restore()``. - - - **Examples:** - - .. code:: python - - checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) - ckpt_manager = smp.CheckpointManager(checkpoint, max_to_keep=5)  # use /opt/ml/checkpoints - - for inputs in train_ds: -     loss = train_step(inputs) -     # [...] -     ckpt_manager.save()  # save a new checkpoint in /opt/ml/checkpoints - - .. code:: python - - for step, inputs in enumerate(train_ds): -     if step == 0: -         ckpt_manager.restore() -     loss = train_step(inputs) diff --git a/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_common_api.rst deleted file mode 100644 index 625a7fcbf1..0000000000 --- a/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_common_api.rst +++ /dev/null @@ -1,488 +0,0 @@ -.. admonition:: Contents - - - :ref:`communication_api` - - :ref:`mpi_basics` - -Common API -========== - -The following SageMaker distribute model parallel APIs are common across all frameworks. - -**Important**: This API document assumes you use the following import statement in your training scripts. - -**TensorFlow** - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -**PyTorch** - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. function:: smp.init( ) - :noindex: - - Initialize the library. Must be called at the beginning of training script. - -.. function:: @smp.step(non_split_inputs, input_split_axes, [*args, **kwargs]) - :noindex: - - A decorator that must be placed over a function that represents a single - forward and backward pass (for training use cases), or a single forward - pass (for evaluation use cases). Any computation that is defined inside - the ``smp.step``-decorated function is executed in a pipelined manner. - - By default, every tensor input to the function is split across its batch - dimension into a number of microbatches specified while launching the - training job. This behavior can be customized through the arguments to - ``smp.step``, described below. The library then orchestrates the execution of - each microbatch across all partitions, based on the chosen pipeline - type. - - In a typical use case, forward pass and back-propagation are executed - inside an \ ``smp.step``-decorated function and gradients, loss, and - other relevant metrics (such as accuracy, etc.) are returned from - ``smp.step``-decorated function. - - Any gradient post-processing operation, such as gradient clipping and - allreduce, as well as ``optimizer.apply_gradients`` calls (for TF) or - ``optimizer.step`` (for PT) should be applied on the gradients returned - from the ``smp.step`` function, and not inside the ``smp.step`` - function. This is because every operation inside ``smp.step`` is - executed once per microbatch, so having these operations inside - ``smp.step`` can either be inefficient (in the case of allreduce), or - lead to wrong results (in the case of ``apply_gradients`` / - ``optimizer.step``). - - If the objects returned from the ``smp.step``-decorated function contain - ``tf.Tensor``\ s / ``torch.Tensor``\ s, they are converted to - ``StepOutput`` objects. A ``StepOutput`` object encapsulates all - versions of the tensor across different microbatches - (see ``StepOutput`` entry for more information). 
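A minimal sketch of a typical PyTorch training step built around ``smp.step``, assuming ``model`` is an ``smp.DistributedModel``, ``optimizer`` is an ``smp.DistributedOptimizer``, and ``data`` and ``target`` are batched tensors defined elsewhere in the training script; the returned loss is a ``StepOutput`` that is reduced across microbatches outside the decorated function:

.. code:: python

   import torch.nn.functional as F
   import smdistributed.modelparallel.torch as smp

   @smp.step()
   def train_step(model, data, target):
       output = model(data)                 # forward pass, executed once per microbatch
       loss = F.nll_loss(output, target)    # per-microbatch loss
       model.backward(loss)                 # replaces loss.backward()
       return loss

   # Outside smp.step: the return value is a StepOutput; average it across
   # microbatches, then apply the optimizer update on the accumulated gradients.
   loss = train_step(model, data, target).reduce_mean()
   optimizer.step()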
- - An argument to an ``smp.step``-decorated function should be a tensor - or an instance of list, tuple, dict, or set for it to be split across - microbatches. If your object doesn't fall into this category, you can make - the library split your object by implementing the ``smp_slice`` method. - - Below is an example of how to use it with PyTorch. - - .. code:: python - - class CustomType: - def __init__(self, tensor): - self.data = tensor - - # The library will call this to invoke slicing on the object, passing in the total number of microbatches (num_mb) - # and the current microbatch index (mb). - def smp_slice(self, num_mb, mb, axis): - dim_size = list(self.data.size())[axis] - - split_size = dim_size // num_mb - sliced_tensor = self.data.narrow(axis, mb * split_size, split_size) - return CustomType(sliced_tensor) - - custom_obj = CustomType(torch.ones(4,)) - - @smp.step() - def step(custom_obj): - loss = model(custom_obj) - model.backward(loss) - return loss - - - **Important:** ``smp.step`` splits the batch into microbatches, and - executes everything inside the decorated function once per microbatch. - This might affect the behavior of batch normalization, any operation - that explicitly uses the batch size information, or any other Python - code that is expected to run once. - - **TensorFlow-specific behavior** - - ``smp.step`` is a wrapper that - inherits from and extends the behavior of ``tf.function``, and as such, - all the caveats that apply to the use of ``tf.function``\ s also apply - to ``smp.step``. In particular, any operation that is inside - ``smp.step`` executes in graph mode, and not eager mode. - - In the first call, ``smp.step`` performs tracing of the wrapped function every time - one of the tensor arguments changes its shape or dtype, or for every - new value of a Python argument, if there is one. Tracing is expensive, - so such scenarios should be avoided as much as possible or, - alternatively, an ``input_signature`` argument must be provided. For - more information on the usage of ``tf.function``, refer to the - TensorFlow documentation: - - - https://www.tensorflow.org/api_docs/python/tf/function\ - - https://www.tensorflow.org/guide/function\ - - Each ``smp.step`` decorated function must have a return value that depends on the - output of ``smp.DistributedModel``. - - **Common parameters** - - - ``non_split_inputs`` (``list``): The list of arguments to the decorated function - that should not be split along the batch dimension. Should be used - for all input tensors that do not have a batch dimension. Should be a - list of argument names as ``str``, as they appear in the signature of - the ``smp.step``-decorated function. By default it is considered an - empty list. - - - ``input_split_axes`` (``dict``): A dict that maps the argument name to its batch - axis. The keys should be the argument names as ``str``, as they - appear in the signature of the ``smp.step``-decorated function.  By - default all batch axes are assumed to be the 0-axis. - - **TensorFlow-only parameters** - - - All arguments of ``tf.function``. Note: - The \ ``experimental_compile`` argument of ``tf.function`` may not - work as expected with ``smp.step``, since it interferes with - pipelining and model partitioning. To enable XLA with the library, you can - instead use \ ``tf.config.optimizer.set_jit(True)``. - - **PyTorch-only parameters** - - - ``detach_outputs`` (``bool``) : If ``True``, calls ``torch.Tensor.detach()`` on - all returned ``torch.Tensor`` outputs.
Setting it to ``False`` - increases memory consumption, unless ``detach()`` is manually called - on the returned tensors, because the model graph is not cleared from - memory after the training step. Set to \ ``True`` by default. - - **Returns** - - - The same object(s) returned from the decorated function. All - returned \ ``tf.Tensor``, \ ``tf.Variable``  objects (for TF) or - ``torch.Tensor`` objects (for PT) are wrapped inside - a \ ``StepOutput`` object, even when they are inside a Python - ``list``, ``tuple``, or ``dict``. - - - -.. class:: StepOutput - :noindex: - - - A class that encapsulates all versions of a ``tf.Tensor`` - or \ ``torch.Tensor`` across all microbatches. - - When a particular ``tf.Tensor`` or ``torch.Tensor`` is computed inside - ``smp.step``, different versions of the tensor are computed for each - microbatch. - - When this tensor is returned from ``smp.step`` and is accessed outside - of the decorated function, it appears as a ``StepOutput`` object, which - contains all such versions. For example, - - - In the case of Tensorflow, the gradient for a particular - ``tf.Variable`` is computed on each microbatch individually, and if - this gradient is returned from ``smp.step``, all gradients for this - ``tf.Variable`` become part of the same ``StepOutput`` object. The - ``StepOutput`` class offers the following API for commonly-used - post-processing operations on such tensors. - - In the case of PyTorch, the loss for each microbatch is computed - individually and all the ``torch.Tensor``\ s that represent the loss - for different microbatches become part of same ``StepOutput`` object, - if loss is returned from the ``smp.step`` function. - - - The ``StepOutput`` class offers the following API for commonly-used - post-processing operations on tensors. - - .. data:: StepOutput.outputs - :noindex: - - Returns a list of the underlying tensors, indexed by microbatch. - - .. function:: StepOutput.reduce_mean( ) - :noindex: - - Returns a ``tf.Tensor``, ``torch.Tensor`` that averages the constituent ``tf.Tensor`` s - ``torch.Tensor`` s. This is commonly used for averaging loss and gradients across microbatches. - - .. function:: StepOutput.reduce_sum( ) - :noindex: - - Returns a ``tf.Tensor`` / - ``torch.Tensor`` that sums the constituent - ``tf.Tensor``\ s/\ ``torch.Tensor``\ s. - - .. function:: StepOutput.concat( ) - :noindex: - - Returns a - ``tf.Tensor``/``torch.Tensor`` that concatenates tensors along the - batch dimension using ``tf.concat`` / ``torch.cat``. - - .. function:: StepOutput.stack( ) - :noindex: - - Applies ``tf.stack`` / ``torch.stack`` - operation to the list of constituent ``tf.Tensor``\ s / - ``torch.Tensor``\ s. - - **TensorFlow-only methods** - - .. function:: StepOutput.merge( ) - :noindex: - - Returns a ``tf.Tensor`` that - concatenates the constituent ``tf.Tensor``\ s along the batch - dimension. This is commonly used for merging the model predictions - across microbatches. - - .. function:: StepOutput.accumulate(method="variable", var=None) - :noindex: - - Functionally the same as ``StepOutput.reduce_mean()``. However, it is - more memory-efficient, especially for large numbers of microbatches, - since it does not wait for all constituent \ ``tf.Tensor``\ s to be - ready to start averaging them, thereby saving memory. - - In some cases (XLA for example) ``StepOutput.reduce_mean()`` might end - up being more memory-efficient than ``StepOutput.accumulate()``. 
- - **Parameters** - - - ``method`` (``"add_n"`` or ``"accumulate_n"`` or ``"variable"``): - If ``"add_n"`` or ``"accumulate_n"``, the library uses - ``tf.add_n`` and ``tf.accumulate_n``, respectively, to implement - accumulation. If ``"variable"``, the library uses an internal ``tf.Variable`` - into which to accumulate the tensors. Default is \ ``"variable"``. - Note: Memory usage behavior of these choices can depend on the model - and implementation. - - - ``var``: A ``tf.Variable`` into which, if provided, the library uses to - accumulate the tensors. If \ ``None``, the library internally creates a - variable. If ``method`` is not ``"variable"``, this argument is - ignored. - -.. _mpi_basics: - :noindex: - -MPI Basics -^^^^^^^^^^ - -The library exposes the following basic MPI primitives to its Python API: - -- ``smp.rank()``: The rank of the current process. -- ``smp.size()``: The total number of processes. -- ``smp.mp_rank()``: The rank of the process among the processes that - hold the current model replica. -- ``smp.dp_rank()``: The rank of the process among the processes that - hold different replicas of the same model partition. -- ``smp.dp_size()``: The total number of model replicas. -- ``smp.local_rank()``: The rank among the processes on the current - instance. -- ``smp.local_size()``: The total number of processes on the current - instance. -- ``smp.get_mp_group()``: The list of ranks over which the current - model replica is partitioned. -- ``smp.get_dp_group()``: The list of ranks that hold different - replicas of the same model partition. - - .. _communication_api: - :noindex: - -Communication API -^^^^^^^^^^^^^^^^^ - -The library provides a few communication primitives which can be helpful while -developing the training script. These primitives use the following -``enum`` s as arguments to specify which processes the communication -should involve. -​ - -**Helper structures** - -.. data:: smp.CommGroup - :noindex: - - An ``enum`` that takes the values - ``CommGroup.WORLD``, ``CommGroup.MP_GROUP``, and ``CommGroup.DP_GROUP``. - These values can also be accessed as ``smp.WORLD``, ``smp.MP_GROUP``, - and ``smp.DP_GROUP`` respectively. - - - ``CommGroup.WORLD``: Represents the entire group of processes used in - training - - ``CommGroup.MP_GROUP``: Represents the group of processes that hold - the same model replica as the current process. The processes in a - single ``MP_GROUP`` collectively store an entire replica of the - model. - - ``CommGroup.DP_GROUP``: Represents the group of processes that hold - the same model partition as the current process. The processes in a - single ``DP_GROUP`` perform data parallelism/allreduce among - themselves. - -.. data:: smp.RankType - :noindex: - - An ``enum`` that takes the values - ``RankType.WORLD_RANK``, ``RankType.MP_RANK``, and ``RankType.DP_RANK``. - - - ``RankType.WORLD_RANK``: The associated rank is to be interpreted as - the rank of the process across all processes used in training. - - ``RankType.MP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``MP_GROUP``. - - ``RankType.DP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``DP_GROUP``. - - -**Communication primitives:** - -.. function:: smp.broadcast(obj, group) - :noindex: - - Sends the object to all processes in the - group. The receiving process must call ``smp.recv_from`` to receive the - sent object. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be broadcast. 
- - - ``group``: A ``CommGroup`` argument that represents to which group of - processes the object will be sent. - - **Notes** - - - When you use ``broadcast`` on the sender process, there needs - to be an accompanying ``smp.recv_from()`` call on the receiver - processes. - - - This is a synchronous call; the ``broadcast`` statement - returns only after all ranks participating in the call have made a - matching ``recv_from`` call. - - **Example** - - .. code:: python - - if smp.rank() == 0: -     smp.broadcast(something, group=smp.CommGroup.WORLD) - else: -     smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK) - -.. function:: smp.send(obj, dest_rank, rank_type) - :noindex: - - Sends the object ``obj`` to - ``dest_rank``, which is of a type specified by ``rank_type``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be sent. - - - ``dest_rank`` (``int``): An integer denoting the rank of the receiving process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``dest_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then ``obj`` is sent to process - with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the current - process. - - **Notes** - - - Note: \ This is a synchronous call; the ``send`` statement returns - only after the destination rank has made a matching - ``recv_from`` call. - -.. function:: smp.recv_from(src_rank, rank_type) - :noindex: - - Receive an object from a peer process. Can be used with a matching - ``smp.send`` or a ``smp.broadcast`` call. - - **Inputs** - - - ``src_rank`` (``int``): An integer denoting rank of the sending process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``src_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then the object is received from - the process with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the - current process. - - **Returns** - - Returns the python object that is sent by the peer process. - - **Notes** - - - Note: This is a synchronous call; the ``recv_from`` statement returns - only after the source rank has made a matching ``send`` or - ``broadcast`` call, and the object is received. - -.. function:: smp.allgather(obj, group) - :noindex: - - A collective call that gathers all the - submitted objects across all ranks in the specified ``group``. Returns a - list whose ``i``\ th index contains the object submitted by the - ``i``\ th rank in ``group``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be - allgathered. - - - ``group`` : A ``CommGroup`` argument that represents which group of - processes participate in ``allgather``. - - **Notes** - - - Note: This is a synchronous call; the ``allgather`` statement returns - only after all ranks participating in the call have made a matching - ``allgather`` call, and all the objects are received at the current - rank. - - **Examples** - - .. code:: python - - # assuming mp_size() == 2 - - if smp.mp_rank() == 0: -     out = smp.allgather(obj1, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - else: -     out = smp.allgather(obj2, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - -.. function:: smp.barrier(group=smp.WORLD) - :noindex: - - A statement that hangs until all - processes in the specified group reach the barrier statement, similar to - ``MPI_Barrier()``. 
- - **Inputs** - - - ``group``: An ``smp.CommGroup`` ``enum`` that specifies the group of - processes participating in the barrier call. Defaults to - ``smp.WORLD``. - - **Examples** - - - Assume there are 8 processes and 2 model partitions, and - therefore 4 \ ``mp_group``\ s, and 2 ``dp_group``\ s. If - the \ ``barrier`` call is passed the value ``smp.MP_GROUP`` for its - group argument, then each process only waits until the other process - of its own ``mp_group`` reaches that point. It does not wait for - processes outside that ``mp_group``. - -.. function:: smp.dp_barrier() - :noindex: - - Same as passing ``smp.DP_GROUP``\ to ``smp.barrier()``. - Waits for the processes in the same \ ``dp_group`` as - the current process to reach the same point in execution. - -.. function:: smp.mp_barrier() - :noindex: - - Same as passing ``smp.MP_GROUP`` to - ``smp.barrier()``. Waits for the processes in the same ``mp_group`` as - the current process to reach the same point in execution. diff --git a/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_pytorch.rst deleted file mode 100644 index d2fcb95954..0000000000 --- a/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_pytorch.rst +++ /dev/null @@ -1,572 +0,0 @@ -.. admonition:: Contents - - - :ref:`pytorch_saving_loading` - - :ref:`pytorch_saving_loading_instructions` - -PyTorch API -=========== - -**Supported versions: 1.7.1, 1.8.1** - -This API document assumes you use the following import statements in your training scripts. - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. tip:: - - Refer to - `Modify a PyTorch Training Script - `_ - to learn how to use the following API in your PyTorch training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of ``torch.nn.Module`` which specifies the model to be - partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is - the model to be partitioned. The returned ``DistributedModel`` object - internally manages model parallelism and data parallelism. Only one - model in the training script can be wrapped with - ``smp.DistributedModel``. - - **Example:** - - .. code:: python - - model = smp.DistributedModel(model) - - **Important**: The ``__call__`` and  ``backward`` method calls on the - ``smp.DistributedModel`` object (in the following example, the object - is \ ``model``) can only be made inside a ``smp.step``-decorated - function. - - - Since ``DistributedModel``  is a ``torch.nn.Module``, a forward pass can - be performed by calling the \ ``DistributedModel`` object on the input - tensors. - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - For a backward pass, one needs to call the backward function on - the \ ``DistributedModel`` object, with tensors and gradients as - arguments, replacing the PyTorch operations \ ``torch.Tensor.backward`` - or ``torch.autograd.backward``. - - - The API for ``model.backward`` is very similar to - ``torch.autograd.backward``. For example, the following - ``backward`` calls: - - .. code:: python - - torch.autograd.backward(loss) or loss.backward() - - should be replaced with: - - .. code:: python - - model.backward(loss) # loss is a tensor with only one element as its data - - Similarly, for non-scalar tensors, replace the following - ``backward`` call containing incoming gradient arguments: - - .. 
code:: python - - torch.autograd.backward(outputs, out_grads) - - with the following line: - - .. code:: python - - model.backward(outputs, out_grads) - - In these examples, all ``__call__`` and ``backward`` method calls on - the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside - a ``smp.step``-decorated function. - - **Using DDP** - - If DDP is enabled, do not place a PyTorch - ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because - the ``DistributedModel`` wrapper will also handle data parallelism. - - Unlike the original DDP wrapper, when you use ``DistributedModel``, - model parameters and buffers are not immediately broadcast across - processes when the wrapper is called. Instead, the broadcast is deferred to the first call of the - ``smp.step``-decorated function when the partition is done. - - **Parameters** - - - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism). - - - ``trace_device`` (``"cpu"`` or ``"gpu"``) (default: ``"gpu"``) - Whether to perform the tracing step on the GPU or CPU. The tracing step gathers - information on the order of execution of modules, the shapes of - intermediate outputs, and execution times, to be used by the - partitioning algorithm. If ``trace_device`` is set to GPU, accurate - module execution times can be gathered during tracing for a potentially - improved partitioning decision. However, if the model is too large to - fit in a single GPU, then ``trace_device`` should be set to CPU. - - - ``trace_execution_times`` (``bool``) (default: ``False``): If ``True``, - the library profiles the execution time of each module during tracing, and uses - it in the partitioning decision. This improves the partitioning - decision, but it might make the tracing slower. It may also introduce - some degree of non-determinism in partitioning results, because of the - inherent randomness in module execution times. Must be ``False`` if - ``trace_device`` is ``"cpu"``. - - - ``overlapping_allreduce`` (``bool``) (default: ``True``): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` while launching training). The library uses this flag - to decide whether to do overlapping allreduce whenever a parameter's - gradients are ready. This leads to overlapping of communication and - computation and can improve performance. If this is set to ``False``, - allreduce is performed at the end of the step. - - - ``backward_passes_per_step`` (``int``) (default: 1): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` in config). This parameter indicates the - number of backward passes to perform before calling allreduce on DDP. - This allows accumulating updates over multiple mini-batches before - reducing and applying them. - - - ``average_grads_across_microbatches`` (``bool``) (default: ``True``): - Whether or not the computed gradients should be averaged across - microbatches. If ``False``, the computed gradients will be summed across - microbatches, but not divided by the number of microbatches. In the typical - use case where the computed loss is averaged over the mini-batch, this - should be left as ``True``. If you use a loss function that only sums - the per-sample loss across the batch (and does not divide by the batch size), - then this must be set to ``False`` for correctness.
- - - ``bucket_cap_mb`` (default: 25): \ ``DistributedDataParallel`` buckets - parameters into multiple buckets so that gradient reduction of each - bucket can potentially overlap with backward - computation. ``bucket_cap_mb`` controls the bucket size in MegaBytes - (MB). - - - ``trace_memory_usage`` (default: False): When set to True, the library attempts - to measure memory usage per module during tracing. If this is disabled, - memory usage will be estimated through the sizes of tensors returned from - the module. - - - ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``. - This parameter is forwarded to the underlying ``DistributedDataParallel`` wrapper. - Please see: `broadcast_buffer `__. - - - ``gradient_as_bucket_view`` (default: False): To be - used with ``ddp=True``. This parameter is forwarded to the underlying - ``DistributedDataParallel`` wrapper. Please see `gradient_as_bucket_view `__. - - **Properties** - - - ``partitioned``: Is ``True`` if the model is partitioned, ``False`` - otherwise. Initialized to ``False`` when ``DistributedModel`` is first - created. It becomes ``True`` during the first call - to the ``smp.step``-decorated function. Once the model is partitioned, the - local parameters or local ``state_dict`` can be fetched using the - following methods. - - **Methods** - - .. function:: backward(tensors, grad_tensors) - :noindex: - - Triggers a distributed backward - pass across model partitions. Example usage provided in the previous - section. The API is very similar - to https://pytorch.org/docs/stable/autograd.html#torch.autograd.backward. - ``retain_grad`` and ``create_graph`` flags are not supported. - - .. function:: local_buffers( ) - :noindex: - - Returns an iterator over buffers for the modules in - the partitioned model that have been assigned to the current process. - - .. function:: local_named_buffers( ) - :noindex: - - Returns an iterator over buffers for the - modules in the partitioned model that have been assigned to the current - process. This yields both the name of the buffer as well as the buffer - itself. - - .. function:: local_parameters( ) - :noindex: - - Returns an iterator over parameters for the - modules in the partitioned model that have been assigned to the current - process. - - .. function:: local_named_parameters( ) - :noindex: - - Returns an iterator over parameters for - the modules in the partitioned model that have been assigned to the - current process. This yields both the name of the parameter as well as - the parameter itself. - - .. function:: local_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. - - .. function:: local_named_modules( ) - :noindex: - - Returns an iterator over the modules in the - partitioned model that have been assigned to the current process. This - yields both the name of the module as well as the module itself. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains local - parameters that belong to the current \ ``mp_rank``. This ``state_dict`` - contains a key \ ``_smp_is_partial`` to indicate that this is a - partial \ ``state_dict``, that is, one that contains elements corresponding to only the current - partition rather than the entire model. - - .. function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains parameters - for the entire model.
It first collects the \ ``local_state_dict`` and - gathers and merges the \ ``local_state_dict`` from all ``mp_rank``\ s to - create a full ``state_dict``. Please note that this needs to be called on all ranks with - ``dp_rank()==0`` to ensure the gather happens properly. - If it is called on only a subset of those ranks, it can hang. - - .. function:: load_state_dict( ) - :noindex: - - Same as ``torch.module.load_state_dict()``, - except: It first gathers and merges the ``state_dict``\ s across - ``mp_rank``\ s, if they are partial. The actual loading happens after the - model partition so that each rank knows its local parameters. - - .. function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. Returns a ``RemovableHandle`` object ``handle``, - which can be used to remove the hook by calling ``handle.remove()``. - - .. function:: cpu( ) - :noindex: - - Allgathers parameters and buffers across all ``mp_rank``\ s and moves them - to the CPU. - - .. function:: join( ) - :noindex: - - A context manager to be used in conjunction with an instance of - ``smp.DistributedModel`` to be able to train with uneven inputs across - participating processes. This is only supported when ``ddp=True``, and uses ``join`` on the wrapped - ``DistributedDataParallel`` instance. For more information, see: - `join `__ - in the PyTorch documentation. - - .. function:: register_comm_hook( state, callable ) - :noindex: - - **Available for PyTorch 1.8.1 only** - Registers a communication hook, a flexible ``callable`` through which - you can specify how gradients are aggregated across multiple workers. - This method is called on the wrapped ``DistributedDataParallel`` instance. - - Please note that when you register a comm hook you have full control of how the gradients are processed. - When using only data parallelism with Torch DDP, you are expected to average gradients across data parallel replicas within the hook. - Similarly, when using ``DistributedModel`` you have to average gradients across data parallel replicas within the hook. - In addition, you also have to average gradients across microbatches within the hook, unless, based on your loss function, you explicitly do not want to average. - See ``average_grads_across_microbatches`` for more information about averaging gradients across microbatches. - - This is only supported when ``ddp=True`` and ``overlapping_allreduce=True`` (default). - For more information, see: - `register_comm_hook `__ - in the PyTorch documentation. - - - -.. class:: smp.DistributedOptimizer - :noindex: - - **Parameters** - - ``optimizer`` - - An optimizer wrapper for saving/loading optimizer states. This wrapper - returns ``optimizer`` with the following methods overridden: - - .. function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains optimizer state for the entire model. - It first collects the ``local_state_dict`` and gathers and merges - the ``local_state_dict`` from all ``mp_rank``\ s to create a full - ``state_dict``. - - .. function:: load_state_dict( ) - :noindex: - - Same as ``torch.optimizer.load_state_dict()``, except: - - - It first gathers and merges the local ``state_dict``\ s if they are - partial.
- - The actual loading happens after the model partition so that each - rank knows its local parameters. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains the - local optimizer state that belongs to the current \ ``mp_rank``. This - ``state_dict`` contains a key \ ``_smp_is_partial`` to indicate this is - a partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - ​ -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (int) - The index of the partition. - - A context manager which places all modules defined inside into the - partition with ID ``index``.  The ``index`` argument must be less than - the number of partitions. - - Use ``smp.partition`` to implement manual partitioning. - If ``"auto_partition"`` is ``True``, then the - ``smp.partition`` contexts are ignored. Any module that is not placed in - any ``smp.partition`` context is placed in the - ``default_partition`` defined through the SageMaker Python SDK. - - When ``smp.partition`` contexts are nested, the innermost context - overrides the rest (see the following example). In PyTorch, manual - partitioning should be done inside the module \ ``__init__``, and the - partition assignment applies to the modules that are *created* inside - the ``smp.partition`` context. - - Example: - - .. code:: python - - class Model(torch.nn.Module): -     def __init__(self): -         with smp.partition(1): -             self.child0 = Child0()            # child0 on partition 1 -             with smp.partition(2): -                 self.child1 = Child1()        # child1 on partition 2 -             self.child2 = Child2()            # child2 on partition 1 -         self.child3 = Child3()                # child3 on default_partition - -.. function:: smp.get_world_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of all - processes, which can be used with the ``torch.distributed`` API. - Requires ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_mp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``MP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_dp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``DP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.is_initialized( ) - :noindex: - - Returns ``True`` if ``smp.init`` has already been called for the - process, and ``False`` otherwise. - -.. function::smp.is_tracing( ) - :noindex: - - Returns ``True`` if the current process is running the tracing step, and - ``False`` otherwise. - -.. data:: smp.nn.FusedLayerNorm - :noindex: - - `Apex Fused Layer Norm `__ is currently not - supported by the library. ``smp.nn.FusedLayerNorm`` replaces ``apex`` - ``FusedLayerNorm`` and provides the same functionality. This requires - ``apex`` to be installed on the system. - -.. data:: smp.optimizers.FusedNovoGrad - :noindex: - - - `Fused Novo Grad optimizer `__ is - currently not supported by the library. 
``smp.optimizers.FusedNovoGrad`` replaces ``apex`` ``FusedNovoGrad`` - optimizer and provides the same functionality. This requires ``apex`` to - be installed on the system. - -.. data:: smp.optimizers.FusedLamb - :noindex: - - - `FusedLamb optimizer `__ - currently doesn’t work with the library. ``smp.optimizers.FusedLamb`` replaces - ``apex`` ``FusedLamb`` optimizer and provides the same functionality. - This requires ``apex`` to be installed on the system. - -.. data:: smp.amp.GradScaler - :noindex: - - `Torch AMP Gradscaler `__ - currently doesn’t work with the library. ``smp.amp.GradScaler`` replaces - ``torch.amp.GradScaler`` and provides the same functionality. - -.. _pytorch_saving_loading: - :noindex: - -APIs for Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. function:: smp.save( ) - :noindex: - - Saves an object. This operation is similar to ``torch.save()``, except - it has an additional keyword argument, ``partial``, and accepts only - string type for the argument ``f`` (file). If ``partial=True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an ``mp_rank`` - index to your saved file. - - **Parameters** - - - ``obj`` (dict): A saved object. - - ``f`` (str): A string containing a file name. - - ``partial`` (bool, default= ``True``):  When set to ``True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an - ``mp_rank`` index to the saved file. If you want to be able to load - and further train a model that you save with ``smp.save()``, you must - set ``partial=True``. - - ``pickle_module`` (picklemodule, default = module ``"pickle"`` from ``"/opt/conda/lib/python3.6/pickle.py"``): - A module used for pickling metadata and objects. - - ``pickle_protocol``  (int, default=2): Can be specified to - override the defaultprotocol. - -.. function:: smp.load( ) - :noindex: - - Loads an object saved with ``smp.save()`` from a file. - - Similar to, `torch.load() `__, - except it has an additional keyword argument, ``partial``, and accepts - only string type for the argument ``f`` (file). If \ ``partial=True``, - then each ``mp_rank`` loads a separate checkpoint file. - - **Parameters** - - - ``f`` (string): A string containing a file name. - - ``map_location`` (function): A function - `torch.device `__, - a string, or a dict specifying how to remap storage locations. - - ``pickle_module`` (pickle module): A module used for unpickling - metadata and objects (has to match the \ ``pickle_module``\ used to - serialize file). - - ``pickle_load_args`` (Python 3 only): Optional keyword arguments - passed to ``pickle_module.load()`` and ``pickle_module.Unpickler()``. - - ``partial`` (bool, default= ``True``): When set to ``True``, each - ``mp_rank`` loads the checkpoint corresponding to the ``mp_rank``. - Should be used when loading a model trained with the library. - -.. _pytorch_saving_loading_instructions: - :noindex: - -General Instruction For Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The library can save partial or full checkpoints. - -- For partial checkpoints, each ``mp_rank`` saves its own checkpoint - file with only the parameters that belong to that rank. -- For full checkpoints, the library saves a single checkpoint that contains - entire model parameters. - -When **saving** using ``smp.save()``, each rank only holds its own -parameters. If you want to save the full model, there will be some -communication between the ranks to create the full model. 
If you save -checkpoints often, you should save partial checkpoints for best -performance. - -When **loading** using ``smp.load()``, the library can load either partial or | -full checkpoints or full checkpoints saved by a non-model-parallel model. If you -want to resume training with a non-model-parallel model or do inference, you need -a full checkpoint. - -The following is an example of how you can save and load a checkpoint: - -.. code:: python - - # Original model and optimizer - model = MyModel(...) - optimizer = MyOpt(...) - - # model parallel wrapper - model = smp.DistributedModel(model) - optimizer = smp.DistributedOptimizer(optimizer) - - # To save, always save on dp_rank 0 to avoid data racing - if partial: -     # To save the partial model on each mp rank -     # the library will create `checkpoint.pt_{mprank}` for each mp rank -     if save_partial_model: -         if smp.dp_rank() == 0: -             model_dict = model.local_state_dict() # save the partial model -             opt_dict = optimizer.local_state_dict() # save the partial optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 f"/checkpoint.pt", -                 partial=True, -             ) - -     # To save the full model -     if save_full_model: -         if smp.dp_rank() == 0: -             model_dict = model.state_dict() # save the full model -             opt_dict = optimizer.state_dict() # save the full optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 "/checkpoint.pt", -                 partial=False, -             ) - - # To load, load on all ranks. - # The only difference for partial/full loading is the partial flag in smp.load - # Load partial checkpoint - if partial_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=True) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) - # Load full checkpoint - if full_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=False) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) diff --git a/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_tensorflow.rst deleted file mode 100644 index 131fc327ac..0000000000 --- a/doc/api/training/smp_versions/v1.4.0/smd_model_parallel_tensorflow.rst +++ /dev/null @@ -1,172 +0,0 @@ -TensorFlow API -============== - -**Supported version: 2.3.1, 2.4.1, 2.5.0** - -**Important**: This API document assumes you use the following import statement in your training scripts. - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -.. tip:: - - Refer to - `Modify a TensorFlow Training Script - `_ - to learn how to use the following API in your TensorFlow training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of the Keras \ ``Model`` class, which defines the model to - be partitioned. Model definition is done by sub-classing - ``smp.DistributedModel`` class, and implementing the ``call()`` method, - in the same way as the Keras model sub-classing API. 
Any operation that - is part of the \ ``smp.DistributedModel.call()`` method is subject to - partitioning, meaning that every operation placed inside executes in - exactly one of the devices (the operations outside run on all devices). - - - Similar to the regular Keras API, the forward pass is done by directly - calling the model object on the input tensors. For example: - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - However, ``model()`` calls can only be made inside a - ``smp.step``-decorated function. - - The outputs from a ``smp.DistributedModel`` are available in all ranks, - regardless of which rank computed the last operation. - - **Methods:** - - .. function:: save_model(save_path="/opt/ml/model") - :noindex: - - **Inputs** - - ``save_path`` (``string``): A path to save an unpartitioned model with latest training weights. - - Saves the entire, - unpartitioned model with the latest trained weights to ``save_path`` in - TensorFlow ``SavedModel`` format. Defaults to ``"/opt/ml/model"``, which - SageMaker monitors to upload the model artifacts to Amazon S3. - -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (``int``): The index of the partition. - - A context manager which places all operations defined inside into the - partition whose ID is equal to ``index``. When - ``smp.partition`` contexts are nested, the innermost context overrides - the rest. The ``index`` argument must be smaller than the number of - partitions. - - ``smp.partition`` is used in the manual partitioning API; - if \ ``"auto_partition"`` parameter is set to ``True`` while launching - training, then ``smp.partition`` contexts are ignored. Any operation - that is not placed in any ``smp.partition`` context is placed in the - ``default_partition``, as shown in the following example: - - .. code:: python - - # auto_partition: False - # default_partition: 0 - smp.init() - [...] - x = tf.constant(1.2)                     # placed in partition 0 - with smp.partition(1): -     y = tf.add(x, tf.constant(2.3))      # placed in partition 1 -     with smp.partition(3): -         z = tf.reduce_sum(y)             # placed in partition 3 - - -.. function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. - - .. code:: python - - @smp.register_post_partition_hook - def test_eager(): - # All statements here will be executed right after partition but before the first forward pass - tf.print("Entered hook through eager context") - -.. class:: smp.CheckpointManager - :noindex: - - - A subclass of TensorFlow - `CheckpointManager `__, - which is used to manage checkpoints. The usage is similar to TensorFlow - ``CheckpointManager``. - - The following returns a ``CheckpointManager`` object. - - .. code:: python - - smp.CheckpointManager(checkpoint, -                       directory="/opt/ml/checkpoints", -                       max_to_keep=None, -                       checkpoint_name="ckpt") - - **Parameters** - - - ``checkpoint``: A `tf.train.Checkpoint - `__ instance - that represents a model checkpoint. - - - ``directory``: (``str``) The path to a directory in which to write - checkpoints. 
A file named "checkpoint" is also written to this - directory (in a human-readable text format) which contains the state - of the ``CheckpointManager``. Defaults to - ``"/opt/ml/checkpoints"``, which is the directory that SageMaker - monitors for uploading the checkpoints to Amazon S3. - - ``max_to_keep`` (``int``): The number of checkpoints to keep. If - ``None``, all checkpoints are kept. - - ``checkpoint_name`` (``str``): Custom name for the checkpoint file. - Defaults to ``"ckpt"``. - - - **Methods:** - - .. function:: save( ) - :noindex: - - Saves a new checkpoint in the specified directory. Internally uses ``tf.train.CheckpointManager.save()``. - - .. function:: restore( ) - :noindex: - - Restores the latest checkpoint in the specified directory. - Internally uses ``tf.train.CheckpointManager.restore()``. - - - **Examples:** - - .. code:: python - - checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) - ckpt_manager = smp.CheckpointManager(checkpoint, max_to_keep=5)  # use /opt/ml/checkpoints - - for inputs in train_ds: -     loss = train_step(inputs) -     # [...] -     ckpt_manager.save()  # save a new checkpoint in /opt/ml/checkpoints - - .. code:: python - - for step, inputs in enumerate(train_ds): -     if step == 0: -         ckpt_manager.restore() -     loss = train_step(inputs) diff --git a/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_common_api.rst deleted file mode 100644 index 625a7fcbf1..0000000000 --- a/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_common_api.rst +++ /dev/null @@ -1,488 +0,0 @@ -.. admonition:: Contents - - - :ref:`communication_api` - - :ref:`mpi_basics` - -Common API -========== - -The following SageMaker distribute model parallel APIs are common across all frameworks. - -**Important**: This API document assumes you use the following import statement in your training scripts. - -**TensorFlow** - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -**PyTorch** - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. function:: smp.init( ) - :noindex: - - Initialize the library. Must be called at the beginning of training script. - -.. function:: @smp.step(non_split_inputs, input_split_axes, [*args, **kwargs]) - :noindex: - - A decorator that must be placed over a function that represents a single - forward and backward pass (for training use cases), or a single forward - pass (for evaluation use cases). Any computation that is defined inside - the ``smp.step``-decorated function is executed in a pipelined manner. - - By default, every tensor input to the function is split across its batch - dimension into a number of microbatches specified while launching the - training job. This behavior can be customized through the arguments to - ``smp.step``, described below. The library then orchestrates the execution of - each microbatch across all partitions, based on the chosen pipeline - type. - - In a typical use case, forward pass and back-propagation are executed - inside an \ ``smp.step``-decorated function and gradients, loss, and - other relevant metrics (such as accuracy, etc.) are returned from - ``smp.step``-decorated function. 
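-
- For example, a typical PyTorch training step built around ``smp.step``
- might look like the following. This is a minimal sketch for illustration
- only; it assumes that ``model`` is already wrapped with
- ``smp.DistributedModel``, that ``optimizer`` and a loss function
- ``criterion`` are defined elsewhere, and it averages the per-microbatch
- losses outside the decorated function (see ``StepOutput`` below).
-
- .. code:: python
-
-     @smp.step()
-     def train_step(data, target):
-         output = model(data)              # forward pass, executed once per microbatch
-         loss = criterion(output, target)
-         model.backward(loss)              # replaces loss.backward()
-         return loss
-
-     # Outside smp.step: combine the per-microbatch losses and apply the
-     # optimizer update on the accumulated gradients.
-     loss_mb = train_step(data, target)
-     loss = loss_mb.reduce_mean()
-     optimizer.step()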
-
- Any gradient post-processing operation, such as gradient clipping and
- allreduce, as well as ``optimizer.apply_gradients`` calls (for TF) or
- ``optimizer.step`` (for PT), should be applied to the gradients returned
- from the ``smp.step`` function, and not inside the ``smp.step``
- function. This is because every operation inside ``smp.step`` is
- executed once per microbatch, so having these operations inside
- ``smp.step`` can either be inefficient (in the case of allreduce) or
- lead to wrong results (in the case of ``apply_gradients`` /
- ``optimizer.step``).
-
- If the objects returned from the ``smp.step``-decorated function contain
- ``tf.Tensor``\ s / ``torch.Tensor``\ s, they are converted to
- ``StepOutput`` objects. A ``StepOutput`` object encapsulates all
- versions of the tensor across different microbatches
- (see the ``StepOutput`` entry for more information).
-
- An argument to the ``smp.step``-decorated function is split across
- microbatches only if it is a tensor or an instance of ``list``, ``tuple``,
- ``dict``, or ``set``. If your object doesn't fall into this category, you
- can make the library split your object by implementing the ``smp_slice``
- method.
-
- Below is an example of how to use it with PyTorch.
-
- .. code:: python
-
-     class CustomType:
-         def __init__(self, tensor):
-             self.data = tensor
-
-         # The library calls this to slice the object, passing in the total
-         # number of microbatches (num_mb) and the current microbatch index (mb).
-         def smp_slice(self, num_mb, mb, axis):
-             dim_size = list(self.data.size())[axis]
-
-             split_size = dim_size // num_mb
-             sliced_tensor = self.data.narrow(axis, mb * split_size, split_size)
-             return CustomType(sliced_tensor)
-
-     custom_obj = CustomType(torch.ones(4,))
-
-     @smp.step()
-     def step(custom_obj):
-         loss = model(custom_obj)
-         model.backward(loss)
-         return loss
-
-
- **Important:** ``smp.step`` splits the batch into microbatches and
- executes everything inside the decorated function once per microbatch.
- This might affect the behavior of batch normalization, any operation
- that explicitly uses the batch size information, or any other Python
- code that is expected to run once.
-
- **TensorFlow-specific behavior**
-
- ``smp.step`` is a wrapper that
- inherits from and extends the behavior of ``tf.function``, and as such,
- all the caveats that apply to the use of ``tf.function``\ s also apply
- to ``smp.step``. In particular, any operation that is inside
- ``smp.step`` executes in graph mode, and not eager mode.
-
- ``smp.step`` traces the wrapped function in the first call, and again every
- time one of the tensor arguments changes its shape or dtype, or for every
- new value of a Python argument, if there is one. Tracing is expensive,
- so such scenarios should be avoided as much as possible or,
- alternatively, an ``input_signature`` argument must be provided. For
- more information on the usage of ``tf.function``, refer to the
- TensorFlow documentation:
-
- - https://www.tensorflow.org/api_docs/python/tf/function
- - https://www.tensorflow.org/guide/function
-
- Each ``smp.step``-decorated function must have a return value that depends on the
- output of ``smp.DistributedModel``.
-
- **Common parameters**
-
- - ``non_split_inputs`` (``list``): The list of arguments to the decorated function
-   that should not be split along the batch dimension. Should be used
-   for all input tensors that do not have a batch dimension.
Should be a - list of argument names as ``str``, as they appear in the signature of - the ``smp.step``-decorated function. By default it is considered an - empty list. - - - ``input_split_axes`` (``dict``): A dict that maps the argument name to its batch - axis. The keys should be the argument names as ``str``, as they - appear in the signature of the ``smp.step``-decorated function.  By - default all batch axes are assumed to be the 0-axis. - - **TensorFlow-only parameters** - - - All arguments of ``tf.function``. Note: - The \ ``experimental_compile`` argument of ``tf.function`` may not - work as expected with ``smp.step``, since it interferes with - pipelining and model partitioning. To enable XLA with the library, you can - instead use \ ``tf.config.optimizer.set_jit(True)``. - - **PyTorch-only parameters** - - - ``detach_outputs`` (``bool``) : If ``True``, calls ``torch.Tensor.detach()`` on - all returned ``torch.Tensor`` outputs. Setting it to ``False`` - increases memory consumption, unless ``detach()`` is manually called - on the returned tensors, because the model graph is not cleared from - memory after the training step. Set to \ ``True`` by default. - - **Returns** - - - The same object(s) returned from the decorated function. All - returned \ ``tf.Tensor``, \ ``tf.Variable``  objects (for TF) or - ``torch.Tensor`` objects (for PT) are wrapped inside - a \ ``StepOutput`` object, even when they are inside a Python - ``list``, ``tuple``, or ``dict``. - - - -.. class:: StepOutput - :noindex: - - - A class that encapsulates all versions of a ``tf.Tensor`` - or \ ``torch.Tensor`` across all microbatches. - - When a particular ``tf.Tensor`` or ``torch.Tensor`` is computed inside - ``smp.step``, different versions of the tensor are computed for each - microbatch. - - When this tensor is returned from ``smp.step`` and is accessed outside - of the decorated function, it appears as a ``StepOutput`` object, which - contains all such versions. For example, - - - In the case of Tensorflow, the gradient for a particular - ``tf.Variable`` is computed on each microbatch individually, and if - this gradient is returned from ``smp.step``, all gradients for this - ``tf.Variable`` become part of the same ``StepOutput`` object. The - ``StepOutput`` class offers the following API for commonly-used - post-processing operations on such tensors. - - In the case of PyTorch, the loss for each microbatch is computed - individually and all the ``torch.Tensor``\ s that represent the loss - for different microbatches become part of same ``StepOutput`` object, - if loss is returned from the ``smp.step`` function. - - - The ``StepOutput`` class offers the following API for commonly-used - post-processing operations on tensors. - - .. data:: StepOutput.outputs - :noindex: - - Returns a list of the underlying tensors, indexed by microbatch. - - .. function:: StepOutput.reduce_mean( ) - :noindex: - - Returns a ``tf.Tensor``, ``torch.Tensor`` that averages the constituent ``tf.Tensor`` s - ``torch.Tensor`` s. This is commonly used for averaging loss and gradients across microbatches. - - .. function:: StepOutput.reduce_sum( ) - :noindex: - - Returns a ``tf.Tensor`` / - ``torch.Tensor`` that sums the constituent - ``tf.Tensor``\ s/\ ``torch.Tensor``\ s. - - .. function:: StepOutput.concat( ) - :noindex: - - Returns a - ``tf.Tensor``/``torch.Tensor`` that concatenates tensors along the - batch dimension using ``tf.concat`` / ``torch.cat``. - - .. 
function:: StepOutput.stack( ) - :noindex: - - Applies ``tf.stack`` / ``torch.stack`` - operation to the list of constituent ``tf.Tensor``\ s / - ``torch.Tensor``\ s. - - **TensorFlow-only methods** - - .. function:: StepOutput.merge( ) - :noindex: - - Returns a ``tf.Tensor`` that - concatenates the constituent ``tf.Tensor``\ s along the batch - dimension. This is commonly used for merging the model predictions - across microbatches. - - .. function:: StepOutput.accumulate(method="variable", var=None) - :noindex: - - Functionally the same as ``StepOutput.reduce_mean()``. However, it is - more memory-efficient, especially for large numbers of microbatches, - since it does not wait for all constituent \ ``tf.Tensor``\ s to be - ready to start averaging them, thereby saving memory. - - In some cases (XLA for example) ``StepOutput.reduce_mean()`` might end - up being more memory-efficient than ``StepOutput.accumulate()``. - - **Parameters** - - - ``method`` (``"add_n"`` or ``"accumulate_n"`` or ``"variable"``): - If ``"add_n"`` or ``"accumulate_n"``, the library uses - ``tf.add_n`` and ``tf.accumulate_n``, respectively, to implement - accumulation. If ``"variable"``, the library uses an internal ``tf.Variable`` - into which to accumulate the tensors. Default is \ ``"variable"``. - Note: Memory usage behavior of these choices can depend on the model - and implementation. - - - ``var``: A ``tf.Variable`` into which, if provided, the library uses to - accumulate the tensors. If \ ``None``, the library internally creates a - variable. If ``method`` is not ``"variable"``, this argument is - ignored. - -.. _mpi_basics: - :noindex: - -MPI Basics -^^^^^^^^^^ - -The library exposes the following basic MPI primitives to its Python API: - -- ``smp.rank()``: The rank of the current process. -- ``smp.size()``: The total number of processes. -- ``smp.mp_rank()``: The rank of the process among the processes that - hold the current model replica. -- ``smp.dp_rank()``: The rank of the process among the processes that - hold different replicas of the same model partition. -- ``smp.dp_size()``: The total number of model replicas. -- ``smp.local_rank()``: The rank among the processes on the current - instance. -- ``smp.local_size()``: The total number of processes on the current - instance. -- ``smp.get_mp_group()``: The list of ranks over which the current - model replica is partitioned. -- ``smp.get_dp_group()``: The list of ranks that hold different - replicas of the same model partition. - - .. _communication_api: - :noindex: - -Communication API -^^^^^^^^^^^^^^^^^ - -The library provides a few communication primitives which can be helpful while -developing the training script. These primitives use the following -``enum`` s as arguments to specify which processes the communication -should involve. -​ - -**Helper structures** - -.. data:: smp.CommGroup - :noindex: - - An ``enum`` that takes the values - ``CommGroup.WORLD``, ``CommGroup.MP_GROUP``, and ``CommGroup.DP_GROUP``. - These values can also be accessed as ``smp.WORLD``, ``smp.MP_GROUP``, - and ``smp.DP_GROUP`` respectively. - - - ``CommGroup.WORLD``: Represents the entire group of processes used in - training - - ``CommGroup.MP_GROUP``: Represents the group of processes that hold - the same model replica as the current process. The processes in a - single ``MP_GROUP`` collectively store an entire replica of the - model. - - ``CommGroup.DP_GROUP``: Represents the group of processes that hold - the same model partition as the current process. 
The processes in a - single ``DP_GROUP`` perform data parallelism/allreduce among - themselves. - -.. data:: smp.RankType - :noindex: - - An ``enum`` that takes the values - ``RankType.WORLD_RANK``, ``RankType.MP_RANK``, and ``RankType.DP_RANK``. - - - ``RankType.WORLD_RANK``: The associated rank is to be interpreted as - the rank of the process across all processes used in training. - - ``RankType.MP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``MP_GROUP``. - - ``RankType.DP_RANK``: The associated rank is to be interpreted as the - rank of the process within the ``DP_GROUP``. - - -**Communication primitives:** - -.. function:: smp.broadcast(obj, group) - :noindex: - - Sends the object to all processes in the - group. The receiving process must call ``smp.recv_from`` to receive the - sent object. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be broadcast. - - - ``group``: A ``CommGroup`` argument that represents to which group of - processes the object will be sent. - - **Notes** - - - When you use ``broadcast`` on the sender process, there needs - to be an accompanying ``smp.recv_from()`` call on the receiver - processes. - - - This is a synchronous call; the ``broadcast`` statement - returns only after all ranks participating in the call have made a - matching ``recv_from`` call. - - **Example** - - .. code:: python - - if smp.rank() == 0: -     smp.broadcast(something, group=smp.CommGroup.WORLD) - else: -     smp.recv_from(0, rank_type=smp.RankType.WORLD_RANK) - -.. function:: smp.send(obj, dest_rank, rank_type) - :noindex: - - Sends the object ``obj`` to - ``dest_rank``, which is of a type specified by ``rank_type``. - - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be sent. - - - ``dest_rank`` (``int``): An integer denoting the rank of the receiving process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``dest_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then ``obj`` is sent to process - with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the current - process. - - **Notes** - - - Note: \ This is a synchronous call; the ``send`` statement returns - only after the destination rank has made a matching - ``recv_from`` call. - -.. function:: smp.recv_from(src_rank, rank_type) - :noindex: - - Receive an object from a peer process. Can be used with a matching - ``smp.send`` or a ``smp.broadcast`` call. - - **Inputs** - - - ``src_rank`` (``int``): An integer denoting rank of the sending process. - - - ``rank_type`` (``enum``): A ``smp.RankType`` ``enum`` that determines how - ``dest_rank`` is to be interpreted. For example if ``src_rank`` is 1 - and ``rank_type`` is ``MP_RANK``, then the object is received from - the process with ``mp_rank`` 1 in the ``MP_GROUP`` which contains the - current process. - - **Returns** - - Returns the python object that is sent by the peer process. - - **Notes** - - - Note: This is a synchronous call; the ``recv_from`` statement returns - only after the source rank has made a matching ``send`` or - ``broadcast`` call, and the object is received. - -.. function:: smp.allgather(obj, group) - :noindex: - - A collective call that gathers all the - submitted objects across all ranks in the specified ``group``. Returns a - list whose ``i``\ th index contains the object submitted by the - ``i``\ th rank in ``group``. 
- - **Inputs** - - - ``obj``: An arbitrary picklable Python object that will be - allgathered. - - - ``group`` : A ``CommGroup`` argument that represents which group of - processes participate in ``allgather``. - - **Notes** - - - Note: This is a synchronous call; the ``allgather`` statement returns - only after all ranks participating in the call have made a matching - ``allgather`` call, and all the objects are received at the current - rank. - - **Examples** - - .. code:: python - - # assuming mp_size() == 2 - - if smp.mp_rank() == 0: -     out = smp.allgather(obj1, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - else: -     out = smp.allgather(obj2, smp.CommGroup.MP_GROUP)  # returns [obj1, obj2] - -.. function:: smp.barrier(group=smp.WORLD) - :noindex: - - A statement that hangs until all - processes in the specified group reach the barrier statement, similar to - ``MPI_Barrier()``. - - **Inputs** - - - ``group``: An ``smp.CommGroup`` ``enum`` that specifies the group of - processes participating in the barrier call. Defaults to - ``smp.WORLD``. - - **Examples** - - - Assume there are 8 processes and 2 model partitions, and - therefore 4 \ ``mp_group``\ s, and 2 ``dp_group``\ s. If - the \ ``barrier`` call is passed the value ``smp.MP_GROUP`` for its - group argument, then each process only waits until the other process - of its own ``mp_group`` reaches that point. It does not wait for - processes outside that ``mp_group``. - -.. function:: smp.dp_barrier() - :noindex: - - Same as passing ``smp.DP_GROUP``\ to ``smp.barrier()``. - Waits for the processes in the same \ ``dp_group`` as - the current process to reach the same point in execution. - -.. function:: smp.mp_barrier() - :noindex: - - Same as passing ``smp.MP_GROUP`` to - ``smp.barrier()``. Waits for the processes in the same ``mp_group`` as - the current process to reach the same point in execution. diff --git a/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_pytorch.rst deleted file mode 100644 index d2fcb95954..0000000000 --- a/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_pytorch.rst +++ /dev/null @@ -1,572 +0,0 @@ -.. admonition:: Contents - - - :ref:`pytorch_saving_loading` - - :ref:`pytorch_saving_loading_instructions` - -PyTorch API -=========== - -**Supported versions: 1.7.1, 1.8.1** - -This API document assumes you use the following import statements in your training scripts. - -.. code:: python - - import smdistributed.modelparallel.torch as smp - - -.. tip:: - - Refer to - `Modify a PyTorch Training Script - `_ - to learn how to use the following API in your PyTorch training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of ``torch.nn.Module`` which specifies the model to be - partitioned. Accepts a ``torch.nn.Module`` object ``module`` which is - the model to be partitioned. The returned ``DistributedModel`` object - internally manages model parallelism and data parallelism. Only one - model in the training script can be wrapped with - ``smp.DistributedModel``. - - **Example:** - - .. code:: python - - model = smp.DistributedModel(model) - - **Important**: The ``__call__`` and  ``backward`` method calls on the - ``smp.DistributedModel`` object (in the following example, the object - is \ ``model``) can only be made inside a ``smp.step``-decorated - function. 
-
-
- Since ``DistributedModel`` is a ``torch.nn.Module``, a forward pass can
- be performed by calling the ``DistributedModel`` object on the input
- tensors.
-
- .. code:: python
-
-     predictions = model(inputs)   # model is a smp.DistributedModel object
-
- For a backward pass, one needs to call the backward function on
- the ``DistributedModel`` object, with tensors and gradients as
- arguments, replacing the PyTorch operations ``torch.Tensor.backward``
- or ``torch.autograd.backward``.
-
- The API for ``model.backward`` is very similar to
- ``torch.autograd.backward``. For example, the following
- ``backward`` calls:
-
- .. code:: python
-
-     torch.autograd.backward(loss) or loss.backward()
-
- should be replaced with:
-
- .. code:: python
-
-     model.backward(loss) # loss is a tensor with only one element as its data
-
- Similarly, for non-scalar tensors, replace the following
- ``backward`` call containing incoming gradient arguments:
-
- .. code:: python
-
-     torch.autograd.backward(outputs, out_grads)
-
- with the following line:
-
- .. code:: python
-
-     model.backward(outputs, out_grads)
-
- In these examples, all ``__call__`` and ``backward`` method calls on
- the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside
- a ``smp.step``-decorated function.
-
- **Using DDP**
-
- If DDP is enabled, do not place a PyTorch
- ``DistributedDataParallel`` wrapper around the ``DistributedModel``, because
- the ``DistributedModel`` wrapper also handles data parallelism.
-
- Unlike the original DDP wrapper, when you use ``DistributedModel``,
- model parameters and buffers are not immediately broadcast across
- processes when the wrapper is called. Instead, the broadcast is deferred to the first call of the
- ``smp.step``-decorated function, after the partition is done.
-
- **Parameters**
-
- - ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism).
-
- - ``trace_device`` (``"cpu"`` or ``"gpu"``) (default: ``"gpu"``):
-   Whether to perform the tracing step on the GPU or CPU. The tracing step gathers
-   information on the order of execution of modules, the shapes of
-   intermediate outputs, and execution times, to be used by the
-   partitioning algorithm. If ``trace_device`` is set to GPU, accurate
-   module execution times can be gathered during tracing for a potentially
-   improved partitioning decision. However, if the model is too large to
-   fit in a single GPU, then ``trace_device`` should be set to CPU.
-
- - ``trace_execution_times`` (``bool``) (default: ``False``): If ``True``,
-   the library profiles the execution time of each module during tracing, and uses
-   it in the partitioning decision. This improves the partitioning
-   decision, but it might make the tracing slower. It may also introduce
-   some degree of non-determinism in partitioning results, because of the
-   inherent randomness in module execution times. Must be ``False`` if
-   ``trace_device`` is ``"cpu"``.
-
- - ``overlapping_allreduce`` (``bool``) (default: ``True``): This is only
-   applicable for hybrid data parallelism/model parallelism use cases (when
-   ``ddp`` is set to ``True`` while launching training). The library uses this flag
-   to decide whether to do overlapping allreduce whenever a parameter's
-   gradients are ready. This leads to overlapping of communication and
-   computation and can improve performance. If this is set to ``False``,
-   allreduce is performed at the end of the step.
- - - ``backward_passes_per_step`` (``int``) (default: 1): This is only - applicable for hybrid data parallelism/model parallelism use cases (when - ``ddp`` is set to ``True`` in config). This parameter indicates the - number of backward passes to perform before calling allreduce on DDP. - This allows accumulating updates over multiple mini-batches before - reducing and applying them. - - - ``average_grads_across_microbatches`` (``bool``) (default: ``True``): - Whether or not the computed gradients should be averaged across - microbatches. If ``False``, the computed gradients will be summed across - microbatches, but not divided by the number of microbatches. In typical - use case where the computed loss is averaged over the mini-batch, this - should be left as ``True``. If you use a loss function that only sums - the per-sample loss across the batch (and not divide by the batch size), - then this must be set to ``False`` for correctness. - - - ``bucket_cap_mb`` (default: 25): \ ``DistributedDataParallel`` buckets - parameters into multiple buckets so that gradient reduction of each - bucket can potentially overlap with backward - computation. \ ``bucket_cap_mb``\ controls the bucket size in MegaBytes - (MB). - - - ``trace_memory_usage`` (default: False): When set to True, the library attempts - to measure memory usage per module during tracing. If this is disabled, - memory usage will be estimated through the sizes of tensors returned from - the module. - - - ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``. - This parameter is forwarded to the underlying ``DistributedDataParallel`` wrapper. - Please see: `broadcast_buffer `__. - - - ``gradient_as_bucket_view`` (default: False): To be - used with ``ddp=True``. This parameter is forwarded to the underlying - ``DistributedDataParallel`` wrapper. Please see `gradient_as_bucket_view `__. - - **Properties** - - - ``partitioned``: Is ``True`` if the model is partitioned, ``False`` - otherwise. Initialized to ``False`` when ``DistributedModel`` is first - created. It becomes be ``True`` during the first call - to ``smp.step``-decorated function. Once the model is partitioned, the - local parameters or local ``state_dict`` can be fetched using the - following methods. - - **Methods** - - .. function:: backward(tensors, grad_tensors) - :noindex: - - Triggers a distributed backward - pass across model partitions. Example usage provided in the previous - section. The API is very similar - to https://pytorch.org/docs/stable/autograd.html#torch.autograd.backward. - ``retain_grad`` and ``create_graph``  flags are not supported. - - .. function:: local_buffers( ) - :noindex: - - Returns an iterator over buffers for the modules in - the partitioned model that have been assigned to the current process. - - .. function:: local_named_buffers( ) - :noindex: - - Returns an iterator over buffers for the - modules in the partitioned model that have been assigned to the current - process. This yields both the name of the buffer as well as the buffer - itself. - - .. function:: local_parameters( ) - :noindex: - - Returns an iterator over parameters for the - modules in the partitioned model that have been assigned to the current - process. - - .. function:: local_named_parameters( ) - :noindex: - - Returns an iterator over parameters for - the modules in the partitioned model that have been assigned to the - current process. This yields both the name of the parameter as well as - the parameter itself. - - .. 
function:: local_modules( )
- :noindex:
-
- Returns an iterator over the modules in the
- partitioned model that have been assigned to the current process.
-
- .. function:: local_named_modules( )
- :noindex:
-
- Returns an iterator over the modules in the
- partitioned model that have been assigned to the current process. This
- yields both the name of the module as well as the module itself.
-
- .. function:: local_state_dict( )
- :noindex:
-
- Returns the ``state_dict`` that contains the local
- parameters that belong to the current ``mp_rank``. This ``state_dict``
- contains a key ``_smp_is_partial``, which indicates that the
- ``state_dict`` contains elements corresponding to only the current
- partition rather than to the entire model.
-
- .. function:: state_dict( )
- :noindex:
-
- Returns the ``state_dict`` that contains parameters
- for the entire model. It first collects the ``local_state_dict`` and
- gathers and merges the ``local_state_dict`` from all ``mp_rank``\ s to
- create a full ``state_dict``. Note that this needs to be called on all ranks with
- ``dp_rank()==0`` to ensure the gather happens properly;
- if it is called on only a subset of those ranks, it can hang.
-
- .. function:: load_state_dict( )
- :noindex:
-
- Same as ``torch.nn.Module.load_state_dict()``,
- except: It first gathers and merges the ``state_dict``\ s across
- ``mp_rank``\ s, if they are partial. The actual loading happens after the
- model partition so that each rank knows its local parameters.
-
- .. function:: register_post_partition_hook(hook)
- :noindex:
-
- Registers a callable ``hook`` to
- be executed after the model is partitioned. This is useful in situations
- where an operation needs to be executed after the model partition during
- the first call to ``smp.step``, but before the actual execution of the
- first forward pass. Returns a ``RemovableHandle`` object ``handle``,
- which can be used to remove the hook by calling ``handle.remove()``.
-
- .. function:: cpu( )
- :noindex:
-
- Allgathers parameters and buffers across all ``mp_rank``\ s and moves them
- to the CPU.
-
- .. function:: join( )
- :noindex:
-
- A context manager to be used in conjunction with an instance of
- ``smp.DistributedModel`` to be able to train with uneven inputs across
- participating processes. This is only supported when ``ddp=True``. It uses the
- ``join`` method of the wrapped ``DistributedDataParallel`` instance. For more information, see:
- `join `__
- in the PyTorch documentation.
-
- .. function:: register_comm_hook( state, callable )
- :noindex:
-
- **Available for PyTorch 1.8.1 only**
- Registers a communication hook, which provides a flexible ``callable`` in which
- you can specify how gradients are aggregated across multiple workers.
- This method is called on the wrapped ``DistributedDataParallel`` instance.
-
- Note that when you register a comm hook, you have full control of how the gradients are processed.
- When using only data parallelism with Torch DDP, you are expected to average grads across data parallel replicas within the hook.
- Similarly, when using ``DistributedModel``, you have to average grads across data parallel replicas within the hook.
- In addition, you also have to average grads across microbatches within the hook, unless you explicitly do not want to average them based on your loss function.
- See ``average_grads_across_microbatches`` for more information about averaging grads across microbatches. - - This is only supported when ``ddp=True`` and ``overlapping_allreduce=True`` (default). - For more information, see: - `register_comm_hook `__ - in the PyTorch documentation. - - - -.. class:: smp.DistributedOptimizer - :noindex: - - **Parameters** - - ``optimizer`` - - An optimizer wrapper for saving/loading optimizer states. This wrapper - returns ``optimizer`` with the following methods overridden: - - .. function:: state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains optimizer state for the entire model. - It first collects the ``local_state_dict`` and gathers and merges - the ``local_state_dict`` from all ``mp_rank``s to create a full - ``state_dict``. - - .. function:: load_state_dict( ) - :noindex: - - Same as the ``torch.optimizer.load_state_dict()`` , except: - - - It first gathers and merges the local ``state_dict``\ s if they are - partial. - - The actual loading happens after the model partition so that each - rank knows its local parameters. - - .. function:: local_state_dict( ) - :noindex: - - Returns the ``state_dict`` that contains the - local optimizer state that belongs to the current \ ``mp_rank``. This - ``state_dict`` contains a key \ ``_smp_is_partial`` to indicate this is - a partial \ ``state_dict``, which indicates whether the - ``state_dict`` contains elements corresponding to only the current - partition, or to the entire model. - - ​ -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (int) - The index of the partition. - - A context manager which places all modules defined inside into the - partition with ID ``index``.  The ``index`` argument must be less than - the number of partitions. - - Use ``smp.partition`` to implement manual partitioning. - If ``"auto_partition"`` is ``True``, then the - ``smp.partition`` contexts are ignored. Any module that is not placed in - any ``smp.partition`` context is placed in the - ``default_partition`` defined through the SageMaker Python SDK. - - When ``smp.partition`` contexts are nested, the innermost context - overrides the rest (see the following example). In PyTorch, manual - partitioning should be done inside the module \ ``__init__``, and the - partition assignment applies to the modules that are *created* inside - the ``smp.partition`` context. - - Example: - - .. code:: python - - class Model(torch.nn.Module): -     def __init__(self): -         with smp.partition(1): -             self.child0 = Child0()            # child0 on partition 1 -             with smp.partition(2): -                 self.child1 = Child1()        # child1 on partition 2 -             self.child2 = Child2()            # child2 on partition 1 -         self.child3 = Child3()                # child3 on default_partition - -.. function:: smp.get_world_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of all - processes, which can be used with the ``torch.distributed`` API. - Requires ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.get_mp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``MP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. 
function:: smp.get_dp_process_group( ) - :noindex: - - Returns a ``torch.distributed`` ``ProcessGroup`` that consists of the - processes in the ``DP_GROUP`` which contains the current process, which - can be used with the \ ``torch.distributed`` API. Requires - ``"ddp": True`` in SageMaker Python SDK parameters. - -.. function:: smp.is_initialized( ) - :noindex: - - Returns ``True`` if ``smp.init`` has already been called for the - process, and ``False`` otherwise. - -.. function::smp.is_tracing( ) - :noindex: - - Returns ``True`` if the current process is running the tracing step, and - ``False`` otherwise. - -.. data:: smp.nn.FusedLayerNorm - :noindex: - - `Apex Fused Layer Norm `__ is currently not - supported by the library. ``smp.nn.FusedLayerNorm`` replaces ``apex`` - ``FusedLayerNorm`` and provides the same functionality. This requires - ``apex`` to be installed on the system. - -.. data:: smp.optimizers.FusedNovoGrad - :noindex: - - - `Fused Novo Grad optimizer `__ is - currently not supported by the library. ``smp.optimizers.FusedNovoGrad`` replaces ``apex`` ``FusedNovoGrad`` - optimizer and provides the same functionality. This requires ``apex`` to - be installed on the system. - -.. data:: smp.optimizers.FusedLamb - :noindex: - - - `FusedLamb optimizer `__ - currently doesn’t work with the library. ``smp.optimizers.FusedLamb`` replaces - ``apex`` ``FusedLamb`` optimizer and provides the same functionality. - This requires ``apex`` to be installed on the system. - -.. data:: smp.amp.GradScaler - :noindex: - - `Torch AMP Gradscaler `__ - currently doesn’t work with the library. ``smp.amp.GradScaler`` replaces - ``torch.amp.GradScaler`` and provides the same functionality. - -.. _pytorch_saving_loading: - :noindex: - -APIs for Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. function:: smp.save( ) - :noindex: - - Saves an object. This operation is similar to ``torch.save()``, except - it has an additional keyword argument, ``partial``, and accepts only - string type for the argument ``f`` (file). If ``partial=True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an ``mp_rank`` - index to your saved file. - - **Parameters** - - - ``obj`` (dict): A saved object. - - ``f`` (str): A string containing a file name. - - ``partial`` (bool, default= ``True``):  When set to ``True``, each - ``mp_rank`` saves a separate checkpoint file and the library adds an - ``mp_rank`` index to the saved file. If you want to be able to load - and further train a model that you save with ``smp.save()``, you must - set ``partial=True``. - - ``pickle_module`` (picklemodule, default = module ``"pickle"`` from ``"/opt/conda/lib/python3.6/pickle.py"``): - A module used for pickling metadata and objects. - - ``pickle_protocol``  (int, default=2): Can be specified to - override the defaultprotocol. - -.. function:: smp.load( ) - :noindex: - - Loads an object saved with ``smp.save()`` from a file. - - Similar to, `torch.load() `__, - except it has an additional keyword argument, ``partial``, and accepts - only string type for the argument ``f`` (file). If \ ``partial=True``, - then each ``mp_rank`` loads a separate checkpoint file. - - **Parameters** - - - ``f`` (string): A string containing a file name. - - ``map_location`` (function): A function - `torch.device `__, - a string, or a dict specifying how to remap storage locations. 
- - ``pickle_module`` (pickle module): A module used for unpickling - metadata and objects (has to match the \ ``pickle_module``\ used to - serialize file). - - ``pickle_load_args`` (Python 3 only): Optional keyword arguments - passed to ``pickle_module.load()`` and ``pickle_module.Unpickler()``. - - ``partial`` (bool, default= ``True``): When set to ``True``, each - ``mp_rank`` loads the checkpoint corresponding to the ``mp_rank``. - Should be used when loading a model trained with the library. - -.. _pytorch_saving_loading_instructions: - :noindex: - -General Instruction For Saving and Loading -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The library can save partial or full checkpoints. - -- For partial checkpoints, each ``mp_rank`` saves its own checkpoint - file with only the parameters that belong to that rank. -- For full checkpoints, the library saves a single checkpoint that contains - entire model parameters. - -When **saving** using ``smp.save()``, each rank only holds its own -parameters. If you want to save the full model, there will be some -communication between the ranks to create the full model. If you save -checkpoints often, you should save partial checkpoints for best -performance. - -When **loading** using ``smp.load()``, the library can load either partial or | -full checkpoints or full checkpoints saved by a non-model-parallel model. If you -want to resume training with a non-model-parallel model or do inference, you need -a full checkpoint. - -The following is an example of how you can save and load a checkpoint: - -.. code:: python - - # Original model and optimizer - model = MyModel(...) - optimizer = MyOpt(...) - - # model parallel wrapper - model = smp.DistributedModel(model) - optimizer = smp.DistributedOptimizer(optimizer) - - # To save, always save on dp_rank 0 to avoid data racing - if partial: -     # To save the partial model on each mp rank -     # the library will create `checkpoint.pt_{mprank}` for each mp rank -     if save_partial_model: -         if smp.dp_rank() == 0: -             model_dict = model.local_state_dict() # save the partial model -             opt_dict = optimizer.local_state_dict() # save the partial optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 f"/checkpoint.pt", -                 partial=True, -             ) - -     # To save the full model -     if save_full_model: -         if smp.dp_rank() == 0: -             model_dict = model.state_dict() # save the full model -             opt_dict = optimizer.state_dict() # save the full optimizer state -             smp.save( -                 {"model_state_dict": model_dict, "optimizer_state_dict": opt_dict}, -                 "/checkpoint.pt", -                 partial=False, -             ) - - # To load, load on all ranks. 
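- # With partial=True, each mp_rank reads the checkpoint file saved for its own
- # partition (e.g. checkpoint.pt_{mprank}); with partial=False, every rank reads
- # the same full checkpoint file.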
- # The only difference for partial/full loading is the partial flag in smp.load - # Load partial checkpoint - if partial_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=True) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) - # Load full checkpoint - if full_checkpoint: -    checkpoint = smp.load("/checkpoint.pt", partial=False) -    model.load_state_dict(checkpoint["model_state_dict"]) -    optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) diff --git a/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_tensorflow.rst deleted file mode 100644 index 131fc327ac..0000000000 --- a/doc/api/training/smp_versions/v1.5.0/smd_model_parallel_tensorflow.rst +++ /dev/null @@ -1,172 +0,0 @@ -TensorFlow API -============== - -**Supported version: 2.3.1, 2.4.1, 2.5.0** - -**Important**: This API document assumes you use the following import statement in your training scripts. - -.. code:: python - - import smdistributed.modelparallel.tensorflow as smp - -.. tip:: - - Refer to - `Modify a TensorFlow Training Script - `_ - to learn how to use the following API in your TensorFlow training script. - -.. class:: smp.DistributedModel - :noindex: - - A sub-class of the Keras \ ``Model`` class, which defines the model to - be partitioned. Model definition is done by sub-classing - ``smp.DistributedModel`` class, and implementing the ``call()`` method, - in the same way as the Keras model sub-classing API. Any operation that - is part of the \ ``smp.DistributedModel.call()`` method is subject to - partitioning, meaning that every operation placed inside executes in - exactly one of the devices (the operations outside run on all devices). - - - Similar to the regular Keras API, the forward pass is done by directly - calling the model object on the input tensors. For example: - - .. code:: python - - predictions = model(inputs)   # model is a smp.DistributedModel object - - However, ``model()`` calls can only be made inside a - ``smp.step``-decorated function. - - The outputs from a ``smp.DistributedModel`` are available in all ranks, - regardless of which rank computed the last operation. - - **Methods:** - - .. function:: save_model(save_path="/opt/ml/model") - :noindex: - - **Inputs** - - ``save_path`` (``string``): A path to save an unpartitioned model with latest training weights. - - Saves the entire, - unpartitioned model with the latest trained weights to ``save_path`` in - TensorFlow ``SavedModel`` format. Defaults to ``"/opt/ml/model"``, which - SageMaker monitors to upload the model artifacts to Amazon S3. - -.. function:: smp.partition(index) - :noindex: - - **Inputs** - - - ``index`` (``int``): The index of the partition. - - A context manager which places all operations defined inside into the - partition whose ID is equal to ``index``. When - ``smp.partition`` contexts are nested, the innermost context overrides - the rest. The ``index`` argument must be smaller than the number of - partitions. - - ``smp.partition`` is used in the manual partitioning API; - if \ ``"auto_partition"`` parameter is set to ``True`` while launching - training, then ``smp.partition`` contexts are ignored. Any operation - that is not placed in any ``smp.partition`` context is placed in the - ``default_partition``, as shown in the following example: - - .. code:: python - - # auto_partition: False - # default_partition: 0 - smp.init() - [...] 
- x = tf.constant(1.2)                     # placed in partition 0 - with smp.partition(1): -     y = tf.add(x, tf.constant(2.3))      # placed in partition 1 -     with smp.partition(3): -         z = tf.reduce_sum(y)             # placed in partition 3 - - -.. function:: register_post_partition_hook(hook) - :noindex: - - Registers a callable ``hook`` to - be executed after the model is partitioned. This is useful in situations - where an operation needs to be executed after the model partition during - the first call to ``smp.step``, but before the actual execution of the - first forward pass. - - .. code:: python - - @smp.register_post_partition_hook - def test_eager(): - # All statements here will be executed right after partition but before the first forward pass - tf.print("Entered hook through eager context") - -.. class:: smp.CheckpointManager - :noindex: - - - A subclass of TensorFlow - `CheckpointManager `__, - which is used to manage checkpoints. The usage is similar to TensorFlow - ``CheckpointManager``. - - The following returns a ``CheckpointManager`` object. - - .. code:: python - - smp.CheckpointManager(checkpoint, -                       directory="/opt/ml/checkpoints", -                       max_to_keep=None, -                       checkpoint_name="ckpt") - - **Parameters** - - - ``checkpoint``: A `tf.train.Checkpoint - `__ instance - that represents a model checkpoint. - - - ``directory``: (``str``) The path to a directory in which to write - checkpoints. A file named "checkpoint" is also written to this - directory (in a human-readable text format) which contains the state - of the ``CheckpointManager``. Defaults to - ``"/opt/ml/checkpoints"``, which is the directory that SageMaker - monitors for uploading the checkpoints to Amazon S3. - - ``max_to_keep`` (``int``): The number of checkpoints to keep. If - ``None``, all checkpoints are kept. - - ``checkpoint_name`` (``str``): Custom name for the checkpoint file. - Defaults to ``"ckpt"``. - - - **Methods:** - - .. function:: save( ) - :noindex: - - Saves a new checkpoint in the specified directory. Internally uses ``tf.train.CheckpointManager.save()``. - - .. function:: restore( ) - :noindex: - - Restores the latest checkpoint in the specified directory. - Internally uses ``tf.train.CheckpointManager.restore()``. - - - **Examples:** - - .. code:: python - - checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) - ckpt_manager = smp.CheckpointManager(checkpoint, max_to_keep=5)  # use /opt/ml/checkpoints - - for inputs in train_ds: -     loss = train_step(inputs) -     # [...] -     ckpt_manager.save()  # save a new checkpoint in /opt/ml/checkpoints - - .. code:: python - - for step, inputs in enumerate(train_ds): -     if step == 0: -         ckpt_manager.restore() -     loss = train_step(inputs) diff --git a/doc/api/training/smp_versions/v1_1_0.rst b/doc/api/training/smp_versions/v1_1_0.rst deleted file mode 100644 index 34b2d83b6b..0000000000 --- a/doc/api/training/smp_versions/v1_1_0.rst +++ /dev/null @@ -1,12 +0,0 @@ - -Version 1.1.0 -============= - -To use the library, reference the Common API documentation alongside the framework specific API documentation. - -.. 
toctree:: - :maxdepth: 1 - - v1.1.0/smd_model_parallel_common_api - v1.1.0/smd_model_parallel_pytorch - v1.1.0/smd_model_parallel_tensorflow diff --git a/doc/api/training/smp_versions/v1_2_0.rst b/doc/api/training/smp_versions/v1_2_0.rst deleted file mode 100644 index 4201de0b52..0000000000 --- a/doc/api/training/smp_versions/v1_2_0.rst +++ /dev/null @@ -1,12 +0,0 @@ - -Version 1.2.0 -============= - -To use the library, reference the Common API documentation alongside the framework specific API documentation. - -.. toctree:: - :maxdepth: 1 - - v1.2.0/smd_model_parallel_common_api - v1.2.0/smd_model_parallel_pytorch - v1.2.0/smd_model_parallel_tensorflow diff --git a/doc/api/training/smp_versions/v1_3_0.rst b/doc/api/training/smp_versions/v1_3_0.rst deleted file mode 100644 index 80d73acbd9..0000000000 --- a/doc/api/training/smp_versions/v1_3_0.rst +++ /dev/null @@ -1,12 +0,0 @@ - -Version 1.3.x -============= - -To use the library, reference the Common API documentation alongside the framework specific API documentation. - -.. toctree:: - :maxdepth: 1 - - v1.3.0/smd_model_parallel_common_api - v1.3.0/smd_model_parallel_pytorch - v1.3.0/smd_model_parallel_tensorflow diff --git a/doc/api/training/smp_versions/v1_4_0.rst b/doc/api/training/smp_versions/v1_4_0.rst deleted file mode 100644 index 4485ae6a40..0000000000 --- a/doc/api/training/smp_versions/v1_4_0.rst +++ /dev/null @@ -1,12 +0,0 @@ - -Version 1.4.x -============= - -To use the library, reference the Common API documentation alongside the framework specific API documentation. - -.. toctree:: - :maxdepth: 1 - - v1.4.0/smd_model_parallel_common_api - v1.4.0/smd_model_parallel_pytorch - v1.4.0/smd_model_parallel_tensorflow diff --git a/doc/api/training/smp_versions/v1_5_0.rst b/doc/api/training/smp_versions/v1_5_0.rst deleted file mode 100644 index c93761efa4..0000000000 --- a/doc/api/training/smp_versions/v1_5_0.rst +++ /dev/null @@ -1,12 +0,0 @@ - -Version 1.5.x -============= - -To use the library, reference the Common API documentation alongside the framework specific API documentation. - -.. toctree:: - :maxdepth: 1 - - v1.5.0/smd_model_parallel_common_api - v1.5.0/smd_model_parallel_pytorch - v1.5.0/smd_model_parallel_tensorflow diff --git a/doc/api/training/tuner.rst b/doc/api/training/tuner.rst deleted file mode 100644 index 89ca4a7bab..0000000000 --- a/doc/api/training/tuner.rst +++ /dev/null @@ -1,22 +0,0 @@ -HyperparameterTuner -------------------- - -.. autoclass:: sagemaker.tuner.HyperparameterTuner - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: sagemaker.tuner.ContinuousParameter - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: sagemaker.tuner.IntegerParameter - :members: - :undoc-members: - :show-inheritance: - -.. autoclass:: sagemaker.tuner.CategoricalParameter - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/utility/environment_variables.rst b/doc/api/utility/environment_variables.rst deleted file mode 100644 index 8fcc61d013..0000000000 --- a/doc/api/utility/environment_variables.rst +++ /dev/null @@ -1,7 +0,0 @@ -Environment Variables ---------------------- - -.. automodule:: sagemaker.environment_variables - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/utility/hyperparameters.rst b/doc/api/utility/hyperparameters.rst deleted file mode 100644 index 41b571c778..0000000000 --- a/doc/api/utility/hyperparameters.rst +++ /dev/null @@ -1,7 +0,0 @@ -Hyperparameters ---------------- - -.. 
automodule:: sagemaker.hyperparameters - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/utility/image_uris.rst b/doc/api/utility/image_uris.rst deleted file mode 100644 index e6be7e8424..0000000000 --- a/doc/api/utility/image_uris.rst +++ /dev/null @@ -1,7 +0,0 @@ -Image URIs ----------- - -.. automodule:: sagemaker.image_uris - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/utility/index.rst b/doc/api/utility/index.rst deleted file mode 100644 index f097b44c8a..0000000000 --- a/doc/api/utility/index.rst +++ /dev/null @@ -1,9 +0,0 @@ -############ -Utility APIs -############ - -.. toctree:: - :maxdepth: 1 - :glob: - - * diff --git a/doc/api/utility/inputs.rst b/doc/api/utility/inputs.rst deleted file mode 100644 index 0ac3907b3d..0000000000 --- a/doc/api/utility/inputs.rst +++ /dev/null @@ -1,12 +0,0 @@ -Inputs ------- - -.. automodule:: sagemaker.inputs - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: sagemaker.dataset_definition.inputs - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/utility/lambda_helper.rst b/doc/api/utility/lambda_helper.rst deleted file mode 100644 index 709f75c4c0..0000000000 --- a/doc/api/utility/lambda_helper.rst +++ /dev/null @@ -1,7 +0,0 @@ -Lambda Utilities ----------------- - -.. automodule:: sagemaker.lambda_helper - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/utility/model_uris.rst b/doc/api/utility/model_uris.rst deleted file mode 100644 index 9c3674540d..0000000000 --- a/doc/api/utility/model_uris.rst +++ /dev/null @@ -1,7 +0,0 @@ -Model URIs ----------- - -.. automodule:: sagemaker.model_uris - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/utility/network.rst b/doc/api/utility/network.rst deleted file mode 100644 index 8fb56c5e6d..0000000000 --- a/doc/api/utility/network.rst +++ /dev/null @@ -1,7 +0,0 @@ -Network Configuration ---------------------- - -.. automodule:: sagemaker.network - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/utility/s3.rst b/doc/api/utility/s3.rst deleted file mode 100644 index 4846b66a6d..0000000000 --- a/doc/api/utility/s3.rst +++ /dev/null @@ -1,7 +0,0 @@ -S3 Utilities ------------- - -.. automodule:: sagemaker.s3 - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/utility/script_uris.rst b/doc/api/utility/script_uris.rst deleted file mode 100644 index 4e1a5b5f7f..0000000000 --- a/doc/api/utility/script_uris.rst +++ /dev/null @@ -1,7 +0,0 @@ -Script URIs ------------ - -.. automodule:: sagemaker.script_uris - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/api/utility/session.rst b/doc/api/utility/session.rst deleted file mode 100644 index bde42a48d6..0000000000 --- a/doc/api/utility/session.rst +++ /dev/null @@ -1,7 +0,0 @@ -Session ------------------- - -.. automodule:: sagemaker.session - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/conf.py b/doc/conf.py deleted file mode 100644 index a866c7292b..0000000000 --- a/doc/conf.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. 
This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -"""Placeholder docstring""" -from __future__ import absolute_import - -import pkg_resources -from datetime import datetime -import sys -import os - -sys.path.append(os.path.join(os.path.dirname(__file__), ".")) -from doc_utils.jumpstart_doc_utils import create_jumpstart_model_table # noqa: E402 - -project = "sagemaker" -version = pkg_resources.require(project)[0].version - -# Add any Sphinx extension module names here, as strings. They can be extensions -# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.doctest", - "sphinx.ext.intersphinx", - "sphinx.ext.todo", - "sphinx.ext.coverage", - "sphinx.ext.autosummary", - "sphinx.ext.napoleon", - "sphinx.ext.autosectionlabel", -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] - -source_suffix = ".rst" # The suffix of source filenames. -master_doc = "index" # The master toctree document. - -copyright = "%s, Amazon" % datetime.now().year - -# The full version, including alpha/beta/rc tags. -release = version - -# List of directories, relative to source directory, that shouldn't be searched -# for source files. -exclude_trees = ["_build"] - -pygments_style = "default" - -autoclass_content = "both" -autodoc_default_flags = ["show-inheritance", "members", "undoc-members"] -autodoc_member_order = "bysource" - -html_theme = "sphinx_rtd_theme" - -html_theme_options = { - "collapse_navigation": True, - "sticky_navigation": True, - "navigation_depth": 6, - "includehidden": True, - "titles_only": False, -} - - -html_static_path = ["_static"] - -htmlhelp_basename = "%sdoc" % project - -# For Adobe Analytics -html_js_files = [ - "https://a0.awsstatic.com/s_code/js/3.0/awshome_s_code.js", - "https://cdn.datatables.net/1.10.23/js/jquery.dataTables.min.js", - "js/datatable.js", -] - -html_css_files = [ - "https://cdn.datatables.net/1.10.23/css/jquery.dataTables.min.css", -] - -html_context = {"css_files": ["_static/theme_overrides.css"]} - -# Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {"http://docs.python.org/": None} - -# autosummary -autosummary_generate = True - -# autosectionlabel -autosectionlabel_prefix_document = True - - -def setup(app): - create_jumpstart_model_table() diff --git a/doc/doc_utils/jumpstart_doc_utils.py b/doc/doc_utils/jumpstart_doc_utils.py deleted file mode 100644 index 07aea20f3e..0000000000 --- a/doc/doc_utils/jumpstart_doc_utils.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. 
-from __future__ import absolute_import -from urllib import request -import json -from packaging.version import Version - -JUMPSTART_REGION = "eu-west-2" -SDK_MANIFEST_FILE = "models_manifest.json" -JUMPSTART_BUCKET_BASE_URL = "https://jumpstart-cache-prod-{}.s3.{}.amazonaws.com".format( - JUMPSTART_REGION, JUMPSTART_REGION -) - - -def get_jumpstart_sdk_manifest(): - url = "{}/{}".format(JUMPSTART_BUCKET_BASE_URL, SDK_MANIFEST_FILE) - with request.urlopen(url) as f: - models_manifest = f.read().decode("utf-8") - return json.loads(models_manifest) - - -def get_jumpstart_sdk_spec(key): - url = "{}/{}".format(JUMPSTART_BUCKET_BASE_URL, key) - with request.urlopen(url) as f: - model_spec = f.read().decode("utf-8") - return json.loads(model_spec) - - -def create_jumpstart_model_table(): - sdk_manifest = get_jumpstart_sdk_manifest() - sdk_manifest_top_versions_for_models = {} - - for model in sdk_manifest: - if model["model_id"] not in sdk_manifest_top_versions_for_models: - sdk_manifest_top_versions_for_models[model["model_id"]] = model - else: - if Version( - sdk_manifest_top_versions_for_models[model["model_id"]]["version"] - ) < Version(model["version"]): - sdk_manifest_top_versions_for_models[model["model_id"]] = model - - file_content = [] - - file_content.append("==================================\n") - file_content.append("JumpStart Available Model Table\n") - file_content.append("==================================\n") - file_content.append( - """ - JumpStart for the SageMaker Python SDK uses model ids and model versions to access the necessary - utilities. This table serves to provide the core material plus some extra information that can be useful - in selecting the correct model id and corresponding parameters.\n - """ - ) - file_content.append( - """ - If you want to automatically use the latest version of the model, use "*" for the `model_version` attribute. - We highly suggest pinning an exact model version however.\n - """ - ) - file_content.append("\n") - file_content.append(".. list-table:: Available Models\n") - file_content.append(" :widths: 50 20 20 20\n") - file_content.append(" :header-rows: 1\n") - file_content.append(" :class: datatable\n") - file_content.append("\n") - file_content.append(" * - Model ID\n") - file_content.append(" - Fine Tunable?\n") - file_content.append(" - Latest Version\n") - file_content.append(" - Min SDK Version\n") - - for model in sdk_manifest_top_versions_for_models.values(): - model_spec = get_jumpstart_sdk_spec(model["spec_key"]) - file_content.append(" * - {}\n".format(model["model_id"])) - file_content.append(" - {}\n".format(model_spec["training_supported"])) - file_content.append(" - {}\n".format(model["version"])) - file_content.append(" - {}\n".format(model["min_version"])) - - f = open("doc_utils/jumpstart.rst", "w") - f.writelines(file_content) diff --git a/doc/frameworks/chainer/index.rst b/doc/frameworks/chainer/index.rst deleted file mode 100644 index 6c91f34d6f..0000000000 --- a/doc/frameworks/chainer/index.rst +++ /dev/null @@ -1,15 +0,0 @@ -########################## -Chainer -########################## - -A managed environment for Chainer training and hosting on Amazon SageMaker - -.. toctree:: - :maxdepth: 1 - - using_chainer - -.. 
toctree:: - :maxdepth: 2 - - sagemaker.chainer diff --git a/doc/frameworks/chainer/sagemaker.chainer.rst b/doc/frameworks/chainer/sagemaker.chainer.rst deleted file mode 100644 index 26a0c56c73..0000000000 --- a/doc/frameworks/chainer/sagemaker.chainer.rst +++ /dev/null @@ -1,26 +0,0 @@ -Chainer -======= - -Chainer Estimator ------------------ - -.. autoclass:: sagemaker.chainer.estimator.Chainer - :members: - :undoc-members: - :show-inheritance: - -Chainer Model -------------- - -.. autoclass:: sagemaker.chainer.model.ChainerModel - :members: - :undoc-members: - :show-inheritance: - -Chainer Predictor ------------------ - -.. autoclass:: sagemaker.chainer.model.ChainerPredictor - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/frameworks/chainer/using_chainer.rst b/doc/frameworks/chainer/using_chainer.rst deleted file mode 100644 index 85c197350d..0000000000 --- a/doc/frameworks/chainer/using_chainer.rst +++ /dev/null @@ -1,590 +0,0 @@ -########################################### -Using Chainer with the SageMaker Python SDK -########################################### - -With Chainer Estimators, you can train and host Chainer models on Amazon SageMaker. - -Supported versions of Chainer: ``4.0.0``, ``4.1.0``, ``5.0.0``. - -We recommend that you use the latest supported version because that's where we focus most of our development efforts. - -For more information about Chainer, see https://github.com/chainer/chainer. - -For general information about using the SageMaker Python SDK, see :ref:`overview:Using the SageMaker Python SDK`. - -.. contents:: - -************************** -Train a Model with Chainer -************************** - -To train a Chainer model by using the SageMaker Python SDK: - -.. |create chainer estimator| replace:: Create a ``sagemaker.chainer.Chainer`` Estimator -.. _create chainer estimator: #create-an-estimator - -.. |call fit| replace:: Call the estimator's ``fit`` method -.. _call fit: #call-the-fit-method - -1. `Prepare a training script <#prepare-a-chainer-training-script>`_ -2. |create chainer estimator|_ -3. |call fit|_ - - -Prepare a Chainer training script -================================= - -Your Chainer training script must be a Python 2.7 or 3.5 compatible source file. - -The training script is similar to a training script you might run outside of SageMaker, but you -can access useful properties about the training environment through various environment variables, -such as the following: - -* ``SM_MODEL_DIR``: A string representing the path to the directory to write model artifacts to. - These artifacts are uploaded to S3 for model hosting. -* ``SM_NUM_GPUS``: An integer representing the number of GPUs available to the host. -* ``SM_OUTPUT_DATA_DIR``: A string representing the filesystem path to write output artifacts to. Output artifacts may - include checkpoints, graphs, and other files to save, not including model artifacts. These artifacts are compressed - and uploaded to S3 to the same S3 prefix as the model artifacts. - -Suppose you use two input channels, named 'train' and 'test', in the call to the Chainer estimator's ``fit()`` method. -The following environment variables are set, following the format "SM_CHANNEL_[channel_name]": - -* ``SM_CHANNEL_TRAIN``: A string representing the path to the directory containing data in the 'train' channel -* ``SM_CHANNEL_TEST``: Same as above, but for the 'test' channel. 
- -A typical training script loads data from the input channels, configures training with hyperparameters, trains a model, -and saves a model to model_dir so that it can be hosted later. Hyperparameters are passed to your script as arguments -and can be retrieved with an argparse.ArgumentParser instance. For example, a training script might start -with the following: - -.. code:: python - - import argparse - import os - - if __name__ =='__main__': - - parser = argparse.ArgumentParser() - - # hyperparameters sent by the client are passed as command-line arguments to the script. - parser.add_argument('--epochs', type=int, default=50) - parser.add_argument('--batch-size', type=int, default=64) - parser.add_argument('--learning-rate', type=float, default=0.05) - - # Data, model, and output directories - parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR']) - parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR']) - parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN']) - parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST']) - - args, _ = parser.parse_known_args() - - # ... load from args.train and args.test, train a model, write model to args.model_dir. - -Because SageMaker imports your training script, you should put your training code in a main guard -(``if __name__=='__main__':``) if you are using the same script to host your model, so that SageMaker does not -inadvertently run your training code at the wrong point in execution. - -For more on training environment variables, please visit https://github.com/aws/sagemaker-containers. - -Save the Model -------------- - -In order to save your trained Chainer model for deployment on SageMaker, your training script should save your model -to a certain filesystem path called `model_dir`. This value is accessible through the environment variable -``SM_MODEL_DIR``. The following code demonstrates how to save a trained Chainer model named ``model`` as -``model.npz`` at the end of training: - -.. code:: python - - import chainer - import argparse - import os - - if __name__=='__main__': - parser = argparse.ArgumentParser() - - # default to the value in environment variable `SM_MODEL_DIR`. Using args makes the script more portable. - parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR']) - args, _ = parser.parse_known_args() - - # ... train `model`, then save it to `model_dir` as file 'model.npz' - chainer.serializers.save_npz(os.path.join(args.model_dir, 'model.npz'), model) - -After your training job is complete, SageMaker will compress and upload the serialized model to S3, and your model data -will be available in the S3 ``output_path`` you specified when you created the Chainer Estimator. - -Using third-party libraries --------------------------- - -When running your training script on SageMaker, it will have access to some pre-installed third-party libraries including ``chainer``, ``numpy``, and ``cupy``. -For more information on the runtime environment, including specific package versions, see `SageMaker Chainer Docker containers <#sagemaker-chainer-docker-containers>`__. - -If there are other packages you want to use with your script, you can include a ``requirements.txt`` file in the same directory as your training script to install other dependencies at runtime. -Both ``requirements.txt`` and your training script should be put in the same folder. -You must specify this folder in the ``source_dir`` argument when creating a Chainer estimator. 
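As a minimal sketch of this layout, something along the following lines could be used; the folder name ``my_training_code`` and the S3 paths are placeholders rather than part of the original example:

.. code:: python

    # my_training_code/
    # |- chainer-train.py
    # |- requirements.txt
    from sagemaker.chainer import Chainer

    chainer_estimator = Chainer(entry_point='chainer-train.py',
                                source_dir='my_training_code',  # folder holding the script and requirements.txt
                                instance_type='ml.p3.2xlarge',
                                instance_count=1,
                                framework_version='5.0.0',
                                py_version='py3')
    chainer_estimator.fit({'train': 's3://my-data-bucket/path/to/my/training/data',
                           'test': 's3://my-data-bucket/path/to/my/test/data'})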
-A ``requirements.txt`` file is a text file that contains a list of items that are installed by using ``pip install``. -You can also specify the version of an item to install. -For information about the format of a ``requirements.txt`` file, see `Requirements Files `__ in the pip documentation. - -Create an Estimator -=================== - -You run Chainer training scripts on SageMaker by creating ``Chainer`` Estimators. -SageMaker training of your script is invoked when you call ``fit`` on a ``Chainer`` Estimator. -The following code sample shows how you train a custom Chainer script "chainer-train.py", passing -in three hyperparameters ('epochs', 'batch-size', and 'learning-rate'), and using two input channel -directories ('train' and 'test'). - -.. code:: python - - chainer_estimator = Chainer('chainer-train.py', - instance_type='ml.p3.2xlarge', - instance_count=1, - framework_version='5.0.0', - py_version='py3', - hyperparameters = {'epochs': 20, 'batch-size': 64, 'learning-rate': 0.1}) - chainer_estimator.fit({'train': 's3://my-data-bucket/path/to/my/training/data', - 'test': 's3://my-data-bucket/path/to/my/test/data'}) - - -Call the fit Method -=================== - -You start your training script by calling ``fit`` on a ``Chainer`` Estimator. ``fit`` takes both required and optional -arguments. - -fit Required arguments ----------------------- - -- ``inputs``: This can take one of the following forms: A string - s3 URI, for example ``s3://my-bucket/my-training-data``. In this - case, the s3 objects rooted at the ``my-training-data`` prefix will - be available in the default ``train`` channel. A dict from - string channel names to s3 URIs. In this case, the objects rooted at - each s3 prefix will be available as files in each channel directory. - -For example: - -.. code:: python - - {'train':'s3://my-bucket/my-training-data', - 'eval':'s3://my-bucket/my-evaluation-data'} - -.. optional-arguments-1: - -fit Optional arguments ----------------------- - -- ``wait``: Defaults to True, whether to block and wait for the - training script to complete before returning. -- ``logs``: Defaults to True, whether to show logs produced by training - job in the Python session. Only meaningful when wait is True. - -Distributed Training -==================== - - -Chainer allows you to train a model on multiple nodes using ChainerMN_, which distributes training with MPI. - -.. _ChainerMN: https://github.com/chainer/chainermn - -In order to run distributed Chainer training on SageMaker, your training script should use a ``chainermn`` Communicator -object to coordinate training between multiple hosts. - -SageMaker runs your script with ``mpirun`` if ``instance_count`` is greater than two. -The following optional arguments modify how MPI runs your distributed training script. - -- ``use_mpi`` Boolean that overrides whether to run your training script with MPI. -- ``num_processes`` Integer that determines how many total processes to run with MPI. By default, this is equal to ``process_slots_per_host`` times the number of nodes. -- ``process_slots_per_host`` Integer that determines how many processes can be run on each host. By default, this is equal to one process per host on CPU instances, or one process per GPU on GPU instances. -- ``additional_mpi_options`` String of additional options to pass to the ``mpirun`` command. - - -********************* -Deploy Chainer models -********************* - -After a Chainer Estimator has been fit, you can host the newly created model in SageMaker. 
- -After calling ``fit``, you can call ``deploy`` on a ``Chainer`` Estimator to create a SageMaker Endpoint. -The Endpoint runs a SageMaker-provided Chainer model server and hosts the model produced by your training script, -which was run when you called ``fit``. This was the model you saved to ``model_dir``. - -``deploy`` returns a ``Predictor`` object, which you can use to do inference on the Endpoint hosting your Chainer model. -Each ``Predictor`` provides a ``predict`` method which can do inference with numpy arrays or Python lists. -Inference arrays or lists are serialized and sent to the Chainer model server by an ``InvokeEndpoint`` SageMaker -operation. - -``predict`` returns the result of inference against your model. By default, the inference result is a NumPy array. - -.. code:: python - - # Train my estimator - chainer_estimator = Chainer(entry_point='train_and_deploy.py', - instance_type='ml.p3.2xlarge', - instance_count=1, - framework_version='5.0.0', - py_version='py3') - chainer_estimator.fit('s3://my_bucket/my_training_data/') - - # Deploy my estimator to a SageMaker Endpoint and get a Predictor - predictor = chainer_estimator.deploy(instance_type='ml.m4.xlarge', - initial_instance_count=1) - - # `data` is a NumPy array or a Python list. - # `response` is a NumPy array. - response = predictor.predict(data) - -You use the SageMaker Chainer model server to host your Chainer model when you call ``deploy`` on a ``Chainer`` -Estimator. The model server runs inside a SageMaker Endpoint, which your call to ``deploy`` creates. -You can access the name of the Endpoint by the ``name`` property on the returned ``Predictor``. - - -The SageMaker Chainer Model Server -================================== - -The Chainer Endpoint you create with ``deploy`` runs a SageMaker Chainer model server. -The model server loads the model that was saved by your training script and performs inference on the model in response -to SageMaker InvokeEndpoint API calls. - -You can configure two components of the SageMaker Chainer model server: Model loading and model serving. -Model loading is the process of deserializing your saved model back into a Chainer model. -Serving is the process of translating InvokeEndpoint requests to inference calls on the loaded model. - -You configure the Chainer model server by defining functions in the Python source file you passed to the Chainer constructor. - -Load a Model ------------- - -Before a model can be served, it must be loaded. The SageMaker Chainer model server loads your model by invoking a -``model_fn`` function that you must provide in your script. The ``model_fn`` should have the following signature: - -.. code:: python - - def model_fn(model_dir) - -SageMaker will inject the directory where your model files and sub-directories, saved by your training script, have been mounted. -Your model function should return a model object that can be used for model serving. - -The following code-snippet shows an example ``model_fn`` implementation. -This loads and returns a Chainer Classifier from a multi-layer perceptron class ``MLP`` that extends ``chainer.Chain``. -It loads the model parameters from a ``model.npz`` file in the SageMaker model directory ``model_dir``. - -.. 
code:: python - - import chainer - import os - - def model_fn(model_dir): - chainer.config.train = False - model = chainer.links.Classifier(MLP(1000, 10)) - chainer.serializers.load_npz(os.path.join(model_dir, 'model.npz'), model) - return model.predictor - -Serve a Model -------------- - -After the SageMaker model server has loaded your model by calling ``model_fn``, SageMaker will serve your model. -Model serving is the process of responding to inference requests, received by SageMaker InvokeEndpoint API calls. -The SageMaker Chainer model server breaks request handling into three steps: - - -- input processing, -- prediction, and -- output processing. - -In a similar way to model loading, you configure these steps by defining functions in your Python source file. - -Each step involves invoking a python function, with information about the request and the return-value from the previous -function in the chain. Inside the SageMaker Chainer model server, the process looks like: - -.. code:: python - - # Deserialize the Invoke request body into an object we can perform prediction on - input_object = input_fn(request_body, request_content_type) - - # Perform prediction on the deserialized object, with the loaded model - prediction = predict_fn(input_object, model) - - # Serialize the prediction result into the desired response content type - output = output_fn(prediction, response_content_type) - -The above code-sample shows the three function definitions: - -- ``input_fn``: Takes request data and deserializes the data into an - object for prediction. -- ``predict_fn``: Takes the deserialized request object and performs - inference against the loaded model. -- ``output_fn``: Takes the result of prediction and serializes this - according to the response content type. - -The SageMaker Chainer model server provides default implementations of these functions. -You can provide your own implementations for these functions in your hosting script. -If you omit any definition then the SageMaker Chainer model server will use its default implementation for that -function. - -The ``Predictor`` used by Chainer in the SageMaker Python SDK serializes NumPy arrays to the `NPY `_ format -by default, with Content-Type ``application/x-npy``. The SageMaker Chainer model server can deserialize NPY-formatted -data (along with JSON and CSV data). - -If you rely solely on the SageMaker Chainer model server defaults, you get the following functionality: - -- Prediction on models that implement the ``__call__`` method -- Serialization and deserialization of NumPy arrays. - -The default ``input_fn`` and ``output_fn`` are meant to make it easy to predict on NumPy arrays. If your model expects -a NumPy array and returns a NumPy array, then these functions do not have to be overridden when sending NPY-formatted -data. - -In the following sections we describe the default implementations of input_fn, predict_fn, and output_fn. -We describe the input arguments and expected return types of each, so you can define your own implementations. - -Process Input -^^^^^^^^^^^^^ - -When an InvokeEndpoint operation is made against an Endpoint running a SageMaker Chainer model server, -the model server receives two pieces of information: - -- The request Content-Type, for example "application/x-npy" -- The request data body, a byte array - -The SageMaker Chainer model server will invoke an "input_fn" function in your hosting script, -passing in this information. 
If you define an ``input_fn`` function definition, -it should return an object that can be passed to ``predict_fn`` and have the following signature: - -.. code:: python - - def input_fn(request_body, request_content_type) - -Where ``request_body`` is a byte buffer and ``request_content_type`` is a Python string. - -The SageMaker Chainer model server provides a default implementation of ``input_fn``. -This function deserializes JSON, CSV, or NPY encoded data into a NumPy array. - -Default NPY deserialization requires ``request_body`` to follow the `NPY `_ format. For Chainer, the Python SDK -defaults to sending prediction requests with this format. - -Default JSON deserialization requires ``request_body`` contain a single JSON list. -Sending multiple JSON objects within the same ``request_body`` is not supported. -The list must have a dimensionality compatible with the model loaded in ``model_fn``. -The list's shape must be identical to the model's input shape, for all dimensions after the first (the first -dimension is the batch size). - -Default CSV deserialization requires ``request_body`` contain one or more lines of CSV numerical data. -The data is loaded into a two-dimensional array, where each line break defines the boundaries of the first dimension. - -The example below shows a custom ``input_fn`` for preparing pickled NumPy arrays. - -.. code:: python - - from io import BytesIO - - import numpy as np - - def input_fn(request_body, request_content_type): - """An input_fn that loads a pickled numpy array""" - if request_content_type == "application/python-pickle": - # request_body is a byte buffer, so wrap it in BytesIO before loading - array = np.load(BytesIO(request_body)) - return array - else: - # Handle other content-types here or raise an Exception - # if the content type is not supported. - pass - - - -Get Predictions ---------------- - -After the inference request has been deserialized by ``input_fn``, the SageMaker Chainer model server invokes -``predict_fn`` on the return value of ``input_fn``. - -As with ``input_fn``, you can define your own ``predict_fn`` or use the SageMaker Chainer model server default. - -The ``predict_fn`` function has the following signature: - -.. code:: python - - def predict_fn(input_object, model) - -Where ``input_object`` is the object returned from ``input_fn`` and -``model`` is the model loaded by ``model_fn``. - -The default implementation of ``predict_fn`` invokes the loaded model's ``__call__`` function on ``input_object``, -and returns the resulting value. The return-type should be a NumPy array to be compatible with the default -``output_fn``. - -The example below shows an overridden ``predict_fn``. This model accepts a Python list and returns a tuple of -bounding boxes, labels, and scores from the model in a NumPy array. This ``predict_fn`` can rely on the default -``input_fn`` and ``output_fn`` because ``input_data`` is a NumPy array, and the return value of this function is -a NumPy array. - -.. code:: python - - import chainer - import numpy as np - - def predict_fn(input_data, model): - with chainer.using_config('train', False), chainer.no_backprop_mode(): - bboxes, labels, scores = model.predict([input_data]) - bbox, label, score = bboxes[0], labels[0], scores[0] - return np.array([bbox.tolist(), label, score]) - -If you implement your own prediction function, you should take care to ensure that: - -- The first argument is expected to be the return value from input_fn. - If you use the default input_fn, this will be a NumPy array. -- The second argument is the loaded model. 
-- The return value should be of the correct type to be passed as the - first argument to ``output_fn``. If you use the default - ``output_fn``, this should be a NumPy array. - -Process Output -^^^^^^^^^^^^^^ - -After invoking ``predict_fn``, the model server invokes ``output_fn``, passing in the return-value from ``predict_fn`` -and the InvokeEndpoint requested response content-type. - -The ``output_fn`` has the following signature: - -.. code:: python - - def output_fn(prediction, content_type) - -Where ``prediction`` is the result of invoking ``predict_fn`` and -``content_type`` is the InvokeEndpoint requested response content-type. -The function should return a byte array of data serialized to content_type. - -The default implementation expects ``prediction`` to be a NumPy array and can serialize the result to JSON, CSV, or NPY. -It accepts response content types of "application/json", "text/csv", and "application/x-npy". - -Working with existing model data and training jobs -================================================== - -Attach to Existing Training Jobs --------------------------------- - -You can attach a Chainer Estimator to an existing training job using the -``attach`` method. - -.. code:: python - - my_training_job_name = "MyAwesomeChainerTrainingJob" - chainer_estimator = Chainer.attach(my_training_job_name) - -After attaching, if the training job is in a Complete status, it can be -``deploy``\ ed to create a SageMaker Endpoint and return a -``Predictor``. If the training job is in progress, -attach will block and display log messages from the training job, until the training job completes. - -The ``attach`` method accepts the following arguments: - -- ``training_job_name (str):`` The name of the training job to attach - to. -- ``sagemaker_session (sagemaker.Session or None):`` The Session used - to interact with SageMaker. - -Deploy Endpoints from Model Data --------------------------------- - -As well as attaching to existing training jobs, you can deploy models directly from model data in S3. -The following code sample shows how to do this, using the ``ChainerModel`` class. - -.. code:: python - - chainer_model = ChainerModel( - model_data="s3://bucket/model.tar.gz", - role="SageMakerRole", - entry_point="transform_script.py", - ) - - predictor = chainer_model.deploy(instance_type="ml.c4.xlarge", initial_instance_count=1) - -To see what arguments are accepted by the ``ChainerModel`` constructor, see :class:`sagemaker.chainer.model.ChainerModel`. - -Your model data must be a .tar.gz file in S3. SageMaker Training Job model data is saved to .tar.gz files in S3, -however if you have local data you want to deploy, you can prepare the data yourself. - -Assuming you have a local directory containing your model data named "my_model", you can tar and gzip compress the directory and -upload it to S3 using the following commands: - -:: - - tar -czf model.tar.gz my_model - aws s3 cp model.tar.gz s3://my-bucket/my-path/model.tar.gz - -This packages the contents of my_model into a gzip-compressed tar file and uploads it to S3 in the bucket "my-bucket", with the key -"my-path/model.tar.gz". - -To run this command, you'll need the AWS CLI installed. Please refer to our `FAQ <#FAQ>`__ for more information on -installing this. - -******** -Examples -******** - -Amazon provides several example Jupyter notebooks that demonstrate end-to-end training on Amazon SageMaker using Chainer. 
-Please refer to: - -https://github.com/awslabs/amazon-sagemaker-examples/tree/master/sagemaker-python-sdk - -These are also available in SageMaker Notebook Instance hosted Jupyter notebooks under the "sample notebooks" folder. - -************************* -SageMaker Chainer Classes -************************* - -For information about the different Chainer-related classes in the SageMaker Python SDK, see https://sagemaker.readthedocs.io/en/stable/frameworks/chainer/sagemaker.chainer.html. - -*********************************** -SageMaker Chainer Docker containers -*********************************** - -When training and deploying training scripts, SageMaker runs your Python script in a Docker container with several -libraries installed. When creating the Estimator and calling deploy to create the SageMaker Endpoint, you can control -the environment your script runs in. - -SageMaker runs Chainer Estimator scripts in either Python 2.7 or Python 3.5. You can select the Python version by -passing a py_version keyword arg to the Chainer Estimator constructor. Setting this to py3 (the default) will cause your -training script to be run on Python 3.5. Setting this to py2 will cause your training script to be run on Python 2.7 -This Python version applies to both the Training Job, created by fit, and the Endpoint, created by deploy. - -The Chainer Docker images have the following dependencies installed: - -+-----------------------------+-------------+-------------+-------------+ -| Dependencies | chainer 4.0 | chainer 4.1 | chainer 5.0 | -+-----------------------------+-------------+-------------+-------------+ -| chainer | 4.0.0 | 4.1.0 | 5.0.0 | -+-----------------------------+-------------+-------------+-------------+ -| chainercv | 0.9.0 | 0.10.0 | 0.10.0 | -+-----------------------------+-------------+-------------+-------------+ -| chainermn | 1.2.0 | 1.3.0 | N/A | -+-----------------------------+-------------+-------------+-------------+ -| CUDA (GPU image only) | 9.0 | 9.0 | 9.0 | -+-----------------------------+-------------+-------------+-------------+ -| cupy | 4.0.0 | 4.1.0 | 5.0.0 | -+-----------------------------+-------------+-------------+-------------+ -| matplotlib | 2.2.0 | 2.2.0 | 2.2.0 | -+-----------------------------+-------------+-------------+-------------+ -| mpi4py | 3.0.0 | 3.0.0 | 3.0.0 | -+-----------------------------+-------------+-------------+-------------+ -| numpy | 1.14.3 | 1.15.3 | 1.15.4 | -+-----------------------------+-------------+-------------+-------------+ -| opencv-python | 3.4.0.12 | 3.4.0.12 | 3.4.0.12 | -+-----------------------------+-------------+-------------+-------------+ -| Pillow | 5.1.0 | 5.3.0 | 5.3.0 | -+-----------------------------+-------------+-------------+-------------+ -| Python | 2.7 or 3.5 | 2.7 or 3.5 | 2.7 or 3.5 | -+-----------------------------+-------------+-------------+-------------+ - -The Docker images extend Ubuntu 16.04. - -You must select a version of Chainer by passing a ``framework_version`` keyword arg to the Chainer Estimator -constructor. Currently supported versions are listed in the above table. You can also set framework_version to only -specify major and minor version, which will cause your training script to be run on the latest supported patch -version of that minor version. - -Alternatively, you can build your own image by following the instructions in the SageMaker Chainer containers -repository, and passing ``image_uri`` to the Chainer Estimator constructor. 
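As a rough sketch, passing a custom image could look like the following; the ECR URI here is a made-up placeholder for an image you have built and pushed yourself:

.. code:: python

    from sagemaker.chainer import Chainer

    # hypothetical ECR image URI for a container built from the SageMaker Chainer containers repository
    custom_image = '123456789012.dkr.ecr.us-west-2.amazonaws.com/my-chainer-container:latest'

    chainer_estimator = Chainer(entry_point='chainer-train.py',
                                image_uri=custom_image,
                                instance_type='ml.p3.2xlarge',
                                instance_count=1)
    chainer_estimator.fit('s3://my-data-bucket/path/to/my/training/data')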
- -You can visit the SageMaker Chainer containers repository at https://github.com/aws/sagemaker-chainer-container diff --git a/doc/frameworks/huggingface/index.rst b/doc/frameworks/huggingface/index.rst deleted file mode 100644 index 3549f80977..0000000000 --- a/doc/frameworks/huggingface/index.rst +++ /dev/null @@ -1,13 +0,0 @@ -############ -Hugging Face -############ - -A managed environment for training using Hugging Face on Amazon SageMaker. For more information about Hugging Face on Amazon SageMaker, as well as sample Jupyter notebooks, see `Use Hugging Face with Amazon SageMaker `_. -For general information about using the SageMaker Python SDK, see :ref:`overview:Using the SageMaker Python SDK`. - -.. toctree:: - :maxdepth: 2 - - sagemaker.huggingface - Train Hugging Face models on Amazon SageMaker with the SageMaker Python SDK - Deploy Hugging Face models to Amazon SageMaker with the SageMaker Python SDK diff --git a/doc/frameworks/huggingface/sagemaker.huggingface.rst b/doc/frameworks/huggingface/sagemaker.huggingface.rst deleted file mode 100644 index 9060968570..0000000000 --- a/doc/frameworks/huggingface/sagemaker.huggingface.rst +++ /dev/null @@ -1,34 +0,0 @@ -Hugging Face -============ - -Hugging Face Estimator ----------------------- - -.. autoclass:: sagemaker.huggingface.HuggingFace - :members: - :undoc-members: - :show-inheritance: - -Hugging Face Training Compiler Configuration --------------------------------------------- - -.. autoclass:: sagemaker.huggingface.TrainingCompilerConfig - :members: - :undoc-members: - :show-inheritance: - -Hugging Face Model ------------------- - -.. autoclass:: sagemaker.huggingface.model.HuggingFaceModel - :members: - :undoc-members: - :show-inheritance: - -Hugging Face Predictor ----------------------- - -.. autoclass:: sagemaker.huggingface.model.HuggingFacePredictor - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/frameworks/index.rst b/doc/frameworks/index.rst deleted file mode 100644 index dca8535b8b..0000000000 --- a/doc/frameworks/index.rst +++ /dev/null @@ -1,18 +0,0 @@ -########## -Frameworks -########## - -The SageMaker Python SDK supports managed training and inference for a variety of machine learning frameworks: - -.. toctree:: - :maxdepth: 2 - - mxnet/index - chainer/index - huggingface/index - pytorch/index - rl/index - sklearn/index - sparkml/index - tensorflow/index - xgboost/index diff --git a/doc/frameworks/mxnet/index.rst b/doc/frameworks/mxnet/index.rst deleted file mode 100644 index fcb792c176..0000000000 --- a/doc/frameworks/mxnet/index.rst +++ /dev/null @@ -1,15 +0,0 @@ -######################## -Apache MXNet -######################## - -A managed environment for MXNet training and hosting on Amazon SageMaker - -.. toctree:: - :maxdepth: 1 - - using_mxnet - -.. toctree:: - :maxdepth: 2 - - sagemaker.mxnet diff --git a/doc/frameworks/mxnet/sagemaker.mxnet.rst b/doc/frameworks/mxnet/sagemaker.mxnet.rst deleted file mode 100644 index 6547a5274d..0000000000 --- a/doc/frameworks/mxnet/sagemaker.mxnet.rst +++ /dev/null @@ -1,27 +0,0 @@ -MXNet Classes -================= - - -MXNet Estimator ---------------------------- - -.. autoclass:: sagemaker.mxnet.estimator.MXNet - :members: - :undoc-members: - :show-inheritance: - -MXNet Model ---------------------------- - -.. autoclass:: sagemaker.mxnet.model.MXNetModel - :members: - :undoc-members: - :show-inheritance: - -MXNet Predictor ---------------------------- - -.. 
autoclass:: sagemaker.mxnet.model.MXNetPredictor - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/frameworks/mxnet/using_mxnet.rst b/doc/frameworks/mxnet/using_mxnet.rst deleted file mode 100644 index 0e78b92d57..0000000000 --- a/doc/frameworks/mxnet/using_mxnet.rst +++ /dev/null @@ -1,657 +0,0 @@ -####################################### -Use MXNet with the SageMaker Python SDK -####################################### - -With the SageMaker Python SDK, you can train and host MXNet models on Amazon SageMaker. - -For information about supported versions of MXNet, see the `AWS documentation `__. -We recommend that you use the latest supported version because that's where we focus our development efforts. - -For general information about using the SageMaker Python SDK, see :ref:`overview:Using the SageMaker Python SDK`. - -.. contents:: - -************************ -Train a Model with MXNet -************************ - -To train an MXNet model by using the SageMaker Python SDK: - -.. |create mxnet estimator| replace:: Create a ``sagemaker.mxnet.MXNet`` Estimator -.. _create mxnet estimator: #create-an-estimator - -.. |call fit| replace:: Call the estimator's ``fit`` method -.. _call fit: #call-the-fit-method - -1. `Prepare a training script <#prepare-an-mxnet-training-script>`_ -2. |create mxnet estimator|_ -3. |call fit|_ - -Prepare an MXNet Training Script -================================ - -The training script is very similar to a training script you might run outside of Amazon SageMaker, but you can access useful properties about the training environment through various environment variables, including the following: - -* ``SM_MODEL_DIR``: A string that represents the path where the training job writes the model artifacts to. - After training, artifacts in this directory are uploaded to Amazon S3 for model hosting. -* ``SM_NUM_GPUS``: An integer representing the number of GPUs available to the host. -* ``SM_CHANNEL_XXXX``: A string that represents the path to the directory that contains the input data for the specified channel. - For example, if you specify two input channels in the MXNet estimator's ``fit`` call, named 'train' and 'test', the environment variables ``SM_CHANNEL_TRAIN`` and ``SM_CHANNEL_TEST`` are set. -* ``SM_HPS``: A JSON dump of the hyperparameters preserving JSON types (boolean, integer, etc.) - -For the exhaustive list of available environment variables, see the `SageMaker Containers documentation `__. - -A typical training script loads data from the input channels, configures training with hyperparameters, trains a model, and saves a model to ``model_dir`` so that it can be deployed for inference later. -Hyperparameters are passed to your script as arguments and can be retrieved with an ``argparse.ArgumentParser`` instance. -For example, a training script might start with the following: - -.. code:: python - - import argparse - import os - import json - - if __name__ =='__main__': - - parser = argparse.ArgumentParser() - - # hyperparameters sent by the client are passed as command-line arguments to the script. - parser.add_argument('--epochs', type=int, default=10) - parser.add_argument('--batch-size', type=int, default=100) - parser.add_argument('--learning-rate', type=float, default=0.1) - - # an alternative way to load hyperparameters via SM_HPS environment variable. 
- parser.add_argument('--sm-hps', type=json.loads, default=os.environ['SM_HPS']) - - # input data and model directories - parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR']) - parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN']) - parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST']) - - args, _ = parser.parse_known_args() - - # ... load from args.train and args.test, train a model, write model to args.model_dir. - -Because Amazon SageMaker imports your training script, you should put your training code in a main guard (``if __name__=='__main__':``) if you are using the same script to host your model, -so that Amazon SageMaker does not inadvertently run your training code at the wrong point in execution. - -Note that Amazon SageMaker doesn't support argparse actions. -If you want to use, for example, boolean hyperparameters, you need to specify ``type`` as ``bool`` in your script and provide an explicit ``True`` or ``False`` value for this hyperparameter when instantiating your MXNet estimator. - -For more on training environment variables, please visit `SageMaker Containers `_. - -.. note:: - If you want to use MXNet 1.2 or lower, see `an older version of this page `_. - -Save a Checkpoint ----------------- - -It is good practice to save the best model after each training epoch, -so that you can resume a training job if it gets interrupted. -This is particularly important if you are using Managed Spot training. - -To save MXNet model checkpoints, do the following in your training script: - -* Set the ``CHECKPOINTS_DIR`` environment variable and enable checkpoints. - - .. code:: python - - CHECKPOINTS_DIR = '/opt/ml/checkpoints' - checkpoints_enabled = os.path.exists(CHECKPOINTS_DIR) - -* Make sure you are emitting a validation metric to test the model. For information, see `Evaluation Metric API `_. -* After each training epoch, test whether the current model performs the best with respect to the validation metric, and if it does, save that model to ``CHECKPOINTS_DIR``. - - .. code:: python - - if checkpoints_enabled and current_host == hosts[0]: - if val_acc > best_accuracy: - best_accuracy = val_acc - logging.info('Saving the model, params and optimizer state') - net.export(CHECKPOINTS_DIR + "/%.4f-cifar10"%(best_accuracy), epoch) - trainer.save_states(CHECKPOINTS_DIR + '/%.4f-cifar10-%d.states'%(best_accuracy, epoch)) - -For a complete example of an MXNet training script that implements checkpointing, see https://github.com/awslabs/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/mxnet_gluon_cifar10/cifar10.py. - -Save the Model -------------- - -There is a default save method that can be imported when training on SageMaker: - -.. code:: python - - from sagemaker_mxnet_training.training_utils import save - - if __name__ == '__main__': - # arg parsing and training (shown above) goes here - - save(args.model_dir, model) - -The default serialization system generates three files: - -- ``model-shapes.json``: A JSON list, containing a serialization of the - ``Module`` ``data_shapes`` property. Each object in the list contains - the serialization of one ``DataShape`` in the returned ``Module``. - Each object has a ``name`` property, containing the ``DataShape`` - name and a ``shape`` property, which is a list of the dimensions for - the shape of that ``DataShape``. For example: - -.. 
code:: javascript - - [ - {"name":"images", "shape":[100, 1, 28, 28]}, - {"name":"labels", "shape":[100, 1]} - ] - -- ``model-symbol.json``: The MXNet ``Module`` ``Symbol`` serialization, - produced by invoking ``save`` on the ``symbol`` property of the - ``Module`` being saved. -- ``model.params``: The MXNet ``Module`` parameters, produced by - invoking ``save_params`` on the ``Module`` being saved. - -Use third-party libraries -========================= - -When running your training script on Amazon SageMaker, it has access to some pre-installed third-party libraries, including ``mxnet``, ``numpy``, ``onnx``, and ``keras-mxnet``. -For more information on the runtime environment, including specific package versions, see `SageMaker MXNet Containers <#sagemaker-mxnet-containers>`__. - -If there are other packages you want to use with your script, you can include a ``requirements.txt`` file in the same directory as your training script to install other dependencies at runtime. -Both ``requirements.txt`` and your training script should be put in the same folder. -You must specify this folder in the ``source_dir`` argument when creating an MXNet estimator. - -Installing packages using ``requirements.txt`` is supported for MXNet versions 1.3.0 and higher during training. - -When serving an MXNet model, support for this function varies with MXNet versions. -For MXNet 1.6.0 or newer, ``requirements.txt`` must be under folder ``code``. -The SageMaker MXNet Estimator automatically saves ``code`` in ``model.tar.gz`` after training (assuming you set up your script and ``requirements.txt`` correctly as stipulated in the previous paragraph). -In the case of bringing your own trained model for deployment, you must save ``requirements.txt`` under folder ``code`` in ``model.tar.gz`` yourself or specify it through ``dependencies``. -For MXNet 0.12.1-1.2.1, 1.4.0-1.4.1, ``requirements.txt`` is not supported for inference. -For MXNet 1.3.0, ``requirements.txt`` must be in ``source_dir``. - -A ``requirements.txt`` file is a text file that contains a list of items that are installed by using ``pip install``. -You can also specify the version of an item to install. -For information about the format of a ``requirements.txt`` file, see `Requirements Files `__ in the pip documentation. - -Create an Estimator -=================== - -You run MXNet training scripts on Amazon SageMaker by creating an ``MXNet`` estimator. -When you call ``fit`` on an ``MXNet`` estimator, Amazon SageMaker starts a training job using your script as training code. -The following code sample shows how you train a custom MXNet script "train.py". - -.. code:: python - - mxnet_estimator = MXNet('train.py', - instance_type='ml.p2.xlarge', - instance_count=1, - framework_version='1.6.0', - py_version='py3', - hyperparameters={'batch-size': 100, - 'epochs': 10, - 'learning-rate': 0.1}) - mxnet_estimator.fit('s3://my_bucket/my_training_data/') - -For more information about the sagemaker.mxnet.MXNet estimator, see `SageMaker MXNet Classes`_. - - -Distributed training -==================== - -If you want to use parameter servers for distributed training, set the following parameter in your ``MXNet`` constructor: - -.. code:: python - - distribution={'parameter_server': {'enabled': True}} - -Then, when writing a distributed training script, use an MXNet kvstore to store and share model parameters. -During training, Amazon SageMaker automatically starts an MXNet kvstore server and scheduler processes on hosts in your training job cluster. 
-Your script runs as an MXNet worker task, with one server process on each host in your cluster. -One host is selected arbitrarily to run the scheduler process. - -To learn more about writing distributed MXNet programs, please see `Distributed Training `__ in the MXNet docs. - - -Call the fit Method -=================== - -Start your training script by calling ``fit`` on an ``MXNet`` Estimator. -``fit`` takes both required and optional arguments. -For what arguments can be passed into ``fit``, see the `API reference `_. - -******************* -Deploy MXNet models -******************* - -Once you have a trained MXNet model, you can host it in Amazon SageMaker by creating an Amazon SageMaker Endpoint. -The endpoint runs a SageMaker-provided MXNet model server and hosts the model produced by your training script. -This model can be one you trained in Amazon SageMaker or a pretrained one from somewhere else. - -If you use the ``MXNet`` estimator to train the model, you can call ``deploy`` to create an Amazon SageMaker Endpoint: - -.. code:: python - - # Train my estimator - mxnet_estimator = MXNet('train.py', - framework_version='1.6.0', - py_version='py3', - instance_type='ml.p2.xlarge', - instance_count=1) - mxnet_estimator.fit('s3://my_bucket/my_training_data/') - - # Deploy my estimator to an Amazon SageMaker Endpoint and get a Predictor - predictor = mxnet_estimator.deploy(instance_type='ml.m4.xlarge', - initial_instance_count=1) - -If using a pretrained model, create an ``MXNetModel`` object, and then call ``deploy`` to create the Amazon SageMaker Endpoint: - -.. code:: python - - mxnet_model = MXNetModel(model_data='s3://my_bucket/pretrained_model/model.tar.gz', - role=role, - entry_point='inference.py', - framework_version='1.6.0', - py_version='py3') - predictor = mxnet_model.deploy(instance_type='ml.m4.xlarge', - initial_instance_count=1) - -In both cases, ``deploy`` returns a ``Predictor`` object, which you can use to do inference on the endpoint hosting your MXNet model. - -Each ``Predictor`` provides a ``predict`` method, which can do inference with numpy arrays or Python lists. -Inference arrays or lists are serialized and sent to the MXNet model server by an ``InvokeEndpoint`` SageMaker operation. -``predict`` returns the result of inference against your model. -By default, the inference result is either a Python list or dictionary. - -Elastic Inference -================= - -MXNet on Amazon SageMaker has support for `Elastic Inference `_, which allows for inference acceleration to a hosted endpoint for a fraction of the cost of using a full GPU instance. -In order to attach an Elastic Inference accelerator to your endpoint provide the accelerator type to ``accelerator_type`` to your ``deploy`` call. - -.. code:: python - - predictor = mxnet_estimator.deploy(instance_type='ml.m4.xlarge', - initial_instance_count=1, - accelerator_type='ml.eia1.medium') - -Model Directory Structure -========================= - -In general, if you use the same version of MXNet for both training and inference with the SageMaker Python SDK, -the SDK should take care of ensuring that the contents of your ``model.tar.gz`` file are organized correctly. 
- -For versions 1.4 and higher ---------------------------- - -For MXNet versions 1.4 and higher, the contents of ``model.tar.gz`` should be organized as follows: - -- Model files in the top-level directory -- Inference script (and any other source files) in a directory named ``code/`` (for more about the inference script, see `The SageMaker MXNet Model Server <#the-sagemaker-mxnet-model-server>`_) -- Optional requirements file located at ``code/requirements.txt`` (for more about requirements files, see `Use third-party libraries <#use-third-party-libraries>`_) - -For example: - -.. code:: - - model.tar.gz/ - |- model-symbol.json - |- model-shapes.json - |- model-0000.params - |- code/ - |- inference.py - |- requirements.txt # only for versions 1.6.0 and higher - -In this example, ``model-symbol.json``, ``model-shapes.json``, and ``model-0000.params`` are the model files saved from training, -``inference.py`` is the inference script, and ``requirements.txt`` is a requirements file. - -The ``MXNet`` and ``MXNetModel`` classes repack ``model.tar.gz`` to include the inference script (and related files), -as long as the ``framework_version`` is set to 1.4 or higher. - -For versions 1.3 and lower --------------------------- - -For MXNet versions 1.3 and lower, ``model.tar.gz`` should contain only the model files, -while your inference script and optional requirements file are packed in a separate tarball, named ``sourcedir.tar.gz`` by default. - -For example: - -.. code:: - - model.tar.gz/ - |- model-symbol.json - |- model-shapes.json - |- model-0000.params - - sourcedir.tar.gz/ - |- script.py - |- requirements.txt # only for versions 0.12.1-1.3.0 - -In this example, ``model-symbol.json``, ``model-shapes.json``, and ``model-0000.params`` are the model files saved from training, -``script.py`` is the inference script, and ``requirements.txt`` is a requirements file. - -The SageMaker MXNet Model Server -================================ - -The MXNet endpoint you create with ``deploy`` runs a SageMaker MXNet model server. -The model server loads the model provided and performs inference on the model in response to SageMaker ``InvokeEndpoint`` API calls. - -You can configure two components of the model server: model loading and model serving. -Model loading is the process of deserializing your saved model back into an MXNet model. -Serving is the process of translating ``InvokeEndpoint`` requests to inference calls on the loaded model. -These are configured by defining functions in the Python source file you pass to the ``MXNet`` or ``MXNetModel`` constructor. - -Load a Model ------------- - -Before a model can be served, it must be loaded. -The model server loads your model by invoking the ``model_fn`` function in your inference script. -If you don't provide a ``model_fn`` function, the model server uses a default ``model_fn`` function. -The default function works with MXNet Module model objects saved via the default ``save`` function. - -If you wrote your own save logic, then you may need to write a custom ``model_fn`` function. -The ``model_fn`` function must have the following signature: - -.. code:: python - - def model_fn(model_dir) - -Amazon SageMaker injects the directory where your model files and sub-directories have been mounted. -Your model function should return a model object that can be used for model serving. - -The following code snippet shows an example custom ``model_fn`` implementation. -This returns an MXNet Gluon net model for resnet-34 inference. 
-It loads the model parameters from a ``model.params`` file in the SageMaker model directory. - -.. code:: python - - def model_fn(model_dir): - """Load the Gluon model. Called when the hosting service starts. - - Args: - model_dir (str): The directory where model files are stored. - - Returns: - mxnet.gluon.nn.Block: a Gluon network (for this example) - """ - net = models.get_model('resnet34_v2', ctx=mx.cpu(), pretrained=False, classes=10) - net.load_params('%s/model.params' % model_dir, ctx=mx.cpu()) - return net - -MXNet on Amazon SageMaker has support for `Elastic Inference `__, which allows for inference acceleration to a hosted endpoint for a fraction of the cost of using a full GPU instance. -In order to load and serve your MXNet model through Amazon Elastic Inference, import the ``eimx`` Python package and make one change in the code to partition your model and optimize it for the ``EIA`` back end, as shown `here `__. - -Based on the example above, the following code-snippet shows an example custom ``model_fn`` implementation, which enables loading and serving our MXNet model through Amazon Elastic Inference. - -.. code:: python - - def model_fn(model_dir): - """Load the Gluon model. Called when the hosting service starts. - - Args: - model_dir (str): The directory where model files are stored. - - Returns: - mxnet.gluon.nn.Block: a Gluon network (for this example) - """ - net = models.get_model('resnet34_v2', ctx=mx.cpu(), pretrained=False, classes=10) - net.load_params('%s/model.params' % model_dir, ctx=mx.cpu()) - net.hybridize(backend='EIA', static_alloc=True, static_shape=True) - return net - -If you are using MXNet 1.5.1 and earlier, the `default_model_fn `__ loads and serve your model through Elastic Inference, if applicable, within the Amazon SageMaker MXNet containers. - -For more information on how to enable MXNet to interact with Amazon Elastic Inference, see `Use Elastic Inference with MXNet `__. - -Serve an MXNet Model --------------------- - -After the MXNet model server loads your model by calling either the default ``model_fn`` or the implementation in your script, it serves your model. -Model serving is the process of responding to inference requests received by SageMaker ``InvokeEndpoint`` API calls. -Defining how to handle these requests can be done in one of two ways: - -- using ``input_fn``, ``predict_fn``, and ``output_fn``, some of which may be your own implementations -- writing your own ``transform_fn`` for handling input processing, prediction, and output processing - -Use ``input_fn``, ``predict_fn``, and ``output_fn`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The SageMaker MXNet model server breaks request handling into three steps: - -- input processing -- prediction -- output processing - -Just like with ``model_fn``, you configure these steps by defining functions in your Python source file. - -Each step has its own Python function, which takes in information about the request and the return value from the previous function in the chain. -Inside the MXNet model server, the process looks like: - -.. 
code:: python - - # Deserialize the Invoke request body into an object we can perform prediction on - input_object = input_fn(request_body, request_content_type) - - # Perform prediction on the deserialized object, with the loaded model - prediction = predict_fn(input_object, model) - - # Serialize the prediction result into the desired response content type - output = output_fn(prediction, response_content_type) - -The above code sample shows the three function definitions that correlate to the three steps mentioned above: - -- ``input_fn``: Takes request data and deserializes the data into an - object for prediction. -- ``predict_fn``: Takes the deserialized request object and performs - inference against the loaded model. -- ``output_fn``: Takes the result of prediction and serializes this - according to the response content type. - -The MXNet model server provides default implementations of these functions. -These work with both Gluon API and Module API model objects. -The following content types are supported: - -- Gluon API: 'application/json', 'application/x-npy' -- Module API: 'application/json', 'application/x-npy', 'text/csv' - -You can also provide your own implementations for these functions in your training script. -If you omit any definition, the MXNet model server uses its default implementation for that function. - -If you rely solely on the SageMaker MXNet model server defaults, you get the following functionality: - -- Prediction on MXNet Gluon API ``net`` and Module API ``Module`` objects. -- Deserialization from CSV and JSON to NDArrayIters. -- Serialization of NDArrayIters to CSV or JSON. - -In the following sections, we describe the default implementations of ``input_fn``, ``predict_fn``, and ``output_fn``. -We describe the input arguments and expected return types of each, so you can define your own implementations. - -Process Model Input -~~~~~~~~~~~~~~~~~~~ - -When an ``InvokeEndpoint`` operation is made against an endpoint running an MXNet model server, the model server receives two pieces of information: - -- The request's content type, e.g. 'application/json' -- The request data body as a byte array - -The MXNet model server invokes ``input_fn``, passing in this information. -If you define an ``input_fn`` function definition, it should return an object that can be passed to ``predict_fn`` and have the following signature: - -.. code:: python - - def input_fn(request_body, request_content_type) - -Where ``request_body`` is a byte buffer and ``request_content_type`` is the content type of the request. - -The MXNet model server provides a default implementation of ``input_fn``. This function deserializes JSON or CSV encoded data into an MXNet ``NDArrayIter`` `(external API docs) `__ multi-dimensional array iterator. This works with the default ``predict_fn`` implementation, which expects an ``NDArrayIter`` as input. - -Default JSON deserialization requires ``request_body`` contain a single JSON list. Sending multiple JSON objects within the same ``request_body`` is not supported. The list must have a dimensionality compatible with the MXNet ``net`` or ``Module`` object. Specifically, after the list is loaded, it's either padded or split to fit the first dimension of the model input shape. The list's shape must be identical to the model's input shape, for all dimensions after the first. - -Default CSV deserialization requires ``request_body`` contain one or more lines of CSV numerical data. 
The data is loaded into a two-dimensional array, where each line break defines the boundaries of the first dimension. This two-dimensional array is then re-shaped to be compatible with the shape expected by the model object. Specifically, the first dimension is kept unchanged, but the second dimension is reshaped to be consistent with the shape of all dimensions in the model, following the first dimension.
-
-If you provide your own implementation of ``input_fn``, you should abide by the ``input_fn`` signature. If you want to use this with the default
-``predict_fn``, then you should return an ``NDArrayIter``. The ``NDArrayIter`` should have a shape identical to the shape of the model being predicted on. The example below shows a custom ``input_fn`` for preparing pickled numpy arrays.
-
-.. code:: python
-
-    from io import BytesIO
-
-    import numpy as np
-    import mxnet as mx
-
-    def input_fn(request_body, request_content_type):
-        """An input_fn that loads a pickled numpy array"""
-        if request_content_type == 'application/python-pickle':
-            # request_body is a byte buffer, so wrap it in BytesIO before loading
-            array = np.load(BytesIO(request_body), allow_pickle=True)
-            # reshape here if your model expects a different input shape
-            return mx.io.NDArrayIter(mx.nd.array(array))
-        else:
-            # Handle other content-types here or raise an Exception
-            # if the content type is not supported.
-            pass
-
-Predict from a Deployed Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-After the inference request has been deserialized by ``input_fn``, the MXNet model server invokes ``predict_fn``.
-As with the other functions, you can define your own ``predict_fn`` or use the model server's default.
-
-The ``predict_fn`` function has the following signature:
-
-.. code:: python
-
-    def predict_fn(input_object, model)
-
-Where ``input_object`` is the object returned from ``input_fn`` and
-``model`` is the model loaded by ``model_fn``.
-
-The default implementation of ``predict_fn`` requires that ``input_object`` be an ``NDArrayIter``, which is the return type of the default
-``input_fn``. It also requires that ``model`` be either an MXNet Gluon API ``net`` object or a Module API ``Module`` object.
-
-The default implementation performs inference with the input
-``NDArrayIter`` on the Gluon or Module object. If the model is a Gluon
-``net``, it performs ``net.forward(input_object)``. If the model is a Module object, it performs ``module.predict(input_object)``. In both cases, it returns the result of that call.
-
-If you implement your own prediction function, you should take care to ensure that:
-
-- The first argument is expected to be the return value from ``input_fn``.
-  If you use the default ``input_fn``, this is an ``NDArrayIter``.
-- The second argument is the loaded model. If you use the default
-  ``model_fn`` implementation, this is an MXNet Module object.
-  Otherwise, it is the return value of your ``model_fn`` implementation.
-- The return value should be of the correct type to be passed as the
-  first argument to ``output_fn``. If you use the default
-  ``output_fn``, this should be an ``NDArrayIter``.
-
-Process Model Output
-~~~~~~~~~~~~~~~~~~~~
-
-After invoking ``predict_fn``, the model server invokes ``output_fn``, passing in the return value from ``predict_fn`` and the ``InvokeEndpoint`` requested response content type.
-
-The ``output_fn`` has the following signature:
-
-.. code:: python
-
-    def output_fn(prediction, content_type)
-
-Where ``prediction`` is the result of invoking ``predict_fn`` and ``content_type`` is the requested response content type for ``InvokeEndpoint``.
-The function should return an array of bytes serialized to the expected content type.
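-
-If the built-in serialization does not meet your needs, you can provide your own ``output_fn``. The snippet below is a minimal sketch (not the container's built-in implementation) that serializes an ``NDArray`` prediction to JSON; a custom ``output_fn`` is only necessary when the defaults described below do not fit your use case.
-
-.. code:: python
-
-    import json
-
-    import mxnet as mx
-
-    def output_fn(prediction, content_type):
-        """A sketch of a custom output_fn that returns predictions as JSON."""
-        if content_type == 'application/json':
-            # Convert an NDArray prediction to a plain Python list before encoding.
-            if isinstance(prediction, mx.nd.NDArray):
-                prediction = prediction.asnumpy()
-            return json.dumps(prediction.tolist())
-        raise ValueError('Unsupported content type: {}'.format(content_type))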
-
-The default implementation expects ``prediction`` to be an ``NDArray`` and can serialize the result to either JSON or CSV. It accepts response content types of "application/json" and "text/csv".
-
-Use ``transform_fn``
-^^^^^^^^^^^^^^^^^^^^
-
-If you would rather not structure your code around the three methods described above, you can instead define your own ``transform_fn`` to handle inference requests.
-An error is thrown if a ``transform_fn`` is present in conjunction with any ``input_fn``, ``predict_fn``, and/or ``output_fn``.
-``transform_fn`` has the following signature:
-
-.. code:: python
-
-    def transform_fn(model, request_body, content_type, accept_type)
-
-Where ``model`` is the model object loaded by ``model_fn``, ``request_body`` is the data from the inference request, ``content_type`` is the content type of the request, and ``accept_type`` is the requested content type for the response.
-
-This one function should handle processing the input, performing a prediction, and processing the output.
-The return object should be one of the following:
-
-For versions 1.4 and higher:
-
-- a tuple with two items: the response data and ``accept_type`` (the content type of the response data), or
-- the response data only (in this case the content type of the response is set to either the accept header in the initial request or defaults to "application/json")
-
-For versions 1.3 and lower:
-
-- a tuple with two items: the response data and ``accept_type`` (the content type of the response data), or
-- a Flask response object: http://flask.pocoo.org/docs/1.0/api/#response-objects
-
-For an example inference script using this structure, see the `mxnet_gluon_sentiment `__ notebook.
-
-***********************************************
-Work with Existing Model Data and Training Jobs
-***********************************************
-
-Attach to Existing Training Jobs
-================================
-
-You can attach an MXNet Estimator to an existing training job using the
-``attach`` method.
-
-.. code:: python
-
-    my_training_job_name = 'MyAwesomeMXNetTrainingJob'
-    mxnet_estimator = MXNet.attach(my_training_job_name)
-
-After attaching, if the training job's status is "Completed", it can be ``deploy``\ ed to create an Amazon SageMaker Endpoint and return a ``Predictor``.
-If the training job is in progress, ``attach`` blocks and displays log messages from the training job until the training job completes.
-
-For information about arguments that ``attach`` accepts, see `the function's API reference `_.
-
-Deploy Endpoints from Model Data
-================================
-
-As well as attaching to existing training jobs, you can deploy models directly from model data in Amazon S3. The following code sample shows how to do this, using the ``MXNetModel`` class.
-
-.. code:: python
-
-    mxnet_model = MXNetModel(model_data='s3://bucket/model.tar.gz', role='SageMakerRole', entry_point='transform_script.py')
-
-    predictor = mxnet_model.deploy(instance_type='ml.c4.xlarge', initial_instance_count=1)
-
-For information about arguments that the ``MXNetModel`` constructor accepts, see `the class's API reference `_.
-
-Your model data must be a .tar.gz file in Amazon S3. Amazon SageMaker Training Job model data is saved to .tar.gz files in Amazon S3; however, if you have local data you want to deploy, you can prepare the data yourself.
- -Assuming you have a local directory containing your model data named "my_model" you can tar and gzip compress the file and upload to Amazon S3 using the following commands: - -:: - - tar -czf model.tar.gz my_model - aws s3 cp model.tar.gz s3://my-bucket/my-path/model.tar.gz - -This uploads the contents of my_model to a gzip-compressed tar file to Amazon S3 in the bucket "my-bucket", with the key "my-path/model.tar.gz". - -To run this command, you need the AWS CLI tool installed. Please refer to our `FAQ <#FAQ>`__ for more information on installing this. - -******** -Examples -******** - -Amazon provides several example Jupyter notebooks that demonstrate end-to-end training on Amazon SageMaker using MXNet. Please refer to: - -https://github.com/awslabs/amazon-sagemaker-examples/tree/master/sagemaker-python-sdk - -These are also available in Amazon SageMaker Notebook Instance hosted Jupyter notebooks under the "sample notebooks" folder. - -*********************** -SageMaker MXNet Classes -*********************** - -For information about the different MXNet-related classes in the SageMaker Python SDK, see https://sagemaker.readthedocs.io/en/stable/sagemaker.mxnet.html. - -************************** -SageMaker MXNet Containers -************************** - -For information about the SageMaker MXNet containers, see: - -- `SageMaker MXNet training toolkit `_ -- `SageMaker MXNet serving toolkit `_ -- `Deep Learning Container (DLC) Dockerfiles for MXNet `_ -- `Deep Learning Container (DLC) Images `_ and `release notes `_ diff --git a/doc/frameworks/pytorch/index.rst b/doc/frameworks/pytorch/index.rst deleted file mode 100644 index aa6c69baa2..0000000000 --- a/doc/frameworks/pytorch/index.rst +++ /dev/null @@ -1,9 +0,0 @@ -########################## -PyTorch -########################## - -.. toctree:: - :maxdepth: 1 - :glob: - - * diff --git a/doc/frameworks/pytorch/sagemaker.pytorch.rst b/doc/frameworks/pytorch/sagemaker.pytorch.rst deleted file mode 100644 index d36b2c7e50..0000000000 --- a/doc/frameworks/pytorch/sagemaker.pytorch.rst +++ /dev/null @@ -1,26 +0,0 @@ -PyTorch -======= - -PyTorch Estimator ------------------ - -.. autoclass:: sagemaker.pytorch.estimator.PyTorch - :members: - :undoc-members: - :show-inheritance: - -PyTorch Model -------------- - -.. autoclass:: sagemaker.pytorch.model.PyTorchModel - :members: - :undoc-members: - :show-inheritance: - -PyTorch Predictor ------------------ - -.. autoclass:: sagemaker.pytorch.model.PyTorchPredictor - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/frameworks/pytorch/using_pytorch.rst b/doc/frameworks/pytorch/using_pytorch.rst deleted file mode 100644 index 9d4a4de3de..0000000000 --- a/doc/frameworks/pytorch/using_pytorch.rst +++ /dev/null @@ -1,723 +0,0 @@ -######################################### -Use PyTorch with the SageMaker Python SDK -######################################### - -With PyTorch Estimators and Models, you can train and host PyTorch models on Amazon SageMaker. - -For information about supported versions of PyTorch, see the `AWS documentation `__. - -We recommend that you use the latest supported version because that's where we focus our development efforts. - -You can visit the PyTorch repository at https://github.com/pytorch/pytorch. - -.. contents:: - -************************** -Train a Model with PyTorch -************************** - -To train a PyTorch model by using the SageMaker Python SDK: - -.. |create pytorch estimator| replace:: Create a ``sagemaker.pytorch.PyTorch`` Estimator -.. 
_create pytorch estimator: #create-an-estimator - -.. |call fit| replace:: Call the estimator's ``fit`` method -.. _call fit: #call-the-fit-method - -1. `Prepare a training script <#prepare-a-pytorch-training-script>`_ -2. |create pytorch estimator|_ -3. |call fit|_ - -Prepare a PyTorch Training Script -================================= - -Your PyTorch training script must be a Python 3.6 compatible source file. - -Prepare your script in a separate source file than the notebook, terminal session, or source file you're -using to submit the script to SageMaker via a ``PyTorch`` Estimator. This will be discussed in further detail below. - -The training script is very similar to a training script you might run outside of SageMaker, but you -can access useful properties about the training environment through various environment variables. -For example: - -* ``SM_NUM_GPUS``: An integer representing the number of GPUs available to the host. -* ``SM_MODEL_DIR``: A string representing the path to the directory to write model artifacts to. - These artifacts are uploaded to S3 for model hosting. -* ``SM_OUTPUT_DATA_DIR``: A string representing the filesystem path to write output artifacts to. Output artifacts may - include checkpoints, graphs, and other files to save, not including model artifacts. These artifacts are compressed - and uploaded to S3 to the same S3 prefix as the model artifacts. -* ``SM_CHANNEL_XXXX``: A string that represents the path to the directory that contains the input data for the specified channel. - For example, if you specify two input channels in the PyTorch estimator's ``fit`` call, named 'train' and 'test', - the environment variables ``SM_CHANNEL_TRAIN`` and ``SM_CHANNEL_TEST`` are set. - -A typical training script loads data from the input channels, configures training with hyperparameters, trains a model, -and saves a model to ``model_dir`` so that it can be hosted later. Hyperparameters are passed to your script as arguments -and can be retrieved with an argparse.ArgumentParser instance. For example, a training script might start -with the following: - -.. code:: python - - import argparse - import os - - if __name__ =='__main__': - - parser = argparse.ArgumentParser() - - # hyperparameters sent by the client are passed as command-line arguments to the script. - parser.add_argument('--epochs', type=int, default=50) - parser.add_argument('--batch-size', type=int, default=64) - parser.add_argument('--learning-rate', type=float, default=0.05) - parser.add_argument('--use-cuda', type=bool, default=False) - - # Data, model, and output directories - parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR']) - parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR']) - parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN']) - parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST']) - - args, _ = parser.parse_known_args() - - # ... load from args.train and args.test, train a model, write model to args.model_dir. - -Because SageMaker imports your training script, you should put your training code in a main guard -(``if __name__=='__main__':``) if you are using the same script to host your model, so that SageMaker does not -inadvertently run your training code at the wrong point in execution. - -Note that SageMaker doesn't support argparse actions. 
If you want to use, for example, boolean hyperparameters,
-you need to specify `type` as `bool` in your script and provide an explicit `True` or `False` value for this hyperparameter
-when instantiating the PyTorch Estimator.
-
-For more on training environment variables, see the `SageMaker Training Toolkit `_.
-
-Save the Model
---------------
-
-In order to save your trained PyTorch model for deployment on SageMaker, your training script should save your model
-to a certain filesystem path called ``model_dir``. This value is accessible through the environment variable
-``SM_MODEL_DIR``. The following code demonstrates how to save a trained PyTorch model named ``model`` as
-``model.pth`` at the end of training:
-
-.. code:: python
-
-    import argparse
-    import os
-    import torch
-
-    if __name__=='__main__':
-        parser = argparse.ArgumentParser()
-        # default to the value in environment variable `SM_MODEL_DIR`. Using args makes the script more portable.
-        parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
-        args, _ = parser.parse_known_args()
-
-        # ... train `model`, then save it to `model_dir`
-        with open(os.path.join(args.model_dir, 'model.pth'), 'wb') as f:
-            torch.save(model.state_dict(), f)
-
-After your training job is complete, SageMaker compresses and uploads the serialized model to S3, and your model data
-will be available in the S3 ``output_path`` you specified when you created the PyTorch Estimator.
-
-If you are using Elastic Inference, you must convert your models to the TorchScript format and use ``torch.jit.save`` to save the model.
-For example:
-
-.. code:: python
-
-    import os
-    import torch
-
-    # ... train `model`, then save it to `model_dir`
-    model_path = os.path.join(model_dir, "model.pt")
-    torch.jit.save(model, model_path)
-
-Using third-party libraries
----------------------------
-
-When running your training script on SageMaker, it will have access to some pre-installed third-party libraries including ``torch``, ``torchvision``, and ``numpy``.
-For more information on the runtime environment, including specific package versions, see `SageMaker PyTorch Docker containers `_.
-
-If there are other packages you want to use with your script, you can include a ``requirements.txt`` file in the same directory as your training script to install other dependencies at runtime. Both ``requirements.txt`` and your training script should be put in the same folder. You must specify this folder in the ``source_dir`` argument when creating a PyTorch estimator.
-
-The function of installing packages using ``requirements.txt`` is supported for all PyTorch versions during training. When serving a PyTorch model, support for this function varies with PyTorch versions. For PyTorch 1.3.1 or newer, ``requirements.txt`` must be under folder ``code``. The SageMaker PyTorch Estimator will automatically save ``code`` in ``model.tar.gz`` after training (assuming you set up your script and ``requirements.txt`` correctly as stipulated in the previous paragraph). In the case of bringing your own trained model for deployment, you must save ``requirements.txt`` under folder ``code`` in ``model.tar.gz`` yourself or specify it through ``dependencies``. For PyTorch 1.2.0, ``requirements.txt`` is not supported for inference. For PyTorch 0.4.0 to 1.1.0, ``requirements.txt`` must be in ``source_dir``.
-
-A ``requirements.txt`` file is a text file that contains a list of items that are installed by using ``pip install``. You can also specify the version of an item to install.
For information about the format of a ``requirements.txt`` file, see `Requirements Files `__ in the pip documentation. - -Create an Estimator -=================== - -You run PyTorch training scripts on SageMaker by creating ``PyTorch`` Estimators. -SageMaker training of your script is invoked when you call ``fit`` on a ``PyTorch`` Estimator. -The following code sample shows how you train a custom PyTorch script "pytorch-train.py", passing -in three hyperparameters ('epochs', 'batch-size', and 'learning-rate'), and using two input channel -directories ('train' and 'test'). - -.. code:: python - - pytorch_estimator = PyTorch('pytorch-train.py', - instance_type='ml.p3.2xlarge', - instance_count=1, - framework_version='1.8.0', - py_version='py3', - hyperparameters = {'epochs': 20, 'batch-size': 64, 'learning-rate': 0.1}) - pytorch_estimator.fit({'train': 's3://my-data-bucket/path/to/my/training/data', - 'test': 's3://my-data-bucket/path/to/my/test/data'}) - - - - -Call the fit Method -=================== - -You start your training script by calling ``fit`` on a ``PyTorch`` Estimator. ``fit`` takes both required and optional -arguments. - -fit Required Arguments ----------------------- - -- ``inputs``: This can take one of the following forms: A string - S3 URI, for example ``s3://my-bucket/my-training-data``. In this - case, the S3 objects rooted at the ``my-training-data`` prefix will - be available in the default ``train`` channel. A dict from - string channel names to S3 URIs. In this case, the objects rooted at - each S3 prefix will be available as files in each channel directory. - -For example: - -.. code:: python - - {'train':'s3://my-bucket/my-training-data', - 'eval':'s3://my-bucket/my-evaluation-data'} - -.. optional-arguments-1: - -fit Optional Arguments ----------------------- - -- ``wait``: Defaults to True, whether to block and wait for the - training script to complete before returning. -- ``logs``: Defaults to True, whether to show logs produced by training - job in the Python session. Only meaningful when wait is True. - - -Distributed PyTorch Training -============================ - -You can run a multi-machine, distributed PyTorch training using the PyTorch Estimator. By default, PyTorch objects will -submit single-machine training jobs to SageMaker. If you set ``instance_count`` to be greater than one, multi-machine -training jobs will be launched when ``fit`` is called. When you run multi-machine training, SageMaker will import your -training script and run it on each host in the cluster. - -To initialize distributed training in your script you would call ``dist.init_process_group`` providing desired backend -and rank and setting 'WORLD_SIZE' environment variable similar to how you would do it outside of SageMaker using -environment variable initialization: - -.. code:: python - - if args.distributed: - # Initialize the distributed environment. - world_size = len(args.hosts) - os.environ['WORLD_SIZE'] = str(world_size) - host_rank = args.hosts.index(args.current_host) - dist.init_process_group(backend=args.backend, rank=host_rank) - -SageMaker sets 'MASTER_ADDR' and 'MASTER_PORT' environment variables for you, but you can overwrite them. - -Supported backends: -- `gloo` and `tcp` for cpu instances -- `gloo` and `nccl` for gpu instances - - -********************* -Deploy PyTorch Models -********************* - -After a PyTorch Estimator has been fit, you can host the newly created model in SageMaker. 
- -After calling ``fit``, you can call ``deploy`` on a ``PyTorch`` Estimator to create a SageMaker Endpoint. -The Endpoint runs a SageMaker-provided PyTorch model server and hosts the model produced by your training script, -which was run when you called ``fit``. This was the model you saved to ``model_dir``. - -``deploy`` returns a ``Predictor`` object, which you can use to do inference on the Endpoint hosting your PyTorch model. -Each ``Predictor`` provides a ``predict`` method which can do inference with numpy arrays or Python lists. -Inference arrays or lists are serialized and sent to the PyTorch model server by an ``InvokeEndpoint`` SageMaker -operation. - -``predict`` returns the result of inference against your model. By default, the inference result a NumPy array. - -.. code:: python - - # Train my estimator - pytorch_estimator = PyTorch(entry_point='train_and_deploy.py', - instance_type='ml.p3.2xlarge', - instance_count=1, - framework_version='1.8.0', - py_version='py3') - pytorch_estimator.fit('s3://my_bucket/my_training_data/') - - # Deploy my estimator to a SageMaker Endpoint and get a Predictor - predictor = pytorch_estimator.deploy(instance_type='ml.m4.xlarge', - initial_instance_count=1) - - # `data` is a NumPy array or a Python list. - # `response` is a NumPy array. - response = predictor.predict(data) - -You use the SageMaker PyTorch model server to host your PyTorch model when you call ``deploy`` on an ``PyTorch`` -Estimator. The model server runs inside a SageMaker Endpoint, which your call to ``deploy`` creates. -You can access the name of the Endpoint by the ``name`` property on the returned ``Predictor``. - -Elastic Inference -================= - -PyTorch on Amazon SageMaker has support for `Elastic Inference `_, which allows for inference acceleration to a hosted endpoint for a fraction of the cost of using a full GPU instance. -In order to attach an Elastic Inference accelerator to your endpoint provide the accelerator type to ``accelerator_type`` to your ``deploy`` call. - -.. code:: python - - predictor = pytorch_estimator.deploy(instance_type='ml.m4.xlarge', - initial_instance_count=1, - accelerator_type='ml.eia2.medium') - -Model Directory Structure -========================= - -In general, if you use the same version of PyTorch for both training and inference with the SageMaker Python SDK, -the SDK should take care of ensuring that the contents of your ``model.tar.gz`` file are organized correctly. - -For versions 1.2 and higher ---------------------------- - -For PyTorch versions 1.2 and higher, the contents of ``model.tar.gz`` should be organized as follows: - -- Model files in the top-level directory -- Inference script (and any other source files) in a directory named ``code/`` (for more about the inference script, see `The SageMaker PyTorch Model Server <#the-sagemaker-pytorch-model-server>`_) -- Optional requirements file located at ``code/requirements.txt`` (for more about requirements files, see `Using third-party libraries <#using-third-party-libraries>`_) - -For example: - -.. code:: - - model.tar.gz/ - |- model.pth - |- code/ - |- inference.py - |- requirements.txt # only for versions 1.3.1 and higher - -In this example, ``model.pth`` is the model file saved from training, ``inference.py`` is the inference script, and ``requirements.txt`` is a requirements file. - -The ``PyTorch`` and ``PyTorchModel`` classes repack ``model.tar.gz`` to include the inference script (and related files), -as long as the ``framework_version`` is set to 1.2 or higher. 
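-
-If you are assembling ``model.tar.gz`` by hand (for example, for a model trained outside of SageMaker), a minimal sketch using Python's ``tarfile`` module might look like the following; the file names are illustrative.
-
-.. code:: python
-
-    import tarfile
-
-    # Lay out the archive as expected for PyTorch 1.2 and higher:
-    # model files at the top level, inference code under code/.
-    with tarfile.open('model.tar.gz', 'w:gz') as tar:
-        tar.add('model.pth', arcname='model.pth')
-        tar.add('inference.py', arcname='code/inference.py')
-        tar.add('requirements.txt', arcname='code/requirements.txt')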
- -For versions 1.1 and lower --------------------------- - -For PyTorch versions 1.1 and lower, ``model.tar.gz`` should contain only the model files, -while your inference script and optional requirements file are packed in a separate tarball, named ``sourcedir.tar.gz`` by default. - -For example: - -.. code:: - - model.tar.gz/ - |- model.pth - - sourcedir.tar.gz/ - |- script.py - |- requirements.txt - -In this example, ``model.pth`` is the model file saved from training, ``script.py`` is the inference script, and ``requirements.txt`` is a requirements file. - -The SageMaker PyTorch Model Server -================================== - -The PyTorch Endpoint you create with ``deploy`` runs a SageMaker PyTorch model server. -The model server loads the model that was saved by your training script and performs inference on the model in response -to SageMaker InvokeEndpoint API calls. - -You can configure two components of the SageMaker PyTorch model server: Model loading and model serving. -Model loading is the process of deserializing your saved model back into a PyTorch model. -Serving is the process of translating InvokeEndpoint requests to inference calls on the loaded model. - -You configure the PyTorch model server by defining functions in the Python source file you passed to the PyTorch constructor. - -Load a Model ------------- - -Before a model can be served, it must be loaded. The SageMaker PyTorch model server loads your model by invoking a -``model_fn`` function that you must provide in your script when you are not using Elastic Inference. The ``model_fn`` should have the following signature: - -.. code:: python - - def model_fn(model_dir) - -SageMaker will inject the directory where your model files and sub-directories, saved by ``save``, have been mounted. -Your model function should return a model object that can be used for model serving. - -The following code-snippet shows an example ``model_fn`` implementation. -It loads the model parameters from a ``model.pth`` file in the SageMaker model directory ``model_dir``. - -.. code:: python - - import torch - import os - - def model_fn(model_dir): - model = Your_Model() - with open(os.path.join(model_dir, 'model.pth'), 'rb') as f: - model.load_state_dict(torch.load(f)) - return model - -However, if you are using PyTorch Elastic Inference 1.3.1, you do not have to provide a ``model_fn`` since the PyTorch serving -container has a default one for you. But please note that if you are utilizing the default ``model_fn``, please save -your ScriptModule as ``model.pt``. If you are implementing your own ``model_fn``, please use TorchScript and ``torch.jit.save`` -to save your ScriptModule, then load it in your ``model_fn`` with ``torch.jit.load(..., map_location=torch.device('cpu'))``. - -If you are using PyTorch Elastic Inference 1.5.1, you should provide ``model_fn`` like below in your script to use new api ``attach_eia``. Reference can be find in `Elastic Inference documentation `_. - - -.. code:: python - - import torch - - - def model_fn(model_dir): - model = torch.jit.load('model.pth', map_location=torch.device('cpu')) - if torch.__version__ == '1.5.1': - import torcheia - model = model.eval() - # attach_eia() is introduced in PyTorch Elastic Inference 1.5.1, - model = torcheia.jit.attach_eia(model, 0) - return model - - -The client-side Elastic Inference framework is CPU-only, even though inference still happens in a CUDA context on the server. Thus, the default ``model_fn`` for Elastic Inference loads the model to CPU. 
Tracing models may lead to tensor creation on a specific device, which may cause device-related errors when loading a model onto a different device. Providing an explicit ``map_location=torch.device('cpu')`` argument forces all tensors to CPU. - -For more information on the default inference handler functions, please refer to: -`SageMaker PyTorch Default Inference Handler `_. - -Serve a PyTorch Model ---------------------- - -After the SageMaker model server has loaded your model by calling ``model_fn``, SageMaker will serve your model. -Model serving is the process of responding to inference requests, received by SageMaker InvokeEndpoint API calls. -The SageMaker PyTorch model server breaks request handling into three steps: - - -- input processing, -- prediction, and -- output processing. - -In a similar way to model loading, you configure these steps by defining functions in your Python source file. - -Each step involves invoking a python function, with information about the request and the return value from the previous -function in the chain. Inside the SageMaker PyTorch model server, the process looks like: - -.. code:: python - - # Deserialize the Invoke request body into an object we can perform prediction on - input_object = input_fn(request_body, request_content_type) - - # Perform prediction on the deserialized object, with the loaded model - prediction = predict_fn(input_object, model) - - # Serialize the prediction result into the desired response content type - output = output_fn(prediction, response_content_type) - -The above code sample shows the three function definitions: - -- ``input_fn``: Takes request data and deserializes the data into an - object for prediction. -- ``predict_fn``: Takes the deserialized request object and performs - inference against the loaded model. -- ``output_fn``: Takes the result of prediction and serializes this - according to the response content type. - -The SageMaker PyTorch model server provides default implementations of these functions. -You can provide your own implementations for these functions in your hosting script. -If you omit any definition then the SageMaker PyTorch model server will use its default implementation for that -function. -If you use PyTorch Elastic Inference 1.5.1, remember to implement ``predict_fn`` yourself. - -The ``Predictor`` used by PyTorch in the SageMaker Python SDK serializes NumPy arrays to the `NPY `_ format -by default, with Content-Type ``application/x-npy``. The SageMaker PyTorch model server can deserialize NPY-formatted -data (along with JSON and CSV data). - -If you rely solely on the SageMaker PyTorch model server defaults, you get the following functionality: - -- Prediction on models that implement the ``__call__`` method -- Serialization and deserialization of torch.Tensor. - -The default ``input_fn`` and ``output_fn`` are meant to make it easy to predict on torch.Tensors. If your model expects -a torch.Tensor and returns a torch.Tensor, then these functions do not have to be overridden when sending NPY-formatted -data. - -In the following sections we describe the default implementations of input_fn, predict_fn, and output_fn. -We describe the input arguments and expected return types of each, so you can define your own implementations. 
- -Process Model Input -^^^^^^^^^^^^^^^^^^^ - -When an InvokeEndpoint operation is made against an Endpoint running a SageMaker PyTorch model server, -the model server receives two pieces of information: - -- The request Content-Type, for example "application/x-npy" -- The request data body, a byte array - -The SageMaker PyTorch model server will invoke an ``input_fn`` function in your hosting script, -passing in this information. If you define an ``input_fn`` function definition, -it should return an object that can be passed to ``predict_fn`` and have the following signature: - -.. code:: python - - def input_fn(request_body, request_content_type) - -Where ``request_body`` is a byte buffer and ``request_content_type`` is a Python string - -The SageMaker PyTorch model server provides a default implementation of ``input_fn``. -This function deserializes JSON, CSV, or NPY encoded data into a torch.Tensor. - -Default NPY deserialization requires ``request_body`` to follow the `NPY `_ format. For PyTorch, the Python SDK -defaults to sending prediction requests with this format. - -Default JSON deserialization requires ``request_body`` contain a single json list. -Sending multiple JSON objects within the same ``request_body`` is not supported. -The list must have a dimensionality compatible with the model loaded in ``model_fn``. -The list's shape must be identical to the model's input shape, for all dimensions after the first (which first -dimension is the batch size). - -Default csv deserialization requires ``request_body`` contain one or more lines of CSV numerical data. -The data is loaded into a two-dimensional array, where each line break defines the boundaries of the first dimension. - -The example below shows a custom ``input_fn`` for preparing pickled torch.Tensor. - -.. code:: python - - import numpy as np - import torch - from six import BytesIO - - def input_fn(request_body, request_content_type): - """An input_fn that loads a pickled tensor""" - if request_content_type == 'application/python-pickle': - return torch.load(BytesIO(request_body)) - else: - # Handle other content-types here or raise an Exception - # if the content type is not supported. - pass - - - -Get Predictions from a PyTorch Model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -After the inference request has been deserialized by ``input_fn``, the SageMaker PyTorch model server invokes -``predict_fn`` on the return value of ``input_fn``. - -As with ``input_fn``, you can define your own ``predict_fn`` or use the SageMaker PyTorch model server default. - -The ``predict_fn`` function has the following signature: - -.. code:: python - - def predict_fn(input_object, model) - -Where ``input_object`` is the object returned from ``input_fn`` and -``model`` is the model loaded by ``model_fn``. - -The default implementation of ``predict_fn`` invokes the loaded model's ``__call__`` function on ``input_object``, -and returns the resulting value. The return-type should be a torch.Tensor to be compatible with the default -``output_fn``. - -The example below shows an overridden ``predict_fn``: - -.. code:: python - - import torch - import numpy as np - - def predict_fn(input_data, model): - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - model.to(device) - model.eval() - with torch.no_grad(): - return model(input_data.to(device)) - -If you implement your own prediction function, you should take care to ensure that: - -- The first argument is expected to be the return value from input_fn. 
-  If you use the default input_fn, this will be a torch.Tensor.
-- The second argument is the loaded model.
-- The return value should be of the correct type to be passed as the
-  first argument to ``output_fn``. If you use the default
-  ``output_fn``, this should be a torch.Tensor.
-
-The default Elastic Inference ``predict_fn`` is similar but runs the TorchScript model using ``torch.jit.optimized_execution``.
-If you are implementing your own ``predict_fn``, please also use the ``torch.jit.optimized_execution``
-block, for example:
-
-.. code:: python
-
-    import torch
-
-    def predict_fn(input_data, model):
-        device = torch.device("cpu")
-        model = model.to(device)
-        input_data = input_data.to(device)
-        model.eval()
-        with torch.jit.optimized_execution(True, {"target_device": "eia:0"}):
-            output = model(input_data)
-        return output
-
-If you use PyTorch Elastic Inference 1.5.1, please implement your own ``predict_fn`` as shown below.
-
-.. code:: python
-
-    import torch
-
-    def predict_fn(input_data, model):
-        device = torch.device("cpu")
-        input_data = input_data.to(device)
-        # make sure torcheia is imported so that Elastic Inference api call will be invoked
-        import torcheia
-        # we need to set the profiling executor for EIA
-        torch._C._jit_set_profiling_executor(False)
-        with torch.jit.optimized_execution(True):
-            output = model.forward(input_data)
-        return output
-
-Process Model Output
-^^^^^^^^^^^^^^^^^^^^
-
-After invoking ``predict_fn``, the model server invokes ``output_fn``, passing in the return value from ``predict_fn``
-and the content type for the response, as specified by the InvokeEndpoint request.
-
-The ``output_fn`` has the following signature:
-
-.. code:: python
-
-    def output_fn(prediction, content_type)
-
-Where ``prediction`` is the result of invoking ``predict_fn`` and ``content_type`` is
-the content type for the response, as specified by the InvokeEndpoint request.
-The function should return a byte array of data serialized to ``content_type``.
-
-The default implementation expects ``prediction`` to be a torch.Tensor and can serialize the result to JSON, CSV, or NPY.
-It accepts response content types of "application/json", "text/csv", and "application/x-npy".
-
-
-Bring your own model
-====================
-
-You can deploy a PyTorch model that you trained outside of SageMaker by using the ``PyTorchModel`` class.
-Typically, you save a PyTorch model as a file with extension ``.pt`` or ``.pth``.
-To do this, you need to:
-
-* Write an inference script.
-* Create the directory structure for your model files.
-* Create the ``PyTorchModel`` object.
-
-Write an inference script
--------------------------
-
-You must create an inference script that implements (at least) the ``model_fn`` function that calls the loaded model to get a prediction.
-
-**Note**: If you use Elastic Inference with PyTorch, you can use the default ``model_fn`` implementation provided in the serving container.
-
-Optionally, you can also implement ``input_fn`` and ``output_fn`` to process input and output,
-and ``predict_fn`` to customize how the model server gets predictions from the loaded model.
-For information about how to write an inference script, see `Serve a PyTorch Model <#serve-a-pytorch-model>`_.
-Save the inference script in the same folder where you saved your PyTorch model.
-Pass the filename of the inference script as the ``entry_point`` parameter when you create the ``PyTorchModel`` object.
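-
-For example, a minimal inference script might define only ``model_fn``. The sketch below assumes a model class ``Net`` defined in the same file and a ``model.pth`` state dict saved during training; both names are illustrative.
-
-.. code:: python
-
-    # inference.py -- a minimal sketch
-    import os
-
-    import torch
-    import torch.nn as nn
-
-    class Net(nn.Module):
-        """A placeholder network; replace with the architecture you trained."""
-        def __init__(self):
-            super().__init__()
-            self.fc = nn.Linear(10, 2)
-
-        def forward(self, x):
-            return self.fc(x)
-
-    def model_fn(model_dir):
-        """Load the model from the directory where SageMaker extracted model.tar.gz."""
-        model = Net()
-        with open(os.path.join(model_dir, 'model.pth'), 'rb') as f:
-            model.load_state_dict(torch.load(f, map_location=torch.device('cpu')))
-        model.eval()
-        return model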
- -Create the directory structure for your model files ---------------------------------------------------- - -You have to create a directory structure and place your model files in the correct location. -The ``PyTorchModel`` constructor packs the files into a ``tar.gz`` file and uploads it to S3. - -The directory structure where you saved your PyTorch model should look something like the following: - -**Note:** This directory struture is for PyTorch versions 1.2 and higher. -For the directory structure for versions 1.1 and lower, -see `For versions 1.1 and lower <#for-versions-1.1-and-lower>`_. - -:: - - | my_model - | |--model.pth - | - | code - | |--inference.py - | |--requirements.txt - -Where ``requirments.txt`` is an optional file that specifies dependencies on third-party libraries. - -Create a ``PyTorchModel`` object --------------------------------- - -Now call the :class:`sagemaker.pytorch.model.PyTorchModel` constructor to create a model object, and then call its ``deploy()`` method to deploy your model for inference. - -.. code:: python - - from sagemaker import get_execution_role - role = get_execution_role() - - pytorch_model = PyTorchModel(model_data='s3://my-bucket/my-path/model.tar.gz', role=role, - entry_point='inference.py') - - predictor = pytorch_model.deploy(instance_type='ml.c4.xlarge', initial_instance_count=1) - - -Now you can call the ``predict()`` method to get predictions from your deployed model. - -*********************************************** -Attach an estimator to an existing training job -*********************************************** - -You can attach a PyTorch Estimator to an existing training job using the -``attach`` method. - -.. code:: python - - my_training_job_name = 'MyAwesomePyTorchTrainingJob' - pytorch_estimator = PyTorch.attach(my_training_job_name) - -After attaching, if the training job has finished with job status "Completed", it can be -``deploy``\ ed to create a SageMaker Endpoint and return a -``Predictor``. If the training job is in progress, -attach will block and display log messages from the training job, until the training job completes. - -The ``attach`` method accepts the following arguments: - -- ``training_job_name:`` The name of the training job to attach - to. -- ``sagemaker_session:`` The Session used - to interact with SageMaker - -************************* -PyTorch Training Examples -************************* - -Amazon provides several example Jupyter notebooks that demonstrate end-to-end training on Amazon SageMaker using PyTorch. -Please refer to: - -https://github.com/awslabs/amazon-sagemaker-examples/tree/master/sagemaker-python-sdk - -These are also available in SageMaker Notebook Instance hosted Jupyter notebooks under the sample notebooks folder. - -************************* -SageMaker PyTorch Classes -************************* - -For information about the different PyTorch-related classes in the SageMaker Python SDK, see https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/sagemaker.pytorch.html. 
- -*********************************** -SageMaker PyTorch Docker Containers -*********************************** - -For information about the SageMaker PyTorch containers, see: - -- `SageMaker PyTorch training toolkit `_ -- `SageMaker PyTorch serving toolkit `_ -- `Deep Learning Container (DLC) Dockerfiles for PyTorch `_ -- `Deep Learning Container (DLC) Images `_ and `release notes `_ diff --git a/doc/frameworks/rl/index.rst b/doc/frameworks/rl/index.rst deleted file mode 100644 index ce8385aabb..0000000000 --- a/doc/frameworks/rl/index.rst +++ /dev/null @@ -1,15 +0,0 @@ -######################################### -Reinforcement Learning -######################################### - -A managed environment for Reinforcement Learning (RL) on Amazon SageMaker - -.. toctree:: - :maxdepth: 1 - - using_rl - -.. toctree:: - :maxdepth: 2 - - sagemaker.rl diff --git a/doc/frameworks/rl/sagemaker.rl.rst b/doc/frameworks/rl/sagemaker.rl.rst deleted file mode 100644 index 123ca87b4e..0000000000 --- a/doc/frameworks/rl/sagemaker.rl.rst +++ /dev/null @@ -1,10 +0,0 @@ -RLEstimator -=========== - -RLEstimator Estimator ---------------------- - -.. autoclass:: sagemaker.rl.estimator.RLEstimator - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/frameworks/rl/using_rl.rst b/doc/frameworks/rl/using_rl.rst deleted file mode 100644 index 8937f18cb3..0000000000 --- a/doc/frameworks/rl/using_rl.rst +++ /dev/null @@ -1,215 +0,0 @@ -========================================================== -Using Reinforcement Learning with the SageMaker Python SDK -========================================================== - -.. contents:: - -With Reinforcement Learning (RL) Estimators, you can train reinforcement learning models on Amazon SageMaker. - -For supported RL toolkits and their versions, see https://github.com/aws/sagemaker-rl-container/#rl-images-provided-by-sagemaker - -RL Training ------------ - -Training RL models using ``RLEstimator`` is a two-step process: - -1. Prepare a training script to run on SageMaker -2. Run this script on SageMaker via an ``RLEstimator``. - -You should prepare your script in a separate source file than the notebook, terminal session, or source file you're -using to submit the script to SageMaker via an ``RLEstimator``. This will be discussed in further detail below. - -Suppose that you already have a training script called ``coach-train.py``. -You can then create an ``RLEstimator`` with keyword arguments to point to this script and define how SageMaker runs it: - -.. code:: python - - from sagemaker.rl import RLEstimator, RLToolkit, RLFramework - - rl_estimator = RLEstimator(entry_point='coach-train.py', - toolkit=RLToolkit.COACH, - toolkit_version='0.11.1', - framework=RLFramework.TENSORFLOW, - role='SageMakerRole', - instance_type='ml.p3.2xlarge', - instance_count=1) - -After that, you simply tell the estimator to start a training job: - -.. code:: python - - rl_estimator.fit() - -In the following sections, we'll discuss how to prepare a training script for execution on SageMaker -and how to run that script on SageMaker using ``RLEstimator``. - - -Preparing the RL Training Script -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Your RL training script must be a Python 3.5 compatible source file from MXNet framework or Python 3.6 for TensorFlow. 
- -The training script is very similar to a training script you might run outside of SageMaker, but you -can access useful properties about the training environment through various environment variables, such as - -* ``SM_MODEL_DIR``: A string representing the path to the directory to write model artifacts to. - These artifacts are uploaded to S3 for model hosting. -* ``SM_NUM_GPUS``: An integer representing the number of GPUs available to the host. -* ``SM_OUTPUT_DATA_DIR``: A string representing the filesystem path to write output artifacts to. Output artifacts may - include checkpoints, graphs, and other files to save, not including model artifacts. These artifacts are compressed - and uploaded to S3 to the same S3 prefix as the model artifacts. - -For the exhaustive list of available environment variables, see the -`SageMaker Containers documentation `__. - - -RL Estimators -------------- - -The ``RLEstimator`` constructor takes both required and optional arguments. - -Required arguments -~~~~~~~~~~~~~~~~~~ - -The following are required arguments to the ``RLEstimator`` constructor. When you create an instance of ``RLEstimator``, you must include -these in the constructor, either positionally or as keyword arguments. - -- ``entry_point`` Path (absolute or relative) to the Python file which - should be executed as the entry point to training. -- ``role`` An AWS IAM role (either name or full ARN). The Amazon - SageMaker training jobs and APIs that create Amazon SageMaker - endpoints use this role to access training data and model artifacts. - After the endpoint is created, the inference code might use the IAM - role, if accessing AWS resource. -- ``instance_count`` Number of Amazon EC2 instances to use for - training. -- ``instance_type`` Type of EC2 instance to use for training, for - example, 'ml.m4.xlarge'. - -You must as well include either: - -- ``toolkit`` RL toolkit (Ray RLlib or Coach) you want to use for executing your model training code. - -- ``toolkit_version`` RL toolkit version you want to be use for executing your model training code. - -- ``framework`` Framework (MXNet or TensorFlow) you want to be used as - a toolkit backed for reinforcement learning training. - -or provide: - -- ``image_uri`` An alternative Docker image to use for training and - serving. If specified, the estimator will use this image for training and - hosting, instead of selecting the appropriate SageMaker official image based on - framework_version and py_version. Refer to: `SageMaker RL Docker Containers - <#sagemaker-rl-docker-containers>`_ for details on what the Official images support - and where to find the source code to build your custom image. - - -Optional arguments -~~~~~~~~~~~~~~~~~~ - -When you create an ``RLEstimator`` object, you can specify a number of optional arguments. -For more information, see :class:`sagemaker.rl.estimator.RLEstimator`. - -Calling fit -~~~~~~~~~~~ - -You start your training script by calling ``fit`` on an ``RLEstimator``. -For more information about what arguments can be passed to ``fit``, see :func:`sagemaker.estimator.EstimatorBase.fit`. - -Distributed RL Training ------------------------ - -Amazon SageMaker RL supports multi-core and multi-instance distributed training. -Depending on your use case, training and/or environment rollout can be distributed. - -Please see the `Amazon SageMaker examples `_ -on how it can be done using different RL toolkits. 
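-
-For illustration only, a sketch of launching a multi-instance job with the Ray RLlib toolkit might look like the following; the entry point name and version numbers are hypothetical, so check the supported-versions list referenced above before using them.
-
-.. code:: python
-
-    from sagemaker.rl import RLEstimator, RLToolkit, RLFramework
-
-    # A sketch: two instances form the Ray cluster that the toolkit container sets up.
-    rl_estimator = RLEstimator(entry_point='train-rl.py',
-                               toolkit=RLToolkit.RAY,
-                               toolkit_version='0.8.5',
-                               framework=RLFramework.TENSORFLOW,
-                               role='SageMakerRole',
-                               instance_type='ml.c5.2xlarge',
-                               instance_count=2)
-
-    rl_estimator.fit()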
-
-
-Saving models
--------------
-
-In order to save your trained model for deployment on SageMaker, your training script should save the model
-to the filesystem path ``/opt/ml/model``. This value is also accessible through the environment variable
-``SM_MODEL_DIR``.
-
-Deploying RL Models
--------------------
-
-After an RL Estimator has been fit, you can host the newly created model in SageMaker.
-
-After calling ``fit``, you can call ``deploy`` on an ``RLEstimator`` Estimator to create a SageMaker Endpoint.
-The Endpoint runs one of the SageMaker-provided model servers, based on the ``framework`` parameter
-specified in the ``RLEstimator`` constructor, and hosts the model produced by your training script,
-which was run when you called ``fit``. This was the model you saved to ``model_dir``.
-If ``image_uri`` was specified, the provided image is used for the deployment.
-
-``deploy`` returns a ``sagemaker.mxnet.MXNetPredictor`` for MXNet or
-``sagemaker.tensorflow.TensorFlowPredictor`` for TensorFlow.
-
-``predict`` returns the result of inference against your model.
-
-.. code:: python
-
-    # Train my estimator
-    rl_estimator = RLEstimator(entry_point='coach-train.py',
-                               toolkit=RLToolkit.COACH,
-                               toolkit_version='0.11.0',
-                               framework=RLFramework.MXNET,
-                               role='SageMakerRole',
-                               instance_type='ml.c4.2xlarge',
-                               instance_count=1)
-
-    rl_estimator.fit()
-
-    # Deploy my estimator to a SageMaker Endpoint and get an MXNetPredictor
-    predictor = rl_estimator.deploy(instance_type='ml.m4.xlarge',
-                                    initial_instance_count=1)
-
-    response = predictor.predict(data)
-
-For more information, please see `The SageMaker MXNet Model Server `_
-and `Deploying to TensorFlow Serving Endpoints `_ documentation.
-
-
-Working with Existing Training Jobs
------------------------------------
-
-Attaching to existing training jobs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-You can attach an RL Estimator to an existing training job using the
-``attach`` method.
-
-.. code:: python
-
-    my_training_job_name = 'MyAwesomeRLTrainingJob'
-    rl_estimator = RLEstimator.attach(my_training_job_name)
-
-After attaching, if the training job has finished with job status "Completed", it can be
-``deploy``\ ed to create a SageMaker Endpoint and return a ``Predictor``. If the training job is in progress,
-attach will block and display log messages from the training job, until the training job completes.
-
-The ``attach`` method accepts the following arguments:
-
-- ``training_job_name:`` The name of the training job to attach
-  to.
-- ``sagemaker_session:`` The Session used
-  to interact with SageMaker
-
-RL Training Examples
---------------------
-
-Amazon provides several example Jupyter notebooks that demonstrate end-to-end training on Amazon SageMaker using RL.
-Please refer to:
-
-https://github.com/awslabs/amazon-sagemaker-examples/tree/master/reinforcement_learning
-
-These are also available in SageMaker Notebook Instance hosted Jupyter notebooks under the sample notebooks folder.
-
-
-SageMaker RL Docker Containers
-------------------------------
-
-For more about the Docker images themselves, visit `the SageMaker RL containers repository `_.
diff --git a/doc/frameworks/sklearn/index.rst b/doc/frameworks/sklearn/index.rst deleted file mode 100644 index 638ce6e286..0000000000 --- a/doc/frameworks/sklearn/index.rst +++ /dev/null @@ -1,15 +0,0 @@ -############################### -Scikit-Learn -############################### - -A managed environment for Scikit-Learn training and hosting on Amazon SageMaker - -.. toctree:: - :maxdepth: 1 - - using_sklearn - -.. toctree:: - :maxdepth: 2 - - sagemaker.sklearn diff --git a/doc/frameworks/sklearn/sagemaker.sklearn.rst b/doc/frameworks/sklearn/sagemaker.sklearn.rst deleted file mode 100644 index 3ca55113e8..0000000000 --- a/doc/frameworks/sklearn/sagemaker.sklearn.rst +++ /dev/null @@ -1,34 +0,0 @@ -Scikit Learn -============ - -Scikit Learn Estimator ----------------------- - -.. autoclass:: sagemaker.sklearn.estimator.SKLearn - :members: - :undoc-members: - :show-inheritance: - -Scikit Learn Model ------------------- - -.. autoclass:: sagemaker.sklearn.model.SKLearnModel - :members: - :undoc-members: - :show-inheritance: - -Scikit Learn Predictor ----------------------- - -.. autoclass:: sagemaker.sklearn.model.SKLearnPredictor - :members: - :undoc-members: - :show-inheritance: - -Scikit Learn Processor ----------------------- - -.. autoclass:: sagemaker.sklearn.processing.SKLearnProcessor - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/frameworks/sklearn/using_sklearn.rst b/doc/frameworks/sklearn/using_sklearn.rst deleted file mode 100644 index 8e82a1d86c..0000000000 --- a/doc/frameworks/sklearn/using_sklearn.rst +++ /dev/null @@ -1,518 +0,0 @@ -################################################ -Using Scikit-learn with the SageMaker Python SDK -################################################ - -With Scikit-learn Estimators, you can train and host Scikit-learn models on Amazon SageMaker. - -For information about supported versions of Scikit-learn, see the `AWS documentation `__. -We recommend that you use the latest supported version because that's where we focus most of our development efforts. - -You can visit the Scikit-learn repository at https://github.com/scikit-learn/scikit-learn. -For general information about using the SageMaker Python SDK, see :ref:`overview:Using the SageMaker Python SDK`. - -.. contents:: - -******************************* -Train a Model with Scikit-learn -******************************* - -To train a Scikit-learn model by using the SageMaker Python SDK: - -.. |create sklearn estimator| replace:: Create a ``sagemaker.sklearn.SKLearn`` Estimator -.. _create sklearn estimator: #create-an-estimator - -.. |call fit| replace:: Call the estimator's ``fit`` method -.. _call fit: #call-the-fit-method - -1. `Prepare a training script <#prepare-a-scikit-learn-training-script>`_ -2. |create sklearn estimator|_ -3. |call fit|_ - -Prepare a Scikit-learn Training Script -====================================== - -Your Scikit-learn training script must be a Python 3.6 compatible source file. - -The training script is similar to a training script you might run outside of SageMaker, but you -can access useful properties about the training environment through various environment variables. -For example: - -* ``SM_MODEL_DIR``: A string representing the path to the directory to write model artifacts to. - These artifacts are uploaded to S3 for model hosting. -* ``SM_OUTPUT_DATA_DIR``: A string representing the filesystem path to write output artifacts to. 
Output artifacts may - include checkpoints, graphs, and other files to save, not including model artifacts. These artifacts are compressed - and uploaded to S3 to the same S3 prefix as the model artifacts. - -Supposing two input channels, 'train' and 'test', were used in the call to the Scikit-learn estimator's ``fit()`` method, -the following will be set, following the format "SM_CHANNEL_[channel_name]": - -* ``SM_CHANNEL_TRAIN``: A string representing the path to the directory containing data in the 'train' channel -* ``SM_CHANNEL_TEST``: Same as above, but for the 'test' channel. - -A typical training script loads data from the input channels, configures training with hyperparameters, trains a model, -and saves a model to model_dir so that it can be hosted later. Hyperparameters are passed to your script as arguments -and can be retrieved with an argparse.ArgumentParser instance. For example, a training script might start -with the following: - -.. code:: python - - import argparse - import os - - if __name__ =='__main__': - - parser = argparse.ArgumentParser() - - # hyperparameters sent by the client are passed as command-line arguments to the script. - parser.add_argument('--epochs', type=int, default=50) - parser.add_argument('--batch-size', type=int, default=64) - parser.add_argument('--learning-rate', type=float, default=0.05) - - # Data, model, and output directories - parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR')) - parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR')) - parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN')) - parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST')) - - args, _ = parser.parse_known_args() - - # ... load from args.train and args.test, train a model, write model to args.model_dir. - -Because the SageMaker imports your training script, you should put your training code in a main guard -(``if __name__=='__main__':``) if you are using the same script to host your model, so that SageMaker does not -inadvertently run your training code at the wrong point in execution. - -For more on training environment variables, please visit https://github.com/aws/sagemaker-containers. - -.. important:: - The sagemaker-containers repository has been deprecated, - however it is still used to define Scikit-learn and XGBoost environment variables. - -Save the Model --------------- - -In order to save your trained Scikit-learn model for deployment on SageMaker, your training script should save your -model to a certain filesystem path called `model_dir`. This value is accessible through the environment variable -``SM_MODEL_DIR``. The following code demonstrates how to save a trained Scikit-learn model named ``model`` as -``model.joblib`` at the end of training: - -.. code:: python - - from sklearn.externals import joblib - import argparse - import os - - if __name__=='__main__': - # default to the value in environment variable `SM_MODEL_DIR`. Using args makes the script more portable. - parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR']) - args, _ = parser.parse_known_args() - - # ... 
train classifier `clf`, then save it to `model_dir` as file 'model.joblib' - joblib.dump(clf, os.path.join(args.model_dir, "model.joblib")) - -After your training job is complete, SageMaker will compress and upload the serialized model to S3, and your model data -will be available in the s3 ``output_path`` you specified when you created the Scikit-learn Estimator. - -Using third-party libraries --------------------------- - -When running your training script on SageMaker, it has access to some pre-installed third-party libraries including ``scikit-learn``, ``numpy``, and ``pandas``. -For more information on the runtime environment, including specific package versions, see `SageMaker Scikit-learn Docker Container `__. - -If there are other packages you want to use with your script, you can include a ``requirements.txt`` file in the same directory as your training script to install other dependencies at runtime. -Both ``requirements.txt`` and your training script should be put in the same folder. -You must specify this folder in the ``source_dir`` argument when creating a Scikit-learn estimator. -A ``requirements.txt`` file is a text file that contains a list of items that are installed by using ``pip install``. -You can also specify the version of an item to install. -For information about the format of a ``requirements.txt`` file, see `Requirements Files `__ in the pip documentation. - -Create an Estimator -=================== - -You run Scikit-learn training scripts on SageMaker by creating ``SKLearn`` Estimators. -Call the ``fit`` method on a ``SKLearn`` Estimator to start a SageMaker training job. -The following code sample shows how you train a custom Scikit-learn script named "sklearn-train.py", passing -in three hyperparameters ('epochs', 'batch-size', and 'learning-rate'), and using two input channel -directories ('train' and 'test'). - -.. code:: python - - sklearn_estimator = SKLearn('sklearn-train.py', - instance_type='ml.m4.xlarge', - framework_version='0.20.0', - hyperparameters = {'epochs': 20, 'batch-size': 64, 'learning-rate': 0.1}) - sklearn_estimator.fit({'train': 's3://my-data-bucket/path/to/my/training/data', - 'test': 's3://my-data-bucket/path/to/my/test/data'}) - - -Call the fit Method -=================== - -You start your training script by calling ``fit`` on a ``SKLearn`` Estimator. ``fit`` takes both required and optional -arguments. - -fit Required arguments ---------------------- - -- ``inputs``: This can take one of the following forms: a string - s3 URI, for example ``s3://my-bucket/my-training-data``, in which - case the s3 objects rooted at the ``my-training-data`` prefix will - be available in the default ``train`` channel; or a dict from - string channel names to s3 URIs, in which case the objects rooted at - each s3 prefix will be available as files in each channel directory. - -For example: - -.. code:: python - - {'train':'s3://my-bucket/my-training-data', - 'eval':'s3://my-bucket/my-evaluation-data'} - -.. optional-arguments-1: - -fit Optional arguments ---------------------- - -- ``wait``: Defaults to True, whether to block and wait for the - training script to complete before returning. -- ``logs``: Defaults to True, whether to show logs produced by training - job in the Python session. Only meaningful when wait is True.
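To make the ``wait`` and ``logs`` options concrete, the following sketch (not part of the original example; it reuses the ``sklearn_estimator`` created above) starts training without blocking and then re-attaches to the job later, mirroring the pattern shown for attaching to existing training jobs below:

.. code:: python

    # Start the training job and return immediately.
    sklearn_estimator.fit({'train': 's3://my-data-bucket/path/to/my/training/data',
                           'test': 's3://my-data-bucket/path/to/my/test/data'},
                          wait=False)
    training_job_name = sklearn_estimator.latest_training_job.name

    # Later, possibly in a different Python session, re-attach to the job.
    # If the job is still running, attach streams its logs and blocks until it completes.
    sklearn_estimator = SKLearn.attach(training_job_name)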
*************************** -Deploy a Scikit-learn Model -*************************** - -After you fit a Scikit-learn Estimator, you can host the newly created model in SageMaker. - -After you call ``fit``, you can call ``deploy`` on an ``SKLearn`` estimator to create a SageMaker endpoint. -The endpoint runs a SageMaker-provided Scikit-learn model server and hosts the model produced by your training script, -which was run when you called ``fit``. This was the model you saved to ``model_dir``. - -``deploy`` returns a ``Predictor`` object, which you can use to do inference on the Endpoint hosting your Scikit-learn -model. Each ``Predictor`` provides a ``predict`` method which can do inference with numpy arrays or Python lists. -Inference arrays or lists are serialized and sent to the Scikit-learn model server by an ``InvokeEndpoint`` SageMaker -operation. - -``predict`` returns the result of inference against your model. By default, the inference result is a NumPy array. - -.. code:: python - - # Train my estimator - sklearn_estimator = SKLearn(entry_point='train_and_deploy.py', - instance_type='ml.m4.xlarge', - framework_version='0.20.0') - sklearn_estimator.fit('s3://my_bucket/my_training_data/') - - # Deploy my estimator to a SageMaker Endpoint and get a Predictor - predictor = sklearn_estimator.deploy(instance_type='ml.m4.xlarge', - initial_instance_count=1) - - # `data` is a NumPy array or a Python list. - # `response` is a NumPy array. - response = predictor.predict(data) - -You use the SageMaker Scikit-learn model server to host your Scikit-learn model when you call ``deploy`` -on an ``SKLearn`` Estimator. The model server runs inside a SageMaker Endpoint, which your call to ``deploy`` creates. -You can access the name of the Endpoint through the ``endpoint_name`` property on the returned ``Predictor``. - - -SageMaker Scikit-learn Model Server -=================================== - -The Scikit-learn Endpoint you create with ``deploy`` runs a SageMaker Scikit-learn model server. -The model server loads the model that was saved by your training script and performs inference on the model in response -to SageMaker InvokeEndpoint API calls. - -You can configure two components of the SageMaker Scikit-learn model server: Model loading and model serving. -Model loading is the process of deserializing your saved model back into a Scikit-learn model. -Serving is the process of translating InvokeEndpoint requests to inference calls on the loaded model. - -You configure the Scikit-learn model server by defining functions in the Python source file you passed to the -Scikit-learn constructor. - -Load a Model ------------ - -Before a model can be served, it must be loaded. The SageMaker Scikit-learn model server loads your model by invoking a -``model_fn`` function that you must provide in your script. The ``model_fn`` should have the following signature: - -.. code:: python - - def model_fn(model_dir) - -SageMaker will inject the directory where your model files and sub-directories, saved by your training script, have been mounted. -Your model function should return a model object that can be used for model serving. - -If the object you return provides a ``predict`` method (for example, a fitted Scikit-learn estimator), then you will be able to use the default serving request handling functions. - -The following code-snippet shows an example ``model_fn`` implementation. -This loads and returns a Scikit-learn Classifier from a ``model.joblib`` file in the SageMaker model directory -``model_dir``. - -..
code:: python - - from sklearn.externals import joblib - import os - - def model_fn(model_dir): - clf = joblib.load(os.path.join(model_dir, "model.joblib")) - return clf - -Serve a Model -------------- - -After the SageMaker model server has loaded your model by calling ``model_fn``, SageMaker will serve your model. -Model serving is the process of responding to inference requests, received by SageMaker InvokeEndpoint API calls. -The SageMaker Scikit-learn model server breaks request handling into three steps: - - -- input processing, -- prediction, and -- output processing. - -In a similar way to model loading, you configure these steps by defining functions in your Python source file. - -Each step involves invoking a python function, with information about the request and the return-value from the previous -function in the chain. Inside the SageMaker Scikit-learn model server, the process looks like: - -.. code:: python - - # Deserialize the Invoke request body into an object we can perform prediction on - input_object = input_fn(request_body, request_content_type) - - # Perform prediction on the deserialized object, with the loaded model - prediction = predict_fn(input_object, model) - - # Serialize the prediction result into the desired response content type - output = output_fn(prediction, response_content_type) - -The above code-sample shows the three function definitions: - -- ``input_fn``: Takes request data and deserializes the data into an - object for prediction. -- ``predict_fn``: Takes the deserialized request object and performs - inference against the loaded model. -- ``output_fn``: Takes the result of prediction and serializes this - according to the response content type. - -The SageMaker Scikit-learn model server provides default implementations of these functions. -You can provide your own implementations for these functions in your hosting script. -If you omit any definition then the SageMaker Scikit-learn model server will use its default implementation for that -function. - -The ``Predictor`` used by Scikit-learn in the SageMaker Python SDK serializes NumPy arrays to the `NPY `_ format -by default, with Content-Type ``application/x-npy``. The SageMaker Scikit-learn model server can deserialize NPY-formatted -data (along with JSON and CSV data). - -If you rely solely on the SageMaker Scikit-learn model server defaults, you get the following functionality: - -- Prediction on models that implement the ``__call__`` method -- Serialization and deserialization of NumPy arrays. - -The default ``input_fn`` and ``output_fn`` are meant to make it easy to predict on NumPy arrays. If your model expects -a NumPy array and returns a NumPy array, then these functions do not have to be overridden when sending NPY-formatted -data. - -In the following sections we describe the default implementations of input_fn, predict_fn, and output_fn. -We describe the input arguments and expected return types of each, so you can define your own implementations. - -Process Input -^^^^^^^^^^^^^ - -When an InvokeEndpoint operation is made against an Endpoint running a SageMaker Scikit-learn model server, -the model server receives two pieces of information: - -- The request Content-Type, for example "application/x-npy" -- The request data body, a byte array - -The SageMaker Scikit-learn model server will invoke an "input_fn" function in your hosting script, -passing in this information. 
If you define your own ``input_fn`` function, -it should return an object that can be passed to ``predict_fn`` and have the following signature: - -.. code:: python - - def input_fn(request_body, request_content_type) - -Where ``request_body`` is a byte buffer and ``request_content_type`` is a Python string. - -The SageMaker Scikit-learn model server provides a default implementation of ``input_fn``. -This function deserializes JSON, CSV, or NPY encoded data into a NumPy array. - -Default NPY deserialization requires ``request_body`` to follow the `NPY `_ format. For Scikit-learn, the Python SDK -defaults to sending prediction requests with this format. - -Default json deserialization requires ``request_body`` to contain a single json list. -Sending multiple json objects within the same ``request_body`` is not supported. -The list must have a dimensionality compatible with the model loaded in ``model_fn``. -The list's shape must be identical to the model's input shape, for all dimensions after the first (the first -dimension is the batch size). - -Default csv deserialization requires ``request_body`` to contain one or more lines of CSV numerical data. -The data is loaded into a two-dimensional array, where each line break defines the boundaries of the first dimension. - -The example below shows a custom ``input_fn`` for preparing pickled NumPy arrays. - -.. code:: python - - from io import BytesIO - - import numpy as np - - def input_fn(request_body, request_content_type): - """An input_fn that loads a pickled numpy array""" - if request_content_type == "application/python-pickle": - # request_body is a byte buffer, so wrap it in BytesIO before loading - array = np.load(BytesIO(request_body), allow_pickle=True) - return array - else: - # Handle other content-types here or raise an Exception - # if the content type is not supported. - pass - - -Get Predictions --------------- - -After the inference request has been deserialized by ``input_fn``, the SageMaker Scikit-learn model server invokes -``predict_fn`` on the return value of ``input_fn``. - -As with ``input_fn``, you can define your own ``predict_fn`` or use the SageMaker Scikit-learn model server default. - -The ``predict_fn`` function has the following signature: - -.. code:: python - - def predict_fn(input_object, model) - -Where ``input_object`` is the object returned from ``input_fn`` and -``model`` is the model loaded by ``model_fn``. - -The default implementation of ``predict_fn`` invokes the loaded model's ``predict`` function on ``input_object``, -and returns the resulting value. The return-type should be a NumPy array to be compatible with the default -``output_fn``. - -The example below shows an overridden ``predict_fn`` for a Logistic Regression classifier. This model accepts a -Python list and returns a tuple of predictions and prediction probabilities from the model in a NumPy array. -This ``predict_fn`` can rely on the default ``input_fn`` and ``output_fn`` because ``input_data`` is a NumPy array, -and the return value of this function is a NumPy array. - -.. code:: python - - import sklearn - import numpy as np - - def predict_fn(input_data, model): - prediction = model.predict(input_data) - pred_prob = model.predict_proba(input_data) - return np.array([prediction, pred_prob]) - -If you implement your own prediction function, you should take care to ensure that: - -- The first argument is expected to be the return value from input_fn. - If you use the default input_fn, this will be a NumPy array. -- The second argument is the loaded model. -- The return value should be of the correct type to be passed as the - first argument to ``output_fn``. If you use the default - ``output_fn``, this should be a NumPy array. - -Process Output -^^^^^^^^^^^^^^ - -After invoking ``predict_fn``, the model server invokes ``output_fn``, passing in the return-value from ``predict_fn`` -and the InvokeEndpoint requested response content-type. - -The ``output_fn`` has the following signature: - -.. code:: python - - def output_fn(prediction, content_type) - -Where ``prediction`` is the result of invoking ``predict_fn`` and -``content_type`` is the InvokeEndpoint requested response content-type. -The function should return a byte array of data serialized to content_type. - -The default implementation expects ``prediction`` to be a NumPy array and can serialize the result to JSON, CSV, or NPY. -It accepts response content types of "application/json", "text/csv", and "application/x-npy".
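For illustration only, a minimal custom ``output_fn`` (a sketch; it assumes ``prediction`` is a NumPy array, as returned by the default ``predict_fn``) could serialize the result to JSON or CSV like this:

.. code:: python

    import json

    import numpy as np

    def output_fn(prediction, content_type):
        """Serialize a NumPy prediction to the requested content type."""
        if content_type == "application/json":
            return json.dumps(prediction.tolist()).encode("utf-8")
        if content_type == "text/csv":
            # one row per line, values separated by commas
            rows = np.atleast_2d(prediction)
            return "\n".join(",".join(str(x) for x in row) for row in rows).encode("utf-8")
        raise ValueError("Unsupported content type: {}".format(content_type))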
Working with existing model data and training jobs -================================================== - -Attach to Existing Training Jobs -------------------------------- - -You can attach a Scikit-learn Estimator to an existing training job using the -``attach`` method. - -.. code:: python - - my_training_job_name = "MyAwesomeSKLearnTrainingJob" - sklearn_estimator = SKLearn.attach(my_training_job_name) - -After attaching, if the training job is in a Complete status, it can be -``deploy``\ ed to create a SageMaker Endpoint and return a -``Predictor``. If the training job is in progress, -attach will block and display log messages from the training job, until the training job completes. - -The ``attach`` method accepts the following arguments: - -- ``training_job_name (str):`` The name of the training job to attach - to. -- ``sagemaker_session (sagemaker.Session or None):`` The Session used - to interact with SageMaker. - -Deploy an Endpoint from Model Data ---------------------------------- - -As well as attaching to existing training jobs, you can deploy models directly from model data in S3. -The following code sample shows how to do this, using the ``SKLearnModel`` class. - -.. code:: python - - sklearn_model = SKLearnModel(model_data="s3://bucket/model.tar.gz", - role="SageMakerRole", - entry_point="transform_script.py", - framework_version="0.20.0") - - predictor = sklearn_model.deploy(instance_type="ml.c4.xlarge", initial_instance_count=1) - -To see what arguments are accepted by the ``SKLearnModel`` constructor, see :class:`sagemaker.sklearn.model.SKLearnModel`. - -Your model data must be a .tar.gz file in S3. SageMaker Training Job model data is saved to .tar.gz files in S3, -however if you have local data you want to deploy, you can prepare the data yourself. - -Assuming you have a local directory containing your model data named "my_model", you can tar and gzip compress the directory and -upload it to S3 using the following commands: - -:: - - tar -czf model.tar.gz my_model - aws s3 cp model.tar.gz s3://my-bucket/my-path/model.tar.gz - -This packages the contents of my_model into a gzip compressed tar file and uploads it to S3 in the bucket "my-bucket", with the key -"my-path/model.tar.gz". - -To run these commands, you'll need the aws cli tool installed. Please refer to our `FAQ <#FAQ>`__ for more information on -installing this.
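If you prefer to stay in Python instead of using the AWS CLI, a rough equivalent is sketched below. It assumes the default SageMaker session and the same bucket and key prefix as the CLI example; ``upload_data`` returns the S3 URI of the uploaded archive, which can then be passed to ``SKLearnModel``:

.. code:: python

    import sagemaker

    session = sagemaker.Session()

    # Upload the local model.tar.gz and get back its S3 URI.
    model_data = session.upload_data(path="model.tar.gz",
                                     bucket="my-bucket",
                                     key_prefix="my-path")

    sklearn_model = SKLearnModel(model_data=model_data,
                                 role="SageMakerRole",
                                 entry_point="transform_script.py",
                                 framework_version="0.20.0")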
- -****************************** -Scikit-learn Training Examples -****************************** - -Amazon provides an example Jupyter notebook that demonstrate end-to-end training on Amazon SageMaker using Scikit-learn: - -https://github.com/awslabs/amazon-sagemaker-examples/tree/master/sagemaker-python-sdk - -These are also available in SageMaker Notebook Instance hosted Jupyter notebooks under the "sample notebooks" folder. - -****************************** -SageMaker scikit-learn Classes -****************************** - -For information about the different scikit-learn classes in the SageMaker Python SDK, see https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/sagemaker.sklearn.html. - -**************************************** -SageMaker Scikit-learn Docker Containers -**************************************** - -You can visit the SageMaker Scikit-Learn containers repository here: https://github.com/aws/sagemaker-scikit-learn-container diff --git a/doc/frameworks/sparkml/index.rst b/doc/frameworks/sparkml/index.rst deleted file mode 100644 index 3001318e06..0000000000 --- a/doc/frameworks/sparkml/index.rst +++ /dev/null @@ -1,10 +0,0 @@ -################################## -SparkML Serving -################################## - -A managed environment for SparkML Serving on Amazon SageMaker - -.. toctree:: - :maxdepth: 2 - - sagemaker.sparkml diff --git a/doc/frameworks/sparkml/sagemaker.sparkml.rst b/doc/frameworks/sparkml/sagemaker.sparkml.rst deleted file mode 100644 index f3a0eae2ef..0000000000 --- a/doc/frameworks/sparkml/sagemaker.sparkml.rst +++ /dev/null @@ -1,18 +0,0 @@ -SparkML Serving -=============== - -SparkML Model -------------- - -.. autoclass:: sagemaker.sparkml.model.SparkMLModel - :members: - :undoc-members: - :show-inheritance: - -SparkML Predictor ------------------ - -.. autoclass:: sagemaker.sparkml.model.SparkMLPredictor - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/frameworks/tensorflow/deploying_tensorflow_serving.rst b/doc/frameworks/tensorflow/deploying_tensorflow_serving.rst deleted file mode 100644 index 8926d36486..0000000000 --- a/doc/frameworks/tensorflow/deploying_tensorflow_serving.rst +++ /dev/null @@ -1,655 +0,0 @@ -Deploying to TensorFlow Serving Endpoints -========================================= - -Table of Contents -~~~~~~~~~~~~~~~~~ - -- `Deploying from an Estimator`_ -- `Deploying directly from model artifacts`_ -- `Making predictions against a SageMaker Endpoint`_ -- `Deploying more than one model to your Endpoint`_ -- `Making predictions with the AWS CLI`_ - -Deploying from an Estimator -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -After a TensorFlow estimator has been fit, it saves a TensorFlow -`SavedModel `_ bundle in -the S3 location defined by ``output_path``. You can call ``deploy`` on a TensorFlow -estimator object to create a SageMaker Endpoint: - -.. code:: python - - from sagemaker.tensorflow import TensorFlow - - estimator = TensorFlow( - entry_point="tf-train.py", - ..., - instance_count=1, - instance_type="ml.c4.xlarge", - framework_version="2.2", - py_version="py37", - ) - - estimator.fit(inputs) - - predictor = estimator.deploy(initial_instance_count=1, instance_type="ml.c5.xlarge") - - -The code block above deploys a SageMaker Endpoint with one instance of the type "ml.c5.xlarge". - -What happens when deploy is called -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Calling ``deploy`` starts the process of creating a SageMaker Endpoint. This process includes the following steps. 
- -- Starts ``initial_instance_count`` EC2 instances of the type ``instance_type``. -- On each instance, it will do the following steps: - - - start a Docker container optimized for TensorFlow Serving, see `SageMaker TensorFlow Serving containers `_. - - start a `TensorFlow Serving` process configured to run your model. - - start an HTTP server that provides access to TensorFlow Server through the SageMaker InvokeEndpoint API. - - -When the ``deploy`` call finishes, the created SageMaker Endpoint is ready for prediction requests. The -`Making predictions against a SageMaker Endpoint`_ section will explain how to make prediction requests -against the Endpoint. - -Deploying directly from model artifacts -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If you already have existing model artifacts in S3, you can skip training and deploy them directly to an endpoint: - -.. code:: python - - from sagemaker.tensorflow import TensorFlowModel - - model = TensorFlowModel(model_data='s3://mybucket/model.tar.gz', role='MySageMakerRole') - - predictor = model.deploy(initial_instance_count=1, instance_type='ml.c5.xlarge') - -Python-based TensorFlow serving on SageMaker has support for `Elastic Inference `__, which allows for inference acceleration to a hosted endpoint for a fraction of the cost of using a full GPU instance. In order to attach an Elastic Inference accelerator to your endpoint provide the accelerator type to accelerator_type to your deploy call. - -.. code:: python - - from sagemaker.tensorflow import TensorFlowModel - - model = TensorFlowModel(model_data='s3://mybucket/model.tar.gz', role='MySageMakerRole') - - predictor = model.deploy(initial_instance_count=1, instance_type='ml.c5.xlarge', accelerator_type='ml.eia1.medium') - -Making predictions against a SageMaker Endpoint -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Once you have the ``Predictor`` instance returned by ``model.deploy(...)`` or ``estimator.deploy(...)``, you -can send prediction requests to your Endpoint. - -The following code shows how to make a prediction request: - -.. code:: python - - input = { - 'instances': [1.0, 2.0, 5.0] - } - result = predictor.predict(input) - -The result object will contain a Python dict like this: - -.. code:: python - - { - 'predictions': [3.5, 4.0, 5.5] - } - -The formats of the input and the output data correspond directly to the request and response formats -of the ``Predict`` method in the `TensorFlow Serving REST API `_. - -If your SavedModel includes the right ``signature_def``, you can also make Classify or Regress requests: - -.. code:: python - - # input matches the Classify and Regress API - input = { - 'signature_name': 'tensorflow/serving/regress', - 'examples': [{'x': 1.0}, {'x': 2.0}] - } - - result = predictor.regress(input) # or predictor.classify(...) - - # result contains: - { - 'results': [3.5, 4.0] - } - -You can include multiple ``instances`` in your predict request (or multiple ``examples`` in -classify/regress requests) to get multiple prediction results in one request to your Endpoint: - -.. code:: python - - input = { - 'instances': [ - [1.0, 2.0, 5.0], - [1.0, 2.0, 5.0], - [1.0, 2.0, 5.0] - ] - } - result = predictor.predict(input) - - # result contains: - { - 'predictions': [ - [3.5, 4.0, 5.5], - [3.5, 4.0, 5.5], - [3.5, 4.0, 5.5] - ] - } - -If your application allows request grouping like this, it is **much** more efficient than making separate requests. 
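If the endpoint already exists (for example, it was created by an earlier ``deploy`` call in another session), you do not need to deploy again to send requests. A minimal sketch, assuming a hypothetical endpoint name and the JSON serializer and deserializer that ship with the SDK, is:

.. code:: python

    from sagemaker.deserializers import JSONDeserializer
    from sagemaker.predictor import Predictor
    from sagemaker.serializers import JSONSerializer

    # Attach a predictor to an existing TensorFlow Serving endpoint by name.
    predictor = Predictor("my-endpoint",
                          serializer=JSONSerializer(),
                          deserializer=JSONDeserializer())

    result = predictor.predict({"instances": [1.0, 2.0, 5.0]})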
- -Other input formats -^^^^^^^^^^^^^^^^^^^ - -SageMaker's TensforFlow Serving endpoints can also accept some additional input formats that are not part of the -TensorFlow REST API, including a simplified json format, line-delimited json objects ("jsons" or "jsonlines"), and -CSV data. - -**Simplified JSON Input** - -The Endpoint will accept simplified JSON input that doesn't match the TensorFlow REST API's Predict request format. -When the Endpoint receives data like this, it will attempt to transform it into a valid -Predict request, using a few simple rules: - -- python value, dict, or one-dimensional arrays are treated as the input value in a single 'instance' Predict request. -- multidimensional arrays are treated as a multiple values in a multi-instance Predict request. - -Combined with the client-side ``Predictor`` object's JSON serialization, this allows you to make simple -requests like this: - -.. code:: python - - input = [ - [1.0, 2.0, 5.0], - [1.0, 2.0, 5.0] - ] - result = predictor.predict(input) - - # result contains: - { - 'predictions': [ - [3.5, 4.0, 5.5], - [3.5, 4.0, 5.5] - ] - } - -Or this: - -.. code:: python - - # 'x' must match name of input tensor in your SavedModel graph - # for models with multiple named inputs, just include all the keys in the input dict - input = { - 'x': [1.0, 2.0, 5.0] - } - - # result contains: - { - 'predictions': [ - [3.5, 4.0, 5.5] - ] - } - - -**Line-delimited JSON** - -The Endpoint will accept line-delimited JSON objects (also known as "jsons" or "jsonlines" data). -The Endpoint treats each line as a separate instance in a multi-instance Predict request. To use -this feature from your python code, you need to create a ``Predictor`` instance that does not -try to serialize your input to JSON: - -.. code:: python - - # create a Predictor without JSON serialization - - predictor = Predictor('endpoint-name', serializer=None, content_type='application/jsonlines') - - input = '''{'x': [1.0, 2.0, 5.0]} - {'x': [1.0, 2.0, 5.0]} - {'x': [1.0, 2.0, 5.0]}''' - - result = predictor.predict(input) - - # result contains: - { - 'predictions': [ - [3.5, 4.0, 5.5], - [3.5, 4.0, 5.5], - [3.5, 4.0, 5.5] - ] - } - -This feature is especially useful if you are reading data from a file containing jsonlines data. - -**CSV (comma-separated values)** - -The Endpoint will accept CSV data. Each line is treated as a separate instance. This is a -compact format for representing multiple instances of 1-d array data. To use this feature -from your python code, you need to create a ``Predictor`` instance that can serialize -your input data to CSV format: - -.. code:: python - - # create a Predictor with JSON serialization - - predictor = Predictor('endpoint-name', serializer=sagemaker.serializers.CSVSerializer()) - - # CSV-formatted string input - input = '1.0,2.0,5.0\n1.0,2.0,5.0\n1.0,2.0,5.0' - - result = predictor.predict(input) - - # result contains: - { - 'predictions': [ - [3.5, 4.0, 5.5], - [3.5, 4.0, 5.5], - [3.5, 4.0, 5.5] - ] - } - -You can also use python arrays or numpy arrays as input and let the ``CSVSerializer`` object -convert them to CSV, but the client-size CSV conversion is more sophisticated than the -CSV parsing on the Endpoint, so if you encounter conversion problems, try using one of the -JSON options instead. - - -Specifying the output of a prediction request -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The structure of the prediction ``result`` is determined at the end of the training process before SavedModel is created. 
For example, if -you are using TensorFlow's Estimator API for training, you control inference outputs using the ``export_outputs`` parameter of the `tf.estimator.EstimatorSpec `_ that you return from your ``model_fn``. - -More information on how to create ``export_outputs`` can be found in `specifying the outputs of a custom model `_. You can also -refer to TensorFlow's `Save and Restore `_ documentation for other ways to control the -inference-time behavior of your SavedModels. - -Providing Python scripts for pre/pos-processing -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -You can add your customized Python code to process your input and output data. -This customized Python code must be named ``inference.py`` and specified through the ``entry_point`` parameter: - -.. code:: - - from sagemaker.tensorflow import TensorFlowModel - - model = Model(entry_point='inference.py', - model_data='s3://mybucket/model.tar.gz', - role='MySageMakerRole') - -How to implement the pre- and/or post-processing handler(s) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Your entry point file must be named ``inference.py`` and should implement - either a pair of ``input_handler`` and ``output_handler`` functions or - a single ``handler`` function. - Note that if ``handler`` function is implemented, ``input_handler`` - and ``output_handler`` are ignored. - -To implement pre- and/or post-processing handler(s), use the Context -object that the Python service creates. The Context object is a namedtuple with the following attributes: - -- ``model_name (string)``: the name of the model to use for - inference. For example, 'half-plus-three' - -- ``model_version (string)``: version of the model. For example, '5' - -- ``method (string)``: inference method. For example, 'predict', - 'classify' or 'regress', for more information on methods, please see - `Classify and Regress - API `__ - and `Predict - API `__ - -- ``rest_uri (string)``: the TFS REST uri generated by the Python - service. For example, - 'http://localhost:8501/v1/models/half_plus_three:predict' - -- ``grpc_uri (string)``: the GRPC port number generated by the Python - service. For example, '9000' - -- ``custom_attributes (string)``: content of - 'X-Amzn-SageMaker-Custom-Attributes' header from the original - request. For example, - 'tfs-model-name=half*plus*\ three,tfs-method=predict' - -- ``request_content_type (string)``: the original request content type, - defaulted to 'application/json' if not provided - -- ``accept_header (string)``: the original request accept type, - defaulted to 'application/json' if not provided - -- ``content_length (int)``: content length of the original request - -The following code example implements ``input_handler`` and -``output_handler``. By providing these, the Python service posts the -request to the TFS REST URI with the data pre-processed by ``input_handler`` -and passes the response to ``output_handler`` for post-processing. - -.. 
code:: - - import json - - def input_handler(data, context): - """ Pre-process request input before it is sent to TensorFlow Serving REST API - Args: - data (obj): the request data, in format of dict or string - context (Context): an object containing request and configuration details - Returns: - (dict): a JSON-serializable dict that contains request body and headers - """ - if context.request_content_type == 'application/json': - # pass through json (assumes it's correctly formed) - d = data.read().decode('utf-8') - return d if len(d) else '' - - if context.request_content_type == 'text/csv': - # very simple csv handler - return json.dumps({ - 'instances': [float(x) for x in data.read().decode('utf-8').split(',')] - }) - - raise ValueError('{{"error": "unsupported content type {}"}}'.format( - context.request_content_type or "unknown")) - - - def output_handler(data, context): - """Post-process TensorFlow Serving output before it is returned to the client. - Args: - data (obj): the TensorFlow serving response - context (Context): an object containing request and configuration details - Returns: - (bytes, string): data to return to client, response content type - """ - if data.status_code != 200: - raise ValueError(data.content.decode('utf-8')) - - response_content_type = context.accept_header - prediction = data.content - return prediction, response_content_type - -You might want to have complete control over the request. -For example, you might want to make a TFS request (REST or GRPC) to the first model, -inspect the results, and then make a request to a second model. In this case, implement -the ``handler`` method instead of the ``input_handler`` and ``output_handler`` methods, as demonstrated -in the following code: - -.. code:: - - import json - import requests - - - def handler(data, context): - """Handle request. - Args: - data (obj): the request data - context (Context): an object containing request and configuration details - Returns: - (bytes, string): data to return to client, (optional) response content type - """ - processed_input = _process_input(data, context) - response = requests.post(context.rest_uri, data=processed_input) - return _process_output(response, context) - - - def _process_input(data, context): - if context.request_content_type == 'application/json': - # pass through json (assumes it's correctly formed) - d = data.read().decode('utf-8') - return d if len(d) else '' - - if context.request_content_type == 'text/csv': - # very simple csv handler - return json.dumps({ - 'instances': [float(x) for x in data.read().decode('utf-8').split(',')] - }) - - raise ValueError('{{"error": "unsupported content type {}"}}'.format( - context.request_content_type or "unknown")) - - - def _process_output(data, context): - if data.status_code != 200: - raise ValueError(data.content.decode('utf-8')) - - response_content_type = context.accept_header - prediction = data.content - return prediction, response_content_type - -You can also bring in external dependencies to help with your data -processing. There are 2 ways to do this: - -1. If you included ``requirements.txt`` in your ``source_dir``, the container installs the Python dependencies at runtime using ``pip install -r``: - -.. code:: - - from sagemaker.tensorflow import TensorFlowModel - - model = Model(entry_point='inference.py', - source_dir='source/directory', - model_data='s3://mybucket/model.tar.gz', - role='MySageMakerRole') - - -2. 
If you are working in a network-isolation situation or if you don't - want to install dependencies at runtime every time your endpoint starts or a batch - transform job runs, you might want to put - pre-downloaded dependencies under a ``lib`` directory and this - directory as dependency. The container adds the modules to the Python - path. Note that if both ``lib`` and ``requirements.txt`` - are present in the model archive, the ``requirements.txt`` is ignored: - -.. code:: - - from sagemaker.tensorflow import TensorFlowModel - - model = Model(entry_point='inference.py', - dependencies=['/path/to/folder/named/lib'], - model_data='s3://mybucket/model.tar.gz', - role='MySageMakerRole') - -For more information, see: https://github.com/aws/sagemaker-tensorflow-serving-container#prepost-processing - -Deploying more than one model to your Endpoint -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -TensorFlow Serving Endpoints allow you to deploy multiple models to the same Endpoint when you create the endpoint. - -To use this feature, you will need to: - -#. create a multi-model archive file -#. create a SageMaker Model and deploy it to an Endpoint -#. create Predictor instances that direct requests to a specific model - -Creating a multi-model archive file -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Creating an archive file that contains multiple SavedModels is simple, but involves a few -steps: - -- obtaining some models -- repackaging the models into a new archive file -- uploading the new archive to S3 - -**Obtaining model files** - -Let's imagine you have already run two Tensorflow training jobs in SageMaker, and they exported -SavedModels to ``s3://mybucket/models/model1.tar.gz`` and ``s3://mybucket/models/model2.tar.gz``. - -First, download the models and extract them: - -.. code:: bash - - aws s3 cp s3://mybucket/models/model1/model.tar.gz model1.tar.gz - aws s3 cp s3://mybucket/models/model2/model.tar.gz model2.tar.gz - mkdir -p multi/model1 - mkdir -p multi/model2 - - tar xvf model1.tar.gz -C ./multi/model1 - tar xvf model2.tar.gz -C ./multi/model2 - -**Repackaging the models** - -Next, examine the directories in ``multi``. If you trained the models using SageMaker's TensorFlow containers, -you are likely to have ``./multi/model1/export/Servo/...`` and ``./multi/model2/export/Servo/...``. In both cases, -"Servo" is the base name for the SaveModel files. When serving multiple models, each model needs a unique -basename, so one or both of these will need to be changed. The ``/export/`` part of the path isn't needed -either, so you can simplify the layout at the same time: - -.. code:: bash - - mv multi/model1/export/Servo/* multi/model1/ - mv multi/model2/export/Servo/* multi/model2/ - rm -fr multi/model1/export - rm -fr multi/model2/export - -You should now have a directory structure like this: - -:: - - └── multi - ├── model1 - │   └── - │   ├── saved_model.pb - │   └── variables - │   └── ... - └── model2 - └── - ├── saved_model.pb - └── variables - └── ... - -To repackage the files into a new archive, use ``tar`` again: - -.. code:: bash - - tar -C "$PWD/multi/" -czvf multi.tar.gz multi/ - -The ``multi.tar.gz`` file is now ready to use. - -**Uploading the new archive to S3** - -.. code:: bash - - aws s3 cp multi.tar.gz s3://mybucket/models/multi.tar.gz - -Creating and Deploying a SageMaker Model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -For the remaining steps, let's return to python code using the SageMaker Python SDK. - -.. 
code:: python - - from sagemaker.tensorflow import TensorFlowModel, TensorFlowPredictor - - # change this to the name or ARN of your SageMaker execution role - role = 'SageMakerRole' - - model_data = 's3://mybucket/models/multi.tar.gz' - - # For multi-model endpoints, you should set the default model name in - # an environment variable. If it isn't set, the endpoint will work, - # but the model it will select as default is unpredictable. - env = { - 'SAGEMAKER_TFS_DEFAULT_MODEL_NAME': 'model1' - } - - model = Model(model_data=model_data, role=role, framework_version='1.11', env=env) - predictor = model.deploy(initial_instance_count=1, instance_type='ml.c5.xlarge') - -The ``predictor`` object returned by the deploy function is ready to use to make predictions -using the default model (``model1`` in this example). - -Creating Predictor instances for different models -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The ``predictor`` returned by the ``model.deploy(...)`` function can only send requests to -the default model. To use other models deployed to the same Endpoint, you need to create -additional ``Predictor`` instances. Here's how: - -.. code:: python - - # ... continuing from the previous example - - # get the endpoint name from the default predictor - endpoint = predictor.endpoint_name - - # get a predictor for 'model2' - model2_predictor = Predictor(endpoint, model_name='model2') - - # note: that will for actual SageMaker endpoints, but if you are using - # local mode you need to create the new Predictor this way: - # - # model2_predictor = Predictor(endpoint, model_name='model2' - # sagemaker_session=predictor.sagemaker_session) - - - # result is prediction from 'model2' - result = model2_predictor.predict(...) - -Making predictions with the AWS CLI -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The SageMaker Python SDK is not the only way to access your Endpoint. The AWS CLI is simple to -use and a convenient way to test your endpoint. Here are a few examples that show how to use -different features of SageMaker TensorFlow Serving Endpoints using the CLI. - -Note: The ``invoke-endpoint`` command usually writes prediction results to a file. In the examples -below, the ``>(cat) 1>/dev/null`` part is a shell trick to redirect the result to stdout so it -can be seen. - -.. 
code:: bash - - # TensorFlow Serving REST API - predict request - aws sagemaker-runtime invoke-endpoint \ - --endpoint-name my-endpoint \ - --content-type 'application/json' \ - --body '{"instances": [1.0, 2.0, 5.0]}' \ - >(cat) 1>/dev/null - - # Predict request for specific model name - aws sagemaker-runtime invoke-endpoint \ - --endpoint-name my-endpoint \ - --content-type 'application/json' \ - --body '{"instances": [1.0, 2.0, 5.0]}' \ - --custom-attributes 'tfs-model-name=other_model' \ - >(cat) 1>/dev/null - - # TensorFlow Serving REST API - regress request - aws sagemaker-runtime invoke-endpoint \ - --endpoint-name my-endpoint \ - --content-type 'application/json' \ - --body '{"signature_name": "tensorflow/serving/regress","examples": [{"x": 1.0}]}' \ - --custom-attributes 'tfs-method=regress' \ - >(cat) 1>/dev/null - - # Simple json request (2 instances) - aws sagemaker-runtime invoke-endpoint \ - --endpoint-name my-endpoint \ - --content-type 'application/json' \ - --body '[[1.0, 2.0, 5.0],[2.0, 3.0, 4.0]]' \ - >(cat) 1>/dev/null - - # CSV request (2 rows) - aws sagemaker-runtime invoke-endpoint \ - --endpoint-name my-endpoint \ - --content-type 'text/csv' \ - --body "1.0,2.0,5.0"$'\n'"2.0,3.0,4.0" \ - >(cat) 1>/dev/null - - # Line delimited JSON from an input file - aws sagemaker-runtime invoke-endpoint \ - --endpoint-name my-endpoint \ - --content-type 'application/jsons' \ - --body "$(cat input.jsons)" \ - results.json diff --git a/doc/frameworks/tensorflow/index.rst b/doc/frameworks/tensorflow/index.rst deleted file mode 100644 index 46f535ba1d..0000000000 --- a/doc/frameworks/tensorflow/index.rst +++ /dev/null @@ -1,17 +0,0 @@ -########## -TensorFlow -########## - -A managed environment for TensorFlow training and hosting on Amazon SageMaker - -.. toctree:: - :maxdepth: 1 - - using_tf - deploying_tensorflow_serving - upgrade_from_legacy - -.. toctree:: - :maxdepth: 2 - - sagemaker.tensorflow diff --git a/doc/frameworks/tensorflow/sagemaker.tensorflow.rst b/doc/frameworks/tensorflow/sagemaker.tensorflow.rst deleted file mode 100644 index c9187ffa04..0000000000 --- a/doc/frameworks/tensorflow/sagemaker.tensorflow.rst +++ /dev/null @@ -1,27 +0,0 @@ -TensorFlow -========== - - -TensorFlow Estimator --------------------- - -.. autoclass:: sagemaker.tensorflow.estimator.TensorFlow - :members: - :undoc-members: - :show-inheritance: - -TensorFlow Serving Model ------------------------- - -.. autoclass:: sagemaker.tensorflow.model.TensorFlowModel - :members: - :undoc-members: - :show-inheritance: - -TensorFlow Serving Predictor ----------------------------- - -.. autoclass:: sagemaker.tensorflow.model.TensorFlowPredictor - :members: - :undoc-members: - :show-inheritance: diff --git a/doc/frameworks/tensorflow/upgrade_from_legacy.rst b/doc/frameworks/tensorflow/upgrade_from_legacy.rst deleted file mode 100644 index 143a1180d0..0000000000 --- a/doc/frameworks/tensorflow/upgrade_from_legacy.rst +++ /dev/null @@ -1,253 +0,0 @@ -###################################### -Upgrade from Legacy TensorFlow Support -###################################### - -With version 2.0 and later of the SageMaker Python SDK, support for legacy SageMaker TensorFlow images has been deprecated. -This guide explains how to upgrade your SageMaker Python SDK usage. - -For more information about using TensorFlow with the SageMaker Python SDK, see `Use TensorFlow with the SageMaker Python SDK `_. - -.. 
contents:: - -******************************************** -What Constitutes "Legacy TensorFlow Support" -******************************************** - -This guide is relevant if one of the following applies: - -#. You are using TensorFlow versions 1.4-1.10 -#. You are using TensorFlow versions 1.11-1.12 with Python 2, and - - - you do *not* have ``script_mode=True`` when creating your estimator - - you are using ``sagemaker.tensorflow.model.TensorFlowModel`` and/or ``sagemaker.tensorflow.model.TensorFlowPredictor`` - -#. You are using a pre-built SageMaker image whose URI looks like ``520713654638.dkr.ecr..amazonaws.com/sagemaker-tensorflow:`` - -If one of the above applies, then keep reading. - -************** -How to Upgrade -************** - -We recommend that you use the latest supported version of TensorFlow because that's where we focus our development efforts. -For information about supported versions of TensorFlow, see the `AWS documentation `_. - -For general information about using TensorFlow with the SageMaker Python SDK, see `Use TensorFlow with the SageMaker Python SDK `_. - -Training Script -=============== - -Newer versions of TensorFlow require your training script to be runnable as a command-line script, similar to what you might run outside of SageMaker. For more information, including how to adapt a locally-runnable script, see `Prepare a Training Script `_. - -In addition, your training script needs to save your model. If you have your own ``serving_input_fn`` implementation, then that can be passed to an exporter: - -.. code:: python - - import tensorflow as tf - - exporter = tf.estimator.LatestExporter("Servo", serving_input_receiver_fn=serving_input_fn) - -For an example of how to repackage your legacy TensorFlow training script for use with a newer version of TensorFlow, -see `this example notebook `_. - -Inference Script -================ - -Newer versions of TensorFlow Serving require a different format for the inference script. Some key differences: - -- The script must be named ``inference.py``. -- ``input_fn`` has been replaced by ``input_handler``. -- ``output_fn`` has been replaced by ``output_handler``. - -Like with the legacy versions, the pre-built SageMaker TensorFlow Serving images do have default implementations for pre- and post-processing. - -For more information about implementing your own handlers, see `How to implement the pre- and/or post-processing handler(s) `_. - -***************************** -Continue with Legacy Versions -***************************** - -While not recommended, you can still use a legacy TensorFlow version with version 2.0 and later of the SageMaker Python SDK. -In order to do so, you need to change how a few parameters are defined. - -Training -======== - -When creating an estimator, the Python SDK version 2.0 and later requires the following changes: - -#. Explicitly specify the ECR image URI via ``image_uri``. - To determine the URI, you can use :func:`sagemaker.fw_utils.create_image_uri`. -#. Specify ``model_dir=False``. -#. Use hyperparameters for ``training_steps``, ``evaluation_steps``, ``checkpoint_path``, and ``requirements_file``. - -For example, if using TF 1.10.0 with an ml.m4.xlarge instance in us-west-2, -the difference in code would be as follows: - -.. code:: python - - from sagemaker.tensorflow import TensorFlow - - # v1.x - estimator = TensorFlow( - ... 
- source_dir="code", - framework_version="1.10.0", - train_instance_type="ml.m4.xlarge", - training_steps=100, - evaluation_steps=10, - checkpoint_path="s3://bucket/path", - requirements_file="requirements.txt", - ) - - # v2.0 and later - estimator = TensorFlow( - ... - source_dir="code", - framework_version="1.10.0", - py_version="py2", - instance_type="ml.m4.xlarge", - image_uri="520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow:1.10.0-cpu-py2", - hyperparameters={ - "training_steps": 100, - "evaluation_steps": 10, - "checkpoint_path": "s3://bucket/path", - "sagemaker_requirements": "requirements.txt", - }, - model_dir=False, - ) - -Requirements File with Training -------------------------------- - -To provide a requirements file, define a hyperparameter named "sagemaker_requirements" that contains the relative path to the requirements file from ``source_dir``. - -Inference -========= - -Using a legacy TensorFlow version for endpoints and batch transform can be achieved with version 2.0 and later of the SageMaker Python SDK with some minor changes to your code. - -From an Estimator ------------------ - -If you are starting with a training job, you can call :func:`sagemaker.estimator.EstimatorBase.deploy` or :func:`sagemaker.tensorflow.estimator.Estimator.transformer` from your estimator for inference. - -To specify the number of model server workers, you need to set it through an environment variable named ``MODEL_SERVER_WORKERS``: - -.. code:: python - - # v1.x - estimator.deploy(..., model_server_workers=4) - - # v2.0 and later - estimator.deploy(..., env={"MODEL_SERVER_WORKERS": 4}) - -From a Trained Model --------------------- - -If you are starting with a trained model, the Python SDK version 2.0 and later requires the following changes: - -#. Use the the :class:`sagemaker.model.FrameworkModel` class. -#. Explicitly specify the ECR image URI via ``image_uri``. - To determine the URI, you can use :func:`sagemaker.fw_utils.create_image_uri`. -#. Use an environment variable for ``model_server_workers``. - -For example, if using TF 1.10.0 with a CPU instance in us-west-2, -the difference in code would be as follows: - -.. code:: python - - # v1.x - from sagemaker.tensorflow import TensorFlowModel - - model = TensorFlowModel( - ... - py_version="py2", - framework_version="1.10.0", - model_server_workers=4, - ) - - # v2.0 and later - from sagemaker.model import FrameworkModel - - model = FrameworkModel( - ... - image_uri="520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow:1.10.0-cpu-py2", - env={"MODEL_SERVER_WORKERS": 4}, - ) - -Requirements File with Inference --------------------------------- - -To provide a requirements file, define an environment variable named ``SAGEMAKER_REQUIREMENTS`` that contains the relative path to the requirements file from ``source_dir``. - -From an estimator: - -.. code:: python - - # for an endpoint - estimator.deploy(..., env={"SAGEMAKER_REQUIREMENTS": "requirements.txt"}) - - # for batch transform - estimator.transformer(..., env={"SAGEMAKER_REQUIREMENTS": "requirements.txt"}) - -From a model: - -.. code:: python - - from sagemaker.model import FrameworkModel - - model = FrameworkModel( - ... - source_dir="code", - env={"SAGEMAKER_REQUIREMENTS": "requirements.txt"}, - ) - - -Predictors ----------- - -If you want to use your model for endpoints, then you can use the :class:`sagemaker.predictor.Predictor` class instead of the legacy ``sagemaker.tensorflow.TensorFlowPredictor`` class: - -.. 
code:: python - - from sagemaker.model import FrameworkModel - from sagemaker.predictor import Predictor - - model = FrameworkModel( - ... - predictor_cls=Predictor, - ) - - predictor = model.deploy(...) - -If you are using protobuf prediction data, then you need to serialize and deserialize the data yourself. - -For example: - -.. code:: python - - from google.protobuf import json_format - from protobuf_to_dict import protobuf_to_dict - from tensorflow.core.framework import tensor_pb2 - - # Serialize the prediction data - json_format.MessageToJson(data) - - # Get the prediction result - result = predictor.predict(data) - - # Deserialize the prediction result - protobuf_to_dict(json_format.Parse(result, tensor_pb2.TensorProto())) - -Otherwise, you can use the serializers and deserialzers available in the SageMaker Python SDK or write your own. - -For example, if you want to use JSON serialization and deserialization: - -.. code:: python - - from sagemaker.deserializers import JSONDeserializer - from sagemaker.serializers import JSONSerializer - - predictor = model.deploy(..., serializer=JSONSerializer(), deserializer=JSONDeserializer()) - - predictor.predict(data) diff --git a/doc/frameworks/tensorflow/using_tf.rst b/doc/frameworks/tensorflow/using_tf.rst deleted file mode 100644 index bd6cd36dcf..0000000000 --- a/doc/frameworks/tensorflow/using_tf.rst +++ /dev/null @@ -1,961 +0,0 @@ -############################################ -Use TensorFlow with the SageMaker Python SDK -############################################ - -With the SageMaker Python SDK, you can train and host TensorFlow models on Amazon SageMaker. - -For information about supported versions of TensorFlow, see the `AWS documentation `_. -We recommend that you use the latest supported version because that's where we focus our development efforts. - -For general information about using the SageMaker Python SDK, see :ref:`overview:Using the SageMaker Python SDK`. - -.. warning:: - Support for TensorFlow versions 1.4-1.10 has been deprecated. - For information on how to upgrade, see `Upgrade from Legacy TensorFlow Support `_. - -.. contents:: - -***************************** -Train a Model with TensorFlow -***************************** - -To train a TensorFlow model by using the SageMaker Python SDK: - -.. |create tf estimator| replace:: Create a ``sagemaker.tensorflow.TensorFlow estimator`` -.. _create tf estimator: #create-an-estimator - -.. |call fit| replace:: Call the estimator's ``fit`` method -.. _call fit: #call-the-fit-method - -1. `Prepare a training script <#prepare-a-training-script>`_ -2. |create tf estimator|_ -3. |call fit|_ - -Prepare a Training Script -========================= - -The training script is very similar to a training script you might run outside of SageMaker, but you can access useful properties about the training environment through various environment variables, including the following: - -* ``SM_MODEL_DIR``: A string that represents the local path where the training job writes the model artifacts to. - After training, artifacts in this directory are uploaded to S3 for model hosting. This is different than the ``model_dir`` - argument passed in your training script, which can be an S3 location. ``SM_MODEL_DIR`` is always set to ``/opt/ml/model``. -* ``SM_NUM_GPUS``: An integer representing the number of GPUs available to the host. -* ``SM_OUTPUT_DATA_DIR``: A string that represents the path to the directory to write output artifacts to. 
- Output artifacts might include checkpoints, graphs, and other files to save, but do not include model artifacts. - These artifacts are compressed and uploaded to S3 to an S3 bucket with the same prefix as the model artifacts. -* ``SM_CHANNEL_XXXX``: A string that represents the path to the directory that contains the input data for the specified channel. - For example, if you specify two input channels in the TensorFlow estimator's ``fit`` call, named 'train' and 'test', the environment variables ``SM_CHANNEL_TRAIN`` and ``SM_CHANNEL_TEST`` are set. - -For the exhaustive list of available environment variables, see the `SageMaker Containers documentation `_. - -A typical training script loads data from the input channels, configures training with hyperparameters, trains a model, and saves a model to ``SM_MODEL_DIR`` so that it can be deployed for inference later. -Hyperparameters are passed to your script as arguments and can be retrieved with an ``argparse.ArgumentParser`` instance. -For example, a training script might start with the following: - -.. code:: python - - import argparse - import os - - if __name__ =='__main__': - - parser = argparse.ArgumentParser() - - # hyperparameters sent by the client are passed as command-line arguments to the script. - parser.add_argument('--epochs', type=int, default=10) - parser.add_argument('--batch_size', type=int, default=100) - parser.add_argument('--learning_rate', type=float, default=0.1) - - # input data and model directories - parser.add_argument('--model_dir', type=str) - parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN')) - parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST')) - - args, _ = parser.parse_known_args() - - # ... load from args.train and args.test, train a model, write model to args.model_dir. - -Because the SageMaker imports your training script, putting your training launching code in a main guard (``if __name__=='__main__':``) -is good practice. - -Note that SageMaker doesn't support argparse actions. -For example, if you want to use a boolean hyperparameter, specify ``type`` as ``bool`` in your script and provide an explicit ``True`` or ``False`` value for this hyperparameter when you create the TensorFlow estimator. - -For a complete example of a TensorFlow training script, see `mnist.py `__. - - -Adapting your local TensorFlow script -------------------------------------- - -If you have a TensorFlow training script that runs outside of SageMaker, do the following to adapt the script to run in SageMaker: - -1. Make sure your script can handle ``--model_dir`` as an additional command line argument. If you did not specify a -location when you created the TensorFlow estimator, an S3 location under the default training job bucket is used. -Distributed training with parameter servers requires you to use the ``tf.estimator.train_and_evaluate`` API and -to provide an S3 location as the model directory during training. Here is an example: - -.. code:: python - - estimator = tf.estimator.Estimator(model_fn=my_model_fn, model_dir=args.model_dir) - ... - train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=1000) - eval_spec = tf.estimator.EvalSpec(eval_input_fn) - tf.estimator.train_and_evaluate(mnist_classifier, train_spec, eval_spec) - -2. Load input data from the input channels. The input channels are defined when ``fit`` is called. For example: - -.. 
code:: python - - estimator.fit({'train':'s3://my-bucket/my-training-data', - 'eval':'s3://my-bucket/my-evaluation-data'}) - -In your training script the channels will be stored in environment variables ``SM_CHANNEL_TRAIN`` and -``SM_CHANNEL_EVAL``. You can add them to your argument parsing logic like this: - -.. code:: python - - parser = argparse.ArgumentParser() - parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN')) - parser.add_argument('--eval', type=str, default=os.environ.get('SM_CHANNEL_EVAL')) - -3. Export your final model to path stored in environment variable ``SM_MODEL_DIR`` which should always be - ``/opt/ml/model``. At end of training SageMaker will upload the model file under ``/opt/ml/model`` to - ``output_path``. - - -Use third-party libraries -------------------------- - -If there are other packages you want to use with your script, you can use a ``requirements.txt`` to install other dependencies at runtime. - -For training, support for installing packages using ``requirements.txt`` varies by TensorFlow version as follows: - -- For TensorFlow 1.15.2 with Python 3.7 or newer, and TensorFlow 2.2 or newer: - - Include a ``requirements.txt`` file in the same directory as your training script. - - You must specify this directory using the ``source_dir`` argument when creating a TensorFlow estimator. -- For TensorFlow versions 1.11-1.15.2, 2.0-2.1 with Python 2.7 or 3.6: - - Write a shell script for your entry point that first calls ``pip install -r requirements.txt``, then runs your training script. - - For an example of using shell scripts, see `this example notebook `__. -- For legacy versions of TensorFlow: - - See `Upgrade from Legacy TensorFlow Support `_. - -For serving, support for installing packages using ``requirements.txt`` varies by TensorFlow version as follows: - -- For TensorFlow 1.11 or newer: - - Include a ``requirements.txt`` file in the ``code`` directory. -- For legacy versions of TensorFlow: - - See `Upgrade from Legacy TensorFlow Support `_. - -A ``requirements.txt`` file is a text file that contains a list of items that are installed by using ``pip install``. -You can also specify the version of an item to install. -For information about the format of a ``requirements.txt`` file, see `Requirements Files `__ in the pip documentation. - - -Create an Estimator -=================== - -After you create your training script, create an instance of the :class:`sagemaker.tensorflow.TensorFlow` estimator. - -To use Python 3.7, please specify both of the args: - -- ``py_version='py37'`` -- ``framework_version='1.15.2'`` - -.. code:: python - - from sagemaker.tensorflow import TensorFlow - - tf_estimator = TensorFlow( - entry_point="tf-train.py", - role="SageMakerRole", - instance_count=1, - instance_type="ml.p2.xlarge", - framework_version="2.2", - py_version="py37", - ) - tf_estimator.fit("s3://bucket/path/to/training/data") - -Where the S3 url is a path to your training data within Amazon S3. -The constructor keyword arguments define how SageMaker runs your training script. - -Specify a Docker image using an Estimator ------------------------------------------ - -There are use cases, such as extending an existing pre-built Amazon SageMaker images, that require specifing a Docker image when creating an Estimator by directly specifying the ECR URI instead of the Python and framework version. 
For a full list of available container URIs, see `Available Deep Learning Containers Images `__
-For more information on using Docker containers, see `Use Your Own Algorithms or Models with Amazon SageMaker `__.
-
-When specifying the image, you must use the ``image_name=''`` arg to replace the following arg:
-
-- ``py_version=''``
-
-You should still specify the ``framework_version=''`` arg because the SageMaker Python SDK accounts for differences in the images based on the version.
-
-The following example uses the ``image_name=''`` arg to specify the container image, Python version, and framework version.
-
-.. code:: python
-
-    tf_estimator = TensorFlow(entry_point='tf-train.py',
-                              role='SageMakerRole',
-                              train_instance_count=1,
-                              train_instance_type='ml.p2.xlarge',
-                              image_name='763104351884.dkr.ecr..amazonaws.com/-:---ubuntu18.04',
-                              script_mode=True)
-
-For more information about the sagemaker.tensorflow.TensorFlow estimator, see `SageMaker TensorFlow Classes`_.
-
-Call the fit Method
-===================
-
-You start your training script by calling the ``fit`` method on a ``TensorFlow`` estimator.
-
-Calling ``fit`` starts a SageMaker training job. The training job does the following:
-
-- Starts ``instance_count`` EC2 instances of the type ``instance_type``.
-- On each instance, it performs the following steps:
-
-  - starts a Docker container optimized for TensorFlow.
-  - downloads the dataset.
-  - sets up training-related environment variables.
-  - sets up the distributed training environment, if configured to use a parameter server.
-  - starts asynchronous training.
-
-If the ``wait=False`` flag is passed to ``fit``, then it returns immediately. The training job continues running
-asynchronously. Later, a TensorFlow estimator can be obtained by attaching to the existing training job.
-If the training job is not finished, attaching shows the standard output of training and waits until it completes.
-After attaching, the estimator can be deployed as usual.
-
-.. code:: python
-
-    tf_estimator.fit(your_input_data, wait=False)
-    training_job_name = tf_estimator.latest_training_job.name
-
-    # after some time, or in a separate Python notebook, we can attach to it again.
-
-    tf_estimator = TensorFlow.attach(training_job_name=training_job_name)
-
-For more information about the options available for ``fit``, see the `API documentation `_.
-
-Distributed Training
-====================
-
-To run your training job with multiple instances in a distributed fashion, set ``instance_count``
-to a number larger than 1. We support two types of distributed training: parameter server and Horovod.
-The ``distribution`` parameter is used to configure which distributed training strategy to use.
-
-Training with parameter servers
--------------------------------
-
-If you specify ``parameter_server`` as the value of the ``distribution`` parameter, the container launches a parameter server
-thread on each instance in the training cluster, and then executes your training code. You can find more information on
-TensorFlow distributed training at `TensorFlow docs `__.
-To enable parameter server training:
-
-..
code:: python - - from sagemaker.tensorflow import TensorFlow - - tf_estimator = TensorFlow( - entry_point="tf-train.py", - role="SageMakerRole", - instance_count=2, - instance_type="ml.p2.xlarge", - framework_version="2.2", - py_version="py37", - distribution={"parameter_server": {"enabled": True}}, - ) - tf_estimator.fit("s3://bucket/path/to/training/data") - -Training with Horovod ---------------------- - -Horovod is a distributed training framework based on MPI. Horovod is only available with TensorFlow version ``1.12`` or newer. -You can find more details at `Horovod README `__. - -The container sets up the MPI environment and executes the ``mpirun`` command, enabling you to run any Horovod -training script. - -Training with ``MPI`` is configured by specifying following fields in ``distribution``: - -- ``enabled (bool)``: If set to ``True``, the MPI setup is performed and ``mpirun`` command is executed. -- ``processes_per_host (int)``: Number of processes MPI should launch on each host. Note, this should not be - greater than the available slots on the selected instance type. This flag should be set for the multi-cpu/gpu - training. -- ``custom_mpi_options (str)``: Any ``mpirun`` flag(s) can be passed in this field that will be added to the ``mpirun`` - command executed by SageMaker to launch distributed horovod training. - - -In the below example we create an estimator to launch Horovod distributed training with 4 processes on one host: - -.. code:: python - - from sagemaker.tensorflow import TensorFlow - - tf_estimator = TensorFlow( - entry_point="tf-train.py", - role="SageMakerRole", - instance_count=1, - instance_type="ml.p3.8xlarge", - framework_version="2.1.0", - py_version="py3", - distribution={ - "mpi": { - "enabled": True, - "processes_per_host": 4, - "custom_mpi_options": "--NCCL_DEBUG INFO" - }, - }, - ) - tf_estimator.fit("s3://bucket/path/to/training/data") - - -Training with Pipe Mode using PipeModeDataset -============================================= - -Amazon SageMaker allows users to create training jobs using Pipe input mode. -With Pipe input mode, your dataset is streamed directly to your training instances instead of being downloaded first. -This means that your training jobs start sooner, finish quicker, and need less disk space. - -SageMaker TensorFlow provides an implementation of ``tf.data.Dataset`` that makes it easy to take advantage of Pipe -input mode in SageMaker. You can replace your ``tf.data.Dataset`` with a ``sagemaker_tensorflow.PipeModeDataset`` to -read TFRecords as they are streamed to your training instances. - -In your ``entry_point`` script, you can use ``PipeModeDataset`` like a ``Dataset``. In this example, we create a -``PipeModeDataset`` to read TFRecords from the 'training' channel: - - -.. code:: python - - from sagemaker_tensorflow import PipeModeDataset - - features = { - 'data': tf.FixedLenFeature([], tf.string), - 'labels': tf.FixedLenFeature([], tf.int64), - } - - def parse(record): - parsed = tf.parse_single_example(record, features) - return ({ - 'data': tf.decode_raw(parsed['data'], tf.float64) - }, parsed['labels']) - - def train_input_fn(training_dir, hyperparameters): - ds = PipeModeDataset(channel='training', record_format='TFRecord') - ds = ds.repeat(20) - ds = ds.prefetch(10) - ds = ds.map(parse, num_parallel_calls=10) - ds = ds.batch(64) - return ds - - -To run training job with Pipe input mode, pass in ``input_mode='Pipe'`` to your TensorFlow Estimator: - - -.. 
code:: python - - from sagemaker.tensorflow import TensorFlow - - tf_estimator = TensorFlow( - entry_point="tf-train-with-pipemodedataset.py", - role="SageMakerRole", - training_steps=10000, - evaluation_steps=100, - instance_count=1, - instance_type="ml.p2.xlarge", - framework_version="1.10.0", - py_version="py3", - input_mode="Pipe", - ) - - tf_estimator.fit("s3://bucket/path/to/training/data") - - -If your TFRecords are compressed, you can train on Gzipped TF Records by passing in ``compression='Gzip'`` to the call to -``fit()``, and SageMaker will automatically unzip the records as data is streamed to your training instances: - -.. code:: python - - from sagemaker.inputs import TrainingInput - - train_s3_input = TrainingInput('s3://bucket/path/to/training/data', compression='Gzip') - tf_estimator.fit(train_s3_input) - - -You can learn more about ``PipeModeDataset`` in the sagemaker-tensorflow-extensions repository: https://github.com/aws/sagemaker-tensorflow-extensions - - -Training with MKL-DNN disabled -============================== - -SageMaker TensorFlow CPU images use TensorFlow built with Intel® MKL-DNN optimization. - -In certain cases you might be able to get a better performance by disabling this optimization -(for example when using small models). - -You can disable MKL-DNN optimization for TensorFlow ``1.8.0`` and above by setting two following environment variables: - -.. code:: python - - import os - - os.environ['TF_DISABLE_MKL'] = '1' - os.environ['TF_DISABLE_POOL_ALLOCATOR'] = '1' - -******************************** -Deploy TensorFlow Serving models -******************************** - -After a TensorFlow estimator has been fit, it saves a TensorFlow SavedModel in -the S3 location defined by ``output_path``. You can call ``deploy`` on a TensorFlow -estimator to create a SageMaker Endpoint, or you can call ``transformer`` to create a ``Transformer`` that you can use to run a batch transform job. - -Your model will be deployed to a TensorFlow Serving-based server. The server provides a super-set of the -`TensorFlow Serving REST API `_. - - -Deploy to a SageMaker Endpoint -============================== - -Deploying from an Estimator ---------------------------- - -After a TensorFlow estimator has been fit, it saves a TensorFlow -`SavedModel `_ bundle in -the S3 location defined by ``output_path``. You can call ``deploy`` on a TensorFlow -estimator object to create a SageMaker Endpoint: - -.. code:: python - - from sagemaker.tensorflow import TensorFlow - - estimator = TensorFlow( - entry_point="tf-train.py", - ..., - instance_count=1, - instance_type="ml.c4.xlarge", - framework_version="2.2", - py_version="py37", - ) - - estimator.fit(inputs) - - predictor = estimator.deploy(initial_instance_count=1, instance_type="ml.c5.xlarge") - - -The code block above deploys a SageMaker Endpoint with one instance of the type 'ml.c5.xlarge'. - -What happens when deploy is called -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Calling ``deploy`` starts the process of creating a SageMaker Endpoint. This process includes the following steps. - -- Starts ``initial_instance_count`` EC2 instances of the type ``instance_type``. -- On each instance, it will do the following steps: - - - start a Docker container optimized for TensorFlow Serving, see `SageMaker TensorFlow Serving containers `_. - - start a `TensorFlow Serving` process configured to run your model. - - start an HTTP server that provides access to TensorFlow Server through the SageMaker InvokeEndpoint API. 
- - -When the ``deploy`` call finishes, the created SageMaker Endpoint is ready for prediction requests. The -`Making predictions against a SageMaker Endpoint`_ section will explain how to make prediction requests -against the Endpoint. - -Deploying directly from model artifacts ---------------------------------------- - -If you already have existing model artifacts in S3, you can skip training and deploy them directly to an endpoint: - -.. code:: python - - from sagemaker.tensorflow import TensorFlowModel - - model = TensorFlowModel(model_data='s3://mybucket/model.tar.gz', role='MySageMakerRole') - - predictor = model.deploy(initial_instance_count=1, instance_type='ml.c5.xlarge') - -Python-based TensorFlow serving on SageMaker has support for `Elastic Inference `__, which allows for inference acceleration to a hosted endpoint for a fraction of the cost of using a full GPU instance. In order to attach an Elastic Inference accelerator to your endpoint provide the accelerator type to accelerator_type to your deploy call. - -.. code:: python - - from sagemaker.tensorflow import TensorFlowModel - - model = TensorFlowModel(model_data='s3://mybucket/model.tar.gz', role='MySageMakerRole') - - predictor = model.deploy(initial_instance_count=1, instance_type='ml.c5.xlarge', accelerator_type='ml.eia1.medium') - -Making predictions against a SageMaker Endpoint ------------------------------------------------ - -Once you have the ``Predictor`` instance returned by ``model.deploy(...)`` or ``estimator.deploy(...)``, you -can send prediction requests to your Endpoint. - -The following code shows how to make a prediction request: - -.. code:: python - - input = { - 'instances': [1.0, 2.0, 5.0] - } - result = predictor.predict(input) - -The result object will contain a Python dict like this: - -.. code:: python - - { - 'predictions': [3.5, 4.0, 5.5] - } - -The formats of the input and the output data correspond directly to the request and response formats -of the ``Predict`` method in the `TensorFlow Serving REST API `_. - -If your SavedModel includes the right ``signature_def``, you can also make Classify or Regress requests: - -.. code:: python - - # input matches the Classify and Regress API - input = { - 'signature_name': 'tensorflow/serving/regress', - 'examples': [{'x': 1.0}, {'x': 2.0}] - } - - result = predictor.regress(input) # or predictor.classify(...) - - # result contains: - { - 'results': [3.5, 4.0] - } - -You can include multiple ``instances`` in your predict request (or multiple ``examples`` in -classify/regress requests) to get multiple prediction results in one request to your Endpoint: - -.. code:: python - - input = { - 'instances': [ - [1.0, 2.0, 5.0], - [1.0, 2.0, 5.0], - [1.0, 2.0, 5.0] - ] - } - result = predictor.predict(input) - - # result contains: - { - 'predictions': [ - [3.5, 4.0, 5.5], - [3.5, 4.0, 5.5], - [3.5, 4.0, 5.5] - ] - } - -If your application allows request grouping like this, it is **much** more efficient than making separate requests. - -See `Deploying to TensorFlow Serving Endpoints `_ to learn how to deploy your model and make inference requests. - -Run a Batch Transform Job -========================= - -Batch transform allows you to get inferences for an entire dataset that is stored in an S3 bucket. - -For general information about using batch transform with the SageMaker Python SDK, see :ref:`overview:SageMaker Batch Transform`. 
-For information about SageMaker batch transform, see `Get Inferences for an Entire Dataset with Batch Transform ` in the AWS documentation.
-
-To run a batch transform job, you first create a ``Transformer`` object, and then call that object's ``transform`` method.
-
-Create a Transformer Object
----------------------------
-
-If you used an estimator to train your model, you can call the ``transformer`` method of the estimator to create a ``Transformer`` object.
-
-For example:
-
-.. code:: python
-
-    bucket = 'my-bucket'     # The name of the S3 bucket where the results are stored
-    prefix = 'batch-results' # The folder in the S3 bucket where the results are stored
-
-    batch_output = 's3://{}/{}/results'.format(bucket, prefix) # The location to store the results
-
-    tf_transformer = tf_estimator.transformer(instance_count=1, instance_type='ml.m4.xlarge', output_path=batch_output)
-
-To use a model trained outside of SageMaker, you can package the model as a SageMaker model, and call the ``transformer`` method of the SageMaker model.
-
-For example:
-
-.. code:: python
-
-    bucket = 'my-bucket'     # The name of the S3 bucket where the results are stored
-    prefix = 'batch-results' # The folder in the S3 bucket where the results are stored
-
-    batch_output = 's3://{}/{}/results'.format(bucket, prefix) # The location to store the results
-
-    tf_transformer = tensorflow_serving_model.transformer(instance_count=1, instance_type='ml.m4.xlarge', output_path=batch_output)
-
-For information about how to package a model as a SageMaker model, see :ref:`overview:BYO Model`.
-When you call the ``transformer`` method, you specify the type and number of instances to use for the batch transform job, and the location where the results are stored in S3.
-
-
-Call transform
---------------
-
-After you create a ``Transformer`` object, you call that object's ``transform`` method to start a batch transform job.
-For example:
-
-.. code:: python
-
-    batch_input = 's3://{}/{}/test/examples'.format(bucket, prefix) # The location of the input dataset
-
-    tf_transformer.transform(data=batch_input, data_type='S3Prefix', content_type='text/csv', split_type='Line')
-
-In the example, the content type is CSV, and each line in the dataset is treated as a record to get a prediction for.
-
-Batch Transform Supported Data Formats
---------------------------------------
-
-When you call the ``transform`` method to start a batch transform job,
-you specify the data format by providing a MIME type as the value for the ``content_type`` parameter.
-
-The following content formats are supported without custom input and output handling:
-
-* CSV - specify ``text/csv`` as the value of the ``content_type`` parameter.
-* JSON - specify ``application/json`` as the value of the ``content_type`` parameter.
-* JSON lines - specify ``application/jsonlines`` as the value of the ``content_type`` parameter.
-
-For detailed information about how TensorFlow Serving formats these data types for input and output, see :ref:`tensorflow-serving-input-output`.
-
-You can also accept any custom data format by writing input and output functions and including them in the ``inference.py`` file in your model.
-For information, see :ref:`custom-input-output`.
-
-
-..
_tensorflow-serving-input-output: - -TensorFlow Serving Input and Output -=================================== - -The following sections describe the data formats that TensorFlow Serving endpoints and batch transform jobs accept, -and how to write input and output functions to input and output custom data formats. - -Supported Formats ------------------ - -SageMaker's TensforFlow Serving endpoints can also accept some additional input formats that are not part of the -TensorFlow REST API, including a simplified json format, line-delimited json objects ("jsons" or "jsonlines"), and -CSV data. - -Simplified JSON Input -^^^^^^^^^^^^^^^^^^^^^ - -The Endpoint will accept simplified JSON input that doesn't match the TensorFlow REST API's Predict request format. -When the Endpoint receives data like this, it will attempt to transform it into a valid -Predict request, using a few simple rules: - -- python value, dict, or one-dimensional arrays are treated as the input value in a single 'instance' Predict request. -- multidimensional arrays are treated as a multiple values in a multi-instance Predict request. - -Combined with the client-side ``Predictor`` object's JSON serialization, this allows you to make simple -requests like this: - -.. code:: python - - input = [ - [1.0, 2.0, 5.0], - [1.0, 2.0, 5.0] - ] - result = predictor.predict(input) - - # result contains: - { - 'predictions': [ - [3.5, 4.0, 5.5], - [3.5, 4.0, 5.5] - ] - } - -Or this: - -.. code:: python - - # 'x' must match name of input tensor in your SavedModel graph - # for models with multiple named inputs, just include all the keys in the input dict - input = { - 'x': [1.0, 2.0, 5.0] - } - - # result contains: - { - 'predictions': [ - [3.5, 4.0, 5.5] - ] - } - - -Line-delimited JSON -^^^^^^^^^^^^^^^^^^^ - -The Endpoint will accept line-delimited JSON objects (also known as "jsons" or "jsonlines" data). -The Endpoint treats each line as a separate instance in a multi-instance Predict request. To use -this feature from your python code, you need to create a ``Predictor`` instance that does not -try to serialize your input to JSON: - -.. code:: python - - # create a Predictor without JSON serialization - - predictor = Predictor('endpoint-name', serializer=None, content_type='application/jsonlines') - - input = '''{'x': [1.0, 2.0, 5.0]} - {'x': [1.0, 2.0, 5.0]} - {'x': [1.0, 2.0, 5.0]}''' - - result = predictor.predict(input) - - # result contains: - { - 'predictions': [ - [3.5, 4.0, 5.5], - [3.5, 4.0, 5.5], - [3.5, 4.0, 5.5] - ] - } - -This feature is especially useful if you are reading data from a file containing jsonlines data. - -**CSV (comma-separated values)** - -The Endpoint will accept CSV data. Each line is treated as a separate instance. This is a -compact format for representing multiple instances of 1-d array data. To use this feature -from your python code, you need to create a ``Predictor`` instance that can serialize -your input data to CSV format: - -.. 
code:: python
-
-    # create a Predictor with CSV serialization
-
-    predictor = Predictor('endpoint-name', serializer=sagemaker.serializers.CSVSerializer())
-
-    # CSV-formatted string input
-    input = '1.0,2.0,5.0\n1.0,2.0,5.0\n1.0,2.0,5.0'
-
-    result = predictor.predict(input)
-
-    # result contains:
-    {
-        'predictions': [
-            [3.5, 4.0, 5.5],
-            [3.5, 4.0, 5.5],
-            [3.5, 4.0, 5.5]
-        ]
-    }
-
-You can also use Python arrays or numpy arrays as input and let the ``CSVSerializer`` object
-convert them to CSV, but the client-side CSV conversion is more sophisticated than the
-CSV parsing on the Endpoint, so if you encounter conversion problems, try using one of the
-JSON options instead.
-
-.. _custom-input-output:
-
-Create Python Scripts for Custom Input and Output Formats
----------------------------------------------------------
-
-You can add your customized Python code to process your input and output data.
-This customized Python code must be named ``inference.py`` and specified through the ``entry_point`` parameter:
-
-.. code::
-
-    from sagemaker.tensorflow import TensorFlowModel
-
-    model = TensorFlowModel(entry_point='inference.py',
-                            model_data='s3://mybucket/model.tar.gz',
-                            role='MySageMakerRole')
-
-How to implement the pre- and/or post-processing handler(s)
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Your entry point file must be named ``inference.py`` and should implement
-either a pair of ``input_handler`` and ``output_handler`` functions or
-a single ``handler`` function.
-Note that if the ``handler`` function is implemented, ``input_handler``
-and ``output_handler`` are ignored.
-
-To implement pre- and/or post-processing handler(s), use the Context
-object that the Python service creates. The Context object is a namedtuple with the following attributes:
-
-- ``model_name (string)``: the name of the model to use for
-  inference. For example, 'half-plus-three'
-
-- ``model_version (string)``: version of the model. For example, '5'
-
-- ``method (string)``: inference method. For example, 'predict',
-  'classify' or 'regress'. For more information on methods, see
-  `Classify and Regress
-  API `__
-  and `Predict
-  API `__
-
-- ``rest_uri (string)``: the TFS REST uri generated by the Python
-  service. For example,
-  'http://localhost:8501/v1/models/half_plus_three:predict'
-
-- ``grpc_uri (string)``: the GRPC port number generated by the Python
-  service. For example, '9000'
-
-- ``custom_attributes (string)``: content of
-  'X-Amzn-SageMaker-Custom-Attributes' header from the original
-  request. For example,
-  'tfs-model-name=half*plus*\ three,tfs-method=predict'
-
-- ``request_content_type (string)``: the original request content type,
-  defaulted to 'application/json' if not provided
-
-- ``accept_header (string)``: the original request accept type,
-  defaulted to 'application/json' if not provided
-
-- ``content_length (int)``: content length of the original request
-
-The following code example implements ``input_handler`` and
-``output_handler``. By providing these, the Python service posts the
-request to the TFS REST URI with the data pre-processed by ``input_handler``
-and passes the response to ``output_handler`` for post-processing.
-
-..
code:: - - import json - - def input_handler(data, context): - """ Pre-process request input before it is sent to TensorFlow Serving REST API - Args: - data (obj): the request data, in format of dict or string - context (Context): an object containing request and configuration details - Returns: - (dict): a JSON-serializable dict that contains request body and headers - """ - if context.request_content_type == 'application/json': - # pass through json (assumes it's correctly formed) - d = data.read().decode('utf-8') - return d if len(d) else '' - - if context.request_content_type == 'text/csv': - # very simple csv handler - return json.dumps({ - 'instances': [float(x) for x in data.read().decode('utf-8').split(',')] - }) - - raise ValueError('{{"error": "unsupported content type {}"}}'.format( - context.request_content_type or "unknown")) - - - def output_handler(data, context): - """Post-process TensorFlow Serving output before it is returned to the client. - Args: - data (obj): the TensorFlow serving response - context (Context): an object containing request and configuration details - Returns: - (bytes, string): data to return to client, response content type - """ - if data.status_code != 200: - raise ValueError(data.content.decode('utf-8')) - - response_content_type = context.accept_header - prediction = data.content - return prediction, response_content_type - -You might want to have complete control over the request. -For example, you might want to make a TFS request (REST or GRPC) to the first model, -inspect the results, and then make a request to a second model. In this case, implement -the ``handler`` method instead of the ``input_handler`` and ``output_handler`` methods, as demonstrated -in the following code: - -.. code:: - - import json - import requests - - - def handler(data, context): - """Handle request. - Args: - data (obj): the request data - context (Context): an object containing request and configuration details - Returns: - (bytes, string): data to return to client, (optional) response content type - """ - processed_input = _process_input(data, context) - response = requests.post(context.rest_uri, data=processed_input) - return _process_output(response, context) - - - def _process_input(data, context): - if context.request_content_type == 'application/json': - # pass through json (assumes it's correctly formed) - d = data.read().decode('utf-8') - return d if len(d) else '' - - if context.request_content_type == 'text/csv': - # very simple csv handler - return json.dumps({ - 'instances': [float(x) for x in data.read().decode('utf-8').split(',')] - }) - - raise ValueError('{{"error": "unsupported content type {}"}}'.format( - context.request_content_type or "unknown")) - - - def _process_output(data, context): - if data.status_code != 200: - raise ValueError(data.content.decode('utf-8')) - - response_content_type = context.accept_header - prediction = data.content - return prediction, response_content_type - -You can also bring in external dependencies to help with your data -processing. There are 2 ways to do this: - -1. If your model archive contains ``code/requirements.txt``, the container will install the Python dependencies at runtime using ``pip install -r``. - -.. code:: - - from sagemaker.tensorflow import TensorFlowModel - - model = TensorFlowModel(entry_point='inference.py', - dependencies=['requirements.txt'], - model_data='s3://mybucket/model.tar.gz', - role='MySageMakerRole') - - -2. 
If you are working in a network-isolation situation or if you don't - want to install dependencies at runtime every time your endpoint starts or a batch - transform job runs, you might want to put - pre-downloaded dependencies under a ``lib`` directory and this - directory as dependency. The container adds the modules to the Python - path. Note that if both ``lib`` and ``requirements.txt`` - are present in the model archive, the ``requirements.txt`` is ignored: - -.. code:: - - from sagemaker.tensorflow import TensorFlowModel - - model = TensorFlowModel(entry_point='inference.py', - dependencies=['/path/to/folder/named/lib'], - model_data='s3://mybucket/model.tar.gz', - role='MySageMakerRole') - -For more information, see: https://github.com/aws/sagemaker-tensorflow-serving-container#prepost-processing - -**************************** -SageMaker TensorFlow Classes -**************************** - -For information about the different TensorFlow-related classes in the SageMaker Python SDK, see https://sagemaker.readthedocs.io/en/stable/sagemaker.tensorflow.html. - -************************************** -SageMaker TensorFlow Docker containers -************************************** - -For information about the SageMaker TensorFlow containers, see: - -- `SageMaker TensorFlow training toolkit `_ -- `SageMaker TensorFlow serving toolkit `_ -- `Deep Learning Container (DLC) Dockerfiles for TensorFlow `_ -- `Deep Learning Container (DLC) Images `_ and `release notes `_ diff --git a/doc/frameworks/xgboost/index.rst b/doc/frameworks/xgboost/index.rst deleted file mode 100644 index d81a74cc09..0000000000 --- a/doc/frameworks/xgboost/index.rst +++ /dev/null @@ -1,9 +0,0 @@ -########################## -XGBoost -########################## - -.. toctree:: - :maxdepth: 1 - :glob: - - * diff --git a/doc/frameworks/xgboost/using_xgboost.rst b/doc/frameworks/xgboost/using_xgboost.rst deleted file mode 100644 index 8abdd589c0..0000000000 --- a/doc/frameworks/xgboost/using_xgboost.rst +++ /dev/null @@ -1,473 +0,0 @@ -######################################### -Use XGBoost with the SageMaker Python SDK -######################################### - -.. contents:: - -eXtreme Gradient Boosting (XGBoost) is a popular and efficient machine learning algorithm used for regression and classification tasks on tabular datasets. -It implements a technique known as gradient boosting on trees, which performs remarkably well in machine learning competitions. - -Amazon SageMaker supports two ways to use the XGBoost algorithm: - - * XGBoost built-in algorithm - * XGBoost open source algorithm - -The XGBoost open source algorithm provides the following benefits over the built-in algorithm: - -* Latest version - The open source XGBoost algorithm typically supports a more recent version of XGBoost. - To see the XGBoost version that is currently supported, - see `XGBoost SageMaker Estimators and Models `__. -* Flexibility - Take advantage of the full range of XGBoost functionality, such as cross-validation support. - You can add custom pre- and post-processing logic and run additional code after training. -* Scalability - The XGBoost open source algorithm has a more efficient implementation of distributed training, - which enables it to scale out to more instances and reduce out-of-memory errors. -* Extensibility - Because the open source XGBoost container is open source, - you can extend the container to install additional libraries and change the version of XGBoost that the container uses. 
-  For an example notebook that shows how to extend SageMaker containers, see `Extending our PyTorch containers `__.
-
-
-***********************************
-Use XGBoost as a Built-in Algorithm
-***********************************
-
-Amazon SageMaker provides XGBoost as a built-in algorithm that you can use like other built-in algorithms.
-Using the built-in algorithm version of XGBoost is simpler than using the open source version, because you don't have to write a training script.
-If you don't need the features and flexibility of open source XGBoost, consider using the built-in version.
-For information about using the Amazon SageMaker XGBoost built-in algorithm, see `XGBoost Algorithm `__
-in the *Amazon SageMaker Developer Guide*.
-
-*************************************
-Use the Open Source XGBoost Algorithm
-*************************************
-
-If you want the flexibility and additional features that open source XGBoost provides, use the SageMaker open source XGBoost algorithm.
-
-For the XGBoost versions that are supported, see `the AWS documentation `_.
-We recommend that you use the latest supported version because that's where we focus most of our development efforts.
-
-For a complete example of using the open source XGBoost algorithm, see the sample notebook at
-https://github.com/awslabs/amazon-sagemaker-examples/blob/master/introduction_to_amazon_algorithms/xgboost_abalone/xgboost_abalone_dist_script_mode.ipynb.
-
-For more information about XGBoost, see `the XGBoost documentation `_.
-
-Train a Model with Open Source XGBoost
-======================================
-
-To train a model by using the Amazon SageMaker open source XGBoost algorithm:
-
-.. |create xgboost estimator| replace:: Create a ``sagemaker.xgboost.XGBoost estimator``
-.. _create xgboost estimator: #create-an-estimator
-
-.. |call fit| replace:: Call the estimator's ``fit`` method
-.. _call fit: #call-the-fit-method
-
-1. `Prepare a training script <#prepare-a-training-script>`_
-2. |create xgboost estimator|_
-3. |call fit|_
-
-Prepare a Training Script
--------------------------
-
-A typical training script loads data from the input channels, configures training with hyperparameters, trains a model,
-and saves a model to ``model_dir`` so that it can be hosted later.
-Hyperparameters are passed to your script as arguments and can be retrieved with an ``argparse.ArgumentParser`` instance.
-For information about ``argparse.ArgumentParser``, see `argparse `__ in the Python documentation.
-
-
-For a complete example of an XGBoost training script, see https://github.com/awslabs/amazon-sagemaker-examples/blob/master/introduction_to_amazon_algorithms/xgboost_abalone/abalone.py.
-
-The training script is very similar to a training script you might run outside of Amazon SageMaker,
-but you can access useful properties about the training environment through various environment variables, including the following:
-
-* ``SM_MODEL_DIR``: A string that represents the path where the training job writes the model artifacts to.
-  After training, artifacts in this directory are uploaded to Amazon S3 for model hosting.
-* ``SM_NUM_GPUS``: An integer representing the number of GPUs available to the host.
-* ``SM_CHANNEL_XXXX``: A string that represents the path to the directory that contains the input data for the specified channel.
-  For example, if you specify two input channels in the XGBoost estimator's ``fit`` call, named 'train' and 'test', the environment variables ``SM_CHANNEL_TRAIN`` and ``SM_CHANNEL_TEST`` are set.
-* ``SM_HPS``: A JSON dump of the hyperparameters preserving JSON types (boolean, integer, etc.) - -For the exhaustive list of available environment variables, see the `SageMaker Containers documentation `__. - -.. important:: - The sagemaker-containers repository has been deprecated, - however it is still used to define Scikit-learn and XGBoost environment variables. - -Let's look at the main elements of the script. Starting with the ``__main__`` guard, -use a parser to read the hyperparameters passed to the estimator when creating the training job. -These hyperparameters are made available as arguments to our input script. -We also parse a number of Amazon SageMaker-specific environment variables to get information about the training environment, -such as the location of input data and location where we want to save the model. - -.. code:: python - - if __name__ == '__main__': - parser = argparse.ArgumentParser() - - # Hyperparameters are described here - parser.add_argument('--num_round', type=int) - parser.add_argument('--max_depth', type=int, default=5) - parser.add_argument('--eta', type=float, default=0.2) - parser.add_argument('--objective', type=str, default='reg:squarederror') - - # SageMaker specific arguments. Defaults are set in the environment variables. - parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR')) - parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN']) - parser.add_argument('--validation', type=str, default=os.environ['SM_CHANNEL_VALIDATION']) - - args = parser.parse_args() - - train_hp = { - 'max_depth': args.max_depth, - 'eta': args.eta, - 'gamma': args.gamma, - 'min_child_weight': args.min_child_weight, - 'subsample': args.subsample, - 'silent': args.silent, - 'objective': args.objective - } - - dtrain = xgb.DMatrix(args.train) - dval = xgb.DMatrix(args.validation) - watchlist = [(dtrain, 'train'), (dval, 'validation')] if dval is not None else [(dtrain, 'train')] - - callbacks = [] - prev_checkpoint, n_iterations_prev_run = add_checkpointing(callbacks) - # If checkpoint is found then we reduce num_boost_round by previously run number of iterations - - bst = xgb.train( - params=train_hp, - dtrain=dtrain, - evals=watchlist, - num_boost_round=(args.num_round - n_iterations_prev_run), - xgb_model=prev_checkpoint, - callbacks=callbacks - ) - - # Save the model to the location specified by ``model_dir`` - model_location = args.model_dir + '/xgboost-model' - pkl.dump(bst, open(model_location, 'wb')) - logging.info("Stored trained model at {}".format(model_location)) - -Create an Estimator -------------------- -After you create your training script, create an instance of the :class:`sagemaker.xgboost.estimator.XGBoost` estimator. -Pass an IAM role that has the permissions necessary to run an Amazon SageMaker training job, -the type and number of instances to use for the training job, -and a dictionary of the hyperparameters to pass to the training script. - -.. code:: - - from sagemaker.xgboost.estimator import XGBoost - - xgb_estimator = XGBoost( - entry_point="abalone.py", - hyperparameters=hyperparameters, - role=role, - instance_count=1, - instance_type="ml.m5.2xlarge", - framework_version="1.0-1", - ) - - -Call the fit Method -------------------- - -After you create an estimator, call the ``fit`` method to run the training job. - -.. 
code::
-
-    xgb_estimator.fit({"train": train_input})
-
-
-
-Deploy Open Source XGBoost Models
-=================================
-
-After you fit an XGBoost Estimator, you can host the newly created model in SageMaker.
-
-After you call ``fit``, you can call ``deploy`` on an ``XGBoost`` estimator to create a SageMaker endpoint.
-The endpoint runs a SageMaker-provided XGBoost model server and hosts the model produced by your training script,
-which was run when you called ``fit``. This was the model you saved to ``model_dir``.
-
-``deploy`` returns a ``Predictor`` object, which you can use to do inference on the Endpoint hosting your XGBoost model.
-Each ``Predictor`` provides a ``predict`` method which can do inference with numpy arrays, Python lists, or strings.
-The input arrays or lists are serialized and sent to the XGBoost model server, and ``predict`` returns the result of
-inference against your model.
-
-.. code::
-
-    from sagemaker.serializers import StringSerializer
-
-    serializer = StringSerializer(content_type="text/libsvm")
-
-    predictor = xgb_estimator.deploy(
-        initial_instance_count=1,
-        instance_type="ml.m5.xlarge",
-        serializer=serializer
-    )
-
-    with open("abalone") as f:
-        payload = f.read()
-
-    predictor.predict(payload)
-
-SageMaker XGBoost Model Server
-------------------------------
-
-You can configure two components of the SageMaker XGBoost model server: model loading and model serving.
-Model loading is the process of deserializing your saved model back into an XGBoost model.
-Model serving is the process of translating endpoint requests to inference calls on the loaded model.
-
-You configure the XGBoost model server by defining functions in the Python source file you passed to the XGBoost constructor.
-
-Load a Model
-^^^^^^^^^^^^
-
-Before a model can be served, it must be loaded. The SageMaker XGBoost model server loads your model by invoking a
-``model_fn`` function that you must provide in your script. The ``model_fn`` should have the following signature:
-
-.. code:: python
-
-    def model_fn(model_dir)
-
-SageMaker will inject the directory where your model files and sub-directories, saved by ``save``, have been mounted.
-Your model function should return an ``xgboost.Booster`` object that can be used for model serving.
-
-The following code-snippet shows an example ``model_fn`` implementation.
-It loads and returns a pickled XGBoost model from a ``xgboost-model`` file in the SageMaker model directory ``model_dir``.
-
-.. code:: python
-
-    import os
-    import pickle as pkl
-
-    def model_fn(model_dir):
-        with open(os.path.join(model_dir, "xgboost-model"), "rb") as f:
-            booster = pkl.load(f)
-        return booster
-
-Serve a Model
-^^^^^^^^^^^^^
-
-After the SageMaker model server has loaded your model by calling ``model_fn``, SageMaker will serve your model.
-The SageMaker XGBoost model server breaks request handling into three steps:
-
-- input processing,
-- prediction, and
-- output processing.
-
-In a similar way to model loading, you can customize the inference behavior by defining functions in your inference
-script, which can be either in the same file as your training script or in a separate file.
-
-Each step involves invoking a python function, with information about the request and the return value from the previous
-function in the chain.
-Inside the SageMaker XGBoost model server, the process looks like:
-
-..
code:: python - - # Deserialize the Invoke request body into an object we can perform prediction on - input_object = input_fn(request_body, request_content_type) - - # Perform prediction on the deserialized object, with the loaded model - prediction = predict_fn(input_object, model) - - # Serialize the prediction result into the desired response content type - output = output_fn(prediction, response_content_type) - -The above code-sample shows the three function definitions: - -- ``input_fn``: Takes request data and deserializes the data into an object for prediction. -- ``predict_fn``: Takes the deserialized request object and performs inference against the loaded model. -- ``output_fn``: Takes the result of prediction and serializes this according to the response content type. - -These functions are optional. -The SageMaker XGBoost model server provides default implementations of these functions. -You can provide your own implementations for these functions in your hosting script. -If you omit any definition then the SageMaker XGBoost model server will use its default implementation for that -function. - -In the following sections we describe the default implementations of ``input_fn``, ``predict_fn``, and ``output_fn``. -We describe the input arguments and expected return types of each, so you can define your own implementations. - -Process Input -""""""""""""" - -When a request is made against an endpoint running a SageMaker XGBoost model server, the model server receives two -pieces of information: - -- The request Content-Type, for example "application/x-npy" or "text/libsvm" -- The request data body, a byte array - -The SageMaker XGBoost model server will invoke an ``input_fn`` function in your inference script, passing in this -information. If you define an ``input_fn`` function definition, it should return an object that can be passed -to ``predict_fn`` and have the following signature: - -.. code:: python - - def input_fn(request_body, request_content_type) - -where ``request_body`` is a byte buffer and ``request_content_type`` is a Python string. - -The SageMaker XGBoost model server provides a default implementation of ``input_fn``. -This function deserializes CSV, LIBSVM, or protobuf recordIO into a ``xgboost.DMatrix``. - -Default csv deserialization requires ``request_body`` contain one or more lines of CSV numerical data. -The data is first loaded into a two-dimensional array, where each line break defines the boundaries of the first -dimension, and then it is converted to an `xgboost.Dmatrix`. It assumes that CSV input does not have the -label column. - -Default LIBSVM deserialization requires ``request_body`` to follow the `LIBSVM `_ format. - -The example below shows a custom ``input_fn`` for preparing pickled NumPy arrays. - -.. code:: python - - from io import BytesIO - import numpy as np - import xgboost as xgb - - def input_fn(request_body, request_content_type): - """An input_fn that loads a numpy array""" - if request_content_type == "application/npy": - array = np.load(BytesIO(request_body)) - return xgb.DMatrix(array) - else: - # Handle other content-types here or raise an Exception - # if the content type is not supported. - pass - -Get Predictions -""""""""""""""" - -After the inference request has been deserialized by ``input_fn``, the SageMaker XGBoost model server invokes -``predict_fn`` on the return value of ``input_fn``. - -As with ``input_fn``, you can define your own ``predict_fn`` or use the SageMaker XGBoost model server default. 
- -The ``predict_fn`` function has the following signature: - -.. code:: python - - def predict_fn(input_object, model) - -Where ``input_object`` is the object returned from ``input_fn`` and ``model`` is the model loaded by ``model_fn``. - -The default implementation of ``predict_fn`` invokes the loaded model's ``predict`` function on ``input_object``, -and returns the resulting value. The return-type should be a NumPy array to be compatible with the default -``output_fn``. - -The example below shows an overriden ``predict_fn`` that returns a two-dimensional NumPy array where -the first columns are predictions and the remaining columns are the feature contributions -(`SHAP values `_) for that prediction. -When ``pred_contribs`` is ``True`` in ``xgboost.Booster.predict()``, the output will be a matrix of size -(nsample, nfeats + 1) with each record indicating the feature contributions for that prediction. -Note the final column is the bias term. - -.. code:: python - - import numpy as np - - def predict_fn(input_data, model): - prediction = model.predict(input_data) - feature_contribs = model.predict(input_data, pred_contribs=True) - output = np.hstack((prediction[:, np.newaxis], feature_contribs)) - return output - -If you implement your own prediction function, you should take care to ensure that: - -- The first argument is expected to be the return value from input_fn. -- The second argument is the loaded model. -- The return value should be of the correct type to be passed as the first argument to ``output_fn``. - If you use the default ``output_fn``, this should be a NumPy array. - -Process Output -"""""""""""""" - -After invoking ``predict_fn``, the model server invokes ``output_fn``, passing in the return value from -``predict_fn`` and the requested response content-type. - -The ``output_fn`` has the following signature: - -.. code:: python - - def output_fn(prediction, content_type) - -``prediction`` is the result of invoking ``predict_fn`` and ``content_type`` is the requested response content-type. -The function should return a byte array of data serialized to ``content_type``. - -The default implementation expects ``prediction`` to be a NumPy array and can serialize the result to JSON, CSV, or NPY. -It accepts response content types of "application/json", "text/csv", and "application/x-npy". - -Bring Your Own Model --------------------- - -You can deploy an XGBoost model that you trained outside of SageMaker by using the Amazon SageMaker XGBoost container. -Typically, you save an XGBoost model by pickling the ``Booster`` object or calling ``booster.save_model``. -The XGBoost `built-in algorithm mode `_ -supports both a pickled ``Booster`` object and a model produced by ``booster.save_model``. -You can also deploy an XGBoost model by using XGBoost as a framework. -By using XGBoost as a framework, you have more flexibility. -To deploy an XGBoost model by using XGBoost as a framework, you need to: - -- Write an inference script. -- Create the XGBoostModel object. - -Write an Inference Script -^^^^^^^^^^^^^^^^^^^^^^^^^ - -You must create an inference script that implements (at least) the ``model_fn`` function that calls the loaded model to get a prediction. - -Optionally, you can also implement ``input_fn`` and ``output_fn`` to process input and output, -and ``predict_fn`` to customize how the model server gets predictions from the loaded model. -For information about how to write an inference script, see `SageMaker XGBoost Model Server <#sagemaker-xgboost-model-server>`_. 
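For example, a minimal ``inference.py`` for a bring-your-own XGBoost model might look like the sketch below. It assumes the model archive contains a pickled ``Booster`` saved as ``xgboost-model`` (matching the ``model_fn`` example earlier) and relies on the default ``input_fn`` and ``predict_fn``; the JSON-only ``output_fn`` shown here is just an illustration, not the model server's default implementation.

.. code:: python

    import json
    import os
    import pickle as pkl


    def model_fn(model_dir):
        # Load the pickled Booster from the extracted model archive.
        with open(os.path.join(model_dir, "xgboost-model"), "rb") as f:
            return pkl.load(f)


    def output_fn(prediction, content_type):
        # Serialize the NumPy predictions returned by the default predict_fn.
        # This sketch only handles JSON responses.
        if content_type == "application/json":
            return json.dumps({"predictions": prediction.tolist()})
        raise ValueError("Unsupported content type: {}".format(content_type))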
-Pass the filename of the inference script as the ``entry_point`` parameter when you create the `XGBoostModel` object. - -Create an XGBoostModel Object -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To create a model object, call the ``sagemaker.xgboost.model.XGBoostModel`` constructor, -and then call its ``deploy()`` method to deploy your model for inference. - -.. code:: python - - xgboost_model = XGBoostModel( - model_data="s3://my-bucket/my-path/model.tar.gz", - role="my-role", - entry_point="inference.py", - framework_version="1.0-1" - ) - - predictor = xgboost_model.deploy( - instance_type='ml.c4.xlarge', - initial_instance_count=1 - ) - - # If payload is a string in LIBSVM format, we need to change serializer. - predictor.serializer = str - predictor.predict("