diff --git a/.github/unittest/linux/scripts/environment.yml b/.github/unittest/linux/scripts/environment.yml
deleted file mode 100644
index 3283867e9bc..00000000000
--- a/.github/unittest/linux/scripts/environment.yml
+++ /dev/null
@@ -1,37 +0,0 @@
-channels:
-  - pytorch
-  - defaults
-dependencies:
-  - pip
-  - protobuf
-  - pip:
-    - hypothesis
-    - future
-    - cloudpickle
-    - pygame
-    - moviepy<2.0.0
-    - tqdm
-    - pytest
-    - pytest-cov
-    - pytest-mock
-    - pytest-instafail
-    - pytest-rerunfailures
-    - pytest-timeout
-    - pytest-asyncio
-    - expecttest
-    - pybind11[global]
-    - pyyaml
-    - scipy
-    - hydra-core
-    - tensorboard
-    - imageio==2.26.0
-    - wandb
-    - dm_control
-    - mujoco<3.3.6
-    - mlflow
-    - av
-    - coverage
-    - ray
-    - transformers
-    - ninja
-    - timm
diff --git a/.github/unittest/linux/scripts/post_process.sh b/.github/unittest/linux/scripts/post_process.sh
index e97bf2a7b1b..df82332cd84 100755
--- a/.github/unittest/linux/scripts/post_process.sh
+++ b/.github/unittest/linux/scripts/post_process.sh
@@ -1,6 +1,3 @@
 #!/usr/bin/env bash
 set -e
 
-
-eval "$(./conda/bin/conda shell.bash hook)"
-conda activate ./env
diff --git a/.github/unittest/linux/scripts/run_all.sh b/.github/unittest/linux/scripts/run_all.sh
index 94bb8a98e09..b1401a07457 100755
--- a/.github/unittest/linux/scripts/run_all.sh
+++ b/.github/unittest/linux/scripts/run_all.sh
@@ -6,30 +6,33 @@ set -v
 # =============================================================================== #
 # ================================ Init ========================================= #
 
-
 if [[ $OSTYPE != 'darwin'* ]]; then
-  apt-get update && apt-get upgrade -y
-  apt-get install -y vim git wget cmake
+  # Prevent interactive prompts (notably tzdata) in CI.
+  export DEBIAN_FRONTEND=noninteractive
+  export TZ="${TZ:-Etc/UTC}"
+  ln -snf "/usr/share/zoneinfo/${TZ}" /etc/localtime || true
+  echo "${TZ}" > /etc/timezone || true
+
+  apt-get update
+  apt-get install -y --no-install-recommends tzdata
+  dpkg-reconfigure -f noninteractive tzdata || true
 
-  # Enable universe repository
-  # apt-get install -y software-properties-common
-  # add-apt-repository universe
-  # apt-get update
+  apt-get upgrade -y
+  apt-get install -y vim git wget cmake curl python3-dev
 
-  # apt-get install -y libsdl2-dev libsdl2-2.0-0
+  # SDL2 and freetype needed for building pygame from source (Python 3.14+)
+  apt-get install -y libsdl2-dev libsdl2-2.0-0 libsdl2-mixer-dev libsdl2-image-dev libsdl2-ttf-dev
+  apt-get install -y libfreetype6-dev pkg-config
 
-  apt-get install -y libglfw3 libgl1-mesa-glx libosmesa6 libglew-dev
-  apt-get install -y libglvnd0 libgl1 libglx0 libegl1 libgles2 xvfb
+  apt-get install -y libglfw3 libosmesa6 libglew-dev
+  apt-get install -y libglvnd0 libgl1 libglx0 libglx-mesa0 libegl1 libgles2 xvfb
 
   if [ "${CU_VERSION:-}" == cpu ] ; then
-    # solves version `GLIBCXX_3.4.29' not found for tensorboard
-#    apt-get install -y gcc-4.9
     apt-get upgrade -y libstdc++6
     apt-get dist-upgrade -y
   else
     apt-get install -y g++ gcc
   fi
-
 fi
 
 this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
@@ -45,38 +48,26 @@ fi
 # Avoid error: "fatal: unsafe repository"
 git config --global --add safe.directory '*'
 root_dir="$(git rev-parse --show-toplevel)"
-conda_dir="${root_dir}/conda"
-env_dir="${root_dir}/env"
-lib_dir="${env_dir}/lib"
+env_dir="${root_dir}/venv"
 cd "${root_dir}"
 
-case "$(uname -s)" in
-    Darwin*) os=MacOSX;;
-    *) os=Linux
-esac
+# Install uv
+curl -LsSf https://astral.sh/uv/install.sh | sh
+export PATH="$HOME/.local/bin:$PATH"
 
-# 1. Install conda at ./conda
-if [ ! -d "${conda_dir}" ]; then
-    printf "* Installing conda\n"
-    wget -O miniconda.sh "http://repo.continuum.io/miniconda/Miniconda3-latest-${os}-x86_64.sh"
-    bash ./miniconda.sh -b -f -p "${conda_dir}"
-fi
-eval "$(${conda_dir}/bin/conda shell.bash hook)"
+# Create venv with uv
+printf "* Creating venv with Python ${PYTHON_VERSION}\n"
+uv venv --python "${PYTHON_VERSION}" "${env_dir}"
+source "${env_dir}/bin/activate"
+uv_pip_install() {
+  uv pip install --no-progress --python "${env_dir}/bin/python" "$@"
+}
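+# Note: pinning --python to the venv interpreter keeps every install below in
+# ${env_dir}, even if another Python resolves first on PATH in the CI image.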
 
-# 2. Create test environment at ./env
-printf "python: ${PYTHON_VERSION}\n"
-if [ ! -d "${env_dir}" ]; then
-    printf "* Creating a test environment\n"
-    conda create --prefix "${env_dir}" -y python="$PYTHON_VERSION"
-fi
-conda activate "${env_dir}"
-
-# 3. Install Conda dependencies
-printf "* Installing dependencies (except PyTorch)\n"
-echo "  - python=${PYTHON_VERSION}" >> "${this_dir}/environment.yml"
-cat "${this_dir}/environment.yml"
+# Verify CPython
+python -c "import sys; assert sys.implementation.name == 'cpython', f'Expected CPython, got {sys.implementation.name}'"
 
+# Set environment variables
 if [ "${CU_VERSION:-}" == cpu ] ; then
   export MUJOCO_GL=glfw
 else
@@ -84,37 +75,97 @@ else
 fi
 
 export SDL_VIDEODRIVER=dummy
+export PYOPENGL_PLATFORM=$MUJOCO_GL
+export DISPLAY=:99
+export LAZY_LEGACY_OP=False
+export RL_LOGGING_LEVEL=INFO
+export TOKENIZERS_PARALLELISM=true
+export MAX_IDLE_COUNT=1000
+export MKL_THREADING_LAYER=GNU
+export CKPT_BACKEND=torch
+export BATCHED_PIPE_TIMEOUT=60
 
-# legacy from bash scripts: remove?
-conda env config vars set \
-  MAX_IDLE_COUNT=1000 \
-  MUJOCO_GL=$MUJOCO_GL PYOPENGL_PLATFORM=$MUJOCO_GL DISPLAY=:99 SDL_VIDEODRIVER=dummy LAZY_LEGACY_OP=False RL_LOGGING_LEVEL=INFO TOKENIZERS_PARALLELISM=true
-
-pip3 install pip --upgrade
-pip install virtualenv
+# ==================================================================================== #
+# ================================ Install dependencies ============================== #
+
+printf "* Installing dependencies\n"
+
+# Install base dependencies
+uv_pip_install \
+  hypothesis \
+  future \
+  cloudpickle \
+  pygame \
+  "moviepy<2.0.0" \
+  tqdm \
+  pytest \
+  pytest-cov \
+  pytest-mock \
+  pytest-instafail \
+  pytest-rerunfailures \
+  pytest-timeout \
+  pytest-asyncio \
+  expecttest \
+  "pybind11[global]>=2.13" \
+  pyyaml \
+  scipy \
+  hydra-core \
+  tensorboard \
+  "imageio==2.26.0" \
+  "huggingface-hub>=0.34.0,<1.0" \
+  wandb \
+  mlflow \
+  av \
+  coverage \
+  transformers \
+  ninja \
+  timm
+
+# Install dm_control for Python < 3.13
+# labmaze (dm_control dependency) doesn't have Python 3.13+ wheels
+if [[ "$PYTHON_VERSION" != "3.13" && "$PYTHON_VERSION" != "3.14" ]]; then
+  echo "installing dm_control"
+  uv_pip_install dm_control
+fi
 
-conda env update --file "${this_dir}/environment.yml" --prune
+# Install ray for Python < 3.14 (ray doesn't support Python 3.14 yet)
+if [[ "$PYTHON_VERSION" != "3.14" ]]; then
+  echo "installing ray"
+  uv_pip_install ray
+fi
 
-# Reset conda env variables
-conda deactivate
-conda activate "${env_dir}"
+# Install mujoco for Python < 3.14 (mujoco doesn't have Python 3.14 wheels yet)
+if [[ "$PYTHON_VERSION" != "3.14" ]]; then
+  echo "installing mujoco"
+  uv_pip_install "mujoco>=3.3.7"
+fi
 
+# Install gymnasium
 echo "installing gymnasium"
-if [[ "$PYTHON_VERSION" == "3.12" ]]; then
-  pip3 install ale-py
-  pip3 install sympy
-  pip3 install "gymnasium[mujoco]>=1.1" mo-gymnasium[mujoco]
+if [[ "$PYTHON_VERSION" == "3.14" ]]; then
+  # Python 3.14: no mujoco wheels available; the ale-py install also fails
+  uv_pip_install "gymnasium>=1.1"
+elif [[ "$PYTHON_VERSION" == "3.12" ]]; then
+  uv_pip_install ale-py sympy
+  uv_pip_install "gymnasium[mujoco]>=1.1" "mo-gymnasium[mujoco]"
 else
-  pip3 install "gymnasium[atari,mujoco]>=1.1" mo-gymnasium[mujoco]
+  uv_pip_install "gymnasium[atari,mujoco]>=1.1" "mo-gymnasium[mujoco]"
 fi
 
-# sanity check: remove?
-python -c """
+# sanity check
+if [[ "$PYTHON_VERSION" != "3.13" && "$PYTHON_VERSION" != "3.14" ]]; then
+  python -c "
 import dm_control
 from dm_control import composer
 from tensorboard import *
 from google.protobuf import descriptor as _descriptor
-"""
+"
+else
+  python -c "
+from tensorboard import *
+from google.protobuf import descriptor as _descriptor
+"
+fi
 
 # ============================================================================================ #
 # ================================ PyTorch & TorchRL ========================================= #
@@ -122,7 +173,6 @@ from google.protobuf import descriptor as _descriptor
 unset PYTORCH_VERSION
 if [ "${CU_VERSION:-}" == cpu ] ; then
-    version="cpu"
     echo "Using cpu build"
 else
     if [[ ${#CU_VERSION} -eq 4 ]]; then
         CUDA_VERSION="${CU_VERSION:2:1}.${CU_VERSION:3:1}"
@@ -131,7 +181,6 @@ else
         CUDA_VERSION="${CU_VERSION:2:2}.${CU_VERSION:4:1}"
     fi
     echo "Using CUDA $CUDA_VERSION as determined by CU_VERSION ($CU_VERSION)"
-    version="$(python -c "print('.'.join(\"${CUDA_VERSION}\".split('.')[:2]))")"
 fi
 
 # submodules
@@ -140,15 +189,15 @@ git submodule sync && git submodule update --init --recursive
 printf "Installing PyTorch with %s\n" "${CU_VERSION}"
 if [[ "$TORCH_VERSION" == "nightly" ]]; then
   if [ "${CU_VERSION:-}" == cpu ] ; then
-      pip3 install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu -U
+      uv_pip_install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
   else
-      pip3 install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/$CU_VERSION -U
+      uv_pip_install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/$CU_VERSION
   fi
 elif [[ "$TORCH_VERSION" == "stable" ]]; then
-    if [ "${CU_VERSION:-}" == cpu ] ; then
-      pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cpu -U
+    if [ "${CU_VERSION:-}" == cpu ] ; then
+      uv_pip_install torch torchvision --index-url https://download.pytorch.org/whl/cpu
   else
-      pip3 install torch torchvision --index-url https://download.pytorch.org/whl/$CU_VERSION -U
+      uv_pip_install torch torchvision --index-url https://download.pytorch.org/whl/$CU_VERSION
   fi
 else
   printf "Failed to install pytorch"
@@ -158,73 +207,89 @@ fi
 # smoke test
 python -c "import functorch"
 
-## install snapshot
-#if [[ "$TORCH_VERSION" == "nightly" ]]; then
-#  pip3 install git+https://github.com/pytorch/torchsnapshot
-#else
-#  pip3 install torchsnapshot
-#fi
+# Help CMake find pybind11 when building tensordict from source.
+# pybind11 ships a CMake package; its location can be obtained via `python -m pybind11 --cmakedir`.
+pybind11_DIR="$(python -m pybind11 --cmakedir)"
+export pybind11_DIR
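+# (Illustrative) CMake consumers then resolve the package through this variable:
+#   find_package(pybind11 CONFIG REQUIRED)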
 
 # install tensordict
 if [[ "$RELEASE" == 0 ]]; then
-  pip3 install git+https://github.com/pytorch/tensordict.git
+  uv_pip_install --no-build-isolation git+https://github.com/pytorch/tensordict.git
 else
-  pip3 install tensordict
+  uv_pip_install tensordict
 fi
 
 printf "* Installing torchrl\n"
-python -m pip install -e . --no-build-isolation
-
+uv_pip_install -e . --no-build-isolation
 
 if [ "${CU_VERSION:-}" != cpu ] ; then
   printf "* Installing VC1\n"
-  python -c """
-from torchrl.envs.transforms.vc1 import VC1Transform
-VC1Transform.install_vc_models(auto_exit=True)
-"""
+  # Install vc_models directly via uv.
+  # VC1Transform.install_vc_models() uses `setup.py develop`, which expects `pip`
+  # to be present in the environment, but uv-created venvs do not necessarily
+  # ship with pip.
+  uv_pip_install "git+https://github.com/facebookresearch/eai-vc.git#subdirectory=vc_models"
 
   printf "* Upgrading timm\n"
-  pip3 install --upgrade "timm>=0.9.0"
+  uv_pip_install --upgrade "timm>=0.9.0"
 
-  python -c """
+  python -c "
 import vc_models
 from vc_models.models.vit import model_utils
 print(model_utils)
-"""
+"
 fi
 
 # ==================================================================================== #
 # ================================ Run tests ========================================= #
 
-
 export PYTORCH_TEST_WITH_SLOW='1'
 python -m torch.utils.collect_env
-## Avoid error: "fatal: unsafe repository"
-#git config --global --add safe.directory '*'
-#root_dir="$(git rev-parse --show-toplevel)"
-
-# solves ImportError: /lib64/libstdc++.so.6: version `GLIBCXX_3.4.21' not found
-#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$lib_dir
-export MKL_THREADING_LAYER=GNU
-export CKPT_BACKEND=torch
-export MAX_IDLE_COUNT=100
-export BATCHED_PIPE_TIMEOUT=60
 Xvfb :99 -screen 0 1024x768x24 &
 
 pytest test/smoke_test.py -v --durations 200
 pytest test/smoke_test_deps.py -v --durations 200 -k 'test_gym or test_dm_control_pixels or test_dm_control or test_tb'
+
+# Track if any tests fail
+EXIT_STATUS=0
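+# Each suite below appends `|| EXIT_STATUS=$?` so one failure does not abort
+# the remaining suites; the last nonzero status is re-raised by the final `exit`.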
+
+# Run TestNonTensorEnv.test_parallel in ISOLATION first to check for test pollution issues.
+# This test deadlocks when run after other tests due to resource_sharer thread interference.
+# See: https://bugs.python.org/issue30289
+python .github/unittest/helpers/coverage_run_parallel.py -m pytest \
+  test/test_env.py::TestNonTensorEnv::test_parallel \
+  --instafail --durations 200 -vv --capture no \
+  --timeout=120 --mp_fork_if_no_cuda || EXIT_STATUS=$?
+
+# Run distributed tests first (GPU only) to surface errors early
+if [ "${CU_VERSION:-}" != cpu ] ; then
+  python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_distributed.py \
+    --instafail --durations 200 -vv --capture no \
+    --timeout=120 --mp_fork_if_no_cuda || EXIT_STATUS=$?
+fi
+
+# Run remaining tests (always run even if distributed tests failed)
+# Note: TestNonTensorEnv::test_parallel is excluded since it was run in isolation above
 if [ "${CU_VERSION:-}" != cpu ] ; then
   python .github/unittest/helpers/coverage_run_parallel.py -m pytest test \
     --instafail --durations 200 -vv --capture no --ignore test/test_rlhf.py \
+    --ignore test/test_distributed.py \
     --ignore test/llm \
-    --timeout=120 --mp_fork_if_no_cuda
+    -k "not (TestNonTensorEnv and test_parallel)" \
+    --timeout=120 --mp_fork_if_no_cuda || EXIT_STATUS=$?
 else
   python .github/unittest/helpers/coverage_run_parallel.py -m pytest test \
     --instafail --durations 200 -vv --capture no --ignore test/test_rlhf.py \
     --ignore test/test_distributed.py \
     --ignore test/llm \
-    --timeout=120 --mp_fork_if_no_cuda
+    -k "not (TestNonTensorEnv and test_parallel)" \
+    --timeout=120 --mp_fork_if_no_cuda || EXIT_STATUS=$?
+fi
+
+# Fail the workflow if any tests failed
+if [ $EXIT_STATUS -ne 0 ]; then
+  echo "Some tests failed with exit status $EXIT_STATUS"
 fi
 
 coverage combine
 coverage xml -i
@@ -234,3 +299,6 @@
 # ================================ Post-proc ========================================= #
 
 bash ${this_dir}/post_process.sh
+
+# Exit with failure if any tests failed
+exit $EXIT_STATUS
diff --git a/.github/unittest/linux_optdeps/scripts/run_all.sh b/.github/unittest/linux_optdeps/scripts/run_all.sh
index c73efc50834..108f52a8527 100755
--- a/.github/unittest/linux_optdeps/scripts/run_all.sh
+++ b/.github/unittest/linux_optdeps/scripts/run_all.sh
@@ -9,11 +9,21 @@ set -e
 
 if [[ $OSTYPE != 'darwin'* ]]; then
-  apt-get update && apt-get upgrade -y
+  # Prevent interactive prompts (notably tzdata) in CI.
+  export DEBIAN_FRONTEND=noninteractive
+  export TZ="${TZ:-Etc/UTC}"
+  ln -snf "/usr/share/zoneinfo/${TZ}" /etc/localtime || true
+  echo "${TZ}" > /etc/timezone || true
+
+  apt-get update
+  apt-get install -y --no-install-recommends tzdata
+  dpkg-reconfigure -f noninteractive tzdata || true
+
+  apt-get upgrade -y
   apt-get install -y vim git wget cmake
-  apt-get install -y libglfw3 libgl1-mesa-glx libosmesa6 libglew-dev
-  apt-get install -y libglvnd0 libgl1 libglx0 libegl1 libgles2
+  apt-get install -y libglfw3 libosmesa6 libglew-dev
+  apt-get install -y libglvnd0 libgl1 libglx0 libglx-mesa0 libegl1 libgles2
 
   if [ "${CU_VERSION:-}" == cpu ] ; then
     # solves version `GLIBCXX_3.4.29' not found for tensorboard
diff --git a/.github/unittest/linux_sota/scripts/run_all.sh b/.github/unittest/linux_sota/scripts/run_all.sh
index cf1a9be33a8..a81c562a446 100755
--- a/.github/unittest/linux_sota/scripts/run_all.sh
+++ b/.github/unittest/linux_sota/scripts/run_all.sh
@@ -7,7 +7,16 @@ set -v
 # ================================ Init ============================================== #
 
-apt-get update && apt-get upgrade -y
+export DEBIAN_FRONTEND=noninteractive
+export TZ="${TZ:-Etc/UTC}"
+ln -snf "/usr/share/zoneinfo/${TZ}" /etc/localtime || true
+echo "${TZ}" > /etc/timezone || true
+
+apt-get update
+apt-get install -y --no-install-recommends tzdata
+dpkg-reconfigure -f noninteractive tzdata || true
+
+apt-get upgrade -y
 apt-get install -y vim git wget cmake
 
 apt-get install -y libglfw3 libgl1-mesa-glx libosmesa6 libglew-dev libosmesa6-dev
diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml
index 5e6c6a3bb91..8069b6a689d 100644
--- a/.github/workflows/nightly_build.yml
+++ b/.github/workflows/nightly_build.yml
@@ -42,11 +42,11 @@ jobs:
       matrix:
         os: [['linux', 'ubuntu-22.04'], ['macos', 'macos-latest']]
         python_version: [
-          ["3.9", "cp39-cp39"],
           ["3.10", "cp310-cp310"],
           ["3.11", "cp311-cp311"],
           ["3.12", "cp312-cp312"],
           ["3.13", "cp313-cp313"],
+          ["3.14", "cp314-cp314"],
         ]
         cuda_support: [["", "cpu", "cpu"]]
     steps:
@@ -88,11 +88,11 @@ jobs:
       matrix:
         os: [['linux', 'ubuntu-22.04'], ['macos', 'macos-latest']]
         python_version: [
-          ["3.9", "cp39-cp39"],
           ["3.10", "cp310-cp310"],
           ["3.11", "cp311-cp311"],
           ["3.12", "cp312-cp312"],
           ["3.13", "cp313-cp313"],
+          ["3.14", "cp314-cp314"],
        ]
         cuda_support: [["", "cpu", "cpu"]]
     steps:
@@ -162,11 +162,11 @@ jobs:
       matrix:
         os: [['linux', 'ubuntu-22.04'], ['macos', 'macos-latest']]
         python_version: [
-          ["3.9", "cp39-cp39"],
           ["3.10", "cp310-cp310"],
           ["3.11", "cp311-cp311"],
           ["3.12", "cp312-cp312"],
           ["3.13", "cp313-cp313"],
+          ["3.14", "cp314-cp314"],
         ]
         cuda_support: [["", "cpu", "cpu"]]
     steps:
@@ -204,11 +204,11 @@ jobs:
     strategy:
       matrix:
         python_version: [
-          ["3.9", "3.9"],
          ["3.10", "3.10.3"],
           ["3.11", "3.11"],
           ["3.12", "3.12"],
           ["3.13", "3.13"],
+          ["3.14", "3.14"],
         ]
     steps:
       - name: Setup Python
@@ -244,11 +244,11 @@ jobs:
     strategy:
       matrix:
        python_version: [
-          ["3.9", "3.9"],
           ["3.10", "3.10.3"],
           ["3.11", "3.11"],
           ["3.12", "3.12"],
           ["3.13", "3.13"],
+          ["3.14", "3.14"],
         ]
     steps:
       - name: Setup Python
@@ -314,11 +314,11 @@ jobs:
     strategy:
      matrix:
         python_version: [
-          ["3.9", "3.9"],
           ["3.10", "3.10.3"],
           ["3.11", "3.11"],
           ["3.12", "3.12"],
           ["3.13", "3.13"],
+          ["3.14", "3.14"],
         ]
     steps:
       - name: Checkout torchrl
["3.10", "3.10.3"], ["3.11", "3.11"], ["3.12", "3.12"], ["3.13", "3.13"], + ["3.14", "3.14"], ] steps: - name: Setup Python @@ -244,11 +244,11 @@ jobs: strategy: matrix: python_version: [ - ["3.9", "3.9"], ["3.10", "3.10.3"], ["3.11", "3.11"], ["3.12", "3.12"], ["3.13", "3.13"], + ["3.14", "3.14"], ] steps: - name: Setup Python @@ -314,11 +314,11 @@ jobs: strategy: matrix: python_version: [ - ["3.9", "3.9"], ["3.10", "3.10.3"], ["3.11", "3.11"], ["3.12", "3.12"], ["3.13", "3.13"], + ["3.14", "3.14"], ] steps: - name: Checkout torchrl diff --git a/.github/workflows/test-linux.yml b/.github/workflows/test-linux.yml index e660f29ba78..89fed0b2b18 100644 --- a/.github/workflows/test-linux.yml +++ b/.github/workflows/test-linux.yml @@ -26,13 +26,13 @@ jobs: tests-cpu: strategy: matrix: - python_version: ["3.9", "3.10", "3.11", "3.12"] + python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"] fail-fast: false uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.12xlarge repository: pytorch/rl - docker-image: "nvidia/cuda:12.2.0-devel-ubuntu22.04" + docker-image: "nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04" timeout: 90 script: | if [[ "${{ github.ref }}" =~ release/* ]]; then @@ -56,14 +56,14 @@ jobs: tests-gpu: strategy: matrix: - python_version: ["3.11"] - cuda_arch_version: ["12.8"] + python_version: ["3.12"] + cuda_arch_version: ["13.0"] fail-fast: false uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu repository: pytorch/rl - docker-image: "nvidia/cuda:12.4.0-devel-ubuntu22.04" + docker-image: "nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04" gpu-arch-type: cuda gpu-arch-version: ${{ matrix.cuda_arch_version }} timeout: 90 @@ -128,14 +128,14 @@ jobs: tests-optdeps: strategy: matrix: - python_version: ["3.11"] - cuda_arch_version: ["12.8"] + python_version: ["3.12"] + cuda_arch_version: ["13.0"] fail-fast: false uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu repository: pytorch/rl - docker-image: "nvidia/cuda:12.4.0-devel-ubuntu22.04" + docker-image: "nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04" gpu-arch-type: cuda gpu-arch-version: ${{ matrix.cuda_arch_version }} timeout: 90 @@ -163,14 +163,14 @@ jobs: tests-stable-gpu: strategy: matrix: - python_version: ["3.10"] # "3.9", "3.10", "3.11" - cuda_arch_version: ["11.8"] # "11.6", "11.7" + python_version: ["3.12"] # "3.9", "3.10", "3.11" + cuda_arch_version: ["13.0"] # "11.6", "11.7" fail-fast: false uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu repository: pytorch/rl - docker-image: "nvidia/cuda:12.4.0-devel-ubuntu22.04" + docker-image: "nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04" gpu-arch-type: cuda gpu-arch-version: ${{ matrix.cuda_arch_version }} timeout: 90 diff --git a/pyproject.toml b/pyproject.toml index 0bc91eb32e5..f3a16ec1b87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,11 +16,11 @@ maintainers = [ ] keywords = ["reinforcement-learning", "pytorch", "rl", "machine-learning"] classifiers = [ - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Operating System :: OS Independent", "Development Status :: 4 - Beta", "Intended Audience :: Developers", diff --git a/test/smoke_test_deps.py b/test/smoke_test_deps.py index 
ade1c68f4af..941107199fe 100644 --- a/test/smoke_test_deps.py +++ b/test/smoke_test_deps.py @@ -6,11 +6,16 @@ import argparse import os +import sys import tempfile import pytest +@pytest.mark.skipif( + sys.version_info >= (3, 13), + reason="dm_control not available on Python 3.13+ (labmaze lacks wheels)", +) def test_dm_control(): import dm_control # noqa: F401 import dm_env # noqa: F401 @@ -23,21 +28,29 @@ def test_dm_control(): env.reset() +@pytest.mark.skipif( + sys.version_info >= (3, 13), + reason="dm_control not available on Python 3.13+ (labmaze lacks wheels)", +) @pytest.mark.skip(reason="Not implemented yet") def test_dm_control_pixels(): - from torchrl.envs.libs.dm_control import _has_dmc, DMControlEnv # noqa + from torchrl.envs.libs.dm_control import DMControlEnv env = DMControlEnv("cheetah", "run", from_pixels=True) env.reset() +@pytest.mark.skipif( + sys.version_info >= (3, 14), + reason="gymnasium[atari] / ALE not available on Python 3.14 in CI (ale-py install failing)", +) def test_gym(): try: import gymnasium as gym except ImportError as err: ERROR = err try: - import gym # noqa: F401 + import gym as gym # noqa: F401 except ImportError as err: raise ImportError( f"gym and gymnasium load failed. Gym got error {err}." @@ -46,12 +59,30 @@ def test_gym(): from torchrl.envs.libs.gym import _has_gym, GymEnv # noqa assert _has_gym + # If gymnasium is installed without the atari extra, ALE won't be registered. + # In that case we skip rather than hard-failing the dependency smoke test. + try: + import ale_py # noqa: F401 + except Exception: # pragma: no cover + pytest.skip("ALE not available (missing ale_py); skipping Atari gym test.") if os.getenv("PYTORCH_TEST_FBCODE"): from pytorch.rl.test._utils_internal import PONG_VERSIONED else: from _utils_internal import PONG_VERSIONED - env = GymEnv(PONG_VERSIONED()) + try: + env = GymEnv(PONG_VERSIONED()) + except Exception as err: # gymnasium.error.NamespaceNotFound and similar + namespace_not_found = err.__class__.__name__ == "NamespaceNotFound" + if hasattr(gym, "error") and hasattr(gym.error, "NamespaceNotFound"): + namespace_not_found = namespace_not_found or isinstance( + err, gym.error.NamespaceNotFound + ) + if namespace_not_found: + pytest.skip( + "ALE namespace not registered (gymnasium installed without atari extra)." + ) + raise env.reset() diff --git a/test/test_env.py b/test/test_env.py index 252d495fb98..ea93b322be0 100644 --- a/test/test_env.py +++ b/test/test_env.py @@ -3829,9 +3829,14 @@ def test_serial(self, bwad, use_buffers): @pytest.mark.parametrize("bwad", [True, False]) @pytest.mark.parametrize("use_buffers", [False, True]) - def test_parallel(self, bwad, use_buffers, maybe_fork_ParallelEnv): + def test_parallel(self, bwad, use_buffers): N = 50 - env = maybe_fork_ParallelEnv(2, EnvWithMetadata, use_buffers=use_buffers) + # Always use spawn for non-tensor envs to avoid resource_sharer deadlocks with fork. + # Fork + threads (resource_sharer, tqdm monitor) is fundamentally problematic. 
diff --git a/test/test_env.py b/test/test_env.py
index 252d495fb98..ea93b322be0 100644
--- a/test/test_env.py
+++ b/test/test_env.py
@@ -3829,9 +3829,14 @@ def test_serial(self, bwad, use_buffers):
 
     @pytest.mark.parametrize("bwad", [True, False])
     @pytest.mark.parametrize("use_buffers", [False, True])
-    def test_parallel(self, bwad, use_buffers, maybe_fork_ParallelEnv):
+    def test_parallel(self, bwad, use_buffers):
         N = 50
-        env = maybe_fork_ParallelEnv(2, EnvWithMetadata, use_buffers=use_buffers)
+        # Always use spawn for non-tensor envs to avoid resource_sharer deadlocks with fork.
+        # Fork + threads (resource_sharer, tqdm monitor) is fundamentally problematic.
+        # See: https://bugs.python.org/issue30289
+        env = ParallelEnv(
+            2, EnvWithMetadata, use_buffers=use_buffers, mp_start_method="spawn"
+        )
         try:
             r = env.rollout(N, break_when_any_done=bwad)
             assert r.get("non_tensor").tolist() == [list(range(N))] * 2
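Editor's note: an illustrative, standalone sketch (not part of the patch) of the fork-vs-spawn hazard motivating the change above. `fork` duplicates a parent process that may already hold locks in helper threads such as `multiprocessing.resource_sharer`, while `spawn` starts each worker in a fresh interpreter:

```python
import multiprocessing as mp


def worker(q):
    q.put("ok")


if __name__ == "__main__":
    # A spawn context gives each child a fresh interpreter, so no lock state
    # held by parent-side threads is inherited (cf. https://bugs.python.org/issue30289).
    ctx = mp.get_context("spawn")
    q = ctx.Queue()
    p = ctx.Process(target=worker, args=(q,))
    p.start()
    assert q.get() == "ok"
    p.join()
```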
== "mp": for job in self.jobs: job.join(int(IDLE_TIMEOUT)) @@ -890,6 +887,23 @@ def shutdown(self, timeout: float | None = None) -> None: pass else: raise NotImplementedError(f"Unknown launcher {self.launcher}") + + # Clean up weight sync schemes AFTER workers have exited + if getattr(self, "_weight_sync_schemes", None) is not None: + torchrl_logger.debug("shutting down weight sync schemes") + for scheme in self._weight_sync_schemes.values(): + try: + scheme.shutdown() + except Exception as e: + torchrl_logger.warning( + f"Error shutting down weight sync scheme: {e}" + ) + self._weight_sync_schemes = None + + # Destroy torch.distributed process group + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + self._shutdown = True diff --git a/torchrl/collectors/distributed/sync.py b/torchrl/collectors/distributed/sync.py index a63b6d33c66..33a26220451 100644 --- a/torchrl/collectors/distributed/sync.py +++ b/torchrl/collectors/distributed/sync.py @@ -686,12 +686,42 @@ def load_state_dict(self, state_dict: OrderedDict) -> None: raise NotImplementedError def shutdown(self, timeout: float | None = None) -> None: - # Clean up weight sync schemes + # Prevent double shutdown + if getattr(self, "_shutdown", False): + return + self._shutdown = True + + # Wait for workers to exit + if hasattr(self, "jobs"): + for job in self.jobs: + if self.launcher == "mp": + if hasattr(job, "is_alive") and job.is_alive(): + job.join(timeout=timeout if timeout is not None else 10) + elif self.launcher == "submitit": + try: + job.result() + except Exception: + pass + + # Clean up weight sync schemes AFTER workers have exited if self._weight_sync_schemes is not None: + torchrl_logger.debug("shutting down weight sync schemes") for scheme in self._weight_sync_schemes.values(): - scheme.shutdown() + try: + scheme.shutdown() + except Exception as e: + torchrl_logger.warning( + f"Error shutting down weight sync scheme: {e}" + ) self._weight_sync_schemes = None + # Destroy torch.distributed process group + if torch.distributed.is_initialized(): + torchrl_logger.debug("destroying process group") + torch.distributed.destroy_process_group() + + torchrl_logger.debug("collector shut down") + class DistributedSyncDataCollector( DistributedSyncCollector, metaclass=_LegacyCollectorMeta diff --git a/torchrl/weight_update/_distributed.py b/torchrl/weight_update/_distributed.py index c9cad578c53..fd2625002c8 100644 --- a/torchrl/weight_update/_distributed.py +++ b/torchrl/weight_update/_distributed.py @@ -589,16 +589,13 @@ def _setup_connection_and_weights_on_receiver_impl( def shutdown(self) -> None: """Stop background receiver thread and clean up.""" - if self._stop_event is not None: - self._stop_event.set() - if self._background_thread is not None: - self._background_thread.join(timeout=5.0) - if self._background_thread.is_alive(): - torchrl_logger.warning( - "DistributedWeightSyncScheme: Background thread did not stop gracefully" - ) - self._background_thread = None - self._stop_event = None + # Check if already shutdown + if getattr(self, "_is_shutdown", False): + return + self._is_shutdown = True + + # Let base class handle background thread cleanup + super().shutdown() @property def model(self) -> Any | None: diff --git a/torchrl/weight_update/_shared.py b/torchrl/weight_update/_shared.py index cba335a967b..7aa15471a07 100644 --- a/torchrl/weight_update/_shared.py +++ b/torchrl/weight_update/_shared.py @@ -886,22 +886,18 @@ def __getstate__(self): def shutdown(self) -> None: """Stop the background 
diff --git a/torchrl/weight_update/_distributed.py b/torchrl/weight_update/_distributed.py
index c9cad578c53..fd2625002c8 100644
--- a/torchrl/weight_update/_distributed.py
+++ b/torchrl/weight_update/_distributed.py
@@ -589,16 +589,13 @@ def _setup_connection_and_weights_on_receiver_impl(
 
     def shutdown(self) -> None:
         """Stop background receiver thread and clean up."""
-        if self._stop_event is not None:
-            self._stop_event.set()
-        if self._background_thread is not None:
-            self._background_thread.join(timeout=5.0)
-            if self._background_thread.is_alive():
-                torchrl_logger.warning(
-                    "DistributedWeightSyncScheme: Background thread did not stop gracefully"
-                )
-            self._background_thread = None
-        self._stop_event = None
+        # Check if already shut down
+        if getattr(self, "_is_shutdown", False):
+            return
+        self._is_shutdown = True
+
+        # Let the base class handle background thread cleanup
+        super().shutdown()
 
     @property
     def model(self) -> Any | None:
diff --git a/torchrl/weight_update/_shared.py b/torchrl/weight_update/_shared.py
index cba335a967b..7aa15471a07 100644
--- a/torchrl/weight_update/_shared.py
+++ b/torchrl/weight_update/_shared.py
@@ -886,22 +886,18 @@ def __getstate__(self):
 
     def shutdown(self) -> None:
         """Stop the background receiver thread and clean up."""
+        # Check if already shut down
+        if getattr(self, "_is_shutdown", False):
+            return
+        self._is_shutdown = True
+
         # Signal all workers to stop
-        if self._instruction_queues:
+        if getattr(self, "_instruction_queues", None):
             for worker_idx in self._instruction_queues:
                 try:
                     self._instruction_queues[worker_idx].put("stop")
                 except Exception:
                     pass
 
-        # Stop local background thread if running
-        if self._stop_event is not None:
-            self._stop_event.set()
-        if self._background_thread is not None:
-            self._background_thread.join(timeout=5.0)
-            if self._background_thread.is_alive():
-                torchrl_logger.warning(
-                    "SharedMemWeightSyncScheme: Background thread did not stop gracefully"
-                )
-            self._background_thread = None
-        self._stop_event = None
+        # Let the base class handle background thread cleanup
+        super().shutdown()
diff --git a/torchrl/weight_update/weight_sync_schemes.py b/torchrl/weight_update/weight_sync_schemes.py
index 75ab16563b4..b381a4db55b 100644
--- a/torchrl/weight_update/weight_sync_schemes.py
+++ b/torchrl/weight_update/weight_sync_schemes.py
@@ -1231,3 +1231,30 @@ def __getstate__(self):
     def __setstate__(self, state):
         """Restore the scheme from pickling."""
         self.__dict__.update(state)
+
+    def __del__(self):
+        """Clean up resources when the scheme is garbage collected."""
+        try:
+            self.shutdown()
+        except Exception:
+            # Silently ignore any errors during garbage-collection cleanup
+            pass
+
+    def shutdown(self) -> None:
+        """Shut down the scheme and release resources.
+
+        This method stops any background threads and cleans up connections.
+        It is safe to call multiple times. Subclasses should override this
+        method to add custom cleanup logic, but should call super().shutdown()
+        to ensure base cleanup is performed.
+        """
+        # Stop the background receiver thread if running
+        if getattr(self, "_stop_event", None) is not None:
+            self._stop_event.set()
+        if getattr(self, "_background_thread", None) is not None:
+            try:
+                self._background_thread.join(timeout=5.0)
+            except Exception:
+                pass
+            self._background_thread = None
+        self._stop_event = None
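Editor's note: a minimal standalone sketch of the lifecycle contract the base class now provides and the subclasses rely on (hypothetical class names; the worker-signalling body is elided):

```python
import threading


class Scheme:
    """Idempotent shutdown with a __del__ safety net, mirroring the base class above."""

    def __init__(self) -> None:
        self._stop_event: threading.Event | None = None
        self._background_thread: threading.Thread | None = None

    def shutdown(self) -> None:
        # Safe to call repeatedly; also reached from __del__ during GC.
        if getattr(self, "_stop_event", None) is not None:
            self._stop_event.set()
        if getattr(self, "_background_thread", None) is not None:
            self._background_thread.join(timeout=5.0)
            self._background_thread = None
        self._stop_event = None

    def __del__(self) -> None:
        try:
            self.shutdown()
        except Exception:
            pass  # never raise during garbage collection


class SharedScheme(Scheme):
    def shutdown(self) -> None:
        if getattr(self, "_is_shutdown", False):  # subclass-level guard
            return
        self._is_shutdown = True
        # ... signal workers to stop here ...
        super().shutdown()  # base class joins the background thread
```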