
Commit 0ab3667

Merge branch 'main' into links
2 parents e6c1203 + 15ef015

36 files changed: +1109 −663 lines

.ci/docker/requirements.txt (+2 −2)

@@ -28,8 +28,8 @@ tensorboard
 jinja2==3.1.3
 pytorch-lightning
 torchx
-torchrl==0.5.0
-tensordict==0.5.0
+torchrl==0.6.0
+tensordict==0.6.0
 ax-platform>=0.4.0
 nbformat>=5.9.2
 datasets

.jenkins/build.sh (+4 −2)

@@ -22,8 +22,10 @@ sudo apt-get install -y pandoc
 #Install PyTorch Nightly for test.
 # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html
 # Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
-# pip uninstall -y torch torchvision torchaudio torchtext torchdata
-# pip3 install torch==2.5.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124
+sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata
+sudo pip3 install torch==2.6.0 torchvision --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124
+sudo pip uninstall -y fbgemm-gpu torchrec
+sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124

 # Install two language tokenizers for Translation with TorchText tutorial
 python -m spacy download en_core_web_sm

.jenkins/validate_tutorials_built.py (−2)

@@ -25,7 +25,6 @@
     "intermediate_source/mnist_train_nas",  # used by ax_multiobjective_nas_tutorial.py
     "intermediate_source/fx_conv_bn_fuser",
     "intermediate_source/_torch_export_nightly_tutorial",  # does not work on release
-    "intermediate_source/transformer_building_blocks",  # does not work on release
     "advanced_source/super_resolution_with_onnxruntime",
     "advanced_source/usb_semisup_learn",  # fails with CUDA OOM error, should try on a different worker
     "prototype_source/fx_graph_mode_ptq_dynamic",
@@ -51,7 +50,6 @@
     "intermediate_source/flask_rest_api_tutorial",
     "intermediate_source/text_to_speech_with_torchaudio",
     "intermediate_source/tensorboard_profiler_tutorial",  # reenable after 2.0 release.
-    "intermediate_source/torch_export_tutorial"  # reenable after 2940 is fixed.
 ]

 def tutorial_source_dirs() -> List[Path]:
(Binary image files changed, not shown: 38.1 KB, −7.37 KB, −15.9 KB, −8.41 KB, −381 Bytes.)

advanced_source/coding_ddpg.py (+1 −1)

@@ -893,7 +893,7 @@ def make_recorder(actor_model_explore, transform_state_dict, record_interval):
         record_frames=1000,
         policy_exploration=actor_model_explore,
         environment=environment,
-        exploration_type=ExplorationType.MEAN,
+        exploration_type=ExplorationType.DETERMINISTIC,
         record_interval=record_interval,
     )
     return recorder_obj
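torchrl 0.6 renamed the mean-action evaluation mode, which is what this hunk tracks. A minimal sketch of the updated usage (assuming torchrl >= 0.6 with gymnasium installed; the environment name is illustrative, and no trained policy is needed for the demonstration):

import torch
from torchrl.envs import GymEnv
from torchrl.envs.utils import ExplorationType, set_exploration_type

env = GymEnv("Pendulum-v1")  # assumes gymnasium is installed
# ExplorationType.DETERMINISTIC replaces the older ExplorationType.MEAN
with set_exploration_type(ExplorationType.DETERMINISTIC):
    rollout = env.rollout(max_steps=100)  # a random policy is used when none is passed
print(rollout)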

advanced_source/pendulum.py (+1 −1)

@@ -604,7 +604,7 @@ def __init__(self, td_params=None, seed=None, device="cpu"):
             env,
             # ``Unsqueeze`` the observations that we will concatenate
             UnsqueezeTransform(
-                unsqueeze_dim=-1,
+                dim=-1,
                 in_keys=["th", "thdot"],
                 in_keys_inv=["th", "thdot"],
             ),
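torchrl 0.6 likewise renamed this transform's keyword from ``unsqueeze_dim`` to ``dim``. A standalone sketch (assuming torchrl >= 0.6; applying the transform directly to a TensorDict here is purely illustrative):

import torch
from tensordict import TensorDict
from torchrl.envs.transforms import UnsqueezeTransform

# Same behavior as before, under the new keyword name.
t = UnsqueezeTransform(dim=-1, in_keys=["th"], in_keys_inv=["th"])
td = TensorDict({"th": torch.tensor([0.1, 0.2, 0.3])}, batch_size=[])
print(t(td)["th"].shape)  # torch.Size([3, 1])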

beginner_source/basics/buildmodel_tutorial.py (+4 −11)

@@ -32,17 +32,10 @@
 #############################################
 # Get Device for Training
 # -----------------------
-# We want to be able to train our model on a hardware accelerator like the GPU or MPS,
-# if available. Let's check to see if `torch.cuda <https://pytorch.org/docs/stable/notes/cuda.html>`_
-# or `torch.backends.mps <https://pytorch.org/docs/stable/notes/mps.html>`_ are available, otherwise we use the CPU.
-
-device = (
-    "cuda"
-    if torch.cuda.is_available()
-    else "mps"
-    if torch.backends.mps.is_available()
-    else "cpu"
-)
+# We want to be able to train our model on an `accelerator <https://pytorch.org/docs/stable/torch.html#accelerators>`__
+# such as CUDA, MPS, MTIA, or XPU. If the current accelerator is available, we will use it. Otherwise, we use the CPU.
+
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
 print(f"Using {device} device")

 ##############################################
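Standalone, the new device-selection idiom looks like this (a minimal sketch, assuming PyTorch 2.6+ where ``torch.accelerator`` exists):

import torch

# Pick the current accelerator (CUDA, MPS, MTIA, XPU, ...) if present, else CPU.
device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

model = torch.nn.Linear(8, 2).to(device)  # modules move the same way on any backend
x = torch.randn(1, 8, device=device)
print(model(x).shape)                     # torch.Size([1, 2])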

beginner_source/basics/quickstart_tutorial.py (+4 −10)

@@ -84,16 +84,10 @@
 # To define a neural network in PyTorch, we create a class that inherits
 # from `nn.Module <https://pytorch.org/docs/stable/generated/torch.nn.Module.html>`_. We define the layers of the network
 # in the ``__init__`` function and specify how data will pass through the network in the ``forward`` function. To accelerate
-# operations in the neural network, we move it to the GPU or MPS if available.
-
-# Get cpu, gpu or mps device for training.
-device = (
-    "cuda"
-    if torch.cuda.is_available()
-    else "mps"
-    if torch.backends.mps.is_available()
-    else "cpu"
-)
+# operations in the neural network, we move it to the `accelerator <https://pytorch.org/docs/stable/torch.html#accelerators>`__
+# such as CUDA, MPS, MTIA, or XPU. If the current accelerator is available, we will use it. Otherwise, we use the CPU.
+
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
 print(f"Using {device} device")

 # Define model

beginner_source/basics/tensorqs_tutorial.py (+8 −8)

@@ -99,20 +99,20 @@
 # Operations on Tensors
 # ~~~~~~~~~~~~~~~~~~~~~~~
 #
-# Over 100 tensor operations, including arithmetic, linear algebra, matrix manipulation (transposing,
+# Over 1200 tensor operations, including arithmetic, linear algebra, matrix manipulation (transposing,
 # indexing, slicing), sampling and more are
 # comprehensively described `here <https://pytorch.org/docs/stable/torch.html>`__.
 #
-# Each of these operations can be run on the GPU (at typically higher speeds than on a
-# CPU). If you’re using Colab, allocate a GPU by going to Runtime > Change runtime type > GPU.
+# Each of these operations can be run on the CPU and `Accelerator <https://pytorch.org/docs/stable/torch.html#accelerators>`__
+# such as CUDA, MPS, MTIA, or XPU. If you’re using Colab, allocate an accelerator by going to Runtime > Change runtime type > GPU.
 #
-# By default, tensors are created on the CPU. We need to explicitly move tensors to the GPU using
-# ``.to`` method (after checking for GPU availability). Keep in mind that copying large tensors
+# By default, tensors are created on the CPU. We need to explicitly move tensors to the accelerator using
+# ``.to`` method (after checking for accelerator availability). Keep in mind that copying large tensors
 # across devices can be expensive in terms of time and memory!

-# We move our tensor to the GPU if available
-if torch.cuda.is_available():
-    tensor = tensor.to("cuda")
+# We move our tensor to the current accelerator if available
+if torch.accelerator.is_available():
+    tensor = tensor.to(torch.accelerator.current_accelerator())

 ######################################################################
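A self-contained version of that tensor move (a sketch, assuming PyTorch 2.6+):

import torch

tensor = torch.ones(4, 4)
# No-op on CPU-only machines; otherwise moves the data to the active device.
if torch.accelerator.is_available():
    tensor = tensor.to(torch.accelerator.current_accelerator())
print(tensor.device)  # e.g. cuda:0, mps, or cpu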

beginner_source/chatbot_tutorial.py (+7 −5)

@@ -108,8 +108,10 @@
 import json


-USE_CUDA = torch.cuda.is_available()
-device = torch.device("cuda" if USE_CUDA else "cpu")
+# If the current `accelerator <https://pytorch.org/docs/stable/torch.html#accelerators>`__ is available,
+# we will use it. Otherwise, we use the CPU.
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+print(f"Using {device} device")


 ######################################################################
@@ -1318,16 +1320,16 @@ def evaluateInput(encoder, decoder, searcher, voc):
 encoder_optimizer.load_state_dict(encoder_optimizer_sd)
 decoder_optimizer.load_state_dict(decoder_optimizer_sd)

-# If you have CUDA, configure CUDA to call
+# If you have an accelerator, configure it to call
 for state in encoder_optimizer.state.values():
     for k, v in state.items():
         if isinstance(v, torch.Tensor):
-            state[k] = v.cuda()
+            state[k] = v.to(device)

 for state in decoder_optimizer.state.values():
     for k, v in state.items():
         if isinstance(v, torch.Tensor):
-            state[k] = v.cuda()
+            state[k] = v.to(device)

 # Run training iterations
 print("Starting Training!")

beginner_source/examples_autograd/polynomial_autograd.py (+5 −1)

@@ -17,8 +17,12 @@
 import torch
 import math

+# We want to be able to train our model on an `accelerator <https://pytorch.org/docs/stable/torch.html#accelerators>`__
+# such as CUDA, MPS, MTIA, or XPU. If the current accelerator is available, we will use it. Otherwise, we use the CPU.
+
 dtype = torch.float
-device = "cuda" if torch.cuda.is_available() else "cpu"
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+print(f"Using {device} device")
 torch.set_default_device(device)

 # Create Tensors to hold input and outputs.
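Because this file also calls ``torch.set_default_device``, tensors created afterwards land on the selected device without a per-call ``device=`` argument. A short sketch (assuming PyTorch 2.6+):

import torch

device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
torch.set_default_device(device)

x = torch.randn(3)  # created on ``device`` with no explicit argument
print(x.device)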

beginner_source/fgsm_tutorial.py (+4 −8)

@@ -125,14 +125,9 @@
 # `pytorch/examples/mnist <https://github.com/pytorch/examples/tree/master/mnist>`__.
 # For simplicity, download the pretrained model `here <https://drive.google.com/file/d/1HJV2nUHJqclXQ8flKvcWmjZ-OU5DGatl/view?usp=drive_link>`__.
 #
-# - ``use_cuda`` - boolean flag to use CUDA if desired and available.
-#   Note, a GPU with CUDA is not critical for this tutorial as a CPU will
-#   not take much time.
-#

 epsilons = [0, .05, .1, .15, .2, .25, .3]
 pretrained_model = "data/lenet_mnist_model.pth"
-use_cuda=True
 # Set random seed for reproducibility
 torch.manual_seed(42)

@@ -184,9 +179,10 @@ def forward(self, x):
     ])),
     batch_size=1, shuffle=True)

-# Define what device we are using
-print("CUDA Available: ",torch.cuda.is_available())
-device = torch.device("cuda" if use_cuda and torch.cuda.is_available() else "cpu")
+# We want to be able to train our model on an `accelerator <https://pytorch.org/docs/stable/torch.html#accelerators>`__
+# such as CUDA, MPS, MTIA, or XPU. If the current accelerator is available, we will use it. Otherwise, we use the CPU.
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+print(f"Using {device} device")

 # Initialize the network
 model = Net().to(device)

beginner_source/introyt/tensors_deeper_tutorial.py (+22 −26)

@@ -632,34 +632,33 @@
 # does this *without* changing ``a`` - you can see that when we print
 # ``a`` again at the end, it retains its ``requires_grad=True`` property.
 #
-# Moving to GPU
+# Moving to `Accelerator <https://pytorch.org/docs/stable/torch.html#accelerators>`__
 # -------------
 #
-# One of the major advantages of PyTorch is its robust acceleration on
-# CUDA-compatible Nvidia GPUs. (“CUDA” stands for *Compute Unified Device
-# Architecture*, which is Nvidia’s platform for parallel computing.) So
-# far, everything we’ve done has been on CPU. How do we move to the faster
+# One of the major advantages of PyTorch is its robust acceleration on an
+# `accelerator <https://pytorch.org/docs/stable/torch.html#accelerators>`__
+# such as CUDA, MPS, MTIA, or XPU.
+# So far, everything we’ve done has been on CPU. How do we move to the faster
 # hardware?
 #
-# First, we should check whether a GPU is available, with the
+# First, we should check whether an accelerator is available, with the
 # ``is_available()`` method.
 #
 # .. note::
-#      If you do not have a CUDA-compatible GPU and CUDA drivers
-#      installed, the executable cells in this section will not execute any
-#      GPU-related code.
+#      If you do not have an accelerator, the executable cells in this section will not execute any
+#      accelerator-related code.
 #

-if torch.cuda.is_available():
-    print('We have a GPU!')
+if torch.accelerator.is_available():
+    print('We have an accelerator!')
 else:
     print('Sorry, CPU only.')


 ##########################################################################
-# Once we’ve determined that one or more GPUs is available, we need to put
-# our data someplace where the GPU can see it. Your CPU does computation
-# on data in your computer’s RAM. Your GPU has dedicated memory attached
+# Once we’ve determined that one or more accelerators is available, we need to put
+# our data someplace where the accelerator can see it. Your CPU does computation
+# on data in your computer’s RAM. Your accelerator has dedicated memory attached
 # to it. Whenever you want to perform a computation on a device, you must
 # move *all* the data needed for that computation to memory accessible by
 # that device. (Colloquially, “moving the data to memory accessible by the
@@ -669,34 +668,31 @@
 # may do it at creation time:
 #

-if torch.cuda.is_available():
-    gpu_rand = torch.rand(2, 2, device='cuda')
+if torch.accelerator.is_available():
+    gpu_rand = torch.rand(2, 2, device=torch.accelerator.current_accelerator())
     print(gpu_rand)
 else:
     print('Sorry, CPU only.')


 ##########################################################################
 # By default, new tensors are created on the CPU, so we have to specify
-# when we want to create our tensor on the GPU with the optional
+# when we want to create our tensor on the accelerator with the optional
 # ``device`` argument. You can see when we print the new tensor, PyTorch
 # informs us which device it’s on (if it’s not on CPU).
 #
-# You can query the number of GPUs with ``torch.cuda.device_count()``. If
-# you have more than one GPU, you can specify them by index:
+# You can query the number of accelerators with ``torch.accelerator.device_count()``. If
+# you have more than one accelerator, you can specify them by index, take CUDA for example:
 # ``device='cuda:0'``, ``device='cuda:1'``, etc.
 #
 # As a coding practice, specifying our devices everywhere with string
 # constants is pretty fragile. In an ideal world, your code would perform
-# robustly whether you’re on CPU or GPU hardware. You can do this by
+# robustly whether you’re on CPU or accelerator hardware. You can do this by
 # creating a device handle that can be passed to your tensors instead of a
 # string:
 #

-if torch.cuda.is_available():
-    my_device = torch.device('cuda')
-else:
-    my_device = torch.device('cpu')
+my_device = torch.accelerator.current_accelerator() if torch.accelerator.is_available() else torch.device('cpu')
 print('Device: {}'.format(my_device))

 x = torch.rand(2, 2, device=my_device)
@@ -718,12 +714,12 @@
 # It is important to know that in order to do computation involving two or
 # more tensors, *all of the tensors must be on the same device*. The
 # following code will throw a runtime error, regardless of whether you
-# have a GPU device available:
+# have an accelerator device available, take CUDA for example:
 #
 # .. code-block:: python
 #
 #     x = torch.rand(2, 2)
-#     y = torch.rand(2, 2, device='gpu')
+#     y = torch.rand(2, 2, device='cuda')
 #     z = x + y  # exception will be thrown
 #

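The cross-device rule in that last hunk is easy to demonstrate directly. A short sketch (assuming PyTorch 2.6+ and an accelerator present):

import torch

if torch.accelerator.is_available():
    x = torch.rand(2, 2)  # CPU tensor
    y = torch.rand(2, 2, device=torch.accelerator.current_accelerator())
    try:
        z = x + y  # mixing devices raises
    except RuntimeError as err:
        print(f'As expected: {err}')
else:
    print('CPU only; both tensors share a device, so no error is raised.')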
beginner_source/knowledge_distillation_tutorial.py (+4 −2)

@@ -37,8 +37,10 @@
 import torchvision.transforms as transforms
 import torchvision.datasets as datasets

-# Check if GPU is available, and if not, use the CPU
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Check if the current `accelerator <https://pytorch.org/docs/stable/torch.html#accelerators>`__
+# is available, and if not, use the CPU
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+print(f"Using {device} device")

 ######################################################################
 # Loading CIFAR-10

beginner_source/nn_tutorial.py (+11 −14)

@@ -132,7 +132,7 @@
 # we'll write `log_softmax` and use it. Remember: although PyTorch
 # provides lots of prewritten loss functions, activation functions, and
 # so forth, you can easily write your own using plain python. PyTorch will
-# even create fast GPU or vectorized CPU code for your function
+# even create fast accelerator or vectorized CPU code for your function
 # automatically.

 def log_softmax(x):
@@ -827,38 +827,35 @@ def __iter__(self):
 fit(epochs, model, loss_func, opt, train_dl, valid_dl)

 ###############################################################################
-# Using your GPU
+# Using your `Accelerator <https://pytorch.org/docs/stable/torch.html#accelerators>`__
 # ---------------
 #
-# If you're lucky enough to have access to a CUDA-capable GPU (you can
+# If you're lucky enough to have access to an accelerator such as CUDA (you can
 # rent one for about $0.50/hour from most cloud providers) you can
-# use it to speed up your code. First check that your GPU is working in
+# use it to speed up your code. First check that your accelerator is working in
 # Pytorch:

-print(torch.cuda.is_available())
+# If the current accelerator is available, we will use it. Otherwise, we use the CPU.
+device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
+print(f"Using {device} device")

-###############################################################################
-# And then create a device object for it:
-
-dev = torch.device(
-    "cuda") if torch.cuda.is_available() else torch.device("cpu")

 ###############################################################################
-# Let's update ``preprocess`` to move batches to the GPU:
+# Let's update ``preprocess`` to move batches to the accelerator:


 def preprocess(x, y):
-    return x.view(-1, 1, 28, 28).to(dev), y.to(dev)
+    return x.view(-1, 1, 28, 28).to(device), y.to(device)


 train_dl, valid_dl = get_data(train_ds, valid_ds, bs)
 train_dl = WrappedDataLoader(train_dl, preprocess)
 valid_dl = WrappedDataLoader(valid_dl, preprocess)

 ###############################################################################
-# Finally, we can move our model to the GPU.
+# Finally, we can move our model to the accelerator.

-model.to(dev)
+model.to(device)
 opt = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

 ###############################################################################