@@ -138,6 +138,61 @@ struct C10_API AutogradMetaInterface {
  virtual ~AutogradMetaInterface();
};

+// NOTE [ Version Counter Sharing ]
+//
+// Every Tensor has a version counter. Version counters are incremented whenever the
+// data or size of a tensor changes through in-place Variable operations. Version
+// counters are used to detect modifications to saved variables which would result in
+// incorrect gradient calculations. Version counters may be shared between Variables:
+//
+// 1. A view shares the version counter of the base Variable,
+// 2. `x.detach()` shares the version counter of `x`,
+// 3. Unpacked saved variables share the version counter of the source.
+//
+// Version counters are not shared in these scenarios:
+//
+// 1. When we replace a `Variable`'s underlying `Tensor` by calling `set_data(...)`,
+// 2. `x.data` does not share the version counter of `x`. (See discussion at
+//    https://github.com/pytorch/pytorch/issues/5396)
+//
+// Question: Why do we put the version counter in TensorImpl instead of AutogradMeta?
+//
+// Answer: After the Variable/Tensor merge, a tensor will not have AutogradMeta when
+// its `requires_grad_` is false, but when we use this tensor in the forward pass of
+// a function that requires saving this tensor for backward, we need to keep track of
+// this tensor's version to make sure it's always valid in the autograd graph.
+//
+// To achieve this goal, we put the version counter in TensorImpl instead of AutogradMeta,
+// and have it always be available. This allows us to have the optimization of not
+// carrying AutogradMeta when a tensor doesn't require gradient.
+//
+// A hypothetical alternative way to achieve this goal is to initialize AutogradMeta and
+// create the version counter for the non-requires-grad tensor only when it's saved for
+// backward. However, since saving a tensor for backward happens in the forward pass, and
+// our invariant is that the forward pass needs to be thread-safe, lazy-initializing
+// AutogradMeta when saving a tensor can introduce race conditions when we are running
+// the forward pass in multi-threaded scenarios, thus making the forward pass not
+// thread-safe anymore, which breaks the invariant.
+struct C10_API VariableVersion {
+ public:
+  // NOTE: As of C++11 and 14, default-constructing a std::atomic variable
+  // leaves it in a persistently undefined state. See
+  // https://cplusplus.github.io/LWG/issue2334.
+  VariableVersion(uint32_t version = 0)
+      : version_block_(std::make_shared<std::atomic<uint32_t>>(version)) {}
+
+  void bump() noexcept {
+    version_block_->fetch_add(1);
+  }
+
+  uint32_t current_version() const noexcept {
+    return version_block_->load();
+  }
+
+ private:
+  std::shared_ptr<std::atomic<uint32_t>> version_block_;
+};
+
 /**
  * The low-level representation of a tensor, which contains a pointer
  * to a storage (which contains the actual data) and metadata (e.g., sizes and
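The sharing rules above fall out of `VariableVersion` holding its count behind a `std::shared_ptr`: copying a `VariableVersion` copies the pointer, not the counter. A minimal standalone sketch of that behavior (illustrative only, not part of this diff; it re-declares a stripped-down `VariableVersion` without `C10_API` so it compiles on its own):

#include <atomic>
#include <cassert>
#include <cstdint>
#include <memory>

// Stripped-down copy of the VariableVersion shown in the hunk above.
struct VariableVersion {
  VariableVersion(uint32_t version = 0)
      : version_block_(std::make_shared<std::atomic<uint32_t>>(version)) {}
  void bump() noexcept { version_block_->fetch_add(1); }
  uint32_t current_version() const noexcept { return version_block_->load(); }
 private:
  std::shared_ptr<std::atomic<uint32_t>> version_block_;
};

int main() {
  VariableVersion base;                     // version counter of a base tensor
  VariableVersion view = base;              // a view copies VariableVersion -> shares the atomic block
  uint32_t saved = base.current_version();  // what autograd records when it saves the tensor

  view.bump();                              // an in-place op through the view bumps the shared counter

  assert(base.current_version() == saved + 1);  // the base observes the same bump
  // At backward time autograd compares the saved value against current_version();
  // a mismatch means the saved tensor was modified in place.
  return 0;
}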
@@ -845,13 +900,18 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
    return std::move(autograd_meta_);
  }

-  // NOTE: `shallow_copy_and_detach()` does not copy the AutogradMeta pointer
-  // because it is unique for each Variable.
+  // NOTE: `shallow_copy_and_detach()` does not copy the following TensorImpl fields:
+  // 1. the AutogradMeta pointer, because it is unique for each Variable.
+  // 2. the version counter, because although it lives in TensorImpl, the version counter is managed
+  // by autograd, and the call sites of `shallow_copy_and_detach()` (from autograd) should decide what
+  // the version counter should be for each new TensorImpl. See NOTE [ Version Counter Sharing ] for details.
+  //
  // NOTE: We don't set `allow_tensor_metadata_change_` to false here, because there are call sites
  // to this function that need to change the shallow copy's size or storage afterwards, and setting
  // `allow_tensor_metadata_change_` to false would prevent those changes from happening and is
  // undesirable.
  virtual c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach() const {
+    AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged
    auto impl = c10::make_intrusive<TensorImpl>(Storage(storage()), type_id());
    impl->set_sizes_and_strides(sizes(), strides());
    impl->storage_offset_ = storage_offset_;
@@ -862,6 +922,19 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
    return impl;
  }

+  void set_version_counter(
+      const c10::VariableVersion& version_counter) noexcept {
+    version_counter_ = version_counter;
+  }
+
+  const c10::VariableVersion& version_counter() const noexcept {
+    return version_counter_;
+  }
+
+  void bump_version() noexcept {
+    version_counter_.bump();
+  }
+
  inline void set_pyobj(PyObject* pyobj) noexcept {
    pyobj_ = pyobj;
  }
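Taken together with the `shallow_copy_and_detach()` note above, the intended division of labor is: TensorImpl stores the counter, autograd decides when to share or bump it. A hypothetical sketch of an autograd-side caller (the names `make_view_impl` and `apply_inplace_op` are invented for illustration; the real call sites live in the autograd code and are not part of this diff):

// Assumes the TensorImpl declarations from this change are visible.
c10::intrusive_ptr<c10::TensorImpl> make_view_impl(const c10::TensorImpl& base) {
  auto impl = base.shallow_copy_and_detach();  // leaves version_counter_ default-constructed
  // Rule 1 of NOTE [ Version Counter Sharing ]: a view shares the base's counter.
  impl->set_version_counter(base.version_counter());
  return impl;
}

void apply_inplace_op(c10::TensorImpl& self) {
  // ... mutate self's data or sizes in place ...
  self.bump_version();  // lets saved variables that share this counter detect the change
}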
@@ -1384,6 +1457,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
  // at a time).
  std::unique_ptr<c10::AutogradMetaInterface> autograd_meta_ = nullptr;

+  c10::VariableVersion version_counter_;
+
  PyObject* pyobj_ = nullptr; // weak reference

  // We could save a word or two by combining the SmallVector structs,
@@ -1470,6 +1545,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
  //    weak refcount
  //    storage pointer
  //    autograd metadata pointer
+  //    version counter (word 0)
+  //    version counter (word 1)
  //    PyObject pointer
  //    sizes SmallVector (begin)
  //    sizes SmallVector (end)
@@ -1494,7 +1571,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
  // miscellaneous bitfield
  //
  static_assert(sizeof(void*) != sizeof(int64_t) ||  // if 64-bit...
-                sizeof(TensorImpl) == sizeof(int64_t) * 27,
+                sizeof(TensorImpl) == sizeof(int64_t) * 29,
                "You changed the size of TensorImpl on 64-bit arch."
                " See Note [TensorImpl size constraints] on how to proceed.");
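For the size accounting in the two hunks above: `version_counter_` is a `VariableVersion`, whose only member is a `std::shared_ptr`, and on the common 64-bit ABIs a `shared_ptr` occupies two pointers (object pointer plus control-block pointer), hence the two extra words and the 27 -> 29 bump. A standalone check of that assumption (illustrative; the two-pointer layout is implementation-defined, not mandated by the standard):

#include <atomic>
#include <cstdint>
#include <memory>

// Holds for libstdc++, libc++, and MSVC on 64-bit targets, which is what the
// TensorImpl static_assert implicitly relies on when it counts two words for the counter.
static_assert(sizeof(std::shared_ptr<std::atomic<uint32_t>>) == 2 * sizeof(void*),
              "expected shared_ptr to be two pointers wide on this platform");

int main() { return 0; }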