changeset 59068:252a1602b4c6

8243326: Cleanup use of volatile in taskqueue code Summary: Removed volatile on queue elements, cleaned up other uses, made atomics explicit. Reviewed-by: tschatzl, iwalulya
author kbarrett
date Tue, 14 Apr 2020 02:25:19 -0400
parents e7c08ffff4d9
children 724aff747bb2
files src/hotspot/share/gc/g1/g1ConcurrentMark.hpp src/hotspot/share/gc/shared/taskqueue.hpp src/hotspot/share/gc/shared/taskqueue.inline.hpp
diffstat 3 files changed, 166 insertions(+), 145 deletions(-) [+]
line wrap: on
line diff
--- a/src/hotspot/share/gc/g1/g1ConcurrentMark.hpp	Wed Apr 29 13:27:01 2020 +0200
+++ b/src/hotspot/share/gc/g1/g1ConcurrentMark.hpp	Tue Apr 14 02:25:19 2020 -0400
@@ -65,22 +65,13 @@
   }
   G1TaskQueueEntry(HeapWord* addr) : _holder((void*)((uintptr_t)addr | ArraySliceBit)) { }
 public:
-  G1TaskQueueEntry(const G1TaskQueueEntry& other) { _holder = other._holder; }
+
   G1TaskQueueEntry() : _holder(NULL) { }
+  // Trivially copyable, for use in GenericTaskQueue.
 
   static G1TaskQueueEntry from_slice(HeapWord* what) { return G1TaskQueueEntry(what); }
   static G1TaskQueueEntry from_oop(oop obj) { return G1TaskQueueEntry(obj); }
 
-  G1TaskQueueEntry& operator=(const G1TaskQueueEntry& t) {
-    _holder = t._holder;
-    return *this;
-  }
-
-  volatile G1TaskQueueEntry& operator=(const volatile G1TaskQueueEntry& t) volatile {
-    _holder = t._holder;
-    return *this;
-  }
-
   oop obj() const {
     assert(!is_array_slice(), "Trying to read array slice " PTR_FORMAT " as oop", p2i(_holder));
     return (oop)_holder;
--- a/src/hotspot/share/gc/shared/taskqueue.hpp	Wed Apr 29 13:27:01 2020 +0200
+++ b/src/hotspot/share/gc/shared/taskqueue.hpp	Tue Apr 14 02:25:19 2020 -0400
@@ -28,6 +28,8 @@
 #include "memory/allocation.hpp"
 #include "memory/padded.hpp"
 #include "oops/oopsHierarchy.hpp"
+#include "runtime/atomic.hpp"
+#include "utilities/debug.hpp"
 #include "utilities/globalDefinitions.hpp"
 #include "utilities/ostream.hpp"
 #include "utilities/stack.hpp"
@@ -107,31 +109,23 @@
 protected:
   // Internal type for indexing the queue; also used for the tag.
   typedef NOT_LP64(uint16_t) LP64_ONLY(uint32_t) idx_t;
+  STATIC_ASSERT(N == idx_t(N)); // Ensure N fits in an idx_t.
 
-  // The first free element after the last one pushed (mod N).
-  volatile uint _bottom;
-
-  enum { MOD_N_MASK = N - 1 };
+  // N must be a power of 2 for computing modulo via masking.
+  // N must be >= 2 for the algorithm to work at all, though larger is better.
+  // C++11: is_power_of_2 is not (yet) constexpr.
+  STATIC_ASSERT((N >= 2) && ((N & (N - 1)) == 0));
+  static const uint MOD_N_MASK = N - 1;
 
   class Age {
-  public:
-    Age(size_t data = 0)         { _data = data; }
-    Age(const Age& age)          { _data = age._data; }
-    Age(idx_t top, idx_t tag)    { _fields._top = top; _fields._tag = tag; }
-
-    Age   get()        const volatile { return _data; }
-    void  set(Age age) volatile       { _data = age._data; }
+    friend class TaskQueueSuper;
 
-    idx_t top()        const volatile { return _fields._top; }
-    idx_t tag()        const volatile { return _fields._tag; }
+  public:
+    explicit Age(size_t data = 0) : _data(data) {}
+    Age(idx_t top, idx_t tag) { _fields._top = top; _fields._tag = tag; }
 
-    // Increment top; if it wraps, increment tag also.
-    void increment() {
-      _fields._top = increment_index(_fields._top);
-      if (_fields._top == 0) ++_fields._tag;
-    }
-
-    Age cmpxchg(const Age new_age, const Age old_age) volatile;
+    idx_t top() const { return _fields._top; }
+    idx_t tag() const { return _fields._tag; }
 
     bool operator ==(const Age& other) const { return _data == other._data; }
 
@@ -144,9 +138,41 @@
       size_t _data;
       fields _fields;
     };
+    STATIC_ASSERT(sizeof(size_t) >= sizeof(fields));
   };
 
-  volatile Age _age;
+  uint bottom_relaxed() const {
+    return Atomic::load(&_bottom);
+  }
+
+  uint bottom_acquire() const {
+    return Atomic::load_acquire(&_bottom);
+  }
+
+  void set_bottom_relaxed(uint new_bottom) {
+    Atomic::store(&_bottom, new_bottom);
+  }
+
+  void release_set_bottom(uint new_bottom) {
+    Atomic::release_store(&_bottom, new_bottom);
+  }
+
+  Age age_relaxed() const {
+    return Age(Atomic::load(&_age._data));
+  }
+
+  void set_age_relaxed(Age new_age) {
+    Atomic::store(&_age._data, new_age._data);
+  }
+
+  Age cmpxchg_age(Age old_age, Age new_age) {
+    return Age(Atomic::cmpxchg(&_age._data, old_age._data, new_age._data));
+  }
+
+  idx_t age_top_relaxed() const {
+    // Atomically accessing a subfield of an "atomic" member.
+    return Atomic::load(&_age._fields._top);
+  }
 
   // These both operate mod N.
   static uint increment_index(uint ind) {
@@ -163,7 +189,7 @@
   }
 
   // Returns the size corresponding to the given "bot" and "top".
-  uint size(uint bot, uint top) const {
+  uint clean_size(uint bot, uint top) const {
     uint sz = dirty_size(bot, top);
     // Has the queue "wrapped", so that bottom is less than top?  There's a
     // complicated special case here.  A pair of threads could perform pop_local
@@ -182,27 +208,46 @@
     return (sz == N - 1) ? 0 : sz;
   }
 
+private:
+  DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, 0);
+
+  // Index of the first free element after the last one pushed (mod N).
+  volatile uint _bottom;
+  DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(uint));
+
+  // top() is the index of the oldest pushed element (mod N), and tag()
+  // is the associated epoch, to distinguish different modifications of
+  // the age.  There is no available element if top() == _bottom or
+  // (_bottom - top()) mod N == N-1; the latter indicates underflow
+  // during concurrent pop_local/pop_global.
+  volatile Age _age;
+  DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(Age));
+
+  NONCOPYABLE(TaskQueueSuper);
+
 public:
   TaskQueueSuper() : _bottom(0), _age() {}
 
-  // Return true if the TaskQueue contains/does not contain any tasks.
-  bool peek()     const { return _bottom != _age.top(); }
-  bool is_empty() const { return size() == 0; }
+  // Return true if the TaskQueue contains any tasks.
+  // Unreliable if there are concurrent pushes or pops.
+  bool peek() const {
+    return bottom_relaxed() != age_top_relaxed();
+  }
+
+  bool is_empty() const {
+    return size() == 0;
+  }
 
   // Return an estimate of the number of elements in the queue.
-  // The "careful" version admits the possibility of pop_local/pop_global
-  // races.
+  // Treats pop_local/pop_global race that underflows as empty.
   uint size() const {
-    return size(_bottom, _age.top());
+    return clean_size(bottom_relaxed(), age_top_relaxed());
   }
 
-  uint dirty_size() const {
-    return dirty_size(_bottom, _age.top());
-  }
-
+  // Discard the contents of the queue.
   void set_empty() {
-    _bottom = 0;
-    _age.set(0);
+    set_bottom_relaxed(0);
+    set_age_relaxed(Age());
   }
 
   // Maximum number of elements allowed in the queue.  This is two less
@@ -211,9 +256,6 @@
   // in GenericTaskQueue.
   uint max_elems() const { return N - 2; }
 
-  // Total size of queue.
-  static const uint total_size() { return N; }
-
   TASKQUEUE_STATS_ONLY(TaskQueueStats stats;)
 };
 
@@ -254,11 +296,23 @@
   typedef typename TaskQueueSuper<N, F>::Age Age;
   typedef typename TaskQueueSuper<N, F>::idx_t idx_t;
 
-  using TaskQueueSuper<N, F>::_bottom;
-  using TaskQueueSuper<N, F>::_age;
+  using TaskQueueSuper<N, F>::MOD_N_MASK;
+
+  using TaskQueueSuper<N, F>::bottom_relaxed;
+  using TaskQueueSuper<N, F>::bottom_acquire;
+
+  using TaskQueueSuper<N, F>::set_bottom_relaxed;
+  using TaskQueueSuper<N, F>::release_set_bottom;
+
+  using TaskQueueSuper<N, F>::age_relaxed;
+  using TaskQueueSuper<N, F>::set_age_relaxed;
+  using TaskQueueSuper<N, F>::cmpxchg_age;
+  using TaskQueueSuper<N, F>::age_top_relaxed;
+
   using TaskQueueSuper<N, F>::increment_index;
   using TaskQueueSuper<N, F>::decrement_index;
   using TaskQueueSuper<N, F>::dirty_size;
+  using TaskQueueSuper<N, F>::clean_size;
 
 public:
   using TaskQueueSuper<N, F>::max_elems;
@@ -285,13 +339,14 @@
 
   // Attempts to claim a task from the "local" end of the queue (the most
   // recently pushed) as long as the number of entries exceeds the threshold.
-  // If successful, returns true and sets t to the task; otherwise, returns false
-  // (the queue is empty or the number of elements below the threshold).
-  inline bool pop_local(volatile E& t, uint threshold = 0);
+  // If successfully claims a task, returns true and sets t to the task;
+  // otherwise, returns false and t is unspecified.  May fail and return
+  // false because of a successful steal by pop_global.
+  inline bool pop_local(E& t, uint threshold = 0);
 
   // Like pop_local(), but uses the "global" end of the queue (the least
   // recently pushed).
-  bool pop_global(volatile E& t);
+  bool pop_global(E& t);
 
   // Delete any resource associated with the queue.
   ~GenericTaskQueue();
@@ -301,9 +356,10 @@
   template<typename Fn> void iterate(Fn fn);
 
 private:
-  DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, 0);
+  // Base class has trailing padding.
+
   // Element array.
-  volatile E* _elems;
+  E* _elems;
 
   DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(E*));
   // Queue owner local variables. Not to be accessed by other threads.
@@ -444,12 +500,6 @@
   virtual bool should_exit_termination() = 0;
 };
 
-#ifdef _MSC_VER
-#pragma warning(push)
-// warning C4522: multiple assignment operators specified
-#pragma warning(disable:4522)
-#endif
-
 // This is a container class for either an oop* or a narrowOop*.
 // Both are pushed onto a task queue and the consumer will test is_narrow()
 // to determine which should be processed.
@@ -468,20 +518,13 @@
     _holder = (void*)p;
   }
   StarTask()             { _holder = NULL; }
+  // Trivially copyable, for use in GenericTaskQueue.
+
   operator oop*()        { return (oop*)_holder; }
   operator narrowOop*()  {
     return (narrowOop*)((uintptr_t)_holder & ~COMPRESSED_OOP_MASK);
   }
 
-  StarTask& operator=(const StarTask& t) {
-    _holder = t._holder;
-    return *this;
-  }
-  volatile StarTask& operator=(const volatile StarTask& t) volatile {
-    _holder = t._holder;
-    return *this;
-  }
-
   bool is_narrow() const {
     return (((uintptr_t)_holder & COMPRESSED_OOP_MASK) != 0);
   }
@@ -494,19 +537,7 @@
   ObjArrayTask(oop o, size_t idx): _obj(o), _index(int(idx)) {
     assert(idx <= size_t(max_jint), "too big");
   }
-  ObjArrayTask(const ObjArrayTask& t): _obj(t._obj), _index(t._index) { }
-
-  ObjArrayTask& operator =(const ObjArrayTask& t) {
-    _obj = t._obj;
-    _index = t._index;
-    return *this;
-  }
-  volatile ObjArrayTask&
-  operator =(const volatile ObjArrayTask& t) volatile {
-    (void)const_cast<oop&>(_obj = t._obj);
-    _index = t._index;
-    return *this;
-  }
+  // Trivially copyable, for use in GenericTaskQueue.
 
   inline oop obj()   const { return _obj; }
   inline int index() const { return _index; }
@@ -518,8 +549,4 @@
   int _index;
 };
 
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
 #endif // SHARE_GC_SHARED_TASKQUEUE_HPP
--- a/src/hotspot/share/gc/shared/taskqueue.inline.hpp	Wed Apr 29 13:27:01 2020 +0200
+++ b/src/hotspot/share/gc/shared/taskqueue.inline.hpp	Tue Apr 14 02:25:19 2020 -0400
@@ -54,14 +54,14 @@
 
 template<class E, MEMFLAGS F, unsigned int N>
 inline GenericTaskQueue<E, F, N>::~GenericTaskQueue() {
-  ArrayAllocator<E>::free(const_cast<E*>(_elems), N);
+  ArrayAllocator<E>::free(_elems, N);
 }
 
 template<class E, MEMFLAGS F, unsigned int N> inline bool
 GenericTaskQueue<E, F, N>::push(E t) {
-  uint localBot = _bottom;
+  uint localBot = bottom_relaxed();
   assert(localBot < N, "_bottom out of range.");
-  idx_t top = _age.top();
+  idx_t top = age_top_relaxed();
   uint dirty_n_elems = dirty_size(localBot, top);
   // A dirty_size of N-1 cannot happen in push.  Considering only push:
   // (1) dirty_n_elems is initially 0.
@@ -75,13 +75,8 @@
   // so can't underflow to -1 (== N-1) with push.
   assert(dirty_n_elems <= max_elems(), "n_elems out of range.");
   if (dirty_n_elems < max_elems()) {
-    // g++ complains if the volatile result of the assignment is
-    // unused, so we cast the volatile away.  We cannot cast directly
-    // to void, because gcc treats that as not using the result of the
-    // assignment.  However, casting to E& means that we trigger an
-    // unused-value warning.  So, we cast the E& to void.
-    (void) const_cast<E&>(_elems[localBot] = t);
-    Atomic::release_store(&_bottom, increment_index(localBot));
+    _elems[localBot] = t;
+    release_set_bottom(increment_index(localBot));
     TASKQUEUE_STATS_ONLY(stats.record_push());
     return true;
   }
@@ -89,8 +84,7 @@
 }
 
 template <class E, MEMFLAGS F, unsigned int N>
-inline bool OverflowTaskQueue<E, F, N>::push(E t)
-{
+inline bool OverflowTaskQueue<E, F, N>::push(E t) {
   if (!taskqueue_t::push(t)) {
     overflow_stack()->push(t);
     TASKQUEUE_STATS_ONLY(stats.record_overflow(overflow_stack()->size()));
@@ -114,61 +108,56 @@
   // This queue was observed to contain exactly one element; either this
   // thread will claim it, or a competing "pop_global".  In either case,
   // the queue will be logically empty afterwards.  Create a new Age value
-  // that represents the empty queue for the given value of "_bottom".  (We
+  // that represents the empty queue for the given value of "bottom".  (We
   // must also increment "tag" because of the case where "bottom == 1",
   // "top == 0".  A pop_global could read the queue element in that case,
   // then have the owner thread do a pop followed by another push.  Without
   // the incrementing of "tag", the pop_global's CAS could succeed,
   // allowing it to believe it has claimed the stale element.)
-  Age newAge((idx_t)localBot, oldAge.tag() + 1);
+  Age newAge((idx_t)localBot, (idx_t)(oldAge.tag() + 1));
   // Perhaps a competing pop_global has already incremented "top", in which
   // case it wins the element.
   if (localBot == oldAge.top()) {
     // No competing pop_global has yet incremented "top"; we'll try to
     // install new_age, thus claiming the element.
-    Age tempAge = _age.cmpxchg(newAge, oldAge);
+    Age tempAge = cmpxchg_age(oldAge, newAge);
     if (tempAge == oldAge) {
       // We win.
-      assert(dirty_size(localBot, _age.top()) != N - 1, "sanity");
+      assert(dirty_size(localBot, age_top_relaxed()) != N - 1, "sanity");
       TASKQUEUE_STATS_ONLY(stats.record_pop_slow());
       return true;
     }
   }
-  // We lose; a completing pop_global gets the element.  But the queue is empty
+  // We lose; a competing pop_global got the element.  But the queue is empty
   // and top is greater than bottom.  Fix this representation of the empty queue
   // to become the canonical one.
-  _age.set(newAge);
-  assert(dirty_size(localBot, _age.top()) != N - 1, "sanity");
+  set_age_relaxed(newAge);
+  assert(dirty_size(localBot, age_top_relaxed()) != N - 1, "sanity");
   return false;
 }
 
 template<class E, MEMFLAGS F, unsigned int N> inline bool
-GenericTaskQueue<E, F, N>::pop_local(volatile E& t, uint threshold) {
-  uint localBot = _bottom;
+GenericTaskQueue<E, F, N>::pop_local(E& t, uint threshold) {
+  uint localBot = bottom_relaxed();
   // This value cannot be N-1.  That can only occur as a result of
   // the assignment to bottom in this method.  If it does, this method
   // resets the size to 0 before the next call (which is sequential,
   // since this is pop_local.)
-  uint dirty_n_elems = dirty_size(localBot, _age.top());
+  uint dirty_n_elems = dirty_size(localBot, age_top_relaxed());
   assert(dirty_n_elems != N - 1, "Shouldn't be possible...");
   if (dirty_n_elems <= threshold) return false;
   localBot = decrement_index(localBot);
-  _bottom = localBot;
+  set_bottom_relaxed(localBot);
   // This is necessary to prevent any read below from being reordered
   // before the store just above.
   OrderAccess::fence();
-  // g++ complains if the volatile result of the assignment is
-  // unused, so we cast the volatile away.  We cannot cast directly
-  // to void, because gcc treats that as not using the result of the
-  // assignment.  However, casting to E& means that we trigger an
-  // unused-value warning.  So, we cast the E& to void.
-  (void) const_cast<E&>(t = _elems[localBot]);
+  t = _elems[localBot];
   // This is a second read of "age"; the "size()" above is the first.
   // If there's still at least one element in the queue, based on the
   // "_bottom" and "age" we've read, then there can be no interference with
   // a "pop_global" operation, and we're done.
-  idx_t tp = _age.top();    // XXX
-  if (size(localBot, tp) > 0) {
+  idx_t tp = age_top_relaxed();
+  if (clean_size(localBot, tp) > 0) {
     assert(dirty_size(localBot, tp) != N - 1, "sanity");
     TASKQUEUE_STATS_ONLY(stats.record_pop());
     return true;
@@ -176,11 +165,11 @@
     // Otherwise, the queue contained exactly one element; we take the slow
     // path.
 
-    // The barrier is required to prevent reordering the two reads of _age:
-    // one is the _age.get() below, and the other is _age.top() above the if-stmt.
-    // The algorithm may fail if _age.get() reads an older value than _age.top().
+    // The barrier is required to prevent reordering the two reads of age:
+    // one is the age() below, and the other is age_top() above the if-stmt.
+    // The algorithm may fail if age() reads an older value than age_top().
     OrderAccess::loadload();
-    return pop_local_slow(localBot, _age.get());
+    return pop_local_slow(localBot, age_relaxed());
   }
 }
 
@@ -192,9 +181,30 @@
   return true;
 }
 
+// A pop_global operation may read an element that is being concurrently
+// written by a push operation.  The pop_global operation will not use
+// such an element, returning failure instead.  But the concurrent read
+// and write places requirements on the element type.
+//
+// Strictly, such concurrent reads and writes are undefined behavior.
+// We ignore that. Instead we require that whatever value tearing may
+// occur as a result is benign. A trivially copyable type (C++14 3.9/9)
+// satisfies the requirement. But we might use classes such as oop that
+// are not trivially copyable (in some build configurations).  Such
+// classes need to be carefully examined with this requirement in mind.
+//
+// The sequence where such a read/write collision can arise is as follows.
+// Assume there is one value in the queue, so bottom == top+1.
+// (1) Thief is doing a pop_global.  It has read age and bottom, and its
+// captured (localBottom - oldAge.top) == 1.
+// (2) Owner does a pop_local and wins the race for that element.  It
+// decrements bottom and increments the age tag.
+// (3) Owner starts a push, writing elems[bottom].  At the same time, Thief
+// reads elems[oldAge.top].  The owner's bottom == the thief's oldAge.top.
+// (4) Thief will discard the read value, because its cmpxchg of age will fail.
 template<class E, MEMFLAGS F, unsigned int N>
-bool GenericTaskQueue<E, F, N>::pop_global(volatile E& t) {
-  Age oldAge = _age.get();
+bool GenericTaskQueue<E, F, N>::pop_global(E& t) {
+  Age oldAge = age_relaxed();
 #ifndef CPU_MULTI_COPY_ATOMIC
   // Architectures with non-multi-copy-atomic memory model require a
   // full fence here to guarantee that bottom is not older than age,
@@ -212,26 +222,24 @@
   OrderAccess::fence();
 #else
   // Everyone else can make do with a LoadLoad barrier to keep reads
-  // from _age and _bottom in order.
+  // from age and bottom in order.
   OrderAccess::loadload();
 #endif
-  uint localBot = Atomic::load_acquire(&_bottom);
-  uint n_elems = size(localBot, oldAge.top());
+  uint localBot = bottom_acquire();
+  uint n_elems = clean_size(localBot, oldAge.top());
   if (n_elems == 0) {
     return false;
   }
 
-  // g++ complains if the volatile result of the assignment is
-  // unused, so we cast the volatile away.  We cannot cast directly
-  // to void, because gcc treats that as not using the result of the
-  // assignment.  However, casting to E& means that we trigger an
-  // unused-value warning.  So, we cast the E& to void.
-  (void) const_cast<E&>(t = _elems[oldAge.top()]);
-  Age newAge(oldAge);
-  newAge.increment();
-  Age resAge = _age.cmpxchg(newAge, oldAge);
+  t = _elems[oldAge.top()];
+  // Increment top; if it wraps, also increment tag, to distinguish it
+  // from any recent _age for the same top() index.
+  idx_t new_top = increment_index(oldAge.top());
+  idx_t new_tag = oldAge.tag() + ((new_top == 0) ? 1 : 0);
+  Age newAge(new_top, new_tag);
+  Age resAge = cmpxchg_age(oldAge, newAge);
 
-  // Note that using "_bottom" here might fail, since a pop_local might
+  // Note that using "bottom" here might fail, since a pop_local might
   // have decremented it.
   assert(dirty_size(localBot, newAge.top()) != N - 1, "sanity");
   return resAge == oldAge;
@@ -324,19 +332,14 @@
   return false;
 }
 
-template <unsigned int N, MEMFLAGS F>
-inline typename TaskQueueSuper<N, F>::Age TaskQueueSuper<N, F>::Age::cmpxchg(const Age new_age, const Age old_age) volatile {
-  return Atomic::cmpxchg(&_data, old_age._data, new_age._data);
-}
-
 template<class E, MEMFLAGS F, unsigned int N>
 template<class Fn>
 inline void GenericTaskQueue<E, F, N>::iterate(Fn fn) {
   uint iters = size();
-  uint index = _bottom;
+  uint index = bottom_relaxed();
   for (uint i = 0; i < iters; ++i) {
     index = decrement_index(index);
-    fn(const_cast<E&>(_elems[index])); // cast away volatility
+    fn(_elems[index]);
   }
 }