changeset 8892:22400a67babe

Merge
author amurillo
date Thu, 27 Aug 2015 14:40:19 -0700
parents 8e8377739c06 52bbd44b2b7d
children 4142c190cd5c 2e70148efaa5 3ca7e75b4e42
files src/share/vm/gc/g1/g1CollectorPolicy_ext.hpp
diffstat 145 files changed, 5174 insertions(+), 2711 deletions(-)
line diff
--- a/agent/src/share/classes/sun/jvm/hotspot/debugger/Address.java	Thu Aug 27 12:59:50 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/Address.java	Thu Aug 27 14:40:19 2015 -0700
@@ -209,4 +209,7 @@
       returns the result as an Address. Returns null if the result was
       zero. */
   public Address    xorWithMask(long mask) throws UnsupportedOperationException;
+
+  // Returns the address as a long integer.
+  public long asLongValue();
 }
--- a/agent/src/share/classes/sun/jvm/hotspot/debugger/bsd/BsdAddress.java	Thu Aug 27 12:59:50 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/bsd/BsdAddress.java	Thu Aug 27 14:40:19 2015 -0700
@@ -288,7 +288,7 @@
     return new BsdAddress(debugger, value);
   }
 
-
+  public long asLongValue() { return addr; }
   //--------------------------------------------------------------------------------
   // Internals only below this point
   //
--- a/agent/src/share/classes/sun/jvm/hotspot/debugger/dummy/DummyAddress.java	Thu Aug 27 12:59:50 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/dummy/DummyAddress.java	Thu Aug 27 14:40:19 2015 -0700
@@ -275,6 +275,7 @@
     return new DummyAddress(debugger, value);
   }
 
+  public long asLongValue() { return addr; }
   //--------------------------------------------------------------------------------
   // Internals only below this point
   //
--- a/agent/src/share/classes/sun/jvm/hotspot/debugger/linux/LinuxAddress.java	Thu Aug 27 12:59:50 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/linux/LinuxAddress.java	Thu Aug 27 14:40:19 2015 -0700
@@ -288,6 +288,7 @@
     return new LinuxAddress(debugger, value);
   }
 
+  public long asLongValue() { return addr; }
 
   //--------------------------------------------------------------------------------
   // Internals only below this point
--- a/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/ProcAddress.java	Thu Aug 27 12:59:50 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/ProcAddress.java	Thu Aug 27 14:40:19 2015 -0700
@@ -283,7 +283,7 @@
     return new ProcAddress(debugger, value);
   }
 
-
+  public long asLongValue() { return addr; }
   //--------------------------------------------------------------------------------
   // Internals only below this point
   //
--- a/agent/src/share/classes/sun/jvm/hotspot/debugger/remote/RemoteAddress.java	Thu Aug 27 12:59:50 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/remote/RemoteAddress.java	Thu Aug 27 14:40:19 2015 -0700
@@ -281,7 +281,7 @@
     return new RemoteAddress(debugger, value);
   }
 
-
+  public long asLongValue() { return addr; }
   //--------------------------------------------------------------------------------
   // Internals only below this point
   //
--- a/agent/src/share/classes/sun/jvm/hotspot/debugger/windbg/WindbgAddress.java	Thu Aug 27 12:59:50 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/windbg/WindbgAddress.java	Thu Aug 27 14:40:19 2015 -0700
@@ -292,6 +292,7 @@
     return new WindbgAddress(debugger, value);
   }
 
+  public long asLongValue() { return addr; }
 
   //--------------------------------------------------------------------------------
   // Internals only below this point
--- a/agent/src/share/classes/sun/jvm/hotspot/oops/Symbol.java	Thu Aug 27 12:59:50 2015 -0700
+++ b/agent/src/share/classes/sun/jvm/hotspot/oops/Symbol.java	Thu Aug 27 14:40:19 2015 -0700
@@ -80,10 +80,19 @@
   public byte getByteAt(long index) {
     return addr.getJByteAt(baseOffset + index);
   }
-
+  // _identity_hash is a short
   private static CIntegerField idHash;
 
-  public int identityHash() { return     (int)idHash.getValue(this.addr); }
+  public int identityHash() {
+    long addr_value = getAddress().asLongValue();
+    int  addr_bits = (int)(addr_value >> (VM.getVM().getLogMinObjAlignmentInBytes() + 3));
+    int  length = (int)getLength();
+    int  byte0 = getByteAt(0);
+    int  byte1 = getByteAt(1);
+    int  id_hash = (int)(0xffff & idHash.getValue(this.addr));
+    return id_hash |
+           ((addr_bits ^ (length << 8) ^ ((byte0 << 8) | byte1)) << 16);
+  }
 
   public boolean equals(byte[] modUTF8Chars) {
     int l = (int) getLength();
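
The new identityHash() combines the low 16 bits of the C++ _identity_hash field (a short) with 16 high bits derived from the Symbol's address, its length and its first two bytes. The following self-contained Java sketch mirrors that composition; the class name and the values passed in main are made-up stand-ins for VM state (address, alignment shift, field contents), not values read from a real VM:

    public class SymbolHashSketch {
        // mirrors Symbol.identityHash(): low 16 bits from _identity_hash,
        // high 16 bits mixing address bits, the symbol length and its
        // first two bytes
        static int identityHash(long addrValue, int logMinObjAlignmentInBytes,
                                short idHash, int length, byte byte0, byte byte1) {
            int addrBits = (int) (addrValue >> (logMinObjAlignmentInBytes + 3));
            int idHashLow = 0xffff & idHash;
            return idHashLow |
                   ((addrBits ^ (length << 8) ^ ((byte0 << 8) | byte1)) << 16);
        }

        public static void main(String[] args) {
            // hypothetical example values
            int h = identityHash(0x00007f1234567890L, 3, (short) 0x2a5c, 4,
                                 (byte) 'j', (byte) 'a');
            System.out.println(Integer.toHexString(h));
        }
    }
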
--- a/src/cpu/aarch64/vm/aarch64.ad	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/cpu/aarch64/vm/aarch64.ad	Thu Aug 27 14:40:19 2015 -0700
@@ -1033,27 +1033,39 @@
 };
 
   // graph traversal helpers
-  MemBarNode *has_parent_membar(const Node *n,
-				ProjNode *&ctl, ProjNode *&mem);
-  MemBarNode *has_child_membar(const MemBarNode *n,
-			       ProjNode *&ctl, ProjNode *&mem);
+
+  MemBarNode *parent_membar(const Node *n);
+  MemBarNode *child_membar(const MemBarNode *n);
+  bool leading_membar(const MemBarNode *barrier);
+
+  bool is_card_mark_membar(const MemBarNode *barrier);
+
+  MemBarNode *leading_to_normal(MemBarNode *leading);
+  MemBarNode *normal_to_leading(const MemBarNode *barrier);
+  MemBarNode *card_mark_to_trailing(const MemBarNode *barrier);
+  MemBarNode *trailing_to_card_mark(const MemBarNode *trailing);
+  MemBarNode *trailing_to_leading(const MemBarNode *trailing);
 
   // predicates controlling emit of ldr<x>/ldar<x> and associated dmb
+
   bool unnecessary_acquire(const Node *barrier);
   bool needs_acquiring_load(const Node *load);
 
   // predicates controlling emit of str<x>/stlr<x> and associated dmbs
+
   bool unnecessary_release(const Node *barrier);
   bool unnecessary_volatile(const Node *barrier);
   bool needs_releasing_store(const Node *store);
 
-  // Use barrier instructions for unsafe volatile gets rather than
-  // trying to identify an exact signature for them
-  const bool UseBarriersForUnsafeVolatileGet = false;
+  // predicate controlling translation of StoreCM
+  bool unnecessary_storestore(const Node *storecm);
 %}
 
 source %{
 
+  // Optimization of volatile gets and puts
+  // -------------------------------------
+  //
   // AArch64 has ldar<x> and stlr<x> instructions which we can safely
   // use to implement volatile reads and writes. For a volatile read
   // we simply need
@@ -1102,15 +1114,19 @@
   // A volatile write is translated to the node sequence
   //
   //   MemBarRelease
-  //   StoreX[mo_release]
+  //   StoreX[mo_release] {CardMark}-optional
   //   MemBarVolatile
   //
   // n.b. the above node patterns are generated with a strict
   // 'signature' configuration of input and output dependencies (see
-  // the predicates below for exact details). The two signatures are
-  // unique to translated volatile reads/stores -- they will not
-  // appear as a result of any other bytecode translation or inlining
-  // nor as a consequence of optimizing transforms.
+  // the predicates below for exact details). The card mark may be as
+  // simple as a few extra nodes or, in a few GC configurations, may
+  // include more complex control flow between the leading and
+  // trailing memory barriers. However, whatever the card mark
+  // configuration these signatures are unique to translated volatile
+  // reads/stores -- they will not appear as a result of any other
+  // bytecode translation or inlining nor as a consequence of
+  // optimizing transforms.
   //
   // We also want to catch inlined unsafe volatile gets and puts and
   // be able to implement them using either ldar<x>/stlr<x> or some
@@ -1122,7 +1138,7 @@
   //
   //   MemBarRelease
   //   MemBarCPUOrder
-  //   StoreX[mo_release]
+  //   StoreX[mo_release] {CardMark}-optional
   //   MemBarVolatile
   //
   // n.b. as an aside, the cpuorder membar is not itself subject to
@@ -1130,7 +1146,7 @@
   // predicates need to detect its presence in order to correctly
   // select the desired adlc rules.
   //
-  // Inlined unsafe volatiles gets manifest as a somewhat different
+  // Inlined unsafe volatile gets manifest as a somewhat different
   // node sequence to a normal volatile get
   //
   //   MemBarCPUOrder
@@ -1173,33 +1189,22 @@
   // n.b. the translation rules below which rely on detection of the
   // volatile signatures and insert ldar<x> or stlr<x> are failsafe.
   // If we see anything other than the signature configurations we
-  // always just translate the loads and stors to ldr<x> and str<x>
+  // always just translate the loads and stores to ldr<x> and str<x>
   // and translate acquire, release and volatile membars to the
   // relevant dmb instructions.
   //
-  // n.b.b as a case in point for the above comment, the current
-  // predicates don't detect the precise signature for certain types
-  // of volatile object stores (where the heap_base input type is not
-  // known at compile-time to be non-NULL). In those cases the
-  // MemBarRelease and MemBarVolatile bracket an if-then-else sequence
-  // with a store in each branch (we need a different store depending
-  // on whether heap_base is actually NULL). In such a case we will
-  // just plant a dmb both before and after the branch/merge. The
-  // predicate could (and probably should) be fixed later to also
-  // detect this case.
-
-  // graph traversal helpers
+
+  // graph traversal helpers used for volatile put/get optimization
+
+  // 1) general purpose helpers
 
   // if node n is linked to a parent MemBarNode by an intervening
-  // Control or Memory ProjNode return the MemBarNode otherwise return
+  // Control and Memory ProjNodes return the MemBarNode otherwise return
   // NULL.
   //
   // n may only be a Load or a MemBar.
-  //
-  // The ProjNode* references c and m are used to return the relevant
-  // nodes.
-
-  MemBarNode *has_parent_membar(const Node *n, ProjNode *&c, ProjNode *&m)
+
+  MemBarNode *parent_membar(const Node *n)
   {
     Node *ctl = NULL;
     Node *mem = NULL;
@@ -1218,15 +1223,11 @@
     if (!ctl || !mem || !ctl->is_Proj() || !mem->is_Proj())
       return NULL;
 
-    c = ctl->as_Proj();
-
     membar = ctl->lookup(0);
 
     if (!membar || !membar->is_MemBar())
       return NULL;
 
-    m = mem->as_Proj();
-
     if (mem->lookup(0) != membar)
       return NULL;
 
@@ -1235,12 +1236,8 @@
 
   // if n is linked to a child MemBarNode by intervening Control and
   // Memory ProjNodes return the MemBarNode otherwise return NULL.
-  //
-  // The ProjNode** arguments c and m are used to return pointers to
-  // the relevant nodes. A null argument means don't don't return a
-  // value.
-
-  MemBarNode *has_child_membar(const MemBarNode *n, ProjNode *&c, ProjNode *&m)
+
+  MemBarNode *child_membar(const MemBarNode *n)
   {
     ProjNode *ctl = n->proj_out(TypeFunc::Control);
     ProjNode *mem = n->proj_out(TypeFunc::Memory);
@@ -1249,9 +1246,6 @@
     if (! ctl || ! mem)
       return NULL;
 
-    c = ctl;
-    m = mem;
-
     MemBarNode *child = NULL;
     Node *x;
 
@@ -1279,9 +1273,838 @@
     return NULL;
   }
 
+  // helper predicate used to filter candidates for a leading memory
+  // barrier
+  //
+  // returns true if barrier is a MemBarRelease or a MemBarCPUOrder
+  // whose Ctl and Mem feeds come from a MemBarRelease otherwise false
+
+  bool leading_membar(const MemBarNode *barrier)
+  {
+    int opcode = barrier->Opcode();
+    // if this is a release membar we are ok
+    if (opcode == Op_MemBarRelease)
+      return true;
+    // if its a cpuorder membar . . .
+    if (opcode != Op_MemBarCPUOrder)
+      return false;
+    // then the parent has to be a release membar
+    MemBarNode *parent = parent_membar(barrier);
+    if (!parent)
+      return false;
+    opcode = parent->Opcode();
+    return opcode == Op_MemBarRelease;
+  }
+ 
+  // 2) card mark detection helper
+
+  // helper predicate which can be used to detect a volatile membar
+  // introduced as part of a conditional card mark sequence either by
+  // G1 or by CMS when UseCondCardMark is true.
+  //
+  // membar can be definitively determined to be part of a card mark
+  // sequence if and only if all the following hold
+  //
+  // i) it is a MemBarVolatile
+  //
+  // ii) either UseG1GC or (UseConcMarkSweepGC && UseCondCardMark) is
+  // true
+  //
+  // iii) the node's Mem projection feeds a StoreCM node.
+  
+  bool is_card_mark_membar(const MemBarNode *barrier)
+  {
+    if (!UseG1GC && !(UseConcMarkSweepGC && UseCondCardMark))
+      return false;
+
+    if (barrier->Opcode() != Op_MemBarVolatile)
+      return false;
+
+    ProjNode *mem = barrier->proj_out(TypeFunc::Memory);
+
+    for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax ; i++) {
+      Node *y = mem->fast_out(i);
+      if (y->Opcode() == Op_StoreCM) {
+	return true;
+      }
+    }
+  
+    return false;
+  }
+
+
+  // 3) helper predicates to traverse volatile put graphs which may
+  // contain GC barrier subgraphs
+
+  // Preamble
+  // --------
+  //
+  // for volatile writes we can omit generating barriers and employ a
+  // releasing store when we see a node sequence with a
+  // leading MemBarRelease and a trailing MemBarVolatile as follows
+  //
+  //   MemBarRelease
+  //  {      ||      } -- optional
+  //  {MemBarCPUOrder}
+  //         ||     \\
+  //         ||     StoreX[mo_release]
+  //         | \     /
+  //         | MergeMem
+  //         | /
+  //   MemBarVolatile
+  //
+  // where
+  //  || and \\ represent Ctl and Mem feeds via Proj nodes
+  //  | \ and / indicate further routing of the Ctl and Mem feeds
+  // 
+  // this is the graph we see for non-object stores. however, for a
+  // volatile Object store (StoreN/P) we may see other nodes below the
+  // leading membar because of the need for a GC pre- or post-write
+  // barrier.
+  //
+  // with most GC configurations we will see this simple variant which
+  // includes a post-write barrier card mark.
+  //
+  //   MemBarRelease______________________________
+  //         ||    \\               Ctl \        \\
+  //         ||    StoreN/P[mo_release] CastP2X  StoreB/CM
+  //         | \     /                       . . .  /
+  //         | MergeMem
+  //         | /
+  //         ||      /
+  //   MemBarVolatile
+  //
+  // i.e. the leading membar feeds Ctl to a CastP2X (which converts
+  // the object address to an int used to compute the card offset) and
+  // Ctl+Mem to a StoreB node (which does the actual card mark).
+  //
+  // n.b. a StoreCM node will only appear in this configuration when
+  // using CMS. StoreCM differs from a normal card mark write (StoreB)
+  // because it implies a requirement to order visibility of the card
+  // mark (StoreCM) relative to the object put (StoreP/N) using a
+  // StoreStore memory barrier (arguably this ought to be represented
+  // explicitly in the ideal graph but that is not how it works). This
+  // ordering is required for both non-volatile and volatile
+  // puts. Normally that means we need to translate a StoreCM using
+  // the sequence
+  //
+  //   dmb ishst
+  //   stlrb
+  //
+  // However, in the case of a volatile put if we can recognise this
+  // configuration and plant an stlr for the object write then we can
+  // omit the dmb and just plant an strb since visibility of the stlr
+  // is ordered before visibility of subsequent stores. StoreCM nodes
+  // also arise when using G1 or using CMS with conditional card
+  // marking. In these cases (as we shall see) we don't need to insert
+  // the dmb when translating StoreCM because there is already an
+  // intervening StoreLoad barrier between it and the StoreP/N.
+  //
+  // It is also possible to perform the card mark conditionally on it
+  // currently being unmarked in which case the volatile put graph
+  // will look slightly different
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder___________________________________________
+  //         ||    \\               Ctl \     Ctl \     \\  Mem \
+  //         ||    StoreN/P[mo_release] CastP2X   If   LoadB     |
+  //         | \     /                              \            |
+  //         | MergeMem                            . . .      StoreB
+  //         | /                                                /
+  //         ||     /
+  //   MemBarVolatile
+  //
+  // It is worth noting at this stage that both the above
+  // configurations can be uniquely identified by checking that the
+  // memory flow includes the following subgraph:
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder
+  //          |  \      . . .
+  //          |  StoreX[mo_release]  . . .
+  //          |   /
+  //         MergeMem
+  //          |
+  //   MemBarVolatile
+  //
+  // This is referred to as a *normal* subgraph. It can easily be
+  // detected starting from any candidate MemBarRelease,
+  // StoreX[mo_release] or MemBarVolatile.
+  //
+  // the code below uses two helper predicates, leading_to_normal and
+  // normal_to_leading to identify this configuration, one validating
+  // the layout starting from the top membar and searching down and
+  // the other validating the layout starting from the lower membar
+  // and searching up.
+  //
+  // There are two special case GC configurations when a normal graph
+  // may not be generated: when using G1 (which always employs a
+  // conditional card mark); and when using CMS with conditional card
+  // marking configured. These GCs are both concurrent rather than
+  // stop-the-world GCs. So they introduce extra Ctl+Mem flow into the
+  // graph between the leading and trailing membar nodes, in
+  // particular enforcing stronger memory serialisation between the
+  // object put and the corresponding conditional card mark. CMS
+  // employs a post-write GC barrier while G1 employs both a pre- and
+  // post-write GC barrier. Of course the extra nodes may be absent --
+  // they are only inserted for object puts. This significantly
+  // complicates the task of identifying whether a MemBarRelease,
+  // StoreX[mo_release] or MemBarVolatile forms part of a volatile put
+  // when using these GC configurations (see below).
+  //
+  // In both cases the post-write subtree includes an auxiliary
+  // MemBarVolatile (StoreLoad barrier) separating the object put and
+  // the read of the corresponding card. This poses two additional
+  // problems.
+  //
+  // Firstly, a card mark MemBarVolatile needs to be distinguished
+  // from a normal trailing MemBarVolatile. Resolving this first
+  // problem is straightforward: a card mark MemBarVolatile always
+  // projects a Mem feed to a StoreCM node and that is a unique marker
+  //
+  //      MemBarVolatile (card mark)
+  //       C |    \     . . .
+  //         |   StoreCM   . . .
+  //       . . .
+  //
+  // The second problem is how the code generator should translate the
+  // card mark barrier. It always needs to be translated to a "dmb
+  // ish" instruction whether or not it occurs as part of a volatile
+  // put. A StoreLoad barrier is needed after the object put to ensure
+  // i) visibility to GC threads of the object put and ii) visibility
+  // to the mutator thread of any card clearing write by a GC
+  // thread. Clearly a normal store (str) will not guarantee this
+  // ordering but neither will a releasing store (stlr). The latter
+  // guarantees that the object put is visible but does not guarantee
+  // that writes by other threads have also been observed.
+  // 
+  // So, returning to the task of translating the object put and the
+  // leading/trailing membar nodes: what do the non-normal node graph
+  // look like for these 2 special cases? and how can we determine the
+  // status of a MemBarRelease, StoreX[mo_release] or MemBarVolatile
+  // in both normal and non-normal cases?
+  //
+  // A CMS GC post-barrier wraps its card write (StoreCM) inside an If
+  // which selects conditional execution based on the value loaded
+  // (LoadB) from the card. Ctl and Mem are fed to the If via an
+  // intervening StoreLoad barrier (MemBarVolatile).
+  //
+  // So, with CMS we may see a node graph which looks like this
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder_(leading)__________________
+  //     C |    M \       \\                   C \
+  //       |       \    StoreN/P[mo_release]  CastP2X
+  //       |    Bot \    /
+  //       |       MergeMem
+  //       |         /
+  //      MemBarVolatile (card mark)
+  //     C |  ||    M |
+  //       | LoadB    |
+  //       |   |      |
+  //       | Cmp      |\
+  //       | /        | \
+  //       If         |  \
+  //       | \        |   \
+  // IfFalse  IfTrue  |    \
+  //       \     / \  |     \
+  //        \   / StoreCM    |
+  //         \ /      |      |
+  //        Region   . . .   |
+  //          | \           /
+  //          |  . . .  \  / Bot
+  //          |       MergeMem
+  //          |          |
+  //        MemBarVolatile (trailing)
+  //
+  // The first MergeMem merges the AliasIdxBot Mem slice from the
+  // leading membar and the oopptr Mem slice from the Store into the
+  // card mark membar. The trailing MergeMem merges the AliasIdxBot
+  // Mem slice from the card mark membar and the AliasIdxRaw slice
+  // from the StoreCM into the trailing membar (n.b. the latter
+  // proceeds via a Phi associated with the If region).
+  //
+  // G1 is quite a lot more complicated. The nodes inserted on behalf
+  // of G1 may comprise: a pre-write graph which adds the old value to
+  // the SATB queue; the releasing store itself; and, finally, a
+  // post-write graph which performs a card mark.
+  //
+  // The pre-write graph may be omitted, but only when the put is
+  // writing to a newly allocated (young gen) object and then only if
+  // there is a direct memory chain to the Initialize node for the
+  // object allocation. This will not happen for a volatile put since
+  // any memory chain passes through the leading membar.
+  //
+  // The pre-write graph includes a series of 3 If tests. The outermost
+  // If tests whether SATB is enabled (no else case). The next If tests
+  // whether the old value is non-NULL (no else case). The third tests
+  // whether the SATB queue index is > 0, if so updating the queue. The
+  // else case for this third If calls out to the runtime to allocate a
+  // new queue buffer.
+  //
+  // So with G1 the pre-write and releasing store subgraph looks like
+  // this (the nested Ifs are omitted).
+  //
+  //  MemBarRelease (leading)____________
+  //     C |  ||  M \   M \    M \  M \ . . .
+  //       | LoadB   \  LoadL  LoadN   \
+  //       | /        \                 \
+  //       If         |\                 \
+  //       | \        | \                 \
+  //  IfFalse  IfTrue |  \                 \
+  //       |     |    |   \                 |
+  //       |     If   |   /\                |
+  //       |     |          \               |
+  //       |                 \              |
+  //       |    . . .         \             |
+  //       | /       | /       |            |
+  //      Region  Phi[M]       |            |
+  //       | \       |         |            |
+  //       |  \_____ | ___     |            |
+  //     C | C \     |   C \ M |            |
+  //       | CastP2X | StoreN/P[mo_release] |
+  //       |         |         |            |
+  //     C |       M |       M |          M |
+  //        \        |         |           /
+  //                  . . . 
+  //          (post write subtree elided)
+  //                    . . .
+  //             C \         M /
+  //         MemBarVolatile (trailing)
+  //
+  // n.b. the LoadB in this subgraph is not the card read -- it's a
+  // read of the SATB queue active flag.
+  //
+  // The G1 post-write subtree is also optional, this time when the
+  // new value being written is either null or can be identified as a
+  // newly allocated (young gen) object with no intervening control
+  // flow. The latter cannot happen but the former may, in which case
+  // the card mark membar is omitted and the memory feeds from the
+  // leading membar and the StoreN/P are merged direct into the
+  // trailing membar as per the normal subgraph. So, the only special
+  // case which arises is when the post-write subgraph is generated.
+  //
+  // The kernel of the post-write G1 subgraph is the card mark itself
+  // which includes a card mark memory barrier (MemBarVolatile), a
+  // card test (LoadB), and a conditional update (If feeding a
+  // StoreCM). These nodes are surrounded by a series of nested Ifs
+  // which try to avoid doing the card mark. The top level If skips if
+  // the object reference does not cross regions (i.e. it tests if
+  // (adr ^ val) >> log2(regsize) != 0) -- intra-region references
+  // need not be recorded. The next If, which skips on a NULL value,
+  // may be absent (it is not generated if the type of value is >=
+  // OopPtr::NotNull). The 3rd If skips writes to young regions (by
+  // checking if card_val != young).  n.b. although this test requires
+  // a pre-read of the card it can safely be done before the StoreLoad
+  // barrier. However that does not bypass the need to reread the card
+  // after the barrier.
+  //
+  //                (pre-write subtree elided)
+  //        . . .                  . . .    . . .  . . .
+  //        C |                    M |     M |    M |
+  //       Region                  Phi[M] StoreN    |
+  //          |                     / \      |      |
+  //         / \_______            /   \     |      |
+  //      C / C \      . . .            \    |      |
+  //       If   CastP2X . . .            |   |      |
+  //       / \                           |   |      |
+  //      /   \                          |   |      |
+  // IfFalse IfTrue                      |   |      |
+  //   |       |                         |   |     /|
+  //   |       If                        |   |    / |
+  //   |      / \                        |   |   /  |
+  //   |     /   \                        \  |  /   |
+  //   | IfFalse IfTrue                   MergeMem  |
+  //   |  . . .    / \                       /      |
+  //   |          /   \                     /       |
+  //   |     IfFalse IfTrue                /        |
+  //   |      . . .    |                  /         |
+  //   |               If                /          |
+  //   |               / \              /           |
+  //   |              /   \            /            |
+  //   |         IfFalse IfTrue       /             |
+  //   |           . . .   |         /              |
+  //   |                    \       /               |
+  //   |                     \     /                |
+  //   |             MemBarVolatile__(card mark)    |
+  //   |                ||   C |  M \  M \          |
+  //   |               LoadB   If    |    |         |
+  //   |                      / \    |    |         |
+  //   |                     . . .   |    |         |
+  //   |                          \  |    |        /
+  //   |                        StoreCM   |       /
+  //   |                          . . .   |      /
+  //   |                        _________/      /
+  //   |                       /  _____________/
+  //   |   . . .       . . .  |  /            /
+  //   |    |                 | /   _________/
+  //   |    |               Phi[M] /        /
+  //   |    |                 |   /        /
+  //   |    |                 |  /        /
+  //   |  Region  . . .     Phi[M]  _____/
+  //   |    /                 |    /
+  //   |                      |   /   
+  //   | . . .   . . .        |  /
+  //   | /                    | /
+  // Region           |  |  Phi[M]
+  //   |              |  |  / Bot
+  //    \            MergeMem 
+  //     \            /
+  //     MemBarVolatile
+  //
+  // As with CMS the initial MergeMem merges the AliasIdxBot Mem slice
+  // from the leading membar and the oopptr Mem slice from the Store
+  // into the card mark membar i.e. the memory flow to the card mark
+  // membar still looks like a normal graph.
+  //
+  // The trailing MergeMem merges an AliasIdxBot Mem slice with other
+  // Mem slices (from the StoreCM and other card mark queue stores).
+  // However in this case the AliasIdxBot Mem slice does not come
+  // direct from the card mark membar. It is merged through a series
+  // of Phi nodes. These are needed to merge the AliasIdxBot Mem flow
+  // from the leading membar with the Mem feed from the card mark
+  // membar. Each Phi corresponds to one of the Ifs which may skip
+  // around the card mark membar. So when the If implementing the NULL
+  // value check has been elided the total number of Phis is 2
+  // otherwise it is 3.
+  //
+  // So, the upshot is that in all cases the volatile put graph will
+  // include a *normal* memory subgraph between the leading membar and
+  // its child membar. When that child is not a card mark membar then
+  // it marks the end of a volatile put subgraph. If the child is a
+  // card mark membar then the normal subgraph will form part of a
+  // volatile put subgraph if and only if the child feeds an
+  // AliasIdxBot Mem feed to a trailing barrier via a MergeMem. That
+  // feed is either direct (for CMS) or via 2 or 3 Phi nodes merging
+  // the leading barrier memory flow (for G1).
+  // 
+  // The predicates controlling generation of instructions for store
+  // and barrier nodes employ a few simple helper functions (described
+  // below) which identify the presence or absence of these subgraph
+  // configurations and provide a means of traversing from one node in
+  // the subgraph to another.
+
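
As a concrete illustration of the source pattern these subgraphs come from, the sketch below shows a volatile Object field write. The comments restate the node shapes described above for this kind of put (leading MemBarRelease, releasing store, optional card mark subgraph under G1 or CMS with UseCondCardMark, trailing MemBarVolatile); the class itself is only an example input to the compiler, not part of this change:

    public class VolatilePublish {
        static final class Box { volatile Object ref; }

        // A volatile oop put. Per the discussion above C2 parses this as
        //   MemBarRelease / MemBarCPUOrder
        //   StoreN/P[mo_release]   (+ card mark subgraph with G1 or
        //                            CMS with UseCondCardMark)
        //   MemBarVolatile (trailing)
        // and, when the predicates below match, the store is assumed to be
        // emitted on AArch64 as an stlr with no separate dmb instructions.
        static void publish(Box b, Object o) {
            b.ref = o;
        }

        public static void main(String[] args) {
            Box b = new Box();
            publish(b, "payload");
            System.out.println(b.ref);
        }
    }
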
+  // leading_to_normal
+  //
+  // graph traversal helper which detects the normal case Mem feed
+  // from a release membar (or, optionally, its cpuorder child) to a
+  // dependent volatile membar i.e. it ensures that the following Mem
+  // flow subgraph is present.
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder
+  //          |  \      . . .
+  //          |  StoreN/P[mo_release]  . . .
+  //          |   /
+  //         MergeMem
+  //          |
+  //   MemBarVolatile
+  //
+  // if the correct configuration is present returns the volatile
+  // membar otherwise NULL.
+  //
+  // the input membar is expected to be either a cpuorder membar or a
+  // release membar. in the latter case it should not have a cpuorder
+  // membar child.
+  //
+  // the returned membar may be a card mark membar rather than a
+  // trailing membar.
+
+  MemBarNode *leading_to_normal(MemBarNode *leading)
+  {
+    assert((leading->Opcode() == Op_MemBarRelease ||
+	    leading->Opcode() == Op_MemBarCPUOrder),
+	   "expecting a release or cpuorder membar!");
+
+    // check the mem flow
+    ProjNode *mem = leading->proj_out(TypeFunc::Memory);
+
+    if (!mem)
+      return NULL;
+
+    Node *x = NULL;
+    StoreNode * st = NULL;
+    MergeMemNode *mm = NULL;
+
+    for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
+      x = mem->fast_out(i);
+      if (x->is_MergeMem()) {
+	if (mm != NULL)
+	  return NULL;
+	// two merge mems is one too many
+	mm = x->as_MergeMem();
+      } else if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) {
+	// two releasing stores is one too many
+	if (st != NULL)
+	  return NULL;
+	st = x->as_Store();
+      }
+    }
+
+    if (!mm || !st)
+      return NULL;
+
+    bool found = false;
+    // ensure the store feeds the merge
+    for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
+      if (st->fast_out(i) == mm) {
+	found = true;
+	break;
+      }
+    }
+
+    if (!found)
+      return NULL;
+
+    MemBarNode *mbvol = NULL;
+    // ensure the merge feeds a volatile membar
+    for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
+      x = mm->fast_out(i);
+      if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
+	mbvol = x->as_MemBar();
+	break;
+      }
+    }
+
+    return mbvol;
+  }
+
+  // normal_to_leading
+  //
+  // graph traversal helper which detects the normal case Mem feed
+  // from either a card mark or a trailing membar to a preceding
+  // release membar (optionally its cpuorder child) i.e. it ensures
+  // that the following Mem flow subgraph is present.
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder {leading}
+  //          |  \      . . .
+  //          |  StoreN/P[mo_release]  . . .
+  //          |   /
+  //         MergeMem
+  //          |
+  //   MemBarVolatile
+  //
+  // this predicate checks for the same flow as the previous predicate
+  // but starting from the bottom rather than the top.
+  //
+  // if the configuration is present returns the cpuorder member for
+  // preference or when absent the release membar otherwise NULL.
+  //
+  // n.b. the input membar is expected to be a MemBarVolatile but
+  // need not be a card mark membar.
+
+  MemBarNode *normal_to_leading(const MemBarNode *barrier)
+  {
+    // input must be a volatile membar
+    assert(barrier->Opcode() == Op_MemBarVolatile, "expecting a volatile membar");
+    Node *x;
+
+    // the Mem feed to the membar should be a merge
+    x = barrier->in(TypeFunc::Memory);
+    if (!x->is_MergeMem())
+      return NULL;
+
+    MergeMemNode *mm = x->as_MergeMem();
+
+    // the AliasIdxBot slice should be another MemBar projection
+    x = mm->in(Compile::AliasIdxBot);
+    // ensure this is a non control projection
+    if (!x->is_Proj() || x->is_CFG())
+      return NULL;
+    // if it is fed by a membar that's the one we want
+    x = x->in(0);
+
+    if (!x->is_MemBar())
+      return NULL;
+
+    MemBarNode *leading = x->as_MemBar();
+    // reject invalid candidates
+    if (!leading_membar(leading))
+      return NULL;
+
+    // ok, we have a leading ReleaseMembar, now for the sanity clauses
+
+    // the leading membar must feed Mem to a releasing store
+    ProjNode *mem = leading->proj_out(TypeFunc::Memory);
+    StoreNode *st = NULL;
+    for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
+      x = mem->fast_out(i);
+      if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) {
+	st = x->as_Store();
+	break;
+      }
+    }
+    if (st == NULL)
+      return NULL;
+
+    // the releasing store has to feed the same merge
+    for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
+      if (st->fast_out(i) == mm)
+	return leading;
+    }
+
+    return NULL;
+  }
+
+  // card_mark_to_trailing
+  //
+  // graph traversal helper which detects extra, non-normal Mem feed
+  // from a card mark volatile membar to a trailing membar i.e. it
+  // ensures that one of the following three GC post-write Mem flow
+  // subgraphs is present.
+  //
+  // 1)
+  //     . . .
+  //       |
+  //   MemBarVolatile (card mark)
+  //      |          |     
+  //      |        StoreCM
+  //      |          |
+  //      |        . . .
+  //  Bot |  / 
+  //   MergeMem 
+  //      |
+  //   MemBarVolatile (trailing)
+  //
+  //
+  // 2)
+  //   MemBarRelease/CPUOrder (leading)
+  //    |
+  //    | 
+  //    |\       . . .
+  //    | \        | 
+  //    |  \  MemBarVolatile (card mark) 
+  //    |   \   |     |
+  //     \   \  |   StoreCM    . . .
+  //      \   \ |
+  //       \  Phi
+  //        \ /
+  //        Phi  . . .
+  //     Bot |   /
+  //       MergeMem
+  //         |
+  //   MemBarVolatile (trailing)
+  //
+  // 3)
+  //   MemBarRelease/CPUOrder (leading)
+  //    |
+  //    |\
+  //    | \
+  //    |  \      . . .
+  //    |   \       |
+  //    |\   \  MemBarVolatile (card mark)
+  //    | \   \   |     |
+  //    |  \   \  |   StoreCM    . . .
+  //    |   \   \ |
+  //     \   \  Phi
+  //      \   \ /  
+  //       \  Phi
+  //        \ /
+  //        Phi  . . .
+  //     Bot |   /
+  //       MergeMem
+  //         |
+  //   MemBarVolatile (trailing)
+  //
+  // configuration 1 is only valid if UseConcMarkSweepGC &&
+  // UseCondCardMark
+  //
+  // configurations 2 and 3 are only valid if UseG1GC.
+  //
+  // if a valid configuration is present returns the trailing membar
+  // otherwise NULL.
+  //
+  // n.b. the supplied membar is expected to be a card mark
+  // MemBarVolatile i.e. the caller must ensure the input node has the
+  // correct operand and feeds Mem to a StoreCM node
+
+  MemBarNode *card_mark_to_trailing(const MemBarNode *barrier)
+  {
+    // input must be a card mark volatile membar
+    assert(is_card_mark_membar(barrier), "expecting a card mark membar");
+
+    Node *feed = barrier->proj_out(TypeFunc::Memory);
+    Node *x;
+    MergeMemNode *mm = NULL;
+
+    const int MAX_PHIS = 3;	// max phis we will search through
+    int phicount = 0; 		// current search count
+
+    bool retry_feed = true;
+    while (retry_feed) {
+      // see if we have a direct MergeMem feed
+      for (DUIterator_Fast imax, i = feed->fast_outs(imax); i < imax; i++) {
+	x = feed->fast_out(i);
+	// the correct Phi will be merging a Bot memory slice
+	if (x->is_MergeMem()) {
+	  mm = x->as_MergeMem();
+	  break;
+	}
+      }
+      if (mm) {
+	retry_feed = false;
+      } else if (UseG1GC && phicount++ < MAX_PHIS) {
+	// the barrier may feed indirectly via one or two Phi nodes
+	PhiNode *phi = NULL;
+	for (DUIterator_Fast imax, i = feed->fast_outs(imax); i < imax; i++) {
+	  x = feed->fast_out(i);
+	  // the correct Phi will be merging a Bot memory slice
+	  if (x->is_Phi() && x->adr_type() == TypePtr::BOTTOM) {
+	    phi = x->as_Phi();
+	    break;
+	  }
+	}
+	if (!phi)
+	  return NULL;
+	// look for another merge below this phi
+	feed = phi;
+      } else {
+	// couldn't find a merge
+	return NULL;
+      }
+    }
+
+    // sanity check this feed turns up as the expected slice
+    assert(mm->as_MergeMem()->in(Compile::AliasIdxBot) == feed, "expecting membar to feed AliasIdxBot slice to Merge");
+
+    MemBarNode *trailing = NULL;
+    // be sure we have a volatile membar below the merge
+    for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
+      x = mm->fast_out(i);
+      if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
+	trailing = x->as_MemBar();
+	break;
+      }
+    }
+
+    return trailing;
+  }
+
+  // trailing_to_card_mark
+  //
+  // graph traversal helper which detects extra, non-normal Mem feed
+  // from a trailing membar to a preceding card mark volatile membar
+  // i.e. it identifies whether one of the three possible extra GC
+  // post-write Mem flow subgraphs is present
+  //
+  // this predicate checks for the same flow as the previous predicate
+  // but starting from the bottom rather than the top.
+  //
+  // if the configuration is present returns the card mark membar
+  // otherwise NULL
+
+  MemBarNode *trailing_to_card_mark(const MemBarNode *trailing)
+  {
+    assert(!is_card_mark_membar(trailing), "not expecting a card mark membar");
+
+    Node *x = trailing->in(TypeFunc::Memory);
+    // the Mem feed to the membar should be a merge
+    if (!x->is_MergeMem())
+      return NULL;
+
+    MergeMemNode *mm = x->as_MergeMem();
+
+    x = mm->in(Compile::AliasIdxBot);
+    // with G1 we may possibly see a Phi or two before we see a Memory
+    // Proj from the card mark membar
+
+    const int MAX_PHIS = 3;	// max phis we will search through
+    int phicount = 0; 		// current search count
+
+    bool retry_feed = !x->is_Proj();
+
+    while (retry_feed) {
+      if (UseG1GC && x->is_Phi() && phicount++ < MAX_PHIS) {
+	PhiNode *phi = x->as_Phi();
+	ProjNode *proj = NULL;
+	PhiNode *nextphi = NULL;
+	bool found_leading = false;
+	for (uint i = 1; i < phi->req(); i++) {
+	  x = phi->in(i);
+	  if (x->is_Phi()) {
+	    nextphi = x->as_Phi();
+	  } else if (x->is_Proj()) {
+	    int opcode = x->in(0)->Opcode();
+	    if (opcode == Op_MemBarVolatile) {
+	      proj = x->as_Proj();
+	    } else if (opcode == Op_MemBarRelease ||
+		       opcode == Op_MemBarCPUOrder) {
+	      // probably a leading membar
+	      found_leading = true;
+	    }
+	  }
+	}
+	// if we found a correct looking proj then retry from there
+	// otherwise we must see a leading and a phi or this is the
+	// wrong config
+	if (proj != NULL) {
+	  x = proj;
+	  retry_feed = false;
+	} else if (found_leading && nextphi != NULL) {
+	  // retry from this phi to check phi2
+	  x = nextphi;
+	} else {
+	  // not what we were looking for
+	  return NULL;
+	}
+      } else {
+	return NULL;
+      }
+    }
+    // the proj has to come from the card mark membar
+    x = x->in(0);
+    if (!x->is_MemBar())
+      return NULL;
+
+    MemBarNode *card_mark_membar = x->as_MemBar();
+
+    if (!is_card_mark_membar(card_mark_membar))
+      return NULL;
+
+    return card_mark_membar;
+  }
+
+  // trailing_to_leading
+  //
+  // graph traversal helper which checks the Mem flow up the graph
+  // from a (non-card mark) volatile membar attempting to locate and
+  // return an associated leading membar. it first looks for a
+  // subgraph in the normal configuration (relying on helper
+  // normal_to_leading). failing that it then looks for one of the
+  // possible post-write card mark subgraphs linking the trailing node
+  // to the card mark membar (relying on helper
+  // trailing_to_card_mark), and then checks that the card mark membar
+  // is fed by a leading membar (once again relying on auxiliary
+  // predicate normal_to_leading).
+  //
+  // if the configuration is valid returns the cpuorder membar for
+  // preference or when absent the release membar otherwise NULL.
+  //
+  // n.b. the input membar is expected to be a volatile membar but
+  // must *not* be a card mark membar.
+
+  MemBarNode *trailing_to_leading(const MemBarNode *trailing)
+  {
+    assert(!is_card_mark_membar(trailing), "not expecting a card mark membar");
+
+    MemBarNode *leading = normal_to_leading(trailing);
+
+    if (leading)
+      return leading;
+
+    MemBarNode *card_mark_membar = trailing_to_card_mark(trailing);
+
+    if (!card_mark_membar)
+      return NULL;
+
+    return normal_to_leading(card_mark_membar);
+  }
+
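
To make the shape of these traversals concrete, here is a deliberately simplified toy model in Java of the normal-subgraph check performed by leading_to_normal. The Node class and the kind strings are invented for illustration only; they are not the HotSpot ideal-graph API, and the sketch ignores projections, Phis and the card mark cases handled above:

    import java.util.ArrayList;
    import java.util.List;

    public class NormalSubgraphSketch {
        static class Node {
            final String kind;                     // e.g. "MemBarRelease", "MergeMem"
            final List<Node> outs = new ArrayList<>();
            Node(String kind) { this.kind = kind; }
            void feed(Node out) { outs.add(out); }
        }

        // returns the trailing "MemBarVolatile" if the leading barrier feeds a
        // releasing store and a MergeMem, the store feeds the same MergeMem,
        // and the MergeMem feeds a MemBarVolatile -- otherwise null
        static Node leadingToNormal(Node leading) {
            Node store = null, merge = null;
            for (Node x : leading.outs) {
                if (x.kind.equals("MergeMem")) {
                    if (merge != null) return null;     // two merge mems is one too many
                    merge = x;
                } else if (x.kind.equals("StoreX[mo_release]")) {
                    if (store != null) return null;     // two releasing stores is one too many
                    store = x;
                }
            }
            if (store == null || merge == null || !store.outs.contains(merge)) return null;
            for (Node x : merge.outs) {
                if (x.kind.equals("MemBarVolatile")) return x;
            }
            return null;
        }

        public static void main(String[] args) {
            Node leading  = new Node("MemBarRelease");
            Node store    = new Node("StoreX[mo_release]");
            Node merge    = new Node("MergeMem");
            Node trailing = new Node("MemBarVolatile");
            leading.feed(store); leading.feed(merge);
            store.feed(merge);   merge.feed(trailing);
            System.out.println(leadingToNormal(leading) == trailing);  // prints true
        }
    }
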
   // predicates controlling emit of ldr<x>/ldar<x> and associated dmb
 
-bool unnecessary_acquire(const Node *barrier) {
+bool unnecessary_acquire(const Node *barrier)
+{
   // assert barrier->is_MemBar();
   if (UseBarriersForVolatile)
     // we need to plant a dmb
@@ -1323,13 +2146,11 @@
     return (x->is_Load() && x->as_Load()->is_acquire());
   }
   
-  // only continue if we want to try to match unsafe volatile gets
-  if (UseBarriersForUnsafeVolatileGet)
-    return false;
+  // now check for an unsafe volatile get
 
   // need to check for
   //
-  //     MemBarCPUOrder
+  //   MemBarCPUOrder
   //        ||       \\
   //   MemBarAcquire* LoadX[mo_acquire]
   //        ||
@@ -1341,9 +2162,13 @@
   // check for a parent MemBarCPUOrder
   ProjNode *ctl;
   ProjNode *mem;
-  MemBarNode *parent = has_parent_membar(barrier, ctl, mem);
+  MemBarNode *parent = parent_membar(barrier);
   if (!parent || parent->Opcode() != Op_MemBarCPUOrder)
     return false;
+  ctl = parent->proj_out(TypeFunc::Control);
+  mem = parent->proj_out(TypeFunc::Memory);
+  if (!ctl || !mem)
+    return false;
   // ensure the proj nodes both feed a LoadX[mo_acquire]
   LoadNode *ld = NULL;
   for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) {
@@ -1369,7 +2194,7 @@
   if (ld)
     return false;
   // check for a child cpuorder membar
-  MemBarNode *child  = has_child_membar(barrier->as_MemBar(), ctl, mem);
+  MemBarNode *child  = child_membar(barrier->as_MemBar());
   if (!child || child->Opcode() != Op_MemBarCPUOrder)
     return false;
 
@@ -1422,9 +2247,7 @@
     return true;
   }
 
-  // only continue if we want to try to match unsafe volatile gets
-  if (UseBarriersForUnsafeVolatileGet)
-    return false;
+  // now check for an unsafe volatile get
 
   // check if Ctl and Proj feed comes from a MemBarCPUOrder
   //
@@ -1435,22 +2258,20 @@
   //   MemBarCPUOrder
 
   MemBarNode *membar;
-  ProjNode *ctl;
-  ProjNode *mem;
-
-  membar = has_parent_membar(ld, ctl, mem);
+
+  membar = parent_membar(ld);
 
   if (!membar || !membar->Opcode() == Op_MemBarCPUOrder)
     return false;
 
   // ensure that there is a CPUOrder->Acquire->CPUOrder membar chain
 
-  membar = has_child_membar(membar, ctl, mem);
+  membar = child_membar(membar);
 
   if (!membar || !membar->Opcode() == Op_MemBarAcquire)
     return false;
 
-  membar = has_child_membar(membar, ctl, mem);
+  membar = child_membar(membar);
   
   if (!membar || !membar->Opcode() == Op_MemBarCPUOrder)
     return false;
@@ -1458,194 +2279,81 @@
   return true;
 }
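
The inlined unsafe volatile accesses that unnecessary_acquire and needs_acquiring_load check for come from code along the lines of the sketch below (plain sun.misc.Unsafe usage as it exists in this JDK generation; the Holder class is illustrative). It is shown only to indicate what gets inlined; the node sequences it produces are the MemBarCPUOrder/MemBarAcquire and MemBarRelease/MemBarVolatile bracketed shapes described in the comments above:

    import java.lang.reflect.Field;
    import sun.misc.Unsafe;

    public class UnsafeVolatileAccess {
        static class Holder { Object ref; }

        public static void main(String[] args) throws Exception {
            Field f = Unsafe.class.getDeclaredField("theUnsafe");
            f.setAccessible(true);
            Unsafe u = (Unsafe) f.get(null);

            long offset = u.objectFieldOffset(Holder.class.getDeclaredField("ref"));
            Holder h = new Holder();

            u.putObjectVolatile(h, offset, "hello");    // inlined unsafe volatile put
            Object v = u.getObjectVolatile(h, offset);  // inlined unsafe volatile get
            System.out.println(v);
        }
    }
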
 
-bool unnecessary_release(const Node *n) {
+bool unnecessary_release(const Node *n)
+{
+  assert((n->is_MemBar() &&
+	  n->Opcode() == Op_MemBarRelease),
+	 "expecting a release membar");
+
+  if (UseBarriersForVolatile)
+    // we need to plant a dmb
+    return false;
+
+  // if there is a dependent CPUOrder barrier then use that as the
+  // leading
+
+  MemBarNode *barrier = n->as_MemBar();
+  // check for an intervening cpuorder membar
+  MemBarNode *b = child_membar(barrier);
+  if (b && b->Opcode() == Op_MemBarCPUOrder) {
+    // ok, so start the check from the dependent cpuorder barrier
+    barrier = b;
+  }
+
+  // must start with a normal feed
+  MemBarNode *child_barrier = leading_to_normal(barrier);
+
+  if (!child_barrier)
+    return false;
+
+  if (!is_card_mark_membar(child_barrier))
+    // this is the trailing membar and we are done
+    return true;
+
+  // must be sure this card mark feeds a trailing membar
+  MemBarNode *trailing = card_mark_to_trailing(child_barrier);
+  return (trailing != NULL);
+}
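
For the non-oop case the same predicates apply to an ordinary volatile field. A minimal sketch follows; the class is illustrative, and the instruction choices noted in the comments are the ones described above, assuming UseBarriersForVolatile is false and the predicates match:

    public class VolatileCounter {
        volatile long value;

        // MemBarRelease; StoreL[mo_release]; MemBarVolatile -- assumed to be
        // emitted as a single stlr when unnecessary_release,
        // unnecessary_volatile and needs_releasing_store all match
        void set(long v) { value = v; }

        // LoadL[mo_acquire]; MemBarAcquire -- assumed to be emitted as ldar
        // when unnecessary_acquire and needs_acquiring_load match
        long get() { return value; }

        public static void main(String[] args) {
            VolatileCounter c = new VolatileCounter();
            c.set(42L);
            System.out.println(c.get());
        }
    }
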
+
+bool unnecessary_volatile(const Node *n)
+{
   // assert n->is_MemBar();
   if (UseBarriersForVolatile)
     // we need to plant a dmb
     return false;
 
-  // ok, so we can omit this release barrier if it has been inserted
-  // as part of a volatile store sequence
-  //
-  //   MemBarRelease
-  //  {      ||      }
-  //  {MemBarCPUOrder} -- optional
-  //         ||     \\
-  //         ||     StoreX[mo_release]
-  //         | \     /
-  //         | MergeMem
-  //         | /
-  //   MemBarVolatile
-  //
-  // where
-  //  || and \\ represent Ctl and Mem feeds via Proj nodes
-  //  | \ and / indicate further routing of the Ctl and Mem feeds
-  // 
-  // so we need to check that
-  //
-  // ia) the release membar (or its dependent cpuorder membar) feeds
-  // control to a store node (via a Control project node)
-  //
-  // ii) the store is ordered release
-  //
-  // iii) the release membar (or its dependent cpuorder membar) feeds
-  // control to a volatile membar (via the same Control project node)
-  //
-  // iv) the release membar feeds memory to a merge mem and to the
-  // same store (both via a single Memory proj node)
-  //
-  // v) the store outputs to the merge mem
-  //
-  // vi) the merge mem outputs to the same volatile membar
-  //
-  // n.b. if this is an inlined unsafe node then the release membar
-  // may feed its control and memory links via an intervening cpuorder
-  // membar. this case can be dealt with when we check the release
-  // membar projections. if they both feed a single cpuorder membar
-  // node continue to make the same checks as above but with the
-  // cpuorder membar substituted for the release membar. if they don't
-  // both feed a cpuorder membar then the check fails.
-  //
-  // n.b.b. for an inlined unsafe store of an object in the case where
-  // !TypePtr::NULL_PTR->higher_equal(type(heap_base_oop)) we may see
-  // an embedded if then else where we expect the store. this is
-  // needed to do the right type of store depending on whether
-  // heap_base is NULL. We could check for that but for now we can
-  // just take the hit of on inserting a redundant dmb for this
-  // redundant volatile membar
-
-  MemBarNode *barrier = n->as_MemBar();
-  ProjNode *ctl;
-  ProjNode *mem;
-  // check for an intervening cpuorder membar
-  MemBarNode *b = has_child_membar(barrier, ctl, mem);
-  if (b && b->Opcode() == Op_MemBarCPUOrder) {
-    // ok, so start form the dependent cpuorder barrier
-    barrier = b;
-  }
-  // check the ctl and mem flow
-  ctl = barrier->proj_out(TypeFunc::Control);
-  mem = barrier->proj_out(TypeFunc::Memory);
-
-  // the barrier needs to have both a Ctl and Mem projection
-  if (! ctl || ! mem)
+  MemBarNode *mbvol = n->as_MemBar();
+
+  // first we check if this is part of a card mark. if so then we have
+  // to generate a StoreLoad barrier
+  
+  if (is_card_mark_membar(mbvol))
+    return false;
+
+  // ok, if it's not a card mark then we still need to check if it is
+  // a trailing membar of a volatile put graph.
+
+  return (trailing_to_leading(mbvol) != NULL);
+}
+
+// predicates controlling emit of str<x>/stlr<x> and associated dmbs
+
+bool needs_releasing_store(const Node *n)
+{
+  // assert n->is_Store();
+  if (UseBarriersForVolatile)
+    // we use a normal store and dmb combination
     return false;
 
-  Node *x = NULL;
-  Node *mbvol = NULL;
-  StoreNode * st = NULL;
-
-  // For a normal volatile write the Ctl ProjNode should have output
-  // to a MemBarVolatile and a Store marked as releasing
-  //
-  // n.b. for an inlined unsafe store of an object in the case where
-  // !TypePtr::NULL_PTR->higher_equal(type(heap_base_oop)) we may see
-  // an embedded if then else where we expect the store. this is
-  // needed to do the right type of store depending on whether
-  // heap_base is NULL. We could check for that case too but for now
-  // we can just take the hit of inserting a dmb and a non-volatile
-  // store to implement the volatile store
-
-  for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) {
-    x = ctl->fast_out(i);
-    if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
-      if (mbvol) {
-	return false;
-      }
-      mbvol = x;
-    } else if (x->is_Store()) {
-      st = x->as_Store();
-      if (! st->is_release()) {
-	return false;
-      }
-    } else if (!x->is_Mach()) {
-      // we may see mach nodes added during matching but nothing else
-      return false;
-    }
-  }
-
-  if (!mbvol || !st)
-    return false;
-
-  // the Mem ProjNode should output to a MergeMem and the same Store
-  Node *mm = NULL;
-  for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
-    x = mem->fast_out(i);
-    if (!mm && x->is_MergeMem()) {
-      mm = x;
-    } else if (x != st && !x->is_Mach()) {
-      // we may see mach nodes added during matching but nothing else
-      return false;
-    }
-  }
-
-  if (!mm)
+  StoreNode *st = n->as_Store();
+
+  // the store must be marked as releasing
+  if (!st->is_release())
     return false;
 
-  // the MergeMem should output to the MemBarVolatile
-  for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
-    x = mm->fast_out(i);
-    if (x != mbvol && !x->is_Mach()) {
-      // we may see mach nodes added during matching but nothing else
-      return false;
-    }
-  }
-
-  return true;
-}
-
-bool unnecessary_volatile(const Node *n) {
-  // assert n->is_MemBar();
-  if (UseBarriersForVolatile)
-    // we need to plant a dmb
-    return false;
-
-  // ok, so we can omit this volatile barrier if it has been inserted
-  // as part of a volatile store sequence
-  //
-  //   MemBarRelease
-  //  {      ||      }
-  //  {MemBarCPUOrder} -- optional
-  //         ||     \\
-  //         ||     StoreX[mo_release]
-  //         | \     /
-  //         | MergeMem
-  //         | /
-  //   MemBarVolatile
-  //
-  // where
-  //  || and \\ represent Ctl and Mem feeds via Proj nodes
-  //  | \ and / indicate further routing of the Ctl and Mem feeds
-  // 
-  // we need to check that
-  //
-  // i) the volatile membar gets its control feed from a release
-  // membar (or its dependent cpuorder membar) via a Control project
-  // node
-  //
-  // ii) the release membar (or its dependent cpuorder membar) also
-  // feeds control to a store node via the same proj node
-  //
-  // iii) the store is ordered release
-  //
-  // iv) the release membar (or its dependent cpuorder membar) feeds
-  // memory to a merge mem and to the same store (both via a single
-  // Memory proj node)
-  //
-  // v) the store outputs to the merge mem
-  //
-  // vi) the merge mem outputs to the volatile membar
-  //
-  // n.b. for an inlined unsafe store of an object in the case where
-  // !TypePtr::NULL_PTR->higher_equal(type(heap_base_oop)) we may see
-  // an embedded if then else where we expect the store. this is
-  // needed to do the right type of store depending on whether
-  // heap_base is NULL. We could check for that but for now we can
-  // just take the hit of on inserting a redundant dmb for this
-  // redundant volatile membar
-
-  MemBarNode *mbvol = n->as_MemBar();
-  Node *x = n->lookup(TypeFunc::Control);
+  // the store must be fed by a membar
+
+  Node *x = st->lookup(StoreNode::Memory);
 
   if (! x || !x->is_Proj())
     return false;
@@ -1659,200 +2367,78 @@
 
   MemBarNode *barrier = x->as_MemBar();
 
-  // if the barrier is a release membar we have what we want. if it is
-  // a cpuorder membar then we need to ensure that it is fed by a
-  // release membar in which case we proceed to check the graph below
-  // this cpuorder membar as the feed
-
-  if (x->Opcode() != Op_MemBarRelease) {
-    if (x->Opcode() != Op_MemBarCPUOrder)
-      return false;
-    ProjNode *ctl;
-    ProjNode *mem;
-    MemBarNode *b = has_parent_membar(x, ctl, mem);
-    if (!b || !b->Opcode() == Op_MemBarRelease)
-      return false;
-  }
-
-  ProjNode *ctl = barrier->proj_out(TypeFunc::Control);
-  ProjNode *mem = barrier->proj_out(TypeFunc::Memory);
-
-  // barrier needs to have both a Ctl and Mem projection
-  // and we need to have reached it via the Ctl projection
-  if (! ctl || ! mem || ctl != proj)
-    return false;
-
-  StoreNode * st = NULL;
-
-  // The Ctl ProjNode should have output to a MemBarVolatile and
-  // a Store marked as releasing
-  for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) {
-    x = ctl->fast_out(i);
-    if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
-      if (x != mbvol) {
-	return false;
-      }
-    } else if (x->is_Store()) {
-      st = x->as_Store();
-      if (! st->is_release()) {
-	return false;
-      }
-    } else if (!x->is_Mach()){
-      // we may see mach nodes added during matching but nothing else
-      return false;
-    }
-  }
-
-  if (!st)
-    return false;
-
-  // the Mem ProjNode should output to a MergeMem and the same Store
-  Node *mm = NULL;
-  for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
-    x = mem->fast_out(i);
-    if (!mm && x->is_MergeMem()) {
-      mm = x;
-    } else if (x != st && !x->is_Mach()) {
-      // we may see mach nodes added during matching but nothing else
-      return false;
-    }
-  }
-
-  if (!mm)
-    return false;
-
-  // the MergeMem should output to the MemBarVolatile
-  for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
-    x = mm->fast_out(i);
-    if (x != mbvol && !x->is_Mach()) {
-      // we may see mach nodes added during matching but nothing else
-      return false;
-    }
-  }
-
-  return true;
-}
-
-
-
-bool needs_releasing_store(const Node *n)
-{
-  // assert n->is_Store();
-  if (UseBarriersForVolatile)
-    // we use a normal store and dmb combination
+  // if the barrier is a release membar or a cpuorder membar fed by a
+  // release membar then we need to check whether that forms part of a
+  // volatile put graph.
+
+  // reject invalid candidates
+  if (!leading_membar(barrier))
     return false;
 
-  StoreNode *st = n->as_Store();
-
-  if (!st->is_release())
-    return false;
-
-  // check if this store is bracketed by a release (or its dependent
-  // cpuorder membar) and a volatile membar
-  //
-  //   MemBarRelease
-  //  {      ||      }
-  //  {MemBarCPUOrder} -- optional
-  //         ||     \\
-  //         ||     StoreX[mo_release]
-  //         | \     /
-  //         | MergeMem
-  //         | /
-  //   MemBarVolatile
-  //
-  // where
-  //  || and \\ represent Ctl and Mem feeds via Proj nodes
-  //  | \ and / indicate further routing of the Ctl and Mem feeds
-  // 
-
-
-  Node *x = st->lookup(TypeFunc::Control);
-
-  if (! x || !x->is_Proj())
-    return false;
-
-  ProjNode *proj = x->as_Proj();
-
-  x = proj->lookup(0);
-
-  if (!x || !x->is_MemBar())
-    return false;
-
-  MemBarNode *barrier = x->as_MemBar();
-
-  // if the barrier is a release membar we have what we want. if it is
-  // a cpuorder membar then we need to ensure that it is fed by a
-  // release membar in which case we proceed to check the graph below
-  // this cpuorder membar as the feed
-
-  if (x->Opcode() != Op_MemBarRelease) {
-    if (x->Opcode() != Op_MemBarCPUOrder)
-      return false;
-    Node *ctl = x->lookup(TypeFunc::Control);
-    Node *mem = x->lookup(TypeFunc::Memory);
-    if (!ctl || !ctl->is_Proj() || !mem || !mem->is_Proj())
-      return false;
-    x = ctl->lookup(0);
-    if (!x || !x->is_MemBar() || !x->Opcode() == Op_MemBarRelease)
-      return false;
-    Node *y = mem->lookup(0);
-    if (!y || y != x)
-      return false;
-  }
-
-  ProjNode *ctl = barrier->proj_out(TypeFunc::Control);
-  ProjNode *mem = barrier->proj_out(TypeFunc::Memory);
-
-  // MemBarRelease needs to have both a Ctl and Mem projection
-  // and we need to have reached it via the Ctl projection
-  if (! ctl || ! mem || ctl != proj)
-    return false;
-
-  MemBarNode *mbvol = NULL;
-
-  // The Ctl ProjNode should have output to a MemBarVolatile and
-  // a Store marked as releasing
-  for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) {
-    x = ctl->fast_out(i);
-    if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
-      mbvol = x->as_MemBar();
-    } else if (x->is_Store()) {
-      if (x != st) {
-	return false;
-      }
-    } else if (!x->is_Mach()){
-      return false;
-    }
-  }
+  // does this lead a normal subgraph?
+  MemBarNode *mbvol = leading_to_normal(barrier);
 
   if (!mbvol)
     return false;
 
-  // the Mem ProjNode should output to a MergeMem and the same Store
-  Node *mm = NULL;
-  for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
-    x = mem->fast_out(i);
-    if (!mm && x->is_MergeMem()) {
-      mm = x;
-    } else if (x != st && !x->is_Mach()) {
-      return false;
-    }
-  }
-
-  if (!mm)
+  // all done unless this is a card mark
+  if (!is_card_mark_membar(mbvol))
+    return true;
+  
+  // we found a card mark -- just make sure we have a trailing barrier
+
+  return (card_mark_to_trailing(mbvol) != NULL);
+}
+
+// predicate controlling translation of StoreCM
+//
+// returns true if the StoreStore barrier before the card write can be
+// omitted, otherwise false
+
+bool unnecessary_storestore(const Node *storecm)
+{
+  assert(storecm->Opcode()  == Op_StoreCM, "expecting a StoreCM");
+
+  // we only ever need to generate a dmb ishst between an object put
+  // and the associated card mark when we are using CMS without
+  // conditional card marking
+
+  if (!UseConcMarkSweepGC || UseCondCardMark)
+    return true;
+
+  // if we are implementing volatile puts using barriers then the
+  // object put will be translated as an str, so we must insert the dmb ishst
+
+  if (UseBarriersForVolatile)
     return false;
 
-  // the MergeMem should output to the MemBarVolatile
-  for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
-    x = mm->fast_out(i);
-    if (x != mbvol && !x->is_Mach()) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
+  // we can omit the dmb ishst if this StoreCM is part of a volatile
+  // put because in that case the put will be implemented by stlr
+  //
+  // we need to check for a normal subgraph feeding this StoreCM.
+  // that means the StoreCM must be fed Memory from a leading membar,
+  // either a MemBarRelease or its dependent MemBarCPUOrder, and the
+  // leading membar must be part of a normal subgraph
+
+  Node *x = storecm->in(StoreNode::Memory);
+
+  if (!x->is_Proj())
+    return false;
+
+  x = x->in(0);
+
+  if (!x->is_MemBar())
+    return false;
+
+  MemBarNode *leading = x->as_MemBar();
+
+  // reject invalid candidates
+  if (!leading_membar(leading))
+    return false;
+
+  // we can omit the StoreStore if it is the head of a normal subgraph
+  return (leading_to_normal(leading) != NULL);
+}
 
 
 #define __ _masm.
@@ -2944,6 +3530,13 @@
                as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
   %}
 
+  enc_class aarch64_enc_strb0_ordered(memory mem) %{
+    MacroAssembler _masm(&cbuf);
+    __ membar(Assembler::StoreStore);
+    loadStore(_masm, &MacroAssembler::strb, zr, $mem->opcode(),
+               as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+  %}
+
   enc_class aarch64_enc_strh(iRegI src, memory mem) %{
     Register src_reg = as_Register($src$$reg);
     loadStore(MacroAssembler(&cbuf), &MacroAssembler::strh, src_reg, $mem->opcode(),
@@ -6613,6 +7206,7 @@
 instruct storeimmCM0(immI0 zero, memory mem)
 %{
   match(Set mem (StoreCM mem zero));
+  predicate(unnecessary_storestore(n));
 
   ins_cost(INSN_COST);
   format %{ "strb zr, $mem\t# byte" %}
@@ -6622,6 +7216,21 @@
   ins_pipe(istore_mem);
 %}
 
+// Store CMS card-mark Immediate with intervening StoreStore
+// needed when using CMS with no conditional card marking
+instruct storeimmCM0_ordered(immI0 zero, memory mem)
+%{
+  match(Set mem (StoreCM mem zero));
+
+  ins_cost(INSN_COST * 2);
+  format %{ "dmb ishst"
+      "\n\tstrb zr, $mem\t# byte" %}
+
+  ins_encode(aarch64_enc_strb0_ordered(mem));
+
+  ins_pipe(istore_mem);
+%}
+
 // Store Byte
 instruct storeB(iRegIorL2I src, memory mem)
 %{
@@ -6643,7 +7252,7 @@
   predicate(!needs_releasing_store(n));
 
   ins_cost(INSN_COST);
-  format %{ "strb zr, $mem\t# byte" %}
+  format %{ "strb rscractch2, $mem\t# byte" %}
 
   ins_encode(aarch64_enc_strb0(mem));
 
@@ -7396,6 +8005,7 @@
   format %{ "membar_acquire" %}
 
   ins_encode %{
+    __ block_comment("membar_acquire");
     __ membar(Assembler::LoadLoad|Assembler::LoadStore);
   %}
 
@@ -7448,6 +8058,7 @@
   format %{ "membar_release" %}
 
   ins_encode %{
+    __ block_comment("membar_release");
     __ membar(Assembler::LoadStore|Assembler::StoreStore);
   %}
   ins_pipe(pipe_serial);
@@ -7499,6 +8110,7 @@
   format %{ "membar_volatile" %}
 
   ins_encode %{
+    __ block_comment("membar_volatile");
     __ membar(Assembler::StoreLoad);
   %}
 
@@ -9429,7 +10041,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9465,7 +10077,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::ASR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9501,7 +10113,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSL,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9537,7 +10149,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9573,7 +10185,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::ASR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9609,7 +10221,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSL,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9645,7 +10257,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9681,7 +10293,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::ASR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9717,7 +10329,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSL,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9754,7 +10366,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9792,7 +10404,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::ASR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9830,7 +10442,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSL,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9868,7 +10480,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9906,7 +10518,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::ASR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9944,7 +10556,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSL,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -9982,7 +10594,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -10020,7 +10632,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::ASR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -10058,7 +10670,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSL,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -10096,7 +10708,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -10134,7 +10746,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::ASR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -10172,7 +10784,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSL,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -10210,7 +10822,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -10248,7 +10860,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::ASR,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -10286,7 +10898,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSL,
-              $src3$$constant & 0x3f);
+              $src3$$constant & 0x1f);
   %}
 
   ins_pipe(ialu_reg_reg_shift);
--- a/src/cpu/aarch64/vm/aarch64_ad.m4	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/cpu/aarch64/vm/aarch64_ad.m4	Thu Aug 27 14:40:19 2015 -0700
@@ -42,7 +42,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::$5,
-              $src3$$constant & 0x3f);
+              $src3$$constant & ifelse($1,I,0x1f,0x3f));
   %}
 
   ins_pipe(ialu_reg_reg_shift);
@@ -87,7 +87,7 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::$5,
-              $src3$$constant & 0x3f);
+              $src3$$constant & ifelse($1,I,0x1f,0x3f));
   %}
 
   ins_pipe(ialu_reg_reg_shift);
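A quick note on the & 0x3f -> & 0x1f changes above and the ifelse($1,I,0x1f,0x3f) in the m4 source that generates them: Java masks int shift counts to their low 5 bits and long shift counts to their low 6 bits (JLS 15.19), so the 32-bit (I) forms of these shifted-register rules must mask the constant with 0x1f. A standalone C++ sketch of the semantics involved (illustrative only, not HotSpot code):

#include <cstdint>

// Java semantics: ishl uses only the low 5 bits of the count, lshl the low 6.
static int32_t java_ishl(int32_t x, int32_t count) {
  return (int32_t)((uint32_t)x << (count & 0x1f));
}

static int64_t java_lshl(int64_t x, int64_t count) {
  return (int64_t)((uint64_t)x << (count & 0x3f));
}

// java_ishl(1, 33) == 2 because 33 & 0x1f == 1; a rule that masked the shift
// constant with 0x3f would instead try to encode a shift of 33 on a 32-bit
// operand.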
--- a/src/cpu/aarch64/vm/assembler_aarch64.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/cpu/aarch64/vm/assembler_aarch64.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -268,7 +268,7 @@
     __ ldar(r21, r28);                                 //       ldar    x21, [x28]
 
 // LoadStoreExclusiveOp
-    __ stxrw(r24, r24, r7);                            //       stxr    w24, w24, [x7]
+    __ stxrw(r21, r24, r7);                            //       stxr    w21, w24, [x7]
     __ stlxrw(r21, r26, r28);                          //       stlxr   w21, w26, [x28]
     __ ldxrw(r21, r6);                                 //       ldxr    w21, [x6]
     __ ldaxrw(r15, r30);                               //       ldaxr   w15, [x30]
@@ -299,7 +299,7 @@
 
 // LoadStoreExclusiveOp
     __ ldxpw(r25, r4, r22);                            //       ldxp    w25, w4, [x22]
-    __ ldaxpw(r14, r14, r15);                          //       ldaxp   w14, w14, [x15]
+    __ ldaxpw(r13, r14, r15);                          //       ldaxp   w13, w14, [x15]
     __ stxpw(r20, r26, r8, r10);                       //       stxp    w20, w26, w8, [x10]
     __ stlxpw(r23, r18, r18, r18);                     //       stlxp   w23, w18, w18, [x18]
 
@@ -773,7 +773,7 @@
  260:   c85fffbb        ldaxr   x27, [x29]
  264:   c89fffa0        stlr    x0, [x29]
  268:   c8dfff95        ldar    x21, [x28]
- 26c:   88187cf8        stxr    w24, w24, [x7]
+ 26c:   88157cf8        stxr    w21, w24, [x7]
  270:   8815ff9a        stlxr   w21, w26, [x28]
  274:   885f7cd5        ldxr    w21, [x6]
  278:   885fffcf        ldaxr   w15, [x30]
@@ -796,7 +796,7 @@
  2bc:   c82870bb        stxp    w8, x27, x28, [x5]
  2c0:   c825b8c8        stlxp   w5, x8, x14, [x6]
  2c4:   887f12d9        ldxp    w25, w4, [x22]
- 2c8:   887fb9ee        ldaxp   w14, w14, [x15]
+ 2c8:   887fb9ed        ldaxp   w13, w14, [x15]
  2cc:   8834215a        stxp    w20, w26, w8, [x10]
  2d0:   8837ca52        stlxp   w23, w18, w18, [x18]
  2d4:   f806317e        str     x30, [x11,#99]
@@ -1085,13 +1085,13 @@
     0xd444c320,     0xd503201f,     0xd69f03e0,     0xd6bf03e0,
     0xd5033fdf,     0xd5033f9f,     0xd5033abf,     0xd61f0040,
     0xd63f00a0,     0xc8147c55,     0xc805fcfd,     0xc85f7e05,
-    0xc85fffbb,     0xc89fffa0,     0xc8dfff95,     0x88187cf8,
+    0xc85fffbb,     0xc89fffa0,     0xc8dfff95,     0x88157cf8,
     0x8815ff9a,     0x885f7cd5,     0x885fffcf,     0x889ffc73,
     0x88dffc56,     0x48127c0f,     0x480bff85,     0x485f7cdd,
     0x485ffcf2,     0x489fff99,     0x48dffe62,     0x080a7c3e,
     0x0814fed5,     0x085f7c59,     0x085ffcb8,     0x089ffc70,
     0x08dfffb6,     0xc87f0a68,     0xc87fcdc7,     0xc82870bb,
-    0xc825b8c8,     0x887f12d9,     0x887fb9ee,     0x8834215a,
+    0xc825b8c8,     0x887f12d9,     0x887fb9ed,     0x8834215a,
     0x8837ca52,     0xf806317e,     0xb81b3337,     0x39000dc2,
     0x78005149,     0xf84391f4,     0xb85b220c,     0x385fd356,
     0x785d127e,     0x389f4149,     0x79801e3c,     0x79c014a3,
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -1106,13 +1106,13 @@
 
 #define INSN4(NAME, sz, op, o0) /* Four registers */                    \
   void NAME(Register Rs, Register Rt1, Register Rt2, Register Rn) {     \
-    assert(Rs != Rn, "unpredictable instruction");                  \
+    guarantee(Rs != Rn && Rs != Rt1 && Rs != Rt2, "unpredictable instruction"); \
     load_store_exclusive(Rs, Rt1, Rt2, Rn, sz, op, o0);                 \
   }
 
 #define INSN3(NAME, sz, op, o0) /* Three registers */                   \
   void NAME(Register Rs, Register Rt, Register Rn) {                    \
-    assert(Rs != Rn, "unpredictable instruction");                  \
+    guarantee(Rs != Rn && Rs != Rt, "unpredictable instruction");       \
     load_store_exclusive(Rs, Rt, (Register)0b11111, Rn, sz, op, o0);    \
   }
 
@@ -1124,6 +1124,7 @@
 
 #define INSN_FOO(NAME, sz, op, o0) /* Three registers, encoded differently */ \
   void NAME(Register Rt1, Register Rt2, Register Rn) {                  \
+    guarantee(Rt1 != Rt2, "unpredictable instruction");                 \
     load_store_exclusive((Register)0b11111, Rt1, Rt2, Rn, sz, op, o0);  \
   }
 
--- a/src/cpu/aarch64/vm/interp_masm_aarch64.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/cpu/aarch64/vm/interp_masm_aarch64.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -611,6 +611,7 @@
     Label done;
 
     const Register swap_reg = r0;
+    const Register tmp = c_rarg2;
     const Register obj_reg = c_rarg3; // Will contain the oop
 
     const int obj_offset = BasicObjectLock::obj_offset_in_bytes();
@@ -624,7 +625,7 @@
     ldr(obj_reg, Address(lock_reg, obj_offset));
 
     if (UseBiasedLocking) {
-      biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch2, false, done, &slow_case);
+      biased_locking_enter(lock_reg, obj_reg, swap_reg, tmp, false, done, &slow_case);
     }
 
     // Load (object->mark() | 1) into swap_reg
@@ -643,7 +644,7 @@
       cmpxchgptr(swap_reg, lock_reg, obj_reg, rscratch1, fast, &fail);
       bind(fast);
       atomic_incw(Address((address)BiasedLocking::fast_path_entry_count_addr()),
-                  rscratch2, rscratch1);
+                  rscratch2, rscratch1, tmp);
       b(done);
       bind(fail);
     } else {
@@ -671,7 +672,7 @@
     if (PrintBiasedLockingStatistics) {
       br(Assembler::NE, slow_case);
       atomic_incw(Address((address)BiasedLocking::fast_path_entry_count_addr()),
-                  rscratch2, rscratch1);
+                  rscratch2, rscratch1, tmp);
     }
     br(Assembler::EQ, done);
 
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -34,6 +34,7 @@
 #include "memory/resourceArea.hpp"
 #include "nativeInst_aarch64.hpp"
 #include "oops/klass.inline.hpp"
+#include "oops/oop.inline.hpp"
 #include "opto/compile.hpp"
 #include "opto/node.hpp"
 #include "runtime/biasedLocking.hpp"
@@ -398,11 +399,7 @@
   if (PrintBiasedLockingStatistics && counters == NULL)
     counters = BiasedLocking::counters();
 
-  bool need_tmp_reg = false;
-  if (tmp_reg == noreg) {
-    tmp_reg = rscratch2;
-  }
-  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1);
+  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
   Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
@@ -432,7 +429,7 @@
   if (counters != NULL) {
     Label around;
     cbnz(tmp_reg, around);
-    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1);
+    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
     b(done);
     bind(around);
   } else {
@@ -485,7 +482,7 @@
     bind(here);
     if (counters != NULL) {
       atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
-                  tmp_reg, rscratch1);
+                  tmp_reg, rscratch1, rscratch2);
     }
   }
   b(done);
@@ -511,7 +508,7 @@
     bind(here);
     if (counters != NULL) {
       atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
-                  tmp_reg, rscratch1);
+                  tmp_reg, rscratch1, rscratch2);
     }
   }
   b(done);
@@ -539,7 +536,7 @@
     // removing the bias bit from the object's header.
     if (counters != NULL) {
       atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
-                  rscratch1);
+                  rscratch1, rscratch2);
     }
     bind(nope);
   }
@@ -1640,15 +1637,15 @@
   return Address(Rd);
 }
 
-void MacroAssembler::atomic_incw(Register counter_addr, Register tmp) {
+void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
   Label retry_load;
   bind(retry_load);
   // flush and load exclusive from the memory location
   ldxrw(tmp, counter_addr);
   addw(tmp, tmp, 1);
   // if we store+flush with no intervening write tmp wil be zero
-  stxrw(tmp, tmp, counter_addr);
-  cbnzw(tmp, retry_load);
+  stxrw(tmp2, tmp, counter_addr);
+  cbnzw(tmp2, retry_load);
 }
 
 
@@ -2021,6 +2018,14 @@
   }
 }
 
+void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
+  if (decrement.is_register()) {
+    subw(Rd, Rn, decrement.as_register());
+  } else {
+    subw(Rd, Rn, decrement.as_constant());
+  }
+}
+
 void MacroAssembler::reinit_heapbase()
 {
   if (UseCompressedOops) {
@@ -2110,7 +2115,7 @@
     return a != b.as_register() && a != c && b.as_register() != c;
 }
 
-#define ATOMIC_OP(LDXR, OP, STXR)                                       \
+#define ATOMIC_OP(LDXR, OP, IOP, STXR)                                       \
 void MacroAssembler::atomic_##OP(Register prev, RegisterOrConstant incr, Register addr) { \
   Register result = rscratch2;                                          \
   if (prev->is_valid())                                                 \
@@ -2120,14 +2125,15 @@
   bind(retry_load);                                                     \
   LDXR(result, addr);                                                   \
   OP(rscratch1, result, incr);                                          \
-  STXR(rscratch1, rscratch1, addr);                                     \
-  cbnzw(rscratch1, retry_load);                                         \
-  if (prev->is_valid() && prev != result)                               \
-    mov(prev, result);                                                  \
+  STXR(rscratch2, rscratch1, addr);                                     \
+  cbnzw(rscratch2, retry_load);                                         \
+  if (prev->is_valid() && prev != result) {                             \
+    IOP(prev, rscratch1, incr);                                         \
+  }                                                                     \
 }
 
-ATOMIC_OP(ldxr, add, stxr)
-ATOMIC_OP(ldxrw, addw, stxrw)
+ATOMIC_OP(ldxr, add, sub, stxr)
+ATOMIC_OP(ldxrw, addw, subw, stxrw)
 
 #undef ATOMIC_OP
 
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -107,9 +107,7 @@
   // Biased locking support
   // lock_reg and obj_reg must be loaded up with the appropriate values.
   // swap_reg is killed.
-  // tmp_reg is optional. If it is supplied (i.e., != noreg) it will
-  // be killed; if not supplied, push/pop will be used internally to
-  // allocate a temporary (inefficient, avoid if possible).
+  // tmp_reg must be supplied and must not be rscratch1 or rscratch2
   // Optional slow case is for implementations (interpreter and C1) which branch to
   // slow case directly. Leaves condition codes set for C2's Fast_Lock node.
   // Returns offset of first potentially-faulting instruction for null
@@ -126,10 +124,10 @@
 
   // Helper functions for statistics gathering.
   // Unconditional atomic increment.
-  void atomic_incw(Register counter_addr, Register tmp);
-  void atomic_incw(Address counter_addr, Register tmp1, Register tmp2) {
+  void atomic_incw(Register counter_addr, Register tmp, Register tmp2);
+  void atomic_incw(Address counter_addr, Register tmp1, Register tmp2, Register tmp3) {
     lea(tmp1, counter_addr);
-    atomic_incw(tmp1, tmp2);
+    atomic_incw(tmp1, tmp2, tmp3);
   }
   // Load Effective Address
   void lea(Register r, const Address &a) {
@@ -1057,6 +1055,7 @@
   void add(Register Rd, Register Rn, RegisterOrConstant increment);
   void addw(Register Rd, Register Rn, RegisterOrConstant increment);
   void sub(Register Rd, Register Rn, RegisterOrConstant decrement);
+  void subw(Register Rd, Register Rn, RegisterOrConstant decrement);
 
   void adrp(Register reg1, const Address &dest, unsigned long &byte_offset);
 
--- a/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -1774,6 +1774,7 @@
   const Register obj_reg  = r19;  // Will contain the oop
   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
   const Register old_hdr  = r13;  // value of old header at unlock time
+  const Register tmp = c_rarg3;
 
   Label slow_path_lock;
   Label lock_done;
@@ -1795,7 +1796,7 @@
     __ ldr(obj_reg, Address(oop_handle_reg, 0));
 
     if (UseBiasedLocking) {
-      __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch2, false, lock_done, &slow_path_lock);
+      __ biased_locking_enter(lock_reg, obj_reg, swap_reg, tmp, false, lock_done, &slow_path_lock);
     }
 
     // Load (object->mark() | 1) into swap_reg %r0
--- a/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -1913,15 +1913,18 @@
 }
 
 void TemplateInterpreterGenerator::count_bytecode() {
+  Register rscratch3 = r0;
   __ push(rscratch1);
   __ push(rscratch2);
+  __ push(rscratch3);
   Label L;
   __ mov(rscratch2, (address) &BytecodeCounter::_counter_value);
   __ bind(L);
   __ ldxr(rscratch1, rscratch2);
   __ add(rscratch1, rscratch1, 1);
-  __ stxr(rscratch1, rscratch1, rscratch2);
-  __ cbnzw(rscratch1, L);
+  __ stxr(rscratch3, rscratch1, rscratch2);
+  __ cbnzw(rscratch3, L);
+  __ pop(rscratch3);
   __ pop(rscratch2);
   __ pop(rscratch1);
 }
--- a/src/cpu/x86/vm/assembler_x86.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -1674,6 +1674,13 @@
   emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3, true);
 }
 
+void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, true);
+  emit_int8(0x2A);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3);
@@ -6604,13 +6611,6 @@
   emit_operand(dst, src);
 }
 
-void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
-  NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, true);
-  emit_int8(0x2A);
-  emit_int8((unsigned char)(0xC0 | encode));
-}
-
 void Assembler::cvtsi2ssq(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   if (VM_Version::supports_evex()) {
--- a/src/cpu/x86/vm/interp_masm_x86.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/cpu/x86/vm/interp_masm_x86.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -355,8 +355,8 @@
     case ctos:                                   // fall through
     case stos:                                   // fall through
     case itos: movl(rax, val_addr);                 break;
-    case ftos: movflt(xmm0, val_addr);              break;
-    case dtos: movdbl(xmm0, val_addr);              break;
+    case ftos: load_float(val_addr);                break;
+    case dtos: load_double(val_addr);               break;
     case vtos: /* nothing to do */                  break;
     default  : ShouldNotReachHere();
   }
@@ -376,8 +376,8 @@
     case ctos:                                     // fall through
     case stos:                                     // fall through
     case itos: movl(rax, val_addr);                   break;
-    case ftos: fld_s(val_addr);                       break;
-    case dtos: fld_d(val_addr);                       break;
+    case ftos: load_float(val_addr);                  break;
+    case dtos: load_double(val_addr);                 break;
     case vtos: /* nothing to do */                    break;
     default  : ShouldNotReachHere();
   }
@@ -578,6 +578,26 @@
   push(r);
 }
 
+void InterpreterMacroAssembler::push_f(XMMRegister r) {
+  subptr(rsp, wordSize);
+  movflt(Address(rsp, 0), r);
+}
+
+void InterpreterMacroAssembler::pop_f(XMMRegister r) {
+  movflt(r, Address(rsp, 0));
+  addptr(rsp, wordSize);
+}
+
+void InterpreterMacroAssembler::push_d(XMMRegister r) {
+  subptr(rsp, 2 * wordSize);
+  movdbl(Address(rsp, 0), r);
+}
+
+void InterpreterMacroAssembler::pop_d(XMMRegister r) {
+  movdbl(r, Address(rsp, 0));
+  addptr(rsp, 2 * Interpreter::stackElementSize);
+}
+
 #ifdef _LP64
 void InterpreterMacroAssembler::pop_i(Register r) {
   // XXX can't use pop currently, upper half non clean
@@ -590,31 +610,11 @@
   addptr(rsp, 2 * Interpreter::stackElementSize);
 }
 
-void InterpreterMacroAssembler::pop_f(XMMRegister r) {
-  movflt(r, Address(rsp, 0));
-  addptr(rsp, wordSize);
-}
-
-void InterpreterMacroAssembler::pop_d(XMMRegister r) {
-  movdbl(r, Address(rsp, 0));
-  addptr(rsp, 2 * Interpreter::stackElementSize);
-}
-
 void InterpreterMacroAssembler::push_l(Register r) {
   subptr(rsp, 2 * wordSize);
   movq(Address(rsp, 0), r);
 }
 
-void InterpreterMacroAssembler::push_f(XMMRegister r) {
-  subptr(rsp, wordSize);
-  movflt(Address(rsp, 0), r);
-}
-
-void InterpreterMacroAssembler::push_d(XMMRegister r) {
-  subptr(rsp, 2 * wordSize);
-  movdbl(Address(rsp, 0), r);
-}
-
 void InterpreterMacroAssembler::pop(TosState state) {
   switch (state) {
   case atos: pop_ptr();                 break;
@@ -623,8 +623,8 @@
   case stos:
   case itos: pop_i();                   break;
   case ltos: pop_l();                   break;
-  case ftos: pop_f();                   break;
-  case dtos: pop_d();                   break;
+  case ftos: pop_f(xmm0);               break;
+  case dtos: pop_d(xmm0);               break;
   case vtos: /* nothing to do */        break;
   default:   ShouldNotReachHere();
   }
@@ -640,8 +640,8 @@
   case stos:
   case itos: push_i();                  break;
   case ltos: push_l();                  break;
-  case ftos: push_f();                  break;
-  case dtos: push_d();                  break;
+  case ftos: push_f(xmm0);              break;
+  case dtos: push_d(xmm0);              break;
   case vtos: /* nothing to do */        break;
   default  : ShouldNotReachHere();
   }
@@ -675,8 +675,20 @@
     case stos:                                               // fall through
     case itos: pop_i(rax);                                   break;
     case ltos: pop_l(rax, rdx);                              break;
-    case ftos: pop_f();                                      break;
-    case dtos: pop_d();                                      break;
+    case ftos:
+      if (UseSSE >= 1) {
+        pop_f(xmm0);
+      } else {
+        pop_f();
+      }
+      break;
+    case dtos:
+      if (UseSSE >= 2) {
+        pop_d(xmm0);
+      } else {
+        pop_d();
+      }
+      break;
     case vtos: /* nothing to do */                           break;
     default  : ShouldNotReachHere();
   }
@@ -695,7 +707,7 @@
   fstp_s(Address(rsp, 0));
 }
 
-void InterpreterMacroAssembler::push_d(Register r) {
+void InterpreterMacroAssembler::push_d() {
   // Do not schedule for no AGI! Never write beyond rsp!
   subptr(rsp, 2 * wordSize);
   fstp_d(Address(rsp, 0));
@@ -711,8 +723,20 @@
     case stos:                                               // fall through
     case itos: push_i(rax);                                    break;
     case ltos: push_l(rax, rdx);                               break;
-    case ftos: push_f();                                       break;
-    case dtos: push_d(rax);                                    break;
+    case ftos:
+      if (UseSSE >= 1) {
+        push_f(xmm0);
+      } else {
+        push_f();
+      }
+      break;
+    case dtos:
+      if (UseSSE >= 2) {
+        push_d(xmm0);
+      } else {
+        push_d();
+      }
+      break;
     case vtos: /* nothing to do */                             break;
     default  : ShouldNotReachHere();
   }
@@ -995,22 +1019,6 @@
   leave();                           // remove frame anchor
   pop(ret_addr);                     // get return address
   mov(rsp, rbx);                     // set sp to sender sp
-#ifndef _LP64
-  if (UseSSE) {
-    // float and double are returned in xmm register in SSE-mode
-    if (state == ftos && UseSSE >= 1) {
-      subptr(rsp, wordSize);
-      fstp_s(Address(rsp, 0));
-      movflt(xmm0, Address(rsp, 0));
-      addptr(rsp, wordSize);
-    } else if (state == dtos && UseSSE >= 2) {
-      subptr(rsp, 2*wordSize);
-      fstp_d(Address(rsp, 0));
-      movdbl(xmm0, Address(rsp, 0));
-      addptr(rsp, 2*wordSize);
-    }
-  }
-#endif // _LP64
 }
 #endif // !CC_INTERP
 
@@ -1783,7 +1791,10 @@
 
 void InterpreterMacroAssembler::verify_FPU(int stack_depth, TosState state) {
 #ifndef _LP64
-  if (state == ftos || state == dtos) MacroAssembler::verify_FPU(stack_depth);
+  if ((state == ftos && UseSSE < 1) ||
+      (state == dtos && UseSSE < 2)) {
+    MacroAssembler::verify_FPU(stack_depth);
+  }
 #endif
 }
 
--- a/src/cpu/x86/vm/interp_masm_x86.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/cpu/x86/vm/interp_masm_x86.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -140,20 +140,20 @@
   void push_ptr(Register r = rax);
   void push_i(Register r = rax);
 
+  void push_f(XMMRegister r);
+  void pop_f(XMMRegister r);
+  void pop_d(XMMRegister r);
+  void push_d(XMMRegister r);
 #ifdef _LP64
   void pop_l(Register r = rax);
-  void pop_f(XMMRegister r = xmm0);
-  void pop_d(XMMRegister r = xmm0);
   void push_l(Register r = rax);
-  void push_f(XMMRegister r = xmm0);
-  void push_d(XMMRegister r = xmm0);
 #else
   void pop_l(Register lo = rax, Register hi = rdx);
   void pop_f();
   void pop_d();
 
   void push_l(Register lo = rax, Register hi = rdx);
-  void push_d(Register r = rax);
+  void push_d();
   void push_f();
 #endif // _LP64
 
--- a/src/cpu/x86/vm/interpreterGenerator_x86.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/cpu/x86/vm/interpreterGenerator_x86.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -42,6 +42,12 @@
   address generate_Reference_get_entry();
   address generate_CRC32_update_entry();
   address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
+#ifndef _LP64
+  address generate_Float_intBitsToFloat_entry();
+  address generate_Float_floatToRawIntBits_entry();
+  address generate_Double_longBitsToDouble_entry();
+  address generate_Double_doubleToRawLongBits_entry();
+#endif
   void lock_method(void);
   void generate_stack_overflow_check(void);
 
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -3314,6 +3314,42 @@
   fincstp();
 }
 
+void MacroAssembler::load_float(Address src) {
+  if (UseSSE >= 1) {
+    movflt(xmm0, src);
+  } else {
+    LP64_ONLY(ShouldNotReachHere());
+    NOT_LP64(fld_s(src));
+  }
+}
+
+void MacroAssembler::store_float(Address dst) {
+  if (UseSSE >= 1) {
+    movflt(dst, xmm0);
+  } else {
+    LP64_ONLY(ShouldNotReachHere());
+    NOT_LP64(fstp_s(dst));
+  }
+}
+
+void MacroAssembler::load_double(Address src) {
+  if (UseSSE >= 2) {
+    movdbl(xmm0, src);
+  } else {
+    LP64_ONLY(ShouldNotReachHere());
+    NOT_LP64(fld_d(src));
+  }
+}
+
+void MacroAssembler::store_double(Address dst) {
+  if (UseSSE >= 2) {
+    movdbl(dst, xmm0);
+  } else {
+    LP64_ONLY(ShouldNotReachHere());
+    NOT_LP64(fstp_d(dst));
+  }
+}
+
 void MacroAssembler::fremr(Register tmp) {
   save_rax(tmp);
   { Label L;
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -471,6 +471,22 @@
   // Pop ST (ffree & fincstp combined)
   void fpop();
 
+  // Load float value from 'address'. If UseSSE >= 1, the value is loaded into
+  // register xmm0. Otherwise, the value is loaded onto the FPU stack.
+  void load_float(Address src);
+
+  // Store float value to 'address'. If UseSSE >= 1, the value is stored
+  // from register xmm0. Otherwise, the value is stored from the FPU stack.
+  void store_float(Address dst);
+
+  // Load double value from 'address'. If UseSSE >= 2, the value is loaded into
+  // register xmm0. Otherwise, the value is loaded onto the FPU stack.
+  void load_double(Address src);
+
+  // Store double value to 'address'. If UseSSE >= 2, the value is stored
+  // from register xmm0. Otherwise, the value is stored from the FPU stack.
+  void store_double(Address dst);
+
   // pushes double TOS element of FPU stack on CPU stack; pops from FPU stack
   void push_fTOS();
 
--- a/src/cpu/x86/vm/templateInterpreter_x86_32.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/cpu/x86/vm/templateInterpreter_x86_32.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -170,22 +170,12 @@
     __ MacroAssembler::verify_FPU(0, "generate_return_entry_for compiled");
   }
 
-  // In SSE mode, interpreter returns FP results in xmm0 but they need
-  // to end up back on the FPU so it can operate on them.
-  if (state == ftos && UseSSE >= 1) {
-    __ subptr(rsp, wordSize);
-    __ movflt(Address(rsp, 0), xmm0);
-    __ fld_s(Address(rsp, 0));
-    __ addptr(rsp, wordSize);
-  } else if (state == dtos && UseSSE >= 2) {
-    __ subptr(rsp, 2*wordSize);
-    __ movdbl(Address(rsp, 0), xmm0);
-    __ fld_d(Address(rsp, 0));
-    __ addptr(rsp, 2*wordSize);
+  if (state == ftos) {
+    __ MacroAssembler::verify_FPU(UseSSE >= 1 ? 0 : 1, "generate_return_entry_for in interpreter");
+  } else if (state == dtos) {
+    __ MacroAssembler::verify_FPU(UseSSE >= 2 ? 0 : 1, "generate_return_entry_for in interpreter");
   }
 
-  __ MacroAssembler::verify_FPU(state == ftos || state == dtos ? 1 : 0, "generate_return_entry_for in interpreter");
-
   // Restore stack bottom in case i2c adjusted stack
   __ movptr(rsp, Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize));
   // and NULL it as marker that rsp is now tos until next java call
@@ -217,21 +207,12 @@
 address TemplateInterpreterGenerator::generate_deopt_entry_for(TosState state, int step) {
   address entry = __ pc();
 
-  // In SSE mode, FP results are in xmm0
-  if (state == ftos && UseSSE > 0) {
-    __ subptr(rsp, wordSize);
-    __ movflt(Address(rsp, 0), xmm0);
-    __ fld_s(Address(rsp, 0));
-    __ addptr(rsp, wordSize);
-  } else if (state == dtos && UseSSE >= 2) {
-    __ subptr(rsp, 2*wordSize);
-    __ movdbl(Address(rsp, 0), xmm0);
-    __ fld_d(Address(rsp, 0));
-    __ addptr(rsp, 2*wordSize);
+  if (state == ftos) {
+    __ MacroAssembler::verify_FPU(UseSSE >= 1 ? 0 : 1, "generate_deopt_entry_for in interpreter");
+  } else if (state == dtos) {
+    __ MacroAssembler::verify_FPU(UseSSE >= 2 ? 0 : 1, "generate_deopt_entry_for in interpreter");
   }
 
-  __ MacroAssembler::verify_FPU(state == ftos || state == dtos ? 1 : 0, "generate_deopt_entry_for in interpreter");
-
   // The stack is not extended by deopt but we must NULL last_sp as this
   // entry is like a "return".
   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
@@ -735,7 +716,7 @@
   if (UseCRC32Intrinsics) {
     address entry = __ pc();
 
-    // rbx,: Method*
+    // rbx: Method*
     // rsi: senderSP must preserved for slow path, set SP to it on fast path
     // rdx: scratch
     // rdi: scratch
@@ -841,6 +822,124 @@
   return generate_native_entry(false);
 }
 
+/**
+ * Method entry for static native method:
+ *    java.lang.Float.intBitsToFloat(int bits)
+ */
+address InterpreterGenerator::generate_Float_intBitsToFloat_entry() {
+  address entry;
+
+  if (UseSSE >= 1) {
+    entry = __ pc();
+
+    // rsi: the sender's SP
+
+    // Skip safepoint check (compiler intrinsic versions of this method
+    // do not perform safepoint checks either).
+
+    // Load 'bits' into xmm0 (interpreter returns results in xmm0)
+    __ movflt(xmm0, Address(rsp, wordSize));
+
+    // Return
+    __ pop(rdi); // get return address
+    __ mov(rsp, rsi); // set rsp to the sender's SP
+    __ jmp(rdi);
+  } else {
+    entry = generate_native_entry(false);
+  }
+
+  return entry;
+}
+
+/**
+ * Method entry for static native method:
+ *    java.lang.Float.floatToRawIntBits(float value)
+ */
+address InterpreterGenerator::generate_Float_floatToRawIntBits_entry() {
+  address entry;
+
+  if (UseSSE >= 1) {
+    entry = __ pc();
+
+    // rsi: the sender's SP
+
+    // Skip safepoint check (compiler intrinsic versions of this method
+    // do not perform safepoint checks either).
+
+    // Load the parameter (a floating-point value) into rax.
+    __ movl(rax, Address(rsp, wordSize));
+
+    // Return
+    __ pop(rdi); // get return address
+    __ mov(rsp, rsi); // set rsp to the sender's SP
+    __ jmp(rdi);
+  } else {
+    entry = generate_native_entry(false);
+  }
+
+  return entry;
+}
+
+
+/**
+ * Method entry for static native method:
+ *    java.lang.Double.longBitsToDouble(long bits)
+ */
+address InterpreterGenerator::generate_Double_longBitsToDouble_entry() {
+  address entry;
+
+  if (UseSSE >= 2) {
+    entry = __ pc();
+
+    // rsi: the sender's SP
+
+    // Skip safepoint check (compiler intrinsic versions of this method
+    // do not perform safepoint checks either).
+
+    // Load 'bits' into xmm0 (interpreter returns results in xmm0)
+    __ movdbl(xmm0, Address(rsp, wordSize));
+
+    // Return
+    __ pop(rdi); // get return address
+    __ mov(rsp, rsi); // set rsp to the sender's SP
+    __ jmp(rdi);
+  } else {
+    entry = generate_native_entry(false);
+  }
+
+  return entry;
+}
+
+/**
+ * Method entry for static native method:
+ *    java.lang.Double.doubleToRawLongBits(double value)
+ */
+address InterpreterGenerator::generate_Double_doubleToRawLongBits_entry() {
+  address entry;
+
+  if (UseSSE >= 2) {
+    entry = __ pc();
+
+    // rsi: the sender's SP
+
+    // Skip safepoint check (compiler intrinsic versions of this method
+    // do not perform safepoint checks either).
+
+    // Load the parameter (a floating-point value) into rdx:rax.
+    __ movl(rdx, Address(rsp, 2*wordSize));
+    __ movl(rax, Address(rsp, wordSize));
+
+    // Return
+    __ pop(rdi); // get return address
+    __ mov(rsp, rsi); // set rsp to the sender's SP
+    __ jmp(rdi);
+  } else {
+    entry = generate_native_entry(false);
+  }
+
+  return entry;
+}
+
 //
 // Interpreter stub for calling a native method. (asm interpreter)
 // This sets up a somewhat different looking stack for calling the native method
@@ -1090,7 +1189,7 @@
               double_handler.addr());
     __ jcc(Assembler::notEqual, L);
     __ bind(push_double);
-    __ push(dtos);
+    __ push_d(); // FP values are returned using the FPU, so push FPU contents (even if UseSSE > 0).
     __ bind(L);
   }
   __ push(ltos);
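The new x86_32 entries above for Float.intBitsToFloat, Float.floatToRawIntBits, Double.longBitsToDouble and Double.doubleToRawLongBits can be this short because the operations are pure bit reinterpretations: with UseSSE >= 1 the interpreter returns float/double results in xmm0, so moving the argument's raw bits into the return register is already the complete result. A standalone sketch of the underlying operation (illustrative names, not HotSpot code):

#include <cstdint>
#include <cstring>

// intBitsToFloat copies the bit pattern unchanged; no numeric conversion happens.
static float int_bits_to_float(int32_t bits) {
  float result;
  std::memcpy(&result, &bits, sizeof(result));
  return result;
}

// floatToRawIntBits is the same copy in the other direction.
static int32_t float_to_raw_int_bits(float value) {
  int32_t bits;
  std::memcpy(&bits, &value, sizeof(bits));
  return bits;
}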
--- a/src/cpu/x86/vm/templateInterpreter_x86_64.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/cpu/x86/vm/templateInterpreter_x86_64.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -1707,10 +1707,10 @@
                                                          address& vep) {
   assert(t->is_valid() && t->tos_in() == vtos, "illegal template");
   Label L;
-  aep = __ pc();  __ push_ptr();  __ jmp(L);
-  fep = __ pc();  __ push_f();    __ jmp(L);
-  dep = __ pc();  __ push_d();    __ jmp(L);
-  lep = __ pc();  __ push_l();    __ jmp(L);
+  aep = __ pc();  __ push_ptr();   __ jmp(L);
+  fep = __ pc();  __ push_f(xmm0); __ jmp(L);
+  dep = __ pc();  __ push_d(xmm0); __ jmp(L);
+  lep = __ pc();  __ push_l();     __ jmp(L);
   bep = cep = sep =
   iep = __ pc();  __ push_i();
   vep = __ pc();
--- a/src/cpu/x86/vm/templateTable_x86.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/cpu/x86/vm/templateTable_x86.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -349,53 +349,60 @@
 
 void TemplateTable::fconst(int value) {
   transition(vtos, ftos);
+  if (UseSSE >= 1) {
+    static float one = 1.0f, two = 2.0f;
+    switch (value) {
+    case 0:
+      __ xorps(xmm0, xmm0);
+      break;
+    case 1:
+      __ movflt(xmm0, ExternalAddress((address) &one));
+      break;
+    case 2:
+      __ movflt(xmm0, ExternalAddress((address) &two));
+      break;
+    default:
+      ShouldNotReachHere();
+      break;
+    }
+  } else {
 #ifdef _LP64
-  static float one = 1.0f, two = 2.0f;
-  switch (value) {
-  case 0:
-    __ xorps(xmm0, xmm0);
-    break;
-  case 1:
-    __ movflt(xmm0, ExternalAddress((address) &one));
-    break;
-  case 2:
-    __ movflt(xmm0, ExternalAddress((address) &two));
-    break;
-  default:
     ShouldNotReachHere();
-    break;
+#else
+           if (value == 0) { __ fldz();
+    } else if (value == 1) { __ fld1();
+    } else if (value == 2) { __ fld1(); __ fld1(); __ faddp(); // should do a better solution here
+    } else                 { ShouldNotReachHere();
+    }
+#endif // _LP64
   }
-#else
-         if (value == 0) { __ fldz();
-  } else if (value == 1) { __ fld1();
-  } else if (value == 2) { __ fld1(); __ fld1(); __ faddp(); // should do a better solution here
-  } else                 { ShouldNotReachHere();
-  }
-#endif
 }
 
 void TemplateTable::dconst(int value) {
   transition(vtos, dtos);
+  if (UseSSE >= 2) {
+    static double one = 1.0;
+    switch (value) {
+    case 0:
+      __ xorpd(xmm0, xmm0);
+      break;
+    case 1:
+      __ movdbl(xmm0, ExternalAddress((address) &one));
+      break;
+    default:
+      ShouldNotReachHere();
+      break;
+    }
+  } else {
 #ifdef _LP64
-  static double one = 1.0;
-  switch (value) {
-  case 0:
-    __ xorpd(xmm0, xmm0);
-    break;
-  case 1:
-    __ movdbl(xmm0, ExternalAddress((address) &one));
-    break;
-  default:
     ShouldNotReachHere();
-    break;
+#else
+           if (value == 0) { __ fldz();
+    } else if (value == 1) { __ fld1();
+    } else                 { ShouldNotReachHere();
+    }
+#endif
   }
-
-#else
-         if (value == 0) { __ fldz();
-  } else if (value == 1) { __ fld1();
-  } else                 { ShouldNotReachHere();
-  }
-#endif
 }
 
 void TemplateTable::bipush() {
@@ -454,8 +461,7 @@
   __ jccb(Assembler::notEqual, notFloat);
 
   // ftos
-  LP64_ONLY(__ movflt(xmm0, Address(rcx, rbx, Address::times_8, base_offset)));
-  NOT_LP64(__ fld_s(    Address(rcx, rbx, Address::times_ptr, base_offset)));
+  __ load_float(Address(rcx, rbx, Address::times_ptr, base_offset));
   __ push(ftos);
   __ jmp(Done);
 
@@ -522,8 +528,7 @@
   __ jccb(Assembler::notEqual, Long);
 
   // dtos
-  LP64_ONLY(__ movdbl(xmm0, Address(rcx, rbx, Address::times_8, base_offset)));
-  NOT_LP64(__ fld_d(    Address(rcx, rbx, Address::times_ptr, base_offset)));
+  __ load_double(Address(rcx, rbx, Address::times_ptr, base_offset));
   __ push(dtos);
 
   __ jmpb(Done);
@@ -617,15 +622,13 @@
 void TemplateTable::fload() {
   transition(vtos, ftos);
   locals_index(rbx);
-  LP64_ONLY(__ movflt(xmm0, faddress(rbx)));
-  NOT_LP64(__ fld_s(faddress(rbx)));
+  __ load_float(faddress(rbx));
 }
 
 void TemplateTable::dload() {
   transition(vtos, dtos);
   locals_index(rbx);
-  LP64_ONLY(__ movdbl(xmm0, daddress(rbx)));
-  NOT_LP64(__ fld_d(daddress(rbx)));
+  __ load_double(daddress(rbx));
 }
 
 void TemplateTable::aload() {
@@ -657,15 +660,13 @@
 void TemplateTable::wide_fload() {
   transition(vtos, ftos);
   locals_index_wide(rbx);
-  LP64_ONLY(__ movflt(xmm0, faddress(rbx)));
-  NOT_LP64(__ fld_s(faddress(rbx)));
+  __ load_float(faddress(rbx));
 }
 
 void TemplateTable::wide_dload() {
   transition(vtos, dtos);
   locals_index_wide(rbx);
-  LP64_ONLY(__ movdbl(xmm0, daddress(rbx)));
-  NOT_LP64(__ fld_d(daddress(rbx)));
+  __ load_double(daddress(rbx));
 }
 
 void TemplateTable::wide_aload() {
@@ -726,10 +727,9 @@
   // rax: index
   // rdx: array
   index_check(rdx, rax); // kills rbx
-  LP64_ONLY(__ movflt(xmm0, Address(rdx, rax,
-                         Address::times_4,
-                         arrayOopDesc::base_offset_in_bytes(T_FLOAT))));
-  NOT_LP64(__ fld_s(Address(rdx, rax, Address::times_4, arrayOopDesc::base_offset_in_bytes(T_FLOAT))));
+  __ load_float(Address(rdx, rax,
+                        Address::times_4,
+                        arrayOopDesc::base_offset_in_bytes(T_FLOAT)));
 }
 
 void TemplateTable::daload() {
@@ -737,10 +737,9 @@
   // rax: index
   // rdx: array
   index_check(rdx, rax); // kills rbx
-  LP64_ONLY(__ movdbl(xmm0, Address(rdx, rax,
-                          Address::times_8,
-                          arrayOopDesc::base_offset_in_bytes(T_DOUBLE))));
-  NOT_LP64(__ fld_d(Address(rdx, rax, Address::times_8, arrayOopDesc::base_offset_in_bytes(T_DOUBLE))));
+  __ load_double(Address(rdx, rax,
+                         Address::times_8,
+                         arrayOopDesc::base_offset_in_bytes(T_DOUBLE)));
 }
 
 void TemplateTable::aaload() {
@@ -807,14 +806,12 @@
 
 void TemplateTable::fload(int n) {
   transition(vtos, ftos);
-  LP64_ONLY(__ movflt(xmm0, faddress(n)));
-  NOT_LP64(__ fld_s(faddress(n)));
+  __ load_float(faddress(n));
 }
 
 void TemplateTable::dload(int n) {
   transition(vtos, dtos);
-  LP64_ONLY(__ movdbl(xmm0, daddress(n)));
-  NOT_LP64(__ fld_d(daddress(n)));
+  __ load_double(daddress(n));
 }
 
 void TemplateTable::aload(int n) {
@@ -919,15 +916,13 @@
 void TemplateTable::fstore() {
   transition(ftos, vtos);
   locals_index(rbx);
-  LP64_ONLY(__ movflt(faddress(rbx), xmm0));
-  NOT_LP64(__ fstp_s(faddress(rbx)));
+  __ store_float(faddress(rbx));
 }
 
 void TemplateTable::dstore() {
   transition(dtos, vtos);
   locals_index(rbx);
-  LP64_ONLY(__ movdbl(daddress(rbx), xmm0));
-  NOT_LP64(__ fstp_d(daddress(rbx)));
+  __ store_double(daddress(rbx));
 }
 
 void TemplateTable::astore() {
@@ -956,7 +951,7 @@
 void TemplateTable::wide_fstore() {
 #ifdef _LP64
   transition(vtos, vtos);
-  __ pop_f();
+  __ pop_f(xmm0);
   locals_index_wide(rbx);
   __ movflt(faddress(rbx), xmm0);
 #else
@@ -967,7 +962,7 @@
 void TemplateTable::wide_dstore() {
 #ifdef _LP64
   transition(vtos, vtos);
-  __ pop_d();
+  __ pop_d(xmm0);
   locals_index_wide(rbx);
   __ movdbl(daddress(rbx), xmm0);
 #else
@@ -1011,29 +1006,21 @@
 void TemplateTable::fastore() {
   transition(ftos, vtos);
   __ pop_i(rbx);
-  // xmm0: value
+  // value is in UseSSE >= 1 ? xmm0 : ST(0)
   // rbx:  index
   // rdx:  array
   index_check(rdx, rbx); // prefer index in rbx
-  LP64_ONLY(__ movflt(Address(rdx, rbx,
-                   Address::times_4,
-                   arrayOopDesc::base_offset_in_bytes(T_FLOAT)),
-           xmm0));
-  NOT_LP64(__ fstp_s(Address(rdx, rbx, Address::times_4, arrayOopDesc::base_offset_in_bytes(T_FLOAT))));
+  __ store_float(Address(rdx, rbx, Address::times_4, arrayOopDesc::base_offset_in_bytes(T_FLOAT)));
 }
 
 void TemplateTable::dastore() {
   transition(dtos, vtos);
   __ pop_i(rbx);
-  // xmm0: value
+  // value is in UseSSE >= 2 ? xmm0 : ST(0)
   // rbx:  index
   // rdx:  array
   index_check(rdx, rbx); // prefer index in rbx
-  LP64_ONLY(__ movdbl(Address(rdx, rbx,
-                   Address::times_8,
-                   arrayOopDesc::base_offset_in_bytes(T_DOUBLE)),
-           xmm0));
-  NOT_LP64(__ fstp_d(Address(rdx, rbx, Address::times_8, arrayOopDesc::base_offset_in_bytes(T_DOUBLE))));
+  __ store_double(Address(rdx, rbx, Address::times_8, arrayOopDesc::base_offset_in_bytes(T_DOUBLE)));
 }
 
 void TemplateTable::aastore() {
@@ -1134,14 +1121,12 @@
 
 void TemplateTable::fstore(int n) {
   transition(ftos, vtos);
-  LP64_ONLY(__ movflt(faddress(n), xmm0));
-  NOT_LP64(__ fstp_s(faddress(n)));
+  __ store_float(faddress(n));
 }
 
 void TemplateTable::dstore(int n) {
   transition(dtos, vtos);
-  LP64_ONLY(__ movdbl(daddress(n), xmm0));
-  NOT_LP64(__ fstp_d(daddress(n)));
+  __ store_double(daddress(n));
 }
 
 
@@ -1425,82 +1410,127 @@
 
 void TemplateTable::fop2(Operation op) {
   transition(ftos, ftos);
+
+  if (UseSSE >= 1) {
+    switch (op) {
+    case add:
+      __ addss(xmm0, at_rsp());
+      __ addptr(rsp, Interpreter::stackElementSize);
+      break;
+    case sub:
+      __ movflt(xmm1, xmm0);
+      __ pop_f(xmm0);
+      __ subss(xmm0, xmm1);
+      break;
+    case mul:
+      __ mulss(xmm0, at_rsp());
+      __ addptr(rsp, Interpreter::stackElementSize);
+      break;
+    case div:
+      __ movflt(xmm1, xmm0);
+      __ pop_f(xmm0);
+      __ divss(xmm0, xmm1);
+      break;
+    case rem:
+      // On x86_64 platforms the SharedRuntime::frem method is called to perform the
+      // modulo operation. The frem method calls the function
+      // double fmod(double x, double y) in math.h. The documentation of fmod states:
+      // "If x or y is a NaN, a NaN is returned." without specifying what type of NaN
+      // (signalling or quiet) is returned.
+      //
+      // On x86_32 platforms the FPU is used to perform the modulo operation. The
+      // reason is that on 32-bit Windows the sign of modulo operations diverges from
+      // what is considered the standard (e.g., -0.0f % -3.14f is 0.0f (and not -0.0f)).
+      // The fprem instruction used on x86_32 is functionally equivalent to
+      // SharedRuntime::frem in that it returns a NaN.
 #ifdef _LP64
-  switch (op) {
-  case add:
-    __ addss(xmm0, at_rsp());
-    __ addptr(rsp, Interpreter::stackElementSize);
-    break;
-  case sub:
-    __ movflt(xmm1, xmm0);
-    __ pop_f(xmm0);
-    __ subss(xmm0, xmm1);
-    break;
-  case mul:
-    __ mulss(xmm0, at_rsp());
-    __ addptr(rsp, Interpreter::stackElementSize);
-    break;
-  case div:
-    __ movflt(xmm1, xmm0);
-    __ pop_f(xmm0);
-    __ divss(xmm0, xmm1);
-    break;
-  case rem:
-    __ movflt(xmm1, xmm0);
-    __ pop_f(xmm0);
-    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::frem), 2);
-    break;
-  default:
+      __ movflt(xmm1, xmm0);
+      __ pop_f(xmm0);
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::frem), 2);
+#else
+      __ push_f(xmm0);
+      __ pop_f();
+      __ fld_s(at_rsp());
+      __ fremr(rax);
+      __ f2ieee();
+      __ pop(rax);  // pop second operand off the stack
+      __ push_f();
+      __ pop_f(xmm0);
+#endif
+      break;
+    default:
+      ShouldNotReachHere();
+      break;
+    }
+  } else {
+#ifdef _LP64
     ShouldNotReachHere();
-    break;
-  }
 #else
-  switch (op) {
+    switch (op) {
     case add: __ fadd_s (at_rsp());                break;
     case sub: __ fsubr_s(at_rsp());                break;
     case mul: __ fmul_s (at_rsp());                break;
     case div: __ fdivr_s(at_rsp());                break;
     case rem: __ fld_s  (at_rsp()); __ fremr(rax); break;
     default : ShouldNotReachHere();
+    }
+    __ f2ieee();
+    __ pop(rax);  // pop second operand off the stack
+#endif // _LP64
   }
-  __ f2ieee();
-  __ pop(rax);  // pop float thing off
-#endif
 }
 
 void TemplateTable::dop2(Operation op) {
   transition(dtos, dtos);
+  if (UseSSE >= 2) {
+    switch (op) {
+    case add:
+      __ addsd(xmm0, at_rsp());
+      __ addptr(rsp, 2 * Interpreter::stackElementSize);
+      break;
+    case sub:
+      __ movdbl(xmm1, xmm0);
+      __ pop_d(xmm0);
+      __ subsd(xmm0, xmm1);
+      break;
+    case mul:
+      __ mulsd(xmm0, at_rsp());
+      __ addptr(rsp, 2 * Interpreter::stackElementSize);
+      break;
+    case div:
+      __ movdbl(xmm1, xmm0);
+      __ pop_d(xmm0);
+      __ divsd(xmm0, xmm1);
+      break;
+    case rem:
+      // Similar to fop2(), the modulo operation is performed using the
+      // SharedRuntime::drem method (on x86_64 platforms) or using the
+      // FPU (on x86_32 platforms) for the same reasons as mentioned in fop2().
 #ifdef _LP64
-  switch (op) {
-  case add:
-    __ addsd(xmm0, at_rsp());
-    __ addptr(rsp, 2 * Interpreter::stackElementSize);
-    break;
-  case sub:
-    __ movdbl(xmm1, xmm0);
-    __ pop_d(xmm0);
-    __ subsd(xmm0, xmm1);
-    break;
-  case mul:
-    __ mulsd(xmm0, at_rsp());
-    __ addptr(rsp, 2 * Interpreter::stackElementSize);
-    break;
-  case div:
-    __ movdbl(xmm1, xmm0);
-    __ pop_d(xmm0);
-    __ divsd(xmm0, xmm1);
-    break;
-  case rem:
-    __ movdbl(xmm1, xmm0);
-    __ pop_d(xmm0);
-    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::drem), 2);
-    break;
-  default:
+      __ movdbl(xmm1, xmm0);
+      __ pop_d(xmm0);
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::drem), 2);
+#else
+      __ push_d(xmm0);
+      __ pop_d();
+      __ fld_d(at_rsp());
+      __ fremr(rax);
+      __ d2ieee();
+      __ pop(rax);  // pop second operand (two slots) off the stack
+      __ pop(rdx);
+      __ push_d();
+      __ pop_d(xmm0);
+#endif
+      break;
+    default:
+      ShouldNotReachHere();
+      break;
+    }
+  } else {
+#ifdef _LP64
     ShouldNotReachHere();
-    break;
-  }
 #else
-  switch (op) {
+    switch (op) {
     case add: __ fadd_d (at_rsp());                break;
     case sub: __ fsubr_d(at_rsp());                break;
     case mul: {
@@ -1543,12 +1573,13 @@
     }
     case rem: __ fld_d  (at_rsp()); __ fremr(rax); break;
     default : ShouldNotReachHere();
+    }
+    __ d2ieee();
+    // Pop double precision number from rsp.
+    __ pop(rax);
+    __ pop(rdx);
+#endif
   }
-  __ d2ieee();
-  // Pop double precision number from rsp.
-  __ pop(rax);
-  __ pop(rdx);
-#endif
 }
 
 void TemplateTable::ineg() {
@@ -1562,7 +1593,6 @@
   NOT_LP64(__ lneg(rdx, rax));
 }
 
-#ifdef _LP64
 // Note: 'double' and 'long long' have 32-bits alignment on x86.
 static jlong* double_quadword(jlong *adr, jlong lo, jlong hi) {
   // Use the expression (adr)&(~0xF) to provide 128-bits aligned address
@@ -1577,26 +1607,30 @@
 // Buffer for 128-bits masks used by SSE instructions.
 static jlong float_signflip_pool[2*2];
 static jlong double_signflip_pool[2*2];
-#endif
 
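The 2*2 pools above reserve 32 bytes so that double_quadword() can always carve a 16-byte-aligned 128-bit slot out of them by masking an address with ~0xF. A minimal standalone sketch of the same trick (illustrative only, not the HotSpot helper):

    #include <cstdint>

    alignas(8) static int64_t pool[2 * 2];   // 32 bytes, at least 8-byte aligned

    // Round &pool[1] down to a 16-byte boundary; the result stays inside the
    // pool and leaves room for a full 128-bit operand, which is where the
    // xorps/xorpd sign-flip masks are stored.
    static int64_t* aligned_quadword() {
      return (int64_t*)(((uintptr_t)&pool[1]) & ~(uintptr_t)0xF);
    }

    int main() {
      int64_t* operand = aligned_quadword();
      operand[0] = (int64_t)0x8000000080000000ULL;   // float sign-flip mask
      operand[1] = (int64_t)0x8000000080000000ULL;
      return ((uintptr_t)operand & 0xF) == 0 ? 0 : 1;
    }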
 void TemplateTable::fneg() {
   transition(ftos, ftos);
-#ifdef _LP64
-  static jlong *float_signflip  = double_quadword(&float_signflip_pool[1], 0x8000000080000000, 0x8000000080000000);
-  __ xorps(xmm0, ExternalAddress((address) float_signflip));
-#else
-  __ fchs();
-#endif
+  if (UseSSE >= 1) {
+    static jlong *float_signflip  = double_quadword(&float_signflip_pool[1], 0x8000000080000000, 0x8000000080000000);
+    __ xorps(xmm0, ExternalAddress((address) float_signflip));
+  } else {
+    LP64_ONLY(ShouldNotReachHere());
+    NOT_LP64(__ fchs());
+  }
 }
 
 void TemplateTable::dneg() {
   transition(dtos, dtos);
+  if (UseSSE >= 2) {
+    static jlong *double_signflip  = double_quadword(&double_signflip_pool[1], 0x8000000000000000, 0x8000000000000000);
+    __ xorpd(xmm0, ExternalAddress((address) double_signflip));
+  } else {
 #ifdef _LP64
-  static jlong *double_signflip  = double_quadword(&double_signflip_pool[1], 0x8000000000000000, 0x8000000000000000);
-  __ xorpd(xmm0, ExternalAddress((address) double_signflip));
+    ShouldNotReachHere();
 #else
-  __ fchs();
+    __ fchs();
 #endif
+  }
 }
 
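What the xorps/xorpd paths in fneg() and dneg() compute is a plain sign-bit flip; a minimal host-side sketch for the float case (not the generated interpreter code):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Negate by XORing the IEEE 754 sign bit, mirroring xorps with the
    // 0x80000000 mask (dneg uses xorpd with 0x8000000000000000 for doubles).
    static float fneg_via_xor(float f) {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof bits);
      bits ^= 0x80000000u;
      std::memcpy(&f, &bits, sizeof f);
      return f;
    }

    int main() {
      std::printf("%f %f\n", fneg_via_xor(1.5f), fneg_via_xor(-0.0f));  // -1.5 0.0
      return 0;
    }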
 void TemplateTable::iinc() {
@@ -1798,18 +1832,26 @@
       __ extend_sign(rdx, rax);
       break;
     case Bytecodes::_i2f:
-      __ push(rax);          // store int on tos
-      __ fild_s(at_rsp());   // load int to ST0
-      __ f2ieee();           // truncate to float size
-      __ pop(rcx);           // adjust rsp
+      if (UseSSE >= 1) {
+        __ cvtsi2ssl(xmm0, rax);
+      } else {
+        __ push(rax);          // store int on tos
+        __ fild_s(at_rsp());   // load int to ST0
+        __ f2ieee();           // truncate to float size
+        __ pop(rcx);           // adjust rsp
+      }
       break;
     case Bytecodes::_i2d:
+      if (UseSSE >= 2) {
+        __ cvtsi2sdl(xmm0, rax);
+      } else {
       __ push(rax);          // add one slot for d2ieee()
       __ push(rax);          // store int on tos
       __ fild_s(at_rsp());   // load int to ST0
       __ d2ieee();           // truncate to double size
       __ pop(rcx);           // adjust rsp
       __ pop(rcx);
+      }
       break;
     case Bytecodes::_i2b:
       __ shll(rax, 24);      // truncate upper 24 bits
@@ -1829,50 +1871,102 @@
       /* nothing to do */
       break;
     case Bytecodes::_l2f:
+      // On 64-bit platforms, the cvtsi2ssq instruction is used to convert
+      // 64-bit long values to floats. On 32-bit platforms it is not possible
+      // to use that instruction with 64-bit operands, therefore the FPU is
+      // used to perform the conversion.
       __ push(rdx);          // store long on tos
       __ push(rax);
       __ fild_d(at_rsp());   // load long to ST0
       __ f2ieee();           // truncate to float size
       __ pop(rcx);           // adjust rsp
       __ pop(rcx);
+      if (UseSSE >= 1) {
+        __ push_f();
+        __ pop_f(xmm0);
+      }
       break;
     case Bytecodes::_l2d:
+      // On 32-bit platforms the FPU is used for the conversion because the
+      // cvtsi2sdq instruction cannot be used with 64-bit operands there.
       __ push(rdx);          // store long on tos
       __ push(rax);
       __ fild_d(at_rsp());   // load long to ST0
       __ d2ieee();           // truncate to double size
       __ pop(rcx);           // adjust rsp
       __ pop(rcx);
+      if (UseSSE >= 2) {
+        __ push_d();
+        __ pop_d(xmm0);
+      }
       break;
     case Bytecodes::_f2i:
-      __ push(rcx);          // reserve space for argument
-      __ fstp_s(at_rsp());   // pass float argument on stack
+      // SharedRuntime::f2i does not differentiate between sNaNs and qNaNs
+      // as it returns 0 for any NaN.
+      if (UseSSE >= 1) {
+        __ push_f(xmm0);
+      } else {
+        __ push(rcx);          // reserve space for argument
+        __ fstp_s(at_rsp());   // pass float argument on stack
+      }
       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::f2i), 1);
       break;
     case Bytecodes::_f2l:
-      __ push(rcx);          // reserve space for argument
-      __ fstp_s(at_rsp());   // pass float argument on stack
+      // SharedRuntime::f2l does not differentiate between sNaNs and qNaNs
+      // as it returns 0 for any NaN.
+      if (UseSSE >= 1) {
+        __ push_f(xmm0);
+      } else {
+        __ push(rcx);          // reserve space for argument
+        __ fstp_s(at_rsp());   // pass float argument on stack
+      }
       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::f2l), 1);
       break;
     case Bytecodes::_f2d:
-      /* nothing to do */
+      if (UseSSE < 1) {
+        /* nothing to do */
+      } else if (UseSSE == 1) {
+        __ push_f(xmm0);
+        __ pop_f();
+      } else { // UseSSE >= 2
+        __ cvtss2sd(xmm0, xmm0);
+      }
       break;
     case Bytecodes::_d2i:
-      __ push(rcx);          // reserve space for argument
-      __ push(rcx);
-      __ fstp_d(at_rsp());   // pass double argument on stack
+      if (UseSSE >= 2) {
+        __ push_d(xmm0);
+      } else {
+        __ push(rcx);          // reserve space for argument
+        __ push(rcx);
+        __ fstp_d(at_rsp());   // pass double argument on stack
+      }
       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::d2i), 2);
       break;
     case Bytecodes::_d2l:
-      __ push(rcx);          // reserve space for argument
-      __ push(rcx);
-      __ fstp_d(at_rsp());   // pass double argument on stack
+      if (UseSSE >= 2) {
+        __ push_d(xmm0);
+      } else {
+        __ push(rcx);          // reserve space for argument
+        __ push(rcx);
+        __ fstp_d(at_rsp());   // pass double argument on stack
+      }
       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::d2l), 2);
       break;
     case Bytecodes::_d2f:
-      __ push(rcx);          // reserve space for f2ieee()
-      __ f2ieee();           // truncate to float size
-      __ pop(rcx);           // adjust rsp
+      if (UseSSE <= 1) {
+        __ push(rcx);          // reserve space for f2ieee()
+        __ f2ieee();           // truncate to float size
+        __ pop(rcx);           // adjust rsp
+        if (UseSSE == 1) {
+          // The cvtsd2ss instruction is not available if UseSSE==1, therefore
+          // the conversion is performed using the FPU in this case.
+          __ push_f();
+          __ pop_f(xmm0);
+        }
+      } else { // UseSSE >= 2
+        __ cvtsd2ss(xmm0, xmm0);
+      }
       break;
     default             :
       ShouldNotReachHere();
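The _f2i/_f2l/_d2i/_d2l cases above hand the value to SharedRuntime helpers whose contract follows the JLS narrowing rules; a minimal host-side sketch of those rules for float-to-int (an illustration of the spec, not the SharedRuntime code itself):

    #include <cmath>
    #include <cstdint>

    // JLS 5.1.3 narrowing of float to int: any NaN (signalling or quiet) maps
    // to 0, out-of-range values saturate, everything else truncates toward zero.
    static int32_t java_f2i(float x) {
      if (std::isnan(x))        return 0;
      if (x >= 2147483648.0f)   return INT32_MAX;   // 2^31 and above
      if (x <= -2147483648.0f)  return INT32_MIN;   // -2^31 and below
      return (int32_t)x;
    }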
@@ -1901,42 +1995,47 @@
 }
 
 void TemplateTable::float_cmp(bool is_float, int unordered_result) {
-#ifdef _LP64
-  Label done;
-  if (is_float) {
-    // XXX get rid of pop here, use ... reg, mem32
-    __ pop_f(xmm1);
-    __ ucomiss(xmm1, xmm0);
-  } else {
-    // XXX get rid of pop here, use ... reg, mem64
-    __ pop_d(xmm1);
-    __ ucomisd(xmm1, xmm0);
-  }
-  if (unordered_result < 0) {
-    __ movl(rax, -1);
-    __ jccb(Assembler::parity, done);
-    __ jccb(Assembler::below, done);
-    __ setb(Assembler::notEqual, rdx);
-    __ movzbl(rax, rdx);
+  if ((is_float && UseSSE >= 1) ||
+      (!is_float && UseSSE >= 2)) {
+    Label done;
+    if (is_float) {
+      // XXX get rid of pop here, use ... reg, mem32
+      __ pop_f(xmm1);
+      __ ucomiss(xmm1, xmm0);
+    } else {
+      // XXX get rid of pop here, use ... reg, mem64
+      __ pop_d(xmm1);
+      __ ucomisd(xmm1, xmm0);
+    }
+    if (unordered_result < 0) {
+      __ movl(rax, -1);
+      __ jccb(Assembler::parity, done);
+      __ jccb(Assembler::below, done);
+      __ setb(Assembler::notEqual, rdx);
+      __ movzbl(rax, rdx);
+    } else {
+      __ movl(rax, 1);
+      __ jccb(Assembler::parity, done);
+      __ jccb(Assembler::above, done);
+      __ movl(rax, 0);
+      __ jccb(Assembler::equal, done);
+      __ decrementl(rax);
+    }
+    __ bind(done);
   } else {
-    __ movl(rax, 1);
-    __ jccb(Assembler::parity, done);
-    __ jccb(Assembler::above, done);
-    __ movl(rax, 0);
-    __ jccb(Assembler::equal, done);
-    __ decrementl(rax);
-  }
-  __ bind(done);
+#ifdef _LP64
+    ShouldNotReachHere();
 #else
-  if (is_float) {
-    __ fld_s(at_rsp());
-  } else {
-    __ fld_d(at_rsp());
-    __ pop(rdx);
+    if (is_float) {
+      __ fld_s(at_rsp());
+    } else {
+      __ fld_d(at_rsp());
+      __ pop(rdx);
+    }
+    __ pop(rcx);
+    __ fcmp2int(rax, unordered_result < 0);
+#endif // _LP64
   }
-  __ pop(rcx);
-  __ fcmp2int(rax, unordered_result < 0);
-#endif
 }
 
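The value float_cmp() leaves in rax follows the fcmpl/fcmpg (and dcmpl/dcmpg) convention; a compact sketch of that convention (illustrative, the real code is the SSE/FPU sequence above):

    #include <cmath>

    // value1 is the operand popped from the expression stack, value2 the one
    // left in xmm0/ST0. Ordered comparisons give -1/0/+1; a NaN in either
    // operand gives unordered_result (-1 for fcmpl/dcmpl, +1 for fcmpg/dcmpg).
    static int float_cmp_sketch(float value1, float value2, int unordered_result) {
      if (std::isnan(value1) || std::isnan(value2)) return unordered_result;
      if (value1 < value2) return -1;
      if (value1 > value2) return  1;
      return 0;
    }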
 void TemplateTable::branch(bool is_jsr, bool is_wide) {
@@ -2014,6 +2113,7 @@
     __ pop(rcx);
     __ pop(rdx);
     __ movptr(rax, Address(rcx, Method::method_counters_offset()));
+    __ testptr(rax, rax);
     __ jcc(Assembler::zero, dispatch);
     __ bind(has_counters);
 
@@ -2747,8 +2847,7 @@
   __ jcc(Assembler::notEqual, notFloat);
   // ftos
 
-  LP64_ONLY(__ movflt(xmm0, field));
-  NOT_LP64(__ fld_s(field));
+  __ load_float(field);
   __ push(ftos);
   // Rewrite bytecode to be faster
   if (!is_static && rc == may_rewrite) {
@@ -2762,8 +2861,7 @@
   __ jcc(Assembler::notEqual, notDouble);
 #endif
   // dtos
-  LP64_ONLY(__ movdbl(xmm0, field));
-  NOT_LP64(__ fld_d(field));
+  __ load_double(field);
   __ push(dtos);
   // Rewrite bytecode to be faster
   if (!is_static && rc == may_rewrite) {
@@ -3045,8 +3143,7 @@
   {
     __ pop(ftos);
     if (!is_static) pop_and_check_object(obj);
-    NOT_LP64( __ fstp_s(field);)
-    LP64_ONLY( __ movflt(field, xmm0);)
+    __ store_float(field);
     if (!is_static && rc == may_rewrite) {
       patch_bytecode(Bytecodes::_fast_fputfield, bc, rbx, true, byte_no);
     }
@@ -3063,8 +3160,7 @@
   {
     __ pop(dtos);
     if (!is_static) pop_and_check_object(obj);
-    NOT_LP64( __ fstp_d(field);)
-    LP64_ONLY( __ movdbl(field, xmm0);)
+    __ store_double(field);
     if (!is_static && rc == may_rewrite) {
       patch_bytecode(Bytecodes::_fast_dputfield, bc, rbx, true, byte_no);
     }
@@ -3122,8 +3218,8 @@
     case Bytecodes::_fast_sputfield: // fall through
     case Bytecodes::_fast_cputfield: // fall through
     case Bytecodes::_fast_iputfield: __ push_i(rax); break;
-    case Bytecodes::_fast_dputfield: __ push_d(); break;
-    case Bytecodes::_fast_fputfield: __ push_f(); break;
+    case Bytecodes::_fast_dputfield: __ push(dtos); break;
+    case Bytecodes::_fast_fputfield: __ push(ftos); break;
     case Bytecodes::_fast_lputfield: __ push_l(rax); break;
 
     default:
@@ -3146,8 +3242,8 @@
     case Bytecodes::_fast_sputfield: // fall through
     case Bytecodes::_fast_cputfield: // fall through
     case Bytecodes::_fast_iputfield: __ pop_i(rax); break;
-    case Bytecodes::_fast_dputfield: __ pop_d(); break;
-    case Bytecodes::_fast_fputfield: __ pop_f(); break;
+    case Bytecodes::_fast_dputfield: __ pop(dtos); break;
+    case Bytecodes::_fast_fputfield: __ pop(ftos); break;
     case Bytecodes::_fast_lputfield: __ pop_l(rax); break;
     }
     __ bind(L2);
@@ -3211,12 +3307,10 @@
     __ movw(field, rax);
     break;
   case Bytecodes::_fast_fputfield:
-    NOT_LP64( __ fstp_s(field); )
-    LP64_ONLY( __ movflt(field, xmm0);)
+    __ store_float(field);
     break;
   case Bytecodes::_fast_dputfield:
-    NOT_LP64( __ fstp_d(field); )
-    LP64_ONLY( __ movdbl(field, xmm0);)
+    __ store_double(field);
     break;
   default:
     ShouldNotReachHere();
@@ -3301,12 +3395,10 @@
     __ load_unsigned_short(rax, field);
     break;
   case Bytecodes::_fast_fgetfield:
-    LP64_ONLY(__ movflt(xmm0, field));
-    NOT_LP64(__ fld_s(field));
+    __ load_float(field);
     break;
   case Bytecodes::_fast_dgetfield:
-    LP64_ONLY(__ movdbl(xmm0, field));
-    NOT_LP64(__ fld_d(field));
+    __ load_double(field);
     break;
   default:
     ShouldNotReachHere();
@@ -3346,8 +3438,7 @@
     __ verify_oop(rax);
     break;
   case ftos:
-    LP64_ONLY(__ movflt(xmm0, field));
-    NOT_LP64(__ fld_s(field));
+    __ load_float(field);
     break;
   default:
     ShouldNotReachHere();
--- a/src/os/aix/vm/perfMemory_aix.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/os/aix/vm/perfMemory_aix.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2012, 2013 SAP AG. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -454,13 +454,27 @@
     *saved_cwd_fd = result;
   }
 
-  // Set the current directory to dirname by using the fd of the directory.
+  // Set the current directory to dirname using the fd of the directory and
+  // handle errors; otherwise shared memory files would be created in the cwd.
   result = fchdir(fd);
-
-  return dirp;
+  if (result == OS_ERR) {
+    if (PrintMiscellaneous && Verbose) {
+      warning("could not change to directory %s", dirname);
+    }
+    if (*saved_cwd_fd != -1) {
+      ::close(*saved_cwd_fd);
+      *saved_cwd_fd = -1;
+    }
+    // Close the directory.
+    os::closedir(dirp);
+    return NULL;
+  } else {
+    return dirp;
+  }
 }
 
 // Close the directory and restore the current working directory.
+//
 static void close_directory_secure_cwd(DIR* dirp, int saved_cwd_fd) {
 
   int result;
--- a/src/os/bsd/vm/perfMemory_bsd.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/os/bsd/vm/perfMemory_bsd.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -375,10 +375,23 @@
     *saved_cwd_fd = result;
   }
 
-  // Set the current directory to dirname by using the fd of the directory.
+  // Set the current directory to dirname using the fd of the directory and
+  // handle errors; otherwise shared memory files would be created in the cwd.
   result = fchdir(fd);
-
-  return dirp;
+  if (result == OS_ERR) {
+    if (PrintMiscellaneous && Verbose) {
+      warning("could not change to directory %s", dirname);
+    }
+    if (*saved_cwd_fd != -1) {
+      ::close(*saved_cwd_fd);
+      *saved_cwd_fd = -1;
+    }
+    // Close the directory.
+    os::closedir(dirp);
+    return NULL;
+  } else {
+    return dirp;
+  }
 }
 
 // Close the directory and restore the current working directory.
--- a/src/os/linux/vm/os_linux.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/os/linux/vm/os_linux.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -5785,9 +5785,11 @@
         status = pthread_mutex_unlock(_mutex);
         assert(status == 0, "invariant");
       } else {
+        // must capture correct index before unlocking
+        int index = _cur_index;
         status = pthread_mutex_unlock(_mutex);
         assert(status == 0, "invariant");
-        status = pthread_cond_signal(&_cond[_cur_index]);
+        status = pthread_cond_signal(&_cond[index]);
         assert(status == 0, "invariant");
       }
     } else {
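The one-line os_linux.cpp change above closes a window between unlock and signal; a minimal standalone sketch of the corrected pattern (illustrative names, not the HotSpot Parker code):

    #include <pthread.h>

    static pthread_mutex_t _mutex   = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  _cond[2] = { PTHREAD_COND_INITIALIZER,
                                        PTHREAD_COND_INITIALIZER };
    static int _cur_index = 0;   // which condvar the parked thread waits on

    void unpark_sketch() {
      pthread_mutex_lock(&_mutex);
      int index = _cur_index;              // capture the index while still locked
      pthread_mutex_unlock(&_mutex);
      pthread_cond_signal(&_cond[index]);  // _cur_index may change, index cannot
    }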
--- a/src/os/linux/vm/perfMemory_linux.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/os/linux/vm/perfMemory_linux.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -374,10 +374,23 @@
     *saved_cwd_fd = result;
   }
 
-  // Set the current directory to dirname by using the fd of the directory.
+  // Set the current directory to dirname using the fd of the directory and
+  // handle errors; otherwise shared memory files would be created in the cwd.
   result = fchdir(fd);
-
-  return dirp;
+  if (result == OS_ERR) {
+    if (PrintMiscellaneous && Verbose) {
+      warning("could not change to directory %s", dirname);
+    }
+    if (*saved_cwd_fd != -1) {
+      ::close(*saved_cwd_fd);
+      *saved_cwd_fd = -1;
+    }
+    // Close the directory.
+    os::closedir(dirp);
+    return NULL;
+  } else {
+    return dirp;
+  }
 }
 
 // Close the directory and restore the current working directory.
--- a/src/os/solaris/vm/perfMemory_solaris.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/os/solaris/vm/perfMemory_solaris.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -377,10 +377,23 @@
     *saved_cwd_fd = result;
   }
 
-  // Set the current directory to dirname by using the fd of the directory.
+  // Set the current directory to dirname using the fd of the directory and
+  // handle errors; otherwise shared memory files would be created in the cwd.
   result = fchdir(fd);
-
-  return dirp;
+  if (result == OS_ERR) {
+    if (PrintMiscellaneous && Verbose) {
+      warning("could not change to directory %s", dirname);
+    }
+    if (*saved_cwd_fd != -1) {
+      ::close(*saved_cwd_fd);
+      *saved_cwd_fd = -1;
+    }
+    // Close the directory.
+    os::closedir(dirp);
+    return NULL;
+  } else {
+    return dirp;
+  }
 }
 
 // Close the directory and restore the current working directory.
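The four perfMemory hunks above apply the same fix; a minimal POSIX sketch of the pattern (not the HotSpot code, helper name is illustrative):

    #include <dirent.h>
    #include <fcntl.h>
    #include <unistd.h>

    // Remember the caller's cwd, fchdir() into the target directory, and undo
    // everything if fchdir() fails so no shared memory files end up in the old cwd.
    static DIR* open_directory_secure_cwd_sketch(const char* dirname, int* saved_cwd_fd) {
      DIR* dirp = opendir(dirname);
      if (dirp == NULL) return NULL;

      *saved_cwd_fd = open(".", O_RDONLY);   // so the cwd can be restored later
      if (fchdir(dirfd(dirp)) == -1) {       // the newly handled error path
        if (*saved_cwd_fd != -1) { close(*saved_cwd_fd); *saved_cwd_fd = -1; }
        closedir(dirp);
        return NULL;
      }
      return dirp;
    }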
--- a/src/share/vm/classfile/systemDictionary.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/classfile/systemDictionary.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -2680,187 +2680,3 @@
 #endif // INCLUDE_TRACE
 }
 
-#ifndef PRODUCT
-
-// statistics code
-class ClassStatistics: AllStatic {
- private:
-  static int nclasses;        // number of classes
-  static int nmethods;        // number of methods
-  static int nmethoddata;     // number of methodData
-  static int class_size;      // size of class objects in words
-  static int method_size;     // size of method objects in words
-  static int debug_size;      // size of debug info in methods
-  static int methoddata_size; // size of methodData objects in words
-
-  static void do_class(Klass* k) {
-    nclasses++;
-    class_size += k->size();
-    if (k->oop_is_instance()) {
-      InstanceKlass* ik = (InstanceKlass*)k;
-      class_size += ik->methods()->size();
-      class_size += ik->constants()->size();
-      class_size += ik->local_interfaces()->size();
-      class_size += ik->transitive_interfaces()->size();
-      // We do not have to count implementors, since we only store one!
-      // SSS: How should these be accounted now that they have moved?
-      // class_size += ik->fields()->length();
-    }
-  }
-
-  static void do_method(Method* m) {
-    nmethods++;
-    method_size += m->size();
-    // class loader uses same objArray for empty vectors, so don't count these
-    if (m->has_stackmap_table()) {
-      method_size += m->stackmap_data()->size();
-    }
-
-    MethodData* mdo = m->method_data();
-    if (mdo != NULL) {
-      nmethoddata++;
-      methoddata_size += mdo->size();
-    }
-  }
-
- public:
-  static void print() {
-    SystemDictionary::classes_do(do_class);
-    SystemDictionary::methods_do(do_method);
-    tty->print_cr("Class statistics:");
-    tty->print_cr("%d classes (%d bytes)", nclasses, class_size * oopSize);
-    tty->print_cr("%d methods (%d bytes = %d base + %d debug info)", nmethods,
-                  (method_size + debug_size) * oopSize, method_size * oopSize, debug_size * oopSize);
-    tty->print_cr("%d methoddata (%d bytes)", nmethoddata, methoddata_size * oopSize);
-  }
-};
-
-
-int ClassStatistics::nclasses        = 0;
-int ClassStatistics::nmethods        = 0;
-int ClassStatistics::nmethoddata     = 0;
-int ClassStatistics::class_size      = 0;
-int ClassStatistics::method_size     = 0;
-int ClassStatistics::debug_size      = 0;
-int ClassStatistics::methoddata_size = 0;
-
-void SystemDictionary::print_class_statistics() {
-  ResourceMark rm;
-  ClassStatistics::print();
-}
-
-
-class MethodStatistics: AllStatic {
- public:
-  enum {
-    max_parameter_size = 10
-  };
- private:
-
-  static int _number_of_methods;
-  static int _number_of_final_methods;
-  static int _number_of_static_methods;
-  static int _number_of_native_methods;
-  static int _number_of_synchronized_methods;
-  static int _number_of_profiled_methods;
-  static int _number_of_bytecodes;
-  static int _parameter_size_profile[max_parameter_size];
-  static int _bytecodes_profile[Bytecodes::number_of_java_codes];
-
-  static void initialize() {
-    _number_of_methods        = 0;
-    _number_of_final_methods  = 0;
-    _number_of_static_methods = 0;
-    _number_of_native_methods = 0;
-    _number_of_synchronized_methods = 0;
-    _number_of_profiled_methods = 0;
-    _number_of_bytecodes      = 0;
-    for (int i = 0; i < max_parameter_size             ; i++) _parameter_size_profile[i] = 0;
-    for (int j = 0; j < Bytecodes::number_of_java_codes; j++) _bytecodes_profile     [j] = 0;
-  };
-
-  static void do_method(Method* m) {
-    _number_of_methods++;
-    // collect flag info
-    if (m->is_final()       ) _number_of_final_methods++;
-    if (m->is_static()      ) _number_of_static_methods++;
-    if (m->is_native()      ) _number_of_native_methods++;
-    if (m->is_synchronized()) _number_of_synchronized_methods++;
-    if (m->method_data() != NULL) _number_of_profiled_methods++;
-    // collect parameter size info (add one for receiver, if any)
-    _parameter_size_profile[MIN2(m->size_of_parameters() + (m->is_static() ? 0 : 1), max_parameter_size - 1)]++;
-    // collect bytecodes info
-    {
-      Thread *thread = Thread::current();
-      HandleMark hm(thread);
-      BytecodeStream s(methodHandle(thread, m));
-      Bytecodes::Code c;
-      while ((c = s.next()) >= 0) {
-        _number_of_bytecodes++;
-        _bytecodes_profile[c]++;
-      }
-    }
-  }
-
- public:
-  static void print() {
-    initialize();
-    SystemDictionary::methods_do(do_method);
-    // generate output
-    tty->cr();
-    tty->print_cr("Method statistics (static):");
-    // flag distribution
-    tty->cr();
-    tty->print_cr("%6d final        methods  %6.1f%%", _number_of_final_methods       , _number_of_final_methods        * 100.0F / _number_of_methods);
-    tty->print_cr("%6d static       methods  %6.1f%%", _number_of_static_methods      , _number_of_static_methods       * 100.0F / _number_of_methods);
-    tty->print_cr("%6d native       methods  %6.1f%%", _number_of_native_methods      , _number_of_native_methods       * 100.0F / _number_of_methods);
-    tty->print_cr("%6d synchronized methods  %6.1f%%", _number_of_synchronized_methods, _number_of_synchronized_methods * 100.0F / _number_of_methods);
-    tty->print_cr("%6d profiled     methods  %6.1f%%", _number_of_profiled_methods, _number_of_profiled_methods * 100.0F / _number_of_methods);
-    // parameter size profile
-    tty->cr();
-    { int tot = 0;
-      int avg = 0;
-      for (int i = 0; i < max_parameter_size; i++) {
-        int n = _parameter_size_profile[i];
-        tot += n;
-        avg += n*i;
-        tty->print_cr("parameter size = %1d: %6d methods  %5.1f%%", i, n, n * 100.0F / _number_of_methods);
-      }
-      assert(tot == _number_of_methods, "should be the same");
-      tty->print_cr("                    %6d methods  100.0%%", _number_of_methods);
-      tty->print_cr("(average parameter size = %3.1f including receiver, if any)", (float)avg / _number_of_methods);
-    }
-    // bytecodes profile
-    tty->cr();
-    { int tot = 0;
-      for (int i = 0; i < Bytecodes::number_of_java_codes; i++) {
-        if (Bytecodes::is_defined(i)) {
-          Bytecodes::Code c = Bytecodes::cast(i);
-          int n = _bytecodes_profile[c];
-          tot += n;
-          tty->print_cr("%9d  %7.3f%%  %s", n, n * 100.0F / _number_of_bytecodes, Bytecodes::name(c));
-        }
-      }
-      assert(tot == _number_of_bytecodes, "should be the same");
-      tty->print_cr("%9d  100.000%%", _number_of_bytecodes);
-    }
-    tty->cr();
-  }
-};
-
-int MethodStatistics::_number_of_methods;
-int MethodStatistics::_number_of_final_methods;
-int MethodStatistics::_number_of_static_methods;
-int MethodStatistics::_number_of_native_methods;
-int MethodStatistics::_number_of_synchronized_methods;
-int MethodStatistics::_number_of_profiled_methods;
-int MethodStatistics::_number_of_bytecodes;
-int MethodStatistics::_parameter_size_profile[MethodStatistics::max_parameter_size];
-int MethodStatistics::_bytecodes_profile[Bytecodes::number_of_java_codes];
-
-
-void SystemDictionary::print_method_statistics() {
-  MethodStatistics::print();
-}
-
-#endif // PRODUCT
--- a/src/share/vm/classfile/systemDictionary.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/classfile/systemDictionary.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -366,8 +366,6 @@
   // Printing
   static void print(bool details = true);
   static void print_shared(bool details = true);
-  static void print_class_statistics()  PRODUCT_RETURN;
-  static void print_method_statistics() PRODUCT_RETURN;
 
   // Number of contained klasses
   // This is both fully loaded classes and classes in the process
--- a/src/share/vm/code/codeCache.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/code/codeCache.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -746,14 +746,17 @@
 void CodeCache::gc_epilogue() {
   assert_locked_or_safepoint(CodeCache_lock);
   NMethodIterator iter;
-  while(iter.next_alive()) {
+  while(iter.next()) {
     nmethod* nm = iter.method();
-    assert(!nm->is_unloaded(), "Tautology");
-    if (needs_cache_clean()) {
-      nm->cleanup_inline_caches();
+    if (!nm->is_zombie()) {
+      if (needs_cache_clean()) {
+        // Clean ICs of unloaded nmethods as well because they may reference other
+        // unloaded nmethods that may be flushed earlier in the sweeper cycle.
+        nm->cleanup_inline_caches();
+      }
+      DEBUG_ONLY(nm->verify());
+      DEBUG_ONLY(nm->verify_oop_relocations());
     }
-    DEBUG_ONLY(nm->verify());
-    DEBUG_ONLY(nm->verify_oop_relocations());
   }
   set_needs_cache_clean(false);
   prune_scavenge_root_nmethods();
@@ -993,29 +996,6 @@
   return number_of_marked_CodeBlobs;
 }
 
-void CodeCache::make_marked_nmethods_zombies() {
-  assert(SafepointSynchronize::is_at_safepoint(), "must be at a safepoint");
-  NMethodIterator iter;
-  while(iter.next_alive()) {
-    nmethod* nm = iter.method();
-    if (nm->is_marked_for_deoptimization()) {
-
-      // If the nmethod has already been made non-entrant and it can be converted
-      // then zombie it now. Otherwise make it non-entrant and it will eventually
-      // be zombied when it is no longer seen on the stack. Note that the nmethod
-      // might be "entrant" and not on the stack and so could be zombied immediately
-      // but we can't tell because we don't track it on stack until it becomes
-      // non-entrant.
-
-      if (nm->is_not_entrant() && nm->can_not_entrant_be_converted()) {
-        nm->make_zombie();
-      } else {
-        nm->make_not_entrant();
-      }
-    }
-  }
-}
-
 void CodeCache::make_marked_nmethods_not_entrant() {
   assert_locked_or_safepoint(CodeCache_lock);
   NMethodIterator iter;
@@ -1072,7 +1052,7 @@
     // Deoptimize all activations depending on marked nmethods
     Deoptimization::deoptimize_dependents();
 
-    // Make the dependent methods not entrant (in VM_Deoptimize they are made zombies)
+    // Make the dependent methods not entrant
     make_marked_nmethods_not_entrant();
   }
 }
@@ -1102,7 +1082,7 @@
     // Deoptimize all activations depending on marked nmethods
     Deoptimization::deoptimize_dependents();
 
-    // Make the dependent methods not entrant (in VM_Deoptimize they are made zombies)
+    // Make the dependent methods not entrant
     make_marked_nmethods_not_entrant();
   }
 }
--- a/src/share/vm/code/codeCache.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/code/codeCache.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -225,7 +225,6 @@
  public:
   static void mark_all_nmethods_for_deoptimization();
   static int  mark_for_deoptimization(Method* dependee);
-  static void make_marked_nmethods_zombies();
   static void make_marked_nmethods_not_entrant();
 
   // Flushing and deoptimization
--- a/src/share/vm/code/compiledIC.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/code/compiledIC.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -343,8 +343,8 @@
     // Kill any leftover stub we might have too
     clear_ic_stub();
     if (is_optimized()) {
-    set_ic_destination(entry);
-  } else {
+      set_ic_destination(entry);
+    } else {
       set_ic_destination_and_value(entry, (void*)NULL);
     }
   } else {
--- a/src/share/vm/code/compiledIC.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/code/compiledIC.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -214,7 +214,7 @@
   //
   // They all takes a TRAP argument, since they can cause a GC if the inline-cache buffer is full.
   //
-  void set_to_clean();  // Can only be called during a safepoint operation
+  void set_to_clean();
   void set_to_monomorphic(CompiledICInfo& info);
   void clear_ic_stub();
 
--- a/src/share/vm/code/nmethod.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/code/nmethod.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -1021,7 +1021,6 @@
 
 
 void nmethod::cleanup_inline_caches() {
-
   assert_locked_or_safepoint(CompiledIC_lock);
 
   // If the method is not entrant or zombie then a JMP is plastered over the
@@ -1037,7 +1036,8 @@
     // In fact, why are we bothering to look at oops in a non-entrant method??
   }
 
-  // Find all calls in an nmethod, and clear the ones that points to zombie methods
+  // Find all calls in an nmethod and clear the ones that point to non-entrant,
+  // zombie and unloaded nmethods.
   ResourceMark rm;
   RelocIterator iter(this, low_boundary);
   while(iter.next()) {
@@ -1049,7 +1049,7 @@
         CodeBlob *cb = CodeCache::find_blob_unsafe(ic->ic_destination());
         if( cb != NULL && cb->is_nmethod() ) {
           nmethod* nm = (nmethod*)cb;
-          // Clean inline caches pointing to both zombie and not_entrant methods
+          // Clean inline caches pointing to zombie, non-entrant and unloaded methods
           if (!nm->is_in_use() || (nm->method()->code() != nm)) ic->set_to_clean();
         }
         break;
@@ -1059,7 +1059,7 @@
         CodeBlob *cb = CodeCache::find_blob_unsafe(csc->destination());
         if( cb != NULL && cb->is_nmethod() ) {
           nmethod* nm = (nmethod*)cb;
-          // Clean inline caches pointing to both zombie and not_entrant methods
+          // Clean inline caches pointing to zombie, non-entrant and unloaded methods
           if (!nm->is_in_use() || (nm->method()->code() != nm)) csc->set_to_clean();
         }
         break;
@@ -2529,7 +2529,7 @@
   // Hmm. OSR methods can be deopted but not marked as zombie or not_entrant
   // seems odd.
 
-  if( is_zombie() || is_not_entrant() )
+  if (is_zombie() || is_not_entrant() || is_unloaded())
     return;
 
   // Make sure all the entry points are correctly aligned for patching.
--- a/src/share/vm/compiler/compileBroker.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/compiler/compileBroker.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -1399,6 +1399,28 @@
   // do the compilation
   if (method->is_native()) {
     if (!PreferInterpreterNativeStubs || method->is_method_handle_intrinsic()) {
+      // The following native methods:
+      //
+      // java.lang.Float.intBitsToFloat
+      // java.lang.Float.floatToRawIntBits
+      // java.lang.Double.longBitsToDouble
+      // java.lang.Double.doubleToRawLongBits
+      //
+      // are called through the interpreter even if interpreter native stubs
+      // are not preferred (i.e., calling through adapter handlers is preferred).
+      // The reason is that on x86_32 signaling NaNs (sNaNs) are not preserved
+      // if the native-library versions of these methods are called. Because the
+      // interpreter and the C2-intrinsified versions preserve sNaNs, mixing in
+      // the native versions would result in inconsistent handling of sNaNs.
+      if ((UseSSE >= 1 &&
+          (method->intrinsic_id() == vmIntrinsics::_intBitsToFloat ||
+           method->intrinsic_id() == vmIntrinsics::_floatToRawIntBits)) ||
+          (UseSSE >= 2 &&
+           (method->intrinsic_id() == vmIntrinsics::_longBitsToDouble ||
+            method->intrinsic_id() == vmIntrinsics::_doubleToRawLongBits))) {
+        return NULL;
+      }
+
       // To properly handle the appendix argument for out-of-line calls we are using a small trampoline that
       // pops off the appendix argument and jumps to the target (see gen_special_dispatch in SharedRuntime).
       //
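The property the comment above protects is visible in the IEEE 754 single-precision encoding itself; a small sketch of the bit patterns (bit layout only, not HotSpot code):

    #include <cstdint>
    #include <cstdio>

    // An all-ones exponent with a non-zero mantissa is a NaN; bit 22 (the
    // quiet bit) distinguishes quiet from signalling NaNs. Passing an sNaN
    // through an x87 load/store quiets it, which is why the bit-preserving
    // intrinsics above must not be routed through the x86_32 FPU path.
    int main() {
      const uint32_t snan = 0x7F800001u;   // signalling NaN
      const uint32_t qnan = 0x7FC00000u;   // quiet NaN
      std::printf("quiet bit: sNaN=%u qNaN=%u\n",
                  (snan >> 22) & 1u, (qnan >> 22) & 1u);
      return 0;
    }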
--- a/src/share/vm/gc/cms/concurrentMarkSweepGeneration.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/cms/concurrentMarkSweepGeneration.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -620,7 +620,7 @@
   // Support for parallelizing survivor space rescan
   if ((CMSParallelRemarkEnabled && CMSParallelSurvivorRemarkEnabled) || CMSParallelInitialMarkEnabled) {
     const size_t max_plab_samples =
-      _young_gen->max_survivor_size() / (ThreadLocalAllocBuffer::min_size() * HeapWordSize);
+      _young_gen->max_survivor_size() / (PLAB::min_size() * HeapWordSize);
 
     _survivor_plab_array  = NEW_C_HEAP_ARRAY(ChunkArray, ParallelGCThreads, mtGC);
     _survivor_chunk_array = NEW_C_HEAP_ARRAY(HeapWord*, max_plab_samples, mtGC);
@@ -3005,7 +3005,7 @@
     COMPILER2_PRESENT(DerivedPointerTableDeactivate dpt_deact;)
     if (CMSParallelInitialMarkEnabled) {
       // The parallel version.
-      FlexibleWorkGang* workers = gch->workers();
+      WorkGang* workers = gch->workers();
       assert(workers != NULL, "Need parallel worker threads.");
       uint n_workers = workers->active_workers();
 
@@ -4488,7 +4488,7 @@
   // workers to be taken from the active workers in the work gang.
   CMSParRemarkTask(CMSCollector* collector,
                    CompactibleFreeListSpace* cms_space,
-                   uint n_workers, FlexibleWorkGang* workers,
+                   uint n_workers, WorkGang* workers,
                    OopTaskQueueSet* task_queues,
                    StrongRootsScope* strong_roots_scope):
     CMSParMarkTask("Rescan roots and grey objects in parallel",
@@ -5061,7 +5061,7 @@
 // Parallel version of remark
 void CMSCollector::do_remark_parallel() {
   GenCollectedHeap* gch = GenCollectedHeap::heap();
-  FlexibleWorkGang* workers = gch->workers();
+  WorkGang* workers = gch->workers();
   assert(workers != NULL, "Need parallel worker threads.");
   // Choose to use the number of GC workers most recently set
   // into "active_workers".
@@ -5236,6 +5236,16 @@
 ////////////////////////////////////////////////////////
 // Parallel Reference Processing Task Proxy Class
 ////////////////////////////////////////////////////////
+class AbstractGangTaskWOopQueues : public AbstractGangTask {
+  OopTaskQueueSet*       _queues;
+  ParallelTaskTerminator _terminator;
+ public:
+  AbstractGangTaskWOopQueues(const char* name, OopTaskQueueSet* queues, uint n_threads) :
+    AbstractGangTask(name), _queues(queues), _terminator(n_threads, _queues) {}
+  ParallelTaskTerminator* terminator() { return &_terminator; }
+  OopTaskQueueSet* queues() { return _queues; }
+};
+
 class CMSRefProcTaskProxy: public AbstractGangTaskWOopQueues {
   typedef AbstractRefProcTaskExecutor::ProcessTask ProcessTask;
   CMSCollector*          _collector;
@@ -5372,7 +5382,7 @@
 void CMSRefProcTaskExecutor::execute(ProcessTask& task)
 {
   GenCollectedHeap* gch = GenCollectedHeap::heap();
-  FlexibleWorkGang* workers = gch->workers();
+  WorkGang* workers = gch->workers();
   assert(workers != NULL, "Need parallel worker threads.");
   CMSRefProcTaskProxy rp_task(task, &_collector,
                               _collector.ref_processor()->span(),
@@ -5385,7 +5395,7 @@
 {
 
   GenCollectedHeap* gch = GenCollectedHeap::heap();
-  FlexibleWorkGang* workers = gch->workers();
+  WorkGang* workers = gch->workers();
   assert(workers != NULL, "Need parallel worker threads.");
   CMSRefEnqueueTaskProxy enq_task(task);
   workers->run_task(&enq_task);
@@ -5419,7 +5429,7 @@
       // balance_all_queues() and balance_queues()).
       GenCollectedHeap* gch = GenCollectedHeap::heap();
       uint active_workers = ParallelGCThreads;
-      FlexibleWorkGang* workers = gch->workers();
+      WorkGang* workers = gch->workers();
       if (workers != NULL) {
         active_workers = workers->active_workers();
         // The expectation is that active_workers will have already
--- a/src/share/vm/gc/cms/parNewGeneration.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/cms/parNewGeneration.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -25,7 +25,7 @@
 #include "precompiled.hpp"
 #include "gc/cms/compactibleFreeListSpace.hpp"
 #include "gc/cms/concurrentMarkSweepGeneration.hpp"
-#include "gc/cms/parNewGeneration.hpp"
+#include "gc/cms/parNewGeneration.inline.hpp"
 #include "gc/cms/parOopClosures.inline.hpp"
 #include "gc/serial/defNewGeneration.inline.hpp"
 #include "gc/shared/adaptiveSizePolicy.hpp"
@@ -248,8 +248,7 @@
         }
       }
       if (buf_space != NULL) {
-        plab->set_word_size(buf_size);
-        plab->set_buf(buf_space);
+        plab->set_buf(buf_space, buf_size);
         record_survivor_plab(buf_space, buf_size);
         obj = plab->allocate_aligned(word_sz, SurvivorAlignmentInBytes);
         // Note that we cannot compare buf_size < word_sz below
@@ -803,7 +802,7 @@
 void ParNewRefProcTaskExecutor::execute(ProcessTask& task)
 {
   GenCollectedHeap* gch = GenCollectedHeap::heap();
-  FlexibleWorkGang* workers = gch->workers();
+  WorkGang* workers = gch->workers();
   assert(workers != NULL, "Need parallel worker threads.");
   _state_set.reset(workers->active_workers(), _young_gen.promotion_failed());
   ParNewRefProcTaskProxy rp_task(task, _young_gen, _old_gen,
@@ -816,7 +815,7 @@
 void ParNewRefProcTaskExecutor::execute(EnqueueTask& task)
 {
   GenCollectedHeap* gch = GenCollectedHeap::heap();
-  FlexibleWorkGang* workers = gch->workers();
+  WorkGang* workers = gch->workers();
   assert(workers != NULL, "Need parallel worker threads.");
   ParNewRefEnqueueTaskProxy enq_task(task);
   workers->run_task(&enq_task);
@@ -890,7 +889,7 @@
   _gc_timer->register_gc_start();
 
   AdaptiveSizePolicy* size_policy = gch->gen_policy()->size_policy();
-  FlexibleWorkGang* workers = gch->workers();
+  WorkGang* workers = gch->workers();
   assert(workers != NULL, "Need workgang for parallel work");
   uint active_workers =
        AdaptiveSizePolicy::calc_active_workers(workers->total_workers(),
--- a/src/share/vm/gc/cms/parNewGeneration.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/cms/parNewGeneration.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -169,11 +169,7 @@
   // Allocate a to-space block of size "sz", or else return NULL.
   HeapWord* alloc_in_to_space_slow(size_t word_sz);
 
-  HeapWord* alloc_in_to_space(size_t word_sz) {
-    HeapWord* obj = to_space_alloc_buffer()->allocate_aligned(word_sz, SurvivorAlignmentInBytes);
-    if (obj != NULL) return obj;
-    else return alloc_in_to_space_slow(word_sz);
-  }
+  inline HeapWord* alloc_in_to_space(size_t word_sz);
 
   HeapWord* young_old_boundary() { return _young_old_boundary; }
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/share/vm/gc/cms/parNewGeneration.inline.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_VM_GC_CMS_PARNEWGENERATION_INLINE_HPP
+#define SHARE_VM_GC_CMS_PARNEWGENERATION_INLINE_HPP
+
+#include "gc/cms/parNewGeneration.hpp"
+#include "gc/shared/plab.inline.hpp"
+#include "utilities/globalDefinitions.hpp"
+
+inline HeapWord* ParScanThreadState::alloc_in_to_space(size_t word_sz) {
+  HeapWord* obj = to_space_alloc_buffer()->allocate_aligned(word_sz, SurvivorAlignmentInBytes);
+  if (obj != NULL) return obj;
+  else return alloc_in_to_space_slow(word_sz);
+}
+#endif // SHARE_VM_GC_CMS_PARNEWGENERATION_INLINE_HPP
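alloc_in_to_space() above is the usual fast-path/slow-path split for thread-local allocation buffers; a generic sketch of that shape (not HotSpot's PLAB, all names are illustrative):

    #include <cstddef>
    #include <cstdlib>

    struct LocalBuffer {
      char* top;   // next free byte in the thread-local buffer
      char* end;   // one past the last usable byte

      void* allocate(size_t bytes) {
        if (top != nullptr && (size_t)(end - top) >= bytes) {  // fast path: bump the pointer
          void* obj = top;
          top += bytes;
          return obj;
        }
        return nullptr;                                        // buffer exhausted
      }
    };

    static void* allocate_slow(size_t bytes) {                 // stand-in for the shared path
      return std::malloc(bytes);
    }

    inline void* allocate(LocalBuffer& buf, size_t bytes) {
      void* obj = buf.allocate(bytes);                         // try the inlined fast path first
      return obj != nullptr ? obj : allocate_slow(bytes);
    }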
--- a/src/share/vm/gc/cms/yieldingWorkgroup.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/cms/yieldingWorkgroup.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -26,20 +26,45 @@
 #include "gc/cms/yieldingWorkgroup.hpp"
 #include "utilities/macros.hpp"
 
-// Forward declaration of classes declared here.
-
-class GangWorker;
-class WorkData;
+YieldingFlexibleGangWorker::YieldingFlexibleGangWorker(YieldingFlexibleWorkGang* gang, int id)
+    : AbstractGangWorker(gang, id) {}
 
 YieldingFlexibleWorkGang::YieldingFlexibleWorkGang(
-  const char* name, uint workers, bool are_GC_task_threads) :
-  FlexibleWorkGang(name, workers, are_GC_task_threads, false),
-    _yielded_workers(0) {}
+    const char* name, uint workers, bool are_GC_task_threads) :
+         AbstractWorkGang(name, workers, are_GC_task_threads, false),
+         _yielded_workers(0),
+         _started_workers(0),
+         _finished_workers(0),
+         _sequence_number(0),
+         _task(NULL) {
+
+  // Other initialization.
+  _monitor = new Monitor(/* priority */       Mutex::leaf,
+                         /* name */           "WorkGroup monitor",
+                         /* allow_vm_block */ are_GC_task_threads,
+                                              Monitor::_safepoint_check_sometimes);
+
+  assert(monitor() != NULL, "Failed to allocate monitor");
+}
 
-GangWorker* YieldingFlexibleWorkGang::allocate_worker(uint which) {
-  YieldingFlexibleGangWorker* new_member =
-      new YieldingFlexibleGangWorker(this, which);
-  return (YieldingFlexibleGangWorker*) new_member;
+AbstractGangWorker* YieldingFlexibleWorkGang::allocate_worker(uint which) {
+  return new YieldingFlexibleGangWorker(this, which);
+}
+
+void YieldingFlexibleWorkGang::internal_worker_poll(YieldingWorkData* data) const {
+  assert(data != NULL, "worker data is null");
+  data->set_task(task());
+  data->set_sequence_number(sequence_number());
+}
+
+void YieldingFlexibleWorkGang::internal_note_start() {
+  assert(monitor()->owned_by_self(), "note_start is an internal method");
+  _started_workers += 1;
+}
+
+void YieldingFlexibleWorkGang::internal_note_finish() {
+  assert(monitor()->owned_by_self(), "note_finish is an internal method");
+  _finished_workers += 1;
 }
 
 // Run a task; returns when the task is done, or the workers yield,
@@ -292,37 +317,37 @@
 ///////////////////////////////
 void YieldingFlexibleGangWorker::loop() {
   int previous_sequence_number = 0;
-  Monitor* gang_monitor = gang()->monitor();
+  Monitor* gang_monitor = yf_gang()->monitor();
   MutexLockerEx ml(gang_monitor, Mutex::_no_safepoint_check_flag);
-  WorkData data;
+  YieldingWorkData data;
   int id;
   while (true) {
     // Check if there is work to do.
-    gang()->internal_worker_poll(&data);
+    yf_gang()->internal_worker_poll(&data);
     if (data.task() != NULL && data.sequence_number() != previous_sequence_number) {
       // There is work to be done.
       // First check if we need to become active or if there
       // are already the requisite number of workers
-      if (gang()->started_workers() == yf_gang()->active_workers()) {
+      if (yf_gang()->started_workers() == yf_gang()->active_workers()) {
         // There are already enough workers, we do not need to
         // to run; fall through and wait on monitor.
       } else {
         // We need to pitch in and do the work.
-        assert(gang()->started_workers() < yf_gang()->active_workers(),
+        assert(yf_gang()->started_workers() < yf_gang()->active_workers(),
                "Unexpected state");
-        id = gang()->started_workers();
-        gang()->internal_note_start();
+        id = yf_gang()->started_workers();
+        yf_gang()->internal_note_start();
         // Now, release the gang mutex and do the work.
         {
           MutexUnlockerEx mul(gang_monitor, Mutex::_no_safepoint_check_flag);
           data.task()->work(id);   // This might include yielding
         }
         // Reacquire monitor and note completion of this worker
-        gang()->internal_note_finish();
+        yf_gang()->internal_note_finish();
         // Update status of task based on whether all workers have
         // finished or some have yielded
-        assert(data.task() == gang()->task(), "Confused task binding");
-        if (gang()->finished_workers() == yf_gang()->active_workers()) {
+        assert(data.task() == yf_gang()->task(), "Confused task binding");
+        if (yf_gang()->finished_workers() == yf_gang()->active_workers()) {
           switch (data.yf_task()->status()) {
             case ABORTING: {
               data.yf_task()->set_status(ABORTED);
@@ -338,7 +363,7 @@
           }
           gang_monitor->notify_all();  // Notify overseer
         } else { // at least one worker is still working or yielded
-          assert(gang()->finished_workers() < yf_gang()->active_workers(),
+          assert(yf_gang()->finished_workers() < yf_gang()->active_workers(),
                  "Counts inconsistent");
           switch (data.yf_task()->status()) {
             case ACTIVE: {
@@ -347,7 +372,7 @@
               break;
             }
             case YIELDING: {
-              if (gang()->finished_workers() + yf_gang()->yielded_workers()
+              if (yf_gang()->finished_workers() + yf_gang()->yielded_workers()
                   == yf_gang()->active_workers()) {
                 data.yf_task()->set_status(YIELDED);
                 gang_monitor->notify_all();  // notify overseer
--- a/src/share/vm/gc/cms/yieldingWorkgroup.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/cms/yieldingWorkgroup.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -29,6 +29,7 @@
 #include "utilities/macros.hpp"
 
 // Forward declarations
+class YieldingFlexibleGangTask;
 class YieldingFlexibleWorkGang;
 
 // Status of tasks
@@ -43,13 +44,32 @@
     COMPLETED
 };
 
+class YieldingWorkData: public StackObj {
+  // This would be a struct, but I want accessor methods.
+private:
+  AbstractGangTask* _task;
+  int               _sequence_number;
+public:
+  // Constructor and destructor
+  YieldingWorkData() : _task(NULL), _sequence_number(0) {}
+  ~YieldingWorkData() {}
+
+  // Accessors and modifiers
+  AbstractGangTask* task()               const { return _task; }
+  void set_task(AbstractGangTask* value)       { _task = value; }
+  int sequence_number()                  const { return _sequence_number; }
+  void set_sequence_number(int value)          { _sequence_number = value; }
+
+  YieldingFlexibleGangTask* yf_task()    const {
+    return (YieldingFlexibleGangTask*)_task;
+  }
+};
+
 // Class YieldingFlexibleGangWorker:
 //   Several instances of this class run in parallel as workers for a gang.
-class YieldingFlexibleGangWorker: public GangWorker {
+class YieldingFlexibleGangWorker: public AbstractGangWorker {
 public:
-  // Ctor
-  YieldingFlexibleGangWorker(AbstractWorkGang* gang, int id) :
-    GangWorker(gang, id) { }
+  YieldingFlexibleGangWorker(YieldingFlexibleWorkGang* gang, int id);
 
 public:
   YieldingFlexibleWorkGang* yf_gang() const
@@ -108,9 +128,6 @@
 
   friend class YieldingFlexibleWorkGang;
   friend class YieldingFlexibleGangWorker;
-  NOT_PRODUCT(virtual bool is_YieldingFlexibleGang_task() const {
-    return true;
-  })
 
   void set_status(Status s) {
     _status = s;
@@ -160,7 +177,7 @@
 // YieldingGangWorkers, and provides infrastructure
 // supporting yielding to the "GangOverseer",
 // being the thread that orchestrates the WorkGang via run_task().
-class YieldingFlexibleWorkGang: public FlexibleWorkGang {
+class YieldingFlexibleWorkGang: public AbstractWorkGang {
   // Here's the public interface to this class.
 public:
   // Constructor and destructor.
@@ -168,12 +185,10 @@
                            bool are_GC_task_threads);
 
   YieldingFlexibleGangTask* yielding_task() const {
-    assert(task() == NULL || task()->is_YieldingFlexibleGang_task(),
-           "Incorrect cast");
-    return (YieldingFlexibleGangTask*)task();
+    return task();
   }
   // Allocate a worker and return a pointer to it.
-  GangWorker* allocate_worker(uint which);
+  AbstractGangWorker* allocate_worker(uint which);
 
   // Run a task; returns when the task is done, or the workers yield,
   // or the task is aborted.
@@ -216,6 +231,42 @@
 private:
   friend class YieldingFlexibleGangWorker;
   void reset(); // NYI
+
+
+  // The monitor which protects these data,
+  // and notifies of changes in it.
+  Monitor*   _monitor;
+  // Accessors for fields
+  Monitor* monitor() const {
+    return _monitor;
+  }
+
+  // The number of started workers.
+  uint _started_workers;
+  // The number of finished workers.
+  uint _finished_workers;
+
+  uint started_workers() const {
+    return _started_workers;
+  }
+  uint finished_workers() const {
+    return _finished_workers;
+  }
+
+  // A sequence number for the current task.
+  int _sequence_number;
+  int sequence_number() const {
+    return _sequence_number;
+  }
+
+  YieldingFlexibleGangTask* _task;
+  YieldingFlexibleGangTask* task() const {
+    return _task;
+  }
+
+  void internal_worker_poll(YieldingWorkData* data) const;
+  void internal_note_start();
+  void internal_note_finish();
 };
 
 #endif // SHARE_VM_GC_CMS_YIELDINGWORKGROUP_HPP
--- a/src/share/vm/gc/g1/concurrentMark.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/concurrentMark.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -629,7 +629,7 @@
   gclog_or_tty->print_cr("CL Sleep Factor          %1.4lf", cleanup_sleep_factor());
 #endif
 
-  _parallel_workers = new FlexibleWorkGang("G1 Marker",
+  _parallel_workers = new WorkGang("G1 Marker",
        _max_parallel_marking_threads, false, true);
   if (_parallel_workers == NULL) {
     vm_exit_during_initialization("Failed necessary allocation.");
@@ -3088,29 +3088,6 @@
 }
 #endif
 
-template<bool scan>
-inline void CMTask::process_grey_object(oop obj) {
-  assert(scan || obj->is_typeArray(), "Skipping scan of grey non-typeArray");
-  assert(_nextMarkBitMap->isMarked((HeapWord*) obj), "invariant");
-
-  if (_cm->verbose_high()) {
-    gclog_or_tty->print_cr("[%u] processing grey object " PTR_FORMAT,
-                           _worker_id, p2i((void*) obj));
-  }
-
-  size_t obj_size = obj->size();
-  _words_scanned += obj_size;
-
-  if (scan) {
-    obj->oop_iterate(_cm_oop_closure);
-  }
-  statsOnly( ++_objs_scanned );
-  check_limits();
-}
-
-template void CMTask::process_grey_object<true>(oop);
-template void CMTask::process_grey_object<false>(oop);
-
 // Closure for iteration over bitmaps
 class CMBitMapClosure : public BitMapClosure {
 private:
--- a/src/share/vm/gc/g1/concurrentMark.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/concurrentMark.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -451,7 +451,7 @@
 
   double*   _accum_task_vtime;   // Accumulated task vtime
 
-  FlexibleWorkGang* _parallel_workers;
+  WorkGang* _parallel_workers;
 
   ForceOverflowSettings _force_overflow_conc;
   ForceOverflowSettings _force_overflow_stw;
@@ -1126,7 +1126,7 @@
   inline void deal_with_reference(oop obj);
 
   // It scans an object and visits its children.
-  void scan_object(oop obj) { process_grey_object<true>(obj); }
+  inline void scan_object(oop obj);
 
   // It pushes an object on the local queue.
   inline void push(oop obj);
--- a/src/share/vm/gc/g1/concurrentMark.inline.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/concurrentMark.inline.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -232,6 +232,9 @@
   }
 }
 
+// It scans an object and visits its children.
+inline void CMTask::scan_object(oop obj) { process_grey_object<true>(obj); }
+
 inline void CMTask::push(oop obj) {
   HeapWord* objAddr = (HeapWord*) obj;
   assert(_g1h->is_in_g1_reserved(objAddr), "invariant");
@@ -299,6 +302,28 @@
   return objAddr < global_finger;
 }
 
+template<bool scan>
+inline void CMTask::process_grey_object(oop obj) {
+  assert(scan || obj->is_typeArray(), "Skipping scan of grey non-typeArray");
+  assert(_nextMarkBitMap->isMarked((HeapWord*) obj), "invariant");
+
+  if (_cm->verbose_high()) {
+    gclog_or_tty->print_cr("[%u] processing grey object " PTR_FORMAT,
+                           _worker_id, p2i((void*) obj));
+  }
+
+  size_t obj_size = obj->size();
+  _words_scanned += obj_size;
+
+  if (scan) {
+    obj->oop_iterate(_cm_oop_closure);
+  }
+  statsOnly( ++_objs_scanned );
+  check_limits();
+}
+
 inline void CMTask::make_reference_grey(oop obj, HeapRegion* hr) {
   if (_cm->par_mark_and_count(obj, hr, _marked_bytes_array, _card_bm)) {
 
--- a/src/share/vm/gc/g1/g1AllocRegion.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1AllocRegion.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -46,10 +46,11 @@
   _dummy_region = dummy_region;
 }
 
-void G1AllocRegion::fill_up_remaining_space(HeapRegion* alloc_region,
-                                            bool bot_updates) {
+size_t G1AllocRegion::fill_up_remaining_space(HeapRegion* alloc_region,
+                                              bool bot_updates) {
   assert(alloc_region != NULL && alloc_region != _dummy_region,
          "pre-condition");
+  size_t result = 0;
 
   // Other threads might still be trying to allocate using a CAS out
   // of the region we are trying to retire, as they can do so without
@@ -73,6 +74,7 @@
       // If the allocation was successful we should fill in the space.
       CollectedHeap::fill_with_object(dummy, free_word_size);
       alloc_region->set_pre_dummy_top(dummy);
+      result += free_word_size * HeapWordSize;
       break;
     }
 
@@ -81,13 +83,18 @@
     // allocation and they fill up the region. In that case, we can
     // just get out of the loop.
   }
+  result += alloc_region->free();
+
   assert(alloc_region->free() / HeapWordSize < min_word_size_to_fill,
          "post-condition");
+  return result;
 }
 
-void G1AllocRegion::retire(bool fill_up) {
+size_t G1AllocRegion::retire(bool fill_up) {
   assert(_alloc_region != NULL, ar_ext_msg(this, "not initialized properly"));
 
+  size_t result = 0;
+
   trace("retiring");
   HeapRegion* alloc_region = _alloc_region;
   if (alloc_region != _dummy_region) {
@@ -98,7 +105,7 @@
            ar_ext_msg(this, "the alloc region should never be empty"));
 
     if (fill_up) {
-      fill_up_remaining_space(alloc_region, _bot_updates);
+      result = fill_up_remaining_space(alloc_region, _bot_updates);
     }
 
     assert(alloc_region->used() >= _used_bytes_before,
@@ -109,6 +116,8 @@
     _alloc_region = _dummy_region;
   }
   trace("retired");
+
+  return result;
 }
 
 HeapWord* G1AllocRegion::new_alloc_region_and_allocate(size_t word_size,
@@ -196,11 +205,11 @@
 }
 
 #if G1_ALLOC_REGION_TRACING
-void G1AllocRegion::trace(const char* str, size_t word_size, HeapWord* result) {
+void G1AllocRegion::trace(const char* str, size_t min_word_size, size_t desired_word_size, size_t actual_word_size, HeapWord* result) {
   // All the calls to trace that set either just the size or the size
   // and the result are considered part of level 2 tracing and are
   // skipped during level 1 tracing.
-  if ((word_size == 0 && result == NULL) || (G1_ALLOC_REGION_TRACING > 1)) {
+  if ((actual_word_size == 0 && result == NULL) || (G1_ALLOC_REGION_TRACING > 1)) {
     const size_t buffer_length = 128;
     char hr_buffer[buffer_length];
     char rest_buffer[buffer_length];
@@ -217,10 +226,10 @@
 
     if (G1_ALLOC_REGION_TRACING > 1) {
       if (result != NULL) {
-        jio_snprintf(rest_buffer, buffer_length, SIZE_FORMAT " " PTR_FORMAT,
-                     word_size, result);
-      } else if (word_size != 0) {
-        jio_snprintf(rest_buffer, buffer_length, SIZE_FORMAT, word_size);
+        jio_snprintf(rest_buffer, buffer_length, "min " SIZE_FORMAT " desired " SIZE_FORMAT " actual " SIZE_FORMAT " " PTR_FORMAT,
+                     min_word_size, desired_word_size, actual_word_size, result);
+      } else if (min_word_size != 0) {
+        jio_snprintf(rest_buffer, buffer_length, "min " SIZE_FORMAT " desired " SIZE_FORMAT, min_word_size, desired_word_size);
       } else {
         jio_snprintf(rest_buffer, buffer_length, "");
       }
@@ -251,26 +260,25 @@
   _g1h->retire_mutator_alloc_region(alloc_region, allocated_bytes);
 }
 
-HeapRegion* SurvivorGCAllocRegion::allocate_new_region(size_t word_size,
-                                                       bool force) {
+HeapRegion* G1GCAllocRegion::allocate_new_region(size_t word_size,
+                                                 bool force) {
   assert(!force, "not supported for GC alloc regions");
-  return _g1h->new_gc_alloc_region(word_size, count(), InCSetState::Young);
+  return _g1h->new_gc_alloc_region(word_size, count(), _purpose);
 }
 
-void SurvivorGCAllocRegion::retire_region(HeapRegion* alloc_region,
-                                          size_t allocated_bytes) {
-  _g1h->retire_gc_alloc_region(alloc_region, allocated_bytes, InCSetState::Young);
+void G1GCAllocRegion::retire_region(HeapRegion* alloc_region,
+                                    size_t allocated_bytes) {
+  _g1h->retire_gc_alloc_region(alloc_region, allocated_bytes, _purpose);
 }
 
-HeapRegion* OldGCAllocRegion::allocate_new_region(size_t word_size,
-                                                  bool force) {
-  assert(!force, "not supported for GC alloc regions");
-  return _g1h->new_gc_alloc_region(word_size, count(), InCSetState::Old);
-}
-
-void OldGCAllocRegion::retire_region(HeapRegion* alloc_region,
-                                     size_t allocated_bytes) {
-  _g1h->retire_gc_alloc_region(alloc_region, allocated_bytes, InCSetState::Old);
+size_t G1GCAllocRegion::retire(bool fill_up) {
+  HeapRegion* retired = get();
+  size_t end_waste = G1AllocRegion::retire(fill_up);
+  // Do not count retirement of the dummy allocation region.
+  if (retired != NULL) {
+    _stats->add_region_end_waste(end_waste / HeapWordSize);
+  }
+  return end_waste;
 }
 
 HeapRegion* OldGCAllocRegion::release() {
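The change above makes retirement report how much space at the end of a region could not be handed out: fill_up_remaining_space() now returns the bytes it plugged with a dummy object (plus any residual tail), and G1GCAllocRegion::retire() converts that to words and feeds it into the per-purpose G1EvacStats. A simplified, single-threaded sketch of that accounting, with invented types and an assumed word size (the real code must also tolerate allocations racing with the retire):

#include <cstddef>
#include <iostream>

static const size_t HEAP_WORD_SIZE = 8;   // bytes per heap word (assumption)

struct EvacStats { size_t region_end_waste_words = 0; };

struct Region {
  size_t capacity_words;
  size_t used_words;
  size_t free_bytes() const { return (capacity_words - used_words) * HEAP_WORD_SIZE; }
};

// fill_up_remaining_space() analogue: plug the unusable tail with a dummy
// object and report the plugged bytes as waste.
size_t fill_up_remaining_space(Region& r) {
  size_t wasted = r.free_bytes();
  r.used_words = r.capacity_words;
  return wasted;
}

// G1GCAllocRegion::retire() analogue: record the end waste, in words.
size_t retire(Region& r, EvacStats& stats) {
  size_t end_waste = fill_up_remaining_space(r);
  stats.region_end_waste_words += end_waste / HEAP_WORD_SIZE;
  return end_waste;
}

int main() {
  EvacStats stats;
  Region r = {512, 475};                  // 37 words left when the region retires
  std::cout << retire(r, stats) << " bytes wasted, "
            << stats.region_end_waste_words << " words recorded\n";  // 296, 37
  return 0;
}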
--- a/src/share/vm/gc/g1/g1AllocRegion.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1AllocRegion.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -26,6 +26,8 @@
 #define SHARE_VM_GC_G1_G1ALLOCREGION_HPP
 
 #include "gc/g1/heapRegion.hpp"
+#include "gc/g1/g1EvacStats.hpp"
+#include "gc/g1/g1InCSetState.hpp"
 
 class G1CollectedHeap;
 
@@ -102,16 +104,22 @@
   static inline HeapWord* par_allocate(HeapRegion* alloc_region,
                                        size_t word_size,
                                        bool bot_updates);
+  // Perform an MT-safe allocation out of the given region, with the given

+  // minimum and desired size. Returns the actual size allocated (between
+  // minimum and desired size) in actual_word_size if the allocation has been
+  // successful.
+  static inline HeapWord* par_allocate(HeapRegion* alloc_region,
+                                       size_t min_word_size,
+                                       size_t desired_word_size,
+                                       size_t* actual_word_size,
+                                       bool bot_updates);
 
   // Ensure that the region passed as a parameter has been filled up
   // so that noone else can allocate out of it any more.
-  static void fill_up_remaining_space(HeapRegion* alloc_region,
-                                      bool bot_updates);
-
-  // Retire the active allocating region. If fill_up is true then make
-  // sure that the region is full before we retire it so that noone
-  // else can allocate out of it.
-  void retire(bool fill_up);
+  // Returns the number of bytes that have been wasted by filling up
+  // the space.
+  static size_t fill_up_remaining_space(HeapRegion* alloc_region,
+                                        bool bot_updates);
 
   // After a region is allocated by alloc_new_region, this
   // method is used to set it as the active alloc_region
@@ -126,6 +134,12 @@
   void fill_in_ext_msg(ar_ext_msg* msg, const char* message);
 
 protected:
+  // Retire the active allocating region. If fill_up is true then make
+  // sure that the region is full before we retire it so that no one
+  // else can allocate out of it.
+  // Returns the number of bytes wasted by filling up the space during retire.
+  virtual size_t retire(bool fill_up);
+
   // For convenience as subclasses use it.
   static G1CollectedHeap* _g1h;
 
@@ -154,7 +168,18 @@
   // First-level allocation: Should be called without holding a
   // lock. It will try to allocate lock-free out of the active region,
   // or return NULL if it was unable to.
-  inline HeapWord* attempt_allocation(size_t word_size, bool bot_updates);
+  inline HeapWord* attempt_allocation(size_t word_size,
+                                      bool bot_updates);
+  // Perform an allocation out of the current allocation region, with the given
+  // minimum and desired size. Returns the actual size allocated (between
+  // minimum and desired size) in actual_word_size if the allocation has been
+  // successful.
+  // Should be called without holding a lock. It will try to allocate lock-free
+  // out of the active region, or return NULL if it was unable to.
+  inline HeapWord* attempt_allocation(size_t min_word_size,
+                                      size_t desired_word_size,
+                                      size_t* actual_word_size,
+                                      bool bot_updates);
 
   // Second-level allocation: Should be called while holding a
   // lock. It will try to first allocate lock-free out of the active
@@ -164,6 +189,14 @@
   // it conform to its locking protocol.
   inline HeapWord* attempt_allocation_locked(size_t word_size,
                                              bool bot_updates);
+  // Same as attempt_allocation_locked(size_t, bool), but allowing specification
+  // of the minimum word size of the block in min_word_size, and the maximum word
+  // size of the allocation in desired_word_size. The actual size of the block is
+  // returned in actual_word_size.
+  inline HeapWord* attempt_allocation_locked(size_t min_word_size,
+                                             size_t desired_word_size,
+                                             size_t* actual_word_size,
+                                             bool bot_updates);
 
   // Should be called to allocate a new region even if the max of this
   // type of regions has been reached. Should only be called if other
@@ -186,9 +219,17 @@
   virtual HeapRegion* release();
 
 #if G1_ALLOC_REGION_TRACING
-  void trace(const char* str, size_t word_size = 0, HeapWord* result = NULL);
+  void trace(const char* str,
+             size_t min_word_size = 0,
+             size_t desired_word_size = 0,
+             size_t actual_word_size = 0,
+             HeapWord* result = NULL);
 #else // G1_ALLOC_REGION_TRACING
-  void trace(const char* str, size_t word_size = 0, HeapWord* result = NULL) { }
+  void trace(const char* str,
+             size_t min_word_size = 0,
+             size_t desired_word_size = 0,
+             size_t actual_word_size = 0,
+             HeapWord* result = NULL) { }
 #endif // G1_ALLOC_REGION_TRACING
 };
 
@@ -201,22 +242,33 @@
     : G1AllocRegion("Mutator Alloc Region", false /* bot_updates */) { }
 };
 
-class SurvivorGCAllocRegion : public G1AllocRegion {
+// Common base class for allocation regions used during GC.
+class G1GCAllocRegion : public G1AllocRegion {
 protected:
+  G1EvacStats* _stats;
+  InCSetState::in_cset_state_t _purpose;
+
   virtual HeapRegion* allocate_new_region(size_t word_size, bool force);
   virtual void retire_region(HeapRegion* alloc_region, size_t allocated_bytes);
+
+  virtual size_t retire(bool fill_up);
 public:
-  SurvivorGCAllocRegion()
-  : G1AllocRegion("Survivor GC Alloc Region", false /* bot_updates */) { }
+  G1GCAllocRegion(const char* name, bool bot_updates, G1EvacStats* stats, InCSetState::in_cset_state_t purpose)
+  : G1AllocRegion(name, bot_updates), _stats(stats), _purpose(purpose) {
+    assert(stats != NULL, "Must pass non-NULL PLAB statistics");
+  }
 };
 
-class OldGCAllocRegion : public G1AllocRegion {
-protected:
-  virtual HeapRegion* allocate_new_region(size_t word_size, bool force);
-  virtual void retire_region(HeapRegion* alloc_region, size_t allocated_bytes);
+class SurvivorGCAllocRegion : public G1GCAllocRegion {
 public:
-  OldGCAllocRegion()
-  : G1AllocRegion("Old GC Alloc Region", true /* bot_updates */) { }
+  SurvivorGCAllocRegion(G1EvacStats* stats)
+  : G1GCAllocRegion("Survivor GC Alloc Region", false /* bot_updates */, stats, InCSetState::Young) { }
+};
+
+class OldGCAllocRegion : public G1GCAllocRegion {
+public:
+  OldGCAllocRegion(G1EvacStats* stats)
+  : G1GCAllocRegion("Old GC Alloc Region", true /* bot_updates */, stats, InCSetState::Old) { }
 
   // This specialization of release() makes sure that the last card that has
   // been allocated into has been completely filled by a dummy object.  This
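SurvivorGCAllocRegion and OldGCAllocRegion previously duplicated their allocate_new_region()/retire_region() bodies apart from the InCSetState constant; the new G1GCAllocRegion base class takes the purpose and the statistics object as constructor arguments, so the shared retirement logic lives in one place. A sketch of that shape of refactoring, with invented names standing in for the HotSpot classes:

#include <cassert>
#include <cstddef>
#include <string>
#include <utility>

struct EvacStats { size_t region_end_waste_words = 0; };
enum class Purpose { Young, Old };

class GCAllocRegion {
  std::string _name;
  EvacStats*  _stats;
  Purpose     _purpose;
public:
  GCAllocRegion(std::string name, EvacStats* stats, Purpose purpose)
    : _name(std::move(name)), _stats(stats), _purpose(purpose) {
    assert(stats != nullptr && "must pass non-null statistics");
  }
  // Shared retirement bookkeeping; the purpose only selects which stats object
  // was wired in at construction time.
  size_t retire(size_t end_waste_words) {
    _stats->region_end_waste_words += end_waste_words;
    return end_waste_words;
  }
  const std::string& name() const { return _name; }
  Purpose purpose() const { return _purpose; }
};

class SurvivorGCAllocRegion : public GCAllocRegion {
public:
  explicit SurvivorGCAllocRegion(EvacStats* s)
    : GCAllocRegion("Survivor GC Alloc Region", s, Purpose::Young) {}
};

class OldGCAllocRegion : public GCAllocRegion {
public:
  explicit OldGCAllocRegion(EvacStats* s)
    : GCAllocRegion("Old GC Alloc Region", s, Purpose::Old) {}
};

int main() {
  EvacStats young_stats, old_stats;
  SurvivorGCAllocRegion survivor(&young_stats);
  OldGCAllocRegion old_gen(&old_stats);
  survivor.retire(12);
  old_gen.retire(40);
  return (young_stats.region_end_waste_words == 12 &&
          old_stats.region_end_waste_words == 40) ? 0 : 1;
}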
--- a/src/share/vm/gc/g1/g1AllocRegion.inline.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1AllocRegion.inline.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -40,52 +40,74 @@
   }
 }
 
+inline HeapWord* G1AllocRegion::par_allocate(HeapRegion* alloc_region, size_t word_size, bool bot_updates) {
+  size_t temp;
+  return par_allocate(alloc_region, word_size, word_size, &temp, bot_updates);
+}
+
 inline HeapWord* G1AllocRegion::par_allocate(HeapRegion* alloc_region,
-                                             size_t word_size,
+                                             size_t min_word_size,
+                                             size_t desired_word_size,
+                                             size_t* actual_word_size,
                                              bool bot_updates) {
   assert(alloc_region != NULL, err_msg("pre-condition"));
   assert(!alloc_region->is_empty(), err_msg("pre-condition"));
 
   if (!bot_updates) {
-    return alloc_region->par_allocate_no_bot_updates(word_size);
+    return alloc_region->par_allocate_no_bot_updates(min_word_size, desired_word_size, actual_word_size);
   } else {
-    return alloc_region->par_allocate(word_size);
+    return alloc_region->par_allocate(min_word_size, desired_word_size, actual_word_size);
   }
 }
 
-inline HeapWord* G1AllocRegion::attempt_allocation(size_t word_size,
+inline HeapWord* G1AllocRegion::attempt_allocation(size_t word_size, bool bot_updates) {
+  size_t temp;
+  return attempt_allocation(word_size, word_size, &temp, bot_updates);
+}
+
+inline HeapWord* G1AllocRegion::attempt_allocation(size_t min_word_size,
+                                                   size_t desired_word_size,
+                                                   size_t* actual_word_size,
                                                    bool bot_updates) {
   assert(bot_updates == _bot_updates, ar_ext_msg(this, "pre-condition"));
 
   HeapRegion* alloc_region = _alloc_region;
   assert(alloc_region != NULL, ar_ext_msg(this, "not initialized properly"));
 
-  HeapWord* result = par_allocate(alloc_region, word_size, bot_updates);
+  HeapWord* result = par_allocate(alloc_region, min_word_size, desired_word_size, actual_word_size, bot_updates);
   if (result != NULL) {
-    trace("alloc", word_size, result);
+    trace("alloc", min_word_size, desired_word_size, *actual_word_size, result);
     return result;
   }
-  trace("alloc failed", word_size);
+  trace("alloc failed", min_word_size, desired_word_size);
   return NULL;
 }
 
-inline HeapWord* G1AllocRegion::attempt_allocation_locked(size_t word_size,
+inline HeapWord* G1AllocRegion::attempt_allocation_locked(size_t word_size, bool bot_updates) {
+  size_t temp;
+  return attempt_allocation_locked(word_size, word_size, &temp, bot_updates);
+}
+
+inline HeapWord* G1AllocRegion::attempt_allocation_locked(size_t min_word_size,
+                                                          size_t desired_word_size,
+                                                          size_t* actual_word_size,
                                                           bool bot_updates) {
   // First we have to redo the allocation, assuming we're holding the
   // appropriate lock, in case another thread changed the region while
   // we were waiting to get the lock.
-  HeapWord* result = attempt_allocation(word_size, bot_updates);
+  HeapWord* result = attempt_allocation(min_word_size, desired_word_size, actual_word_size, bot_updates);
   if (result != NULL) {
     return result;
   }
 
   retire(true /* fill_up */);
-  result = new_alloc_region_and_allocate(word_size, false /* force */);
+  result = new_alloc_region_and_allocate(desired_word_size, false /* force */);
   if (result != NULL) {
-    trace("alloc locked (second attempt)", word_size, result);
+    *actual_word_size = desired_word_size;
+    trace("alloc locked (second attempt)", min_word_size, desired_word_size, *actual_word_size, result);
     return result;
   }
-  trace("alloc locked failed", word_size);
+  trace("alloc locked failed", min_word_size, desired_word_size);
   return NULL;
 }
 
@@ -94,13 +116,13 @@
   assert(bot_updates == _bot_updates, ar_ext_msg(this, "pre-condition"));
   assert(_alloc_region != NULL, ar_ext_msg(this, "not initialized properly"));
 
-  trace("forcing alloc");
+  trace("forcing alloc", word_size, word_size);
   HeapWord* result = new_alloc_region_and_allocate(word_size, true /* force */);
   if (result != NULL) {
-    trace("alloc forced", word_size, result);
+    trace("alloc forced", word_size, word_size, word_size, result);
     return result;
   }
-  trace("alloc forced failed", word_size);
+  trace("alloc forced failed", word_size, word_size);
   return NULL;
 }
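Throughout this file the old fixed-size entry points are kept as thin wrappers: they call the new range-based variant with min == desired == word_size and a throwaway out-parameter, so existing callers compile unchanged while PLAB refills can ask for "at least this much, up to that much". A small illustration of the wrapper pattern, using an invented Region type rather than the G1 classes:

#include <cstddef>
#include <iostream>

class Region {
  size_t _free_words;
public:
  explicit Region(size_t free_words) : _free_words(free_words) {}

  // New API: allocate between min and desired words, report the actual size.
  bool allocate(size_t min_words, size_t desired_words, size_t* actual_words) {
    size_t grant = _free_words >= desired_words ? desired_words
                 : _free_words >= min_words     ? _free_words
                                                : 0;
    if (grant == 0) {
      return false;
    }
    _free_words -= grant;
    *actual_words = grant;
    return true;
  }

  // Old API preserved as a thin wrapper, in the spirit of the overloads above.
  bool allocate(size_t word_size) {
    size_t temp;
    return allocate(word_size, word_size, &temp);
  }
};

int main() {
  Region r(100);
  size_t got = 0;
  r.allocate(64);               // fixed-size path used by existing callers
  r.allocate(16, 512, &got);    // flexible path: receives the remaining 36 words
  std::cout << "actual: " << got << "\n";
  return 0;
}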
 
--- a/src/share/vm/gc/g1/g1Allocator.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1Allocator.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -24,12 +24,20 @@
 
 #include "precompiled.hpp"
 #include "gc/g1/g1Allocator.inline.hpp"
+#include "gc/g1/g1AllocRegion.inline.hpp"
 #include "gc/g1/g1CollectedHeap.inline.hpp"
 #include "gc/g1/g1CollectorPolicy.hpp"
 #include "gc/g1/g1MarkSweep.hpp"
 #include "gc/g1/heapRegion.inline.hpp"
 #include "gc/g1/heapRegionSet.inline.hpp"
 
+G1DefaultAllocator::G1DefaultAllocator(G1CollectedHeap* heap) :
+  G1Allocator(heap),
+  _retained_old_gc_alloc_region(NULL),
+  _survivor_gc_alloc_region(heap->alloc_buffer_stats(InCSetState::Young)),
+  _old_gc_alloc_region(heap->alloc_buffer_stats(InCSetState::Old)) {
+}
+
 void G1DefaultAllocator::init_mutator_alloc_region() {
   assert(_mutator_alloc_region.get() == NULL, "pre-condition");
   _mutator_alloc_region.init();
@@ -79,6 +87,8 @@
 void G1DefaultAllocator::init_gc_alloc_regions(EvacuationInfo& evacuation_info) {
   assert_at_safepoint(true /* should_be_vm_thread */);
 
+  G1Allocator::init_gc_alloc_regions(evacuation_info);
+
   _survivor_gc_alloc_region.init();
   _old_gc_alloc_region.init();
   reuse_retained_old_region(evacuation_info,
@@ -101,10 +111,8 @@
     _retained_old_gc_alloc_region->record_retained_region();
   }
 
-  if (ResizePLAB) {
-    _g1h->alloc_buffer_stats(InCSetState::Young)->adjust_desired_plab_sz();
-    _g1h->alloc_buffer_stats(InCSetState::Old)->adjust_desired_plab_sz();
-  }
+  _g1h->alloc_buffer_stats(InCSetState::Young)->adjust_desired_plab_sz();
+  _g1h->alloc_buffer_stats(InCSetState::Old)->adjust_desired_plab_sz();
 }
 
 void G1DefaultAllocator::abandon_gc_alloc_regions() {
@@ -136,78 +144,159 @@
 HeapWord* G1Allocator::par_allocate_during_gc(InCSetState dest,
                                               size_t word_size,
                                               AllocationContext_t context) {
+  size_t temp = 0;
+  HeapWord* result = par_allocate_during_gc(dest, word_size, word_size, &temp, context);
+  assert(result == NULL || temp == word_size,
+         err_msg("Requested " SIZE_FORMAT " words, but got " SIZE_FORMAT " at " PTR_FORMAT,
+                 word_size, temp, p2i(result)));
+  return result;
+}
+
+HeapWord* G1Allocator::par_allocate_during_gc(InCSetState dest,
+                                              size_t min_word_size,
+                                              size_t desired_word_size,
+                                              size_t* actual_word_size,
+                                              AllocationContext_t context) {
   switch (dest.value()) {
     case InCSetState::Young:
-      return survivor_attempt_allocation(word_size, context);
+      return survivor_attempt_allocation(min_word_size, desired_word_size, actual_word_size, context);
     case InCSetState::Old:
-      return old_attempt_allocation(word_size, context);
+      return old_attempt_allocation(min_word_size, desired_word_size, actual_word_size, context);
     default:
       ShouldNotReachHere();
       return NULL; // Keep some compilers happy
   }
 }
 
-HeapWord* G1Allocator::survivor_attempt_allocation(size_t word_size,
+bool G1Allocator::survivor_is_full(AllocationContext_t context) const {
+  return _survivor_is_full;
+}
+
+bool G1Allocator::old_is_full(AllocationContext_t context) const {
+  return _old_is_full;
+}
+
+void G1Allocator::set_survivor_full(AllocationContext_t context) {
+  _survivor_is_full = true;
+}
+
+void G1Allocator::set_old_full(AllocationContext_t context) {
+  _old_is_full = true;
+}
+
+HeapWord* G1Allocator::survivor_attempt_allocation(size_t min_word_size,
+                                                   size_t desired_word_size,
+                                                   size_t* actual_word_size,
                                                    AllocationContext_t context) {
-  assert(!_g1h->is_humongous(word_size),
+  assert(!_g1h->is_humongous(desired_word_size),
          "we should not be seeing humongous-size allocations in this path");
 
-  HeapWord* result = survivor_gc_alloc_region(context)->attempt_allocation(word_size,
+  HeapWord* result = survivor_gc_alloc_region(context)->attempt_allocation(min_word_size,
+                                                                           desired_word_size,
+                                                                           actual_word_size,
                                                                            false /* bot_updates */);
-  if (result == NULL) {
+  if (result == NULL && !survivor_is_full(context)) {
     MutexLockerEx x(FreeList_lock, Mutex::_no_safepoint_check_flag);
-    result = survivor_gc_alloc_region(context)->attempt_allocation_locked(word_size,
+    result = survivor_gc_alloc_region(context)->attempt_allocation_locked(min_word_size,
+                                                                          desired_word_size,
+                                                                          actual_word_size,
                                                                           false /* bot_updates */);
+    if (result == NULL) {
+      set_survivor_full(context);
+    }
   }
   if (result != NULL) {
-    _g1h->dirty_young_block(result, word_size);
+    _g1h->dirty_young_block(result, *actual_word_size);
   }
   return result;
 }
 
-HeapWord* G1Allocator::old_attempt_allocation(size_t word_size,
+HeapWord* G1Allocator::old_attempt_allocation(size_t min_word_size,
+                                              size_t desired_word_size,
+                                              size_t* actual_word_size,
                                               AllocationContext_t context) {
-  assert(!_g1h->is_humongous(word_size),
+  assert(!_g1h->is_humongous(desired_word_size),
          "we should not be seeing humongous-size allocations in this path");
 
-  HeapWord* result = old_gc_alloc_region(context)->attempt_allocation(word_size,
+  HeapWord* result = old_gc_alloc_region(context)->attempt_allocation(min_word_size,
+                                                                      desired_word_size,
+                                                                      actual_word_size,
                                                                       true /* bot_updates */);
-  if (result == NULL) {
+  if (result == NULL && !old_is_full(context)) {
     MutexLockerEx x(FreeList_lock, Mutex::_no_safepoint_check_flag);
-    result = old_gc_alloc_region(context)->attempt_allocation_locked(word_size,
+    result = old_gc_alloc_region(context)->attempt_allocation_locked(min_word_size,
+                                                                     desired_word_size,
+                                                                     actual_word_size,
                                                                      true /* bot_updates */);
+    if (result == NULL) {
+      set_old_full(context);
+    }
   }
   return result;
 }
 
+void G1Allocator::init_gc_alloc_regions(EvacuationInfo& evacuation_info) {
+  _survivor_is_full = false;
+  _old_is_full = false;
+}
+
 G1PLABAllocator::G1PLABAllocator(G1Allocator* allocator) :
   _g1h(G1CollectedHeap::heap()),
   _allocator(allocator),
   _survivor_alignment_bytes(calc_survivor_alignment_bytes()) {
+  for (size_t i = 0; i < ARRAY_SIZE(_direct_allocated); i++) {
+    _direct_allocated[i] = 0;
+  }
+}
+
+bool G1PLABAllocator::may_throw_away_buffer(size_t const allocation_word_sz, size_t const buffer_size) const {
+  return (allocation_word_sz * 100 < buffer_size * ParallelGCBufferWastePct);
 }
 
 HeapWord* G1PLABAllocator::allocate_direct_or_new_plab(InCSetState dest,
                                                        size_t word_sz,
-                                                       AllocationContext_t context) {
-  size_t gclab_word_size = _g1h->desired_plab_sz(dest);
-  if (word_sz * 100 < gclab_word_size * ParallelGCBufferWastePct) {
+                                                       AllocationContext_t context,
+                                                       bool* plab_refill_failed) {
+  size_t plab_word_size = G1CollectedHeap::heap()->desired_plab_sz(dest);
+  size_t required_in_plab = PLAB::size_required_for_allocation(word_sz);
+
+  // Only get a new PLAB if the allocation fits and it would not waste more than
+  // ParallelGCBufferWastePct in the existing buffer.
+  if ((required_in_plab <= plab_word_size) &&
+    may_throw_away_buffer(required_in_plab, plab_word_size)) {
+
     G1PLAB* alloc_buf = alloc_buffer(dest, context);
     alloc_buf->retire();
 
-    HeapWord* buf = _allocator->par_allocate_during_gc(dest, gclab_word_size, context);
-    if (buf == NULL) {
-      return NULL; // Let caller handle allocation failure.
+    size_t actual_plab_size = 0;
+    HeapWord* buf = _allocator->par_allocate_during_gc(dest,
+                                                       required_in_plab,
+                                                       plab_word_size,
+                                                       &actual_plab_size,
+                                                       context);
+
+    assert(buf == NULL || ((actual_plab_size >= required_in_plab) && (actual_plab_size <= plab_word_size)),
+           err_msg("Requested at minimum " SIZE_FORMAT ", desired " SIZE_FORMAT " words, but got " SIZE_FORMAT " at " PTR_FORMAT,
+                   required_in_plab, plab_word_size, actual_plab_size, p2i(buf)));
+
+    if (buf != NULL) {
+      alloc_buf->set_buf(buf, actual_plab_size);
+
+      HeapWord* const obj = alloc_buf->allocate(word_sz);
+      assert(obj != NULL, err_msg("PLAB should have been big enough, tried to allocate "
+                                  SIZE_FORMAT " requiring " SIZE_FORMAT " PLAB size " SIZE_FORMAT,
+                                  word_sz, required_in_plab, plab_word_size));
+      return obj;
     }
     // Otherwise.
-    alloc_buf->set_word_size(gclab_word_size);
-    alloc_buf->set_buf(buf);
-
-    HeapWord* const obj = alloc_buf->allocate(word_sz);
-    assert(obj != NULL, "buffer was definitely big enough...");
-    return obj;
-  } else {
-    return _allocator->par_allocate_during_gc(dest, word_sz, context);
+    *plab_refill_failed = true;
   }
+  // Try direct allocation.
+  HeapWord* result = _allocator->par_allocate_during_gc(dest, word_sz, context);
+  if (result != NULL) {
+    _direct_allocated[dest.value()] += word_sz;
+  }
+  return result;
 }
 
 void G1PLABAllocator::undo_allocation(InCSetState dest, HeapWord* obj, size_t word_sz, AllocationContext_t context) {
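allocate_direct_or_new_plab() above only throws the current buffer away and refills when two conditions hold: the allocation fits a fresh PLAB at all, and it is small enough, per may_throw_away_buffer(), that discarding the nearly-full current buffer wastes at most ParallelGCBufferWastePct of a PLAB. Otherwise the object is allocated directly into the region and counted in _direct_allocated. A rough numeric illustration of that decision, with assumed constants rather than the real flag defaults:

#include <cstddef>
#include <iostream>

static const size_t PLAB_WORDS = 1024;  // desired PLAB size (assumption)
static const size_t WASTE_PCT  = 10;    // stand-in for ParallelGCBufferWastePct

// If the request did not fit the current PLAB, the space left in it is smaller
// than the request; refilling is acceptable only when that is a small waste.
bool may_throw_away_buffer(size_t required_words) {
  return required_words * 100 < PLAB_WORDS * WASTE_PCT;
}

bool refill_plab_for(size_t required_words) {
  return required_words <= PLAB_WORDS && may_throw_away_buffer(required_words);
}

int main() {
  std::cout << refill_plab_for(64)   << "\n";  // 1: small object, retire + refill
  std::cout << refill_plab_for(200)  << "\n";  // 0: too much waste, allocate directly
  std::cout << refill_plab_for(4096) << "\n";  // 0: does not fit a PLAB at all
  return 0;
}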
@@ -225,11 +314,14 @@
   _alloc_buffers[InCSetState::Old]  = &_tenured_alloc_buffer;
 }
 
-void G1DefaultPLABAllocator::retire_alloc_buffers() {
+void G1DefaultPLABAllocator::flush_and_retire_stats() {
   for (uint state = 0; state < InCSetState::Num; state++) {
     G1PLAB* const buf = _alloc_buffers[state];
     if (buf != NULL) {
-      buf->flush_and_retire_stats(_g1h->alloc_buffer_stats(state));
+      G1EvacStats* stats = _g1h->alloc_buffer_stats(state);
+      buf->flush_and_retire_stats(stats);
+      stats->add_direct_allocated(_direct_allocated[state]);
+      _direct_allocated[state] = 0;
     }
   }
 }
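The new _survivor_is_full/_old_is_full flags act as one-way latches: once an allocation under FreeList_lock fails for a destination, later requests for that destination skip the locked retry instead of queueing on the lock again during the same evacuation, and init_gc_alloc_regions() clears the flags for the next pause. A compact sketch of that latch using standard C++ primitives instead of the VM's mutex, and without the per-context indirection:

#include <atomic>
#include <cstddef>
#include <mutex>

class SpaceAllocator {
  std::mutex          _lock;
  std::atomic<bool>   _is_full{false};
  std::atomic<size_t> _free_words{256};

public:
  bool allocate(size_t words) {
    // The lock-free attempt against the current region still happens first.
    size_t cur = _free_words.load();
    while (cur >= words) {
      if (_free_words.compare_exchange_weak(cur, cur - words)) {
        return true;
      }
    }
    if (_is_full.load()) {
      return false;                       // latched: skip the locked retry
    }
    std::lock_guard<std::mutex> guard(_lock);
    // Under the lock the real code retires the region and tries to install a
    // new one; here we only model the failure path that sets the latch.
    _is_full.store(true);
    return false;
  }

  void reset_for_next_pause() { _is_full.store(false); }
};

int main() {
  SpaceAllocator space;
  bool a = space.allocate(200);   // lock-free success
  bool b = space.allocate(200);   // fails, takes the lock once, latches "full"
  bool c = space.allocate(200);   // fails fast without contending on the lock
  return (a && !b && !c) ? 0 : 1;
}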
--- a/src/share/vm/gc/g1/g1Allocator.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1Allocator.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -38,23 +38,36 @@
 // Also keeps track of retained regions across GCs.
 class G1Allocator : public CHeapObj<mtGC> {
   friend class VMStructs;
+private:
+  bool _survivor_is_full;
+  bool _old_is_full;
 protected:
   G1CollectedHeap* _g1h;
 
   virtual MutatorAllocRegion* mutator_alloc_region(AllocationContext_t context) = 0;
 
+  virtual bool survivor_is_full(AllocationContext_t context) const;
+  virtual bool old_is_full(AllocationContext_t context) const;
+
+  virtual void set_survivor_full(AllocationContext_t context);
+  virtual void set_old_full(AllocationContext_t context);
+
   // Accessors to the allocation regions.
   virtual SurvivorGCAllocRegion* survivor_gc_alloc_region(AllocationContext_t context) = 0;
   virtual OldGCAllocRegion* old_gc_alloc_region(AllocationContext_t context) = 0;
 
   // Allocation attempt during GC for a survivor object / PLAB.
-  inline HeapWord* survivor_attempt_allocation(size_t word_size,
+  inline HeapWord* survivor_attempt_allocation(size_t min_word_size,
+                                               size_t desired_word_size,
+                                               size_t* actual_word_size,
                                                AllocationContext_t context);
   // Allocation attempt during GC for an old object / PLAB.
-  inline HeapWord* old_attempt_allocation(size_t word_size,
+  inline HeapWord* old_attempt_allocation(size_t min_word_size,
+                                          size_t desired_word_size,
+                                          size_t* actual_word_size,
                                           AllocationContext_t context);
 public:
-  G1Allocator(G1CollectedHeap* heap) : _g1h(heap) { }
+  G1Allocator(G1CollectedHeap* heap) : _g1h(heap), _survivor_is_full(false), _old_is_full(false) { }
   virtual ~G1Allocator() { }
 
   static G1Allocator* create_allocator(G1CollectedHeap* g1h);
@@ -66,7 +79,7 @@
   virtual void init_mutator_alloc_region() = 0;
   virtual void release_mutator_alloc_region() = 0;
 
-  virtual void init_gc_alloc_regions(EvacuationInfo& evacuation_info) = 0;
+  virtual void init_gc_alloc_regions(EvacuationInfo& evacuation_info);
   virtual void release_gc_alloc_regions(EvacuationInfo& evacuation_info) = 0;
   virtual void abandon_gc_alloc_regions() = 0;
 
@@ -93,6 +106,12 @@
                                    size_t word_size,
                                    AllocationContext_t context);
 
+  HeapWord* par_allocate_during_gc(InCSetState dest,
+                                   size_t min_word_size,
+                                   size_t desired_word_size,
+                                   size_t* actual_word_size,
+                                   AllocationContext_t context);
+
   virtual size_t used_in_alloc_regions() = 0;
 };
 
@@ -114,7 +133,7 @@
 
   HeapRegion* _retained_old_gc_alloc_region;
 public:
-  G1DefaultAllocator(G1CollectedHeap* heap) : G1Allocator(heap), _retained_old_gc_alloc_region(NULL) { }
+  G1DefaultAllocator(G1CollectedHeap* heap);
 
   virtual void init_mutator_alloc_region();
   virtual void release_mutator_alloc_region();
@@ -163,8 +182,12 @@
     guarantee(_retired, "Allocation buffer has not been retired");
   }
 
-  virtual void set_buf(HeapWord* buf) {
-    PLAB::set_buf(buf);
+  // The amount of space in words wasted within the PLAB including
+  // waste due to refills and alignment.
+  size_t wasted() const { return _wasted; }
+
+  virtual void set_buf(HeapWord* buf, size_t word_size) {
+    PLAB::set_buf(buf, word_size);
     _retired = false;
   }
 
@@ -198,7 +221,10 @@
   // architectures have a special compare against zero instructions.
   const uint _survivor_alignment_bytes;
 
-  virtual void retire_alloc_buffers() = 0;
+  // Number of words allocated directly (not counting PLAB allocation).
+  size_t _direct_allocated[InCSetState::Num];
+
+  virtual void flush_and_retire_stats() = 0;
   virtual G1PLAB* alloc_buffer(InCSetState dest, AllocationContext_t context) = 0;
 
   // Calculate the survivor space object alignment in bytes. Returns that or 0 if
@@ -215,6 +241,11 @@
     }
   }
 
+  HeapWord* allocate_new_plab(InCSetState dest,
+                              size_t word_sz,
+                              AllocationContext_t context);
+
+  bool may_throw_away_buffer(size_t const allocation_word_sz, size_t const buffer_size) const;
 public:
   G1PLABAllocator(G1Allocator* allocator);
   virtual ~G1PLABAllocator() { }
@@ -225,31 +256,28 @@
 
   // Allocate word_sz words in dest, either directly into the regions or by
   // allocating a new PLAB. Returns the address of the allocated memory, NULL if
-  // not successful.
+  // not successful. Plab_refill_failed indicates whether an attempt to refill the

+  // PLAB failed or not.
   HeapWord* allocate_direct_or_new_plab(InCSetState dest,
                                         size_t word_sz,
-                                        AllocationContext_t context);
+                                        AllocationContext_t context,
+                                        bool* plab_refill_failed);
 
   // Allocate word_sz words in the PLAB of dest.  Returns the address of the
   // allocated memory, NULL if not successful.
-  HeapWord* plab_allocate(InCSetState dest,
-                          size_t word_sz,
-                          AllocationContext_t context) {
-    G1PLAB* buffer = alloc_buffer(dest, context);
-    if (_survivor_alignment_bytes == 0 || !dest.is_young()) {
-      return buffer->allocate(word_sz);
-    } else {
-      return buffer->allocate_aligned(word_sz, _survivor_alignment_bytes);
-    }
-  }
+  inline HeapWord* plab_allocate(InCSetState dest,
+                                 size_t word_sz,
+                                 AllocationContext_t context);
 
-  HeapWord* allocate(InCSetState dest, size_t word_sz,
-                     AllocationContext_t context) {
+  HeapWord* allocate(InCSetState dest,
+                     size_t word_sz,
+                     AllocationContext_t context,
+                     bool* refill_failed) {
     HeapWord* const obj = plab_allocate(dest, word_sz, context);
     if (obj != NULL) {
       return obj;
     }
-    return allocate_direct_or_new_plab(dest, word_sz, context);
+    return allocate_direct_or_new_plab(dest, word_sz, context, refill_failed);
   }
 
   void undo_allocation(InCSetState dest, HeapWord* obj, size_t word_sz, AllocationContext_t context);
@@ -273,7 +301,7 @@
     return _alloc_buffers[dest.value()];
   }
 
-  virtual void retire_alloc_buffers();
+  virtual void flush_and_retire_stats();
 
   virtual void waste(size_t& wasted, size_t& undo_wasted);
 };
--- a/src/share/vm/gc/g1/g1Allocator.inline.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1Allocator.inline.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -27,6 +27,7 @@
 
 #include "gc/g1/g1Allocator.hpp"
 #include "gc/g1/g1AllocRegion.inline.hpp"
+#include "gc/shared/plab.inline.hpp"
 
 HeapWord* G1Allocator::attempt_allocation(size_t word_size, AllocationContext_t context) {
   return mutator_alloc_region(context)->attempt_allocation(word_size, false /* bot_updates */);
@@ -43,4 +44,15 @@
   return mutator_alloc_region(context)->attempt_allocation_force(word_size, false /* bot_updates */);
 }
 
+inline HeapWord* G1PLABAllocator::plab_allocate(InCSetState dest,
+                                                size_t word_sz,
+                                                AllocationContext_t context) {
+  G1PLAB* buffer = alloc_buffer(dest, context);
+  if (_survivor_alignment_bytes == 0 || !dest.is_young()) {
+    return buffer->allocate(word_sz);
+  } else {
+    return buffer->allocate_aligned(word_sz, _survivor_alignment_bytes);
+  }
+}
+
 #endif // SHARE_VM_GC_G1_G1ALLOCATOR_HPP
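plab_allocate() is only moved here, but the aligned branch it keeps is worth spelling out: when survivor objects need a larger alignment, the bump-pointer allocation first rounds the current top up to that alignment, and the skipped gap remains in the buffer as wasted space. A stand-alone sketch of such an aligned bump allocation; the word size is an assumption and the real PLAB's waste bookkeeping is omitted:

#include <cstddef>
#include <cstdint>
#include <iostream>

// Bump-pointer allocate 'words' from a buffer, padding so the result starts
// at a multiple of 'alignment_bytes'. The pad before 'result' is waste.
char* allocate_aligned(char*& top, char* end, size_t words,
                       size_t alignment_bytes, size_t word_size = 8) {
  uintptr_t p   = reinterpret_cast<uintptr_t>(top);
  uintptr_t obj = (p + alignment_bytes - 1) & ~(uintptr_t)(alignment_bytes - 1);
  char* result  = reinterpret_cast<char*>(obj);
  char* new_top = result + words * word_size;
  if (new_top > end) {
    return nullptr;   // does not fit: caller refills or allocates directly
  }
  top = new_top;
  return result;
}

int main() {
  char buffer[256];
  char* top = buffer + 4;   // deliberately misaligned starting point
  char* r = allocate_aligned(top, buffer + sizeof(buffer), 3, 16);
  std::cout << (reinterpret_cast<uintptr_t>(r) % 16 == 0) << "\n";  // 1
  return 0;
}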
--- a/src/share/vm/gc/g1/g1Allocator_ext.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1Allocator_ext.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -23,7 +23,7 @@
  */
 
 #include "precompiled.hpp"
-#include "gc/g1/g1Allocator.hpp"
+#include "gc/g1/g1Allocator.inline.hpp"
 #include "gc/g1/g1CollectedHeap.hpp"
 
 G1Allocator* G1Allocator::create_allocator(G1CollectedHeap* g1h) {
--- a/src/share/vm/gc/g1/g1BlockOffsetTable.inline.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1BlockOffsetTable.inline.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -26,7 +26,7 @@
 #define SHARE_VM_GC_G1_G1BLOCKOFFSETTABLE_INLINE_HPP
 
 #include "gc/g1/g1BlockOffsetTable.hpp"
-#include "gc/g1/heapRegion.inline.hpp"
+#include "gc/g1/heapRegion.hpp"
 #include "gc/shared/space.hpp"
 
 inline HeapWord* G1BlockOffsetTable::block_start(const void* addr) {
--- a/src/share/vm/gc/g1/g1CollectedHeap.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1CollectedHeap.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -1944,8 +1944,8 @@
   _young_list(new YoungList(this)),
   _gc_time_stamp(0),
   _summary_bytes_used(0),
-  _survivor_plab_stats(YoungPLABSize, PLABWeight),
-  _old_plab_stats(OldPLABSize, PLABWeight),
+  _survivor_evac_stats(YoungPLABSize, PLABWeight),
+  _old_evac_stats(OldPLABSize, PLABWeight),
   _expand_heap_after_alloc_failure(true),
   _surviving_young_words(NULL),
   _old_marking_cycles_started(0),
@@ -1960,7 +1960,7 @@
   _gc_tracer_stw(new (ResourceObj::C_HEAP, mtGC) G1NewTracer()),
   _gc_tracer_cm(new (ResourceObj::C_HEAP, mtGC) G1OldTracer()) {
 
-  _workers = new FlexibleWorkGang("GC Thread", ParallelGCThreads,
+  _workers = new WorkGang("GC Thread", ParallelGCThreads,
                           /* are_GC_task_threads */true,
                           /* are_ConcurrentGC_threads */false);
   _workers->initialize_workers();
@@ -3504,6 +3504,13 @@
   return G1HeapSummary(heap_summary, used(), eden_used_bytes, eden_capacity_bytes, survivor_used_bytes);
 }
 
+G1EvacSummary G1CollectedHeap::create_g1_evac_summary(G1EvacStats* stats) {
+  return G1EvacSummary(stats->allocated(), stats->wasted(), stats->undo_wasted(),
+                       stats->unused(), stats->used(), stats->region_end_waste(),
+                       stats->regions_filled(), stats->direct_allocated(),
+                       stats->failure_used(), stats->failure_waste());
+}
+
 void G1CollectedHeap::trace_heap(GCWhen::Type when, const GCTracer* gc_tracer) {
   const G1HeapSummary& heap_summary = create_g1_heap_summary();
   gc_tracer->report_gc_heap_summary(when, heap_summary);
@@ -3753,8 +3760,7 @@
   cl.flush_rem_set_entries();
 }
 
-void
-G1CollectedHeap::setup_surviving_young_words() {
+void G1CollectedHeap::setup_surviving_young_words() {
   assert(_surviving_young_words == NULL, "pre-condition");
   uint array_length = g1_policy()->young_cset_region_length();
   _surviving_young_words = NEW_C_HEAP_ARRAY(size_t, (size_t) array_length, mtGC);
@@ -3770,17 +3776,15 @@
 #endif // !ASSERT
 }
 
-void
-G1CollectedHeap::update_surviving_young_words(size_t* surv_young_words) {
-  MutexLockerEx x(ParGCRareEvent_lock, Mutex::_no_safepoint_check_flag);
+void G1CollectedHeap::update_surviving_young_words(size_t* surv_young_words) {
+  assert_at_safepoint(true);
   uint array_length = g1_policy()->young_cset_region_length();
   for (uint i = 0; i < array_length; ++i) {
     _surviving_young_words[i] += surv_young_words[i];
   }
 }
 
-void
-G1CollectedHeap::cleanup_surviving_young_words() {
+void G1CollectedHeap::cleanup_surviving_young_words() {
   guarantee( _surviving_young_words != NULL, "pre-condition" );
   FREE_C_HEAP_ARRAY(size_t, _surviving_young_words);
   _surviving_young_words = NULL;
@@ -4375,6 +4379,13 @@
 }
 
 class G1ParEvacuateFollowersClosure : public VoidClosure {
+private:
+  double _start_term;
+  double _term_time;
+  size_t _term_attempts;
+
+  void start_term_time() { _term_attempts++; _start_term = os::elapsedTime(); }
+  void end_term_time() { _term_time += os::elapsedTime() - _start_term; }
 protected:
   G1CollectedHeap*              _g1h;
   G1ParScanThreadState*         _par_scan_state;
@@ -4391,19 +4402,23 @@
                                 RefToScanQueueSet* queues,
                                 ParallelTaskTerminator* terminator)
     : _g1h(g1h), _par_scan_state(par_scan_state),
-      _queues(queues), _terminator(terminator) {}
+      _queues(queues), _terminator(terminator),
+      _start_term(0.0), _term_time(0.0), _term_attempts(0) {}
 
   void do_void();
 
+  double term_time() const { return _term_time; }
+  size_t term_attempts() const { return _term_attempts; }
+
 private:
   inline bool offer_termination();
 };
 
 bool G1ParEvacuateFollowersClosure::offer_termination() {
   G1ParScanThreadState* const pss = par_scan_state();
-  pss->start_term_time();
+  start_term_time();
   const bool res = terminator()->offer_termination();
-  pss->end_term_time();
+  end_term_time();
   return res;
 }
 
@@ -4444,15 +4459,17 @@
 class G1ParTask : public AbstractGangTask {
 protected:
   G1CollectedHeap*       _g1h;
-  RefToScanQueueSet      *_queues;
+  G1ParScanThreadState** _pss;
+  RefToScanQueueSet*     _queues;
   G1RootProcessor*       _root_processor;
   ParallelTaskTerminator _terminator;
   uint _n_workers;
 
 public:
-  G1ParTask(G1CollectedHeap* g1h, RefToScanQueueSet *task_queues, G1RootProcessor* root_processor, uint n_workers)
+  G1ParTask(G1CollectedHeap* g1h, G1ParScanThreadState** per_thread_states, RefToScanQueueSet *task_queues, G1RootProcessor* root_processor, uint n_workers)
     : AbstractGangTask("G1 collection"),
       _g1h(g1h),
+      _pss(per_thread_states),
       _queues(task_queues),
       _root_processor(root_processor),
       _terminator(n_workers, _queues),
@@ -4499,7 +4516,8 @@
   void work(uint worker_id) {
     if (worker_id >= _n_workers) return;  // no work needed this round
 
-    _g1h->g1_policy()->phase_times()->record_time_secs(G1GCPhaseTimes::GCWorkerStart, worker_id, os::elapsedTime());
+    double start_sec = os::elapsedTime();
+    _g1h->g1_policy()->phase_times()->record_time_secs(G1GCPhaseTimes::GCWorkerStart, worker_id, start_sec);
 
     {
       ResourceMark rm;
@@ -4507,23 +4525,24 @@
 
       ReferenceProcessor*             rp = _g1h->ref_processor_stw();
 
-      G1ParScanThreadState            pss(_g1h, worker_id, rp);
+      G1ParScanThreadState*           pss = _pss[worker_id];
+      pss->set_ref_processor(rp);
 
       bool only_young = _g1h->collector_state()->gcs_are_young();
 
       // Non-IM young GC.
-      G1ParCopyClosure<G1BarrierNone, G1MarkNone>             scan_only_root_cl(_g1h, &pss, rp);
+      G1ParCopyClosure<G1BarrierNone, G1MarkNone>             scan_only_root_cl(_g1h, pss, rp);
       G1CLDClosure<G1MarkNone>                                scan_only_cld_cl(&scan_only_root_cl,
                                                                                only_young, // Only process dirty klasses.
                                                                                false);     // No need to claim CLDs.
       // IM young GC.
       //    Strong roots closures.
-      G1ParCopyClosure<G1BarrierNone, G1MarkFromRoot>         scan_mark_root_cl(_g1h, &pss, rp);
+      G1ParCopyClosure<G1BarrierNone, G1MarkFromRoot>         scan_mark_root_cl(_g1h, pss, rp);
       G1CLDClosure<G1MarkFromRoot>                            scan_mark_cld_cl(&scan_mark_root_cl,
                                                                                false, // Process all klasses.
                                                                                true); // Need to claim CLDs.
       //    Weak roots closures.
-      G1ParCopyClosure<G1BarrierNone, G1MarkPromotedFromRoot> scan_mark_weak_root_cl(_g1h, &pss, rp);
+      G1ParCopyClosure<G1BarrierNone, G1MarkPromotedFromRoot> scan_mark_weak_root_cl(_g1h, pss, rp);
       G1CLDClosure<G1MarkPromotedFromRoot>                    scan_mark_weak_cld_cl(&scan_mark_weak_root_cl,
                                                                                     false, // Process all klasses.
                                                                                     true); // Need to claim CLDs.
@@ -4554,8 +4573,7 @@
         weak_cld_cl    = &scan_only_cld_cl;
       }
 
-      pss.start_strong_roots();
-
+      double start_strong_roots_sec = os::elapsedTime();
       _root_processor->evacuate_roots(strong_root_cl,
                                       weak_root_cl,
                                       strong_cld_cl,
@@ -4563,32 +4581,45 @@
                                       trace_metadata,
                                       worker_id);
 
-      G1ParPushHeapRSClosure push_heap_rs_cl(_g1h, &pss);
+      G1ParPushHeapRSClosure push_heap_rs_cl(_g1h, pss);
       _root_processor->scan_remembered_sets(&push_heap_rs_cl,
                                             weak_root_cl,
                                             worker_id);
-      pss.end_strong_roots();
-
+      double strong_roots_sec = os::elapsedTime() - start_strong_roots_sec;
+
+      double term_sec = 0.0;
+      size_t evac_term_attempts = 0;
       {
         double start = os::elapsedTime();
-        G1ParEvacuateFollowersClosure evac(_g1h, &pss, _queues, &_terminator);
+        G1ParEvacuateFollowersClosure evac(_g1h, pss, _queues, &_terminator);
         evac.do_void();
+
+        evac_term_attempts = evac.term_attempts();
+        term_sec = evac.term_time();
         double elapsed_sec = os::elapsedTime() - start;
-        double term_sec = pss.term_time();
         _g1h->g1_policy()->phase_times()->add_time_secs(G1GCPhaseTimes::ObjCopy, worker_id, elapsed_sec - term_sec);
         _g1h->g1_policy()->phase_times()->record_time_secs(G1GCPhaseTimes::Termination, worker_id, term_sec);
-        _g1h->g1_policy()->phase_times()->record_thread_work_item(G1GCPhaseTimes::Termination, worker_id, pss.term_attempts());
+        _g1h->g1_policy()->phase_times()->record_thread_work_item(G1GCPhaseTimes::Termination, worker_id, evac_term_attempts);
       }
-      _g1h->g1_policy()->record_thread_age_table(pss.age_table());
-      _g1h->update_surviving_young_words(pss.surviving_young_words()+1);
+
+      assert(pss->queue_is_empty(), "should be empty");
 
       if (PrintTerminationStats) {
         MutexLockerEx x(ParGCRareEvent_lock, Mutex::_no_safepoint_check_flag);
-        pss.print_termination_stats();
+        size_t lab_waste;
+        size_t lab_undo_waste;
+        pss->waste(lab_waste, lab_undo_waste);
+        _g1h->print_termination_stats(gclog_or_tty,
+                                      worker_id,
+                                      (os::elapsedTime() - start_sec) * 1000.0,   /* elapsed time */
+                                      strong_roots_sec * 1000.0,                  /* strong roots time */
+                                      term_sec * 1000.0,                          /* evac term time */
+                                      evac_term_attempts,                         /* evac term attempts */
+                                      lab_waste,                                  /* alloc buffer waste */
+                                      lab_undo_waste                              /* undo waste */
+                                      );
       }
 
-      assert(pss.queue_is_empty(), "should be empty");
-
       // Close the inner scope so that the ResourceMark and HandleMark
       // destructors are executed here and are included as part of the
       // "GC Worker Time".
@@ -4597,6 +4628,31 @@
   }
 };
 
+void G1CollectedHeap::print_termination_stats_hdr(outputStream* const st) {
+  st->print_raw_cr("GC Termination Stats");
+  st->print_raw_cr("     elapsed  --strong roots-- -------termination------- ------waste (KiB)------");
+  st->print_raw_cr("thr     ms        ms      %        ms      %    attempts  total   alloc    undo");
+  st->print_raw_cr("--- --------- --------- ------ --------- ------ -------- ------- ------- -------");
+}
+
+void G1CollectedHeap::print_termination_stats(outputStream* const st,
+                                              uint worker_id,
+                                              double elapsed_ms,
+                                              double strong_roots_ms,
+                                              double term_ms,
+                                              size_t term_attempts,
+                                              size_t alloc_buffer_waste,
+                                              size_t undo_waste) const {
+  st->print_cr("%3u %9.2f %9.2f %6.2f "
+               "%9.2f %6.2f " SIZE_FORMAT_W(8) " "
+               SIZE_FORMAT_W(7) " " SIZE_FORMAT_W(7) " " SIZE_FORMAT_W(7),
+               worker_id, elapsed_ms, strong_roots_ms, strong_roots_ms * 100 / elapsed_ms,
+               term_ms, term_ms * 100 / elapsed_ms, term_attempts,
+               (alloc_buffer_waste + undo_waste) * HeapWordSize / K,
+               alloc_buffer_waste * HeapWordSize / K,
+               undo_waste * HeapWordSize / K);
+}
+
 class G1StringSymbolTableUnlinkTask : public AbstractGangTask {
 private:
   BoolObjectClosure* _is_alive;
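For reference, each row produced by print_termination_stats() is plain arithmetic on the values gathered in G1ParTask::work(): the two percentages are of the worker's elapsed time, and the three waste columns convert word counts to KiB. A worked example with made-up inputs that mirrors the format string above:

#include <cstddef>
#include <cstdio>

int main() {
  const unsigned worker_id       = 3;
  const double   elapsed_ms      = 120.0;
  const double   strong_roots_ms = 30.0;
  const double   term_ms         = 6.0;
  const size_t   term_attempts   = 4;
  const size_t   word_size       = 8;       // bytes per word (assumption)
  const size_t   alloc_waste_w   = 2048;    // words
  const size_t   undo_waste_w    = 256;     // words

  std::printf("%3u %9.2f %9.2f %6.2f %9.2f %6.2f %8zu %7zu %7zu %7zu\n",
              worker_id, elapsed_ms,
              strong_roots_ms, strong_roots_ms * 100 / elapsed_ms,   // 25.00%
              term_ms,         term_ms         * 100 / elapsed_ms,   //  5.00%
              term_attempts,
              (alloc_waste_w + undo_waste_w) * word_size / 1024,     // 18 KiB
              alloc_waste_w * word_size / 1024,                      // 16 KiB
              undo_waste_w  * word_size / 1024);                     //  2 KiB
  return 0;
}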
@@ -5125,17 +5181,20 @@
 
 class G1STWRefProcTaskExecutor: public AbstractRefProcTaskExecutor {
 private:
-  G1CollectedHeap*   _g1h;
-  RefToScanQueueSet* _queues;
-  FlexibleWorkGang*  _workers;
-  uint               _active_workers;
+  G1CollectedHeap*        _g1h;
+  G1ParScanThreadState**  _pss;
+  RefToScanQueueSet*      _queues;
+  WorkGang*               _workers;
+  uint                    _active_workers;
 
 public:
   G1STWRefProcTaskExecutor(G1CollectedHeap* g1h,
-                           FlexibleWorkGang* workers,
+                           G1ParScanThreadState** per_thread_states,
+                           WorkGang* workers,
                            RefToScanQueueSet *task_queues,
                            uint n_workers) :
     _g1h(g1h),
+    _pss(per_thread_states),
     _queues(task_queues),
     _workers(workers),
     _active_workers(n_workers)
@@ -5154,17 +5213,20 @@
   typedef AbstractRefProcTaskExecutor::ProcessTask ProcessTask;
   ProcessTask&     _proc_task;
   G1CollectedHeap* _g1h;
-  RefToScanQueueSet *_task_queues;
+  G1ParScanThreadState** _pss;
+  RefToScanQueueSet* _task_queues;
   ParallelTaskTerminator* _terminator;
 
 public:
   G1STWRefProcTaskProxy(ProcessTask& proc_task,
-                     G1CollectedHeap* g1h,
-                     RefToScanQueueSet *task_queues,
-                     ParallelTaskTerminator* terminator) :
+                        G1CollectedHeap* g1h,
+                        G1ParScanThreadState** per_thread_states,
+                        RefToScanQueueSet *task_queues,
+                        ParallelTaskTerminator* terminator) :
     AbstractGangTask("Process reference objects in parallel"),
     _proc_task(proc_task),
     _g1h(g1h),
+    _pss(per_thread_states),
     _task_queues(task_queues),
     _terminator(terminator)
   {}
@@ -5176,11 +5238,12 @@
 
     G1STWIsAliveClosure is_alive(_g1h);
 
-    G1ParScanThreadState            pss(_g1h, worker_id, NULL);
-
-    G1ParScanExtRootClosure        only_copy_non_heap_cl(_g1h, &pss, NULL);
-
-    G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, &pss, NULL);
+    G1ParScanThreadState*           pss = _pss[worker_id];
+    pss->set_ref_processor(NULL);
+
+    G1ParScanExtRootClosure        only_copy_non_heap_cl(_g1h, pss, NULL);
+
+    G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, pss, NULL);
 
     OopClosure*                    copy_non_heap_cl = &only_copy_non_heap_cl;
 
@@ -5190,10 +5253,10 @@
     }
 
     // Keep alive closure.
-    G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, &pss);
+    G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, pss);
 
     // Complete GC closure
-    G1ParEvacuateFollowersClosure drain_queue(_g1h, &pss, _task_queues, _terminator);
+    G1ParEvacuateFollowersClosure drain_queue(_g1h, pss, _task_queues, _terminator);
 
     // Call the reference processing task's work routine.
     _proc_task.work(worker_id, is_alive, keep_alive, drain_queue);
@@ -5212,7 +5275,7 @@
   assert(_workers != NULL, "Need parallel worker threads.");
 
   ParallelTaskTerminator terminator(_active_workers, _queues);
-  G1STWRefProcTaskProxy proc_task_proxy(proc_task, _g1h, _queues, &terminator);
+  G1STWRefProcTaskProxy proc_task_proxy(proc_task, _g1h, _pss, _queues, &terminator);
 
   _workers->run_task(&proc_task_proxy);
 }
@@ -5254,15 +5317,17 @@
 
 class G1ParPreserveCMReferentsTask: public AbstractGangTask {
 protected:
-  G1CollectedHeap* _g1h;
-  RefToScanQueueSet      *_queues;
+  G1CollectedHeap*       _g1h;
+  G1ParScanThreadState** _pss;
+  RefToScanQueueSet*     _queues;
   ParallelTaskTerminator _terminator;
   uint _n_workers;
 
 public:
-  G1ParPreserveCMReferentsTask(G1CollectedHeap* g1h, uint workers, RefToScanQueueSet *task_queues) :
+  G1ParPreserveCMReferentsTask(G1CollectedHeap* g1h, G1ParScanThreadState** per_thread_states, int workers, RefToScanQueueSet *task_queues) :
     AbstractGangTask("ParPreserveCMReferents"),
     _g1h(g1h),
+    _pss(per_thread_states),
     _queues(task_queues),
     _terminator(workers, _queues),
     _n_workers(workers)
@@ -5272,12 +5337,13 @@
     ResourceMark rm;
     HandleMark   hm;
 
-    G1ParScanThreadState            pss(_g1h, worker_id, NULL);
-    assert(pss.queue_is_empty(), "both queue and overflow should be empty");
-
-    G1ParScanExtRootClosure        only_copy_non_heap_cl(_g1h, &pss, NULL);
-
-    G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, &pss, NULL);
+    G1ParScanThreadState*          pss = _pss[worker_id];
+    pss->set_ref_processor(NULL);
+    assert(pss->queue_is_empty(), "both queue and overflow should be empty");
+
+    G1ParScanExtRootClosure        only_copy_non_heap_cl(_g1h, pss, NULL);
+
+    G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, pss, NULL);
 
     OopClosure*                    copy_non_heap_cl = &only_copy_non_heap_cl;
 
@@ -5291,7 +5357,7 @@
 
     // Copying keep alive closure. Applied to referent objects that need
     // to be copied.
-    G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, &pss);
+    G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, pss);
 
     ReferenceProcessor* rp = _g1h->ref_processor_cm();
 
@@ -5324,15 +5390,15 @@
     }
 
     // Drain the queue - which may cause stealing
-    G1ParEvacuateFollowersClosure drain_queue(_g1h, &pss, _queues, &_terminator);
+    G1ParEvacuateFollowersClosure drain_queue(_g1h, pss, _queues, &_terminator);
     drain_queue.do_void();
     // Allocation buffers were retired at the end of G1ParEvacuateFollowersClosure
-    assert(pss.queue_is_empty(), "should be");
+    assert(pss->queue_is_empty(), "should be");
   }
 };
 
 // Weak Reference processing during an evacuation pause (part 1).
-void G1CollectedHeap::process_discovered_references() {
+void G1CollectedHeap::process_discovered_references(G1ParScanThreadState** per_thread_states) {
   double ref_proc_start = os::elapsedTime();
 
   ReferenceProcessor* rp = _ref_processor_stw;
@@ -5362,6 +5428,7 @@
   uint no_of_gc_workers = workers()->active_workers();
 
   G1ParPreserveCMReferentsTask keep_cm_referents(this,
+                                                 per_thread_states,
                                                  no_of_gc_workers,
                                                  _task_queues);
 
@@ -5376,16 +5443,17 @@
   // JNI refs.
 
   // Use only a single queue for this PSS.
-  G1ParScanThreadState            pss(this, 0, NULL);
-  assert(pss.queue_is_empty(), "pre-condition");
+  G1ParScanThreadState*           pss = per_thread_states[0];
+  pss->set_ref_processor(NULL);
+  assert(pss->queue_is_empty(), "pre-condition");
 
   // We do not embed a reference processor in the copying/scanning
   // closures while we're actually processing the discovered
   // reference objects.
 
-  G1ParScanExtRootClosure        only_copy_non_heap_cl(this, &pss, NULL);
-
-  G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(this, &pss, NULL);
+  G1ParScanExtRootClosure        only_copy_non_heap_cl(this, pss, NULL);
+
+  G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(this, pss, NULL);
 
   OopClosure*                    copy_non_heap_cl = &only_copy_non_heap_cl;
 
@@ -5395,10 +5463,10 @@
   }
 
   // Keep alive closure.
-  G1CopyingKeepAliveClosure keep_alive(this, copy_non_heap_cl, &pss);
+  G1CopyingKeepAliveClosure keep_alive(this, copy_non_heap_cl, pss);
 
   // Serial Complete GC closure
-  G1STWDrainQueueClosure drain_queue(this, &pss);
+  G1STWDrainQueueClosure drain_queue(this, pss);
 
   // Setup the soft refs policy...
   rp->setup_policy(false);
@@ -5417,7 +5485,7 @@
     assert(rp->num_q() == no_of_gc_workers, "sanity");
     assert(no_of_gc_workers <= rp->max_num_q(), "sanity");
 
-    G1STWRefProcTaskExecutor par_task_executor(this, workers(), _task_queues, no_of_gc_workers);
+    G1STWRefProcTaskExecutor par_task_executor(this, per_thread_states, workers(), _task_queues, no_of_gc_workers);
     stats = rp->process_discovered_references(&is_alive,
                                               &keep_alive,
                                               &drain_queue,
@@ -5429,14 +5497,14 @@
   _gc_tracer_stw->report_gc_reference_stats(stats);
 
   // We have completed copying any necessary live referent objects.
-  assert(pss.queue_is_empty(), "both queue and overflow should be empty");
+  assert(pss->queue_is_empty(), "both queue and overflow should be empty");
 
   double ref_proc_time = os::elapsedTime() - ref_proc_start;
   g1_policy()->phase_times()->record_ref_proc_time(ref_proc_time * 1000.0);
 }
 
 // Weak Reference processing during an evacuation pause (part 2).
-void G1CollectedHeap::enqueue_discovered_references() {
+void G1CollectedHeap::enqueue_discovered_references(G1ParScanThreadState** per_thread_states) {
   double ref_enq_start = os::elapsedTime();
 
   ReferenceProcessor* rp = _ref_processor_stw;
@@ -5455,7 +5523,7 @@
     assert(rp->num_q() == n_workers, "sanity");
     assert(n_workers <= rp->max_num_q(), "sanity");
 
-    G1STWRefProcTaskExecutor par_task_executor(this, workers(), _task_queues, n_workers);
+    G1STWRefProcTaskExecutor par_task_executor(this, per_thread_states, workers(), _task_queues, n_workers);
     rp->enqueue_discovered_references(&par_task_executor);
   }
 
@@ -5491,9 +5559,14 @@
   double start_par_time_sec = os::elapsedTime();
   double end_par_time_sec;
 
+  G1ParScanThreadState** per_thread_states = NEW_C_HEAP_ARRAY(G1ParScanThreadState*, n_workers, mtGC);
+  for (uint i = 0; i < n_workers; i++) {
+    per_thread_states[i] = new_par_scan_state(i);
+  }
+
   {
     G1RootProcessor root_processor(this, n_workers);
-    G1ParTask g1_par_task(this, _task_queues, &root_processor, n_workers);
+    G1ParTask g1_par_task(this, per_thread_states, _task_queues, &root_processor, n_workers);
     // InitialMark needs claim bits to keep track of the marked-through CLDs.
     if (collector_state()->during_initial_mark_pause()) {
       ClassLoaderDataGraph::clear_claimed_marks();
@@ -5501,7 +5574,7 @@
 
     // The individual threads will set their evac-failure closures.
     if (PrintTerminationStats) {
-      G1ParScanThreadState::print_termination_stats_hdr();
+      print_termination_stats_hdr(gclog_or_tty);
     }
 
     workers()->run_task(&g1_par_task);
@@ -5528,7 +5601,7 @@
   // as we may have to copy some 'reachable' referent
   // objects (and their reachable sub-graphs) that were
   // not copied during the pause.
-  process_discovered_references();
+  process_discovered_references(per_thread_states);
 
   if (G1StringDedup::is_enabled()) {
     double fixup_start = os::elapsedTime();
@@ -5544,6 +5617,14 @@
   _allocator->release_gc_alloc_regions(evacuation_info);
   g1_rem_set()->cleanup_after_oops_into_collection_set_do();
 
+  for (uint i = 0; i < n_workers; i++) {
+    G1ParScanThreadState* pss = per_thread_states[i];
+    delete pss;
+  }
+  FREE_C_HEAP_ARRAY(G1ParScanThreadState*, per_thread_states);
+
+  record_obj_copy_mem_stats();
+
   // Reset and re-enable the hot card cache.
   // Note the counts for the cards in the regions in the
   // collection set are reset when the collection set is freed.
@@ -5568,12 +5649,17 @@
   // will log these updates (and dirty their associated
   // cards). We need these updates logged to update any
   // RSets.
-  enqueue_discovered_references();
+  enqueue_discovered_references(per_thread_states);
 
   redirty_logged_cards();
   COMPILER2_PRESENT(DerivedPointerTable::update_pointers());
 }
 
+void G1CollectedHeap::record_obj_copy_mem_stats() {
+  _gc_tracer_stw->report_evacuation_statistics(create_g1_evac_summary(&_survivor_evac_stats),
+                                               create_g1_evac_summary(&_old_evac_stats));
+}
+
 void G1CollectedHeap::free_region(HeapRegion* hr,
                                   FreeRegionList* free_list,
                                   bool par,
@@ -5972,6 +6058,11 @@
       cur->set_evacuation_failed(false);
       // The region is now considered to be old.
       cur->set_old();
+      // Do some allocation statistics accounting. Regions that failed evacuation
+      // are always made old, so there is no need to update anything in the young
+      // gen statistics, but we need to update old gen statistics.
+      size_t used_words = cur->marked_bytes() / HeapWordSize;
+      _old_evac_stats.add_failure_used_and_waste(used_words, HeapRegion::GrainWords - used_words);
       _old_set.add(cur);
       evacuation_info.increment_collectionset_used_after(cur->used());
     }
@@ -6217,6 +6308,10 @@
   }
 }
 
+bool G1CollectedHeap::is_old_gc_alloc_region(HeapRegion* hr) {
+  return _allocator->is_retained_old_region(hr);
+}
+
 void G1CollectedHeap::set_region_short_lived_locked(HeapRegion* hr) {
   _young_list->push_region(hr);
 }
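
The g1CollectedHeap.cpp hunks above replace the stack-allocated, per-task G1ParScanThreadState with heap-allocated per-worker states that are created before the parallel phases, threaded through root scanning and reference processing, and flushed and freed once evacuation completes. A minimal standalone sketch of that ownership pattern follows; every type and helper in it is an illustrative stand-in, not a HotSpot API.

    // Sketch: pause-scoped per-worker state, created up front, handed to the
    // parallel phases, then flushed and freed.  ThreadStateSketch and
    // evacuation_pause_sketch are illustrative names, not HotSpot code.
    #include <cstddef>
    #include <memory>
    #include <vector>

    struct ThreadStateSketch {
      explicit ThreadStateSketch(unsigned worker_id) : id(worker_id), copied_words(0) {}
      unsigned id;
      size_t   copied_words;
      // Analogous to the destructor above flushing PLAB/age-table statistics.
      void flush_stats(size_t* total_copied) { *total_copied += copied_words; }
    };

    void evacuation_pause_sketch(unsigned n_workers) {
      std::vector<std::unique_ptr<ThreadStateSketch>> per_thread_states;
      for (unsigned i = 0; i < n_workers; i++) {
        per_thread_states.emplace_back(new ThreadStateSketch(i));  // cf. new_par_scan_state(i)
      }

      // ... run the parallel evacuation task and reference processing,
      //     passing per_thread_states[i] to worker i ...

      size_t total_copied = 0;
      for (auto& pss : per_thread_states) {
        pss->flush_stats(&total_copied);  // per-thread stats survive into heap-wide accounting
      }
      per_thread_states.clear();          // cf. delete pss / FREE_C_HEAP_ARRAY
    }

Keeping the states alive across reference processing is what allows the later hunks to hand per_thread_states into G1STWRefProcTaskExecutor instead of constructing a throwaway G1ParScanThreadState.
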
--- a/src/share/vm/gc/g1/g1CollectedHeap.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1CollectedHeap.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -28,12 +28,12 @@
 #include "gc/g1/concurrentMark.hpp"
 #include "gc/g1/evacuationInfo.hpp"
 #include "gc/g1/g1AllocationContext.hpp"
-#include "gc/g1/g1Allocator.hpp"
 #include "gc/g1/g1BiasedArray.hpp"
 #include "gc/g1/g1CollectorState.hpp"
 #include "gc/g1/g1HRPrinter.hpp"
 #include "gc/g1/g1InCSetState.hpp"
 #include "gc/g1/g1MonitoringSupport.hpp"
+#include "gc/g1/g1EvacStats.hpp"
 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
 #include "gc/g1/g1YCTypes.hpp"
 #include "gc/g1/hSpaceCounters.hpp"
@@ -41,6 +41,7 @@
 #include "gc/g1/heapRegionSet.hpp"
 #include "gc/shared/barrierSet.hpp"
 #include "gc/shared/collectedHeap.hpp"
+#include "gc/shared/plab.hpp"
 #include "memory/memRegion.hpp"
 #include "utilities/stack.hpp"
 
@@ -54,6 +55,7 @@
 class HRRSCleanupTask;
 class GenerationSpec;
 class OopsInHeapRegionClosure;
+class G1ParScanThreadState;
 class G1KlassScanClosure;
 class G1ParScanThreadState;
 class ObjectClosure;
@@ -75,7 +77,9 @@
 class EvacuationFailedInfo;
 class nmethod;
 class Ticks;
-class FlexibleWorkGang;
+class WorkGang;
+class G1Allocator;
+class G1ArchiveAllocator;
 
 typedef OverflowTaskQueue<StarTask, mtGC>         RefToScanQueue;
 typedef GenericTaskQueueSet<RefToScanQueue, mtGC> RefToScanQueueSet;
@@ -184,8 +188,7 @@
   friend class VM_G1IncCollectionPause;
   friend class VMStructs;
   friend class MutatorAllocRegion;
-  friend class SurvivorGCAllocRegion;
-  friend class OldGCAllocRegion;
+  friend class G1GCAllocRegion;
 
   // Closures used in implementation.
   friend class G1ParScanThreadState;
@@ -200,7 +203,7 @@
   friend class G1CheckCSetFastTableClosure;
 
 private:
-  FlexibleWorkGang* _workers;
+  WorkGang* _workers;
 
   static size_t _humongous_object_threshold_in_words;
 
@@ -245,7 +248,7 @@
   // The sequence of all heap regions in the heap.
   HeapRegionManager _hrm;
 
-  // Handles non-humongous allocations in the G1CollectedHeap.
+  // Manages all allocations into heap regions, except humongous object allocations.
   G1Allocator* _allocator;
 
   // Outside of GC pauses, the number of bytes used in all regions other
@@ -263,11 +266,11 @@
   // Statistics for each allocation context
   AllocationContextStats _allocation_context_stats;
 
-  // PLAB sizing policy for survivors.
-  PLABStats _survivor_plab_stats;
+  // GC allocation statistics policy for survivors.
+  G1EvacStats _survivor_evac_stats;
 
-  // PLAB sizing policy for tenured objects.
-  PLABStats _old_plab_stats;
+  // GC allocation statistics policy for tenured objects.
+  G1EvacStats _old_evac_stats;
 
   // It specifies whether we should attempt to expand the heap after a
   // region allocation failure. If heap expansion fails we set this to
@@ -581,14 +584,14 @@
 
   // Process any reference objects discovered during
   // an incremental evacuation pause.
-  void process_discovered_references();
+  void process_discovered_references(G1ParScanThreadState** per_thread_states);
 
   // Enqueue any remaining discovered references
   // after processing.
-  void enqueue_discovered_references();
+  void enqueue_discovered_references(G1ParScanThreadState** per_thread_states);
 
 public:
-  FlexibleWorkGang* workers() const { return _workers; }
+  WorkGang* workers() const { return _workers; }
 
   G1Allocator* allocator() {
     return _allocator;
@@ -606,7 +609,7 @@
   bool expand(size_t expand_bytes);
 
   // Returns the PLAB statistics for a given destination.
-  inline PLABStats* alloc_buffer_stats(InCSetState dest);
+  inline G1EvacStats* alloc_buffer_stats(InCSetState dest);
 
   // Determines PLAB size for a given destination.
   inline size_t desired_plab_sz(InCSetState dest);
@@ -680,6 +683,9 @@
   // Allocates a new heap region instance.
   HeapRegion* new_heap_region(uint hrs_index, MemRegion mr);
 
+  // Allocates a new per thread par scan state for the given thread id.
+  G1ParScanThreadState* new_par_scan_state(uint worker_id);
+
   // Allocate the highest free region in the reserved heap. This will commit
   // regions as necessary.
   HeapRegion* alloc_highest_free_region();
@@ -789,6 +795,20 @@
   // Actually do the work of evacuating the collection set.
   void evacuate_collection_set(EvacuationInfo& evacuation_info);
 
+  // Print the header for the per-thread termination statistics.
+  static void print_termination_stats_hdr(outputStream* const st);
+  // Print actual per-thread termination statistics.
+  void print_termination_stats(outputStream* const st,
+                               uint worker_id,
+                               double elapsed_ms,
+                               double strong_roots_ms,
+                               double term_ms,
+                               size_t term_attempts,
+                               size_t alloc_buffer_waste,
+                               size_t undo_waste) const;
+  // Update object copying statistics.
+  void record_obj_copy_mem_stats();
+
   // The g1 remembered set of the heap.
   G1RemSet* _g1_rem_set;
 
@@ -1195,9 +1215,7 @@
 
   // Determine whether the given region is one that we are using as an
   // old GC alloc region.
-  bool is_old_gc_alloc_region(HeapRegion* hr) {
-    return _allocator->is_retained_old_region(hr);
-  }
+  bool is_old_gc_alloc_region(HeapRegion* hr);
 
   // Perform a collection of the heap; intended for use in implementing
   // "System.gc".  This probably implies as full a collection as the
@@ -1566,6 +1584,7 @@
                         const VerifyOption vo) const;
 
   G1HeapSummary create_g1_heap_summary();
+  G1EvacSummary create_g1_evac_summary(G1EvacStats* stats);
 
   // Printing
 
--- a/src/share/vm/gc/g1/g1CollectedHeap.inline.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1CollectedHeap.inline.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -35,12 +35,12 @@
 #include "gc/shared/taskqueue.hpp"
 #include "runtime/orderAccess.inline.hpp"
 
-PLABStats* G1CollectedHeap::alloc_buffer_stats(InCSetState dest) {
+G1EvacStats* G1CollectedHeap::alloc_buffer_stats(InCSetState dest) {
   switch (dest.value()) {
     case InCSetState::Young:
-      return &_survivor_plab_stats;
+      return &_survivor_evac_stats;
     case InCSetState::Old:
-      return &_old_plab_stats;
+      return &_old_evac_stats;
     default:
       ShouldNotReachHere();
       return NULL; // Keep some compilers happy
--- a/src/share/vm/gc/g1/g1CollectedHeap_ext.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1CollectedHeap_ext.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -24,6 +24,7 @@
 
 #include "precompiled.hpp"
 #include "gc/g1/g1CollectedHeap.hpp"
+#include "gc/g1/g1ParScanThreadState.hpp"
 #include "gc/g1/heapRegion.inline.hpp"
 
 bool G1CollectedHeap::copy_allocation_context_stats(const jint* contexts,
@@ -37,3 +38,7 @@
                                              MemRegion mr) {
   return new HeapRegion(hrs_index, bot_shared(), mr);
 }
+
+G1ParScanThreadState* G1CollectedHeap::new_par_scan_state(uint worker_id) {
+  return new G1ParScanThreadState(this, worker_id);
+}
--- a/src/share/vm/gc/g1/g1CollectorPolicy.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1CollectorPolicy.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -1582,7 +1582,7 @@
 G1CollectorPolicy::record_concurrent_mark_cleanup_end() {
   _collectionSetChooser->clear();
 
-  FlexibleWorkGang* workers = _g1->workers();
+  WorkGang* workers = _g1->workers();
   uint n_workers = workers->active_workers();
 
   uint n_regions = _g1->num_regions();
--- a/src/share/vm/gc/g1/g1CollectorPolicy.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1CollectorPolicy.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -26,8 +26,8 @@
 #define SHARE_VM_GC_G1_G1COLLECTORPOLICY_HPP
 
 #include "gc/g1/collectionSetChooser.hpp"
-#include "gc/g1/g1Allocator.hpp"
 #include "gc/g1/g1CollectorState.hpp"
+#include "gc/g1/g1InCSetState.hpp"
 #include "gc/g1/g1MMUTracker.hpp"
 #include "gc/shared/collectorPolicy.hpp"
 
--- a/src/share/vm/gc/g1/g1CollectorPolicy_ext.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- *
- */
-
-#ifndef SHARE_VM_GC_G1_G1COLLECTORPOLICY_EXT_HPP
-#define SHARE_VM_GC_G1_G1COLLECTORPOLICY_EXT_HPP
-
-#include "gc/g1/g1CollectorPolicy.hpp"
-
-class G1CollectorPolicyExt : public G1CollectorPolicy { };
-
-#endif // SHARE_VM_GC_G1_G1COLLECTORPOLICY_EXT_HPP
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/share/vm/gc/g1/g1EvacStats.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "gc/g1/g1EvacStats.hpp"
+#include "gc/shared/gcId.hpp"
+#include "trace/tracing.hpp"
+
+void G1EvacStats::adjust_desired_plab_sz() {
+  if (PrintPLAB) {
+    gclog_or_tty->print(" (allocated = " SIZE_FORMAT " wasted = " SIZE_FORMAT " "
+                        "unused = " SIZE_FORMAT " used = " SIZE_FORMAT " "
+                        "undo_waste = " SIZE_FORMAT " region_end_waste = " SIZE_FORMAT " "
+                        "regions filled = %u direct_allocated = " SIZE_FORMAT " "
+                        "failure_used = " SIZE_FORMAT " failure_waste = " SIZE_FORMAT ") ",
+                        _allocated, _wasted, _unused, used(), _undo_wasted, _region_end_waste,
+                        _regions_filled, _direct_allocated, _failure_used, _failure_waste);
+  }
+
+  if (ResizePLAB) {
+
+    assert(is_object_aligned(max_size()) && min_size() <= max_size(),
+           "PLAB clipping computation may be incorrect");
+
+    if (_allocated == 0) {
+      assert((_unused == 0),
+             err_msg("Inconsistency in PLAB stats: "
+                     "_allocated: "SIZE_FORMAT", "
+                     "_wasted: "SIZE_FORMAT", "
+                     "_region_end_waste: "SIZE_FORMAT", "
+                     "_unused: "SIZE_FORMAT", "
+                     "_used  : "SIZE_FORMAT,
+                     _allocated, _wasted, _region_end_waste, _unused, used()));
+      _allocated = 1;
+    }
+    // We account region end waste fully to PLAB allocation. This is not completely fair,
+    // but is a conservative assumption because PLABs may be sized flexibly while we
+    // cannot adjust direct allocations.
+    // In some cases, wasted_frac may become > 1 but that just reflects the problem
+    // with region_end_waste.
+    double wasted_frac    = (double)(_unused + _wasted + _region_end_waste) / (double)_allocated;
+    size_t target_refills = (size_t)((wasted_frac * TargetSurvivorRatio) / TargetPLABWastePct);
+    if (target_refills == 0) {
+      target_refills = 1;
+    }
+    size_t cur_plab_sz = used() / target_refills;
+    // Take historical weighted average
+    _filter.sample(cur_plab_sz);
+    // Clip from above and below, and align to object boundary
+    size_t plab_sz;
+    plab_sz = MAX2(min_size(), (size_t)_filter.average());
+    plab_sz = MIN2(max_size(), plab_sz);
+    plab_sz = align_object_size(plab_sz);
+    // Latch the result
+    _desired_net_plab_sz = plab_sz;
+    if (PrintPLAB) {
+      gclog_or_tty->print_cr(" (plab_sz = " SIZE_FORMAT " desired_plab_sz = " SIZE_FORMAT ") ", cur_plab_sz, plab_sz);
+    }
+  }
+  // Clear accumulators for next round.
+  reset();
+}
+
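
adjust_desired_plab_sz() above derives the next PLAB size from the waste seen in the previous pause: unused, wasted and region-end-wasted words form a wasted fraction, which together with TargetSurvivorRatio and TargetPLABWastePct yields a target refill count, and used()/target_refills becomes the sample fed to the filter. The standalone sketch below replays that arithmetic; the ratio constants, clipping bounds and the crude smoothing blend are example values only, not the VM's actual flags or its weighted filter.

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    // Standalone replay of the resize step above; all constants are examples.
    size_t sketch_adjust_plab_sz(size_t allocated, size_t wasted, size_t unused,
                                 size_t region_end_waste, size_t used,
                                 double previous_average) {
      const double target_survivor_ratio = 50.0;   // stand-in for TargetSurvivorRatio
      const double target_plab_waste_pct = 10.0;   // stand-in for TargetPLABWastePct
      const size_t min_words = 256, max_words = 64 * 1024;  // example clipping bounds

      if (allocated == 0) allocated = 1;           // avoid division by zero, as above
      double wasted_frac = double(unused + wasted + region_end_waste) / double(allocated);
      size_t target_refills = size_t((wasted_frac * target_survivor_ratio) / target_plab_waste_pct);
      if (target_refills == 0) target_refills = 1;

      size_t cur_plab_sz = used / target_refills;
      // The real code samples cur_plab_sz into a weighted filter; blend crudely here.
      size_t smoothed = size_t(0.7 * previous_average + 0.3 * double(cur_plab_sz));
      size_t plab_sz = std::min(max_words, std::max(min_words, smoothed));
      std::printf("cur_plab_sz=%zu desired=%zu\n", cur_plab_sz, plab_sz);
      return plab_sz;
    }

More observed waste therefore means more, smaller PLABs (bounding the worst-case waste per refill), while low waste lets the PLABs grow toward the clipping maximum.
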
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/share/vm/gc/g1/g1EvacStats.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_VM_GC_G1_G1EVACSTATS_HPP
+#define SHARE_VM_GC_G1_G1EVACSTATS_HPP
+
+#include "gc/shared/plab.hpp"
+#include "runtime/atomic.hpp"
+
+// Records various memory allocation statistics gathered during evacuation.
+class G1EvacStats : public PLABStats {
+ private:
+  size_t _region_end_waste; // Number of words wasted due to skipping to the next region.
+  uint   _regions_filled;   // Number of regions filled completely.
+  size_t _direct_allocated; // Number of words allocated directly into the regions.
+
+  // Number of words in live objects remaining in regions that ultimately suffered an
+  // evacuation failure. This is recorded when the failed regions are turned into old regions.
+  size_t _failure_used;
+  // Number of words wasted in regions which failed evacuation. This is the sum of space
+  // for objects successfully copied out of the regions (now dead space) plus waste at the
+  // end of regions.
+  size_t _failure_waste;
+
+  virtual void reset() {
+    PLABStats::reset();
+    _region_end_waste = 0;
+    _regions_filled = 0;
+    _direct_allocated = 0;
+    _failure_used = 0;
+    _failure_waste = 0;
+  }
+
+ public:
+  G1EvacStats(size_t desired_plab_sz_, unsigned wt) : PLABStats(desired_plab_sz_, wt),
+    _region_end_waste(0), _regions_filled(0), _direct_allocated(0),
+    _failure_used(0), _failure_waste(0) {
+  }
+
+  virtual void adjust_desired_plab_sz();
+
+  size_t allocated() const { return _allocated; }
+  size_t wasted() const { return _wasted; }
+  size_t unused() const { return _unused; }
+  size_t used() const { return allocated() - (wasted() + unused()); }
+  size_t undo_wasted() const { return _undo_wasted; }
+
+  uint regions_filled() const { return _regions_filled; }
+  size_t region_end_waste() const { return _region_end_waste; }
+  size_t direct_allocated() const { return _direct_allocated; }
+
+  // Amount of space in heapwords used in the failing regions when an evacuation failure happens.
+  size_t failure_used() const { return _failure_used; }
+  // Amount of space in heapwords wasted (unused) in the failing regions when an evacuation failure happens.
+  size_t failure_waste() const { return _failure_waste; }
+
+  void add_direct_allocated(size_t value) {
+    Atomic::add_ptr(value, &_direct_allocated);
+  }
+
+  void add_region_end_waste(size_t value) {
+    Atomic::add_ptr(value, &_region_end_waste);
+    Atomic::add_ptr(1, &_regions_filled);
+  }
+
+  void add_failure_used_and_waste(size_t used, size_t waste) {
+    Atomic::add_ptr(used, &_failure_used);
+    Atomic::add_ptr(waste, &_failure_waste);
+  }
+};
+
+#endif // SHARE_VM_GC_G1_G1EVACSTATS_HPP
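
The add_* methods above accumulate per-worker contributions lock-free through Atomic::add_ptr; for instance, evacuation-failure handling adds a failed region's live words as used and the remainder of the region's capacity as waste. A small sketch of the same accumulation pattern using std::atomic (EvacStatsSketch is a placeholder, not a HotSpot class):

    #include <atomic>
    #include <cstddef>

    struct EvacStatsSketch {
      std::atomic<size_t> region_end_waste{0};
      std::atomic<size_t> regions_filled{0};
      std::atomic<size_t> failure_used{0};
      std::atomic<size_t> failure_waste{0};

      void add_region_end_waste(size_t words) {
        region_end_waste.fetch_add(words, std::memory_order_relaxed);
        regions_filled.fetch_add(1, std::memory_order_relaxed);
      }
      // One call per failed region: its live words count as used, the rest of
      // the region's capacity as waste (cf. marked_bytes / GrainWords above).
      void add_failure_used_and_waste(size_t used_words, size_t waste_words) {
        failure_used.fetch_add(used_words, std::memory_order_relaxed);
        failure_waste.fetch_add(waste_words, std::memory_order_relaxed);
      }
    };
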
--- a/src/share/vm/gc/g1/g1OopClosures.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1OopClosures.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -32,7 +32,11 @@
 
 G1ParCopyHelper::G1ParCopyHelper(G1CollectedHeap* g1,  G1ParScanThreadState* par_scan_state) :
   G1ParClosureSuper(g1, par_scan_state), _scanned_klass(NULL),
-  _cm(_g1->concurrent_mark()) {}
+  _cm(_g1->concurrent_mark()) { }
+
+G1ParCopyHelper::G1ParCopyHelper(G1CollectedHeap* g1) :
+  G1ParClosureSuper(g1), _scanned_klass(NULL),
+  _cm(_g1->concurrent_mark()) { }
 
 G1ParClosureSuper::G1ParClosureSuper(G1CollectedHeap* g1) :
   _g1(g1), _par_scan_state(NULL), _worker_id(UINT_MAX) { }
--- a/src/share/vm/gc/g1/g1OopClosures.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1OopClosures.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -76,15 +76,13 @@
 
 class G1ParScanClosure : public G1ParClosureSuper {
 public:
-  G1ParScanClosure(G1CollectedHeap* g1, ReferenceProcessor* rp) :
-    G1ParClosureSuper(g1) {
-    assert(_ref_processor == NULL, "sanity");
-    _ref_processor = rp;
-  }
+  G1ParScanClosure(G1CollectedHeap* g1) : G1ParClosureSuper(g1) { }
 
   template <class T> void do_oop_nv(T* p);
   virtual void do_oop(oop* p)          { do_oop_nv(p); }
   virtual void do_oop(narrowOop* p)    { do_oop_nv(p); }
+
+  void set_ref_processor(ReferenceProcessor* ref_processor) { _ref_processor = ref_processor; }
 };
 
 // Add back base class for metadata
@@ -104,6 +102,7 @@
   void mark_forwarded_object(oop from_obj, oop to_obj);
  public:
   G1ParCopyHelper(G1CollectedHeap* g1,  G1ParScanThreadState* par_scan_state);
+  G1ParCopyHelper(G1CollectedHeap* g1);
 
   void set_scanned_klass(Klass* k) { _scanned_klass = k; }
   template <class T> void do_klass_barrier(T* p, oop new_obj);
@@ -132,6 +131,10 @@
     assert(_ref_processor == NULL, "sanity");
   }
 
+  G1ParCopyClosure(G1CollectedHeap* g1) : G1ParCopyHelper(g1) {
+    assert(_ref_processor == NULL, "sanity");
+  }
+
   template <class T> void do_oop_nv(T* p) { do_oop_work(p); }
   virtual void do_oop(oop* p)       { do_oop_nv(p); }
   virtual void do_oop(narrowOop* p) { do_oop_nv(p); }
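
With the constructor change above, scan closures no longer capture a ReferenceProcessor at construction time; the processor is attached later through set_ref_processor(), which is what lets process_discovered_references() reuse a worker's existing per-thread state with discovery disabled (pss->set_ref_processor(NULL)). A minimal sketch of that deferred-injection pattern, with placeholder types rather than the VM's classes:

    #include <cstddef>

    struct RefProcessorSketch { /* discovery bookkeeping elided */ };

    class ScanClosureSketch {
      RefProcessorSketch* _rp = nullptr;
     public:
      void set_ref_processor(RefProcessorSketch* rp) { _rp = rp; }
      void do_oop(void** p) {
        // ... copy or forward *p; record discovered references only when _rp != nullptr ...
        (void)p;
      }
    };

    void pause_phases_sketch(ScanClosureSketch& cl, RefProcessorSketch* stw_rp) {
      cl.set_ref_processor(stw_rp);   // evacuation: discover into the STW processor
      // ... scan roots and drain queues ...
      cl.set_ref_processor(nullptr);  // reference processing: discovery off, cf. pss->set_ref_processor(NULL)
    }
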
--- a/src/share/vm/gc/g1/g1ParScanThreadState.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1ParScanThreadState.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -23,6 +23,7 @@
  */
 
 #include "precompiled.hpp"
+#include "gc/g1/g1Allocator.inline.hpp"
 #include "gc/g1/g1CollectedHeap.inline.hpp"
 #include "gc/g1/g1OopClosures.inline.hpp"
 #include "gc/g1/g1ParScanThreadState.inline.hpp"
@@ -31,17 +32,19 @@
 #include "oops/oop.inline.hpp"
 #include "runtime/prefetch.inline.hpp"
 
-G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id, ReferenceProcessor* rp)
+G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id)
   : _g1h(g1h),
     _refs(g1h->task_queue(worker_id)),
     _dcq(&g1h->dirty_card_queue_set()),
     _ct_bs(g1h->g1_barrier_set()),
     _g1_rem(g1h->g1_rem_set()),
-    _hash_seed(17), _worker_id(worker_id),
-    _term_attempts(0),
+    _hash_seed(17),
+    _worker_id(worker_id),
     _tenuring_threshold(g1h->g1_policy()->tenuring_threshold()),
-    _age_table(false), _scanner(g1h, rp),
-    _strong_roots_time(0), _term_time(0) {
+    _age_table(false),
+    _scanner(g1h),
+    _old_gen_is_full(false)
+{
   _scanner.set_par_scan_thread_state(this);
   // we allocate G1YoungSurvRateNumRegions plus one entries, since
   // we "sacrifice" entry 0 to keep track of surviving bytes for
@@ -66,38 +69,20 @@
   // need to be moved to the next space.
   _dest[InCSetState::Young]        = InCSetState::Old;
   _dest[InCSetState::Old]          = InCSetState::Old;
-
-  _start = os::elapsedTime();
 }
 
 G1ParScanThreadState::~G1ParScanThreadState() {
-  _plab_allocator->retire_alloc_buffers();
+  // Update allocation statistics.
+  _plab_allocator->flush_and_retire_stats();
   delete _plab_allocator;
+  _g1h->g1_policy()->record_thread_age_table(&_age_table);
+  // Update heap statistics.
+  _g1h->update_surviving_young_words(_surviving_young_words);
   FREE_C_HEAP_ARRAY(size_t, _surviving_young_words_base);
 }
 
-void G1ParScanThreadState::print_termination_stats_hdr(outputStream* const st) {
-  st->print_raw_cr("GC Termination Stats");
-  st->print_raw_cr("     elapsed  --strong roots-- -------termination------- ------waste (KiB)------");
-  st->print_raw_cr("thr     ms        ms      %        ms      %    attempts  total   alloc    undo");
-  st->print_raw_cr("--- --------- --------- ------ --------- ------ -------- ------- ------- -------");
-}
-
-void G1ParScanThreadState::print_termination_stats(outputStream* const st) const {
-  const double elapsed_ms = elapsed_time() * 1000.0;
-  const double s_roots_ms = strong_roots_time() * 1000.0;
-  const double term_ms    = term_time() * 1000.0;
-  size_t alloc_buffer_waste = 0;
-  size_t undo_waste = 0;
-  _plab_allocator->waste(alloc_buffer_waste, undo_waste);
-  st->print_cr("%3u %9.2f %9.2f %6.2f "
-               "%9.2f %6.2f " SIZE_FORMAT_W(8) " "
-               SIZE_FORMAT_W(7) " " SIZE_FORMAT_W(7) " " SIZE_FORMAT_W(7),
-               _worker_id, elapsed_ms, s_roots_ms, s_roots_ms * 100 / elapsed_ms,
-               term_ms, term_ms * 100 / elapsed_ms, term_attempts(),
-               (alloc_buffer_waste + undo_waste) * HeapWordSize / K,
-               alloc_buffer_waste * HeapWordSize / K,
-               undo_waste * HeapWordSize / K);
+void G1ParScanThreadState::waste(size_t& wasted, size_t& undo_wasted) {
+  _plab_allocator->waste(wasted, undo_wasted);
 }
 
 #ifdef ASSERT
@@ -152,26 +137,38 @@
 HeapWord* G1ParScanThreadState::allocate_in_next_plab(InCSetState const state,
                                                       InCSetState* dest,
                                                       size_t word_sz,
-                                                      AllocationContext_t const context) {
+                                                      AllocationContext_t const context,
+                                                      bool previous_plab_refill_failed) {
   assert(state.is_in_cset_or_humongous(), err_msg("Unexpected state: " CSETSTATE_FORMAT, state.value()));
   assert(dest->is_in_cset_or_humongous(), err_msg("Unexpected dest: " CSETSTATE_FORMAT, dest->value()));
 
   // Right now we only have two types of regions (young / old) so
   // let's keep the logic here simple. We can generalize it when necessary.
   if (dest->is_young()) {
+    bool plab_refill_in_old_failed = false;
     HeapWord* const obj_ptr = _plab_allocator->allocate(InCSetState::Old,
                                                         word_sz,
-                                                        context);
-    if (obj_ptr == NULL) {
-      return NULL;
-    }
+                                                        context,
+                                                        &plab_refill_in_old_failed);
     // Make sure that we won't attempt to copy any other objects out
     // of a survivor region (given that apparently we cannot allocate
-    // any new ones) to avoid coming into this slow path.
-    _tenuring_threshold = 0;
-    dest->set_old();
+    // any new ones) to avoid coming into this slow path again and again.
+    // Only consider failed PLAB refill here: failed inline allocations are
+    // typically large, so not indicative of remaining space.
+    if (previous_plab_refill_failed) {
+      _tenuring_threshold = 0;
+    }
+
+    if (obj_ptr != NULL) {
+      dest->set_old();
+    } else {
+      // We just failed to allocate in old gen. The same idea as explained above
+      // for making survivor gen unavailable for allocation applies for old gen.
+      _old_gen_is_full = plab_refill_in_old_failed;
+    }
     return obj_ptr;
   } else {
+    _old_gen_is_full = previous_plab_refill_failed;
     assert(dest->is_old(), err_msg("Unexpected dest: " CSETSTATE_FORMAT, dest->value()));
     // no other space to try.
     return NULL;
@@ -202,14 +199,20 @@
 
   uint age = 0;
   InCSetState dest_state = next_state(state, old_mark, age);
+  // The second clause is to prevent premature evacuation failure in case there
+  // is still space in survivor, but old gen is full.
+  if (_old_gen_is_full && dest_state.is_old()) {
+    return handle_evacuation_failure_par(old, old_mark);
+  }
   HeapWord* obj_ptr = _plab_allocator->plab_allocate(dest_state, word_sz, context);
 
   // PLAB allocations should succeed most of the time, so we'll
   // normally check against NULL once and that's it.
   if (obj_ptr == NULL) {
-    obj_ptr = _plab_allocator->allocate_direct_or_new_plab(dest_state, word_sz, context);
+    bool plab_refill_failed = false;
+    obj_ptr = _plab_allocator->allocate_direct_or_new_plab(dest_state, word_sz, context, &plab_refill_failed);
     if (obj_ptr == NULL) {
-      obj_ptr = allocate_in_next_plab(state, &dest_state, word_sz, context);
+      obj_ptr = allocate_in_next_plab(state, &dest_state, word_sz, context, plab_refill_failed);
       if (obj_ptr == NULL) {
         // This will either forward-to-self, or detect that someone else has
         // installed a forwarding pointer.
@@ -253,7 +256,7 @@
       } else {
         obj->set_mark(old_mark->set_age(age));
       }
-      age_table()->add(age, word_sz);
+      _age_table.add(age, word_sz);
     } else {
       obj->set_mark(old_mark);
     }
@@ -271,8 +274,7 @@
                                              obj);
     }
 
-    size_t* const surv_young_words = surviving_young_words();
-    surv_young_words[young_index] += word_sz;
+    _surviving_young_words[young_index] += word_sz;
 
     if (obj->is_objArray() && arrayOop(obj)->length() >= ParGCArrayScanChunk) {
       // We keep track of the next start index in the length field of
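
copy_to_survivor_space() above now walks a fallback chain: try the current PLAB, then a direct or refilled PLAB in the same destination, then the next destination's PLAB, and finally evacuation-failure handling; the plab_refill_failed and _old_gen_is_full flags keep later copies from retrying destinations that are known to be exhausted. The sketch below mirrors only that control flow; its types and allocator stubs are placeholders, not the HotSpot code.

    #include <cstddef>

    enum class Dest { Young, Old };

    struct AllocSketch {
      bool old_gen_is_full = false;
      bool stop_tenuring   = false;

      void* plab_allocate(Dest, size_t)                    { return nullptr; }                   // fast-path stub
      void* direct_or_new_plab(Dest, size_t, bool* failed) { *failed = true; return nullptr; }   // allocator stub

      void* next_plab(Dest* dest, size_t sz, bool prev_refill_failed) {
        if (*dest == Dest::Young) {
          bool old_refill_failed = false;
          void* p = direct_or_new_plab(Dest::Old, sz, &old_refill_failed);
          // Only a failed PLAB refill indicates the survivor space is really full.
          if (prev_refill_failed) stop_tenuring = true;    // cf. _tenuring_threshold = 0
          if (p != nullptr) *dest = Dest::Old;
          else              old_gen_is_full = old_refill_failed;
          return p;
        }
        old_gen_is_full = prev_refill_failed;
        return nullptr;                                    // already old: nowhere else to try
      }

      void* copy_slow(Dest dest, size_t sz) {
        if (old_gen_is_full && dest == Dest::Old) return nullptr;  // fail fast into evacuation failure
        void* p = plab_allocate(dest, sz);
        if (p == nullptr) {
          bool refill_failed = false;
          p = direct_or_new_plab(dest, sz, &refill_failed);
          if (p == nullptr) p = next_plab(&dest, sz, refill_failed);
        }
        return p;  // nullptr => evacuation failure handling
      }
    };
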
--- a/src/share/vm/gc/g1/g1ParScanThreadState.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1ParScanThreadState.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -35,16 +35,17 @@
 #include "memory/allocation.hpp"
 #include "oops/oop.hpp"
 
+class G1PLABAllocator;
 class HeapRegion;
 class outputStream;
 
-class G1ParScanThreadState : public StackObj {
+class G1ParScanThreadState : public CHeapObj<mtGC> {
  private:
   G1CollectedHeap* _g1h;
   RefToScanQueue*  _refs;
   DirtyCardQueue   _dcq;
   G1SATBCardTableModRefBS* _ct_bs;
-  G1RemSet* _g1_rem;
+  G1RemSet*         _g1_rem;
 
   G1PLABAllocator*  _plab_allocator;
 
@@ -57,20 +58,16 @@
   int  _hash_seed;
   uint _worker_id;
 
-  size_t _term_attempts;
-
-  double _start;
-  double _start_strong_roots;
-  double _strong_roots_time;
-  double _start_term;
-  double _term_time;
-
   // Map from young-age-index (0 == not young, 1 is youngest) to
   // surviving words. base is what we get back from the malloc call
   size_t* _surviving_young_words_base;
   // this points into the array, as we use the first few entries for padding
   size_t* _surviving_young_words;
 
+  // Indicates whether the last generation (old) has run out of space
+  // for allocation.
+  bool _old_gen_is_full;
+
 #define PADDING_ELEM_NUM (DEFAULT_CACHE_LINE_SIZE / sizeof(size_t))
 
   DirtyCardQueue& dirty_card_queue()             { return _dcq;  }
@@ -85,10 +82,10 @@
   }
 
  public:
-  G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id, ReferenceProcessor* rp);
+  G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id);
   ~G1ParScanThreadState();
 
-  ageTable*         age_table()       { return &_age_table;       }
+  void set_ref_processor(ReferenceProcessor* rp) { _scanner.set_ref_processor(rp); }
 
 #ifdef ASSERT
   bool queue_is_empty() const { return _refs->is_empty(); }
@@ -114,40 +111,14 @@
 
   uint worker_id() { return _worker_id; }
 
-  size_t term_attempts() const  { return _term_attempts; }
-  void note_term_attempt() { _term_attempts++; }
-
-  void start_strong_roots() {
-    _start_strong_roots = os::elapsedTime();
-  }
-  void end_strong_roots() {
-    _strong_roots_time += (os::elapsedTime() - _start_strong_roots);
-  }
-  double strong_roots_time() const { return _strong_roots_time; }
-
-  void start_term_time() {
-    note_term_attempt();
-    _start_term = os::elapsedTime();
-  }
-  void end_term_time() {
-    _term_time += (os::elapsedTime() - _start_term);
-  }
-  double term_time() const { return _term_time; }
-
-  double elapsed_time() const {
-    return os::elapsedTime() - _start;
-  }
-
-  // Print the header for the per-thread termination statistics.
-  static void print_termination_stats_hdr(outputStream* const st = gclog_or_tty);
-
-  // Print actual per-thread termination statistics.
-  void print_termination_stats(outputStream* const st = gclog_or_tty) const;
+  // Returns, via the out parameters, the current amount of waste due to alignment
+  // or objects not fitting into the LABs, and the current undo waste.
+  virtual void waste(size_t& wasted, size_t& undo_wasted);
 
   size_t* surviving_young_words() {
-    // We add on to hide entry 0 which accumulates surviving words for
+    // We add one to hide entry 0 which accumulates surviving words for
     // age -1 regions (i.e. non-young ones)
-    return _surviving_young_words;
+    return _surviving_young_words + 1;
   }
 
  private:
@@ -190,12 +161,16 @@
 
   // Tries to allocate word_sz in the PLAB of the next "generation" after trying to
   // allocate into dest. State is the original (source) cset state for the object
-  // that is allocated for.
+  // that is allocated for. previous_plab_refill_failed indicates whether a previous
+  // PLAB refill into "state" failed.
   // Returns a non-NULL pointer if successful, and updates dest if required.
+  // Also determines whether we should continue to try to allocate into the various
+  // generations or just end trying to allocate.
   HeapWord* allocate_in_next_plab(InCSetState const state,
                                   InCSetState* dest,
                                   size_t word_sz,
-                                  AllocationContext_t const context);
+                                  AllocationContext_t const context,
+                                  bool previous_plab_refill_failed);
 
   inline InCSetState next_state(InCSetState const state, markOop const m, uint& age);
  public:
--- a/src/share/vm/gc/g1/g1RootProcessor.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1RootProcessor.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -115,7 +115,7 @@
 
 G1RootProcessor::G1RootProcessor(G1CollectedHeap* g1h, uint n_workers) :
     _g1h(g1h),
-    _process_strong_tasks(new SubTasksDone(G1RP_PS_NumElements)),
+    _process_strong_tasks(G1RP_PS_NumElements),
     _srs(n_workers),
     _lock(Mutex::leaf, "G1 Root Scanning barrier lock", false, Monitor::_safepoint_check_never),
     _n_workers_discovered_strong_classes(0) {}
@@ -158,7 +158,7 @@
   {
     // Now the CM ref_processor roots.
     G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::CMRefRoots, worker_i);
-    if (!_process_strong_tasks->is_task_claimed(G1RP_PS_refProcessor_oops_do)) {
+    if (!_process_strong_tasks.is_task_claimed(G1RP_PS_refProcessor_oops_do)) {
       // We need to treat the discovered reference lists of the
       // concurrent mark ref processor as roots and keep entries
       // (which are added by the marking threads) on them live
@@ -201,12 +201,12 @@
   // as implicitly live).
   {
     G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::SATBFiltering, worker_i);
-    if (!_process_strong_tasks->is_task_claimed(G1RP_PS_filter_satb_buffers) && _g1h->collector_state()->mark_in_progress()) {
+    if (!_process_strong_tasks.is_task_claimed(G1RP_PS_filter_satb_buffers) && _g1h->collector_state()->mark_in_progress()) {
       JavaThread::satb_mark_queue_set().filter_thread_buffers();
     }
   }
 
-  _process_strong_tasks->all_tasks_completed(n_workers());
+  _process_strong_tasks.all_tasks_completed(n_workers());
 }
 
 void G1RootProcessor::process_strong_roots(OopClosure* oops,
@@ -216,7 +216,7 @@
   process_java_roots(oops, clds, clds, NULL, blobs, NULL, 0);
   process_vm_roots(oops, NULL, NULL, 0);
 
-  _process_strong_tasks->all_tasks_completed(n_workers());
+  _process_strong_tasks.all_tasks_completed(n_workers());
 }
 
 void G1RootProcessor::process_all_roots(OopClosure* oops,
@@ -226,11 +226,11 @@
   process_java_roots(oops, NULL, clds, clds, NULL, NULL, 0);
   process_vm_roots(oops, oops, NULL, 0);
 
-  if (!_process_strong_tasks->is_task_claimed(G1RP_PS_CodeCache_oops_do)) {
+  if (!_process_strong_tasks.is_task_claimed(G1RP_PS_CodeCache_oops_do)) {
     CodeCache::blobs_do(blobs);
   }
 
-  _process_strong_tasks->all_tasks_completed(n_workers());
+  _process_strong_tasks.all_tasks_completed(n_workers());
 }
 
 void G1RootProcessor::process_java_roots(OopClosure* strong_roots,
@@ -246,7 +246,7 @@
   // let the thread process the weak CLDs and nmethods.
   {
     G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::CLDGRoots, worker_i);
-    if (!_process_strong_tasks->is_task_claimed(G1RP_PS_ClassLoaderDataGraph_oops_do)) {
+    if (!_process_strong_tasks.is_task_claimed(G1RP_PS_ClassLoaderDataGraph_oops_do)) {
       ClassLoaderDataGraph::roots_cld_do(strong_clds, weak_clds);
     }
   }
@@ -264,49 +264,49 @@
                                        uint worker_i) {
   {
     G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::UniverseRoots, worker_i);
-    if (!_process_strong_tasks->is_task_claimed(G1RP_PS_Universe_oops_do)) {
+    if (!_process_strong_tasks.is_task_claimed(G1RP_PS_Universe_oops_do)) {
       Universe::oops_do(strong_roots);
     }
   }
 
   {
     G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::JNIRoots, worker_i);
-    if (!_process_strong_tasks->is_task_claimed(G1RP_PS_JNIHandles_oops_do)) {
+    if (!_process_strong_tasks.is_task_claimed(G1RP_PS_JNIHandles_oops_do)) {
       JNIHandles::oops_do(strong_roots);
     }
   }
 
   {
     G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::ObjectSynchronizerRoots, worker_i);
-    if (!_process_strong_tasks-> is_task_claimed(G1RP_PS_ObjectSynchronizer_oops_do)) {
+    if (!_process_strong_tasks.is_task_claimed(G1RP_PS_ObjectSynchronizer_oops_do)) {
       ObjectSynchronizer::oops_do(strong_roots);
     }
   }
 
   {
     G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::FlatProfilerRoots, worker_i);
-    if (!_process_strong_tasks->is_task_claimed(G1RP_PS_FlatProfiler_oops_do)) {
+    if (!_process_strong_tasks.is_task_claimed(G1RP_PS_FlatProfiler_oops_do)) {
       FlatProfiler::oops_do(strong_roots);
     }
   }
 
   {
     G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::ManagementRoots, worker_i);
-    if (!_process_strong_tasks->is_task_claimed(G1RP_PS_Management_oops_do)) {
+    if (!_process_strong_tasks.is_task_claimed(G1RP_PS_Management_oops_do)) {
       Management::oops_do(strong_roots);
     }
   }
 
   {
     G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::JVMTIRoots, worker_i);
-    if (!_process_strong_tasks->is_task_claimed(G1RP_PS_jvmti_oops_do)) {
+    if (!_process_strong_tasks.is_task_claimed(G1RP_PS_jvmti_oops_do)) {
       JvmtiExport::oops_do(strong_roots);
     }
   }
 
   {
     G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::SystemDictionaryRoots, worker_i);
-    if (!_process_strong_tasks->is_task_claimed(G1RP_PS_SystemDictionary_oops_do)) {
+    if (!_process_strong_tasks.is_task_claimed(G1RP_PS_SystemDictionary_oops_do)) {
       SystemDictionary::roots_oops_do(strong_roots, weak_roots);
     }
   }
--- a/src/share/vm/gc/g1/g1RootProcessor.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/g1RootProcessor.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -45,7 +45,7 @@
 // worker thread call the process_roots methods.
 class G1RootProcessor : public StackObj {
   G1CollectedHeap* _g1h;
-  SubTasksDone* _process_strong_tasks;
+  SubTasksDone _process_strong_tasks;
   StrongRootsScope _srs;
 
   // Used to implement the Thread work barrier.
--- a/src/share/vm/gc/g1/heapRegion.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/heapRegion.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -109,7 +109,7 @@
 // evacuation pauses between two cleanups, which is _highly_ unlikely.
 class G1OffsetTableContigSpace: public CompactibleSpace {
   friend class VMStructs;
-  HeapWord* _top;
+  HeapWord* volatile _top;
   HeapWord* volatile _scan_top;
  protected:
   G1BlockOffsetArrayContigSpace _offsets;
@@ -134,10 +134,18 @@
   // Reset the G1OffsetTableContigSpace.
   virtual void initialize(MemRegion mr, bool clear_space, bool mangle_space);
 
-  HeapWord** top_addr() { return &_top; }
-  // Allocation helpers (return NULL if full).
-  inline HeapWord* allocate_impl(size_t word_size, HeapWord* end_value);
-  inline HeapWord* par_allocate_impl(size_t word_size, HeapWord* end_value);
+  HeapWord* volatile* top_addr() { return &_top; }
+  // Try to allocate at least min_word_size and up to desired_size from this Space.
+  // Returns NULL if not possible, otherwise sets actual_word_size to the amount of
+  // space allocated.
+  // This version assumes that all allocation requests to this Space are properly
+  // synchronized.
+  inline HeapWord* allocate_impl(size_t min_word_size, size_t desired_word_size, size_t* actual_word_size);
+  // Try to allocate at least min_word_size and up to desired_size from this Space.
+  // Returns NULL if not possible, otherwise sets actual_word_size to the amount of
+  // space allocated.
+  // This version synchronizes with other calls to par_allocate_impl().
+  inline HeapWord* par_allocate_impl(size_t min_word_size, size_t desired_word_size, size_t* actual_word_size);
 
  public:
   void reset_after_compaction() { set_top(compaction_top()); }
@@ -179,9 +187,14 @@
   HeapWord* block_start(const void* p);
   HeapWord* block_start_const(const void* p) const;
 
-  // Add offset table update.
+  // Allocation (return NULL if full).  Assumes the caller has established
+  // mutually exclusive access to the space.
+  HeapWord* allocate(size_t min_word_size, size_t desired_word_size, size_t* actual_word_size);
+  // Allocation (return NULL if full).  Enforces mutual exclusion internally.
+  HeapWord* par_allocate(size_t min_word_size, size_t desired_word_size, size_t* actual_word_size);
+
   virtual HeapWord* allocate(size_t word_size);
-  HeapWord* par_allocate(size_t word_size);
+  virtual HeapWord* par_allocate(size_t word_size);
 
   HeapWord* saved_mark_word() const { ShouldNotReachHere(); return NULL; }
 
@@ -351,8 +364,9 @@
   // Override for scan_and_forward support.
   void prepare_for_compaction(CompactPoint* cp);
 
-  inline HeapWord* par_allocate_no_bot_updates(size_t word_size);
+  inline HeapWord* par_allocate_no_bot_updates(size_t min_word_size, size_t desired_word_size, size_t* actual_word_size);
   inline HeapWord* allocate_no_bot_updates(size_t word_size);
+  inline HeapWord* allocate_no_bot_updates(size_t min_word_size, size_t desired_word_size, size_t* actual_word_size);
 
   // If this region is a member of a HeapRegionManager, the index in that
   // sequence, otherwise -1.
--- a/src/share/vm/gc/g1/heapRegion.inline.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/heapRegion.inline.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -32,33 +32,39 @@
 #include "oops/oop.inline.hpp"
 #include "runtime/atomic.inline.hpp"
 
-// This version requires locking.
-inline HeapWord* G1OffsetTableContigSpace::allocate_impl(size_t size,
-                                                HeapWord* const end_value) {
+inline HeapWord* G1OffsetTableContigSpace::allocate_impl(size_t min_word_size,
+                                                         size_t desired_word_size,
+                                                         size_t* actual_size) {
   HeapWord* obj = top();
-  if (pointer_delta(end_value, obj) >= size) {
-    HeapWord* new_top = obj + size;
+  size_t available = pointer_delta(end(), obj);
+  size_t want_to_allocate = MIN2(available, desired_word_size);
+  if (want_to_allocate >= min_word_size) {
+    HeapWord* new_top = obj + want_to_allocate;
     set_top(new_top);
     assert(is_aligned(obj) && is_aligned(new_top), "checking alignment");
+    *actual_size = want_to_allocate;
     return obj;
   } else {
     return NULL;
   }
 }
 
-// This version is lock-free.
-inline HeapWord* G1OffsetTableContigSpace::par_allocate_impl(size_t size,
-                                                    HeapWord* const end_value) {
+inline HeapWord* G1OffsetTableContigSpace::par_allocate_impl(size_t min_word_size,
+                                                             size_t desired_word_size,
+                                                             size_t* actual_size) {
   do {
     HeapWord* obj = top();
-    if (pointer_delta(end_value, obj) >= size) {
-      HeapWord* new_top = obj + size;
+    size_t available = pointer_delta(end(), obj);
+    size_t want_to_allocate = MIN2(available, desired_word_size);
+    if (want_to_allocate >= min_word_size) {
+      HeapWord* new_top = obj + want_to_allocate;
       HeapWord* result = (HeapWord*)Atomic::cmpxchg_ptr(new_top, top_addr(), obj);
       // result can be one of two:
       //  the old top value: the exchange succeeded
       //  otherwise: the new value of the top is returned.
       if (result == obj) {
         assert(is_aligned(obj) && is_aligned(new_top), "checking alignment");
+        *actual_size = want_to_allocate;
         return obj;
       }
     } else {
@@ -67,20 +73,34 @@
   } while (true);
 }
 
-inline HeapWord* G1OffsetTableContigSpace::allocate(size_t size) {
-  HeapWord* res = allocate_impl(size, end());
+inline HeapWord* G1OffsetTableContigSpace::allocate(size_t min_word_size,
+                                                    size_t desired_word_size,
+                                                    size_t* actual_size) {
+  HeapWord* res = allocate_impl(min_word_size, desired_word_size, actual_size);
   if (res != NULL) {
-    _offsets.alloc_block(res, size);
+    _offsets.alloc_block(res, *actual_size);
   }
   return res;
 }
 
+inline HeapWord* G1OffsetTableContigSpace::allocate(size_t word_size) {
+  size_t temp;
+  return allocate(word_size, word_size, &temp);
+}
+
+inline HeapWord* G1OffsetTableContigSpace::par_allocate(size_t word_size) {
+  size_t temp;
+  return par_allocate(word_size, word_size, &temp);
+}
+
 // Because of the requirement of keeping "_offsets" up to date with the
 // allocations, we sequentialize these with a lock.  Therefore, best if
 // this is used for larger LAB allocations only.
-inline HeapWord* G1OffsetTableContigSpace::par_allocate(size_t size) {
+inline HeapWord* G1OffsetTableContigSpace::par_allocate(size_t min_word_size,
+                                                        size_t desired_word_size,
+                                                        size_t* actual_size) {
   MutexLocker x(&_par_alloc_lock);
-  return allocate(size);
+  return allocate(min_word_size, desired_word_size, actual_size);
 }
 
 inline HeapWord* G1OffsetTableContigSpace::block_start(const void* p) {
@@ -128,14 +148,23 @@
   return pointer_delta(next, addr);
 }
 
-inline HeapWord* HeapRegion::par_allocate_no_bot_updates(size_t word_size) {
+inline HeapWord* HeapRegion::par_allocate_no_bot_updates(size_t min_word_size,
+                                                         size_t desired_word_size,
+                                                         size_t* actual_word_size) {
   assert(is_young(), "we can only skip BOT updates on young regions");
-  return par_allocate_impl(word_size, end());
+  return par_allocate_impl(min_word_size, desired_word_size, actual_word_size);
 }
 
 inline HeapWord* HeapRegion::allocate_no_bot_updates(size_t word_size) {
+  size_t temp;
+  return allocate_no_bot_updates(word_size, word_size, &temp);
+}
+
+inline HeapWord* HeapRegion::allocate_no_bot_updates(size_t min_word_size,
+                                                     size_t desired_word_size,
+                                                     size_t* actual_word_size) {
   assert(is_young(), "we can only skip BOT updates on young regions");
-  return allocate_impl(word_size, end());
+  return allocate_impl(min_word_size, desired_word_size, actual_word_size);
 }
 
 inline void HeapRegion::note_start_of_marking() {
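
The new allocate_impl()/par_allocate_impl() above take a [min_word_size, desired_word_size] range, clamp the request to whatever is left in the region, and report the actual size back to the caller; the parallel variant retries a compare-and-swap on _top. A standalone sketch of that scheme with std::atomic in place of Atomic::cmpxchg_ptr (SpaceSketch is illustrative, not a HotSpot type):

    #include <algorithm>
    #include <atomic>
    #include <cstddef>

    struct SpaceSketch {
      size_t capacity_words;
      std::atomic<size_t> top{0};

      // Lock-free variant, cf. par_allocate_impl: clamp the request to what is
      // left, succeed if that still covers min_words, and CAS the new top.
      bool par_allocate(size_t min_words, size_t desired_words,
                        size_t* offset, size_t* actual_words) {
        for (;;) {
          size_t cur = top.load(std::memory_order_relaxed);
          size_t available = capacity_words - cur;
          size_t want = std::min(available, desired_words);
          if (want < min_words) return false;              // full for this request
          if (top.compare_exchange_weak(cur, cur + want)) {
            *offset = cur;                                 // start of the allocation
            *actual_words = want;
            return true;
          }
          // lost the race: compare_exchange reloaded 'cur', retry
        }
      }
    };

Letting the region hand out less than the desired size, as long as the minimum fits, is what allows PLAB refills to consume region tails instead of counting them as region-end waste.
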
--- a/src/share/vm/gc/g1/heapRegionManager.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/heapRegionManager.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -428,7 +428,7 @@
 
     uncommit_regions(idx_last_found + num_last_found - to_remove, to_remove);
 
-    cur -= num_last_found;
+    cur = idx_last_found;
     removed += to_remove;
   }
 
--- a/src/share/vm/gc/g1/vmStructs_g1.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/g1/vmStructs_g1.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -34,7 +34,7 @@
   static_field(HeapRegion, GrainBytes,        size_t)                         \
   static_field(HeapRegion, LogOfHRGrainBytes, int)                            \
                                                                               \
-  nonstatic_field(G1OffsetTableContigSpace, _top,       HeapWord*)            \
+  nonstatic_field(G1OffsetTableContigSpace, _top,       HeapWord* volatile)   \
                                                                               \
   nonstatic_field(G1HeapRegionTable, _base,             address)              \
   nonstatic_field(G1HeapRegionTable, _length,           size_t)               \
--- a/src/share/vm/gc/shared/collectorPolicy.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/shared/collectorPolicy.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -225,6 +225,10 @@
   return align_size_up(3 * _space_alignment, _gen_alignment);
 }
 
+size_t GenCollectorPolicy::old_gen_size_lower_bound() {
+  return align_size_up(_space_alignment, _gen_alignment);
+}
+
 #ifdef ASSERT
 void GenCollectorPolicy::assert_flags() {
   CollectorPolicy::assert_flags();
@@ -284,7 +288,7 @@
 
   // Make sure the heap is large enough for two generations
   size_t smallest_new_size = young_gen_size_lower_bound();
-  size_t smallest_heap_size = align_size_up(smallest_new_size + align_size_up(_space_alignment, _gen_alignment),
+  size_t smallest_heap_size = align_size_up(smallest_new_size + old_gen_size_lower_bound(),
                                            _heap_alignment);
   if (MaxHeapSize < smallest_heap_size) {
     FLAG_SET_ERGO(size_t, MaxHeapSize, smallest_heap_size);
@@ -356,6 +360,7 @@
     vm_exit_during_initialization("Invalid young gen ratio specified");
   }
 
+  OldSize = MAX2(OldSize, old_gen_size_lower_bound());
   if (!is_size_aligned(OldSize, _gen_alignment)) {
     // Setting OldSize directly to preserve information about the possible
     // setting of OldSize on the command line.
--- a/src/share/vm/gc/shared/collectorPolicy.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/shared/collectorPolicy.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -282,6 +282,8 @@
 
   size_t young_gen_size_lower_bound();
 
+  size_t old_gen_size_lower_bound();
+
   HeapWord* mem_allocate_work(size_t size,
                               bool is_tlab,
                               bool* gc_overhead_limit_was_exceeded);
--- a/src/share/vm/gc/shared/gcHeapSummary.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/shared/gcHeapSummary.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -189,4 +189,44 @@
 
 };
 
+class G1EvacSummary : public StackObj {
+private:
+  size_t _allocated;          // Total allocated
+  size_t _wasted;             // of which wasted (internal fragmentation)
+  size_t _undo_wasted;        // of which wasted on undo (is not used for calculation of PLAB size)
+  size_t _unused;             // Unused in last buffer
+  size_t _used;
+
+  size_t _region_end_waste; // Number of words wasted due to skipping to the next region.
+  uint   _regions_filled;   // Number of regions filled completely.
+  size_t _direct_allocated; // Number of words allocated directly into the regions.
+
+  // Number of words in live objects remaining in regions that ultimately suffered an
+  // evacuation failure. This is recorded when the failed regions are turned into old regions.
+  size_t _failure_used;
+  // Number of words wasted in regions which failed evacuation. This is the sum of space
+  // for objects successfully copied out of the regions (now dead space) plus waste at the
+  // end of regions.
+  size_t _failure_waste;
+public:
+  G1EvacSummary(size_t allocated, size_t wasted, size_t undo_wasted, size_t unused,
+    size_t used, size_t region_end_waste, uint regions_filled, size_t direct_allocated,
+    size_t failure_used, size_t failure_waste) :
+    _allocated(allocated), _wasted(wasted), _undo_wasted(undo_wasted), _unused(unused),
+    _used(used),  _region_end_waste(region_end_waste), _regions_filled(regions_filled),
+    _direct_allocated(direct_allocated), _failure_used(failure_used), _failure_waste(failure_waste)
+  { }
+
+  size_t allocated() const { return _allocated; }
+  size_t wasted() const { return _wasted; }
+  size_t undo_wasted() const { return _undo_wasted; }
+  size_t unused() const { return _unused; }
+  size_t used() const { return _used; }
+  size_t region_end_waste() const { return _region_end_waste; }
+  uint regions_filled() const { return _regions_filled; }
+  size_t direct_allocated() const { return _direct_allocated; }
+  size_t failure_used() const { return _failure_used; }
+  size_t failure_waste() const { return _failure_waste; }
+};
+
 #endif // SHARE_VM_GC_SHARED_GCHEAPSUMMARY_HPP
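
G1EvacSummary mirrors the counters of G1EvacStats one for one, and create_g1_evac_summary() is declared in g1CollectedHeap.hpp above but not defined in the hunks shown. Assuming it simply copies the counters in the constructor's parameter order, a plausible sketch of it is:

    #include "precompiled.hpp"
    #include "gc/g1/g1CollectedHeap.hpp"
    #include "gc/g1/g1EvacStats.hpp"
    #include "gc/shared/gcHeapSummary.hpp"

    // Assumed definition: copy the G1EvacStats counters into the value object
    // handed to the tracer; the exact body is not part of this changeset.
    G1EvacSummary G1CollectedHeap::create_g1_evac_summary(G1EvacStats* stats) {
      return G1EvacSummary(stats->allocated(), stats->wasted(), stats->undo_wasted(),
                           stats->unused(), stats->used(), stats->region_end_waste(),
                           stats->regions_filled(), stats->direct_allocated(),
                           stats->failure_used(), stats->failure_waste());
    }
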
--- a/src/share/vm/gc/shared/gcTrace.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/shared/gcTrace.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -252,4 +252,12 @@
   send_evacuation_failed_event(ef_info);
   ef_info.reset();
 }
+
+void G1NewTracer::report_evacuation_statistics(const G1EvacSummary& young_summary, const G1EvacSummary& old_summary) const {
+  assert_set_gc_id();
+
+  send_young_evacuation_statistics(young_summary);
+  send_old_evacuation_statistics(old_summary);
+}
+
 #endif
--- a/src/share/vm/gc/shared/gcTrace.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/shared/gcTrace.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -45,6 +45,7 @@
 class MetaspaceSummary;
 class PSHeapSummary;
 class G1HeapSummary;
+class G1EvacSummary;
 class ReferenceProcessorStats;
 class TimePartitions;
 class BoolObjectClosure;
@@ -257,10 +258,14 @@
   void report_evacuation_info(EvacuationInfo* info);
   void report_evacuation_failed(EvacuationFailedInfo& ef_info);
 
+  void report_evacuation_statistics(const G1EvacSummary& young_summary, const G1EvacSummary& old_summary) const;
  private:
   void send_g1_young_gc_event();
   void send_evacuation_info_event(EvacuationInfo* info);
   void send_evacuation_failed_event(const EvacuationFailedInfo& ef_info) const;
+
+  void send_young_evacuation_statistics(const G1EvacSummary& summary) const;
+  void send_old_evacuation_statistics(const G1EvacSummary& summary) const;
 };
 #endif
 
--- a/src/share/vm/gc/shared/gcTraceSend.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/shared/gcTraceSend.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -234,6 +234,37 @@
     e.commit();
   }
 }
+
+static TraceStructG1EvacStats create_g1_evacstats(unsigned gcid, const G1EvacSummary& summary) {
+  TraceStructG1EvacStats s;
+  s.set_gcId(gcid);
+  s.set_allocated(summary.allocated() * HeapWordSize);
+  s.set_wasted(summary.wasted() * HeapWordSize);
+  s.set_used(summary.used() * HeapWordSize);
+  s.set_undoWaste(summary.undo_wasted() * HeapWordSize);
+  s.set_regionEndWaste(summary.region_end_waste() * HeapWordSize);
+  s.set_regionsRefilled(summary.regions_filled());
+  s.set_directAllocated(summary.direct_allocated() * HeapWordSize);
+  s.set_failureUsed(summary.failure_used() * HeapWordSize);
+  s.set_failureWaste(summary.failure_waste() * HeapWordSize);
+  return s;
+}
+
+void G1NewTracer::send_young_evacuation_statistics(const G1EvacSummary& summary) const {
+  EventGCG1EvacuationYoungStatistics surv_evt;
+  if (surv_evt.should_commit()) {
+    surv_evt.set_stats(create_g1_evacstats(_shared_gc_info.gc_id().id(), summary));
+    surv_evt.commit();
+  }
+}
+
+void G1NewTracer::send_old_evacuation_statistics(const G1EvacSummary& summary) const {
+  EventGCG1EvacuationOldStatistics old_evt;
+  if (old_evt.should_commit()) {
+    old_evt.set_stats(create_g1_evacstats(_shared_gc_info.gc_id().id(), summary));
+    old_evt.commit();
+  }
+}
 #endif
 
 static TraceStructVirtualSpace to_trace_struct(const VirtualSpaceSummary& summary) {
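
The send helpers above convert each statistic from HeapWords to bytes when filling in the event, and they test should_commit() first so no payload is assembled while the event is disabled. The same guard-then-convert pattern, shown standalone (the names here are illustrative stand-ins, not the HotSpot event API):

  #include <cstddef>
  #include <cstdio>

  // Illustrative stand-in for a generated trace event type.
  struct ExampleEvent {
    bool   enabled;
    size_t used_bytes;
    bool should_commit() const { return enabled; }
    void commit() const { std::printf("used: %zu bytes\n", used_bytes); }
  };

  // Word counts are converted to bytes exactly once, at the tracing boundary.
  void send_used_statistic(size_t used_in_words, size_t heap_word_size, bool tracing_enabled) {
    ExampleEvent e = { tracing_enabled, 0 };
    if (e.should_commit()) {          // skip all work when the event is disabled
      e.used_bytes = used_in_words * heap_word_size;
      e.commit();
    }
  }
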
--- a/src/share/vm/gc/shared/genCollectedHeap.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/shared/genCollectedHeap.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -86,7 +86,7 @@
 {
   assert(policy != NULL, "Sanity check");
   if (UseConcMarkSweepGC) {
-    _workers = new FlexibleWorkGang("GC Thread", ParallelGCThreads,
+    _workers = new WorkGang("GC Thread", ParallelGCThreads,
                             /* are_GC_task_threads */true,
                             /* are_ConcurrentGC_threads */false);
     _workers->initialize_workers();
--- a/src/share/vm/gc/shared/genCollectedHeap.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/shared/genCollectedHeap.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -30,9 +30,9 @@
 #include "gc/shared/collectorPolicy.hpp"
 #include "gc/shared/generation.hpp"
 
-class FlexibleWorkGang;
 class StrongRootsScope;
 class SubTasksDone;
+class WorkGang;
 
 // A "GenCollectedHeap" is a CollectedHeap that uses generational
 // collection.  It has two generations, young and old.
@@ -90,7 +90,7 @@
   // In block contents verification, the number of header words to skip
   NOT_PRODUCT(static size_t _skip_header_HeapWords;)
 
-  FlexibleWorkGang* _workers;
+  WorkGang* _workers;
 
 protected:
   // Helper functions for allocation
@@ -124,7 +124,7 @@
 public:
   GenCollectedHeap(GenCollectorPolicy *policy);
 
-  FlexibleWorkGang* workers() const { return _workers; }
+  WorkGang* workers() const { return _workers; }
 
   GCStats* gc_stats(Generation* generation) const;
 
--- a/src/share/vm/gc/shared/plab.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/shared/plab.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -24,7 +24,7 @@
 
 #include "precompiled.hpp"
 #include "gc/shared/collectedHeap.hpp"
-#include "gc/shared/plab.hpp"
+#include "gc/shared/plab.inline.hpp"
 #include "gc/shared/threadLocalAllocBuffer.hpp"
 #include "oops/arrayOop.hpp"
 #include "oops/oop.inline.hpp"
--- a/src/share/vm/gc/shared/plab.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/shared/plab.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -27,7 +27,6 @@
 
 #include "gc/shared/gcUtil.hpp"
 #include "memory/allocation.hpp"
-#include "runtime/atomic.hpp"
 #include "utilities/globalDefinitions.hpp"
 
 // Forward declarations.
@@ -75,6 +74,8 @@
   PLAB(size_t word_sz);
   virtual ~PLAB() {}
 
+  static size_t size_required_for_allocation(size_t word_size) { return word_size + AlignmentReserve; }
+
   // Minimum PLAB size.
   static size_t min_size();
   // Maximum PLAB size.
@@ -95,7 +96,7 @@
   }
 
   // Allocate the object aligned to "alignment_in_bytes".
-  HeapWord* allocate_aligned(size_t word_sz, unsigned short alignment_in_bytes);
+  inline HeapWord* allocate_aligned(size_t word_sz, unsigned short alignment_in_bytes);
 
   // Undo any allocation in the buffer, which is required to be of the
   // "obj" of the given "word_sz".
@@ -108,13 +109,6 @@
   size_t waste() { return _wasted; }
   size_t undo_waste() { return _undo_wasted; }
 
-  // Should only be done if we are about to reset with a new buffer of the
-  // given size.
-  void set_word_size(size_t new_word_sz) {
-    assert(new_word_sz > AlignmentReserve, "Too small");
-    _word_sz = new_word_sz;
-  }
-
   // The number of words of unallocated space remaining in the buffer.
   size_t words_remaining() {
     assert(_end >= _top, "Negative buffer");
@@ -126,7 +120,10 @@
   }
 
   // Sets the space of the buffer to be [buf, space+word_sz()).
-  virtual void set_buf(HeapWord* buf) {
+  virtual void set_buf(HeapWord* buf, size_t new_word_sz) {
+    assert(new_word_sz > AlignmentReserve, "Too small");
+    _word_sz = new_word_sz;
+
     _bottom   = buf;
     _top      = _bottom;
     _hard_end = _bottom + word_sz();
@@ -149,7 +146,8 @@
 };
 
 // PLAB book-keeping.
-class PLABStats VALUE_OBJ_CLASS_SPEC {
+class PLABStats : public CHeapObj<mtGC> {
+ protected:
   size_t _allocated;          // Total allocated
   size_t _wasted;             // of which wasted (internal fragmentation)
   size_t _undo_wasted;        // of which wasted on undo (is not used for calculation of PLAB size)
@@ -158,7 +156,7 @@
   AdaptiveWeightedAverage
          _filter;             // Integrator with decay
 
-  void reset() {
+  virtual void reset() {
     _allocated   = 0;
     _wasted      = 0;
     _undo_wasted = 0;
@@ -174,6 +172,8 @@
     _filter(wt)
   { }
 
+  virtual ~PLABStats() { }
+
   static const size_t min_size() {
     return PLAB::min_size();
   }
@@ -187,23 +187,15 @@
 
   // Updates the current desired PLAB size. Computes the new desired PLAB size with one gc worker thread,
   // updates _desired_plab_sz and clears sensor accumulators.
-  void adjust_desired_plab_sz();
+  virtual void adjust_desired_plab_sz();
 
-  void add_allocated(size_t v) {
-    Atomic::add_ptr(v, &_allocated);
-  }
+  inline void add_allocated(size_t v);
 
-  void add_unused(size_t v) {
-    Atomic::add_ptr(v, &_unused);
-  }
+  inline void add_unused(size_t v);
 
-  void add_wasted(size_t v) {
-    Atomic::add_ptr(v, &_wasted);
-  }
+  inline void add_wasted(size_t v);
 
-  void add_undo_wasted(size_t v) {
-    Atomic::add_ptr(v, &_undo_wasted);
-  }
+  inline void add_undo_wasted(size_t v);
 };
 
 #endif // SHARE_VM_GC_SHARED_PLAB_HPP
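
With this change a PLAB is resized at refill time: set_buf() now takes the new buffer together with its word size, and size_required_for_allocation() tells callers how many words a refill must provide for a given allocation once the alignment reserve is included. A hedged sketch of the refill flow (allocate_new_buffer is a hypothetical helper; only set_buf, allocate and size_required_for_allocation come from the PLAB API):

  // Illustrative refill flow; would need the HotSpot headers to compile.
  void refill_and_allocate(PLAB& plab, size_t word_sz) {
    // Words the new buffer must hold so that word_sz still fits after the
    // alignment reserve is accounted for.
    size_t required = PLAB::size_required_for_allocation(word_sz);

    size_t actual_sz = 0;
    HeapWord* buf = allocate_new_buffer(required, &actual_sz);  // hypothetical helper
    if (buf != NULL) {
      plab.set_buf(buf, actual_sz);  // buffer and its size now travel together
      HeapWord* obj = plab.allocate(word_sz);
      // ... initialize the object at obj ...
    }
  }
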
--- a/src/share/vm/gc/shared/plab.inline.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/shared/plab.inline.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -27,9 +27,10 @@
 
 #include "gc/shared/collectedHeap.inline.hpp"
 #include "gc/shared/plab.hpp"
+#include "memory/allocation.inline.hpp"
+#include "runtime/atomic.inline.hpp"
 
-HeapWord* PLAB::allocate_aligned(size_t word_sz, unsigned short alignment_in_bytes) {
-
+inline HeapWord* PLAB::allocate_aligned(size_t word_sz, unsigned short alignment_in_bytes) {
   HeapWord* res = CollectedHeap::align_allocation_or_fail(_top, _end, alignment_in_bytes);
   if (res == NULL) {
     return NULL;
@@ -41,4 +42,20 @@
   return allocate(word_sz);
 }
 
+void PLABStats::add_allocated(size_t v) {
+  Atomic::add_ptr(v, &_allocated);
+}
+
+void PLABStats::add_unused(size_t v) {
+  Atomic::add_ptr(v, &_unused);
+}
+
+void PLABStats::add_wasted(size_t v) {
+  Atomic::add_ptr(v, &_wasted);
+}
+
+void PLABStats::add_undo_wasted(size_t v) {
+  Atomic::add_ptr(v, &_undo_wasted);
+}
+
 #endif // SHARE_VM_GC_SHARED_PLAB_INLINE_HPP
--- a/src/share/vm/gc/shared/workgroup.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/shared/workgroup.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -28,58 +28,25 @@
 #include "memory/allocation.inline.hpp"
 #include "runtime/atomic.inline.hpp"
 #include "runtime/os.hpp"
+#include "runtime/semaphore.hpp"
+#include "runtime/thread.inline.hpp"
 
 // Definitions of WorkGang methods.
 
-AbstractWorkGang::AbstractWorkGang(const char* name,
-                                   bool  are_GC_task_threads,
-                                   bool  are_ConcurrentGC_threads) :
-  _name(name),
-  _are_GC_task_threads(are_GC_task_threads),
-  _are_ConcurrentGC_threads(are_ConcurrentGC_threads) {
-
-  assert(!(are_GC_task_threads && are_ConcurrentGC_threads),
-         "They cannot both be STW GC and Concurrent threads" );
-
-  // Other initialization.
-  _monitor = new Monitor(/* priority */       Mutex::leaf,
-                         /* name */           "WorkGroup monitor",
-                         /* allow_vm_block */ are_GC_task_threads,
-                                              Monitor::_safepoint_check_sometimes);
-  assert(monitor() != NULL, "Failed to allocate monitor");
-  _task = NULL;
-  _sequence_number = 0;
-  _started_workers = 0;
-  _finished_workers = 0;
-}
-
-WorkGang::WorkGang(const char* name,
-                   uint        workers,
-                   bool        are_GC_task_threads,
-                   bool        are_ConcurrentGC_threads) :
-  AbstractWorkGang(name, are_GC_task_threads, are_ConcurrentGC_threads) {
-  _total_workers = workers;
-}
-
-GangWorker* WorkGang::allocate_worker(uint which) {
-  GangWorker* new_worker = new GangWorker(this, which);
-  return new_worker;
-}
-
 // The current implementation will exit if the allocation
 // of any worker fails.  Still, return a boolean so that
 // a future implementation can possibly do a partial
 // initialization of the workers and report such to the
 // caller.
-bool WorkGang::initialize_workers() {
+bool AbstractWorkGang::initialize_workers() {
 
   if (TraceWorkGang) {
     tty->print_cr("Constructing work gang %s with %d threads",
                   name(),
                   total_workers());
   }
-  _gang_workers = NEW_C_HEAP_ARRAY(GangWorker*, total_workers(), mtInternal);
-  if (gang_workers() == NULL) {
+  _workers = NEW_C_HEAP_ARRAY(AbstractGangWorker*, total_workers(), mtInternal);
+  if (_workers == NULL) {
     vm_exit_out_of_memory(0, OOM_MALLOC_ERROR, "Cannot create GangWorker array.");
     return false;
   }
@@ -90,9 +57,9 @@
     worker_type = os::pgc_thread;
   }
   for (uint worker = 0; worker < total_workers(); worker += 1) {
-    GangWorker* new_worker = allocate_worker(worker);
+    AbstractGangWorker* new_worker = allocate_worker(worker);
     assert(new_worker != NULL, "Failed to allocate GangWorker");
-    _gang_workers[worker] = new_worker;
+    _workers[worker] = new_worker;
     if (new_worker == NULL || !os::create_thread(new_worker, worker_type)) {
       vm_exit_out_of_memory(0, OOM_MALLOC_ERROR,
               "Cannot create worker GC thread. Out of system resources.");
@@ -105,110 +72,208 @@
   return true;
 }
 
-GangWorker* AbstractWorkGang::gang_worker(uint i) const {
+AbstractGangWorker* AbstractWorkGang::worker(uint i) const {
   // Array index bounds checking.
-  GangWorker* result = NULL;
-  assert(gang_workers() != NULL, "No workers for indexing");
+  AbstractGangWorker* result = NULL;
+  assert(_workers != NULL, "No workers for indexing");
   assert(i < total_workers(), "Worker index out of bounds");
-  result = _gang_workers[i];
+  result = _workers[i];
   assert(result != NULL, "Indexing to null worker");
   return result;
 }
 
-void WorkGang::run_task(AbstractGangTask* task) {
-  run_task(task, total_workers());
-}
-
-void WorkGang::run_task(AbstractGangTask* task, uint no_of_parallel_workers) {
-  // This thread is executed by the VM thread which does not block
-  // on ordinary MutexLocker's.
-  MutexLockerEx ml(monitor(), Mutex::_no_safepoint_check_flag);
-  if (TraceWorkGang) {
-    tty->print_cr("Running work gang %s task %s", name(), task->name());
-  }
-  // Tell all the workers to run a task.
-  assert(task != NULL, "Running a null task");
-  // Initialize.
-  _task = task;
-  _sequence_number += 1;
-  _started_workers = 0;
-  _finished_workers = 0;
-  // Tell the workers to get to work.
-  monitor()->notify_all();
-  // Wait for them to be finished
-  while (finished_workers() < no_of_parallel_workers) {
-    if (TraceWorkGang) {
-      tty->print_cr("Waiting in work gang %s: %u/%u finished sequence %d",
-                    name(), finished_workers(), no_of_parallel_workers,
-                    _sequence_number);
-    }
-    monitor()->wait(/* no_safepoint_check */ true);
-  }
-  _task = NULL;
-  if (TraceWorkGang) {
-    tty->print_cr("\nFinished work gang %s: %u/%u sequence %d",
-                  name(), finished_workers(), no_of_parallel_workers,
-                  _sequence_number);
-    Thread* me = Thread::current();
-    tty->print_cr("  T: " PTR_FORMAT "  VM_thread: %d", p2i(me), me->is_VM_thread());
-  }
-}
-
-void FlexibleWorkGang::run_task(AbstractGangTask* task) {
-  // If active_workers() is passed, _finished_workers
-  // must only be incremented for workers that find non_null
-  // work (as opposed to all those that just check that the
-  // task is not null).
-  WorkGang::run_task(task, (uint) active_workers());
-}
-
-void AbstractWorkGang::internal_worker_poll(WorkData* data) const {
-  assert(monitor()->owned_by_self(), "worker_poll is an internal method");
-  assert(data != NULL, "worker data is null");
-  data->set_task(task());
-  data->set_sequence_number(sequence_number());
-}
-
-void AbstractWorkGang::internal_note_start() {
-  assert(monitor()->owned_by_self(), "note_finish is an internal method");
-  _started_workers += 1;
-}
-
-void AbstractWorkGang::internal_note_finish() {
-  assert(monitor()->owned_by_self(), "note_finish is an internal method");
-  _finished_workers += 1;
-}
-
 void AbstractWorkGang::print_worker_threads_on(outputStream* st) const {
-  uint    num_thr = total_workers();
-  for (uint i = 0; i < num_thr; i++) {
-    gang_worker(i)->print_on(st);
+  uint workers = total_workers();
+  for (uint i = 0; i < workers; i++) {
+    worker(i)->print_on(st);
     st->cr();
   }
 }
 
 void AbstractWorkGang::threads_do(ThreadClosure* tc) const {
   assert(tc != NULL, "Null ThreadClosure");
-  uint num_thr = total_workers();
-  for (uint i = 0; i < num_thr; i++) {
-    tc->do_thread(gang_worker(i));
+  uint workers = total_workers();
+  for (uint i = 0; i < workers; i++) {
+    tc->do_thread(worker(i));
   }
 }
 
-// GangWorker methods.
+// WorkGang dispatcher implemented with semaphores.
+//
+// Semaphores don't require the worker threads to re-claim the lock when they wake up.
+// This helps lower the latency when starting and stopping the worker threads.
+class SemaphoreGangTaskDispatcher : public GangTaskDispatcher {
+  // The task currently being dispatched to the GangWorkers.
+  AbstractGangTask* _task;
+
+  volatile uint _started;
+  volatile uint _not_finished;
+
+  // Semaphore used to start the GangWorkers.
+  Semaphore* _start_semaphore;
+  // Semaphore used to notify the coordinator that all workers are done.
+  Semaphore* _end_semaphore;
+
+public:
+  SemaphoreGangTaskDispatcher() :
+      _task(NULL),
+      _started(0),
+      _not_finished(0),
+      _start_semaphore(new Semaphore()),
+      _end_semaphore(new Semaphore())
+{ }
+
+  ~SemaphoreGangTaskDispatcher() {
+    delete _start_semaphore;
+    delete _end_semaphore;
+  }
+
+  void coordinator_execute_on_workers(AbstractGangTask* task, uint num_workers) {
+    // No workers are allowed to read the state variables until they have been signaled.
+    _task         = task;
+    _not_finished = num_workers;
+
+    // Dispatch 'num_workers' number of tasks.
+    _start_semaphore->signal(num_workers);
+
+    // Wait for the last worker to signal the coordinator.
+    _end_semaphore->wait();
+
+    // No workers are allowed to read the state variables after the coordinator has been signaled.
+    assert(_not_finished == 0, err_msg("%d not finished workers?", _not_finished));
+    _task    = NULL;
+    _started = 0;
+  }
+
+  WorkData worker_wait_for_task() {
+    // Wait for the coordinator to dispatch a task.
+    _start_semaphore->wait();
+
+    uint num_started = (uint) Atomic::add(1, (volatile jint*)&_started);
+
+    // Subtract one to get a zero-indexed worker id.
+    uint worker_id = num_started - 1;
+
+    return WorkData(_task, worker_id);
+  }
+
+  void worker_done_with_task() {
+    // Mark that the worker is done with the task.
+    // The worker is not allowed to read the state variables after this line.
+    uint not_finished = (uint) Atomic::add(-1, (volatile jint*)&_not_finished);
+
+    // The last worker signals to the coordinator that all work is completed.
+    if (not_finished == 0) {
+      _end_semaphore->signal();
+    }
+  }
+};
+
+class MutexGangTaskDispatcher : public GangTaskDispatcher {
+  AbstractGangTask* _task;
+
+  volatile uint _started;
+  volatile uint _finished;
+  volatile uint _num_workers;
+
+  Monitor* _monitor;
 
-GangWorker::GangWorker(AbstractWorkGang* gang, uint id) {
+ public:
+  MutexGangTaskDispatcher()
+      : _task(NULL),
+        _monitor(new Monitor(Monitor::leaf, "WorkGang dispatcher lock", false, Monitor::_safepoint_check_never)),
+        _started(0),
+        _finished(0),
+        _num_workers(0) {}
+
+  ~MutexGangTaskDispatcher() {
+    delete _monitor;
+  }
+
+  void coordinator_execute_on_workers(AbstractGangTask* task, uint num_workers) {
+    MutexLockerEx ml(_monitor, Mutex::_no_safepoint_check_flag);
+
+    _task        = task;
+    _num_workers = num_workers;
+
+    // Tell the workers to get to work.
+    _monitor->notify_all();
+
+    // Wait for them to finish.
+    while (_finished < _num_workers) {
+      _monitor->wait(/* no_safepoint_check */ true);
+    }
+
+    _task        = NULL;
+    _num_workers = 0;
+    _started     = 0;
+    _finished    = 0;
+  }
+
+  WorkData worker_wait_for_task() {
+    MonitorLockerEx ml(_monitor, Mutex::_no_safepoint_check_flag);
+
+    while (_num_workers == 0 || _started == _num_workers) {
+      _monitor->wait(/* no_safepoint_check */ true);
+    }
+
+    _started++;
+
+    // Subtract one to get a zero-indexed worker id.
+    uint worker_id = _started - 1;
+
+    return WorkData(_task, worker_id);
+  }
+
+  void worker_done_with_task() {
+    MonitorLockerEx ml(_monitor, Mutex::_no_safepoint_check_flag);
+
+    _finished++;
+
+    if (_finished == _num_workers) {
+      // This will wake up all workers and not only the coordinator.
+      _monitor->notify_all();
+    }
+  }
+};
+
+static GangTaskDispatcher* create_dispatcher() {
+  if (UseSemaphoreGCThreadsSynchronization) {
+    return new SemaphoreGangTaskDispatcher();
+  }
+
+  return new MutexGangTaskDispatcher();
+}
+
+WorkGang::WorkGang(const char* name,
+                   uint  workers,
+                   bool  are_GC_task_threads,
+                   bool  are_ConcurrentGC_threads) :
+    AbstractWorkGang(name, workers, are_GC_task_threads, are_ConcurrentGC_threads),
+    _dispatcher(create_dispatcher())
+{ }
+
+AbstractGangWorker* WorkGang::allocate_worker(uint worker_id) {
+  return new GangWorker(this, worker_id);
+}
+
+void WorkGang::run_task(AbstractGangTask* task) {
+  _dispatcher->coordinator_execute_on_workers(task, active_workers());
+}
+
+AbstractGangWorker::AbstractGangWorker(AbstractWorkGang* gang, uint id) {
   _gang = gang;
   set_id(id);
   set_name("%s#%d", gang->name(), id);
 }
 
-void GangWorker::run() {
+void AbstractGangWorker::run() {
   initialize();
   loop();
 }
 
-void GangWorker::initialize() {
+void AbstractGangWorker::initialize() {
   this->initialize_thread_local_storage();
   this->record_stack_base_and_size();
   this->initialize_named_thread();
@@ -224,112 +289,59 @@
          " of a work gang");
 }
 
-void GangWorker::loop() {
-  int previous_sequence_number = 0;
-  Monitor* gang_monitor = gang()->monitor();
-  for ( ; ; ) {
-    WorkData data;
-    int part;  // Initialized below.
-    {
-      // Grab the gang mutex.
-      MutexLocker ml(gang_monitor);
-      // Wait for something to do.
-      // Polling outside the while { wait } avoids missed notifies
-      // in the outer loop.
-      gang()->internal_worker_poll(&data);
-      if (TraceWorkGang) {
-        tty->print("Polled outside for work in gang %s worker %u",
-                   gang()->name(), id());
-        tty->print("  sequence: %d (prev: %d)",
-                   data.sequence_number(), previous_sequence_number);
-        if (data.task() != NULL) {
-          tty->print("  task: %s", data.task()->name());
-        } else {
-          tty->print("  task: NULL");
-        }
-        tty->cr();
-      }
-      for ( ; /* break */; ) {
-        // Check for new work.
-        if ((data.task() != NULL) &&
-            (data.sequence_number() != previous_sequence_number)) {
-          if (gang()->needs_more_workers()) {
-            gang()->internal_note_start();
-            gang_monitor->notify_all();
-            part = gang()->started_workers() - 1;
-            break;
-          }
-        }
-        // Nothing to do.
-        gang_monitor->wait(/* no_safepoint_check */ true);
-        gang()->internal_worker_poll(&data);
-        if (TraceWorkGang) {
-          tty->print("Polled inside for work in gang %s worker %u",
-                     gang()->name(), id());
-          tty->print("  sequence: %d (prev: %d)",
-                     data.sequence_number(), previous_sequence_number);
-          if (data.task() != NULL) {
-            tty->print("  task: %s", data.task()->name());
-          } else {
-            tty->print("  task: NULL");
-          }
-          tty->cr();
-        }
-      }
-      // Drop gang mutex.
-    }
-    if (TraceWorkGang) {
-      tty->print("Work for work gang %s id %u task %s part %d",
-                 gang()->name(), id(), data.task()->name(), part);
-    }
-    assert(data.task() != NULL, "Got null task");
-    data.task()->work(part);
-    {
-      if (TraceWorkGang) {
-        tty->print("Finish for work gang %s id %u task %s part %d",
-                   gang()->name(), id(), data.task()->name(), part);
-      }
-      // Grab the gang mutex.
-      MutexLocker ml(gang_monitor);
-      gang()->internal_note_finish();
-      // Tell the gang you are done.
-      gang_monitor->notify_all();
-      // Drop the gang mutex.
-    }
-    previous_sequence_number = data.sequence_number();
-  }
-}
-
-bool GangWorker::is_GC_task_thread() const {
+bool AbstractGangWorker::is_GC_task_thread() const {
   return gang()->are_GC_task_threads();
 }
 
-bool GangWorker::is_ConcurrentGC_thread() const {
+bool AbstractGangWorker::is_ConcurrentGC_thread() const {
   return gang()->are_ConcurrentGC_threads();
 }
 
-void GangWorker::print_on(outputStream* st) const {
+void AbstractGangWorker::print_on(outputStream* st) const {
   st->print("\"%s\" ", name());
   Thread::print_on(st);
   st->cr();
 }
 
-// Printing methods
+WorkData GangWorker::wait_for_task() {
+  return gang()->dispatcher()->worker_wait_for_task();
+}
 
-const char* AbstractWorkGang::name() const {
-  return _name;
+void GangWorker::signal_task_done() {
+  gang()->dispatcher()->worker_done_with_task();
+}
+
+void GangWorker::print_task_started(WorkData data) {
+  if (TraceWorkGang) {
+    tty->print_cr("Running work gang %s task %s worker %u", name(), data._task->name(), data._worker_id);
+  }
 }
 
-#ifndef PRODUCT
-
-const char* AbstractGangTask::name() const {
-  return _name;
+void GangWorker::print_task_done(WorkData data) {
+  if (TraceWorkGang) {
+    tty->print_cr("\nFinished work gang %s task %s worker %u", name(), data._task->name(), data._worker_id);
+    Thread* me = Thread::current();
+    tty->print_cr("  T: " PTR_FORMAT "  VM_thread: %d", p2i(me), me->is_VM_thread());
+  }
 }
 
-#endif /* PRODUCT */
+void GangWorker::run_task(WorkData data) {
+  print_task_started(data);
+
+  data._task->work(data._worker_id);
+
+  print_task_done(data);
+}
 
-// FlexibleWorkGang
+void GangWorker::loop() {
+  while (true) {
+    WorkData data = wait_for_task();
 
+    run_task(data);
+
+    signal_task_done();
+  }
+}
 
 // *** WorkGangBarrierSync
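
The SemaphoreGangTaskDispatcher above replaces the old monitor hand-shake: the coordinator posts the start semaphore once per worker, each worker claims a zero-indexed id with an atomic increment, and the last worker to finish posts the end semaphore to wake the coordinator. A self-contained sketch of the same protocol using the C++20 standard library (illustrative only; HotSpot uses its own Semaphore and Atomic classes):

  // Build with: g++ -std=c++20 -pthread dispatcher_sketch.cpp
  #include <atomic>
  #include <cstdio>
  #include <semaphore>
  #include <thread>
  #include <vector>

  std::counting_semaphore<1024> start_sem(0);   // coordinator -> workers
  std::counting_semaphore<1024> end_sem(0);     // last worker -> coordinator
  std::atomic<unsigned> started{0};
  std::atomic<unsigned> not_finished{0};

  void worker() {
    start_sem.acquire();                        // wait for a task to be dispatched
    unsigned worker_id = started.fetch_add(1);  // zero-indexed worker id
    std::printf("worker %u running task\n", worker_id);
    if (not_finished.fetch_sub(1) == 1) {       // last worker wakes the coordinator
      end_sem.release();
    }
  }

  int main() {
    const unsigned num_workers = 4;
    not_finished = num_workers;
    std::vector<std::thread> gang;
    for (unsigned i = 0; i < num_workers; i++) gang.emplace_back(worker);
    start_sem.release(num_workers);             // dispatch the "task" to all workers
    end_sem.acquire();                          // block until the last worker is done
    for (auto& t : gang) t.join();
  }
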
 
--- a/src/share/vm/gc/shared/workgroup.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/gc/shared/workgroup.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -25,282 +25,111 @@
 #ifndef SHARE_VM_GC_SHARED_WORKGROUP_HPP
 #define SHARE_VM_GC_SHARED_WORKGROUP_HPP
 
-#include "gc/shared/taskqueue.hpp"
-#include "runtime/thread.inline.hpp"
+#include "memory/allocation.hpp"
+#include "runtime/globals.hpp"
+#include "runtime/thread.hpp"
+#include "utilities/debug.hpp"
+#include "utilities/globalDefinitions.hpp"
 
 // Task class hierarchy:
 //   AbstractGangTask
-//     AbstractGangTaskWOopQueues
 //
 // Gang/Group class hierarchy:
 //   AbstractWorkGang
 //     WorkGang
-//       FlexibleWorkGang
-//         YieldingFlexibleWorkGang (defined in another file)
+//     YieldingFlexibleWorkGang (defined in another file)
 //
 // Worker class hierarchy:
-//   GangWorker (subclass of WorkerThread)
+//   AbstractGangWorker (subclass of WorkerThread)
+//     GangWorker
 //     YieldingFlexibleGangWorker   (defined in another file)
 
 // Forward declarations of classes defined here
 
+class AbstractGangWorker;
+class Semaphore;
 class WorkGang;
-class GangWorker;
-class YieldingFlexibleGangWorker;
-class YieldingFlexibleGangTask;
-class WorkData;
-class AbstractWorkGang;
 
 // An abstract task to be worked on by a gang.
 // You subclass this to supply your own work() method
 class AbstractGangTask VALUE_OBJ_CLASS_SPEC {
-public:
+  const char* _name;
+
+ public:
+  AbstractGangTask(const char* name) : _name(name) {}
+
   // The abstract work method.
   // The argument tells you which member of the gang you are.
   virtual void work(uint worker_id) = 0;
 
   // Debugging accessor for the name.
-  const char* name() const PRODUCT_RETURN_(return NULL;);
-  int counter() { return _counter; }
-  void set_counter(int value) { _counter = value; }
-  int *address_of_counter() { return &_counter; }
-
-  // RTTI
-  NOT_PRODUCT(virtual bool is_YieldingFlexibleGang_task() const {
-    return false;
-  })
+  const char* name() const { return _name; }
+};
 
-private:
-  NOT_PRODUCT(const char* _name;)
-  // ??? Should a task have a priority associated with it?
-  // ??? Or can the run method adjust priority as needed?
-  int _counter;
-
-protected:
-  // Constructor and desctructor: only construct subclasses.
-  AbstractGangTask(const char* name)
-  {
-    NOT_PRODUCT(_name = name);
-    _counter = 0;
-  }
-  ~AbstractGangTask() { }
-
-public:
+struct WorkData {
+  AbstractGangTask* _task;
+  uint              _worker_id;
+  WorkData(AbstractGangTask* task, uint worker_id) : _task(task), _worker_id(worker_id) {}
 };
 
-class AbstractGangTaskWOopQueues : public AbstractGangTask {
-  OopTaskQueueSet*       _queues;
-  ParallelTaskTerminator _terminator;
+// Interface to handle the synchronization between the coordinator thread and the worker threads,
+// when a task is dispatched out to the worker threads.
+class GangTaskDispatcher : public CHeapObj<mtGC> {
  public:
-  AbstractGangTaskWOopQueues(const char* name, OopTaskQueueSet* queues, uint n_threads) :
-    AbstractGangTask(name), _queues(queues), _terminator(n_threads, _queues) {}
-  ParallelTaskTerminator* terminator() { return &_terminator; }
-  OopTaskQueueSet* queues() { return _queues; }
+  virtual ~GangTaskDispatcher() {}
+
+  // Coordinator API.
+
+  // Distributes the task out to num_workers workers.
+  // Returns when the task has been completed by all workers.
+  virtual void coordinator_execute_on_workers(AbstractGangTask* task, uint num_workers) = 0;
+
+  // Worker API.
+
+  // Waits for a task to become available to the worker.
+  // Returns when the worker has been assigned a task.
+  virtual WorkData worker_wait_for_task() = 0;
+
+  // Signal to the coordinator that the worker is done with the assigned task.
+  virtual void     worker_done_with_task() = 0;
 };
 
+// The work gang is the collection of workers to execute tasks.
+// The number of workers run for a task is "_active_workers"
+// while "_total_workers" is the number of available workers.
+class AbstractWorkGang : public CHeapObj<mtInternal> {
+ protected:
+  // The array of worker threads for this gang.
+  AbstractGangWorker** _workers;
+  // The count of the number of workers in the gang.
+  uint _total_workers;
+  // The currently active workers in this gang.
+  uint _active_workers;
+  // Printing support.
+  const char* _name;
 
-// Class AbstractWorkGang:
-// An abstract class representing a gang of workers.
-// You subclass this to supply an implementation of run_task().
-class AbstractWorkGang: public CHeapObj<mtInternal> {
-protected:
-  // Work gangs are never deleted, so no need to cleanup.
-  ~AbstractWorkGang() { ShouldNotReachHere(); }
-public:
-  // Constructor.
-  AbstractWorkGang(const char* name, bool are_GC_task_threads,
-                   bool are_ConcurrentGC_threads);
-  // Run a task, returns when the task is done (or terminated).
-  virtual void run_task(AbstractGangTask* task) = 0;
-  // Return true if more workers should be applied to the task.
-  virtual bool needs_more_workers() const { return true; }
-public:
-  // Debugging.
-  const char* name() const;
-protected:
+ private:
   // Initialize only instance data.
   const bool _are_GC_task_threads;
   const bool _are_ConcurrentGC_threads;
-  // Printing support.
-  const char* _name;
-  // The monitor which protects these data,
-  // and notifies of changes in it.
-  Monitor*  _monitor;
-  // The count of the number of workers in the gang.
-  uint _total_workers;
-  // The array of worker threads for this gang.
-  // This is only needed for cleaning up.
-  GangWorker** _gang_workers;
-  // The task for this gang.
-  AbstractGangTask* _task;
-  // A sequence number for the current task.
-  int _sequence_number;
-  // The number of started workers.
-  uint _started_workers;
-  // The number of finished workers.
-  uint _finished_workers;
-public:
-  // Accessors for fields
-  Monitor* monitor() const {
-    return _monitor;
-  }
-  uint total_workers() const {
-    return _total_workers;
-  }
-  virtual uint active_workers() const {
-    return _total_workers;
-  }
-  GangWorker** gang_workers() const {
-    return _gang_workers;
-  }
-  AbstractGangTask* task() const {
-    return _task;
-  }
-  int sequence_number() const {
-    return _sequence_number;
-  }
-  uint started_workers() const {
-    return _started_workers;
-  }
-  uint finished_workers() const {
-    return _finished_workers;
-  }
-  bool are_GC_task_threads() const {
-    return _are_GC_task_threads;
-  }
-  bool are_ConcurrentGC_threads() const {
-    return _are_ConcurrentGC_threads;
-  }
-  // Predicates.
-  bool is_idle() const {
-    return (task() == NULL);
-  }
-  // Return the Ith gang worker.
-  GangWorker* gang_worker(uint i) const;
-
-  void threads_do(ThreadClosure* tc) const;
-
-  // Printing
-  void print_worker_threads_on(outputStream *st) const;
-  void print_worker_threads() const {
-    print_worker_threads_on(tty);
-  }
-
-protected:
-  friend class GangWorker;
-  friend class YieldingFlexibleGangWorker;
-  // Note activation and deactivation of workers.
-  // These methods should only be called with the mutex held.
-  void internal_worker_poll(WorkData* data) const;
-  void internal_note_start();
-  void internal_note_finish();
-};
 
-class WorkData: public StackObj {
-  // This would be a struct, but I want accessor methods.
-private:
-  AbstractGangTask* _task;
-  int               _sequence_number;
-public:
-  // Constructor and destructor
-  WorkData() {
-    _task            = NULL;
-    _sequence_number = 0;
-  }
-  ~WorkData() {
-  }
-  AbstractGangTask* task()               const { return _task; }
-  void set_task(AbstractGangTask* value)       { _task = value; }
-  int sequence_number()                  const { return _sequence_number; }
-  void set_sequence_number(int value)          { _sequence_number = value; }
-
-  YieldingFlexibleGangTask* yf_task()    const {
-    return (YieldingFlexibleGangTask*)_task;
-  }
-};
-
-// Class WorkGang:
-class WorkGang: public AbstractWorkGang {
-public:
-  // Constructor
-  WorkGang(const char* name, uint workers,
-           bool are_GC_task_threads, bool are_ConcurrentGC_threads);
-  // Run a task, returns when the task is done (or terminated).
-  virtual void run_task(AbstractGangTask* task);
-  void run_task(AbstractGangTask* task, uint no_of_parallel_workers);
-  // Allocate a worker and return a pointer to it.
-  virtual GangWorker* allocate_worker(uint which);
-  // Initialize workers in the gang.  Return true if initialization
-  // succeeded. The type of the worker can be overridden in a derived
-  // class with the appropriate implementation of allocate_worker().
-  bool initialize_workers();
-};
-
-// Class GangWorker:
-//   Several instances of this class run in parallel as workers for a gang.
-class GangWorker: public WorkerThread {
-public:
-  // Constructors and destructor.
-  GangWorker(AbstractWorkGang* gang, uint id);
+ public:
+  AbstractWorkGang(const char* name, uint workers, bool are_GC_task_threads, bool are_ConcurrentGC_threads) :
+      _name(name),
+      _total_workers(workers),
+      _active_workers(UseDynamicNumberOfGCThreads ? 1U : workers),
+      _are_GC_task_threads(are_GC_task_threads),
+      _are_ConcurrentGC_threads(are_ConcurrentGC_threads)
+  { }
 
-  // The only real method: run a task for the gang.
-  virtual void run();
-  // Predicate for Thread
-  virtual bool is_GC_task_thread() const;
-  virtual bool is_ConcurrentGC_thread() const;
-  // Printing
-  void print_on(outputStream* st) const;
-  virtual void print() const { print_on(tty); }
-protected:
-  AbstractWorkGang* _gang;
-
-  virtual void initialize();
-  virtual void loop();
-
-public:
-  AbstractWorkGang* gang() const { return _gang; }
-};
+  // Initialize workers in the gang.  Return true if initialization succeeded.
+  bool initialize_workers();
 
-// Dynamic number of worker threads
-//
-// This type of work gang is used to run different numbers of
-// worker threads at different times.  The
-// number of workers run for a task is "_active_workers"
-// instead of "_total_workers" in a WorkGang.  The method
-// "needs_more_workers()" returns true until "_active_workers"
-// have been started and returns false afterwards.  The
-// implementation of "needs_more_workers()" in WorkGang always
-// returns true so that all workers are started.  The method
-// "loop()" in GangWorker was modified to ask "needs_more_workers()"
-// in its loop to decide if it should start working on a task.
-// A worker in "loop()" waits for notification on the WorkGang
-// monitor and execution of each worker as it checks for work
-// is serialized via the same monitor.  The "needs_more_workers()"
-// call is serialized and additionally the calculation for the
-// "part" (effectively the worker id for executing the task) is
-// serialized to give each worker a unique "part".  Workers that
-// are not needed for this tasks (i.e., "_active_workers" have
-// been started before it, continue to wait for work.
+  bool are_GC_task_threads()      const { return _are_GC_task_threads; }
+  bool are_ConcurrentGC_threads() const { return _are_ConcurrentGC_threads; }
 
-class FlexibleWorkGang: public WorkGang {
-  // The currently active workers in this gang.
-  // This is a number that is dynamically adjusted
-  // and checked in the run_task() method at each invocation.
-  // As described above _active_workers determines the number
-  // of threads started on a task.  It must also be used to
-  // determine completion.
+  uint total_workers() const { return _total_workers; }
 
- protected:
-  uint _active_workers;
- public:
-  // Constructor and destructor.
-  FlexibleWorkGang(const char* name, uint workers,
-                   bool are_GC_task_threads,
-                   bool  are_ConcurrentGC_threads) :
-    WorkGang(name, workers, are_GC_task_threads, are_ConcurrentGC_threads),
-    _active_workers(UseDynamicNumberOfGCThreads ? 1U : workers) {}
-
-  // Accessors for fields.
   virtual uint active_workers() const {
     assert(_active_workers <= _total_workers,
            err_msg("_active_workers: %u > _total_workers: %u", _active_workers, _total_workers));
@@ -317,10 +146,90 @@
     assert(UseDynamicNumberOfGCThreads || _active_workers == _total_workers,
            "Unless dynamic should use total workers");
   }
+
+  // Return the Ith worker.
+  AbstractGangWorker* worker(uint i) const;
+
+  void threads_do(ThreadClosure* tc) const;
+
+  // Debugging.
+  const char* name() const { return _name; }
+
+  // Printing
+  void print_worker_threads_on(outputStream *st) const;
+  void print_worker_threads() const {
+    print_worker_threads_on(tty);
+  }
+
+ protected:
+  virtual AbstractGangWorker* allocate_worker(uint which) = 0;
+};
+
+// A class representing a gang of workers.
+class WorkGang: public AbstractWorkGang {
+  // To get access to the GangTaskDispatcher instance.
+  friend class GangWorker;
+
+  // Never deleted.
+  ~WorkGang();
+
+  GangTaskDispatcher* const _dispatcher;
+  GangTaskDispatcher* dispatcher() const {
+    return _dispatcher;
+  }
+
+public:
+  WorkGang(const char* name,
+           uint workers,
+           bool are_GC_task_threads,
+           bool are_ConcurrentGC_threads);
+
+  // Run a task, returns when the task is done.
   virtual void run_task(AbstractGangTask* task);
-  virtual bool needs_more_workers() const {
-    return _started_workers < _active_workers;
-  }
+
+protected:
+  virtual AbstractGangWorker* allocate_worker(uint which);
+};
+
+// Several instances of this class run in parallel as workers for a gang.
+class AbstractGangWorker: public WorkerThread {
+public:
+  AbstractGangWorker(AbstractWorkGang* gang, uint id);
+
+  // The only real method: run a task for the gang.
+  virtual void run();
+  // Predicate for Thread
+  virtual bool is_GC_task_thread() const;
+  virtual bool is_ConcurrentGC_thread() const;
+  // Printing
+  void print_on(outputStream* st) const;
+  virtual void print() const { print_on(tty); }
+
+protected:
+  AbstractWorkGang* _gang;
+
+  virtual void initialize();
+  virtual void loop() = 0;
+
+  AbstractWorkGang* gang() const { return _gang; }
+};
+
+class GangWorker: public AbstractGangWorker {
+public:
+  GangWorker(WorkGang* gang, uint id) : AbstractGangWorker(gang, id) {}
+
+protected:
+  virtual void loop();
+
+private:
+  WorkData wait_for_task();
+  void run_task(WorkData work);
+  void signal_task_done();
+
+  void print_task_started(WorkData data);
+  void print_task_done(WorkData data);
+
+  WorkGang* gang() const { return (WorkGang*)_gang; }
 };
 
 // A class that acts as a synchronisation barrier. Workers enter
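
Under the reworked hierarchy a parallel phase is expressed by subclassing AbstractGangTask and handing it to WorkGang::run_task(); the dispatcher gives each worker a zero-indexed id via work(uint). A hedged usage sketch (the task body is made up; only the AbstractGangTask constructor, work() and run_task() come from the declarations above):

  // Illustrative gang task; compiles only inside HotSpot.
  class ExampleGangTask : public AbstractGangTask {
  public:
    ExampleGangTask() : AbstractGangTask("Example Task") {}
    virtual void work(uint worker_id) {
      // Each worker claims its slice of the work based on worker_id.
    }
  };

  void run_example_phase(WorkGang* workers) {
    ExampleGangTask task;
    workers->run_task(&task);  // returns once every active worker has finished work()
  }
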
--- a/src/share/vm/interpreter/abstractInterpreter.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/interpreter/abstractInterpreter.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -90,6 +90,10 @@
     java_util_zip_CRC32_update,                                 // implementation of java.util.zip.CRC32.update()
     java_util_zip_CRC32_updateBytes,                            // implementation of java.util.zip.CRC32.updateBytes()
     java_util_zip_CRC32_updateByteBuffer,                       // implementation of java.util.zip.CRC32.updateByteBuffer()
+    java_lang_Float_intBitsToFloat,                             // implementation of java.lang.Float.intBitsToFloat()
+    java_lang_Float_floatToRawIntBits,                          // implementation of java.lang.Float.floatToRawIntBits()
+    java_lang_Double_longBitsToDouble,                          // implementation of java.lang.Double.longBitsToDouble()
+    java_lang_Double_doubleToRawLongBits,                       // implementation of java.lang.Double.doubleToRawLongBits()
     number_of_method_entries,
     invalid = -1
   };
--- a/src/share/vm/interpreter/interpreter.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/interpreter/interpreter.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -234,7 +234,15 @@
       case vmIntrinsics::_updateByteBufferCRC32  : return java_util_zip_CRC32_updateByteBuffer;
     }
   }
-#endif
+
+  switch(m->intrinsic_id()) {
+  case vmIntrinsics::_intBitsToFloat:      return java_lang_Float_intBitsToFloat;
+  case vmIntrinsics::_floatToRawIntBits:   return java_lang_Float_floatToRawIntBits;
+  case vmIntrinsics::_longBitsToDouble:    return java_lang_Double_longBitsToDouble;
+  case vmIntrinsics::_doubleToRawLongBits: return java_lang_Double_doubleToRawLongBits;
+  }
+
+#endif // CC_INTERP
 
   // Native method?
   // Note: This test must come _before_ the test for intrinsic
@@ -559,6 +567,25 @@
                                            : // fall thru
   case Interpreter::java_util_zip_CRC32_updateByteBuffer
                                            : entry_point = generate_CRC32_updateBytes_entry(kind); break;
+#if defined(TARGET_ARCH_x86) && !defined(_LP64)
+  // On x86_32 platforms, a special entry is generated for the following four methods.
+  // On other platforms the normal entry is used to enter these methods.
+  case Interpreter::java_lang_Float_intBitsToFloat
+                                           : entry_point = generate_Float_intBitsToFloat_entry(); break;
+  case Interpreter::java_lang_Float_floatToRawIntBits
+                                           : entry_point = generate_Float_floatToRawIntBits_entry(); break;
+  case Interpreter::java_lang_Double_longBitsToDouble
+                                           : entry_point = generate_Double_longBitsToDouble_entry(); break;
+  case Interpreter::java_lang_Double_doubleToRawLongBits
+                                           : entry_point = generate_Double_doubleToRawLongBits_entry(); break;
+#else
+  case Interpreter::java_lang_Float_intBitsToFloat:
+  case Interpreter::java_lang_Float_floatToRawIntBits:
+  case Interpreter::java_lang_Double_longBitsToDouble:
+  case Interpreter::java_lang_Double_doubleToRawLongBits:
+    entry_point = generate_native_entry(false);
+    break;
+#endif // defined(TARGET_ARCH_x86) && !defined(_LP64)
 #endif // CC_INTERP
   default:
     fatal(err_msg("unexpected method kind: %d", kind));
--- a/src/share/vm/interpreter/templateInterpreter.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/interpreter/templateInterpreter.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -397,34 +397,39 @@
 
       // all non-native method kinds
       method_entry(zerolocals)
-        method_entry(zerolocals_synchronized)
-        method_entry(empty)
-        method_entry(accessor)
-        method_entry(abstract)
-        method_entry(java_lang_math_sin  )
-        method_entry(java_lang_math_cos  )
-        method_entry(java_lang_math_tan  )
-        method_entry(java_lang_math_abs  )
-        method_entry(java_lang_math_sqrt )
-        method_entry(java_lang_math_log  )
-        method_entry(java_lang_math_log10)
-        method_entry(java_lang_math_exp  )
-        method_entry(java_lang_math_pow  )
-        method_entry(java_lang_ref_reference_get)
+      method_entry(zerolocals_synchronized)
+      method_entry(empty)
+      method_entry(accessor)
+      method_entry(abstract)
+      method_entry(java_lang_math_sin  )
+      method_entry(java_lang_math_cos  )
+      method_entry(java_lang_math_tan  )
+      method_entry(java_lang_math_abs  )
+      method_entry(java_lang_math_sqrt )
+      method_entry(java_lang_math_log  )
+      method_entry(java_lang_math_log10)
+      method_entry(java_lang_math_exp  )
+      method_entry(java_lang_math_pow  )
+      method_entry(java_lang_ref_reference_get)
 
-        if (UseCRC32Intrinsics) {
-          method_entry(java_util_zip_CRC32_update)
-            method_entry(java_util_zip_CRC32_updateBytes)
-            method_entry(java_util_zip_CRC32_updateByteBuffer)
-            }
+      if (UseCRC32Intrinsics) {
+        method_entry(java_util_zip_CRC32_update)
+        method_entry(java_util_zip_CRC32_updateBytes)
+        method_entry(java_util_zip_CRC32_updateByteBuffer)
+      }
+
+      method_entry(java_lang_Float_intBitsToFloat);
+      method_entry(java_lang_Float_floatToRawIntBits);
+      method_entry(java_lang_Double_longBitsToDouble);
+      method_entry(java_lang_Double_doubleToRawLongBits);
 
       initialize_method_handle_entries();
 
       // all native method kinds (must be one contiguous block)
       Interpreter::_native_entry_begin = Interpreter::code()->code_end();
       method_entry(native)
-        method_entry(native_synchronized)
-        Interpreter::_native_entry_end = Interpreter::code()->code_end();
+      method_entry(native_synchronized)
+      Interpreter::_native_entry_end = Interpreter::code()->code_end();
 
 #undef method_entry
 
--- a/src/share/vm/memory/metaspace.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/memory/metaspace.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -254,7 +254,7 @@
   // Debugging support
   void verify();
 
-  static void print_compressed_class_space(outputStream* st, const char* requested_addr = 0);
+  static void print_compressed_class_space(outputStream* st, const char* requested_addr = 0) NOT_LP64({});
 
   class AllocRecordClosure :  public StackObj {
   public:
--- a/src/share/vm/memory/universe.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/memory/universe.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -77,7 +77,7 @@
 #if INCLUDE_ALL_GCS
 #include "gc/cms/cmsCollectorPolicy.hpp"
 #include "gc/g1/g1CollectedHeap.inline.hpp"
-#include "gc/g1/g1CollectorPolicy_ext.hpp"
+#include "gc/g1/g1CollectorPolicy.hpp"
 #include "gc/parallel/parallelScavengeHeap.hpp"
 #include "gc/shared/adaptiveSizePolicy.hpp"
 #endif // INCLUDE_ALL_GCS
@@ -694,13 +694,29 @@
   return JNI_OK;
 }
 
-template <class Heap, class Policy>
-jint Universe::create_heap() {
+CollectedHeap* Universe::create_heap() {
   assert(_collectedHeap == NULL, "Heap already created");
-  Policy* policy = new Policy();
-  policy->initialize_all();
-  _collectedHeap = new Heap(policy);
-  return _collectedHeap->initialize();
+#if !INCLUDE_ALL_GCS
+  if (UseParallelGC) {
+    fatal("UseParallelGC not supported in this VM.");
+  } else if (UseG1GC) {
+    fatal("UseG1GC not supported in this VM.");
+  } else if (UseConcMarkSweepGC) {
+    fatal("UseConcMarkSweepGC not supported in this VM.");
+#else
+  if (UseParallelGC) {
+    return Universe::create_heap_with_policy<ParallelScavengeHeap, GenerationSizer>();
+  } else if (UseG1GC) {
+    return Universe::create_heap_with_policy<G1CollectedHeap, G1CollectorPolicy>();
+  } else if (UseConcMarkSweepGC) {
+    return Universe::create_heap_with_policy<GenCollectedHeap, ConcurrentMarkSweepPolicy>();
+#endif
+  } else if (UseSerialGC) {
+    return Universe::create_heap_with_policy<GenCollectedHeap, MarkSweepPolicy>();
+  }
+
+  ShouldNotReachHere();
+  return NULL;
 }
 
 // Choose the heap base address and oop encoding mode
@@ -714,27 +730,12 @@
 jint Universe::initialize_heap() {
   jint status = JNI_ERR;
 
-#if !INCLUDE_ALL_GCS
-  if (UseParallelGC) {
-    fatal("UseParallelGC not supported in this VM.");
-  } else if (UseG1GC) {
-    fatal("UseG1GC not supported in this VM.");
-  } else if (UseConcMarkSweepGC) {
-    fatal("UseConcMarkSweepGC not supported in this VM.");
-#else
-  if (UseParallelGC) {
-    status = Universe::create_heap<ParallelScavengeHeap, GenerationSizer>();
-  } else if (UseG1GC) {
-    status = Universe::create_heap<G1CollectedHeap, G1CollectorPolicyExt>();
-  } else if (UseConcMarkSweepGC) {
-    status = Universe::create_heap<GenCollectedHeap, ConcurrentMarkSweepPolicy>();
-#endif
-  } else if (UseSerialGC) {
-    status = Universe::create_heap<GenCollectedHeap, MarkSweepPolicy>();
-  } else {
-    ShouldNotReachHere();
+  _collectedHeap = create_heap_ext();
+  if (_collectedHeap == NULL) {
+    _collectedHeap = create_heap();
   }
 
+  status = _collectedHeap->initialize();
   if (status != JNI_OK) {
     return status;
   }
--- a/src/share/vm/memory/universe.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/memory/universe.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -214,7 +214,9 @@
   static size_t _heap_capacity_at_last_gc;
   static size_t _heap_used_at_last_gc;
 
-  template <class Heap, class Policy> static jint create_heap();
+  template <class Heap, class Policy> static CollectedHeap* create_heap_with_policy();
+  static CollectedHeap* create_heap();
+  static CollectedHeap* create_heap_ext();
   static jint initialize_heap();
   static void initialize_basic_type_mirrors(TRAPS);
   static void fixup_mirrors(TRAPS);
--- a/src/share/vm/memory/universe.inline.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/memory/universe.inline.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -49,4 +49,11 @@
   _allocation_context_notification_obj = obj;
 }
 
+template <class Heap, class Policy>
+CollectedHeap* Universe::create_heap_with_policy() {
+  Policy* policy = new Policy();
+  policy->initialize_all();
+  return new Heap(policy);
+}
+
 #endif // SHARE_VM_MEMORY_UNIVERSE_INLINE_HPP
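
Heap creation is now split into the create_heap_with_policy template, the GC-flag dispatch in create_heap(), and the create_heap_ext() hook that an extension build can override; the default in universe_ext.cpp simply returns NULL so the standard dispatch runs. A hedged sketch of what an overriding definition could look like (UseMyGC, MyHeap and MyCollectorPolicy are hypothetical names, not part of this changeset):

  // Hypothetical replacement for universe_ext.cpp in an extension source tree.
  CollectedHeap* Universe::create_heap_ext() {
    if (UseMyGC) {  // hypothetical flag
      return Universe::create_heap_with_policy<MyHeap, MyCollectorPolicy>();
    }
    return NULL;    // fall back to the standard create_heap() dispatch
  }
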
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/share/vm/memory/universe_ext.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "memory/universe.hpp"
+
+CollectedHeap* Universe::create_heap_ext() {
+  return NULL;
+}
--- a/src/share/vm/oops/symbol.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/oops/symbol.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -35,7 +35,7 @@
 Symbol::Symbol(const u1* name, int length, int refcount) {
   _refcount = refcount;
   _length = length;
-  _identity_hash = os::random();
+  _identity_hash = (short)os::random();
   for (int i = 0; i < _length; i++) {
     byte_at_put(i, name[i]);
   }
--- a/src/share/vm/oops/symbol.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/oops/symbol.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -106,23 +106,18 @@
 #define PERM_REFCOUNT -1
 #endif
 
-// We separate the fields in SymbolBase from Symbol::_body so that
-// Symbol::size(int) can correctly calculate the space needed.
-class SymbolBase : public MetaspaceObj {
- public:
+class Symbol : public MetaspaceObj {
+  friend class VMStructs;
+  friend class SymbolTable;
+  friend class MoveSymbols;
+
+ private:
   ATOMIC_SHORT_PAIR(
     volatile short _refcount,  // needs atomic operation
     unsigned short _length     // number of UTF8 characters in the symbol (does not need atomic op)
   );
-  int            _identity_hash;
-};
-
-class Symbol : private SymbolBase {
-  friend class VMStructs;
-  friend class SymbolTable;
-  friend class MoveSymbols;
- private:
-  jbyte _body[1];
+  short _identity_hash;
+  jbyte _body[2];
 
   enum {
     // max_symbol_length is constrained by type of _length
@@ -130,7 +125,7 @@
   };
 
   static int size(int length) {
-    size_t sz = heap_word_size(sizeof(SymbolBase) + (length > 0 ? length : 0));
+    size_t sz = heap_word_size(sizeof(Symbol) + (length > 2 ? length - 2 : 0));
     return align_object_size(sz);
   }
 
@@ -154,8 +149,11 @@
 
   // Returns the largest size symbol we can safely hold.
   static int max_length()   { return max_symbol_length; }
-
-  int identity_hash()       { return _identity_hash; }
+  unsigned identity_hash() {
+    unsigned addr_bits = (unsigned)((uintptr_t)this >> (LogMinObjAlignmentInBytes + 3));
+    return ((unsigned)_identity_hash & 0xffff) |
+           ((addr_bits ^ (_length << 8) ^ (( _body[0] << 8) | _body[1])) << 16);
+  }
 
   // For symbol table alternate hashing
   unsigned int new_hash(juint seed);
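
Symbol now stores only a 16-bit random value; the full identity hash is reconstructed on demand by placing that value in the low half and filling the high half with bits derived from the symbol's address, length and first two body bytes. A standalone sketch that mirrors the expression above (LogMinObjAlignmentInBytes is passed in as a parameter here):

  #include <stdint.h>

  // Mirrors Symbol::identity_hash(): low 16 bits from the stored random value,
  // high 16 bits mixed from address, length and body bits.
  unsigned mixed_identity_hash(const void* sym, short stored_hash,
                               unsigned short length, const signed char* body,
                               int log_min_obj_alignment) {
    unsigned addr_bits = (unsigned)((uintptr_t)sym >> (log_min_obj_alignment + 3));
    return ((unsigned)stored_hash & 0xffff) |
           ((addr_bits ^ (length << 8) ^ ((body[0] << 8) | body[1])) << 16);
  }
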
--- a/src/share/vm/opto/arraycopynode.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/opto/arraycopynode.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -626,3 +626,75 @@
 
   return CallNode::may_modify_arraycopy_helper(dest_t, t_oop, phase);
 }
+
+bool ArrayCopyNode::may_modify_helper(const TypeOopPtr *t_oop, Node* n, PhaseTransform *phase) {
+  if (n->is_Proj()) {
+    n = n->in(0);
+    if (n->is_Call() && n->as_Call()->may_modify(t_oop, phase)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool ArrayCopyNode::may_modify(const TypeOopPtr *t_oop, MemBarNode* mb, PhaseTransform *phase) {
+  Node* mem = mb->in(TypeFunc::Memory);
+
+  if (mem->is_MergeMem()) {
+    Node* n = mem->as_MergeMem()->memory_at(Compile::AliasIdxRaw);
+    if (may_modify_helper(t_oop, n, phase)) {
+      return true;
+    } else if (n->is_Phi()) {
+      for (uint i = 1; i < n->req(); i++) {
+        if (n->in(i) != NULL) {
+          if (may_modify_helper(t_oop, n->in(i), phase)) {
+            return true;
+          }
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+// Does this array copy modify offsets between offset_lo and offset_hi
+// in the destination array?
+// If must_modify is false, return true if the copy could write
+// between offset_lo and offset_hi.
+// If must_modify is true, return true if the copy is guaranteed to
+// write between offset_lo and offset_hi.
+bool ArrayCopyNode::modifies(intptr_t offset_lo, intptr_t offset_hi, PhaseTransform* phase, bool must_modify) {
+  assert(_kind == ArrayCopy || _kind == CopyOf || _kind == CopyOfRange, "only for real array copies");
+
+  Node* dest = in(ArrayCopyNode::Dest);
+  Node* src_pos = in(ArrayCopyNode::SrcPos);
+  Node* dest_pos = in(ArrayCopyNode::DestPos);
+  Node* len = in(ArrayCopyNode::Length);
+
+  const TypeInt *dest_pos_t = phase->type(dest_pos)->isa_int();
+  const TypeInt *len_t = phase->type(len)->isa_int();
+  const TypeAryPtr* ary_t = phase->type(dest)->isa_aryptr();
+
+  if (dest_pos_t != NULL && len_t != NULL && ary_t != NULL) {
+    BasicType ary_elem = ary_t->klass()->as_array_klass()->element_type()->basic_type();
+    uint header = arrayOopDesc::base_offset_in_bytes(ary_elem);
+    uint elemsize = type2aelembytes(ary_elem);
+
+    intptr_t dest_pos_plus_len_lo = (((intptr_t)dest_pos_t->_lo) + len_t->_lo) * elemsize + header;
+    intptr_t dest_pos_plus_len_hi = (((intptr_t)dest_pos_t->_hi) + len_t->_hi) * elemsize + header;
+    intptr_t dest_pos_lo = ((intptr_t)dest_pos_t->_lo) * elemsize + header;
+    intptr_t dest_pos_hi = ((intptr_t)dest_pos_t->_hi) * elemsize + header;
+
+    if (must_modify) {
+      if (offset_lo >= dest_pos_hi && offset_hi < dest_pos_plus_len_lo) {
+        return true;
+      }
+    } else {
+      if (offset_hi >= dest_pos_lo && offset_lo < dest_pos_plus_len_hi) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
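
ArrayCopyNode::modifies() reduces to an interval test over byte offsets that already include the array header: in the may-modify case the queried range only has to overlap some possible destination range, while in the must-modify case it has to lie inside every possible destination range. A standalone sketch of that test (illustrative; the bounds correspond to dest_pos_lo/hi and dest_pos_plus_len_lo/hi computed above):

  #include <stdint.h>

  // Simplified form of the interval check in ArrayCopyNode::modifies().
  bool copy_modifies(intptr_t offset_lo, intptr_t offset_hi,
                     intptr_t dest_pos_lo, intptr_t dest_pos_hi,
                     intptr_t dest_pos_plus_len_lo, intptr_t dest_pos_plus_len_hi,
                     bool must_modify) {
    if (must_modify) {
      // Guaranteed write: the queried range lies inside every possible copy range.
      return offset_lo >= dest_pos_hi && offset_hi < dest_pos_plus_len_lo;
    }
    // Possible write: the queried range overlaps some possible copy range.
    return offset_hi >= dest_pos_lo && offset_lo < dest_pos_plus_len_hi;
  }
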
--- a/src/share/vm/opto/arraycopynode.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/opto/arraycopynode.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -108,6 +108,7 @@
                             BasicType copy_type, const Type* value_type, int count);
   bool finish_transform(PhaseGVN *phase, bool can_reshape,
                         Node* ctl, Node *mem);
+  static bool may_modify_helper(const TypeOopPtr *t_oop, Node* n, PhaseTransform *phase);
 
 public:
 
@@ -162,6 +163,9 @@
 
   bool is_alloc_tightly_coupled() const { return _alloc_tightly_coupled; }
 
+  static bool may_modify(const TypeOopPtr *t_oop, MemBarNode* mb, PhaseTransform *phase);
+  bool modifies(intptr_t offset_lo, intptr_t offset_hi, PhaseTransform* phase, bool must_modify);
+
 #ifndef PRODUCT
   virtual void dump_spec(outputStream *st) const;
   virtual void dump_compact_spec(outputStream* st) const;
--- a/src/share/vm/opto/c2compiler.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/opto/c2compiler.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -161,7 +161,7 @@
   vmIntrinsics::ID id = method->intrinsic_id();
   assert(id != vmIntrinsics::_none, "must be a VM intrinsic");
 
-  if (id < vmIntrinsics::FIRST_ID || id >= vmIntrinsics::LAST_COMPILER_INLINE) {
+  if (id < vmIntrinsics::FIRST_ID || id > vmIntrinsics::LAST_COMPILER_INLINE) {
     return false;
   }
 
--- a/src/share/vm/opto/callnode.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/opto/callnode.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -742,7 +742,7 @@
 //
 bool CallNode::may_modify(const TypeOopPtr *t_oop, PhaseTransform *phase) {
   assert((t_oop != NULL), "sanity");
-  if (is_call_to_arraycopystub()) {
+  if (is_call_to_arraycopystub() && strcmp(_name, "unsafe_arraycopy") != 0) {
     const TypeTuple* args = _tf->domain();
     Node* dest = NULL;
     // Stubs that can be called once an ArrayCopyNode is expanded have
--- a/src/share/vm/opto/chaitin.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/opto/chaitin.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -990,9 +990,13 @@
         // FOUR registers!
 #ifdef ASSERT
         if (is_vect) {
-          assert(lrgmask.is_aligned_sets(lrg.num_regs()), "vector should be aligned");
-          assert(!lrg._fat_proj, "sanity");
-          assert(RegMask::num_registers(kreg) == lrg.num_regs(), "sanity");
+          if (lrg.num_regs() != 0) {
+            assert(lrgmask.is_aligned_sets(lrg.num_regs()), "vector should be aligned");
+            assert(!lrg._fat_proj, "sanity");
+            assert(RegMask::num_registers(kreg) == lrg.num_regs(), "sanity");
+          } else {
+            assert(n->is_Phi(), "not all inputs processed only if Phi");
+          }
         }
 #endif
         if (!is_vect && lrg.num_regs() == 2 && !lrg._fat_proj && rm.is_misaligned_pair()) {
--- a/src/share/vm/opto/compile.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/opto/compile.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -93,7 +93,7 @@
  public:
 
   void set_idx(node_idx_t idx) {
-    _idx_clone_orig = _idx_clone_orig & 0xFFFFFFFF00000000 | idx;
+    _idx_clone_orig = _idx_clone_orig & CONST64(0xFFFFFFFF00000000) | idx;
   }
   node_idx_t idx() const { return (node_idx_t)(_idx_clone_orig & 0xFFFFFFFF); }
 
--- a/src/share/vm/opto/library_call.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/opto/library_call.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -2730,7 +2730,22 @@
         load_store = _gvn.transform(new CompareAndSwapPNode(control(), mem, adr, newval, oldval));
       }
     }
-    post_barrier(control(), load_store, base, adr, alias_idx, newval, T_OBJECT, true);
+    if (kind == LS_cmpxchg) {
+      // Emit the post barrier only when the actual store happened. This check only
+      // makes sense for compareAndSet, which can fail to set the value.
+      // The CAS success path is marked as more likely since we anticipate it is the
+      // performance-critical path, while the CAS failure path can absorb the penalty of
+      // the unlikely path as backoff, which is still better than a store barrier there.
+      IdealKit ideal(this);
+      ideal.if_then(load_store, BoolTest::ne, ideal.ConI(0), PROB_STATIC_FREQUENT); {
+        sync_kit(ideal);
+        post_barrier(ideal.ctrl(), load_store, base, adr, alias_idx, newval, T_OBJECT, true);
+        ideal.sync_kit(this);
+      } ideal.end_if();
+      final_sync(ideal);
+    } else {
+      post_barrier(control(), load_store, base, adr, alias_idx, newval, T_OBJECT, true);
+    }
     break;
   default:
     fatal(err_msg_res("unexpected type %d: %s", type, type2name(type)));
--- a/src/share/vm/opto/loopnode.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/opto/loopnode.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -1175,7 +1175,7 @@
 //=============================================================================
 //------------------------------is_member--------------------------------------
 // Is 'l' a member of 'this'?
-int IdealLoopTree::is_member( const IdealLoopTree *l ) const {
+bool IdealLoopTree::is_member(const IdealLoopTree *l) const {
   while( l->_nest > _nest ) l = l->_parent;
   return l == this;
 }
--- a/src/share/vm/opto/loopnode.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/opto/loopnode.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -384,7 +384,7 @@
   { }
 
   // Is 'l' a member of 'this'?
-  int is_member( const IdealLoopTree *l ) const; // Test for nested membership
+  bool is_member(const IdealLoopTree *l) const; // Test for nested membership
 
   // Set loop nesting depth.  Accumulate has_call bits.
   int set_nest( uint depth );
@@ -1086,6 +1086,8 @@
   bool split_up( Node *n, Node *blk1, Node *blk2 );
   void sink_use( Node *use, Node *post_loop );
   Node *place_near_use( Node *useblock ) const;
+  Node* try_move_store_before_loop(Node* n, Node *n_ctrl);
+  void try_move_store_after_loop(Node* n);
 
   bool _created_loop_node;
 public:
--- a/src/share/vm/opto/loopopts.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/opto/loopopts.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -653,6 +653,209 @@
   return iff->in(1);
 }
 
+#ifdef ASSERT
+static void enqueue_cfg_uses(Node* m, Unique_Node_List& wq) {
+  for (DUIterator_Fast imax, i = m->fast_outs(imax); i < imax; i++) {
+    Node* u = m->fast_out(i);
+    if (u->is_CFG()) {
+      if (u->Opcode() == Op_NeverBranch) {
+        u = ((NeverBranchNode*)u)->proj_out(0);
+        enqueue_cfg_uses(u, wq);
+      } else {
+        wq.push(u);
+      }
+    }
+  }
+}
+#endif
+
+// Try moving a store out of a loop, right before the loop
+Node* PhaseIdealLoop::try_move_store_before_loop(Node* n, Node *n_ctrl) {
+  // Store has to be first in the loop body
+  IdealLoopTree *n_loop = get_loop(n_ctrl);
+  if (n->is_Store() && n_loop != _ltree_root && n_loop->is_loop()) {
+    assert(n->in(0), "store should have control set");
+    Node* address = n->in(MemNode::Address);
+    Node* value = n->in(MemNode::ValueIn);
+    Node* mem = n->in(MemNode::Memory);
+    IdealLoopTree* address_loop = get_loop(get_ctrl(address));
+    IdealLoopTree* value_loop = get_loop(get_ctrl(value));
+
+    // - address and value must be loop invariant
+    // - memory must be a memory Phi for the loop
+    // - the Store must be the only store on this memory slice in the
+    // loop: if there's another store following this one, then the value
+    // written at iteration i by the second store could be overwritten
+    // at iteration i+n by the first store: it's not safe to move the
+    // first store out of the loop
+    // - nothing must observe the Phi memory: this guarantees no read
+    // before the store and no early exit out of the loop
+    // With those conditions, we are also guaranteed that the store post
+    // dominates the loop head. Otherwise there would be an extra Phi
+    // involved between the loop's Phi and the store.
+
+    if (!n_loop->is_member(address_loop) &&
+        !n_loop->is_member(value_loop) &&
+        mem->is_Phi() && mem->in(0) == n_loop->_head &&
+        mem->outcnt() == 1 &&
+        mem->in(LoopNode::LoopBackControl) == n) {
+
+#ifdef ASSERT
+      // Verify that the store's control post dominates the loop entry and
+      // that there's no early exit from the loop before the store.
+      bool ctrl_ok = false;
+      {
+        // Follow control from the loop head until we reach n, exit the
+        // loop, or reach the tail
+        ResourceMark rm;
+        Unique_Node_List wq;
+        wq.push(n_loop->_head);
+        assert(n_loop->_tail != NULL, "need a tail");
+        for (uint next = 0; next < wq.size(); ++next) {
+          Node *m = wq.at(next);
+          if (m == n->in(0)) {
+            ctrl_ok = true;
+            continue;
+          }
+          assert(!has_ctrl(m), "should be CFG");
+          if (!n_loop->is_member(get_loop(m)) || m == n_loop->_tail) {
+            ctrl_ok = false;
+            break;
+          }
+          enqueue_cfg_uses(m, wq);
+        }
+      }
+      assert(ctrl_ok, "bad control");
+#endif
+
+      // move the Store
+      _igvn.replace_input_of(mem, LoopNode::LoopBackControl, mem);
+      _igvn.replace_input_of(n, 0, n_loop->_head->in(LoopNode::EntryControl));
+      _igvn.replace_input_of(n, MemNode::Memory, mem->in(LoopNode::EntryControl));
+      // Disconnect the phi now. An empty phi can confuse other
+      // optimizations in this pass of loop opts.
+      _igvn.replace_node(mem, mem->in(LoopNode::EntryControl));
+      n_loop->_body.yank(mem);
+
+      IdealLoopTree* new_loop = get_loop(n->in(0));
+      set_ctrl_and_loop(n, n->in(0));
+
+      return n;
+    }
+  }
+  return NULL;
+}
+
+// Try moving a store out of a loop, right after the loop
+void PhaseIdealLoop::try_move_store_after_loop(Node* n) {
+  if (n->is_Store()) {
+    assert(n->in(0), "store should have control set");
+    Node *n_ctrl = get_ctrl(n);
+    IdealLoopTree *n_loop = get_loop(n_ctrl);
+    // Store must be in a loop
+    if (n_loop != _ltree_root && !n_loop->_irreducible) {
+      Node* address = n->in(MemNode::Address);
+      Node* value = n->in(MemNode::ValueIn);
+      IdealLoopTree* address_loop = get_loop(get_ctrl(address));
+      // address must be loop invariant
+      if (!n_loop->is_member(address_loop)) {
+        // Store must be last on this memory slice in the loop and
+        // nothing in the loop must observe it
+        Node* phi = NULL;
+        for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
+          Node* u = n->fast_out(i);
+          if (has_ctrl(u)) { // control use?
+            IdealLoopTree *u_loop = get_loop(get_ctrl(u));
+            if (!n_loop->is_member(u_loop)) {
+              continue;
+            }
+            if (u->is_Phi() && u->in(0) == n_loop->_head) {
+              assert(_igvn.type(u) == Type::MEMORY, "bad phi");
+              assert(phi == NULL, "already found");
+              phi = u;
+              continue;
+            }
+          }
+          phi = NULL;
+          break;
+        }
+        if (phi != NULL) {
+          // Nothing in the loop before the store (next iteration)
+          // must observe the stored value
+          bool mem_ok = true;
+          {
+            ResourceMark rm;
+            Unique_Node_List wq;
+            wq.push(phi);
+            for (uint next = 0; next < wq.size() && mem_ok; ++next) {
+              Node *m = wq.at(next);
+              for (DUIterator_Fast imax, i = m->fast_outs(imax); i < imax && mem_ok; i++) {
+                Node* u = m->fast_out(i);
+                if (u->is_Store() || u->is_Phi()) {
+                  if (u != n) {
+                    wq.push(u);
+                    mem_ok = (wq.size() <= 10);
+                  }
+                } else {
+                  mem_ok = false;
+                  break;
+                }
+              }
+            }
+          }
+          if (mem_ok) {
+            // Move the Store out of the loop, creating clones along
+            // all paths out of the loop that observe the stored value
+            _igvn.rehash_node_delayed(phi);
+            int count = phi->replace_edge(n, n->in(MemNode::Memory));
+            assert(count > 0, "inconsistent phi");
+            for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
+              Node* u = n->fast_out(i);
+              Node* c = get_ctrl(u);
+
+              if (u->is_Phi()) {
+                c = u->in(0)->in(u->find_edge(n));
+              }
+              IdealLoopTree *u_loop = get_loop(c);
+              assert (!n_loop->is_member(u_loop), "only the phi should have been a use in the loop");
+              while(true) {
+                Node* next_c = find_non_split_ctrl(idom(c));
+                if (n_loop->is_member(get_loop(next_c))) {
+                  break;
+                }
+                c = next_c;
+              }
+
+              Node* st = n->clone();
+              st->set_req(0, c);
+              _igvn.register_new_node_with_optimizer(st);
+
+              set_ctrl(st, c);
+              IdealLoopTree* new_loop = get_loop(c);
+              assert(new_loop != n_loop, "should be moved out of loop");
+              if (new_loop->_child == NULL) new_loop->_body.push(st);
+
+              _igvn.replace_input_of(u, u->find_edge(n), st);
+              --imax;
+              --i;
+            }
+
+
+            assert(n->outcnt() == 0, "all uses should be gone");
+            _igvn.replace_input_of(n, MemNode::Memory, C->top());
+            // Disconnect the phi now. An empty phi can confuse other
+            // optimizations in this pass of loop opts.
+            if (phi->in(LoopNode::LoopBackControl) == phi) {
+              _igvn.replace_node(phi, phi->in(LoopNode::EntryControl));
+              n_loop->_body.yank(phi);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
 //------------------------------split_if_with_blocks_pre-----------------------
 // Do the real work in a non-recursive function.  Data nodes want to be
 // cloned in the pre-order so they can feed each other nicely.
@@ -683,6 +886,11 @@
   Node *n_ctrl = get_ctrl(n);
   if( !n_ctrl ) return n;       // Dead node
 
+  Node* res = try_move_store_before_loop(n, n_ctrl);
+  if (res != NULL) {
+    return n;
+  }
+
   // Attempt to remix address expressions for loop invariants
   Node *m = remix_address_expressions( n );
   if( m ) return m;
@@ -691,16 +899,18 @@
   // Returns the block to clone thru.
   Node *n_blk = has_local_phi_input( n );
   if( !n_blk ) return n;
+
   // Do not clone the trip counter through on a CountedLoop
   // (messes up the canonical shape).
   if( n_blk->is_CountedLoop() && n->Opcode() == Op_AddI ) return n;
 
   // Check for having no control input; not pinned.  Allow
   // dominating control.
-  if( n->in(0) ) {
+  if (n->in(0)) {
     Node *dom = idom(n_blk);
-    if( dom_lca( n->in(0), dom ) != n->in(0) )
+    if (dom_lca(n->in(0), dom) != n->in(0)) {
       return n;
+    }
   }
   // Policy: when is it profitable.  You must get more wins than
   // policy before it is considered profitable.  Policy is usually 0,
@@ -1029,6 +1239,8 @@
     }
   }
 
+  try_move_store_after_loop(n);
+
   // Check for Opaque2's who's loop has disappeared - who's input is in the
   // same loop nest as their output.  Remove 'em, they are no longer useful.
   if( n_op == Op_Opaque2 &&
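
A source-level sketch of the code shapes the two new routines target, assuming the preconditions spelled out in their comments hold (plain C++ with invented globals; this shows the intended effect, not the C2 implementation):

    #include <cassert>
    #include <cstddef>

    int  g_flag;  // loop-invariant address, loop-invariant value: candidate for moving before the loop
    long g_sum;   // loop-invariant address, value changes each iteration: candidate for moving after the loop

    long before(const long* a, size_t n) {
      long s = 0;
      for (size_t i = 0; i < n; i++) {
        g_flag = 1;      // same value stored every iteration, nothing reads it in the loop
        s += a[i];
        g_sum = s;       // last store on its slice, nothing reads it in the loop
      }
      return s;
    }

    long after(const long* a, size_t n) {   // the shape the optimizer aims for
      long s = 0;
      if (n > 0) {
        g_flag = 1;                         // hoisted before the loop
        for (size_t i = 0; i < n; i++) {
          s += a[i];
        }
        g_sum = s;                          // sunk after the loop
      }
      return s;
    }

    int main() {
      const long a[] = { 1, 2, 3 };
      assert(before(a, 3) == after(a, 3) && g_sum == 6);
      return 0;
    }
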
--- a/src/share/vm/opto/macro.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/opto/macro.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -324,18 +324,28 @@
         return in;
       } else if (in->is_Call()) {
         CallNode *call = in->as_Call();
-        if (!call->may_modify(tinst, phase)) {
-          mem = call->in(TypeFunc::Memory);
+        if (call->may_modify(tinst, phase)) {
+          assert(call->is_ArrayCopy(), "ArrayCopy is the only call node that doesn't make allocation escape");
+
+          if (call->as_ArrayCopy()->modifies(offset, offset, phase, false)) {
+            return in;
+          }
         }
         mem = in->in(TypeFunc::Memory);
       } else if (in->is_MemBar()) {
+        if (ArrayCopyNode::may_modify(tinst, in->as_MemBar(), phase)) {
+          assert(in->in(0)->is_Proj() && in->in(0)->in(0)->is_ArrayCopy(), "should be arraycopy");
+          ArrayCopyNode* ac = in->in(0)->in(0)->as_ArrayCopy();
+          assert(ac->is_clonebasic(), "Only basic clone is a non escaping clone");
+          return ac;
+        }
         mem = in->in(TypeFunc::Memory);
       } else {
         assert(false, "unexpected projection");
       }
     } else if (mem->is_Store()) {
       const TypePtr* atype = mem->as_Store()->adr_type();
-      int adr_idx = Compile::current()->get_alias_index(atype);
+      int adr_idx = phase->C->get_alias_index(atype);
       if (adr_idx == alias_idx) {
         assert(atype->isa_oopptr(), "address type must be oopptr");
         int adr_offset = atype->offset();
@@ -373,7 +383,7 @@
         adr = mem->in(3); // Destination array
       }
       const TypePtr* atype = adr->bottom_type()->is_ptr();
-      int adr_idx = Compile::current()->get_alias_index(atype);
+      int adr_idx = phase->C->get_alias_index(atype);
       if (adr_idx == alias_idx) {
         assert(false, "Object is not scalar replaceable if a LoadStore node access its field");
         return NULL;
@@ -386,12 +396,63 @@
   }
 }
 
+// Generate loads from the source of the arraycopy for fields of the
+// destination needed at a deoptimization point
+Node* PhaseMacroExpand::make_arraycopy_load(ArrayCopyNode* ac, intptr_t offset, Node* ctl, BasicType ft, const Type *ftype, AllocateNode *alloc) {
+  BasicType bt = ft;
+  const Type *type = ftype;
+  if (ft == T_NARROWOOP) {
+    bt = T_OBJECT;
+    type = ftype->make_oopptr();
+  }
+  Node* res = NULL;
+  if (ac->is_clonebasic()) {
+    Node* base = ac->in(ArrayCopyNode::Src)->in(AddPNode::Base);
+    Node* adr = _igvn.transform(new AddPNode(base, base, MakeConX(offset)));
+    const TypePtr* adr_type = _igvn.type(base)->is_ptr()->add_offset(offset);
+    Node* m = ac->in(TypeFunc::Memory);
+    while (m->is_MergeMem()) {
+      m = m->as_MergeMem()->memory_at(C->get_alias_index(adr_type));
+      if (m->is_Proj() && m->in(0)->is_MemBar()) {
+        m = m->in(0)->in(TypeFunc::Memory);
+      }
+    }
+    res = LoadNode::make(_igvn, ctl, m, adr, adr_type, type, bt, MemNode::unordered, LoadNode::Pinned);
+  } else {
+    if (ac->modifies(offset, offset, &_igvn, true)) {
+      assert(ac->in(ArrayCopyNode::Dest) == alloc->result_cast(), "arraycopy destination should be allocation's result");
+      uint shift  = exact_log2(type2aelembytes(bt));
+      Node* diff = _igvn.transform(new SubINode(ac->in(ArrayCopyNode::SrcPos), ac->in(ArrayCopyNode::DestPos)));
+#ifdef _LP64
+      diff = _igvn.transform(new ConvI2LNode(diff));
+#endif
+      diff = _igvn.transform(new LShiftXNode(diff, intcon(shift)));
+
+      Node* off = _igvn.transform(new AddXNode(MakeConX(offset), diff));
+      Node* base = ac->in(ArrayCopyNode::Src);
+      Node* adr = _igvn.transform(new AddPNode(base, base, off));
+      const TypePtr* adr_type = _igvn.type(base)->is_ptr()->add_offset(offset);
+      Node* m = ac->in(TypeFunc::Memory);
+      res = LoadNode::make(_igvn, ctl, m, adr, adr_type, type, bt, MemNode::unordered, LoadNode::Pinned);
+    }
+  }
+  if (res != NULL) {
+    res = _igvn.transform(res);
+    if (ftype->isa_narrowoop()) {
+      // PhaseMacroExpand::scalar_replacement adds DecodeN nodes
+      res = _igvn.transform(new EncodePNode(res, ftype));
+    }
+    return res;
+  }
+  return NULL;
+}
+
 //
 // Given a Memory Phi, compute a value Phi containing the values from stores
 // on the input paths.
-// Note: this function is recursive, its depth is limied by the "level" argument
+// Note: this function is recursive, its depth is limited by the "level" argument
 // Returns the computed Phi, or NULL if it cannot compute it.
-Node *PhaseMacroExpand::value_from_mem_phi(Node *mem, BasicType ft, const Type *phi_type, const TypeOopPtr *adr_t, Node *alloc, Node_Stack *value_phis, int level) {
+Node *PhaseMacroExpand::value_from_mem_phi(Node *mem, BasicType ft, const Type *phi_type, const TypeOopPtr *adr_t, AllocateNode *alloc, Node_Stack *value_phis, int level) {
   assert(mem->is_Phi(), "sanity");
   int alias_idx = C->get_alias_index(adr_t);
   int offset = adr_t->offset();
@@ -458,6 +519,12 @@
         assert(val->in(0)->is_LoadStore() || val->in(0)->Opcode() == Op_EncodeISOArray, "sanity");
         assert(false, "Object is not scalar replaceable if a LoadStore node access its field");
         return NULL;
+      } else if (val->is_ArrayCopy()) {
+        Node* res = make_arraycopy_load(val->as_ArrayCopy(), offset, val->in(0), ft, phi_type, alloc);
+        if (res == NULL) {
+          return NULL;
+        }
+        values.at_put(j, res);
       } else {
 #ifdef ASSERT
         val->dump();
@@ -479,7 +546,7 @@
 }
 
 // Search the last value stored into the object's field.
-Node *PhaseMacroExpand::value_from_mem(Node *sfpt_mem, BasicType ft, const Type *ftype, const TypeOopPtr *adr_t, Node *alloc) {
+Node *PhaseMacroExpand::value_from_mem(Node *sfpt_mem, Node *sfpt_ctl, BasicType ft, const Type *ftype, const TypeOopPtr *adr_t, AllocateNode *alloc) {
   assert(adr_t->is_known_instance_field(), "instance required");
   int instance_id = adr_t->instance_id();
   assert((uint)instance_id == alloc->_idx, "wrong allocation");
@@ -538,6 +605,8 @@
       } else {
         done = true;
       }
+    } else if (mem->is_ArrayCopy()) {
+      done = true;
     } else {
       assert(false, "unexpected node");
     }
@@ -562,6 +631,13 @@
           value_phis.pop();
         }
       }
+    } else if (mem->is_ArrayCopy()) {
+      Node* ctl = mem->in(0);
+      if (sfpt_ctl->is_Proj() && sfpt_ctl->as_Proj()->is_uncommon_trap_proj(Deoptimization::Reason_none)) {
+        // pin the loads in the uncommon trap path
+        ctl = sfpt_ctl;
+      }
+      return make_arraycopy_load(mem->as_ArrayCopy(), offset, ctl, ft, ftype, alloc);
     }
   }
   // Something go wrong.
@@ -738,6 +814,7 @@
   while (safepoints.length() > 0) {
     SafePointNode* sfpt = safepoints.pop();
     Node* mem = sfpt->memory();
+    Node* ctl = sfpt->control();
     assert(sfpt->jvms() != NULL, "missed JVMS");
     // Fields of scalar objs are referenced only at the end
     // of regular debuginfo at the last (youngest) JVMS.
@@ -789,7 +866,7 @@
 
       const TypeOopPtr *field_addr_type = res_type->add_offset(offset)->isa_oopptr();
 
-      Node *field_val = value_from_mem(mem, basic_elem_type, field_type, field_addr_type, alloc);
+      Node *field_val = value_from_mem(mem, ctl, basic_elem_type, field_type, field_addr_type, alloc);
       if (field_val == NULL) {
         // We weren't able to find a value for this field,
         // give up on eliminating this allocation.
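
The offset arithmetic in make_arraycopy_load() for the arraycopy (non-clone) case can be sketched in isolation: the destination offset is shifted by (src_pos - dest_pos), scaled to bytes, to reach the element the copied value came from (C2 does the scaling with a left shift by log2 of the element size). A standalone illustration (plain C++; the function name and the values in main() are assumptions made up for this example):

    #include <cassert>
    #include <cstdint>

    // Map a byte offset in the destination array back to the corresponding
    // byte offset in the source array of an arraycopy.
    int64_t source_offset(int64_t dest_offset, int32_t src_pos, int32_t dest_pos, int elem_size) {
      int64_t diff = static_cast<int64_t>(src_pos - dest_pos) * elem_size;  // element delta in bytes
      return dest_offset + diff;
    }

    int main() {
      // Copying ints (4 bytes) from src_pos 2 to dest_pos 5, with an assumed
      // 16-byte array header: the value seen at dest offset 16 + 5*4 = 36
      // was loaded from source offset 16 + 2*4 = 24.
      assert(source_offset(36, 2, 5, 4) == 24);
      return 0;
    }
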
--- a/src/share/vm/opto/macro.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/opto/macro.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -85,8 +85,8 @@
                               Node* length,
                               const TypeFunc* slow_call_type,
                               address slow_call_address);
-  Node *value_from_mem(Node *mem, BasicType ft, const Type *ftype, const TypeOopPtr *adr_t, Node *alloc);
-  Node *value_from_mem_phi(Node *mem, BasicType ft, const Type *ftype, const TypeOopPtr *adr_t, Node *alloc, Node_Stack *value_phis, int level);
+  Node *value_from_mem(Node *mem, Node *ctl, BasicType ft, const Type *ftype, const TypeOopPtr *adr_t, AllocateNode *alloc);
+  Node *value_from_mem_phi(Node *mem, BasicType ft, const Type *ftype, const TypeOopPtr *adr_t, AllocateNode *alloc, Node_Stack *value_phis, int level);
 
   bool eliminate_boxing_node(CallStaticJavaNode *boxing);
   bool eliminate_allocate_node(AllocateNode *alloc);
@@ -200,6 +200,8 @@
                             Node* old_eden_top, Node* new_eden_top,
                             Node* length);
 
+  Node* make_arraycopy_load(ArrayCopyNode* ac, intptr_t offset, Node* ctl, BasicType ft, const Type *ftype, AllocateNode *alloc);
+
 public:
   PhaseMacroExpand(PhaseIterGVN &igvn) : Phase(Macro_Expand), _igvn(igvn), _has_locks(false) {
     _igvn.set_delay_transform(true);
--- a/src/share/vm/opto/memnode.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/opto/memnode.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -108,37 +108,6 @@
 
 #endif
 
-static bool membar_for_arraycopy_helper(const TypeOopPtr *t_oop, Node* n, PhaseTransform *phase) {
-  if (n->is_Proj()) {
-    n = n->in(0);
-    if (n->is_Call() && n->as_Call()->may_modify(t_oop, phase)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-static bool membar_for_arraycopy(const TypeOopPtr *t_oop, MemBarNode* mb, PhaseTransform *phase) {
-  Node* mem = mb->in(TypeFunc::Memory);
-
-  if (mem->is_MergeMem()) {
-    Node* n = mem->as_MergeMem()->memory_at(Compile::AliasIdxRaw);
-    if (membar_for_arraycopy_helper(t_oop, n, phase)) {
-      return true;
-    } else if (n->is_Phi()) {
-      for (uint i = 1; i < n->req(); i++) {
-        if (n->in(i) != NULL) {
-          if (membar_for_arraycopy_helper(t_oop, n->in(i), phase)) {
-            return true;
-          }
-        }
-      }
-    }
-  }
-
-  return false;
-}
-
 Node *MemNode::optimize_simple_memory_chain(Node *mchain, const TypeOopPtr *t_oop, Node *load, PhaseGVN *phase) {
   assert((t_oop != NULL), "sanity");
   bool is_instance = t_oop->is_known_instance_field();
@@ -183,7 +152,7 @@
           }
         }
       } else if (proj_in->is_MemBar()) {
-        if (membar_for_arraycopy(t_oop, proj_in->as_MemBar(), phase)) {
+        if (ArrayCopyNode::may_modify(t_oop, proj_in->as_MemBar(), phase)) {
           break;
         }
         result = proj_in->in(TypeFunc::Memory);
@@ -545,35 +514,12 @@
         Node* dest = ac->in(ArrayCopyNode::Dest);
 
         if (dest == ld_base) {
-          Node* src_pos = ac->in(ArrayCopyNode::SrcPos);
-          Node* dest_pos = ac->in(ArrayCopyNode::DestPos);
-          Node* len = ac->in(ArrayCopyNode::Length);
-
-          const TypeInt *dest_pos_t = phase->type(dest_pos)->isa_int();
           const TypeX *ld_offs_t = phase->type(ld_offs)->isa_intptr_t();
-          const TypeInt *len_t = phase->type(len)->isa_int();
-          const TypeAryPtr* ary_t = phase->type(dest)->isa_aryptr();
-
-          if (dest_pos_t != NULL && ld_offs_t != NULL && len_t != NULL && ary_t != NULL) {
-            BasicType ary_elem  = ary_t->klass()->as_array_klass()->element_type()->basic_type();
-            uint header = arrayOopDesc::base_offset_in_bytes(ary_elem);
-            uint elemsize = type2aelembytes(ary_elem);
-
-            intptr_t dest_pos_plus_len_lo = (((intptr_t)dest_pos_t->_lo) + len_t->_lo) * elemsize + header;
-            intptr_t dest_pos_plus_len_hi = (((intptr_t)dest_pos_t->_hi) + len_t->_hi) * elemsize + header;
-            intptr_t dest_pos_lo = ((intptr_t)dest_pos_t->_lo) * elemsize + header;
-            intptr_t dest_pos_hi = ((intptr_t)dest_pos_t->_hi) * elemsize + header;
-
-            if (can_see_stored_value) {
-              if (ld_offs_t->_lo >= dest_pos_hi && ld_offs_t->_hi < dest_pos_plus_len_lo) {
-                return ac;
-              }
-            } else {
-              if (ld_offs_t->_hi < dest_pos_lo || ld_offs_t->_lo >= dest_pos_plus_len_hi) {
-                mem = ac->in(TypeFunc::Memory);
-              }
-              return ac;
-            }
+          if (ac->modifies(ld_offs_t->_lo, ld_offs_t->_hi, phase, can_see_stored_value)) {
+            return ac;
+          }
+          if (!can_see_stored_value) {
+            mem = ac->in(TypeFunc::Memory);
           }
         }
       }
@@ -703,7 +649,7 @@
           continue;         // (a) advance through independent call memory
         }
       } else if (mem->is_Proj() && mem->in(0)->is_MemBar()) {
-        if (membar_for_arraycopy(addr_t, mem->in(0)->as_MemBar(), phase)) {
+        if (ArrayCopyNode::may_modify(addr_t, mem->in(0)->as_MemBar(), phase)) {
           break;
         }
         mem = mem->in(0)->in(TypeFunc::Memory);
@@ -883,18 +829,17 @@
 // Is the value loaded previously stored by an arraycopy? If so return
 // a load node that reads from the source array so we may be able to
 // optimize out the ArrayCopy node later.
-Node* MemNode::can_see_arraycopy_value(Node* st, PhaseTransform* phase) const {
+Node* LoadNode::can_see_arraycopy_value(Node* st, PhaseTransform* phase) const {
   Node* ld_adr = in(MemNode::Address);
   intptr_t ld_off = 0;
   AllocateNode* ld_alloc = AllocateNode::Ideal_allocation(ld_adr, phase, ld_off);
   Node* ac = find_previous_arraycopy(phase, ld_alloc, st, true);
   if (ac != NULL) {
     assert(ac->is_ArrayCopy(), "what kind of node can this be?");
-    assert(is_Load(), "only for loads");
-
+
+    Node* ld = clone();
     if (ac->as_ArrayCopy()->is_clonebasic()) {
       assert(ld_alloc != NULL, "need an alloc");
-      Node* ld = clone();
       Node* addp = in(MemNode::Address)->clone();
       assert(addp->is_AddP(), "address must be addp");
       assert(addp->in(AddPNode::Base) == ac->in(ArrayCopyNode::Dest)->in(AddPNode::Base), "strange pattern");
@@ -906,9 +851,7 @@
         assert(ld_alloc->in(0) != NULL, "alloc must have control");
         ld->set_req(0, ld_alloc->in(0));
       }
-      return ld;
     } else {
-      Node* ld = clone();
       Node* addp = in(MemNode::Address)->clone();
       assert(addp->in(AddPNode::Base) == addp->in(AddPNode::Address), "should be");
       addp->set_req(AddPNode::Base, ac->in(ArrayCopyNode::Src));
@@ -933,8 +876,10 @@
         assert(ac->in(0) != NULL, "alloc must have control");
         ld->set_req(0, ac->in(0));
       }
-      return ld;
     }
+    // load depends on the tests that validate the arraycopy
+    ld->as_Load()->_depends_only_on_test = Pinned;
+    return ld;
   }
   return NULL;
 }
@@ -2426,40 +2371,47 @@
 
   Node* mem     = in(MemNode::Memory);
   Node* address = in(MemNode::Address);
-
   // Back-to-back stores to same address?  Fold em up.  Generally
   // unsafe if I have intervening uses...  Also disallowed for StoreCM
   // since they must follow each StoreP operation.  Redundant StoreCMs
   // are eliminated just before matching in final_graph_reshape.
-  if (mem->is_Store() && mem->in(MemNode::Address)->eqv_uncast(address) &&
-      mem->Opcode() != Op_StoreCM) {
-    // Looking at a dead closed cycle of memory?
-    assert(mem != mem->in(MemNode::Memory), "dead loop in StoreNode::Ideal");
-
-    assert(Opcode() == mem->Opcode() ||
-           phase->C->get_alias_index(adr_type()) == Compile::AliasIdxRaw,
-           "no mismatched stores, except on raw memory");
-
-    if (mem->outcnt() == 1 &&           // check for intervening uses
-        mem->as_Store()->memory_size() <= this->memory_size()) {
-      // If anybody other than 'this' uses 'mem', we cannot fold 'mem' away.
-      // For example, 'mem' might be the final state at a conditional return.
-      // Or, 'mem' might be used by some node which is live at the same time
-      // 'this' is live, which might be unschedulable.  So, require exactly
-      // ONE user, the 'this' store, until such time as we clone 'mem' for
-      // each of 'mem's uses (thus making the exactly-1-user-rule hold true).
-      if (can_reshape) {  // (%%% is this an anachronism?)
-        set_req_X(MemNode::Memory, mem->in(MemNode::Memory),
-                  phase->is_IterGVN());
-      } else {
-        // It's OK to do this in the parser, since DU info is always accurate,
-        // and the parser always refers to nodes via SafePointNode maps.
-        set_req(MemNode::Memory, mem->in(MemNode::Memory));
+  {
+    Node* st = mem;
+    // If Store 'st' has more than one use, we cannot fold 'st' away.
+    // For example, 'st' might be the final state at a conditional
+    // return.  Or, 'st' might be used by some node which is live at
+    // the same time 'st' is live, which might be unschedulable.  So,
+    // require exactly ONE user until such time as we clone 'mem' for
+    // each of 'mem's uses (thus making the exactly-1-user-rule hold
+    // true).
+    while (st->is_Store() && st->outcnt() == 1 && st->Opcode() != Op_StoreCM) {
+      // Looking at a dead closed cycle of memory?
+      assert(st != st->in(MemNode::Memory), "dead loop in StoreNode::Ideal");
+      assert(Opcode() == st->Opcode() ||
+             st->Opcode() == Op_StoreVector ||
+             Opcode() == Op_StoreVector ||
+             phase->C->get_alias_index(adr_type()) == Compile::AliasIdxRaw ||
+             (Opcode() == Op_StoreL && st->Opcode() == Op_StoreI), // expanded ClearArrayNode
+             err_msg_res("no mismatched stores, except on raw memory: %s %s", NodeClassNames[Opcode()], NodeClassNames[st->Opcode()]));
+
+      if (st->in(MemNode::Address)->eqv_uncast(address) &&
+          st->as_Store()->memory_size() <= this->memory_size()) {
+        Node* use = st->raw_out(0);
+        phase->igvn_rehash_node_delayed(use);
+        if (can_reshape) {
+          use->set_req_X(MemNode::Memory, st->in(MemNode::Memory), phase->is_IterGVN());
+        } else {
+          // It's OK to do this in the parser, since DU info is always accurate,
+          // and the parser always refers to nodes via SafePointNode maps.
+          use->set_req(MemNode::Memory, st->in(MemNode::Memory));
+        }
+        return this;
       }
-      return this;
+      st = st->in(MemNode::Memory);
     }
   }
 
+
   // Capture an unaliased, unconditional, simple store into an initializer.
   // Or, if it is independent of the allocation, hoist it above the allocation.
   if (ReduceFieldZeroing && /*can_reshape &&*/
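
The reworked StoreNode::Ideal() above walks a chain of single-use stores instead of looking only one memory edge back. A minimal standalone sketch of that walk (plain C++ with invented types standing in for the Node/IGVN machinery, and without the StoreCM and mismatched-store checks):

    // A store in a memory chain: 'memory_in' is the previous memory state,
    // 'use_count' says how many nodes consume this store's memory state.
    struct Store {
      const void* address;
      int         size;
      int         use_count;
      Store*      memory_in;
    };

    // Walk back through single-use stores; when an earlier one writes the same
    // address with a size no larger than ours, splice it out of the chain
    // (its single user now reads the memory state the dead store read).
    bool fold_redundant_store(Store* self) {
      Store* user = self;  // consumer of the store currently being examined
      for (Store* st = self->memory_in; st != nullptr && st->use_count == 1; st = st->memory_in) {
        if (st->address == self->address && st->size <= self->size) {
          user->memory_in = st->memory_in;   // bypass the dead store
          return true;
        }
        user = st;
      }
      return false;
    }

    int main() {
      int x = 0, y = 0;
      Store first  = { &x, 4, 1, nullptr };  // earlier store to x, used only by 'second'
      Store second = { &y, 4, 1, &first };   // intervening store to a different address
      Store last   = { &x, 4, 1, &second };  // completely overwrites 'first'
      return fold_redundant_store(&last) ? 0 : 1;
    }
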
--- a/src/share/vm/opto/memnode.hpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/opto/memnode.hpp	Thu Aug 27 14:40:19 2015 -0700
@@ -126,7 +126,6 @@
   // Can this node (load or store) accurately see a stored value in
   // the given memory state?  (The state may or may not be in(Memory).)
   Node* can_see_stored_value(Node* st, PhaseTransform* phase) const;
-  Node* can_see_arraycopy_value(Node* st, PhaseTransform* phase) const;
 
 #ifndef PRODUCT
   static void dump_adr_type(const Node* mem, const TypePtr* adr_type, outputStream *st);
@@ -252,6 +251,9 @@
 protected:
   const Type* load_array_final_field(const TypeKlassPtr *tkls,
                                      ciKlass* klass) const;
+
+  Node* can_see_arraycopy_value(Node* st, PhaseTransform* phase) const;
+
   // depends_only_on_test is almost always true, and needs to be almost always
   // true to enable key hoisting & commoning optimizations.  However, for the
   // special case of RawPtr loads from TLS top & end, and other loads performed by
--- a/src/share/vm/prims/jvmtiRedefineClasses.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/prims/jvmtiRedefineClasses.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -3771,7 +3771,7 @@
     // Deoptimize all activations depending on marked nmethods
     Deoptimization::deoptimize_dependents();
 
-    // Make the dependent methods not entrant (in VM_Deoptimize they are made zombies)
+    // Make the dependent methods not entrant
     CodeCache::make_marked_nmethods_not_entrant();
 
     // From now on we know that the dependency information is complete
--- a/src/share/vm/runtime/arguments.cpp	Thu Aug 27 12:59:50 2015 -0700
+++ b/src/share/vm/runtime/arguments.cpp	Thu Aug 27 14:40:19 2015 -0700
@@ -81,8 +81,6 @@
 bool   Arguments::_has_profile                  = false;
 size_t Arguments::_conservative_max_heap_alignment = 0;
 size_t Arguments::_min_heap_size                = 0;
-uintx  Arguments::_min_heap_free_ratio          = 0;
-uintx  Arguments::_max_heap_free_ratio          = 0;
 Arguments::Mode Arguments::_mode                = _mixed;
 bool   Arguments::_java_compiler                = false;
 bool   Arguments::_xdebug_mode                  = false;
@@ -1614,11 +1612,9 @@
     // unless the user actually sets these flags.
     if (FLAG_IS_DEFAULT(MinHeapFreeRatio)) {
       FLAG_SET_DEFAULT(MinHeapFreeRatio, 0);
-      _min_heap_