Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions make/autoconf/flags-cflags.m4
Original file line number Diff line number Diff line change
Expand Up @@ -784,6 +784,14 @@ AC_DEFUN([FLAGS_SETUP_CFLAGS_CPU_DEP],
elif test "x$TOOLCHAIN_TYPE" = xmicrosoft; then
if test "x$FLAGS_CPU" = xx86; then
$1_CFLAGS_CPU_JVM="-arch:IA32"
elif test "x$FLAGS_CPU" = xaarch64; then
# MSVC defaults to /volatile:iso on ARM64, which makes volatile reads/writes
# plain LDR/STR with no acquire/release barriers. HotSpot's C++ runtime code
# was written assuming volatile provides acquire/release semantics (as on x86
# and GCC/Clang AArch64). Use /volatile:ms to restore those semantics and
# prevent memory ordering bugs in ObjectMonitor, ParkEvent, and other
# lock-free algorithms that use plain volatile fields.
$1_CFLAGS_CPU_JVM="-volatile:ms"
elif test "x$OPENJDK_TARGET_CPU" = xx86_64; then
if test "x$DEBUG_LEVEL" != xrelease; then
# NOTE: This is probably redundant; -homeparams is default on
Expand Down
197 changes: 197 additions & 0 deletions src/hotspot/os_cpu/windows_aarch64/atomicAccess_windows_aarch64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,4 +114,201 @@ DEFINE_INTRINSIC_CMPXCHG(InterlockedCompareExchange64, __int64)

#undef DEFINE_INTRINSIC_CMPXCHG

// Override PlatformLoad and PlatformStore to use LDAR/STLR on Windows AArch64.
//
// The generic PlatformLoad and PlatformStore use plain volatile dereferences.
// With /volatile:ms (set in flags-cflags.m4 for AArch64), MSVC already compiles
// those to LDAR/STLR, so these overrides produce identical codegen. They are
// retained as defense-in-depth: they guarantee acquire/release semantics for
// AtomicAccess::load()/AtomicAccess::store() regardless of the compiler flag setting,
// ensuring correct cross-core visibility for HotSpot's lock-free algorithms
// (ObjectMonitor Dekker protocols, ParkEvent signaling, etc.) even if
// /volatile:ms were ever removed or overridden.

template<>
struct AtomicAccess::PlatformLoad<1> {
  // Acquire load of a 1-byte value via the MSVC __ldar8 intrinsic
  // (a single load-acquire instruction, per the header comment above).
  template<typename T>
  T operator()(T const volatile* dest) const {
    STATIC_ASSERT(sizeof(T) == 1);
    // const_cast only drops constness to match the intrinsic's parameter
    // type; volatility is preserved. PrimitiveConversions::cast converts
    // the raw 8-bit value back to T.
    return PrimitiveConversions::cast<T>(
        __ldar8(reinterpret_cast<unsigned __int8 volatile*>(
            const_cast<T volatile*>(dest))));
  }
};

template<>
struct AtomicAccess::PlatformLoad<2> {
  // Acquire load of a 2-byte value via the MSVC __ldar16 intrinsic.
  template<typename T>
  T operator()(T const volatile* dest) const {
    STATIC_ASSERT(sizeof(T) == 2);
    // const_cast only drops constness for the intrinsic; volatility is kept.
    return PrimitiveConversions::cast<T>(
        __ldar16(reinterpret_cast<unsigned __int16 volatile*>(
            const_cast<T volatile*>(dest))));
  }
};

template<>
struct AtomicAccess::PlatformLoad<4> {
  // Acquire load of a 4-byte value via the MSVC __ldar32 intrinsic.
  template<typename T>
  T operator()(T const volatile* dest) const {
    STATIC_ASSERT(sizeof(T) == 4);
    // const_cast only drops constness for the intrinsic; volatility is kept.
    return PrimitiveConversions::cast<T>(
        __ldar32(reinterpret_cast<unsigned __int32 volatile*>(
            const_cast<T volatile*>(dest))));
  }
};

template<>
struct AtomicAccess::PlatformLoad<8> {
  // Acquire load of an 8-byte value via the MSVC __ldar64 intrinsic.
  template<typename T>
  T operator()(T const volatile* dest) const {
    STATIC_ASSERT(sizeof(T) == 8);
    // const_cast only drops constness for the intrinsic; volatility is kept.
    return PrimitiveConversions::cast<T>(
        __ldar64(reinterpret_cast<unsigned __int64 volatile*>(
            const_cast<T volatile*>(dest))));
  }
};

template<>
struct AtomicAccess::PlatformStore<1> {
  // Release store of a 1-byte value via the MSVC __stlr8 intrinsic
  // (a single store-release instruction, per the header comment above).
  template<typename T>
  void operator()(T volatile* dest, T new_value) const {
    STATIC_ASSERT(sizeof(T) == 1);
    // PrimitiveConversions::cast converts T to the raw 8-bit value the
    // intrinsic stores.
    __stlr8(reinterpret_cast<unsigned __int8 volatile*>(dest),
            PrimitiveConversions::cast<unsigned __int8>(new_value));
  }
};

template<>
struct AtomicAccess::PlatformStore<2> {
  // Release store of a 2-byte value via the MSVC __stlr16 intrinsic.
  template<typename T>
  void operator()(T volatile* dest, T new_value) const {
    STATIC_ASSERT(sizeof(T) == 2);
    __stlr16(reinterpret_cast<unsigned __int16 volatile*>(dest),
             PrimitiveConversions::cast<unsigned __int16>(new_value));
  }
};

template<>
struct AtomicAccess::PlatformStore<4> {
  // Release store of a 4-byte value via the MSVC __stlr32 intrinsic.
  template<typename T>
  void operator()(T volatile* dest, T new_value) const {
    STATIC_ASSERT(sizeof(T) == 4);
    __stlr32(reinterpret_cast<unsigned __int32 volatile*>(dest),
             PrimitiveConversions::cast<unsigned __int32>(new_value));
  }
};

template<>
struct AtomicAccess::PlatformStore<8> {
  // Release store of an 8-byte value via the MSVC __stlr64 intrinsic.
  template<typename T>
  void operator()(T volatile* dest, T new_value) const {
    STATIC_ASSERT(sizeof(T) == 8);
    __stlr64(reinterpret_cast<unsigned __int64 volatile*>(dest),
             PrimitiveConversions::cast<unsigned __int64>(new_value));
  }
};

// Specialize PlatformOrderedLoad and PlatformOrderedStore to use MSVC's
// __ldar/__stlr intrinsics, matching the Linux AArch64 implementation which
// uses __atomic_load/__atomic_store with __ATOMIC_ACQUIRE/__ATOMIC_RELEASE.
// These emit single LDAR/STLR instructions that have acquire/release semantics
// baked in, rather than the generic fallback of separate dmb + plain load/store.
// On AArch64, LDAR/STLR additionally order with each other: an STLR followed
// by an LDAR is not reordered (the RCsc property of these instructions), a
// guarantee that dmb-bracketed plain ldr/str acquire/release sequences do not
// give for store-release/load-acquire interactions (Dekker patterns, etc.).

template<>
struct AtomicAccess::PlatformOrderedLoad<1, X_ACQUIRE> {
  // load_acquire for a 1-byte value: MSVC __ldar8 intrinsic, same codegen
  // as the PlatformLoad<1> override above.
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 1);
    // const_cast only drops constness for the intrinsic; volatility is kept.
    return PrimitiveConversions::cast<T>(
        __ldar8(reinterpret_cast<unsigned __int8 volatile*>(
            const_cast<T volatile*>(p))));
  }
};

template<>
struct AtomicAccess::PlatformOrderedLoad<2, X_ACQUIRE> {
  // load_acquire for a 2-byte value via the MSVC __ldar16 intrinsic.
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 2);
    return PrimitiveConversions::cast<T>(
        __ldar16(reinterpret_cast<unsigned __int16 volatile*>(
            const_cast<T volatile*>(p))));
  }
};

template<>
struct AtomicAccess::PlatformOrderedLoad<4, X_ACQUIRE> {
  // load_acquire for a 4-byte value via the MSVC __ldar32 intrinsic.
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 4);
    return PrimitiveConversions::cast<T>(
        __ldar32(reinterpret_cast<unsigned __int32 volatile*>(
            const_cast<T volatile*>(p))));
  }
};

template<>
struct AtomicAccess::PlatformOrderedLoad<8, X_ACQUIRE> {
  // load_acquire for an 8-byte value via the MSVC __ldar64 intrinsic.
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 8);
    return PrimitiveConversions::cast<T>(
        __ldar64(reinterpret_cast<unsigned __int64 volatile*>(
            const_cast<T volatile*>(p))));
  }
};

template<>
struct AtomicAccess::PlatformOrderedStore<1, RELEASE_X> {
  // release_store for a 1-byte value: MSVC __stlr8 intrinsic, same codegen
  // as the PlatformStore<1> override above.
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 1);
    __stlr8(reinterpret_cast<unsigned __int8 volatile*>(p),
            PrimitiveConversions::cast<unsigned __int8>(v));
  }
};

template<>
struct AtomicAccess::PlatformOrderedStore<2, RELEASE_X> {
  // release_store for a 2-byte value via the MSVC __stlr16 intrinsic.
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 2);
    __stlr16(reinterpret_cast<unsigned __int16 volatile*>(p),
             PrimitiveConversions::cast<unsigned __int16>(v));
  }
};

template<>
struct AtomicAccess::PlatformOrderedStore<4, RELEASE_X> {
  // release_store for a 4-byte value via the MSVC __stlr32 intrinsic.
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 4);
    __stlr32(reinterpret_cast<unsigned __int32 volatile*>(p),
             PrimitiveConversions::cast<unsigned __int32>(v));
  }
};

template<>
struct AtomicAccess::PlatformOrderedStore<8, RELEASE_X> {
  // release_store for an 8-byte value via the MSVC __stlr64 intrinsic.
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 8);
    __stlr64(reinterpret_cast<unsigned __int64 volatile*>(p),
             PrimitiveConversions::cast<unsigned __int64>(v));
  }
};

// release_store + fence combination, matching Linux AArch64
template<size_t byte_size>
struct AtomicAccess::PlatformOrderedStore<byte_size, RELEASE_X_FENCE> {
  // release_store_fence: delegates to the size-specialized release_store
  // (store-release intrinsic above) and then issues a full fence so the
  // store is ordered before any subsequent load (StoreLoad).
  template <typename T>
  void operator()(volatile T* p, T v) const {
    AtomicAccess::release_store(p, v);
    OrderAccess::fence();
  }
};

#endif // OS_CPU_WINDOWS_AARCH64_ATOMICACCESS_WINDOWS_AARCH64_HPP
17 changes: 12 additions & 5 deletions src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,22 +26,29 @@
#define OS_CPU_WINDOWS_AARCH64_ORDERACCESS_WINDOWS_AARCH64_HPP

// Included in orderAccess.hpp header file.
#include <atomic>
using std::atomic_thread_fence;
#include <arm64intr.h>
#include "vm_version_aarch64.hpp"
#include "runtime/vm_version.hpp"

// Implementation of class OrderAccess.
//
// Use the MSVC __dmb() intrinsic directly rather than C++ std::atomic_thread_fence().
// Microsoft documents that __dmb() "inserts compiler blocks to prevent instruction
// reordering" in addition to emitting the hardware DMB instruction. This is critical
// because HotSpot uses volatile (non-std::atomic) fields throughout the runtime, and
// std::atomic_thread_fence() is only defined by the C++ standard to order std::atomic
// operations — it may not act as a compiler barrier for volatile/non-atomic accesses
// on ARM64 with /volatile:iso. Using __dmb() ensures correct ordering for the Dekker
// protocol in ObjectMonitor::exit() and similar patterns throughout HotSpot.

inline void OrderAccess::loadload() { acquire(); }
inline void OrderAccess::storestore() { release(); }
inline void OrderAccess::loadstore() { acquire(); }
inline void OrderAccess::storeload() { fence(); }

#define READ_MEM_BARRIER atomic_thread_fence(std::memory_order_acquire);
#define WRITE_MEM_BARRIER atomic_thread_fence(std::memory_order_release);
#define FULL_MEM_BARRIER atomic_thread_fence(std::memory_order_seq_cst);
#define READ_MEM_BARRIER __dmb(_ARM64_BARRIER_ISHLD)
#define WRITE_MEM_BARRIER __dmb(_ARM64_BARRIER_ISH)
#define FULL_MEM_BARRIER __dmb(_ARM64_BARRIER_ISH)

inline void OrderAccess::acquire() {
READ_MEM_BARRIER;
Expand Down
31 changes: 31 additions & 0 deletions src/java.base/share/classes/java/lang/VirtualThread.java
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,13 @@ private void afterYield() {
setState(newState = TIMED_PARKED);
}

// Full fence (StoreLoad) to ensure the PARKED/TIMED_PARKED state
// is visible before reading parkPermit (Dekker pattern with
// unpark which writes parkPermit then reads state).
// Note: storeFence is insufficient — on ARM64 it only emits
// LoadStore+StoreStore (dmb ishst), not StoreLoad (dmb ish).
U.fullFence();

// may have been unparked while parking
if (parkPermit && compareAndSetState(newState, UNPARKED)) {
// lazy submit if local queue is empty
Expand All @@ -604,6 +611,10 @@ private void afterYield() {
if (s == BLOCKING) {
setState(BLOCKED);

// Full fence (StoreLoad) for Dekker pattern with unblock
// which writes blockPermit then reads state.
U.fullFence();

// may have been unblocked while blocking
if (blockPermit && compareAndSetState(BLOCKED, UNBLOCKED)) {
// lazy submit if local queue is empty
Expand All @@ -619,6 +630,9 @@ private void afterYield() {
boolean interruptible = interruptibleWait;
if (s == WAITING) {
setState(newState = WAIT);
// Full fence (StoreLoad) for Dekker pattern with notify
// which writes notified then reads state.
U.fullFence();
// may have been notified while in transition
blocked = notified && compareAndSetState(WAIT, BLOCKED);
} else {
Expand All @@ -635,6 +649,9 @@ private void afterYield() {
byte seqNo = ++timedWaitSeqNo;
timeoutTask = schedule(() -> waitTimeoutExpired(seqNo), timeout, MILLISECONDS);
setState(newState = TIMED_WAIT);
// Full fence (StoreLoad) for Dekker pattern with notify
// which writes notified then reads state.
U.fullFence();
// May have been notified while in transition. This must be done while
// holding the monitor to avoid changing the state of a new timed wait call.
blocked = notified && compareAndSetState(TIMED_WAIT, BLOCKED);
Expand Down Expand Up @@ -675,6 +692,15 @@ private void afterDone(boolean notifyContainer) {
assert carrierThread == null;
setState(TERMINATED);

// Full fence (StoreLoad) to ensure the TERMINATED state is
// visible before reading notifyAllAfterTerminate (Dekker pattern
// with beforeJoin which writes notifyAllAfterTerminate then
// reads state). Without this, on ARM64 the volatile write of
// state and the subsequent volatile read can be reordered,
// causing a missed-wakeup where both sides miss each other's
// store.
U.fullFence();

// notifyAll to wakeup any threads waiting for this thread to terminate
if (notifyAllAfterTerminate) {
synchronized (this) {
Expand Down Expand Up @@ -870,6 +896,10 @@ private void parkOnCarrierThread(boolean timed, long nanos) {
*/
private void unpark(boolean lazySubmit) {
if (!getAndSetParkPermit(true) && currentThread() != this) {
// Full fence (StoreLoad) to ensure parkPermit=true is visible
// before reading state (Dekker pattern with afterYield PARKING
// path which writes state then reads parkPermit).
U.fullFence();
int s = state();

// unparked while parked
Expand Down Expand Up @@ -912,6 +942,7 @@ void unpark() {
private void unblock() {
assert !Thread.currentThread().isVirtual();
blockPermit = true;
U.fullFence(); // Full fence (StoreLoad) for Dekker with afterYield BLOCKING path
if (state() == BLOCKED && compareAndSetState(BLOCKED, UNBLOCKED)) {
submitRunContinuation();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,11 @@ final Object xfer(Object e, long ns) {
q = p.next;
if (p.isData != haveData && haveData != (m != null)) {
if (p.cmpExItem(m, e) == m) {
// Full fence (StoreLoad) for Dekker with await() which
// writes waiter then reads item. On ARM64, CAS
// (ldaxr/stlxr) + plain load to a different field does
// NOT provide StoreLoad ordering.
VarHandle.fullFence();
Thread w = p.waiter; // matched complementary node
if (p != h && h == cmpExHead(h, (q == null) ? p : q))
h.next = h; // advance head; self-link old
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,11 @@ final Object xferLifo(Object e, long ns) {
else if (p.cmpExItem(m, e) != m)
p = head; // missed; restart
else { // matched complementary node
// Full fence (StoreLoad) for Dekker with await() which
// writes waiter then reads item. On ARM64, CAS
// (ldaxr/stlxr) + plain load to a different field does
// NOT provide StoreLoad ordering.
VarHandle.fullFence();
Thread w = p.waiter;
cmpExHead(p, p.next);
LockSupport.unpark(w);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -782,6 +782,13 @@ final int acquire(Node node, int arg, boolean shared,
Thread.onSpinWait();
} else if (node.status == 0) {
node.status = WAITING; // enable signal and recheck
// Full fence (StoreLoad) to ensure WAITING status is visible
// before re-reading state in tryAcquire/tryAcquireShared
// (Dekker pattern with releaseShared/release which writes
// state then reads node.status in signalNext).
// On ARM64, volatile write (stlr) + volatile read (ldar) to
// different addresses does NOT provide StoreLoad ordering.
U.fullFence();
} else {
spins = postSpins = (byte)((postSpins << 1) | 1);
try {
Expand Down Expand Up @@ -1097,6 +1104,13 @@ public final boolean tryAcquireNanos(int arg, long nanosTimeout)
*/
public final boolean release(int arg) {
if (tryRelease(arg)) {
// Full fence (StoreLoad) to ensure the state update from
// tryRelease is visible before reading node.status in signalNext
// (Dekker pattern: release writes state then reads status,
// acquire writes status then reads state).
// On ARM64, CAS (stlxr/release) + ldar to different addresses
// does NOT provide StoreLoad ordering.
U.fullFence();
signalNext(head);
return true;
}
Expand Down Expand Up @@ -1184,6 +1198,8 @@ public final boolean tryAcquireSharedNanos(int arg, long nanosTimeout)
*/
public final boolean releaseShared(int arg) {
if (tryReleaseShared(arg)) {
// Full fence (StoreLoad) — see comment in release()
U.fullFence();
signalNext(head);
return true;
}
Expand Down
Loading