Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions make/autoconf/flags-cflags.m4
Original file line number Diff line number Diff line change
Expand Up @@ -784,6 +784,14 @@ AC_DEFUN([FLAGS_SETUP_CFLAGS_CPU_DEP],
elif test "x$TOOLCHAIN_TYPE" = xmicrosoft; then
if test "x$FLAGS_CPU" = xx86; then
$1_CFLAGS_CPU_JVM="-arch:IA32"
elif test "x$FLAGS_CPU" = xaarch64; then
# MSVC defaults to /volatile:iso on ARM64, which makes volatile reads/writes
# plain LDR/STR with no acquire/release barriers. HotSpot's C++ runtime code
# was written assuming volatile provides acquire/release semantics (as on x86
# and GCC/Clang AArch64). Use /volatile:ms to restore those semantics and
# prevent memory ordering bugs in ObjectMonitor, ParkEvent, and other
# lock-free algorithms that use plain volatile fields.
$1_CFLAGS_CPU_JVM="-volatile:ms"
elif test "x$OPENJDK_TARGET_CPU" = xx86_64; then
if test "x$DEBUG_LEVEL" != xrelease; then
# NOTE: This is probably redundant; -homeparams is default on
Expand Down
197 changes: 197 additions & 0 deletions src/hotspot/os_cpu/windows_aarch64/atomicAccess_windows_aarch64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,4 +114,201 @@ DEFINE_INTRINSIC_CMPXCHG(InterlockedCompareExchange64, __int64)

#undef DEFINE_INTRINSIC_CMPXCHG

// Override PlatformLoad and PlatformStore to use LDAR/STLR on Windows AArch64.
//
// The generic PlatformLoad and PlatformStore use plain volatile dereferences.
// With /volatile:ms (set in flags-cflags.m4 for AArch64), MSVC already compiles
// those to LDAR/STLR, so these overrides produce identical codegen. They are
// retained as defense-in-depth: they guarantee acquire/release semantics for
// AtomicAccess::load()/AtomicAccess::store() regardless of the compiler flag setting,
// ensuring correct cross-core visibility for HotSpot's lock-free algorithms
// (ObjectMonitor Dekker protocols, ParkEvent signaling, etc.) even if
// /volatile:ms were ever removed or overridden.

template<>
struct AtomicAccess::PlatformLoad<1> {
  // Acquire load of a 1-byte value via the MSVC __ldar8 intrinsic
  // (a single load-acquire instruction, per the header comment above).
  template<typename T>
  T operator()(T const volatile* dest) const {
    STATIC_ASSERT(sizeof(T) == 1);
    // const_cast only drops constness to match the intrinsic's parameter
    // type; volatility is preserved. PrimitiveConversions::cast converts
    // the raw 8-bit value back to T.
    return PrimitiveConversions::cast<T>(
        __ldar8(reinterpret_cast<unsigned __int8 volatile*>(
            const_cast<T volatile*>(dest))));
  }
};

template<>
struct AtomicAccess::PlatformLoad<2> {
  // Acquire load of a 2-byte value via the MSVC __ldar16 intrinsic.
  template<typename T>
  T operator()(T const volatile* dest) const {
    STATIC_ASSERT(sizeof(T) == 2);
    // const_cast only drops constness for the intrinsic; volatility is kept.
    return PrimitiveConversions::cast<T>(
        __ldar16(reinterpret_cast<unsigned __int16 volatile*>(
            const_cast<T volatile*>(dest))));
  }
};

template<>
struct AtomicAccess::PlatformLoad<4> {
  // Acquire load of a 4-byte value via the MSVC __ldar32 intrinsic.
  template<typename T>
  T operator()(T const volatile* dest) const {
    STATIC_ASSERT(sizeof(T) == 4);
    // const_cast only drops constness for the intrinsic; volatility is kept.
    return PrimitiveConversions::cast<T>(
        __ldar32(reinterpret_cast<unsigned __int32 volatile*>(
            const_cast<T volatile*>(dest))));
  }
};

template<>
struct AtomicAccess::PlatformLoad<8> {
  // Acquire load of an 8-byte value via the MSVC __ldar64 intrinsic.
  template<typename T>
  T operator()(T const volatile* dest) const {
    STATIC_ASSERT(sizeof(T) == 8);
    // const_cast only drops constness for the intrinsic; volatility is kept.
    return PrimitiveConversions::cast<T>(
        __ldar64(reinterpret_cast<unsigned __int64 volatile*>(
            const_cast<T volatile*>(dest))));
  }
};

template<>
struct AtomicAccess::PlatformStore<1> {
  // Release store of a 1-byte value via the MSVC __stlr8 intrinsic
  // (a single store-release instruction, per the header comment above).
  template<typename T>
  void operator()(T volatile* dest, T new_value) const {
    STATIC_ASSERT(sizeof(T) == 1);
    // PrimitiveConversions::cast converts T to the raw 8-bit value the
    // intrinsic stores.
    __stlr8(reinterpret_cast<unsigned __int8 volatile*>(dest),
            PrimitiveConversions::cast<unsigned __int8>(new_value));
  }
};

template<>
struct AtomicAccess::PlatformStore<2> {
  // Release store of a 2-byte value via the MSVC __stlr16 intrinsic.
  template<typename T>
  void operator()(T volatile* dest, T new_value) const {
    STATIC_ASSERT(sizeof(T) == 2);
    __stlr16(reinterpret_cast<unsigned __int16 volatile*>(dest),
             PrimitiveConversions::cast<unsigned __int16>(new_value));
  }
};

template<>
struct AtomicAccess::PlatformStore<4> {
  // Release store of a 4-byte value via the MSVC __stlr32 intrinsic.
  template<typename T>
  void operator()(T volatile* dest, T new_value) const {
    STATIC_ASSERT(sizeof(T) == 4);
    __stlr32(reinterpret_cast<unsigned __int32 volatile*>(dest),
             PrimitiveConversions::cast<unsigned __int32>(new_value));
  }
};

template<>
struct AtomicAccess::PlatformStore<8> {
  // Release store of an 8-byte value via the MSVC __stlr64 intrinsic.
  template<typename T>
  void operator()(T volatile* dest, T new_value) const {
    STATIC_ASSERT(sizeof(T) == 8);
    __stlr64(reinterpret_cast<unsigned __int64 volatile*>(dest),
             PrimitiveConversions::cast<unsigned __int64>(new_value));
  }
};

// Specialize PlatformOrderedLoad and PlatformOrderedStore to use MSVC's
// __ldar/__stlr intrinsics, matching the Linux AArch64 implementation which
// uses __atomic_load/__atomic_store with __ATOMIC_ACQUIRE/__ATOMIC_RELEASE.
// These emit single LDAR/STLR instructions that have acquire/release semantics
// baked in, rather than the generic fallback of separate dmb + plain load/store.
// On AArch64, LDAR/STLR additionally order with each other: an STLR followed
// by an LDAR is not reordered (the RCsc property of these instructions), a
// guarantee that dmb-bracketed plain ldr/str acquire/release sequences do not
// give for store-release/load-acquire interactions (Dekker patterns, etc.).

template<>
struct AtomicAccess::PlatformOrderedLoad<1, X_ACQUIRE> {
  // load_acquire for a 1-byte value: MSVC __ldar8 intrinsic, same codegen
  // as the PlatformLoad<1> override above.
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 1);
    // const_cast only drops constness for the intrinsic; volatility is kept.
    return PrimitiveConversions::cast<T>(
        __ldar8(reinterpret_cast<unsigned __int8 volatile*>(
            const_cast<T volatile*>(p))));
  }
};

template<>
struct AtomicAccess::PlatformOrderedLoad<2, X_ACQUIRE> {
  // load_acquire for a 2-byte value via the MSVC __ldar16 intrinsic.
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 2);
    return PrimitiveConversions::cast<T>(
        __ldar16(reinterpret_cast<unsigned __int16 volatile*>(
            const_cast<T volatile*>(p))));
  }
};

template<>
struct AtomicAccess::PlatformOrderedLoad<4, X_ACQUIRE> {
  // load_acquire for a 4-byte value via the MSVC __ldar32 intrinsic.
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 4);
    return PrimitiveConversions::cast<T>(
        __ldar32(reinterpret_cast<unsigned __int32 volatile*>(
            const_cast<T volatile*>(p))));
  }
};

template<>
struct AtomicAccess::PlatformOrderedLoad<8, X_ACQUIRE> {
  // load_acquire for an 8-byte value via the MSVC __ldar64 intrinsic.
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 8);
    return PrimitiveConversions::cast<T>(
        __ldar64(reinterpret_cast<unsigned __int64 volatile*>(
            const_cast<T volatile*>(p))));
  }
};

template<>
struct AtomicAccess::PlatformOrderedStore<1, RELEASE_X> {
  // release_store for a 1-byte value: MSVC __stlr8 intrinsic, same codegen
  // as the PlatformStore<1> override above.
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 1);
    __stlr8(reinterpret_cast<unsigned __int8 volatile*>(p),
            PrimitiveConversions::cast<unsigned __int8>(v));
  }
};

template<>
struct AtomicAccess::PlatformOrderedStore<2, RELEASE_X> {
  // release_store for a 2-byte value via the MSVC __stlr16 intrinsic.
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 2);
    __stlr16(reinterpret_cast<unsigned __int16 volatile*>(p),
             PrimitiveConversions::cast<unsigned __int16>(v));
  }
};

template<>
struct AtomicAccess::PlatformOrderedStore<4, RELEASE_X> {
  // release_store for a 4-byte value via the MSVC __stlr32 intrinsic.
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 4);
    __stlr32(reinterpret_cast<unsigned __int32 volatile*>(p),
             PrimitiveConversions::cast<unsigned __int32>(v));
  }
};

template<>
struct AtomicAccess::PlatformOrderedStore<8, RELEASE_X> {
  // release_store for an 8-byte value via the MSVC __stlr64 intrinsic.
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 8);
    __stlr64(reinterpret_cast<unsigned __int64 volatile*>(p),
             PrimitiveConversions::cast<unsigned __int64>(v));
  }
};

// release_store + fence combination, matching Linux AArch64
template<size_t byte_size>
struct AtomicAccess::PlatformOrderedStore<byte_size, RELEASE_X_FENCE> {
  // release_store_fence: delegates to the size-specialized release_store
  // (store-release intrinsic above) and then issues a full fence so the
  // store is ordered before any subsequent load (StoreLoad).
  template <typename T>
  void operator()(volatile T* p, T v) const {
    AtomicAccess::release_store(p, v);
    OrderAccess::fence();
  }
};

#endif // OS_CPU_WINDOWS_AARCH64_ATOMICACCESS_WINDOWS_AARCH64_HPP
17 changes: 12 additions & 5 deletions src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,22 +26,29 @@
#define OS_CPU_WINDOWS_AARCH64_ORDERACCESS_WINDOWS_AARCH64_HPP

// Included in orderAccess.hpp header file.
#include <atomic>
using std::atomic_thread_fence;
#include <arm64intr.h>
#include "vm_version_aarch64.hpp"
#include "runtime/vm_version.hpp"

// Implementation of class OrderAccess.
//
// Use the MSVC __dmb() intrinsic directly rather than C++ std::atomic_thread_fence().
// Microsoft documents that __dmb() "inserts compiler blocks to prevent instruction
// reordering" in addition to emitting the hardware DMB instruction. This is critical
// because HotSpot uses volatile (non-std::atomic) fields throughout the runtime, and
// std::atomic_thread_fence() is only defined by the C++ standard to order std::atomic
// operations — it may not act as a compiler barrier for volatile/non-atomic accesses
// on ARM64 with /volatile:iso. Using __dmb() ensures correct ordering for the Dekker
// protocol in ObjectMonitor::exit() and similar patterns throughout HotSpot.

inline void OrderAccess::loadload() { acquire(); }
inline void OrderAccess::storestore() { release(); }
inline void OrderAccess::loadstore() { acquire(); }
inline void OrderAccess::storeload() { fence(); }

#define READ_MEM_BARRIER atomic_thread_fence(std::memory_order_acquire);
#define WRITE_MEM_BARRIER atomic_thread_fence(std::memory_order_release);
#define FULL_MEM_BARRIER atomic_thread_fence(std::memory_order_seq_cst);
#define READ_MEM_BARRIER __dmb(_ARM64_BARRIER_ISHLD)
#define WRITE_MEM_BARRIER __dmb(_ARM64_BARRIER_ISH)
#define FULL_MEM_BARRIER __dmb(_ARM64_BARRIER_ISH)

inline void OrderAccess::acquire() {
READ_MEM_BARRIER;
Expand Down
31 changes: 31 additions & 0 deletions src/java.base/share/classes/java/lang/VirtualThread.java
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,13 @@ private void afterYield() {
setState(newState = TIMED_PARKED);
}

// Full fence (StoreLoad) to ensure the PARKED/TIMED_PARKED state
// is visible before reading parkPermit (Dekker pattern with
// unpark which writes parkPermit then reads state).
// Note: storeFence is insufficient — on ARM64 it only emits
// LoadStore+StoreStore (dmb ishst), not StoreLoad (dmb ish).
U.fullFence();

// may have been unparked while parking
if (parkPermit && compareAndSetState(newState, UNPARKED)) {
// lazy submit if local queue is empty
Expand All @@ -604,6 +611,10 @@ private void afterYield() {
if (s == BLOCKING) {
setState(BLOCKED);

// Full fence (StoreLoad) for Dekker pattern with unblock
// which writes blockPermit then reads state.
U.fullFence();

// may have been unblocked while blocking
if (blockPermit && compareAndSetState(BLOCKED, UNBLOCKED)) {
// lazy submit if local queue is empty
Expand All @@ -619,6 +630,9 @@ private void afterYield() {
boolean interruptible = interruptibleWait;
if (s == WAITING) {
setState(newState = WAIT);
// Full fence (StoreLoad) for Dekker pattern with notify
// which writes notified then reads state.
U.fullFence();
// may have been notified while in transition
blocked = notified && compareAndSetState(WAIT, BLOCKED);
} else {
Expand All @@ -635,6 +649,9 @@ private void afterYield() {
byte seqNo = ++timedWaitSeqNo;
timeoutTask = schedule(() -> waitTimeoutExpired(seqNo), timeout, MILLISECONDS);
setState(newState = TIMED_WAIT);
// Full fence (StoreLoad) for Dekker pattern with notify
// which writes notified then reads state.
U.fullFence();
// May have been notified while in transition. This must be done while
// holding the monitor to avoid changing the state of a new timed wait call.
blocked = notified && compareAndSetState(TIMED_WAIT, BLOCKED);
Expand Down Expand Up @@ -675,6 +692,15 @@ private void afterDone(boolean notifyContainer) {
assert carrierThread == null;
setState(TERMINATED);

// Full fence (StoreLoad) to ensure the TERMINATED state is
// visible before reading notifyAllAfterTerminate (Dekker pattern
// with beforeJoin which writes notifyAllAfterTerminate then
// reads state). Without this, on ARM64 the volatile write of
// state and the subsequent volatile read can be reordered,
// causing a missed-wakeup where both sides miss each other's
// store.
U.fullFence();

// notifyAll to wakeup any threads waiting for this thread to terminate
if (notifyAllAfterTerminate) {
synchronized (this) {
Expand Down Expand Up @@ -870,6 +896,10 @@ private void parkOnCarrierThread(boolean timed, long nanos) {
*/
private void unpark(boolean lazySubmit) {
if (!getAndSetParkPermit(true) && currentThread() != this) {
// Full fence (StoreLoad) to ensure parkPermit=true is visible
// before reading state (Dekker pattern with afterYield PARKING
// path which writes state then reads parkPermit).
U.fullFence();
int s = state();

// unparked while parked
Expand Down Expand Up @@ -912,6 +942,7 @@ void unpark() {
private void unblock() {
assert !Thread.currentThread().isVirtual();
blockPermit = true;
U.fullFence(); // Full fence (StoreLoad) for Dekker with afterYield BLOCKING path
if (state() == BLOCKED && compareAndSetState(BLOCKED, UNBLOCKED)) {
submitRunContinuation();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,11 @@ final Object xfer(Object e, long ns) {
q = p.next;
if (p.isData != haveData && haveData != (m != null)) {
if (p.cmpExItem(m, e) == m) {
// Full fence (StoreLoad) for Dekker with await() which
// writes waiter then reads item. On ARM64, CAS
// (ldaxr/stlxr) + plain load to a different field does
// NOT provide StoreLoad ordering.
VarHandle.fullFence();
Thread w = p.waiter; // matched complementary node
if (p != h && h == cmpExHead(h, (q == null) ? p : q))
h.next = h; // advance head; self-link old
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,11 @@ final Object xferLifo(Object e, long ns) {
else if (p.cmpExItem(m, e) != m)
p = head; // missed; restart
else { // matched complementary node
// Full fence (StoreLoad) for Dekker with await() which
// writes waiter then reads item. On ARM64, CAS
// (ldaxr/stlxr) + plain load to a different field does
// NOT provide StoreLoad ordering.
VarHandle.fullFence();
Thread w = p.waiter;
cmpExHead(p, p.next);
LockSupport.unpark(w);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -782,6 +782,13 @@ final int acquire(Node node, int arg, boolean shared,
Thread.onSpinWait();
} else if (node.status == 0) {
node.status = WAITING; // enable signal and recheck
// Full fence (StoreLoad) to ensure WAITING status is visible
// before re-reading state in tryAcquire/tryAcquireShared
// (Dekker pattern with releaseShared/release which writes
// state then reads node.status in signalNext).
// On ARM64, volatile write (stlr) + volatile read (ldar) to
// different addresses does NOT provide StoreLoad ordering.
U.fullFence();
} else {
spins = postSpins = (byte)((postSpins << 1) | 1);
try {
Expand Down Expand Up @@ -1097,6 +1104,13 @@ public final boolean tryAcquireNanos(int arg, long nanosTimeout)
*/
public final boolean release(int arg) {
if (tryRelease(arg)) {
// Full fence (StoreLoad) to ensure the state update from
// tryRelease is visible before reading node.status in signalNext
// (Dekker pattern: release writes state then reads status,
// acquire writes status then reads state).
// On ARM64, CAS (stlxr/release) + ldar to different addresses
// does NOT provide StoreLoad ordering.
U.fullFence();
signalNext(head);
return true;
}
Expand Down Expand Up @@ -1184,6 +1198,8 @@ public final boolean tryAcquireSharedNanos(int arg, long nanosTimeout)
*/
public final boolean releaseShared(int arg) {
if (tryReleaseShared(arg)) {
// Full fence (StoreLoad) — see comment in release()
U.fullFence();
signalNext(head);
return true;
}
Expand Down
Loading