// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

namespace System.Threading
{
    /// <summary>
    /// Exponential backoff used in rare cases when we need to change state atomically and cannot
    /// make progress due to concurrent state changes by other threads.
    /// While we cannot know the ideal amount of wait needed before making a successful attempt,
    /// the exponential backoff will generally be not more than 2X worse than the perfect guess and
    /// will do a lot fewer attempts than a simple retry. On a multiprocessor machine fruitless attempts
    /// will cause unnecessary sharing of the contended state which may make modifying the state more expensive.
    /// To protect against degenerate cases we cap the per-iteration wait to a few thousand spinwaits.
    /// </summary>
    internal static class Backoff
    {
        // Caps the exponent so a single iteration never spins more than roughly 2^14 (16K) spinwaits.
        private const uint MaxExponentialBackoffBits = 14;

        /// <summary>
        /// Spin-waits for a pseudorandom number of spins that roughly grows as 2^<paramref name="attempt"/>.
        /// </summary>
        /// <param name="attempt">Zero-based retry count; clamped to <see cref="MaxExponentialBackoffBits"/>.</param>
        internal static unsafe void Exponential(uint attempt)
        {
            attempt = Math.Min(attempt, MaxExponentialBackoffBits);

            // No need for much randomness here; randomness is "good to have", we could do without it,
            // so we just cheaply hash in the stack location of the argument (Fibonacci hashing).
            uint rand = (uint)&attempt * 2654435769u;

            // Set the topmost bit to ensure the minimum number of spins is exponentially increasing;
            // it basically guarantees that we spin at least 0, 1, 2, 4, 8, 16 times, and so on.
            rand |= (1u << 31);

            // BUGFIX: the original `rand >> (32 - attempt)` is wrong for attempt == 0 because C#
            // masks uint shift counts to 5 bits (32 & 31 == 0), so the "shift by 32" became a
            // shift by 0 and produced ~2^31..2^32 spins on the very first backoff instead of 0.
            // Shifting in two steps is identical for attempt >= 1 and correctly yields 0 for attempt == 0.
            uint spins = (rand >> 1) >> (int)(31 - attempt);

            Thread.SpinWait((int)spins);
        }
    }
}
maximumSignalCount; _spinCount = spinCount; _onWait = onWait; @@ -37,7 +33,7 @@ public LowLevelLifoSemaphore(int initialSignalCount, int maximumSignalCount, int Create(maximumSignalCount); } - public bool Wait(int timeoutMs, bool spinWait) + public bool Wait(int timeoutMs) { Debug.Assert(timeoutMs >= -1); @@ -45,86 +41,59 @@ public bool Wait(int timeoutMs, bool spinWait) Thread.AssureBlockingPossible(); #endif - int spinCount = spinWait ? _spinCount : 0; - - // Try to acquire the semaphore or - // a) register as a spinner if spinCount > 0 and timeoutMs > 0 - // b) register as a waiter if there's already too many spinners or spinCount == 0 and timeoutMs > 0 - // c) bail out if timeoutMs == 0 and return false + // Try one-shot acquire first Counts counts = _separated._counts; - while (true) + if (counts.SignalCount != 0) { - Debug.Assert(counts.SignalCount <= _maximumSignalCount); Counts newCounts = counts; - if (counts.SignalCount != 0) - { - newCounts.DecrementSignalCount(); - } - else if (timeoutMs != 0) - { - if (spinCount > 0 && newCounts.SpinnerCount < byte.MaxValue) - { - newCounts.IncrementSpinnerCount(); - } - else - { - // Maximum number of spinners reached, register as a waiter instead - newCounts.IncrementWaiterCount(); - } - } - + newCounts.DecrementSignalCount(); Counts countsBeforeUpdate = _separated._counts.InterlockedCompareExchange(newCounts, counts); if (countsBeforeUpdate == counts) { - if (counts.SignalCount != 0) - { - return true; - } - if (newCounts.WaiterCount != counts.WaiterCount) - { - return WaitForSignal(timeoutMs); - } - if (timeoutMs == 0) - { - return false; - } - break; + // we've consumed a signal + return true; } - - counts = countsBeforeUpdate; } - bool isSingleProcessor = Environment.IsSingleProcessor; - int spinIndex = isSingleProcessor ? SpinSleep0Threshold : 0; - while (spinIndex < spinCount) + return WaitSlow(timeoutMs); + } + + private bool WaitSlow(int timeoutMs) + { + // Now spin briefly with exponential backoff. 
+ // We use random exponential backoff because: + // - we do not know how soon a signal appears, but with exponential backoff we will not be more than 2x off the ideal guess + // - it gives mild preference to the most recent spinners. We want LIFO here so that hot(er) threads keep running. + // - it is possible that spinning workers prevent non-pool threads from submitting more work to the pool, + // so we want some workers to sleep earlier than others. + uint spinCount = Environment.IsSingleProcessor ? 0 : _spinCount; + for (uint iteration = 0; iteration < spinCount; iteration++) { - LowLevelSpinWaiter.Wait(spinIndex, SpinSleep0Threshold, isSingleProcessor); - spinIndex++; + Backoff.Exponential(iteration); - // Try to acquire the semaphore and unregister as a spinner - counts = _separated._counts; - while (counts.SignalCount > 0) + Counts counts = _separated._counts; + if (counts.SignalCount != 0) { Counts newCounts = counts; newCounts.DecrementSignalCount(); - newCounts.DecrementSpinnerCount(); - Counts countsBeforeUpdate = _separated._counts.InterlockedCompareExchange(newCounts, counts); if (countsBeforeUpdate == counts) { + // we've consumed a signal return true; } - - counts = countsBeforeUpdate; } } - // Unregister as spinner, and acquire the semaphore or register as a waiter - counts = _separated._counts; + // Now we will try registering as a waiter and wait. + // If signaled before that, we have to acquire as this can be the last thread that could take that signal. + // The difference with spinning above is that we are not waiting for a signal. We should immediately succeed + // unless a lot of threads are trying to update the counts. Thus we use a different attempt counter. 
+ uint collisionCount = 0; while (true) { + Counts counts = _separated._counts; Counts newCounts = counts; - newCounts.DecrementSpinnerCount(); if (counts.SignalCount != 0) { newCounts.DecrementSignalCount(); @@ -140,7 +109,7 @@ public bool Wait(int timeoutMs, bool spinWait) return counts.SignalCount != 0 || WaitForSignal(timeoutMs); } - counts = countsBeforeUpdate; + Backoff.Exponential(collisionCount++); } } @@ -162,130 +131,113 @@ private bool WaitForSignal(int timeoutMs) } int endWaitTicks = timeoutMs != -1 ? Environment.TickCount : 0; - // Unregister the waiter if this thread will not be waiting anymore, and try to acquire the semaphore - Counts counts = _separated._counts; + uint collisionCount = 0; while (true) { - Debug.Assert(counts.WaiterCount != 0); + Counts counts = _separated._counts; Counts newCounts = counts; - if (counts.SignalCount != 0) + + Debug.Assert(counts.WaiterCount != 0); + Debug.Assert(counts.CountOfWaitersSignaledToWake != 0); + + newCounts.DecrementCountOfWaitersSignaledToWake(); + if (newCounts.SignalCount != 0) { newCounts.DecrementSignalCount(); newCounts.DecrementWaiterCount(); } - // This waiter has woken up and this needs to be reflected in the count of waiters signaled to wake - if (counts.CountOfWaitersSignaledToWake != 0) - { - newCounts.DecrementCountOfWaitersSignaledToWake(); - } - Counts countsBeforeUpdate = _separated._counts.InterlockedCompareExchange(newCounts, counts); if (countsBeforeUpdate == counts) { if (counts.SignalCount != 0) { + // success return true; } + + // we've consumed a wake, but there was no signal, we will wait again. break; } - counts = countsBeforeUpdate; - if (timeoutMs != -1) { - int waitMs = endWaitTicks - startWaitTicks; - if (waitMs >= 0 && waitMs < timeoutMs) - timeoutMs -= waitMs; - else - timeoutMs = 0; - } + // collision, try again. 
+ Backoff.Exponential(collisionCount++); + } + + // we will wait again, reduce timeout + if (timeoutMs != -1) + { + int waitMs = endWaitTicks - startWaitTicks; + if (waitMs >= 0 && waitMs < timeoutMs) + timeoutMs -= waitMs; + else + timeoutMs = 0; } } } - public void Release(int releaseCount) + public void Signal() { - Debug.Assert(releaseCount > 0); - Debug.Assert(releaseCount <= _maximumSignalCount); + // Increment signal count. This enables one-shot acquire. + Counts counts = _separated._counts.InterlockedIncrementSignalCount(); - int countOfWaitersToWake; - Counts counts = _separated._counts; + // Now check if waiters need to be woken + uint collisionCount = 0; while (true) { - Counts newCounts = counts; - - // Increase the signal count. The addition doesn't overflow because of the limit on the max signal count in constructor. - newCounts.AddSignalCount((uint)releaseCount); - - // Determine how many waiters to wake, taking into account how many spinners and waiters there are and how many waiters - // have previously been signaled to wake but have not yet woken - countOfWaitersToWake = - (int)Math.Min(newCounts.SignalCount, (uint)counts.WaiterCount + counts.SpinnerCount) - - counts.SpinnerCount - - counts.CountOfWaitersSignaledToWake; - if (countOfWaitersToWake > 0) + // Determine how many waiters to wake. + // The number of wakes should not be more than the signal count, not more than waiter count and discount any pending wakes. + int countOfWaitersToWake = (int)Math.Min(counts.SignalCount, counts.WaiterCount) - counts.CountOfWaitersSignaledToWake; + if (countOfWaitersToWake <= 0) { - // Ideally, limiting to a maximum of releaseCount would not be necessary and could be an assert instead, but since - // WaitForSignal() does not have enough information to tell whether a woken thread was signaled, and due to the cap - // below, it's possible for countOfWaitersSignaledToWake to be less than the number of threads that have actually - // been signaled to wake. 
- if (countOfWaitersToWake > releaseCount) - { - countOfWaitersToWake = releaseCount; - } - - // Cap countOfWaitersSignaledToWake to its max value. It's ok to ignore some woken threads in this count, it just - // means some more threads will be woken next time. Typically, it won't reach the max anyway. - newCounts.AddUpToMaxCountOfWaitersSignaledToWake((uint)countOfWaitersToWake); + // No waiters to wake. This is the most common case. + return; } + Counts newCounts = counts; + newCounts.AddCountOfWaitersSignaledToWake((uint)countOfWaitersToWake); Counts countsBeforeUpdate = _separated._counts.InterlockedCompareExchange(newCounts, counts); if (countsBeforeUpdate == counts) { - Debug.Assert(releaseCount <= _maximumSignalCount - counts.SignalCount); + Debug.Assert(_maximumSignalCount - counts.SignalCount >= 1); if (countOfWaitersToWake > 0) ReleaseCore(countOfWaitersToWake); return; } - counts = countsBeforeUpdate; + // collision, try again. + Backoff.Exponential(collisionCount++); + + counts = _separated._counts; } } private struct Counts : IEquatable { private const byte SignalCountShift = 0; - private const byte WaiterCountShift = 32; - private const byte SpinnerCountShift = 48; - private const byte CountOfWaitersSignaledToWakeShift = 56; + private const byte WaiterCountShift = 16; + private const byte CountOfWaitersSignaledToWakeShift = 32; private ulong _data; private Counts(ulong data) => _data = data; - private uint GetUInt32Value(byte shift) => (uint)(_data >> shift); - private void SetUInt32Value(uint value, byte shift) => - _data = (_data & ~((ulong)uint.MaxValue << shift)) | ((ulong)value << shift); private ushort GetUInt16Value(byte shift) => (ushort)(_data >> shift); private void SetUInt16Value(ushort value, byte shift) => _data = (_data & ~((ulong)ushort.MaxValue << shift)) | ((ulong)value << shift); - private byte GetByteValue(byte shift) => (byte)(_data >> shift); - private void SetByteValue(byte value, byte shift) => - _data = (_data & 
~((ulong)byte.MaxValue << shift)) | ((ulong)value << shift); - public uint SignalCount + public ushort SignalCount { - get => GetUInt32Value(SignalCountShift); - set => SetUInt32Value(value, SignalCountShift); + get => GetUInt16Value(SignalCountShift); } - public void AddSignalCount(uint value) + public Counts InterlockedIncrementSignalCount() { - Debug.Assert(value <= uint.MaxValue - SignalCount); - _data += (ulong)value << SignalCountShift; + var countsAfterUpdate = new Counts(Interlocked.Add(ref _data, 1ul << SignalCountShift)); + Debug.Assert(countsAfterUpdate.SignalCount != ushort.MaxValue); // overflow check + return countsAfterUpdate; } - public void IncrementSignalCount() => AddSignalCount(1); - public void DecrementSignalCount() { Debug.Assert(SignalCount != 0); @@ -295,19 +247,18 @@ public void DecrementSignalCount() public ushort WaiterCount { get => GetUInt16Value(WaiterCountShift); - set => SetUInt16Value(value, WaiterCountShift); } - public void IncrementWaiterCount() + public void DecrementWaiterCount() { - Debug.Assert(WaiterCount < ushort.MaxValue); - _data += (ulong)1 << WaiterCountShift; + Debug.Assert(WaiterCount != 0); + _data -= (ulong)1 << WaiterCountShift; } - public void DecrementWaiterCount() + public void IncrementWaiterCount() { + _data += (ulong)1 << WaiterCountShift; Debug.Assert(WaiterCount != 0); - _data -= (ulong)1 << WaiterCountShift; } public void InterlockedDecrementWaiterCount() @@ -316,38 +267,16 @@ public void InterlockedDecrementWaiterCount() Debug.Assert(countsAfterUpdate.WaiterCount != ushort.MaxValue); // underflow check } - public byte SpinnerCount + public ushort CountOfWaitersSignaledToWake { - get => GetByteValue(SpinnerCountShift); - set => SetByteValue(value, SpinnerCountShift); + get => GetUInt16Value(CountOfWaitersSignaledToWakeShift); } - public void IncrementSpinnerCount() + public void AddCountOfWaitersSignaledToWake(uint value) { - Debug.Assert(SpinnerCount < byte.MaxValue); - _data += (ulong)1 << 
SpinnerCountShift; - } - - public void DecrementSpinnerCount() - { - Debug.Assert(SpinnerCount != 0); - _data -= (ulong)1 << SpinnerCountShift; - } - - public byte CountOfWaitersSignaledToWake - { - get => GetByteValue(CountOfWaitersSignaledToWakeShift); - set => SetByteValue(value, CountOfWaitersSignaledToWakeShift); - } - - public void AddUpToMaxCountOfWaitersSignaledToWake(uint value) - { - uint availableCount = (uint)(byte.MaxValue - CountOfWaitersSignaledToWake); - if (value > availableCount) - { - value = availableCount; - } _data += (ulong)value << CountOfWaitersSignaledToWakeShift; + var countsAfterUpdate = new Counts(_data); + Debug.Assert(countsAfterUpdate.CountOfWaitersSignaledToWake != ushort.MaxValue); // overflow check } public void DecrementCountOfWaitersSignaledToWake() diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.Blocking.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.Blocking.cs index 73fe2afe9a4d0e..bd6773321374b9 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.Blocking.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.Blocking.cs @@ -230,7 +230,8 @@ private uint PerformBlockingAdjustment(bool previousDelayElapsed, out bool addWo HillClimbing.ThreadPoolHillClimber.ForceChange( newNumThreadsGoal, HillClimbing.StateOrTransition.CooperativeBlocking); - if (counts.NumProcessingWork >= numThreadsGoal && _separated.numRequestedWorkers > 0) + + if (counts.NumProcessingWork >= numThreadsGoal && _separated._hasOutstandingThreadRequest != 0) { addWorker = true; } diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.GateThread.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.GateThread.cs index a0434cdfa9abb3..e545b660908538 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.GateThread.cs +++ 
b/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.GateThread.cs @@ -132,7 +132,7 @@ private static void GateThreadStart() if (!disableStarvationDetection && threadPoolInstance._pendingBlockingAdjustment == PendingBlockingAdjustment.None && - threadPoolInstance._separated.numRequestedWorkers > 0 && + threadPoolInstance._separated._hasOutstandingThreadRequest != 0 && SufficientDelaySinceLastDequeue(threadPoolInstance)) { bool addWorker = false; @@ -187,7 +187,7 @@ private static void GateThreadStart() } } - if (threadPoolInstance._separated.numRequestedWorkers <= 0 && + if (threadPoolInstance._separated._hasOutstandingThreadRequest == 0 && threadPoolInstance._pendingBlockingAdjustment == PendingBlockingAdjustment.None && Interlocked.Decrement(ref threadPoolInstance._separated.gateThreadRunningState) <= GetRunningStateForNumRuns(0)) { @@ -208,7 +208,7 @@ public static void Wake(PortableThreadPool threadPoolInstance) // in deciding "too long" private static bool SufficientDelaySinceLastDequeue(PortableThreadPool threadPoolInstance) { - uint delay = (uint)(Environment.TickCount - threadPoolInstance._separated.lastDequeueTime); + uint delay = (uint)(Environment.TickCount - threadPoolInstance._separated.lastDispatchTime); uint minimumDelay; if (threadPoolInstance._cpuUtilization < CpuUtilizationLow) { diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.ThreadCounts.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.ThreadCounts.cs index 26b6ab0cf0ac48..3d3ed4123ae25e 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.ThreadCounts.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.ThreadCounts.cs @@ -44,6 +44,65 @@ public short NumProcessingWork } } + // Returns "true" if adding NumProcessingWork has reached the limit. 
+ // NOTE: it is possible to have overflow and NumProcessingWork under the limit + // at the same time if the limit has been changed afterwards. That is ok. + // While changes in NumProcessingWork need to be matched with semaphore Wait/Signal, + // the redundantly set overflow is mostly harmless and should self-correct when + // a worker that sees no work calls TryDecrementProcessingWork, possibly at a cost of + // redundant check for work. + public bool IsOverflow + { + get + { + return (long)_data < 0; + } + } + + /// + /// Tries to increase the number of threads processing work items by one. + /// If at or above goal, returns false and sets overflow flag instead. + /// NOTE: only if "true" is returned the NumProcessingWork is incremented. + /// + public bool TryIncrementProcessingWork() + { + Debug.Assert(NumProcessingWork >= 0); + if (NumProcessingWork < NumThreadsGoal) + { + NumProcessingWork++; + // This should never overflow + Debug.Assert(NumProcessingWork > 0); + return true; + } + else + { + _data |= (1ul << 63); + return false; + } + } + + /// + /// Tries to reduce the number of threads processing work items by one. + /// If in an overflow state, clears the overflow flag and returns false. + /// NOTE: only if "true" is returned the NumProcessingWork is decremented. + /// + public bool TryDecrementProcessingWork() + { + Debug.Assert(NumProcessingWork > 0); + if (IsOverflow) + { + _data &= ~(1ul << 63); + return false; + } + else + { + NumProcessingWork--; + // This should never underflow + Debug.Assert(NumProcessingWork >= 0); + return true; + } + } + /// /// Number of thread pool threads that currently exist. 
/// diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.WorkerThread.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.WorkerThread.cs index a34e0f8ff98c4e..4380722d78f32d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.WorkerThread.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.WorkerThread.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Diagnostics; using System.Diagnostics.Tracing; using System.Runtime.CompilerServices; @@ -15,7 +16,11 @@ private static partial class WorkerThread { private static readonly short ThreadsToKeepAlive = DetermineThreadsToKeepAlive(); - private const int SemaphoreSpinCountDefault = 70; + // Spinning in the threadpool semaphore is not always useful. + // For example the new workitems may be produced by non-pool threads and could only arrive if pool threads start blocking. + // We will limit spinning to roughly 512-1024 spinwaits, each taking 35-50ns. That should be under 50 usec total. + // For reference the wakeup latency of a futex/event with threads queued up is reported to be in 5-50 usec range. (year 2025) + private const int SemaphoreSpinCountDefault = 9; // This value represents an assumption of how much uncommitted stack space a worker thread may use in the future. 
// Used in calculations to estimate when to throttle the rate of thread injection to reduce the possibility of @@ -42,9 +47,8 @@ private static short DetermineThreadsToKeepAlive() /// private static readonly LowLevelLifoSemaphore s_semaphore = new LowLevelLifoSemaphore( - 0, MaxPossibleThreadCount, - AppContextConfigHelper.GetInt32ComPlusOrDotNetConfig( + (uint)AppContextConfigHelper.GetInt32ComPlusOrDotNetConfig( "System.Threading.ThreadPool.UnfairSemaphoreSpinLimit", "ThreadPool_UnfairSemaphoreSpinLimit", SemaphoreSpinCountDefault, @@ -112,12 +116,13 @@ private static void WorkerThreadStart() while (true) { - bool spinWait = true; - while (semaphore.Wait(timeoutMs, spinWait)) + while (semaphore.Wait(timeoutMs)) { - WorkerDoWork(threadPoolInstance, ref spinWait); + WorkerDoWork(threadPoolInstance); } + // We've timed out waiting on the semaphore. Time to exit. + // In rare cases we may be asked to keep running/waiting. if (ShouldExitWorker(threadPoolInstance, threadAdjustmentLock)) { break; @@ -125,57 +130,33 @@ private static void WorkerThreadStart() } } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void WorkerDoWork(PortableThreadPool threadPoolInstance, ref bool spinWait) + private static void WorkerDoWork(PortableThreadPool threadPoolInstance) { - bool alreadyRemovedWorkingWorker = false; - while (TakeActiveRequest(threadPoolInstance)) + do { - threadPoolInstance._separated.lastDequeueTime = Environment.TickCount; - if (!ThreadPoolWorkQueue.Dispatch()) + // We generally avoid spurious wakes as they are wasteful, so we nearly always should see a request. + // However, we allow external wakes when thread goals change, which can result in "stolen" requests, + // thus sometimes there is no active request and we need to check. 
+ if (threadPoolInstance._separated._hasOutstandingThreadRequest != 0 && + Interlocked.Exchange(ref threadPoolInstance._separated._hasOutstandingThreadRequest, 0) != 0) { - // ShouldStopProcessingWorkNow() caused the thread to stop processing work, and it would have - // already removed this working worker in the counts. This typically happens when hill climbing - // decreases the worker thread count goal. - alreadyRemovedWorkingWorker = true; - break; - } - - if (threadPoolInstance._separated.numRequestedWorkers <= 0) - { - break; - } - - // In highly bursty cases with short bursts of work, especially in the portable thread pool - // implementation, worker threads are being released and entering Dispatch very quickly, not finding - // much work in Dispatch, and soon afterwards going back to Dispatch, causing extra thrashing on - // data and some interlocked operations, and similarly when the thread pool runs out of work. Since - // there is a pending request for work, introduce a slight delay before serving the next request. - // The spin-wait is mainly for when the sleep is not effective due to there being no other threads - // to schedule. - Thread.UninterruptibleSleep0(); - if (!Environment.IsSingleProcessor) - { - Thread.SpinWait(1); + // We took the request, now we must Dispatch some work items. + threadPoolInstance.NotifyDispatchProgress(Environment.TickCount); + if (!ThreadPoolWorkQueue.Dispatch()) + { + // We are above goal and would have already removed this working worker in the counts. 
+ return; + } } - } - // Don't spin-wait on the semaphore next time if the thread was actively stopped from processing work, - // as it's unlikely that the worker thread count goal would be increased again so soon afterwards that - // the semaphore would be released within the spin-wait window - spinWait = !alreadyRemovedWorkingWorker; - - if (!alreadyRemovedWorkingWorker) - { - // If we woke up but couldn't find a request, or ran out of work items to process, we need to update - // the number of working workers to reflect that we are done working for now - RemoveWorkingWorker(threadPoolInstance); - } + // We could not find more work in the queue and will try to stop being active. + // One caveat - in overflow state we may have cleared a work request without asking for a worker. + // Thus if there is uncleared overflow, one thread will be back for another round - without consuming a wake. + } while (!TryRemoveWorkingWorker(threadPoolInstance)); } // returns true if the worker is shutting down // returns false if we should do another iteration - [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool ShouldExitWorker(PortableThreadPool threadPoolInstance, LowLevelLock threadAdjustmentLock) { // The thread cannot exit if it has IO pending, otherwise the IO may be canceled @@ -235,80 +216,65 @@ private static bool ShouldExitWorker(PortableThreadPool threadPoolInstance, LowL } /// - /// Reduce the number of working workers by one, but maybe add back a worker (possibily this thread) if a thread request comes in while we are marking this thread as not working. + /// Tries to reduce the number of working workers by one. + /// If we are in a state of overflow, clears the overflow instead and returns false. + /// Returns true if number of active threads was actually reduced. 
/// - private static void RemoveWorkingWorker(PortableThreadPool threadPoolInstance) + private static bool TryRemoveWorkingWorker(PortableThreadPool threadPoolInstance) { - // A compare-exchange loop is used instead of Interlocked.Decrement or Interlocked.Add to defensively prevent - // NumProcessingWork from underflowing. See the setter for NumProcessingWork. - ThreadCounts counts = threadPoolInstance._separated.counts; + uint collisionCount = 0; while (true) { - ThreadCounts newCounts = counts; - newCounts.NumProcessingWork--; - - ThreadCounts countsBeforeUpdate = - threadPoolInstance._separated.counts.InterlockedCompareExchange(newCounts, counts); - if (countsBeforeUpdate == counts) + ThreadCounts oldCounts = threadPoolInstance._separated.counts; + ThreadCounts newCounts = oldCounts; + bool decremented = newCounts.TryDecrementProcessingWork(); + if (threadPoolInstance._separated.counts.InterlockedCompareExchange(newCounts, oldCounts) == oldCounts) { - break; + return decremented; } - counts = countsBeforeUpdate; - } - - // It's possible that we decided we had thread requests just before a request came in, - // but reduced the worker count *after* the request came in. In this case, we might - // miss the notification of a thread request. So we wake up a thread (maybe this one!) - // if there is work to do. - if (threadPoolInstance._separated.numRequestedWorkers > 0) - { - MaybeAddWorkingWorker(threadPoolInstance); + // This can be fairly contentious when threadpool runs out of work and all threads try to leave. + Backoff.Exponential(collisionCount++); } } + /// In a state of overflow does nothing. + /// Otherwise increments the active worker count and signals the semaphore. + /// Incrementing the count turns on the overflow state if the active thread limit is reached. 
internal static void MaybeAddWorkingWorker(PortableThreadPool threadPoolInstance) { - ThreadCounts counts = threadPoolInstance._separated.counts; - short numExistingThreads, numProcessingWork, newNumExistingThreads, newNumProcessingWork; + ThreadCounts oldCounts, newCounts; + bool incremented; + uint collisionCount = 0; while (true) { - numProcessingWork = counts.NumProcessingWork; - if (numProcessingWork >= counts.NumThreadsGoal) - { - return; - } - - newNumProcessingWork = (short)(numProcessingWork + 1); - numExistingThreads = counts.NumExistingThreads; - newNumExistingThreads = Math.Max(numExistingThreads, newNumProcessingWork); - - ThreadCounts newCounts = counts; - newCounts.NumProcessingWork = newNumProcessingWork; - newCounts.NumExistingThreads = newNumExistingThreads; - - ThreadCounts oldCounts = threadPoolInstance._separated.counts.InterlockedCompareExchange(newCounts, counts); - - if (oldCounts == counts) + oldCounts = threadPoolInstance._separated.counts; + newCounts = oldCounts; + incremented = newCounts.TryIncrementProcessingWork(); + newCounts.NumExistingThreads = Math.Max(newCounts.NumProcessingWork, newCounts.NumExistingThreads); + if (threadPoolInstance._separated.counts.InterlockedCompareExchange(newCounts, oldCounts) == oldCounts) { break; } - counts = oldCounts; + // This is less contentious than Remove as reasons to add threads are more complex to avoid adding too many too fast. + // We can still see some amount of failed interlocked operations here when a burst of work is scheduled. 
+ Backoff.Exponential(collisionCount++); } - int toCreate = newNumExistingThreads - numExistingThreads; - int toRelease = newNumProcessingWork - numProcessingWork; - - if (toRelease > 0) + if (!incremented) { - s_semaphore.Release(toRelease); + return; } - while (toCreate > 0) + Debug.Assert(newCounts.NumProcessingWork - oldCounts.NumProcessingWork == 1); + s_semaphore.Signal(); + + int toCreate = newCounts.NumExistingThreads - oldCounts.NumExistingThreads; + Debug.Assert(toCreate == 0 || toCreate == 1); + if (toCreate != 0) { CreateWorkerThread(); - toCreate--; } } @@ -326,10 +292,7 @@ internal static bool ShouldStopProcessingWorkNow(PortableThreadPool threadPoolIn // When there are more threads processing work than the thread count goal, it may have been decided // to decrease the number of threads. Stop processing if the counts can be updated. We may have more // threads existing than the thread count goal and that is ok, the cold ones will eventually time out if - // the thread count goal is not increased again. This logic is a bit different from the original CoreCLR - // code from which this implementation was ported, which turns a processing thread into a retired thread - // and checks for pending requests like RemoveWorkingWorker. In this implementation there are - // no retired threads, so only the count of threads processing work is considered. + // the thread count goal is not increased again. 
if (counts.NumProcessingWork <= counts.NumThreadsGoal) { return false; @@ -347,21 +310,6 @@ internal static bool ShouldStopProcessingWorkNow(PortableThreadPool threadPoolIn counts = oldCounts; } } - - private static bool TakeActiveRequest(PortableThreadPool threadPoolInstance) - { - int count = threadPoolInstance._separated.numRequestedWorkers; - while (count > 0) - { - int prevCount = Interlocked.CompareExchange(ref threadPoolInstance._separated.numRequestedWorkers, count - 1, count); - if (prevCount == count) - { - return true; - } - count = prevCount; - } - return false; - } } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.cs index a2184f544cf1ad..d53f03ce2a81f7 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.cs @@ -87,8 +87,10 @@ private struct CacheLineSeparated [FieldOffset(Internal.PaddingHelpers.CACHE_LINE_SIZE * 1)] public ThreadCounts counts; // SOS's ThreadPool command depends on this name + // Periodically updated heartbeat timestamp to indicate that we are making progress. + // Used in starvation detection. [FieldOffset(Internal.PaddingHelpers.CACHE_LINE_SIZE * 2)] - public int lastDequeueTime; + public int lastDispatchTime; [FieldOffset(Internal.PaddingHelpers.CACHE_LINE_SIZE * 3)] public int priorCompletionCount; @@ -97,8 +99,20 @@ private struct CacheLineSeparated [FieldOffset(Internal.PaddingHelpers.CACHE_LINE_SIZE * 3 + sizeof(int) * 2)] public int nextCompletedWorkRequestsTime; + // This flag is used for communication between item enqueuing and workers that process the items. + // There are two states of this flag: + // 0: has no guarantees + // 1: means a worker will check work queues and ensure that + // any work items inserted in work queue before setting the flag + // are picked up. 
+ // Note: The state must be cleared by the worker thread _before_ + // checking. Otherwise there is a window between finding no work + // and resetting the flag, when the flag is in a wrong state. + // A new work item may be added right before the flag is reset + // without asking for a worker, while the last worker is quitting. [FieldOffset(Internal.PaddingHelpers.CACHE_LINE_SIZE * 4)] - public volatile int numRequestedWorkers; + public int _hasOutstandingThreadRequest; + [FieldOffset(Internal.PaddingHelpers.CACHE_LINE_SIZE * 4 + sizeof(int))] public int gateThreadRunningState; } @@ -209,7 +223,7 @@ public bool SetMinThreads(int workerThreads, int ioCompletionThreads) else if (_separated.counts.NumThreadsGoal < newMinThreads) { _separated.counts.InterlockedSetNumThreadsGoal(newMinThreads); - if (_separated.numRequestedWorkers > 0) + if (_separated._hasOutstandingThreadRequest != 0) { addWorker = true; } @@ -330,26 +344,30 @@ private ThreadInt64PersistentCounter.ThreadLocalNode CreateThreadLocalCompletion return threadLocalCompletionCountNode; } - private void NotifyWorkItemProgress(ThreadInt64PersistentCounter.ThreadLocalNode threadLocalCompletionCountNode, int currentTimeMs) + private static void NotifyWorkItemProgress(ThreadInt64PersistentCounter.ThreadLocalNode threadLocalCompletionCountNode) { threadLocalCompletionCountNode.Increment(); - _separated.lastDequeueTime = currentTimeMs; + } + internal void NotifyWorkItemProgress() + { + NotifyWorkItemProgress(GetOrCreateThreadLocalCompletionCountNode()); + } + + internal bool NotifyWorkItemComplete(ThreadInt64PersistentCounter.ThreadLocalNode threadLocalCompletionCountNode, int currentTimeMs) + { + NotifyWorkItemProgress(threadLocalCompletionCountNode); if (ShouldAdjustMaxWorkersActive(currentTimeMs)) { AdjustMaxWorkersActive(); } - } - internal void NotifyWorkItemProgress() => - NotifyWorkItemProgress(GetOrCreateThreadLocalCompletionCountNode(), Environment.TickCount); + return 
!WorkerThread.ShouldStopProcessingWorkNow(this); + } - internal bool NotifyWorkItemComplete(ThreadInt64PersistentCounter.ThreadLocalNode? threadLocalCompletionCountNode, int currentTimeMs) + internal void NotifyDispatchProgress(int currentTickCount) { - Debug.Assert(threadLocalCompletionCountNode != null); - - NotifyWorkItemProgress(threadLocalCompletionCountNode, currentTimeMs); - return !WorkerThread.ShouldStopProcessingWorkNow(this); + _separated.lastDispatchTime = currentTickCount; } // @@ -459,13 +477,15 @@ private bool ShouldAdjustMaxWorkersActive(int currentTimeMs) return _pendingBlockingAdjustment == PendingBlockingAdjustment.None; } - internal void RequestWorker() + internal void EnsureWorkerRequested() { - // The order of operations here is important. MaybeAddWorkingWorker() and EnsureRunning() use speculative checks to - // do their work and the memory barrier from the interlocked operation is necessary in this case for correctness. - Interlocked.Increment(ref _separated.numRequestedWorkers); - WorkerThread.MaybeAddWorkingWorker(this); - GateThread.EnsureRunning(this); + // Only one worker is requested at a time to mitigate Thundering Herd problem. 
+ if (_separated._hasOutstandingThreadRequest == 0 && + Interlocked.Exchange(ref _separated._hasOutstandingThreadRequest, 1) == 0) + { + WorkerThread.MaybeAddWorkingWorker(this); + GateThread.EnsureRunning(this); + } } private bool OnGen2GCCallback() diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Browser.Threads.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Browser.Threads.cs index 7933e49db422b9..d2515952b4d8e1 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Browser.Threads.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Browser.Threads.cs @@ -8,6 +8,7 @@ public static partial class ThreadPool // Indicates that the threadpool should yield the thread from the dispatch loop to the // runtime periodically. We use this to return back to the JS event loop so that the JS // event queue can be drained - internal static bool YieldFromDispatchLoop => true; +#pragma warning disable IDE0060 // Remove unused parameter + internal static bool YieldFromDispatchLoop(int currentTickCount) => true; +#pragma warning restore IDE0060 } } -} diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Browser.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Browser.cs index 5250e1df5f1ab9..3d1428d0c53b62 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Browser.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Browser.cs @@ -35,7 +35,9 @@ public static partial class ThreadPool { // Indicates whether the thread pool should yield the thread from the dispatch loop to the runtime periodically so that // the runtime may use the thread for processing other work - internal static bool YieldFromDispatchLoop => true; +#pragma warning disable IDE0060 // Remove unused parameter + internal static bool YieldFromDispatchLoop(int currentTickCount) => true; +#pragma 
warning restore IDE0060 private const bool IsWorkerTrackingEnabledInConfig = false; @@ -78,7 +80,7 @@ public static void GetAvailableThreads(out int workerThreads, out int completion public static long CompletedWorkItemCount => 0; [DynamicDependency("BackgroundJobHandler")] // https://github.com/dotnet/runtime/issues/101434 - internal static unsafe void RequestWorkerThread() + internal static unsafe void EnsureWorkerRequested() { if (_callbackQueued) return; diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Unix.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Unix.cs index b100409793ba20..236aa07bc32aae 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Unix.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Unix.cs @@ -19,7 +19,11 @@ public static partial class ThreadPool #if !(TARGET_BROWSER && FEATURE_WASM_MANAGED_THREADS) // Indicates whether the thread pool should yield the thread from the dispatch loop to the runtime periodically so that // the runtime may use the thread for processing other work. - internal static bool YieldFromDispatchLoop => false; + internal static bool YieldFromDispatchLoop(int currentTickCount) + { + PortableThreadPool.ThreadPoolInstance.NotifyDispatchProgress(currentTickCount); + return false; + } #endif internal static ThreadInt64PersistentCounter.ThreadLocalNode GetOrCreateThreadLocalCompletionCountNode() => @@ -67,9 +71,9 @@ internal static bool NotifyWorkItemComplete(ThreadInt64PersistentCounter.ThreadL /// /// This method is called to request a new thread pool worker to handle pending work. 
/// - internal static unsafe void RequestWorkerThread() + internal static unsafe void EnsureWorkerRequested() { - PortableThreadPool.ThreadPoolInstance.RequestWorker(); + PortableThreadPool.ThreadPoolInstance.EnsureWorkerRequested(); } internal static void ReportThreadStatus(bool isWorking) diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Wasi.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Wasi.cs index fa499f0fd857fb..5f983e1d3c4389 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Wasi.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Wasi.cs @@ -35,7 +35,9 @@ public static partial class ThreadPool { // Indicates whether the thread pool should yield the thread from the dispatch loop to the runtime periodically so that // the runtime may use the thread for processing other work - internal static bool YieldFromDispatchLoop => true; +#pragma warning disable IDE0060 // Remove unused parameter + internal static bool YieldFromDispatchLoop(int currentTickCount) => true; +#pragma warning restore IDE0060 private const bool IsWorkerTrackingEnabledInConfig = false; @@ -75,7 +77,7 @@ public static void GetAvailableThreads(out int workerThreads, out int completion public static long CompletedWorkItemCount => 0; - internal static unsafe void RequestWorkerThread() + internal static unsafe void EnsureWorkerRequested() { } diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Windows.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Windows.cs index 0ad70d35a92c32..223cea9c318e4e 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Windows.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPool.Windows.cs @@ -30,11 +30,19 @@ public static partial class ThreadPool // Indicates whether the thread pool should yield the thread from the dispatch loop to the runtime 
 periodically so that // the runtime may use the thread for processing other work. - // - // Windows thread pool threads need to yield back to the thread pool periodically, otherwise those threads may be - // considered to be doing long-running work and change thread pool heuristics, such as slowing or halting thread - // injection. - internal static bool YieldFromDispatchLoop => UseWindowsThreadPool; + internal static bool YieldFromDispatchLoop(int currentTickCount) + { + if (UseWindowsThreadPool) + { + // Windows thread pool threads need to yield back to the thread pool periodically, otherwise those threads may be + // considered to be doing long-running work and change thread pool heuristics, such as slowing or halting thread + // injection. + return true; + } + + PortableThreadPool.ThreadPoolInstance.NotifyDispatchProgress(currentTickCount); + return false; + } [CLSCompliant(false)] [SupportedOSPlatform("windows")] @@ -155,17 +163,24 @@ internal static void NotifyThreadUnblocked() } /// - /// This method is called to request a new thread pool worker to handle pending work. + /// This method is called to notify the thread pool about pending work. + /// It will start with an ordinary read to check if a request is already pending as we + /// optimize for a case when queues already have items and this flag is already set. + /// Make sure that the presence of the item that is being added to the queue is visible + /// before calling this. + /// Typically this is not a problem when enqueuing uses an interlocked update of the queue + /// index to establish the presence of the new item. More care may be needed when an item + /// is inserted via ordinary or volatile writes.
 /// - internal static void RequestWorkerThread() + internal static void EnsureWorkerRequested() { if (ThreadPool.UseWindowsThreadPool) { - WindowsThreadPool.RequestWorkerThread(); + WindowsThreadPool.EnsureWorkerRequested(); } else { - PortableThreadPool.ThreadPoolInstance.RequestWorker(); + PortableThreadPool.ThreadPoolInstance.EnsureWorkerRequested(); } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPoolWorkQueue.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPoolWorkQueue.cs index b3bdc1b80f7077..0480f93dfa35b1 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPoolWorkQueue.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Threading/ThreadPoolWorkQueue.cs @@ -127,7 +127,11 @@ public void LocalPush(object obj) // When there are at least 2 elements' worth of space, we can take the fast path. if (tail < m_headIndex + m_mask) { - Volatile.Write(ref m_array[tail & m_mask], obj); + m_array[tail & m_mask] = obj; + // The following write makes the slot "appear" in the queue. + // It must happen after the write of the item, and it does, since m_tailIndex is volatile. + // NOTE: we also must be sure this write is not delayed past our check for a + // pending thread request. m_tailIndex = tail + 1; } else @@ -156,7 +160,11 @@ public void LocalPush(object obj) m_mask = (m_mask << 1) | 1; } - Volatile.Write(ref m_array[tail & m_mask], obj); + m_array[tail & m_mask] = obj; + // The following write makes the slot "appear" in the queue. + // It must happen after the write of the item, and it does, since m_tailIndex is volatile. + // NOTE: we also must be sure this write is not delayed past our check for a + // pending thread request.
m_tailIndex = tail + 1; } finally @@ -165,6 +173,10 @@ public void LocalPush(object obj) m_foreignLock.Exit(useMemoryBarrier: false); } } + + // Our caller will check for a thread request now (with an ordinary read), + // make sure the check happens after the new slot appears in the queue. + Interlocked.MemoryBarrier(); } [MethodImpl(MethodImplOptions.NoInlining)] @@ -410,7 +422,6 @@ public int Count private bool _loggingEnabled; private bool _dispatchNormalPriorityWorkFirst; - private bool _mayHaveHighPriorityWorkItems; // SOS's ThreadPool command depends on the following names internal readonly WorkQueue workItems = new WorkQueue(); @@ -431,29 +442,6 @@ public int Count private readonly int[] _assignedWorkItemQueueThreadCounts = s_assignableWorkItemQueueCount > 0 ? new int[s_assignableWorkItemQueueCount] : Array.Empty(); - [StructLayout(LayoutKind.Sequential)] - private struct CacheLineSeparated - { - private readonly Internal.PaddingFor32 pad1; - - // This flag is used for communication between item enqueuing and workers that process the items. - // There are two states of this flag: - // 0: has no guarantees - // 1: means a worker will check work queues and ensure that - // any work items inserted in work queue before setting the flag - // are picked up. - // Note: The state must be cleared by the worker thread _before_ - // checking. Otherwise there is a window between finding no work - // and resetting the flag, when the flag is in a wrong state. - // A new work item may be added right before the flag is reset - // without asking for a worker, while the last worker is quitting. 
- public int _hasOutstandingThreadRequest; - - private readonly Internal.PaddingFor32 pad2; - } - - private CacheLineSeparated _separated; - public ThreadPoolWorkQueue() { for (int i = 0; i < s_assignableWorkItemQueueCount; i++) @@ -464,48 +452,6 @@ public ThreadPoolWorkQueue() RefreshLoggingEnabled(); } - private void AssignWorkItemQueue(ThreadPoolWorkQueueThreadLocals tl) - { - Debug.Assert(s_assignableWorkItemQueueCount > 0); - - _queueAssignmentLock.Acquire(); - - // Determine the first queue that has not yet been assigned to the limit of worker threads - int queueIndex = -1; - int minCount = int.MaxValue; - int minCountQueueIndex = 0; - for (int i = 0; i < s_assignableWorkItemQueueCount; i++) - { - int count = _assignedWorkItemQueueThreadCounts[i]; - Debug.Assert(count >= 0); - if (count < ProcessorsPerAssignableWorkItemQueue) - { - queueIndex = i; - _assignedWorkItemQueueThreadCounts[queueIndex] = count + 1; - break; - } - - if (count < minCount) - { - minCount = count; - minCountQueueIndex = i; - } - } - - if (queueIndex < 0) - { - // All queues have been fully assigned. Choose the queue that has been assigned to the least number of worker - // threads. 
- queueIndex = minCountQueueIndex; - _assignedWorkItemQueueThreadCounts[queueIndex]++; - } - - _queueAssignmentLock.Release(); - - tl.queueIndex = queueIndex; - tl.assignedGlobalWorkItemQueue = _assignableWorkItemQueues[queueIndex]; - } - private void TryReassignWorkItemQueue(ThreadPoolWorkQueueThreadLocals tl) { Debug.Assert(s_assignableWorkItemQueueCount > 0); @@ -521,6 +467,13 @@ private void TryReassignWorkItemQueue(ThreadPoolWorkQueueThreadLocals tl) return; } + // if not assigned yet, assume temporarily that the last queue is assigned + if (queueIndex == -1) + { + queueIndex = _assignedWorkItemQueueThreadCounts.Length - 1; + _assignedWorkItemQueueThreadCounts[queueIndex]++; + } + // If the currently assigned queue is assigned to other worker threads, try to reassign an earlier queue to this // worker thread if the earlier queue is not assigned to the limit of worker threads Debug.Assert(_assignedWorkItemQueueThreadCounts[queueIndex] >= 0); @@ -549,6 +502,11 @@ private void UnassignWorkItemQueue(ThreadPoolWorkQueueThreadLocals tl) Debug.Assert(s_assignableWorkItemQueueCount > 0); int queueIndex = tl.queueIndex; + if (queueIndex == -1) + { + // a queue was never assigned + return; + } _queueAssignmentLock.Acquire(); int newCount = --_assignedWorkItemQueueThreadCounts[queueIndex]; @@ -572,8 +530,12 @@ private void UnassignWorkItemQueue(ThreadPoolWorkQueueThreadLocals tl) if (movedWorkItem) { - EnsureThreadRequested(); + ThreadPool.EnsureWorkerRequested(); } + + // unassigned state + tl.queueIndex = -1; + tl.assignedGlobalWorkItemQueue = workItems; } public ThreadPoolWorkQueueThreadLocals GetOrCreateThreadLocals() => @@ -608,16 +570,6 @@ public void RefreshLoggingEnabledFull() _loggingEnabled = FrameworkEventSource.Log.IsEnabled(EventLevel.Verbose, FrameworkEventSource.Keywords.ThreadPool | FrameworkEventSource.Keywords.ThreadTransfer); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal void EnsureThreadRequested() - { - // Only one worker is 
requested at a time to mitigate Thundering Herd problem. - if (Interlocked.Exchange(ref _separated._hasOutstandingThreadRequest, 1) == 0) - { - ThreadPool.RequestWorkerThread(); - } - } - public void Enqueue(object callback, bool forceGlobal) { Debug.Assert((callback is IThreadPoolWorkItem) ^ (callback is Task)); @@ -648,7 +600,7 @@ public void Enqueue(object callback, bool forceGlobal) } } - EnsureThreadRequested(); + ThreadPool.EnsureWorkerRequested(); } #if CORECLR @@ -696,10 +648,7 @@ public void EnqueueAtHighPriority(object workItem) highPriorityWorkItems.Enqueue(workItem); - // If the change below is seen by another thread, ensure that the enqueued work item will also be visible - Volatile.Write(ref _mayHaveHighPriorityWorkItems, true); - - EnsureThreadRequested(); + ThreadPool.EnsureWorkerRequested(); } internal static void TransferAllLocalWorkItemsToHighPriorityGlobalQueue() @@ -713,39 +662,32 @@ internal static void TransferAllLocalWorkItemsToHighPriorityGlobalQueue() // Pop each work item off the local queue and push it onto the global. This is a // bounded loop as no other thread is allowed to push into this thread's queue. ThreadPoolWorkQueue queue = ThreadPool.s_workQueue; - bool addedHighPriorityWorkItem = false; - bool ensureThreadRequest = false; + bool ensureWorkerRequest = false; while (tl.workStealingQueue.LocalPop() is object workItem) { + // A work item had been removed temporarily and other threads may have missed stealing it, so ensure that + // there will be a thread request + ensureWorkerRequest = true; + // If there's an unexpected exception here that happens to get handled, the lost work item, or missing thread // request, etc., may lead to other issues. A fail-fast or try-finally here could reduce the effect of such // uncommon issues to various degrees, but it's also uncommon to check for unexpected exceptions. 
try { queue.highPriorityWorkItems.Enqueue(workItem); - addedHighPriorityWorkItem = true; } catch (OutOfMemoryException) { // This is not expected to throw under normal circumstances tl.workStealingQueue.LocalPush(workItem); - // A work item had been removed temporarily and other threads may have missed stealing it, so ensure that - // there will be a thread request - ensureThreadRequest = true; break; } } - if (addedHighPriorityWorkItem) - { - Volatile.Write(ref queue._mayHaveHighPriorityWorkItems, true); - ensureThreadRequest = true; - } - - if (ensureThreadRequest) + if (ensureWorkerRequest) { - queue.EnsureThreadRequested(); + ThreadPool.EnsureWorkerRequested(); } } @@ -774,9 +716,11 @@ internal static bool LocalFindAndPop(object callback) tl.isProcessingHighPriorityWorkItems = false; } - else if ( - _mayHaveHighPriorityWorkItems && - Interlocked.CompareExchange(ref _mayHaveHighPriorityWorkItems, false, true) && +#if FEATURE_SINGLE_THREADED + else if (highPriorityWorkItems.Count == 0 && +#else + else if (!highPriorityWorkItems.IsEmpty && +#endif TryStartProcessingHighPriorityWorkItemsAndDequeue(tl, out workItem)) { return workItem; @@ -855,7 +799,6 @@ private bool TryStartProcessingHighPriorityWorkItemsAndDequeue( } tl.isProcessingHighPriorityWorkItems = true; - _mayHaveHighPriorityWorkItems = true; return true; } @@ -934,33 +877,15 @@ internal static bool Dispatch() { ThreadPoolWorkQueue workQueue = ThreadPool.s_workQueue; ThreadPoolWorkQueueThreadLocals tl = workQueue.GetOrCreateThreadLocals(); - - if (s_assignableWorkItemQueueCount > 0) - { - workQueue.AssignWorkItemQueue(tl); - } - - // Before dequeuing the first work item, acknowledge that the thread request has been satisfied - workQueue._separated._hasOutstandingThreadRequest = 0; - - // The state change must happen before sweeping queues for items. - Interlocked.MemoryBarrier(); - object? 
workItem = DequeueWithPriorityAlternation(workQueue, tl, out bool missedSteal); if (workItem == null) { - if (s_assignableWorkItemQueueCount > 0) - { - workQueue.UnassignWorkItemQueue(tl); - } - // Missing a steal means there may be an item that we were unable to get. - // Effectively, we failed to fulfill our promise to check the queues after - // clearing "Scheduled" flag. + // Effectively, we failed to fulfill our promise to check the queues for work. // We need to make sure someone will do another pass. if (missedSteal) { - workQueue.EnsureThreadRequested(); + ThreadPool.EnsureWorkerRequested(); } // Tell the VM we're returning normally, not because Hill Climbing asked us to return. @@ -972,7 +897,7 @@ internal static bool Dispatch() // In a worst case the current workitem will indirectly depend on progress of other // items and that would lead to a deadlock if no one else checks the queue. // We must ensure at least one more worker is coming if the queue is not empty. - workQueue.EnsureThreadRequested(); + ThreadPool.EnsureWorkerRequested(); // // After this point, this method is no longer responsible for ensuring thread requests except for missed steals @@ -1019,7 +944,7 @@ internal static bool Dispatch() // if (missedSteal) { - workQueue.EnsureThreadRequested(); + ThreadPool.EnsureWorkerRequested(); } return true; @@ -1089,7 +1014,7 @@ internal static bool Dispatch() // The quantum expired, do any necessary periodic activities - if (ThreadPool.YieldFromDispatchLoop) + if (ThreadPool.YieldFromDispatchLoop(currentTickCount)) { // The runtime-specific thread pool implementation requires the Dispatch loop to return to the VM // periodically to let it perform its own work @@ -1105,6 +1030,10 @@ internal static bool Dispatch() { // Due to hill climbing, over time arbitrary worker threads may stop working and eventually unbalance the // queue assignments. Periodically try to reassign a queue to keep the assigned queues busy. 
+ // + // This can also be the first time the queue is assigned. + // We do not assign eagerly at the beginning of Dispatch as we would need to take _queueAssignmentLock + // and that lock may cause massive contentions if many threads start dispatching. workQueue.TryReassignWorkItemQueue(tl); } @@ -1179,6 +1108,7 @@ internal sealed class ThreadPoolWorkQueueThreadLocals public ThreadPoolWorkQueueThreadLocals(ThreadPoolWorkQueue tpq) { assignedGlobalWorkItemQueue = tpq.workItems; + queueIndex = -1; workQueue = tpq; workStealingQueue = new ThreadPoolWorkQueue.WorkStealingQueue(); ThreadPoolWorkQueue.WorkStealingQueueList.Add(workStealingQueue); diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/WindowsThreadPool.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/WindowsThreadPool.cs index 25fa8cda1ad3e6..341dadb705db28 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Threading/WindowsThreadPool.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Threading/WindowsThreadPool.cs @@ -27,6 +27,29 @@ internal static class WindowsThreadPool private static IntPtr s_work; + [StructLayout(LayoutKind.Sequential)] + private struct CacheLineSeparated + { + private readonly Internal.PaddingFor32 pad1; + + // This flag is used for communication between item enqueuing and workers that process the items. + // There are two states of this flag: + // 0: has no guarantees + // 1: means a worker will check work queues and ensure that + // any work items inserted in work queue before setting the flag + // are picked up. + // Note: The state must be cleared by the worker thread _before_ + // checking. Otherwise there is a window between finding no work + // and resetting the flag, when the flag is in a wrong state. + // A new work item may be added right before the flag is reset + // without asking for a worker, while the last worker is quitting. 
+ public int _hasOutstandingThreadRequest; + + private readonly Internal.PaddingFor32 pad2; + } + + private static CacheLineSeparated _separated; + private sealed class ThreadCountHolder { internal ThreadCountHolder() => Interlocked.Increment(ref s_threadCount); @@ -147,6 +170,10 @@ private static void DispatchCallback(IntPtr instance, IntPtr context, IntPtr wor var wrapper = ThreadPoolCallbackWrapper.Enter(); Debug.Assert(s_work == work); + // Before looking for work items, acknowledge that the thread request has been satisfied + _separated._hasOutstandingThreadRequest = 0; + // NOTE: the thread request must be cleared before doing Dispatch. + // the following Interlocked.Increment will guarantee the ordering. Interlocked.Increment(ref s_workingThreadCounter.Count); ThreadPoolWorkQueue.Dispatch(); Interlocked.Decrement(ref s_workingThreadCounter.Count); @@ -155,7 +182,17 @@ private static void DispatchCallback(IntPtr instance, IntPtr context, IntPtr wor wrapper.Exit(resetThread: false); } - internal static unsafe void RequestWorkerThread() + internal static void EnsureWorkerRequested() + { + // Only one worker is requested at a time to mitigate Thundering Herd problem. + if (_separated._hasOutstandingThreadRequest == 0 && + Interlocked.Exchange(ref _separated._hasOutstandingThreadRequest, 1) == 0) + { + RequestWorkerThread(); + } + } + + private static unsafe void RequestWorkerThread() { if (s_work == IntPtr.Zero) {