diff --git a/src/Box2D.NET.Samples/Primitives/SampleTask.cs b/src/Box2D.NET.Samples/Primitives/SampleTask.cs deleted file mode 100644 index 9e88e3bd..00000000 --- a/src/Box2D.NET.Samples/Primitives/SampleTask.cs +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-FileCopyrightText: 2025 Erin Catto -// SPDX-FileCopyrightText: 2025 Ikpil Choi(ikpil@naver.com) -// SPDX-License-Identifier: MIT - -namespace Box2D.NET.Samples.Primitives; - -public class SampleTask -{ - public b2TaskCallback m_task; - public object m_taskContext; - - public int m_SetSize; - public int m_MinRange; - - - public SampleTask() - { - } - - public virtual void ExecuteRange(int start, int end, uint threadIndex) - { - m_task.Invoke(start, end, threadIndex, m_taskContext); - } -} \ No newline at end of file diff --git a/src/Box2D.NET.Samples/Primitives/TaskScheduler.cs b/src/Box2D.NET.Samples/Primitives/TaskScheduler.cs deleted file mode 100644 index 08961654..00000000 --- a/src/Box2D.NET.Samples/Primitives/TaskScheduler.cs +++ /dev/null @@ -1,81 +0,0 @@ -// SPDX-FileCopyrightText: 2025 Ikpil Choi(ikpil@naver.com) -// SPDX-License-Identifier: MIT - -using System; -using System.Collections.Concurrent; -using System.Threading; -using System.Threading.Tasks; - -namespace Box2D.NET.Samples.Primitives; - -public class TaskScheduler -{ - private ConcurrentQueue<Task> _runningTasks; - private ConcurrentQueue<uint> _workers; - private SemaphoreSlim _semaphore; - - private int _workerCount; - - public void Initialize(int workerCount) - { - _workerCount = workerCount; - _semaphore = new SemaphoreSlim(workerCount); - _runningTasks = new ConcurrentQueue<Task>(); - _workers = new ConcurrentQueue<uint>(); - for (int i = 0; i < _workerCount; ++i) - { - _workers.Enqueue((uint)i); - } - } - - public void AddTaskSetToPipe(SampleTask task) - { - // single thread - if (1 >= _workerCount) - { - task.m_task.Invoke(0, task.m_SetSize, 0, task.m_taskContext); - return; - } - - uint loop = 0; - int index = 0; - int remain = task.m_SetSize; - int minRange = 
task.m_MinRange; - while (0 < remain) - { - var stepCount = Math.Min(remain, minRange); - remain -= stepCount; - - var startIndex = index; - var endIndex = startIndex + stepCount; - - index = endIndex; - - var running = Task.Run(async () => - { - await _semaphore.WaitAsync(); - _workers.TryDequeue(out uint workerIndex); - try - { - task.m_task.Invoke(startIndex, endIndex, workerIndex, task.m_taskContext); - } - finally - { - _workers.Enqueue(workerIndex); - _semaphore.Release(); - } - }); - - _runningTasks.Enqueue(running); - } - } - - public void WaitforTask(SampleTask task) - { - // wait! - while (_runningTasks.TryDequeue(out var runningTask)) - { - runningTask.Wait(); - } - } -} \ No newline at end of file diff --git a/src/Box2D.NET.Samples/SampleApp.cs b/src/Box2D.NET.Samples/SampleApp.cs index d7fa3bac..a1a6248f 100644 --- a/src/Box2D.NET.Samples/SampleApp.cs +++ b/src/Box2D.NET.Samples/SampleApp.cs @@ -19,6 +19,7 @@ using Silk.NET.OpenGL.Extensions.ImGui; using Silk.NET.Windowing; using static Box2D.NET.B2Cores; +using static Box2D.NET.B2Constants; using static Box2D.NET.B2Diagnostics; using static Box2D.NET.B2Buffers; using static Box2D.NET.B2MathFunction; @@ -758,7 +759,7 @@ private unsafe void ScrollCallback(WindowHandle* window, double dx, double dy) private void UpdateUI() { - int maxWorkers = (int)(Environment.ProcessorCount * 1.5f); + int maxWorkers = B2_MAX_WORKERS; float fontSize = ImGui.GetFontSize(); float menuWidth = 13.0f * fontSize; diff --git a/src/Box2D.NET.Samples/Samples/Sample.cs b/src/Box2D.NET.Samples/Samples/Sample.cs index 13aa2d8d..03e3bc20 100644 --- a/src/Box2D.NET.Samples/Samples/Sample.cs +++ b/src/Box2D.NET.Samples/Samples/Sample.cs @@ -27,8 +27,7 @@ namespace Box2D.NET.Samples.Samples; public class Sample : IDisposable { - public const int k_maxContactPoints = 12 * 2048; - public const int m_maxTasks = 64; + public const int m_maxTasks = 512; public const int m_maxThreads = 64; public const int m_profileCapacity = 512; @@ -42,11 
+41,6 @@ public class Sample : IDisposable protected Camera m_camera; protected Draw m_draw; - private TaskScheduler m_scheduler; - private SampleTask[] m_tasks; - private int m_taskCount; - protected int m_threadCount; - private B2BodyId m_mouseBodyId; // @@ -79,19 +73,6 @@ public Sample(SampleContext context) m_camera = context.camera; m_draw = context.draw; - m_scheduler = new TaskScheduler(); - m_scheduler.Initialize(m_context.workerCount); - - m_tasks = new SampleTask[m_maxTasks]; - for (int i = 0; i < m_maxTasks; ++i) - { - m_tasks[i] = new SampleTask(); - } - - m_taskCount = 0; - - m_threadCount = 1 + m_context.workerCount; - m_worldId = b2_nullWorldId; m_textIncrement = 26; @@ -125,9 +106,7 @@ public virtual void Dispose() { // By deleting the world, we delete the bomb, mouse joint, etc. b2DestroyWorld(m_worldId); - - // delete m_scheduler; - // delete[] m_tasks; + } public void CreateWorld() @@ -140,8 +119,8 @@ public void CreateWorld() B2WorldDef worldDef = b2DefaultWorldDef(); worldDef.workerCount = m_context.workerCount; - worldDef.enqueueTask = EnqueueTask; - worldDef.finishTask = FinishTask; + // worldDef.enqueueTask = EnqueueTask; + // worldDef.finishTask = FinishTask; worldDef.userTaskContext = this; worldDef.enableSleep = m_context.enableSleep; @@ -340,41 +319,6 @@ private void DrawProfileSeries(ImDrawListPtr drawList, Vector2 origin, Vector2 s previous = current; } } - - - private static object EnqueueTask(b2TaskCallback task, int itemCount, int minRange, object taskContext, object userContext) - { - Sample sample = userContext as Sample; - if (sample.m_taskCount < m_maxTasks) - { - SampleTask sampleTask = sample.m_tasks[sample.m_taskCount]; - sampleTask.m_SetSize = itemCount; - sampleTask.m_MinRange = minRange; - sampleTask.m_task = task; - sampleTask.m_taskContext = taskContext; - sample.m_scheduler.AddTaskSetToPipe(sampleTask); - ++sample.m_taskCount; - return sampleTask; - } - else - { - // This is not fatal but the maxTasks should be 
increased - B2_ASSERT(false); - task(0, itemCount, 0, taskContext); - return null; - } - } - - private static void FinishTask(object taskPtr, object userContext) - { - if (taskPtr != null) - { - SampleTask sampleTask = taskPtr as SampleTask; - Sample sample = userContext as Sample; - sample.m_scheduler.WaitforTask(sampleTask); - } - } - public void ResetText() { m_textLine = m_textIncrement; @@ -568,7 +512,7 @@ public virtual void Step() for (int i = 0; i < 1; ++i) { b2World_Step(m_worldId, timeStep, m_context.subStepCount); - m_taskCount = 0; + // m_taskCount = 0; } if (timeStep > 0.0f) diff --git a/src/Box2D.NET/B2Arrays.cs b/src/Box2D.NET/B2Arrays.cs index c0c8769e..6a48e4bb 100644 --- a/src/Box2D.NET/B2Arrays.cs +++ b/src/Box2D.NET/B2Arrays.cs @@ -3,11 +3,9 @@ // SPDX-License-Identifier: MIT using System; -using System.Runtime.InteropServices; using System.Runtime.CompilerServices; using static Box2D.NET.B2Constants; using static Box2D.NET.B2Buffers; -using static Box2D.NET.B2Diagnostics; namespace Box2D.NET { @@ -205,5 +203,12 @@ public static void b2Array_Destroy(ref B2Array a) a.count = 0; a.capacity = 0; } + + public static void b2Array_ResizeAndSetZero<T>(ref B2Array<T> a, int n) where T : new() + { + b2Array_Reserve(ref a, n); + // memset(0, ...) 
+ a.count = n; + } } -} +} \ No newline at end of file diff --git a/src/Box2D.NET/B2BroadPhases.cs b/src/Box2D.NET/B2BroadPhases.cs index 2d5fa213..17a0ad92 100644 --- a/src/Box2D.NET/B2BroadPhases.cs +++ b/src/Box2D.NET/B2BroadPhases.cs @@ -17,6 +17,7 @@ using static Box2D.NET.B2ArenaAllocators; using static Box2D.NET.B2MathFunction; using static Box2D.NET.B2Shapes; +using static Box2D.NET.B2ParallelFors; namespace Box2D.NET { @@ -341,12 +342,11 @@ public static bool b2PairQueryCallback(int proxyId, ulong userData, ref B2QueryP } - public static void b2FindPairsTask(int startIndex, int endIndex, uint threadIndex, object context) + public static void b2FindPairsTask(int startIndex, int endIndex, int workerIndex, object context) { - b2TracyCZoneNC(B2TracyCZone.pair_task, "Pair", B2HexColor.b2_colorMediumSlateBlue, true); - - B2_UNUSED(threadIndex); + B2_UNUSED(workerIndex); + b2TracyCZoneNC(B2TracyCZone.pair_task, "Pair", B2HexColor.b2_colorMediumSlateBlue, true); B2World world = context as B2World; B2BroadPhase bp = world.broadPhase; @@ -406,12 +406,8 @@ public static void b2FindPairsTask(int startIndex, int endIndex, uint threadInde b2TracyCZoneEnd(B2TracyCZone.pair_task); } - public static void b2UpdateTreesTask(int startIndex, int endIndex, uint threadIndex, object context) + public static void b2UpdateTreesTask(object context) { - B2_UNUSED(startIndex); - B2_UNUSED(endIndex); - B2_UNUSED(threadIndex); - b2TracyCZoneNC(B2TracyCZone.tree_task, "Rebuild BVH", B2HexColor.b2_colorFireBrick, true); B2World world = (B2World)context; @@ -440,7 +436,7 @@ public static void b2UpdateBroadPhasePairs(B2World world) bp.moveResults = b2AllocateArenaItem(alloc, moveCount, "move results"); // This capacity can be exceeded if there are many overlapping pairs (e.g. 
all shapes at the origin) - bp.movePairCapacity = 8 * moveCount; + bp.movePairCapacity = 32 * moveCount; bp.movePairs = b2AllocateArenaItem(alloc, bp.movePairCapacity, "move pairs"); b2AtomicStoreInt(ref bp.movePairIndex, 0); @@ -451,20 +447,23 @@ public static void b2UpdateBroadPhasePairs(B2World world) #endif int minRange = 64; - object userPairTask = world.enqueueTaskFcn(b2FindPairsTask, moveCount, minRange, world, world.userTaskContext); - if (userPairTask != null) - { - world.finishTaskFcn(userPairTask, world.userTaskContext); - world.taskCount += 1; - } + b2ParallelFor(world, b2FindPairsTask, moveCount, minRange, world); + + b2TracyCZoneNC(B2TracyCZone.create_contacts, "Create Contacts", B2HexColor.b2_colorCoral, true); // Task that can be done in parallel with the narrow-phase // - rebuild the collision tree for dynamic and kinematic bodies to keep their query performance good - world.userTreeTask = world.enqueueTaskFcn(b2UpdateTreesTask, 1, 1, world, world.userTaskContext); - world.taskCount += 1; - world.activeTaskCount += world.userTreeTask == null ? 0 : 1; - - b2TracyCZoneNC(B2TracyCZone.create_contacts, "Create Contacts", B2HexColor.b2_colorCoral, true); + if (world.taskCount < B2_MAX_TASKS) + { + world.userTreeTask = world.enqueueTaskFcn(b2UpdateTreesTask, world, world.userTaskContext); + world.taskCount += 1; + world.activeTaskCount += world.userTreeTask == null ? 0 : 1; + } + else + { + world.userTreeTask = null; + b2UpdateTreesTask(world); + } // Single-threaded work // - Clear move flags @@ -579,4 +578,4 @@ internal static void b2ValidateNoEnlarged(B2BroadPhase bp) #endif } } -} \ No newline at end of file +} diff --git a/src/Box2D.NET/B2Constants.cs b/src/Box2D.NET/B2Constants.cs index a24634e7..0983d6c4 100644 --- a/src/Box2D.NET/B2Constants.cs +++ b/src/Box2D.NET/B2Constants.cs @@ -12,8 +12,14 @@ public static class B2Constants // problems, so 100km as a limit should be fine in all cases. 
public static float B2_HUGE => (100000.0f * b2GetLengthUnitsPerMeter()); - // Maximum parallel workers. Used to size some static arrays. - public const int B2_MAX_WORKERS = 64; + // Maximum parallel workers. Used for some fixed size arrays. + public const int B2_MAX_WORKERS = 32; + + // Maximum number of tasks queued per world step. b2EnqueueTaskCallback will never be called + // more than this per world step. This is related to B2_MAX_WORKERS. With 32 workers, + // the maximum observed task count is 130. This allows an external task system to use a fixed + // size array for Box2D tasks, which may help with creating stable user task pointers. + public const int B2_MAX_TASKS = 256; // Maximum number of colors in the constraint graph. Constraints that cannot // find a color are added to the overflow set which are solved single-threaded. @@ -81,4 +87,4 @@ public static class B2Constants /// Simple djb2 hash function for determinism testing public const int B2_HASH_INIT = 5381; } -} \ No newline at end of file +} diff --git a/src/Box2D.NET/B2ContactSolvers.cs b/src/Box2D.NET/B2ContactSolvers.cs index 2570cf42..16c6ddc1 100644 --- a/src/Box2D.NET/B2ContactSolvers.cs +++ b/src/Box2D.NET/B2ContactSolvers.cs @@ -789,10 +789,11 @@ public static B2Vec2W b2RotateVectorW(in B2RotW q, in B2Vec2W v) } - public static int b2GetContactConstraintSIMDByteCount() + // This function allows hiding SIMD intrinsics in the source file to improve compilation performance. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int b2GetWideContactConstraintByteCount() { - //return sizeof( b2ContactConstraintSIMD ); - return -1; + return B2SizeOf.Size; } @@ -1959,4 +1960,4 @@ internal static void b2StoreImpulsesTask(int startIndex, int endIndex, B2StepCon b2TracyCZoneEnd(B2TracyCZone.store_impulses); } } -} \ No newline at end of file +} diff --git a/src/Box2D.NET/B2Delegates.cs b/src/Box2D.NET/B2Delegates.cs index b9bad37b..9c9f0cf1 100644 --- a/src/Box2D.NET/B2Delegates.cs +++ b/src/Box2D.NET/B2Delegates.cs @@ -23,35 +23,16 @@ namespace Box2D.NET public delegate void b2LogFcn(in string message); /// Task interface - /// This is prototype for a Box2D task. Your task system is expected to invoke the Box2D task with these arguments. - /// The task spans a range of the parallel-for: [startIndex, endIndex) - /// The worker index must correctly identify each worker in the user thread pool, expected in [0, workerCount). - /// A worker must only exist on only one thread at a time and is analogous to the thread index. - /// The task context is the context pointer sent from Box2D when it is enqueued. - /// The startIndex and endIndex are expected in the range [0, itemCount) where itemCount is the argument to b2EnqueueTaskCallback - /// below. Box2D expects startIndex < endIndex and will execute a loop like this: - /// - /// @code{.c} - /// for (int i = startIndex; i < endIndex; ++i) - /// { - /// DoWork(); - /// } - /// @endcode + /// This is the prototype for a Box2D task. Your task system is expected to run this callback on a worker thread, + /// exactly once per enqueue, passing back the same taskContext pointer supplied to b2EnqueueTaskCallback. /// @ingroup world - public delegate void b2TaskCallback(int startIndex, int endIndex, uint workerIndex, object taskContext); + public delegate void b2TaskCallback(object taskContext); - /// These functions can be provided to Box2D to invoke a task system. 
These are designed to work well with enkiTS. + /// These functions can be provided to Box2D to invoke a task system. /// Returns a pointer to the user's task object. May be nullptr. A nullptr indicates to Box2D that the work was executed /// serially within the callback and there is no need to call b2FinishTaskCallback. - /// The itemCount is the number of Box2D work items that are to be partitioned among workers by the user's task system. - /// This is essentially a parallel-for. The minRange parameter is a suggestion of the minimum number of items to assign - /// per worker to reduce overhead. For example, suppose the task is small and that itemCount is 16. A minRange of 8 suggests - /// that your task system should split the work items among just two workers, even if you have more available. - /// In general the range [startIndex, endIndex) send to b2TaskCallback should obey: - /// endIndex - startIndex >= minRange - /// The exception of course is when itemCount < minRange. /// @ingroup world - public delegate object b2EnqueueTaskCallback(b2TaskCallback task, int itemCount, int minRange, object taskContext, object userContext); + public delegate object b2EnqueueTaskCallback(b2TaskCallback task, object taskContext, object userContext); /// Finishes a user task object that wraps a Box2D task. /// @ingroup world @@ -192,4 +173,4 @@ namespace Box2D.NET /// Draw a string in world space public delegate void DrawStringFcn(in B2Vec2 p, string s, B2HexColor color, object context); -} \ No newline at end of file +} diff --git a/src/Box2D.NET/B2Islands.cs b/src/Box2D.NET/B2Islands.cs index d3e9a658..e8d8e4aa 100644 --- a/src/Box2D.NET/B2Islands.cs +++ b/src/Box2D.NET/B2Islands.cs @@ -678,12 +678,10 @@ public static void b2SplitIsland(B2World world, int baseId) // Note: static bodies are never in an island // Note: this task interacts with some allocators without locks under the assumption that no other tasks // are interacting with these data structures. 
- public static void b2SplitIslandTask(int startIndex, int endIndex, uint threadIndex, object context) + public static void b2SplitIslandTask(object context) { b2TracyCZoneNC(B2TracyCZone.split, "Split Island", B2HexColor.b2_colorOlive, true); - B2_UNUSED(startIndex, endIndex, threadIndex); - ulong ticks = b2GetTicks(); B2World world = (B2World)context; diff --git a/src/Box2D.NET/B2Mutexes.cs b/src/Box2D.NET/B2Mutexes.cs index b843af46..43618e09 100644 --- a/src/Box2D.NET/B2Mutexes.cs +++ b/src/Box2D.NET/B2Mutexes.cs @@ -20,12 +20,12 @@ public static void b2DestroyMutex(ref B2Mutex m) m.lc = null; } - internal static void b2LockMutex(ref B2Mutex m) + public static void b2LockMutex(ref B2Mutex m) { Monitor.Enter(m.lc); } - internal static void b2UnlockMutex(ref B2Mutex m) + public static void b2UnlockMutex(ref B2Mutex m) { Monitor.Exit(m.lc); } diff --git a/src/Box2D.NET/B2ParallelFor.cs b/src/Box2D.NET/B2ParallelFor.cs new file mode 100644 index 00000000..8bccff59 --- /dev/null +++ b/src/Box2D.NET/B2ParallelFor.cs @@ -0,0 +1,140 @@ +// SPDX-FileCopyrightText: 2026 Erin Catto +// SPDX-FileCopyrightText: 2026 Ikpil Choi(ikpil@naver.com) +// SPDX-License-Identifier: MIT + +using System; +using static Box2D.NET.B2Atomics; +using static Box2D.NET.B2Constants; +using static Box2D.NET.B2Diagnostics; +using static Box2D.NET.B2MathFunction; + +namespace Box2D.NET +{ + // Callback invoked by b2ParallelFor to process a range of items. May be called + // multiple times per worker: work is divided into blocks that workers claim + // atomically, so a worker that finishes early picks up the next unclaimed + // block instead of sitting idle. workerIndex is the worker identity and is + // stable across all invocations from the same worker, so it is safe to use as + // an index into per-worker state (e.g. world.taskContexts.data[workerIndex]). 
+ public delegate void b2ParallelForCallback(int startIndex, int endIndex, int workerIndex, object context); + + public static class B2ParallelFors + { + private static void b2ParallelForTrampoline(object taskContext) + { + B2ParallelForTask task = (B2ParallelForTask)taskContext; + B2ParallelForShared shared = task.shared; + int workerIndex = task.workerIndex; + object context = shared.context; + b2ParallelForCallback callback = shared.callback; + + int blockCount = shared.blockCount; + int blockSize = shared.blockSize; + int itemCount = shared.itemCount; + + while (true) + { + int blockIndex = b2AtomicFetchAddInt(ref shared.nextBlock, 1); + if (blockIndex >= blockCount) + { + break; + } + + int start = blockIndex * blockSize; + int end = start + blockSize; + if (end > itemCount) + { + end = itemCount; + } + + callback(start, end, workerIndex, context); + } + } + + // Divide [0, itemCount) into blocks and process them with cooperative claiming: + // up to world.workerCount tasks are enqueued, and each task loops, atomically + // claiming the next unclaimed block until the range is drained. Blocks the + // caller until all work is complete. minRange is the minimum block size; block + // size grows once itemCount exceeds 4 * workerCount * minRange so block count + // stays bounded. + public static void b2ParallelFor(B2World world, b2ParallelForCallback callback, int itemCount, int minRange, object context) + { + if (itemCount <= 0) + { + return; + } + + B2_ASSERT(minRange > 0); + + int workerCount = world.workerCount; + B2_ASSERT(0 < workerCount && workerCount <= B2_MAX_WORKERS); + + // Target multiple blocks per worker to reduce thread stalls. + // block size grows once items exceed maxBlockCount * minRange + // so the block count stays bounded and per-block sync overhead stays low. 
+ int blocksPerWorker = 4; + int maxBlockCount = blocksPerWorker * workerCount; + + int blockSize; + int blockCount; + if (itemCount <= minRange * maxBlockCount) + { + blockSize = minRange; + blockCount = (itemCount + blockSize - 1) / blockSize; + } + else + { + blockSize = (itemCount + maxBlockCount - 1) / maxBlockCount; + blockCount = (itemCount + blockSize - 1) / blockSize; + } + + B2_ASSERT(blockCount >= 1); + B2_ASSERT(blockSize * blockCount >= itemCount); + + // No point enqueueing more tasks than blocks. + int taskCount = b2MinInt(workerCount, blockCount); + + B2ParallelForShared shared = new B2ParallelForShared + { + blockCount = blockCount, + blockSize = blockSize, + itemCount = itemCount, + callback = callback, + context = context, + }; + b2AtomicStoreInt(ref shared.nextBlock, 0); + + Span<B2ParallelForTask> tasks = new B2ParallelForTask[B2_MAX_WORKERS]; + object[] handles = new object[B2_MAX_WORKERS]; + for (int i = 0; i < taskCount; ++i) + { + tasks[i] = new B2ParallelForTask + { + shared = shared, + workerIndex = i, + }; + + if (world.taskCount < B2_MAX_TASKS) + { + handles[i] = world.enqueueTaskFcn(b2ParallelForTrampoline, tasks[i], world.userTaskContext); + world.taskCount += 1; + world.activeTaskCount += handles[i] == null ? 0 : 1; + } + else + { + handles[i] = null; + b2ParallelForTrampoline(tasks[i]); + } + } + + for (int i = 0; i < taskCount; ++i) + { + if (handles[i] != null) + { + world.finishTaskFcn(handles[i], world.userTaskContext); + world.activeTaskCount -= 1; + } + } + } + } +} \ No newline at end of file diff --git a/src/Box2D.NET/B2ParallelForShared.cs b/src/Box2D.NET/B2ParallelForShared.cs new file mode 100644 index 00000000..5b5734dd --- /dev/null +++ b/src/Box2D.NET/B2ParallelForShared.cs @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: 2026 Erin Catto +// SPDX-FileCopyrightText: 2026 Ikpil Choi(ikpil@naver.com) +// SPDX-License-Identifier: MIT + +namespace Box2D.NET +{ + // Shared state for one b2ParallelFor invocation. 
Workers race on nextBlock to + // claim work, so a slow chunk can't strand the other threads. + internal class B2ParallelForShared + { + public B2AtomicInt nextBlock; + public int blockCount; + public int blockSize; + public int itemCount; + public b2ParallelForCallback callback; + public object context; + } +} \ No newline at end of file diff --git a/src/Box2D.NET/B2ParallelForTask.cs b/src/Box2D.NET/B2ParallelForTask.cs new file mode 100644 index 00000000..8144b24c --- /dev/null +++ b/src/Box2D.NET/B2ParallelForTask.cs @@ -0,0 +1,13 @@ +// SPDX-FileCopyrightText: 2026 Erin Catto +// SPDX-FileCopyrightText: 2026 Ikpil Choi(ikpil@naver.com) +// SPDX-License-Identifier: MIT + +namespace Box2D.NET +{ + internal class B2ParallelForTask + { + public B2ParallelForShared shared; + public int workerIndex; + } + +} \ No newline at end of file diff --git a/src/Box2D.NET/B2Scheduler.cs b/src/Box2D.NET/B2Scheduler.cs new file mode 100644 index 00000000..02cd2ffc --- /dev/null +++ b/src/Box2D.NET/B2Scheduler.cs @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: 2026 Erin Catto +// SPDX-FileCopyrightText: 2026 Ikpil Choi(ikpil@naver.com) +// SPDX-License-Identifier: MIT + +using static Box2D.NET.B2Constants; + +namespace Box2D.NET +{ + public sealed class B2Scheduler + { + public B2Thread[] threads = new B2Thread[B2_MAX_WORKERS]; + public B2SchedulerWorkerContext[] workerContexts = new B2SchedulerWorkerContext[B2_MAX_WORKERS]; + + // total workers including main thread + public int workerCount; + + // threads created = workerCount - 1 + public int threadCount; + + public readonly B2SchedulerTask[] tasks = new B2SchedulerTask[B2_MAX_TASKS]; + public B2AtomicInt nextSlot; + + public B2Semaphore taskSemaphore; + public B2AtomicInt shutdown; + } +} \ No newline at end of file diff --git a/src/Box2D.NET/B2SchedulerTask.cs b/src/Box2D.NET/B2SchedulerTask.cs new file mode 100644 index 00000000..26357c4a --- /dev/null +++ b/src/Box2D.NET/B2SchedulerTask.cs @@ -0,0 +1,13 @@ +// 
SPDX-FileCopyrightText: 2026 Erin Catto +// SPDX-FileCopyrightText: 2026 Ikpil Choi(ikpil@naver.com) +// SPDX-License-Identifier: MIT + +namespace Box2D.NET +{ + public class B2SchedulerTask + { + public b2TaskCallback callback; + public object taskContext; + public B2AtomicInt status; + } +} diff --git a/src/Box2D.NET/B2SchedulerTaskStatus.cs b/src/Box2D.NET/B2SchedulerTaskStatus.cs new file mode 100644 index 00000000..a7d98699 --- /dev/null +++ b/src/Box2D.NET/B2SchedulerTaskStatus.cs @@ -0,0 +1,14 @@ +// SPDX-FileCopyrightText: 2026 Erin Catto +// SPDX-FileCopyrightText: 2026 Ikpil Choi(ikpil@naver.com) +// SPDX-License-Identifier: MIT + +namespace Box2D.NET +{ + public enum B2SchedulerTaskStatus + { + b2_schedulerFree = 0, + b2_schedulerPending = 1, + b2_schedulerClaimed = 2, + b2_schedulerComplete = 3, + } +} diff --git a/src/Box2D.NET/B2SchedulerWorkerContext.cs b/src/Box2D.NET/B2SchedulerWorkerContext.cs new file mode 100644 index 00000000..fbef0a7f --- /dev/null +++ b/src/Box2D.NET/B2SchedulerWorkerContext.cs @@ -0,0 +1,12 @@ +// SPDX-FileCopyrightText: 2026 Erin Catto +// SPDX-FileCopyrightText: 2026 Ikpil Choi(ikpil@naver.com) +// SPDX-License-Identifier: MIT + +namespace Box2D.NET +{ + public sealed class B2SchedulerWorkerContext + { + public B2Scheduler scheduler; + public int threadIndex; + } +} diff --git a/src/Box2D.NET/B2Schedulers.cs b/src/Box2D.NET/B2Schedulers.cs new file mode 100644 index 00000000..c8565d35 --- /dev/null +++ b/src/Box2D.NET/B2Schedulers.cs @@ -0,0 +1,169 @@ +// SPDX-FileCopyrightText: 2026 Erin Catto +// SPDX-FileCopyrightText: 2026 Ikpil Choi(ikpil@naver.com) +// SPDX-License-Identifier: MIT + +using static Box2D.NET.B2Atomics; +using static Box2D.NET.B2Constants; +using static Box2D.NET.B2Diagnostics; +using static Box2D.NET.B2Timers; +using static Box2D.NET.B2Semaphores; +using static Box2D.NET.B2Threads; + +namespace Box2D.NET +{ + public static class B2Schedulers + { + // Try to claim and execute one pending task. 
+ // Returns true if work was performed, false otherwise. + private static bool b2SchedulerExecuteOne(B2Scheduler scheduler) + { + int taskCount = b2AtomicLoadInt(ref scheduler.nextSlot); + for (int t = 0; t < taskCount; ++t) + { + ref B2SchedulerTask task = ref scheduler.tasks[t]; + if (b2AtomicLoadInt(ref task.status) != (int)B2SchedulerTaskStatus.b2_schedulerPending) + { + continue; + } + + if (b2AtomicCompareExchangeInt(ref task.status, (int)B2SchedulerTaskStatus.b2_schedulerPending, (int)B2SchedulerTaskStatus.b2_schedulerClaimed) == false) + { + continue; + } + + task.callback(task.taskContext); + + b2AtomicStoreInt(ref task.status, (int)B2SchedulerTaskStatus.b2_schedulerComplete); + return true; + } + + return false; + } + + // Background worker thread entry point. + internal static void b2SchedulerWorkerMain(object context) + { + B2SchedulerWorkerContext workerContext = (B2SchedulerWorkerContext)context; + B2Scheduler scheduler = workerContext.scheduler; + + while (true) + { + b2WaitSemaphore(ref scheduler.taskSemaphore); + + if (b2AtomicLoadInt(ref scheduler.shutdown) != 0) + { + break; + } + + // Claim and execute all available work + while (b2SchedulerExecuteOne(scheduler)) + { + } + } + } + + internal static B2Scheduler b2CreateScheduler(int workerCount) + { + B2_ASSERT(0 < workerCount && workerCount <= B2_MAX_WORKERS); + B2Scheduler scheduler = new B2Scheduler(); + + scheduler.workerCount = workerCount; + int threadCount = workerCount - 1; + scheduler.threadCount = threadCount; + scheduler.taskSemaphore = b2CreateSemaphore(0); + b2AtomicStoreInt(ref scheduler.shutdown, 0); + b2AtomicStoreInt(ref scheduler.nextSlot, 0); + + for (int i = 0; i < scheduler.tasks.Length; ++i) + { + scheduler.tasks[i] = new B2SchedulerTask(); + } + + // Background threads use indices 1..workerCount-1. + // Main thread uses index 0. 
+ for (int i = 0; i < threadCount; ++i) + { + scheduler.workerContexts[i] = new B2SchedulerWorkerContext(); + scheduler.workerContexts[i].scheduler = scheduler; + scheduler.workerContexts[i].threadIndex = i + 1; + + string name = $"box2d_worker_{i + 1:00}"; + scheduler.threads[i] = b2CreateThread(b2SchedulerWorkerMain, scheduler.workerContexts[i], name); + } + + return scheduler; + } + + + internal static void b2DestroyScheduler(B2Scheduler scheduler) + { + b2AtomicStoreInt(ref scheduler.shutdown, 1); + + // Wake all background threads so they see the shutdown flag + for (int i = 0; i < scheduler.threadCount; ++i) + { + b2SignalSemaphore(ref scheduler.taskSemaphore); + } + + for (int i = 0; i < scheduler.threadCount; ++i) + { + b2JoinThread(scheduler.threads[i]); + scheduler.threads[i] = null; + } + + b2DestroySemaphore(ref scheduler.taskSemaphore); + + scheduler.threads = null; + scheduler.workerContexts = null; + scheduler.workerCount = 0; + } + + internal static void b2ResetScheduler(B2Scheduler scheduler) + { + b2AtomicStoreInt(ref scheduler.nextSlot, 0); + } + + // See b2EnqueueTaskCallback and b2FinishTaskCallback + internal static object b2SchedulerEnqueueTask(b2TaskCallback task, object taskContext, object userContext) + { + B2Scheduler scheduler = (B2Scheduler)userContext; + + int slot = b2AtomicFetchAddInt(ref scheduler.nextSlot, 1); + B2_ASSERT(slot < B2_MAX_TASKS); + + ref B2SchedulerTask schedulerTask = ref scheduler.tasks[slot]; + schedulerTask.callback = task; + schedulerTask.taskContext = taskContext; + + // Memory fence: status must be published after callback and context are written + b2AtomicStoreInt(ref schedulerTask.status, (int)B2SchedulerTaskStatus.b2_schedulerPending); + + // One wake per enqueue is enough: at most one worker picks up each task. 
+ b2SignalSemaphore(ref scheduler.taskSemaphore); + + return schedulerTask; + } + + internal static void b2SchedulerFinishTask(object userTask, object userContext) + { + if (userTask == null) + { + return; + } + + B2Scheduler scheduler = (B2Scheduler)userContext; + B2SchedulerTask waitTask = (B2SchedulerTask)userTask; + + // Main thread helps execute any available work while waiting for the + // target task to complete. This keeps the main thread from idling when + // background threads are busy on other tasks from the same phase. + while (b2AtomicLoadInt(ref waitTask.status) != (int)B2SchedulerTaskStatus.b2_schedulerComplete) + { + if (b2SchedulerExecuteOne(scheduler) == false) + { + b2Yield(); + } + } + } + } +} diff --git a/src/Box2D.NET/B2Semaphore.cs b/src/Box2D.NET/B2Semaphore.cs new file mode 100644 index 00000000..1207ce67 --- /dev/null +++ b/src/Box2D.NET/B2Semaphore.cs @@ -0,0 +1,13 @@ +// SPDX-FileCopyrightText: 2026 Erin Catto +// SPDX-FileCopyrightText: 2026 Ikpil Choi(ikpil@naver.com) +// SPDX-License-Identifier: MIT + +using System.Threading; + +namespace Box2D.NET +{ + public struct B2Semaphore + { + public SemaphoreSlim semaphore; + } +} diff --git a/src/Box2D.NET/B2Semaphores.cs b/src/Box2D.NET/B2Semaphores.cs new file mode 100644 index 00000000..db4be503 --- /dev/null +++ b/src/Box2D.NET/B2Semaphores.cs @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: 2026 Erin Catto +// SPDX-FileCopyrightText: 2026 Ikpil Choi(ikpil@naver.com) +// SPDX-License-Identifier: MIT + +using System.Threading; + +namespace Box2D.NET +{ + public static class B2Semaphores + { + public static B2Semaphore b2CreateSemaphore(int initCount) + { + B2Semaphore s; + s.semaphore = new SemaphoreSlim(initCount, int.MaxValue); + return s; + } + + public static void b2DestroySemaphore(ref B2Semaphore s) + { + s.semaphore?.Dispose(); + s.semaphore = null; + } + + public static void b2WaitSemaphore(ref B2Semaphore s) + { + s.semaphore.Wait(); + } + + public static void b2SignalSemaphore(ref 
B2Semaphore s) + { + s.semaphore.Release(); + } + } +} \ No newline at end of file diff --git a/src/Box2D.NET/B2Sensors.cs b/src/Box2D.NET/B2Sensors.cs index a8d3ebfb..c264d369 100644 --- a/src/Box2D.NET/B2Sensors.cs +++ b/src/Box2D.NET/B2Sensors.cs @@ -15,6 +15,7 @@ using static Box2D.NET.B2Distances; using static Box2D.NET.B2BitSets; using static Box2D.NET.B2CTZs; +using static Box2D.NET.B2ParallelFors; namespace Box2D.NET @@ -128,12 +129,11 @@ internal static int b2CompareVisitors(ref B2Visitor a, ref B2Visitor b) return 1; } - internal static void b2SensorTask(int startIndex, int endIndex, uint threadIndex, object context) + internal static void b2SensorTask(int startIndex, int endIndex, int threadIndex, object context) { b2TracyCZoneNC(B2TracyCZone.sensor_task, "Overlap", B2HexColor.b2_colorBrown, true); B2World world = context as B2World; - B2_ASSERT((int)threadIndex < world.workerCount); B2SensorTaskContext taskContext = world.sensorTaskContexts.data[threadIndex]; B2_ASSERT(startIndex < endIndex); @@ -255,12 +255,7 @@ internal static void b2OverlapSensors(B2World world) // Parallel-for sensors overlaps int minRange = 16; - object userSensorTask = world.enqueueTaskFcn(b2SensorTask, sensorCount, minRange, world, world.userTaskContext); - world.taskCount += 1; - if (userSensorTask != null) - { - world.finishTaskFcn(userSensorTask, world.userTaskContext); - } + b2ParallelFor(world, b2SensorTask, sensorCount, minRange, world); b2TracyCZoneNC(B2TracyCZone.sensor_state, "Events", B2HexColor.b2_colorLightSlateGray, true); @@ -401,4 +396,4 @@ public static void b2DestroySensor(B2World world, B2Shape sensorShape) } } } -} \ No newline at end of file +} diff --git a/src/Box2D.NET/B2SolverBlock.cs b/src/Box2D.NET/B2SolverBlock.cs index a170e566..6d25e719 100644 --- a/src/Box2D.NET/B2SolverBlock.cs +++ b/src/Box2D.NET/B2SolverBlock.cs @@ -4,20 +4,53 @@ namespace Box2D.NET { - // Each block of work has a sync index that gets incremented when a worker claims the block. 
This ensures only a single worker - // claims a block, yet lets work be distributed dynamically across multiple workers (work stealing). This also reduces contention - // on a single block index atomic. For non-iterative stages the sync index is simply set to one. For iterative stages (solver - // iteration) the same block of work is executed once per iteration and the atomic sync index is shared across iterations, so it - // increases monotonically. + // Solver work is partitioned into fixed-size blocks that worker threads claim + // in parallel via atomic CAS on each block's own syncIndex. Three properties + // of this design matter for performance: + // + // 1. Distributed contention. Per-block atomic syncIndex avoids the cache line stampede + // that a single shared fetch_add counter would cause. Once a worker + // settles into a block range, its CAS targets live in its own L1. + // + // 2. Monotonic syncIndex across iterations. Iterative stages (warm start, + // solve, relax) reuse the same block array every sub-step iteration. + // syncIndex grows each iteration; workers CAS (prev, prev+1), so the + // main thread never touches any per-block state between iterations. + // Non-iterative stages simply use syncIndex 1. + // + // 3. L2 affinity across iterations. Each worker picks a start offset from + // its workerIndex, then scans forward and (after wrap) backward: + // + // blocks: [0] [1] [2] [3] [4] [5] [6] [7] + // ^ ^ ^ ^ + // W0 W1 W2 W3 <- start offsets + // + // W0 claims 0,1,2,3 (forward), W1 claims 4,5, etc. Under balanced load + // each worker re-hits the same block range every iteration, keeping that + // range's hot data resident in its L2. A failed CAS means a neighbour + // already claimed the block, so the stealing worker stops -- preserving + // locality under mild imbalance while still draining the queue. 
+ // + // A graph color stage lays out joint blocks first, then contact blocks: + // + // stage->blocks -> + // +------+------+------+------+------+------+------+ + // | J0 | J1 | J2 | C0 | C1 | C2 | C3 | + // +------+------+------+------+------+------+------+ + // <-- graphJointBlocks --><---- graphContactBlocks ----> + // + // Each block carries its type so the dispatcher routes J-blocks to the joint + // solver and C-blocks to the SIMD contact solver; both kinds run concurrently + // within the stage -- no barrier between them. The type tag lives on the + // block (not the stage) so that mixed-type stages can keep the concurrency. public class B2SolverBlock { public int startIndex; - // todo make this uint16_t - public short count; + public ushort count; - public short blockType; // b2SolverBlockType + // b2SolverBlockType + public short blockType; - // todo consider false sharing of this atomic public B2AtomicInt syncIndex; } } diff --git a/src/Box2D.NET/B2Solvers.cs b/src/Box2D.NET/B2Solvers.cs index 9267bd0e..55a0bdae 100644 --- a/src/Box2D.NET/B2Solvers.cs +++ b/src/Box2D.NET/B2Solvers.cs @@ -26,6 +26,7 @@ using static Box2D.NET.B2Timers; using static Box2D.NET.B2Islands; using static Box2D.NET.B2BroadPhases; +using static Box2D.NET.B2ParallelFors; using static Box2D.NET.B2ArenaAllocators; using static Box2D.NET.B2ConstraintGraphs; using static Box2D.NET.B2CTZs; @@ -659,15 +660,13 @@ internal static void b2SolveContinuous(B2World world, int bodySimIndex, B2TaskCo b2TracyCZoneEnd(B2TracyCZone.ccd); } - internal static void b2FinalizeBodiesTask(int startIndex, int endIndex, uint threadIndex, object context) + internal static void b2FinalizeBodiesTask(int startIndex, int endIndex, int threadIndex, object context) { b2TracyCZoneNC(B2TracyCZone.finalize_transforms, "Transforms", B2HexColor.b2_colorMediumSeaGreen, true); B2StepContext stepContext = context as B2StepContext; B2World world = stepContext.world; - B2_ASSERT((int)threadIndex < world.workerCount); - 
bool enableSleep = world.enableSleep; B2BodyState[] states = stepContext.states; B2BodySim[] sims = stepContext.sims; @@ -681,9 +680,9 @@ internal static void b2FinalizeBodiesTask(int startIndex, int endIndex, uint thr B2_ASSERT(endIndex <= world.bodyMoveEvents.count); B2BodyMoveEvent[] moveEvents = world.bodyMoveEvents.data; - ref B2BitSet enlargedSimBitSet = ref world.taskContexts.data[threadIndex].enlargedSimBitSet; - ref B2BitSet awakeIslandBitSet = ref world.taskContexts.data[threadIndex].awakeIslandBitSet; B2TaskContext taskContext = world.taskContexts.data[threadIndex]; + ref B2BitSet enlargedSimBitSet = ref taskContext.enlargedSimBitSet; + ref B2BitSet awakeIslandBitSet = ref taskContext.awakeIslandBitSet; bool enableContinuous = world.enableContinuous; @@ -894,6 +893,85 @@ public enum b2SolverBlockType } b2SolverBlockType; */ + // Compute the number of work blocks needed given an item count and desired block size. + // If there are too many blocks for the worker count, the block size is enlarged. + internal static int b2ComputeBlockCount(int itemCount, int defaultBlockSize, int maxBlockCount) + { + if (itemCount == 0) + { + return 0; + } + + if (itemCount > defaultBlockSize * maxBlockCount) + { + return maxBlockCount; + } + + return ((itemCount - 1) / defaultBlockSize) + 1; + } + + // Initialize solver blocks for a contiguous range of items. Computes block size internally + // from the same parameters used by b2ComputeBlockCount. 
+ internal static void b2InitBlocks(ArraySegment blocks, int blockCount, int itemCount, int defaultBlockSize, int maxBlockCount, B2SolverBlockType blockType) + { + if (blockCount == 0) + { + return; + } + + // Compute the number of elements per block + int blockSize; + if (itemCount > defaultBlockSize * maxBlockCount) + { + blockSize = itemCount / maxBlockCount; + } + else + { + blockSize = defaultBlockSize; + } + + // Simulation too big + B2_ASSERT(blockSize <= ushort.MaxValue); + + for (int i = 0; i < blockCount; ++i) + { + blocks[i].startIndex = i * blockSize; + blocks[i].count = (ushort)blockSize; + blocks[i].blockType = (short)blockType; + b2AtomicStoreInt(ref blocks[i].syncIndex, 0); + } + + // The last block may not be full + blocks[blockCount - 1].count = (ushort)(itemCount - (blockCount - 1) * blockSize); + } + + internal static int b2InitStage(int stageIndex, ArraySegment stages, B2SolverStageType type, ArraySegment blocks, int blockCount, int colorIndex) + { + B2SolverStage stage = stages[stageIndex]; + stage.type = type; + stage.blocks = blocks; + stage.blockCount = blockCount; + stage.colorIndex = colorIndex; + b2AtomicStoreInt(ref stage.completionCount, 0); + return stageIndex + 1; + } + + // Initialize one stage per color for each iteration. Used for warm start, solve, relax, and restitution. 
+ internal static int b2InitColorStages(int stageIndex, ArraySegment stages, B2SolverStageType type, int iterations, + int activeColorCount, ArraySegment[] graphColorBlocks, ReadOnlySpan colorBlockCounts, + ReadOnlySpan activeColorIndices) + { + for (int j = 0; j < iterations; ++j) + { + for (int i = 0; i < activeColorCount; ++i) + { + stageIndex = b2InitStage(stageIndex, stages, type, graphColorBlocks[i], colorBlockCounts[i], activeColorIndices[i]); + } + } + + return stageIndex; + } + internal static void b2ExecuteBlock(B2SolverStage stage, B2StepContext context, B2SolverBlock block, int workerIndex) { B2SolverStageType stageType = stage.type; @@ -1080,11 +1158,8 @@ internal static void b2ExecuteMainStage(B2SolverStage stage, B2StepContext conte } } - // This should not use the thread index because thread 0 can be called twice by enkiTS. - internal static void b2SolverTask(int startIndex, int endIndex, uint threadIndexIgnore, object taskContext) + internal static void b2SolverTask(object taskContext) { - B2_UNUSED(startIndex, endIndex, threadIndexIgnore); - B2WorkerContext workerContext = taskContext as B2WorkerContext; int workerIndex = workerContext.workerIndex; B2StepContext context = workerContext.context; @@ -1320,14 +1395,12 @@ internal static void b2SolverTask(int startIndex, int endIndex, uint threadIndex } } - internal static void b2BulletBodyTask(int startIndex, int endIndex, uint threadIndex, object context) + internal static void b2BulletBodyTask(int startIndex, int endIndex, int threadIndex, object context) { - B2_UNUSED(threadIndex); - b2TracyCZoneNC(B2TracyCZone.bullet_body_task, "Bullet", B2HexColor.b2_colorLightSkyBlue, true); B2StepContext stepContext = context as B2StepContext; - B2TaskContext taskContext = b2Array_Get(ref stepContext.world.taskContexts, (int)threadIndex); + B2TaskContext taskContext = b2Array_Get(ref stepContext.world.taskContexts, threadIndex); B2_ASSERT(startIndex <= endIndex); @@ -1393,63 +1466,32 @@ internal static void 
b2Solve(B2World world, B2StepContext stepContext) // prepare for move events b2Array_Resize(ref world.bodyMoveEvents, awakeBodyCount); - // A block is a range of tasks, a start index and count as a sub-array. - // Each worker receives at most M blocks of work. The workers may receive less blocks if there is not sufficient work. - // Each block of work has a minimum number of elements (block size). This in turn may limit the number of blocks. - // If there are many elements then the block size is increased so there are still at most M blocks of work per worker. - // M is a tunable number that has two goals: - // 1. keep M small to reduce overhead - // 2. keep M large enough for other workers to be able to steal work - // The block size is a power of two to make math efficient. - int workerCount = world.workerCount; - // todo 4 seems good but more benchmarking would be good + // 4 is a small power of two that allows for meaningful work stealing const int blocksPerWorker = 4; int maxBlockCount = blocksPerWorker * workerCount; // Configure blocks for tasks that parallel-for bodies - int bodyBlockSize = 1 << 5; - int bodyBlockCount; - if (awakeBodyCount > bodyBlockSize * maxBlockCount) - { - // Too many blocks, increase block size - bodyBlockSize = awakeBodyCount / maxBlockCount; - bodyBlockCount = maxBlockCount; - } - else - { - // Divide by bodyBlockSize (32) and ensure there is at least one block - bodyBlockCount = ((awakeBodyCount - 1) >> 5) + 1; - } + int bodyBlockCount = b2ComputeBlockCount(awakeBodyCount, 1 << 5, maxBlockCount); - // Configure blocks for tasks parallel-for each active graph color - // The blocks are a mix of SIMD contact blocks and joint blocks B2_ASSERT(B2FixedArray24.Size == B2_GRAPH_COLOR_COUNT); - B2FixedArray24 arrayActiveColorIndices = new B2FixedArray24(); + // Configure blocks for tasks parallel-for each active graph color + // The blocks are a mix of wide contact blocks and joint blocks + B2FixedArray24 arrayActiveColorIndices = new 
B2FixedArray24(); B2FixedArray24 arrayColorContactCounts = new B2FixedArray24(); - B2FixedArray24 arrayColorContactBlockSizes = new B2FixedArray24(); - B2FixedArray24 arrayColorContactBlockCounts = new B2FixedArray24(); - B2FixedArray24 arrayColorJointCounts = new B2FixedArray24(); - B2FixedArray24 arrayColorJointBlockSizes = new B2FixedArray24(); - B2FixedArray24 arrayColorJointBlockCounts = new B2FixedArray24(); + B2FixedArray24 arrayColorBlockCounts = new B2FixedArray24(); Span activeColorIndices = arrayActiveColorIndices.AsSpan(); - Span colorContactCounts = arrayColorContactCounts.AsSpan(); - Span colorContactBlockSizes = arrayColorContactBlockSizes.AsSpan(); - Span colorContactBlockCounts = arrayColorContactBlockCounts.AsSpan(); - Span colorJointCounts = arrayColorJointCounts.AsSpan(); - Span colorJointBlockSizes = arrayColorJointBlockSizes.AsSpan(); - Span colorJointBlockCounts = arrayColorJointBlockCounts.AsSpan(); - + Span colorBlockCounts = arrayColorBlockCounts.AsSpan(); int graphBlockCount = 0; // c is the active color index - int simdContactCount = 0; + int wideContactCount = 0; int c = 0; for (int i = 0; i < B2_GRAPH_COLOR_COUNT - 1; ++i) { @@ -1460,58 +1502,17 @@ internal static void b2Solve(B2World world, B2StepContext stepContext) { activeColorIndices[c] = i; - // 4/8-way SIMD - int colorContactCountSIMD = colorContactCount > 0 ? 
((colorContactCount - 1) >> B2_SIMD_SHIFT) + 1 : 0; - - colorContactCounts[c] = colorContactCountSIMD; - - // determine the number of contact work blocks for this color - if (colorContactCountSIMD > blocksPerWorker * maxBlockCount) - { - // too many contact blocks per worker, so make bigger blocks - colorContactBlockSizes[c] = colorContactCountSIMD / maxBlockCount; - colorContactBlockCounts[c] = maxBlockCount; - } - else if (colorContactCountSIMD > 0) - { - // dividing by blocksPerWorker (4) - colorContactBlockSizes[c] = blocksPerWorker; - - // This math makes sure there is at least one block - //colorContactBlockCounts[c] = ((colorContactCountSIMD - 1) >> 2) + 1; - colorContactBlockCounts[c] = ((colorContactCountSIMD - 1) / blocksPerWorker) + 1; - } - else - { - // no contacts in this color - colorContactBlockSizes[c] = 0; - colorContactBlockCounts[c] = 0; - } - + // Ceiling for wide constraint count + int colorContactCountW = colorContactCount > 0 ? ((colorContactCount - 1) >> B2_SIMD_SHIFT) + 1 : 0; + colorContactCounts[c] = colorContactCountW; colorJointCounts[c] = colorJointCount; - // determine number of joint work blocks for this color - if (colorJointCount > blocksPerWorker * maxBlockCount) - { - // too many joint blocks - colorJointBlockSizes[c] = colorJointCount / maxBlockCount; - colorJointBlockCounts[c] = maxBlockCount; - } - else if (colorJointCount > 0) - { - // dividing by blocksPerWorker (4) - colorJointBlockSizes[c] = blocksPerWorker; - //colorJointBlockCounts[c] = ((colorJointCount - 1) >> 2) + 1; - colorJointBlockCounts[c] = ((colorJointCount - 1) / 4) + 1; - } - else - { - colorJointBlockSizes[c] = 0; - colorJointBlockCounts[c] = 0; - } + int colorContactBlockCount = b2ComputeBlockCount(colorContactCountW, blocksPerWorker, maxBlockCount); + int colorJointBlockCount = b2ComputeBlockCount(colorJointCount, blocksPerWorker, maxBlockCount); + colorBlockCounts[c] = colorContactBlockCount + colorJointBlockCount; - graphBlockCount += 
colorContactBlockCounts[c] + colorJointBlockCounts[c]; - simdContactCount += colorContactCountSIMD; + graphBlockCount += colorBlockCounts[c]; + wideContactCount += colorContactCountW; c += 1; } } @@ -1519,17 +1520,17 @@ internal static void b2Solve(B2World world, B2StepContext stepContext) activeColorCount = c; // Gather contact pointers for easy parallel-for traversal. Some may be NULL due to SIMD remainders. - ArraySegment contacts = b2AllocateArenaItem( - world.arena, B2_SIMD_WIDTH * simdContactCount, "contact pointers"); + ArraySegment contacts = + b2AllocateArenaItem(world.arena, B2_SIMD_WIDTH * wideContactCount, "contact pointers"); // Gather joint pointers for easy parallel-for traversal. ArraySegment joints = b2AllocateArenaItem(world.arena, awakeJointCount, "joint pointers"); B2_ASSERT(B2FixedArray4.Size == B2_SIMD_WIDTH); - int simdConstraintSize = b2GetContactConstraintSIMDByteCount(); + int wideContactConstraintByteCount = b2GetWideContactConstraintByteCount(); ArraySegment wideContactConstraints = - b2AllocateArenaItem(world.arena, simdContactCount /** simdConstraintSize */, "contact constraint"); + b2AllocateArenaItem(world.arena, wideContactCount /** wideContactConstraintByteCount */, "contact constraint"); int overflowContactCount = colors[B2_OVERFLOW_INDEX].contactSims.count; ArraySegment overflowContactConstraints = b2AllocateArenaItem( @@ -1554,24 +1555,25 @@ internal static void b2Solve(B2World world, B2StepContext stepContext) } else { - //color.simdConstraints = (b2ContactConstraintSIMD*)( (byte*)simdContactConstraints + contactBase * simdConstraintSize ); color.wideConstraints = wideContactConstraints.Slice(contactBase); + // Flat array of contacts for (int k = 0; k < colorContactCount; ++k) { contacts[B2_SIMD_WIDTH * contactBase + k] = color.contactSims.data[k]; } // remainder - int colorContactCountSIMD = ((colorContactCount - 1) >> B2_SIMD_SHIFT) + 1; - for (int k = colorContactCount; k < B2_SIMD_WIDTH * colorContactCountSIMD; ++k) + int 
colorContactCountW = ((colorContactCount - 1) >> B2_SIMD_SHIFT) + 1; + for (int k = colorContactCount; k < B2_SIMD_WIDTH * colorContactCountW; ++k) { contacts[B2_SIMD_WIDTH * contactBase + k] = null; } - contactBase += colorContactCountSIMD; + contactBase += colorContactCountW; } + // Flat array of joints int colorJointCount = color.jointSims.count; for (int k = 0; k < colorJointCount; ++k) { @@ -1581,31 +1583,15 @@ internal static void b2Solve(B2World world, B2StepContext stepContext) jointBase += colorJointCount; } - B2_ASSERT(contactBase == simdContactCount); + B2_ASSERT(contactBase == wideContactCount); B2_ASSERT(jointBase == awakeJointCount); } // Define work blocks for preparing contacts and storing contact impulses - int contactBlockSize = blocksPerWorker; - //int contactBlockCount = simdContactCount > 0 ? ((simdContactCount - 1) >> 2) + 1 : 0; - int contactBlockCount = simdContactCount > 0 ? ((simdContactCount - 1) / blocksPerWorker) + 1 : 0; - if (simdContactCount > contactBlockSize * maxBlockCount) - { - // Too many blocks, increase block size - contactBlockSize = simdContactCount / maxBlockCount; - contactBlockCount = maxBlockCount; - } + int contactBlockCount = b2ComputeBlockCount(wideContactCount, blocksPerWorker, maxBlockCount); // Define work blocks for preparing joints - int jointBlockSize = blocksPerWorker; - //int jointBlockCount = awakeJointCount > 0 ? ((awakeJointCount - 1) >> 2) + 1 : 0; - int jointBlockCount = awakeJointCount > 0 ? 
((awakeJointCount - 1) / blocksPerWorker) + 1 : 0; - if (awakeJointCount > jointBlockSize * maxBlockCount) - { - // Too many blocks, increase block size - jointBlockSize = awakeJointCount / maxBlockCount; - jointBlockCount = maxBlockCount; - } + int jointBlockCount = b2ComputeBlockCount(awakeJointCount, blocksPerWorker, maxBlockCount); int stageCount = 0; @@ -1643,55 +1629,24 @@ internal static void b2Solve(B2World world, B2StepContext stepContext) object splitIslandTask = null; if (world.splitIslandId != B2_NULL_INDEX) { - splitIslandTask = world.enqueueTaskFcn(b2SplitIslandTask, 1, 1, world, world.userTaskContext); - world.taskCount += 1; - world.activeTaskCount += splitIslandTask == null ? 0 : 1; - } - - // Prepare body work blocks - for (int i = 0; i < bodyBlockCount; ++i) - { - B2SolverBlock block = bodyBlocks[i]; - block.startIndex = i * bodyBlockSize; - block.count = (short)bodyBlockSize; - block.blockType = (short)B2SolverBlockType.b2_bodyBlock; - b2AtomicStoreInt(ref block.syncIndex, 0); - } - - bodyBlocks[bodyBlockCount - 1].count = (short)(awakeBodyCount - (bodyBlockCount - 1) * bodyBlockSize); - - // Prepare joint work blocks - for (int i = 0; i < jointBlockCount; ++i) - { - B2SolverBlock block = jointBlocks[i]; - block.startIndex = i * jointBlockSize; - block.count = (short)jointBlockSize; - block.blockType = (int)B2SolverBlockType.b2_jointBlock; - b2AtomicStoreInt(ref block.syncIndex, 0); - } - - if (jointBlockCount > 0) - { - jointBlocks[jointBlockCount - 1].count = (short)(awakeJointCount - (jointBlockCount - 1) * jointBlockSize); - } - - // Prepare contact work blocks - for (int i = 0; i < contactBlockCount; ++i) - { - B2SolverBlock block = contactBlocks[i]; - block.startIndex = i * contactBlockSize; - block.count = (short)contactBlockSize; - block.blockType = (int)B2SolverBlockType.b2_contactBlock; - b2AtomicStoreInt(ref block.syncIndex, 0); + if (world.taskCount < B2_MAX_TASKS) + { + splitIslandTask = world.enqueueTaskFcn(b2SplitIslandTask, 
world, world.userTaskContext); + world.taskCount += 1; + world.activeTaskCount += splitIslandTask == null ? 0 : 1; + } + else + { + b2SplitIslandTask(world); + } } - if (contactBlockCount > 0) - { - contactBlocks[contactBlockCount - 1].count = - (short)(simdContactCount - (contactBlockCount - 1) * contactBlockSize); - } + // Prepare body, joint, and contact work blocks + b2InitBlocks(bodyBlocks, bodyBlockCount, awakeBodyCount, 1 << 5, maxBlockCount, B2SolverBlockType.b2_bodyBlock); + b2InitBlocks(jointBlocks, jointBlockCount, awakeJointCount, blocksPerWorker, maxBlockCount, B2SolverBlockType.b2_jointBlock); + b2InitBlocks(contactBlocks, contactBlockCount, wideContactCount, blocksPerWorker, maxBlockCount, B2SolverBlockType.b2_contactBlock); - // Prepare graph work blocks + // Prepare graph work blocks. Each color gets joint blocks followed by contact blocks. ArraySegment[] graphColorBlocks = new ArraySegment[B2_GRAPH_COLOR_COUNT]; ArraySegment baseGraphBlock = graphBlocks; @@ -1699,139 +1654,33 @@ internal static void b2Solve(B2World world, B2StepContext stepContext) { graphColorBlocks[i] = baseGraphBlock; - int colorJointBlockCount = colorJointBlockCounts[i]; - int colorJointBlockSize = colorJointBlockSizes[i]; - for (int j = 0; j < colorJointBlockCount; ++j) - { - B2SolverBlock block = baseGraphBlock[j]; - block.startIndex = j * colorJointBlockSize; - block.count = (short)colorJointBlockSize; - block.blockType = (short)B2SolverBlockType.b2_graphJointBlock; - b2AtomicStoreInt(ref block.syncIndex, 0); - } - - if (colorJointBlockCount > 0) - { - baseGraphBlock[colorJointBlockCount - 1].count = - (short)(colorJointCounts[i] - (colorJointBlockCount - 1) * colorJointBlockSize); - baseGraphBlock = baseGraphBlock.Slice(colorJointBlockCount); - } + int count; + count = b2ComputeBlockCount(colorJointCounts[i], blocksPerWorker, maxBlockCount); + b2InitBlocks(baseGraphBlock, count, colorJointCounts[i], blocksPerWorker, maxBlockCount, B2SolverBlockType.b2_graphJointBlock); + 
baseGraphBlock = baseGraphBlock.Slice(count); - int colorContactBlockCount = colorContactBlockCounts[i]; - int colorContactBlockSize = colorContactBlockSizes[i]; - for (int j = 0; j < colorContactBlockCount; ++j) - { - B2SolverBlock block = baseGraphBlock[j]; - block.startIndex = j * colorContactBlockSize; - block.count = (short)colorContactBlockSize; - block.blockType = (short)B2SolverBlockType.b2_graphContactBlock; - b2AtomicStoreInt(ref block.syncIndex, 0); - } - - if (colorContactBlockCount > 0) - { - baseGraphBlock[colorContactBlockCount - 1].count = - (short)(colorContactCounts[i] - (colorContactBlockCount - 1) * colorContactBlockSize); - baseGraphBlock = baseGraphBlock.Slice(colorContactBlockCount); - } + count = b2ComputeBlockCount(colorContactCounts[i], blocksPerWorker, maxBlockCount); + b2InitBlocks(baseGraphBlock, count, colorContactCounts[i], blocksPerWorker, maxBlockCount, B2SolverBlockType.b2_graphContactBlock); + baseGraphBlock = baseGraphBlock.Slice(count); } - // TODO: @ikpil check! 
B2_ASSERT((baseGraphBlock.Offset - graphBlocks.Offset) == graphBlockCount); int stageIdx = 0; - B2SolverStage stage = stages[stageIdx]; - - // Prepare joints - stage.type = B2SolverStageType.b2_stagePrepareJoints; - stage.blocks = jointBlocks; - stage.blockCount = jointBlockCount; - stage.colorIndex = -1; - b2AtomicStoreInt(ref stage.completionCount, 0); - stage = stages[++stageIdx]; - - // Prepare contacts - stage.type = B2SolverStageType.b2_stagePrepareContacts; - stage.blocks = contactBlocks; - stage.blockCount = contactBlockCount; - stage.colorIndex = -1; - b2AtomicStoreInt(ref stage.completionCount, 0); - stage = stages[++stageIdx]; - - // Integrate velocities - stage.type = B2SolverStageType.b2_stageIntegrateVelocities; - stage.blocks = bodyBlocks; - stage.blockCount = bodyBlockCount; - stage.colorIndex = -1; - b2AtomicStoreInt(ref stage.completionCount, 0); - stage = stages[++stageIdx]; - - // Warm start - for (int i = 0; i < activeColorCount; ++i) - { - stage.type = B2SolverStageType.b2_stageWarmStart; - stage.blocks = graphColorBlocks[i]; - stage.blockCount = colorJointBlockCounts[i] + colorContactBlockCounts[i]; - stage.colorIndex = activeColorIndices[i]; - b2AtomicStoreInt(ref stage.completionCount, 0); - stage = stages[++stageIdx]; - } - - // Solve graph - for (int j = 0; j < ITERATIONS; ++j) - { - for (int i = 0; i < activeColorCount; ++i) - { - stage.type = B2SolverStageType.b2_stageSolve; - stage.blocks = graphColorBlocks[i]; - stage.blockCount = colorJointBlockCounts[i] + colorContactBlockCounts[i]; - stage.colorIndex = activeColorIndices[i]; - b2AtomicStoreInt(ref stage.completionCount, 0); - stage = stages[++stageIdx]; - } - } - - // Integrate positions - stage.type = B2SolverStageType.b2_stageIntegratePositions; - stage.blocks = bodyBlocks; - stage.blockCount = bodyBlockCount; - stage.colorIndex = -1; - b2AtomicStoreInt(ref stage.completionCount, 0); - stage = stages[++stageIdx]; - - // Relax constraints - for (int j = 0; j < RELAX_ITERATIONS; 
++j) - { - for (int i = 0; i < activeColorCount; ++i) - { - stage.type = B2SolverStageType.b2_stageRelax; - stage.blocks = graphColorBlocks[i]; - stage.blockCount = colorJointBlockCounts[i] + colorContactBlockCounts[i]; - stage.colorIndex = activeColorIndices[i]; - b2AtomicStoreInt(ref stage.completionCount, 0); - stage = stages[++stageIdx]; - } - } - - // Restitution + stageIdx = b2InitStage(stageIdx, stages, B2SolverStageType.b2_stagePrepareJoints, jointBlocks, jointBlockCount, -1); + stageIdx = b2InitStage(stageIdx, stages, B2SolverStageType.b2_stagePrepareContacts, contactBlocks, contactBlockCount, -1); + stageIdx = b2InitStage(stageIdx, stages, B2SolverStageType.b2_stageIntegrateVelocities, bodyBlocks, bodyBlockCount, -1); + stageIdx = b2InitColorStages(stageIdx, stages, B2SolverStageType.b2_stageWarmStart, 1, activeColorCount, graphColorBlocks, + colorBlockCounts, activeColorIndices); + stageIdx = b2InitColorStages(stageIdx, stages, B2SolverStageType.b2_stageSolve, ITERATIONS, activeColorCount, graphColorBlocks, + colorBlockCounts, activeColorIndices); + stageIdx = b2InitStage(stageIdx, stages, B2SolverStageType.b2_stageIntegratePositions, bodyBlocks, bodyBlockCount, -1); + stageIdx = b2InitColorStages(stageIdx, stages, B2SolverStageType.b2_stageRelax, RELAX_ITERATIONS, activeColorCount, graphColorBlocks, + colorBlockCounts, activeColorIndices); // Note: joint blocks mixed in, could have joint limit restitution - for (int i = 0; i < activeColorCount; ++i) - { - stage.type = B2SolverStageType.b2_stageRestitution; - stage.blocks = graphColorBlocks[i]; - stage.blockCount = colorJointBlockCounts[i] + colorContactBlockCounts[i]; - stage.colorIndex = activeColorIndices[i]; - b2AtomicStoreInt(ref stage.completionCount, 0); - stage = stages[++stageIdx]; - } - - // Store impulses - stage.type = B2SolverStageType.b2_stageStoreImpulses; - stage.blocks = contactBlocks; - stage.blockCount = contactBlockCount; - stage.colorIndex = -1; - b2AtomicStoreInt(ref 
stage.completionCount, 0); - stage = stages[++stageIdx]; + stageIdx = b2InitColorStages(stageIdx, stages, B2SolverStageType.b2_stageRestitution, 1, activeColorCount, graphColorBlocks, + colorBlockCounts, activeColorIndices); + stageIdx = b2InitStage(stageIdx, stages, B2SolverStageType.b2_stageStoreImpulses, contactBlocks, contactBlockCount, -1); //B2_ASSERT( (int)( stage - stages ) == stageCount ); B2_ASSERT((int)(stageIdx) == stageCount); @@ -1861,7 +1710,7 @@ internal static void b2Solve(B2World world, B2StepContext stepContext) b2TracyCZoneNC(B2TracyCZone.solve_constraints, "Solve Constraints", B2HexColor.b2_colorIndigo, true); ulong constraintTicks = b2GetTicks(); - // Must use worker index because thread 0 can be assigned multiple tasks by enkiTS + // Must use worker index because thread 0 can be assigned multiple tasks int jointIdCapacity = b2GetIdCapacity(world.jointIdPool); for (int i = 0; i < workerCount; ++i) { @@ -1870,9 +1719,18 @@ internal static void b2Solve(B2World world, B2StepContext stepContext) workerContext[i].context = stepContext; workerContext[i].workerIndex = i; - workerContext[i].userTask = world.enqueueTaskFcn(b2SolverTask, 1, 1, workerContext[i], world.userTaskContext); - world.taskCount += 1; - world.activeTaskCount += workerContext[i].userTask == null ? 0 : 1; + + if (world.taskCount < B2_MAX_TASKS) + { + workerContext[i].userTask = world.enqueueTaskFcn(b2SolverTask, workerContext[i], world.userTaskContext); + world.taskCount += 1; + world.activeTaskCount += workerContext[i].userTask == null ? 0 : 1; + } + else + { + workerContext[i].userTask = null; + b2SolverTask(workerContext[i]); + } } // Finish island split @@ -1913,13 +1771,7 @@ internal static void b2Solve(B2World world, B2StepContext stepContext) } // Finalize bodies. Must happen after the constraint solver and after island splitting. 
- object finalizeBodiesTask = - world.enqueueTaskFcn(b2FinalizeBodiesTask, awakeBodyCount, 64, stepContext, world.userTaskContext); - world.taskCount += 1; - if (finalizeBodiesTask != null) - { - world.finishTaskFcn(finalizeBodiesTask, world.userTaskContext); - } + b2ParallelFor(world, b2FinalizeBodiesTask, awakeBodyCount, 64, stepContext); b2FreeArenaItem(world.arena, graphBlocks); b2FreeArenaItem(world.arena, jointBlocks); @@ -2159,13 +2011,7 @@ internal static void b2Solve(B2World world, B2StepContext stepContext) // Fast bullet bodies // Note: a bullet body may be moving slow int minRange = 8; - object userBulletBodyTask = world.enqueueTaskFcn(b2BulletBodyTask, bulletBodyCount, minRange, stepContext, - world.userTaskContext); - world.taskCount += 1; - if (userBulletBodyTask != null) - { - world.finishTaskFcn(userBulletBodyTask, world.userTaskContext); - } + b2ParallelFor(world, b2BulletBodyTask, bulletBodyCount, minRange, stepContext); // Serially enlarge broad-phase proxies for bullet shapes B2BroadPhase broadPhase = world.broadPhase; diff --git a/src/Box2D.NET/B2Thread.cs b/src/Box2D.NET/B2Thread.cs new file mode 100644 index 00000000..e03fff3d --- /dev/null +++ b/src/Box2D.NET/B2Thread.cs @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: 2026 Erin Catto +// SPDX-FileCopyrightText: 2026 Ikpil Choi(ikpil@naver.com) +// SPDX-License-Identifier: MIT + +using System.Threading; + +namespace Box2D.NET +{ + public class B2Thread + { + public Thread thread; + public string name; + public b2ThreadFunction function; + public object context; + } +} diff --git a/src/Box2D.NET/B2Threads.cs b/src/Box2D.NET/B2Threads.cs new file mode 100644 index 00000000..a2393bb9 --- /dev/null +++ b/src/Box2D.NET/B2Threads.cs @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: 2026 Erin Catto +// SPDX-FileCopyrightText: 2026 Ikpil Choi(ikpil@naver.com) +// SPDX-License-Identifier: MIT + +using System.Threading; + +namespace Box2D.NET +{ + public delegate void b2ThreadFunction(object context); + + 
public static class B2Threads + { + // macOS pthread_setname_np takes only the name; it always names the calling thread. + public static void b2SetCurrentThreadName(string name) + { + if (string.IsNullOrEmpty(name)) + { + return; + } + + Thread.CurrentThread.Name = name; + } + + public static void b2ThreadStart(object param) + { + B2Thread t = (B2Thread)param; + b2SetCurrentThreadName(t.name); + t.function(t.context); + } + + // Name may be NULL, otherwise it is copied. + public static B2Thread b2CreateThread(b2ThreadFunction function, object context, string name) + { + var t = new B2Thread(); + t.function = function; + t.context = context; + if (!string.IsNullOrEmpty(name)) + { + t.name = name; + } + else + { + t.name = string.Empty; + } + + t.thread = new Thread(b2ThreadStart) + { + IsBackground = true, + }; + t.thread.Start(t); + return t; + } + + public static void b2JoinThread(B2Thread t) + { + t.thread.Join(); + t.thread = null; + t.name = string.Empty; + t.function = null; + t.context = null; + } + } +} diff --git a/src/Box2D.NET/B2World.cs b/src/Box2D.NET/B2World.cs index 6397a568..291a2f67 100644 --- a/src/Box2D.NET/B2World.cs +++ b/src/Box2D.NET/B2World.cs @@ -133,6 +133,8 @@ public class B2World public b2FinishTaskCallback finishTaskFcn; public object userTaskContext; public object userTreeTask; + + public B2Scheduler scheduler; public B2UserData userData; @@ -243,6 +245,7 @@ public void Clear() finishTaskFcn = null; userTaskContext = null; userTreeTask = null; + scheduler = null; userData = B2UserData.Empty; diff --git a/src/Box2D.NET/B2WorldDef.cs b/src/Box2D.NET/B2WorldDef.cs index 8ca15c94..d9264304 100644 --- a/src/Box2D.NET/B2WorldDef.cs +++ b/src/Box2D.NET/B2WorldDef.cs @@ -49,13 +49,13 @@ public struct B2WorldDef /// Contact softening when mass ratios are large. Experimental. public bool enableContactSoftening; - /// Number of workers to use with the provided task system. 
Box2D performs best when using only - /// performance cores and accessing a single L2 cache. Efficiency cores and hyper-threading provide + /// Number of workers for multithreading. Box2D performs best when using performance cores and + /// accessing a single L3 cache (uniform memory). Efficiency cores and SMT provide /// little benefit and may even harm performance. - /// @note Box2D does not create threads. This is the number of threads your applications has created - /// that you are allocating to b2World_Step. - /// @warning Do not modify the default value unless you are also providing a task system and providing - /// task callbacks (enqueueTask and finishTask). + /// This is clamped to the range [1, B2_MAX_WORKERS]. + /// Using a value above 1 will turn on multithreading. If task callbacks are provided + /// then Box2D will use the user provided task system. Otherwise Box2D will create threads and use + /// an internal scheduler. public int workerCount; /// Function to spawn tasks @@ -73,4 +73,4 @@ public struct B2WorldDef /// Used internally to detect a valid definition. DO NOT SET. 
public int internalValue; } -} \ No newline at end of file +} diff --git a/src/Box2D.NET/B2Worlds.cs b/src/Box2D.NET/B2Worlds.cs index 50f0dd3b..8c588363 100644 --- a/src/Box2D.NET/B2Worlds.cs +++ b/src/Box2D.NET/B2Worlds.cs @@ -30,6 +30,8 @@ using static Box2D.NET.B2Islands; using static Box2D.NET.B2Timers; using static Box2D.NET.B2Sensors; +using static Box2D.NET.B2ParallelFors; +using static Box2D.NET.B2Schedulers; namespace Box2D.NET { @@ -50,6 +52,22 @@ private static B2World[] b2AllocWorlds(int maxWorld) return worlds; } + public static B2World b3GetUnlockedWorldFromId(B2WorldId id) + { + B2_ASSERT(1 <= id.index1 && id.index1 <= B2_MAX_WORLDS); + B2World world = b2_worlds[(id.index1 - 1)]; + B2_ASSERT(id.index1 == world.worldId + 1); + B2_ASSERT(id.generation == world.generation); + + // A world accessed from an id should not be locked + if (world.locked) + { + B2_ASSERT(false); + return null; + } + + return world; + } public static B2World b2GetWorldFromId(B2WorldId id) { @@ -82,10 +100,10 @@ internal static B2World b2GetWorldLocked(int index) return world; } - internal static object b2DefaultAddTaskFcn(b2TaskCallback task, int count, int minRange, object taskContext, object userContext) + internal static object b2DefaultAddTaskFcn(b2TaskCallback task, object taskContext, object userContext) { - B2_UNUSED(minRange, userContext); - task(0, count, 0, taskContext); + B2_UNUSED(userContext); + task(taskContext); return null; } @@ -106,6 +124,72 @@ internal static float b2DefaultRestitutionCallback(float restitutionA, ulong mat return b2MaxFloat(restitutionA, restitutionB); } + private static void b2CreateWorkerContexts(B2World world) + { + world.taskContexts = b2Array_Create(world.workerCount); + b2Array_ResizeAndSetZero(ref world.taskContexts, world.workerCount); + + world.sensorTaskContexts = b2Array_Create(world.workerCount); + b2Array_ResizeAndSetZero(ref world.sensorTaskContexts, world.workerCount); + + for (int i = 0; i < world.workerCount; ++i) + { + 
world.taskContexts.data[i].sensorHits = b2Array_Create(8); + world.taskContexts.data[i].contactStateBitSet = b2CreateBitSet(1024); + world.taskContexts.data[i].jointStateBitSet = b2CreateBitSet(1024); + world.taskContexts.data[i].enlargedSimBitSet = b2CreateBitSet(256); + world.taskContexts.data[i].awakeIslandBitSet = b2CreateBitSet(256); + world.taskContexts.data[i].splitIslandId = B2_NULL_INDEX; + + world.sensorTaskContexts.data[i].eventBits = b2CreateBitSet(128); + } + } + + private static void b2DestroyWorkerContexts(B2World world) + { + for (int i = 0; i < world.workerCount; ++i) + { + b2Array_Destroy(ref world.taskContexts.data[i].sensorHits); + b2DestroyBitSet(ref world.taskContexts.data[i].contactStateBitSet); + b2DestroyBitSet(ref world.taskContexts.data[i].jointStateBitSet); + b2DestroyBitSet(ref world.taskContexts.data[i].enlargedSimBitSet); + b2DestroyBitSet(ref world.taskContexts.data[i].awakeIslandBitSet); + + b2DestroyBitSet(ref world.sensorTaskContexts.data[i].eventBits); + } + + b2Array_Destroy(ref world.taskContexts); + b2Array_Destroy(ref world.sensorTaskContexts); + } + + private static void b2UseSerialTaskSystem(B2World world) + { + if (world.scheduler != null) + { + b2DestroyScheduler(world.scheduler); + } + + world.workerCount = 1; + world.enqueueTaskFcn = b2DefaultAddTaskFcn; + world.finishTaskFcn = b2DefaultFinishTaskFcn; + world.userTaskContext = null; + world.scheduler = null; + } + + private static void b2UseBuiltInScheduler(B2World world, int workerCount) + { + if (world.scheduler != null) + { + b2DestroyScheduler(world.scheduler); + } + + world.workerCount = b2MinInt(workerCount, B2_MAX_WORKERS); + world.scheduler = b2CreateScheduler(world.workerCount); + world.enqueueTaskFcn = b2SchedulerEnqueueTask; + world.finishTaskFcn = b2SchedulerFinishTask; + world.userTaskContext = world.scheduler; + } + public static B2WorldId b2CreateWorld(in B2WorldDef def) { // check @@ -246,35 +330,33 @@ public static B2WorldId b2CreateWorld(in B2WorldDef 
def) if (def.workerCount > 0 && def.enqueueTask != null && def.finishTask != null) { + // External task system world.workerCount = b2MinInt(def.workerCount, B2_MAX_WORKERS); world.enqueueTaskFcn = def.enqueueTask; world.finishTaskFcn = def.finishTask; world.userTaskContext = def.userTaskContext; + world.scheduler = null; + } + else if (def.workerCount > 1) + { + // Built-in scheduler + world.workerCount = b2MinInt(def.workerCount, B2_MAX_WORKERS); + world.scheduler = b2CreateScheduler(world.workerCount); + world.enqueueTaskFcn = b2SchedulerEnqueueTask; + world.finishTaskFcn = b2SchedulerFinishTask; + world.userTaskContext = world.scheduler; } else { + // Serial fallback world.workerCount = 1; world.enqueueTaskFcn = b2DefaultAddTaskFcn; world.finishTaskFcn = b2DefaultFinishTaskFcn; world.userTaskContext = null; + world.scheduler = null; } - world.taskContexts = b2Array_Create(world.workerCount); - b2Array_Resize(ref world.taskContexts, world.workerCount); - - world.sensorTaskContexts = b2Array_Create(world.workerCount); - b2Array_Resize(ref world.sensorTaskContexts, world.workerCount); - - for (int i = 0; i < world.workerCount; ++i) - { - world.taskContexts.data[i].sensorHits = b2Array_Create(8); - world.taskContexts.data[i].contactStateBitSet = b2CreateBitSet(1024); - world.taskContexts.data[i].jointStateBitSet = b2CreateBitSet(1024); - world.taskContexts.data[i].enlargedSimBitSet = b2CreateBitSet(256); - world.taskContexts.data[i].awakeIslandBitSet = b2CreateBitSet(256); - - world.sensorTaskContexts.data[i].eventBits = b2CreateBitSet(128); - } + b2CreateWorkerContexts(world); world.debugBodySet = b2CreateBitSet(256); world.debugJointSet = b2CreateBitSet(256); @@ -289,24 +371,18 @@ public static void b2DestroyWorld(B2WorldId worldId) { B2World world = b2GetWorldFromId(worldId); + if (world.scheduler != null) + { + b2DestroyScheduler(world.scheduler); + world.scheduler = null; + } + b2DestroyBitSet(ref world.debugBodySet); b2DestroyBitSet(ref world.debugJointSet); 
b2DestroyBitSet(ref world.debugContactSet); b2DestroyBitSet(ref world.debugIslandSet); - for (int i = 0; i < world.workerCount; ++i) - { - b2Array_Destroy(ref world.taskContexts.data[i].sensorHits); - b2DestroyBitSet(ref world.taskContexts.data[i].contactStateBitSet); - b2DestroyBitSet(ref world.taskContexts.data[i].jointStateBitSet); - b2DestroyBitSet(ref world.taskContexts.data[i].enlargedSimBitSet); - b2DestroyBitSet(ref world.taskContexts.data[i].awakeIslandBitSet); - - b2DestroyBitSet(ref world.sensorTaskContexts.data[i].eventBits); - } - - b2Array_Destroy(ref world.taskContexts); - b2Array_Destroy(ref world.sensorTaskContexts); + b2DestroyWorkerContexts(world); b2Array_Destroy(ref world.bodyMoveEvents); b2Array_Destroy(ref world.sensorBeginEvents); @@ -391,13 +467,12 @@ public static void b2DestroyWorld(B2WorldId worldId) world.generation = (ushort)(generation + 1); } - internal static void b2CollideTask(int startIndex, int endIndex, uint threadIndex, object context) + internal static void b2CollideTask(int startIndex, int endIndex, int threadIndex, object context) { b2TracyCZoneNC(B2TracyCZone.collide_task, "Collide", B2HexColor.b2_colorDodgerBlue, true); B2StepContext stepContext = context as B2StepContext; B2World world = stepContext.world; - B2_ASSERT((int)threadIndex < world.workerCount); B2TaskContext taskContext = world.taskContexts.data[threadIndex]; ArraySegment contactSims = stepContext.contacts; B2Shape[] shapes = world.shapes.data; @@ -628,12 +703,7 @@ internal static void b2Collide(B2StepContext context) // Task should take at least 40us on a 4GHz CPU (10K cycles) int minRange = 64; - object userCollideTask = world.enqueueTaskFcn(b2CollideTask, contactCount, minRange, context, world.userTaskContext); - world.taskCount += 1; - if (userCollideTask != null) - { - world.finishTaskFcn(userCollideTask, world.userTaskContext); - } + b2ParallelFor(world, b2CollideTask, contactCount, minRange, context); b2FreeArenaItem(world.arena, contactSims); 
context.contacts = null; @@ -819,6 +889,11 @@ public static void b2World_Step(B2WorldId worldId, float timeStep, int subStepCo world.activeTaskCount = 0; world.taskCount = 0; + if (world.scheduler != null) + { + b2ResetScheduler(world.scheduler); + } + ulong stepTicks = b2GetTicks(); // Update collision pairs and create contacts @@ -1731,6 +1806,37 @@ public static B2Profile b2World_GetProfile(B2WorldId worldId) return world.profile; } + /// Set the worker count. Must be between in the range [1, B2_MAX_WORKERS] + public static void b2World_SetWorkerCount(B2WorldId worldId, int count) + { + B2World world = b3GetUnlockedWorldFromId(worldId); + if (world == null) + { + return; + } + + if (count == world.workerCount) + { + return; + } + + b2DestroyWorkerContexts(world); + world.workerCount = b2ClampInt(count, 1, B2_MAX_WORKERS); + b2CreateWorkerContexts(world); + } + + /// Get the worker count. + public static int b2World_GetWorkerCount(B2WorldId worldId) + { + B2World world = b3GetUnlockedWorldFromId(worldId); + if (world == null) + { + return 0; + } + + return world.workerCount; + } + public static B2Counters b2World_GetCounters(B2WorldId worldId) { B2World world = b2GetWorldFromId(worldId); diff --git a/test/Box2D.NET.Test/B2DeterminismTest.cs b/test/Box2D.NET.Test/B2DeterminismTest.cs index 5744b9a8..1098a7d3 100644 --- a/test/Box2D.NET.Test/B2DeterminismTest.cs +++ b/test/Box2D.NET.Test/B2DeterminismTest.cs @@ -3,147 +3,24 @@ // SPDX-License-Identifier: MIT using System; -using System.Collections.Concurrent; -using System.Collections.Generic; -using System.Threading; -using System.Threading.Tasks; using Box2D.NET.Shared; using NUnit.Framework; using static Box2D.NET.B2Types; using static Box2D.NET.B2Worlds; -using static Box2D.NET.B2Diagnostics; using static Box2D.NET.B2Profiling; using static Box2D.NET.Shared.Determinism; namespace Box2D.NET.Test; -public class b2TaskTester : IDisposable -{ - private readonly int _workerCount; - private SemaphoreSlim _semaphore; 
- private int e_maxTasks; - public int taskCount; - private ConcurrentQueue _runningTasks; - - public b2TaskTester(int workerCount, int maxTasks) - { - _workerCount = workerCount; - _semaphore = new SemaphoreSlim(workerCount); - e_maxTasks = maxTasks; - _runningTasks = new ConcurrentQueue(); - } - - public void Dispose() - { - _semaphore.Dispose(); - _semaphore = null; - - B2_ASSERT(0 >= _runningTasks.Count); - } - - private IEnumerable Next(int itemCount, int minRange) - { - if (itemCount <= minRange) - { - yield return itemCount; - } - else - { - var workerCount = Math.Min(_workerCount, minRange); - int quotient = itemCount / workerCount; - int remainder = itemCount % workerCount; - - int distributeValue = remainder / quotient; - int extraValueCount = remainder % quotient; - - int index = 0; - for (int i = 0; i < workerCount; i++) - { - int count = quotient + distributeValue; - if (i < extraValueCount) - { - count = +1; - } - - yield return count; - } - } - } - - public object EnqueueTask(b2TaskCallback box2dTask, int itemCount, int minRange, object box2dContext, object userContext) - { - B2_UNUSED(userContext); - - if (taskCount < e_maxTasks) - { - uint loop = 0; - int index = 0; - int remain = itemCount; - while (0 < remain) - { - var stepCount = Math.Min(remain, minRange); - remain -= stepCount; - - uint workerIndex = (loop++) % (uint)_workerCount; - var startIndex = index; - var endIndex = startIndex + stepCount; - - index = endIndex; - - var running = Task.Run(async () => - { - await _semaphore.WaitAsync(); - try - { - box2dTask.Invoke(startIndex, endIndex, workerIndex, box2dContext); - } - finally - { - _semaphore.Release(); - } - }); - - _runningTasks.Enqueue(running); - } - } - else - { - box2dTask(0, itemCount, 0, box2dContext); - - return null; - } - - ++taskCount; - return box2dTask; - } - - public void FinishTask(object userTask, object userContext) - { - B2_UNUSED(userContext); - - // wait! 
- while (_runningTasks.TryDequeue(out var task)) - { - task.Wait(); - } - } -} - public class B2DeterminismTest { private const int EXPECTED_SLEEP_STEP = 293; private const uint EXPECTED_HASH = 0x2FF98AC6; - private const int e_maxTasks = 128; - // todo_erin move this to shared public static int SingleMultithreadingTest(int workerCount) { - var tester = new b2TaskTester(workerCount, e_maxTasks); - B2WorldDef worldDef = b2DefaultWorldDef(); - worldDef.enqueueTask = tester.EnqueueTask; - worldDef.finishTask = tester.FinishTask; worldDef.workerCount = workerCount; B2WorldId worldId = b2CreateWorld(worldDef); @@ -179,10 +56,19 @@ public static int SingleMultithreadingTest(int workerCount) [Test] public void MultithreadingTest() { - for (int workerCount = 1; workerCount < 6; ++workerCount) + for (int run = 0; run < 3; ++run) { - int result = SingleMultithreadingTest(workerCount); - Assert.That(result, Is.EqualTo(0)); + for (int workerCount = 1; workerCount < 16; workerCount += 2) + { + int result = SingleMultithreadingTest(workerCount); + Assert.That(result, Is.EqualTo(0)); + } + + for (int workerCount = 32; workerCount >= 0; workerCount -= 5) + { + int result = SingleMultithreadingTest(workerCount); + Assert.That(result, Is.EqualTo(0)); + } } } @@ -214,4 +100,4 @@ public void CrossPlatformTest() b2DestroyWorld(worldId); } -} \ No newline at end of file +} diff --git a/test/Box2D.NET.Test/B2ThreadTest.cs b/test/Box2D.NET.Test/B2ThreadTest.cs new file mode 100644 index 00000000..73b82f14 --- /dev/null +++ b/test/Box2D.NET.Test/B2ThreadTest.cs @@ -0,0 +1,127 @@ +// SPDX-FileCopyrightText: 2026 Erin Catto +// SPDX-FileCopyrightText: 2026 Ikpil Choi(ikpil@naver.com) +// SPDX-License-Identifier: MIT + +using NUnit.Framework; +using static Box2D.NET.B2Mutexes; +using static Box2D.NET.B2Semaphores; +using static Box2D.NET.B2Threads; + +namespace Box2D.NET.Test; + +public class B2ThreadTest +{ + private sealed class SemData + { + public B2Semaphore sem; + public int value; + } 
+ + private static void SemWorker(object context) + { + SemData data = (SemData)context; + data.value = 99; + b2SignalSemaphore(ref data.sem); + } + + [Test] + public void SemaphoreCreateDestroyTest() + { + B2Semaphore s = b2CreateSemaphore(0); + Assert.That(s.semaphore, Is.Not.Null); + b2DestroySemaphore(ref s); + } + + [Test] + public void SemaphoreSignalWaitTest() + { + SemData data = new SemData + { + sem = b2CreateSemaphore(0), + value = 0, + }; + + B2Thread thread = b2CreateThread(SemWorker, data, "sem test"); + b2WaitSemaphore(ref data.sem); + + Assert.That(data.value, Is.EqualTo(99)); + + b2JoinThread(thread); + b2DestroySemaphore(ref data.sem); + } + + [Test] + public void SemaphoreInitialCountTest() + { + B2Semaphore s = b2CreateSemaphore(3); + + b2WaitSemaphore(ref s); + b2WaitSemaphore(ref s); + b2WaitSemaphore(ref s); + + b2SignalSemaphore(ref s); + b2WaitSemaphore(ref s); + + b2DestroySemaphore(ref s); + } + + [Test] + public void ThreadCreateJoinTest() + { + SemData data = new SemData + { + sem = b2CreateSemaphore(0), + value = 0, + }; + + B2Thread thread = b2CreateThread(SemWorker, data, "join test"); + b2JoinThread(thread); + + Assert.That(data.value, Is.EqualTo(99)); + + b2DestroySemaphore(ref data.sem); + } + + private sealed class SumData + { + public B2Mutex mutex; + public int sum; + } + + private static void SumWorker(object context) + { + SumData data = (SumData)context; + for (int i = 0; i < 1000; ++i) + { + b2LockMutex(ref data.mutex); + data.sum += 1; + b2UnlockMutex(ref data.mutex); + } + } + + [Test] + public void ThreadMultipleTest() + { + SumData data = new SumData + { + mutex = b2CreateMutex(), + sum = 0, + }; + + const int threadCount = 4; + B2Thread[] threads = new B2Thread[threadCount]; + for (int i = 0; i < threadCount; ++i) + { + threads[i] = b2CreateThread(SumWorker, data, $"sum test {i}"); + } + + for (int i = 0; i < threadCount; ++i) + { + b2JoinThread(threads[i]); + } + + Assert.That(data.sum, Is.EqualTo(threadCount * 
1000)); + + b2DestroyMutex(ref data.mutex); + } +} diff --git a/test/Box2D.NET.Test/B2WorldTest.cs b/test/Box2D.NET.Test/B2WorldTest.cs index 68e6ad4a..fe5a4450 100644 --- a/test/Box2D.NET.Test/B2WorldTest.cs +++ b/test/Box2D.NET.Test/B2WorldTest.cs @@ -13,6 +13,7 @@ using static Box2D.NET.B2Constants; using static Box2D.NET.B2Joints; using static Box2D.NET.B2Diagnostics; +using static Box2D.NET.Shared.Benchmarks; namespace Box2D.NET.Test; @@ -394,4 +395,44 @@ public void TestSensor() Assert.That(beginCount, Is.EqualTo(1)); Assert.That(endCount, Is.EqualTo(1)); } -} \ No newline at end of file + + [Test] + public void TestSetWorkerCount() + { + B2WorldDef worldDef = b2DefaultWorldDef(); + worldDef.workerCount = 1; + B2WorldId worldId = b2CreateWorld(worldDef); + Assert.That(b2World_IsValid(worldId)); + Assert.That(b2World_GetWorkerCount(worldId), Is.EqualTo(1)); + + var junkyardData = CreateJunkyard(worldId); + StepJunkyard(junkyardData, worldId, 1); + + b2World_SetWorkerCount(worldId, 4); + Assert.That(b2World_GetWorkerCount(worldId), Is.EqualTo(4)); + + StepJunkyard(junkyardData, worldId, 2); + + b2World_SetWorkerCount(worldId, 4); + Assert.That(b2World_GetWorkerCount(worldId), Is.EqualTo(4)); + + StepJunkyard(junkyardData, worldId, 3); + + b2World_SetWorkerCount(worldId, 0); + Assert.That(b2World_GetWorkerCount(worldId), Is.EqualTo(1)); + + StepJunkyard(junkyardData, worldId, 4); + + b2World_SetWorkerCount(worldId, -5); + Assert.That(b2World_GetWorkerCount(worldId), Is.EqualTo(1)); + + StepJunkyard(junkyardData, worldId, 5); + + b2World_SetWorkerCount(worldId, B2_MAX_WORKERS + 10); + Assert.That(b2World_GetWorkerCount(worldId), Is.EqualTo(B2_MAX_WORKERS)); + + StepJunkyard(junkyardData, worldId, 2); + + b2DestroyWorld(worldId); + } +} diff --git a/test/Box2D.NET.Test/Helpers/B2TestContext.cs b/test/Box2D.NET.Test/Helpers/B2TestContext.cs index a1129fbb..bb6b25e4 100644 --- a/test/Box2D.NET.Test/Helpers/B2TestContext.cs +++ 
b/test/Box2D.NET.Test/Helpers/B2TestContext.cs @@ -42,10 +42,10 @@ public void Dispose() } - private static object EnqueueTask(b2TaskCallback task, int itemCount, int minRange, object taskContext, object userContext) + private static object EnqueueTask(b2TaskCallback task, object taskContext, object userContext) { // Execute the task immediately for testing purposes - task(0, itemCount, 0, taskContext); + task(taskContext); return null; } @@ -53,4 +53,4 @@ private static void FinishTask(object userTask, object userContext) { // No cleanup needed for testing } -} \ No newline at end of file +}