diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md new file mode 100644 index 00000000..6058aa96 --- /dev/null +++ b/.claude/CLAUDE.md @@ -0,0 +1,10 @@ +# Project Rules for Frontier-CS + +## Backend Selection + +**NEVER change the backend due to missing credentials or CI configuration issues.** + +- Research track: always uses SkyPilot (cloud VMs) +- Algorithmic track: always uses Docker (local) + +If CI fails due to credentials/permissions, fix the credentials - do NOT change the code to use a different backend. The backend choice is intentional for each track's evaluation requirements. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index d596ffe9..71648de6 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,6 +1,7 @@ ## Summary +> Please read [CONTRIBUTING.md](../CONTRIBUTING.md) before submitting. ## Type of Change - [ ] New research problem @@ -21,4 +22,4 @@ ## CI Validation (for new problems) > When adding new problems, CI will automatically validate that your reference solution achieves score > 0. > - Algorithmic problems: Include `reference.cpp` in your problem directory -> - Research problems: Include `reference.py` in your problem directory +> - Research problems: Include `reference.py` (or `reference.cpp` if `language: cpp` in config.yaml) diff --git a/.github/PULL_REQUEST_TEMPLATE/research_problem.md b/.github/PULL_REQUEST_TEMPLATE/research_problem.md index e8a3844e..2110cc04 100644 --- a/.github/PULL_REQUEST_TEMPLATE/research_problem.md +++ b/.github/PULL_REQUEST_TEMPLATE/research_problem.md @@ -28,7 +28,7 @@ labels: research-problem - [ ] `evaluate.sh` - Evaluation entry point - [ ] `evaluator.py` - Scoring logic (outputs 0-100 score) - [ ] `resources/` - Problem-specific code/data -- [ ] `reference.py` - Reference solution **(required for CI)** +- [ ] `reference.{py,cpp}` - Reference solution **(required for CI, extension matches `language` in config.yaml)** ### Problem Structure ``` @@ -38,7 +38,7 @@ research/{problem_name}/ ├── set_up_env.sh ├── evaluate.sh ├── evaluator.py -├── reference.py # Required: CI will validate this achieves score > 0 +├── reference.{py,cpp} # Required: CI validates score > 0 (extension per language) └── resources/ └── ... 
``` @@ -46,7 +46,7 @@ research/{problem_name}/ ### Testing - [ ] Verified `set_up_env.sh` runs successfully - [ ] Verified `evaluate.sh` runs and outputs a numeric score -- [ ] **Reference solution (`reference.py`) achieves score > 0** +- [ ] **Reference solution achieves score > 0** **Test Results** (if available): ``` diff --git a/.github/workflows/validate-problems.yml b/.github/workflows/validate-problems.yml index 3f8e77dd..966152b8 100644 --- a/.github/workflows/validate-problems.yml +++ b/.github/workflows/validate-problems.yml @@ -78,6 +78,23 @@ jobs: - name: Install dependencies run: uv sync + - name: Setup AWS credentials + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + run: | + mkdir -p ~/.aws + cat > ~/.aws/credentials << EOF + [default] + aws_access_key_id = $AWS_ACCESS_KEY_ID + aws_secret_access_key = $AWS_SECRET_ACCESS_KEY + EOF + cat > ~/.aws/config << EOF + [default] + region = us-east-1 + EOF + echo "AWS credentials configured" + - name: Setup GCP credentials env: GCP_CREDS: ${{ secrets.GCP_CREDENTIALS }} @@ -85,18 +102,29 @@ jobs: if [ -n "$GCP_CREDS" ]; then echo "$GCP_CREDS" > /tmp/gcp-key.json echo "GOOGLE_APPLICATION_CREDENTIALS=/tmp/gcp-key.json" >> $GITHUB_ENV + gcloud auth activate-service-account --key-file=/tmp/gcp-key.json + gcloud config set project ${{ secrets.GCP_PROJECT_ID }} echo "GCP credentials configured" - else - echo "No GCP credentials available, skipping..." + fi + + - name: Generate SSH key for SkyPilot + run: | + mkdir -p ~/.ssh + if [ ! -f ~/.ssh/sky-key ]; then + ssh-keygen -t rsa -b 4096 -f ~/.ssh/sky-key -N "" -C "sky-ci" + echo "Generated SSH key for SkyPilot" fi - name: Setup SkyPilot run: | - uv run sky check || echo "SkyPilot check failed, continuing..." + uv run sky check aws gcp || echo "SkyPilot check failed, continuing..." - name: Validate problems + timeout-minutes: 30 run: | echo "Validating research problems: ${{ needs.detect-changes.outputs.research }}" uv run python scripts/validate_problems.py \ --track research \ - --problems ${{ needs.detect-changes.outputs.research }} + --timeout 1200 \ + --problems ${{ needs.detect-changes.outputs.research }} \ + --verbose diff --git a/.github/workflows/weekly-eval.yml b/.github/workflows/weekly-eval.yml index eba7a985..bb197c7d 100644 --- a/.github/workflows/weekly-eval.yml +++ b/.github/workflows/weekly-eval.yml @@ -100,9 +100,7 @@ jobs: --track research \ --internal-dir internal \ --results-repo results-repo \ - --workers $WORKERS \ - --clusters $CLUSTERS \ - --skypilot \ + -j $CLUSTERS \ --push - name: Run algorithmic evaluation @@ -116,8 +114,7 @@ jobs: --track algorithmic \ --internal-dir internal \ --results-repo results-repo \ - --workers $WORKERS \ - --skypilot \ + -j $WORKERS \ --push - name: Upload results artifact diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 41ddf007..3da9351f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,8 @@ # Contributing to Frontier-CS -Frontier-CS is currently an **invitation-only** project for new problems. +> **For Problem Contributors**: Guidelines for creating and submitting new problems to Frontier-CS. + +Frontier-CS is currently an **invitation-only** project for new problems. Please create a GitHub pull request (PR) with your proposed problem following the guidelines below. After your PR is reviewed and merged, please send any hidden test data and reference solutions to the contact email provided at the end of this document. 
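Several files in this diff rely on the same convention: the reference solution's extension follows the problem's `language` setting in `config.yaml`. The sketch below is illustration only; the helper name `reference_filename` is an assumption (the repo's actual helper appears to be `frontier_cs.config.get_language_config`, used elsewhere in this diff), and it assumes the `language` field sits under `runtime:` as in the nbody_simulation configs.

```python
from pathlib import Path

import yaml  # PyYAML, which the repo already uses for config.yaml parsing

# Map the config.yaml `language` value to the reference-solution extension.
_EXTENSIONS = {"python": "py", "cpp": "cpp"}


def reference_filename(problem_dir: Path) -> str:
    """Hypothetical helper: choose reference.py vs reference.cpp for a problem.

    Assumes `language` lives under `runtime:` in config.yaml and that Python
    is the default when the field is absent.
    """
    config_path = problem_dir / "config.yaml"
    language = "python"
    if config_path.exists():
        config = yaml.safe_load(config_path.read_text()) or {}
        language = (config.get("runtime") or {}).get("language", "python")
    return f"reference.{_EXTENSIONS.get(language, 'py')}"


# e.g. research/problems/nbody_simulation/random_100k -> "reference.cpp"
```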
@@ -130,11 +132,11 @@ research/{problem_name}/ ├── evaluate.sh # Evaluation entry point ├── evaluator.py # Scoring logic ├── readme # Problem description -├── reference.py # Reference solution (required for CI validation) +├── reference.{py,cpp} # Reference solution (required for CI, extension per language) └── resources/ # Problem-specific code/data ``` -> **Note**: The `reference.py` is required for CI validation. When you submit a PR, the CI will automatically run your reference solution and verify it achieves score > 0. +> **Note**: A reference solution is required for CI validation. Use `reference.py` for Python problems or `reference.cpp` if `language: cpp` in config.yaml. The CI will automatically run your reference solution and verify it achieves score > 0. ### Solution Interface @@ -331,10 +333,12 @@ When you submit a PR that adds or modifies problems, CI will automatically valid | Track | File | Location | |-------|------|----------| | Algorithmic | `reference.cpp` | `algorithmic/problems/{id}/reference.cpp` | -| Research | `reference.py` | `research/problems/{name}/reference.py` | +| Research | `reference.{py,cpp}` | `research/problems/{name}/reference.{ext}` (extension per `language` in config.yaml) | If the reference solution is missing or scores 0, the PR will be blocked from merging. +> **Important**: The reference solution must achieve score > 0. This is a design choice to ensure the evaluator is working correctly - a score > 0 proves that the evaluation pipeline can successfully compile/run the solution and produce a valid score. If the reference only scores 0, we cannot distinguish between "evaluator error" and "valid solution with no improvement". For problems that measure speedup against a baseline, the reference must be **faster than the baseline**, not just a copy of it. 
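To make the score-gating rationale concrete, here is a hedged sketch of the kind of check CI applies after running a reference solution. The function name and the assumption that the score is the last line of evaluator output are illustrative, not the actual `scripts/validate_problems.py` implementation.

```python
import sys


def gate_reference_score(evaluator_output: str) -> None:
    """Illustrative gate: require a parseable numeric score that is strictly > 0.

    Assumes the score is the final line of the evaluator's output; the real
    validator may extract it differently.
    """
    try:
        score = float(evaluator_output.strip().splitlines()[-1])
    except (ValueError, IndexError):
        # No numeric score at all: the evaluation pipeline itself is broken.
        print("Reference run produced no numeric score; failing CI.")
        sys.exit(1)
    if score <= 0:
        # A zero score cannot distinguish "evaluator error" from
        # "valid solution with no improvement over the baseline".
        print(f"Reference scored {score}; PR is blocked until it scores > 0.")
        sys.exit(1)
    print(f"Reference score {score} > 0: evaluation pipeline verified.")
```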
+ ### Local Testing Before submitting a PR, test your reference solution locally: @@ -343,8 +347,8 @@ Before submitting a PR, test your reference solution locally: # Algorithmic frontier eval algorithmic {id} algorithmic/problems/{id}/reference.cpp -# Research -frontier eval research {name} research/problems/{name}/reference.py +# Research (use .py or .cpp based on problem's language config) +frontier eval research {name} research/problems/{name}/reference.{ext} ``` ## Contact diff --git a/README.md b/README.md index cbdf5e89..16476f0c 100644 --- a/README.md +++ b/README.md @@ -150,9 +150,9 @@ frontier eval algorithmic 1 --unbounded ### Python API ```python -from frontier_cs import FrontierCSEvaluator +from frontier_cs import SingleEvaluator -evaluator = FrontierCSEvaluator() +evaluator = SingleEvaluator() # Evaluate a research problem result = evaluator.evaluate("research", problem_id="flash_attn", code=my_code) @@ -195,28 +195,24 @@ research/solutions/ ```bash # Evaluate all research solutions (uses SkyPilot by default) -uv run frontier-eval batch research +frontier batch research # Evaluate all algorithmic solutions (uses Docker by default) -uv run frontier-eval batch algorithmic +frontier batch algorithmic # Filter by model or problem -uv run frontier-eval batch research --model gpt5.1 -uv run frontier-eval batch research --problem flash_attn -uv run frontier-eval batch research --model gpt5.1 --problem flash_attn +frontier batch research --model gpt5.1 +frontier batch research --problem flash_attn # Override default backend -uv run frontier-eval batch research --backend docker -uv run frontier-eval batch algorithmic --backend skypilot +frontier batch research --backend docker +frontier batch algorithmic --backend skypilot ``` **Custom solutions directory:** You can test solutions from a custom directory with the same structure: ```bash -# Your custom directory should have the same structure: -# my_solutions/{problem}/{model}.py - -uv run frontier-eval batch research --solutions-dir ./my_solutions +frontier batch research --solutions-dir ./my_solutions ``` Results are saved to `./results/batch/{track}/` by default. The state file tracks which (solution, problem) pairs have been evaluated, so you can: diff --git a/SUBMIT.md b/SUBMIT.md index 36663584..c81db857 100644 --- a/SUBMIT.md +++ b/SUBMIT.md @@ -1,6 +1,6 @@ # Evaluating Your Model -Complete workflow for benchmarking your model on Frontier-CS and submitting results to the leaderboard. +> **For Model Providers**: Complete workflow for benchmarking your model on Frontier-CS and submitting results to the leaderboard. ## Step 1: Prepare Solutions @@ -19,7 +19,7 @@ research/solutions/gemm_optimization/squares/my_model.py algorithmic/solutions/1/my_model.cpp ``` -- **Research track**: Python (`.py`) +- **Research track**: Python (`.py`) by default, or C++ (`.cpp`) if problem specifies `language: cpp` in config.yaml - **Algorithmic track**: C++17 (`.cpp`) - We recommend generating **5 variants per model** to compute Score@5 @@ -36,7 +36,7 @@ research/solutions/ └── ... ``` ```bash -frontier-eval batch research --model my_model +frontier batch research --model my_model ``` **2. Use your own directory** @@ -48,7 +48,7 @@ frontier-eval batch research --model my_model └── ... ``` ```bash -frontier-eval batch research --solutions-dir ./my_solutions +frontier batch research --solutions-dir ./my_solutions ``` **3. 
Explicit pairs file** @@ -59,39 +59,39 @@ frontier-eval batch research --solutions-dir ./my_solutions ./my_solutions/cross_entropy/my_model.py:cross_entropy ``` ```bash -frontier-eval batch research --pairs-file pairs.txt +frontier batch research --pairs-file pairs.txt ``` ### Backend Options ```bash # Research defaults to SkyPilot, algorithmic defaults to Docker -frontier-eval batch research --backend docker -frontier-eval batch algorithmic --backend skypilot +frontier batch research --backend docker +frontier batch algorithmic --backend skypilot # Parallelism -frontier-eval batch research --workers 20 --clusters 4 +frontier batch research --workers 20 --clusters 4 ``` ### Result Storage ```bash # Local (default): results saved to ./results/batch/{track}/ -frontier-eval batch research +frontier batch research # Cloud bucket (requires --backend skypilot): results written directly to S3/GCS -frontier-eval batch research --bucket-url s3://my-bucket/results +frontier batch research --bucket-url s3://my-bucket/results # Sync from bucket to local -frontier-eval batch research --bucket-url s3://my-bucket/results --sync-bucket +frontier batch research --bucket-url s3://my-bucket/results --sync-bucket ``` ### Control Options ```bash -frontier-eval batch research --status # Check status -frontier-eval batch research --no-resume # Force re-evaluate all -frontier-eval batch research --retry-failed # Retry failed (including score=0) +frontier batch research --status # Check status +frontier batch research --no-resume # Force re-evaluate all +frontier batch research --retry-failed # Retry failed (including score=0) ``` - Incremental evaluation with hash-based caching (solution/problem changes trigger re-evaluation) @@ -114,7 +114,7 @@ We welcome submissions from all models and agent frameworks. To have your result ### Algorithmic Problems -We currently release **1 -- 3 public test case** per problem for local testing and debugging. Full evaluation (with all test cases) is performed on our servers. +We currently release **1-3 public test cases** per problem for local testing and debugging. Full evaluation (with all test cases) is performed on our servers. #### What to Submit @@ -174,7 +174,7 @@ Problem (e.g., gemm_optimization, poc_generation) Each variant has a unique **Problem ID** based on its path under `research/`. -The full list of all evaluatable variants is in [`research/problems.txt`](research/problems.txt) (109 variants total, aggregated into ~50 categories for reporting). +The full list of all evaluatable variants is in [`research/scripts/problems.txt`](research/scripts/problems.txt). | Type | Example Path | Problem ID | |------|-------------|------------| @@ -309,7 +309,9 @@ export GOOGLE_API_KEY=... ### Generate Solutions -#### Research Track (Python) +#### Research Track + +Most research problems are Python, but some (e.g., `nbody_simulation`) require C++. The language is configured per-problem via `language` field in `config.yaml`. ```bash # Generate one solution diff --git a/algorithmic/README.md b/algorithmic/README.md index 5ce78f73..8737c55e 100644 --- a/algorithmic/README.md +++ b/algorithmic/README.md @@ -1,8 +1,8 @@ -## FrontierCS - Algorithmic Problems +# Algorithmic Problems -> For complete model evaluation workflow (prepare solutions, run batch evaluation, submit to leaderboard), see [SUBMIT.md](../SUBMIT.md). - -> **Note:** We currently support C++17 only for algorithmic problem solutions. 
+> **Technical Reference**: Problem structure, Judge API, and evaluation details for algorithmic track. +> +> For model evaluation workflow, see [SUBMIT.md](../SUBMIT.md). ### Problem Structure @@ -19,6 +19,11 @@ problems/{id}/ └── chk.cc / interactor.cc # Checker or interactor ``` +### Solution Requirements + +- **Language**: C++17 only +- **Single file**: Submit one `.cpp` file per problem + ### How It Works 1. **Fetch problem** statement from judge API @@ -27,7 +32,7 @@ problems/{id}/ 4. **Poll** for result 5. **Score** based on test case pass rate -The judge sever will save solutions and their detailed judging results under the folder `algorithmic/submissions`. +The judge server will save solutions and their detailed judging results under the folder `algorithmic/submissions`. ### Judge API @@ -43,9 +48,9 @@ The judge sever will save solutions and their detailed judging results under the ### Python API ```python -from frontier_cs import FrontierCSEvaluator +from frontier_cs import SingleEvaluator -evaluator = FrontierCSEvaluator() +evaluator = SingleEvaluator() # Evaluate an algorithmic problem result = evaluator.evaluate("algorithmic", problem_id=1, code=cpp_code) @@ -69,15 +74,12 @@ frontier eval algorithmic 1 solution.cpp --unbounded ### Batch Evaluation -```bash -# Evaluate all solutions in algorithmic/solutions/ -frontier-eval batch algorithmic +For batch evaluation of multiple solutions, see [SUBMIT.md](../SUBMIT.md#step-2-run-evaluation). -# With SkyPilot (cloud go-judge) -frontier-eval batch algorithmic --backend skypilot - -# Check status -frontier-eval batch algorithmic --status +```bash +frontier batch algorithmic # Evaluate all in solutions/ +frontier batch algorithmic --backend skypilot # Use cloud go-judge +frontier batch algorithmic --status # Check progress ``` **Note:** For algorithmic track, `--clusters` is not used. All workers share a single go-judge server (local Docker or SkyPilot). @@ -99,8 +101,6 @@ frontier eval algorithmic 1 solution.cpp --judge-url http://$(sky status --ip al > For contributing problems to Frontier-CS (detailed file formats, CI requirements), see [CONTRIBUTING.md](../CONTRIBUTING.md#algorithmic-problems). -For local testing, create `problems/{id}/` with required files and restart the judge. - ### Judge Server Configuration #### config.yaml diff --git a/pyproject.toml b/pyproject.toml index f70d3493..a7389580 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ dependencies = [ "python-dotenv>=1.2.1", "pyyaml>=6.0", "requests>=2.32.5", - "skypilot>=0.10.5", + "skypilot[aws,gcp]>=0.10.5", "tqdm>=4.67.1", ] diff --git a/research/README.md b/research/README.md index 40d136e5..9d3231b5 100644 --- a/research/README.md +++ b/research/README.md @@ -1,20 +1,14 @@ # Research Problems -> For complete model evaluation workflow (prepare solutions, run batch evaluation, submit to leaderboard), see [SUBMIT.md](../SUBMIT.md). +> **Technical Reference**: Problem structure, evaluation details, and Solution interface for research track. +> +> For model evaluation workflow, see [SUBMIT.md](../SUBMIT.md). Real-world systems challenges requiring domain expertise in GPU computing, distributed systems, ML pipelines, databases, and security. ## Basic Usage -Research track defaults to SkyPilot (cloud). Requires `sky check` first: - -```bash -# Setup SkyPilot (one-time) -pip install skypilot-nightly -sky check -``` - -See [SkyPilot docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html) for cloud credential setup. 
+Research track defaults to SkyPilot (cloud) because problems have specific resource requirements (GPUs, memory, etc.) that can affect evaluation results. Run `sky check` to verify cloud credentials. See [SkyPilot docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html) for setup. ```bash # List all problems frontier list research # Evaluate a single problem frontier eval research flash_attn # Use Docker instead (no cloud setup needed) frontier eval research flash_attn --backend docker -# Evaluate multiple problems -frontier eval research --problems flash_attn,cross_entropy ``` ## Batch Evaluation -Batch evaluation automatically scans `solutions/` and parses problem IDs from filenames: +For batch evaluation of multiple solutions, see [SUBMIT.md](../SUBMIT.md#step-2-run-evaluation). ```bash -# Evaluate all solutions (uses SkyPilot by default, auto-skips completed) -frontier-eval batch research - -# With custom parallelism -frontier-eval batch research --workers 20 --clusters 4 - -# Check status -frontier-eval batch research --status - -# Force re-evaluate all -frontier-eval batch research --no-resume - -# Retry failed evaluations -frontier-eval batch research --retry-failed +frontier batch research # Evaluate all in solutions/ +frontier batch research --model my_model # Filter by model +frontier batch research --status # Check progress ``` -**Parameters:** -- `--workers`: Number of parallel workers (default: 10) -- `--clusters`: Number of SkyPilot clusters for load-balancing (default: same as workers, research + skypilot only) - -With `--workers 20 --clusters 4`, 20 workers share 4 clusters via load-balancing. - ## Python API ```python -from frontier_cs import FrontierCSEvaluator +from frontier_cs import SingleEvaluator -evaluator = FrontierCSEvaluator() +evaluator = SingleEvaluator() -# Single problem +# Single problem (uses SkyPilot by default for research) result = evaluator.evaluate("research", problem_id="flash_attn", code=my_code) print(f"Score: {result.score}") -# With SkyPilot +# Use Docker instead result = evaluator.evaluate("research", problem_id="flash_attn", code=my_code, - backend="skypilot") + backend="docker") ``` ## Problem Structure @@ -106,6 +81,12 @@ research/problems/ > For creating new problems (config.yaml format, evaluation scripts, uv_overrides.txt), see [CONTRIBUTING.md](../CONTRIBUTING.md#research-problems). +## Solution Requirements + +- **Language**: Python by default; C++ when the problem sets `language: cpp` in config.yaml +- **Interface**: Implement the class(es) named in the problem's API section (typically a `Solution` class) +- **Single file**: Submit one solution file per problem (`solution.py`, or a single `.cpp` file for C++ problems) + ## Solution Interface Submit a `solution.py` implementing the `Solution` class.
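For the Python case, a minimal skeleton might look like the sketch below; the method name and signature are placeholders (assumptions), since the required interface comes from each problem's API section and, as noted next, varies by problem type.

```python
# solution.py -- illustrative skeleton only; the real method names and
# signatures come from the problem's API section in its readme.

class Solution:
    def solve(self, *args, **kwargs):
        # Problem-specific logic goes here (e.g. return an optimized kernel,
        # a trained model, or a tuned configuration).
        raise NotImplementedError
```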
The interface varies by problem type: diff --git a/research/problems/nbody_simulation/random_100k/config.yaml b/research/problems/nbody_simulation/random_100k/config.yaml index 5e0c8914..92674503 100644 --- a/research/problems/nbody_simulation/random_100k/config.yaml +++ b/research/problems/nbody_simulation/random_100k/config.yaml @@ -1,7 +1,10 @@ tag: hpc runtime: + language: cpp timeout_seconds: 600 environment: "C++17 with OpenMP (GCC with libgomp1) on Ubuntu 22.04, 16 vCPUs" + docker: + image: "gcc:13" resources: cloud: aws instance_type: c7i.4xlarge diff --git a/research/problems/nbody_simulation/random_100k/reference.cpp b/research/problems/nbody_simulation/random_100k/reference.cpp new file mode 100644 index 00000000..5dd93dac --- /dev/null +++ b/research/problems/nbody_simulation/random_100k/reference.cpp @@ -0,0 +1,101 @@ +// Optimized reference using spatial grid partitioning +// Significantly faster than O(N²) baseline when cullRadius << space_size + +#include "world.h" +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads = 16; + float cellSize; + int gridWidth, gridHeight; + float minX, minY, maxX, maxY; + std::vector> grid; + std::vector> threadGrids; + +public: + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(numThreads); + cellSize = 0; // Will be set in simulateStep + } + + void buildGrid(const std::vector& particles, float cullRadius) { + // Find bounds + minX = minY = 1e9f; + maxX = maxY = -1e9f; + for (const auto& p : particles) { + minX = std::min(minX, p.position.x); + minY = std::min(minY, p.position.y); + maxX = std::max(maxX, p.position.x); + maxY = std::max(maxY, p.position.y); + } + + // Add padding + minX -= cullRadius; + minY -= cullRadius; + maxX += cullRadius; + maxY += cullRadius; + + gridWidth = (int)std::ceil((maxX - minX) / cellSize) + 1; + gridHeight = (int)std::ceil((maxY - minY) / cellSize) + 1; + + // Resize and clear grid + grid.resize(gridWidth * gridHeight); + for (auto& cell : grid) cell.clear(); + + // Insert particles into grid + for (size_t i = 0; i < particles.size(); i++) { + int cx = (int)((particles[i].position.x - minX) / cellSize); + int cy = (int)((particles[i].position.y - minY) / cellSize); + cx = std::max(0, std::min(cx, gridWidth - 1)); + cy = std::max(0, std::min(cy, gridHeight - 1)); + grid[cy * gridWidth + cx].push_back(i); + } + } + + void simulateStep(std::vector& particles, + std::vector& newParticles, + StepParameters params) override { + cellSize = params.cullRadius; + buildGrid(particles, params.cullRadius); + + #pragma omp parallel for schedule(dynamic, 64) + for (int i = 0; i < (int)particles.size(); i++) { + const auto& pi = particles[i]; + Vec2 force(0.0f, 0.0f); + + // Get cell coordinates + int cx = (int)((pi.position.x - minX) / cellSize); + int cy = (int)((pi.position.y - minY) / cellSize); + cx = std::max(0, std::min(cx, gridWidth - 1)); + cy = std::max(0, std::min(cy, gridHeight - 1)); + + // Check neighboring cells (3x3) + for (int dy = -1; dy <= 1; dy++) { + int ny = cy + dy; + if (ny < 0 || ny >= gridHeight) continue; + + for (int dx = -1; dx <= 1; dx++) { + int nx = cx + dx; + if (nx < 0 || nx >= gridWidth) continue; + + const auto& cell = grid[ny * gridWidth + nx]; + for (int j : cell) { + if (j == i) continue; + if ((pi.position - particles[j].position).length() < params.cullRadius) { + force += computeForce(pi, particles[j], params.cullRadius); + } + } + } + } + + newParticles[i] = updateParticle(pi, force, params.deltaTime); + 
} + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} diff --git a/research/problems/nbody_simulation/random_10k/config.yaml b/research/problems/nbody_simulation/random_10k/config.yaml index 5e0c8914..92674503 100644 --- a/research/problems/nbody_simulation/random_10k/config.yaml +++ b/research/problems/nbody_simulation/random_10k/config.yaml @@ -1,7 +1,10 @@ tag: hpc runtime: + language: cpp timeout_seconds: 600 environment: "C++17 with OpenMP (GCC with libgomp1) on Ubuntu 22.04, 16 vCPUs" + docker: + image: "gcc:13" resources: cloud: aws instance_type: c7i.4xlarge diff --git a/research/problems/nbody_simulation/random_10k/reference.cpp b/research/problems/nbody_simulation/random_10k/reference.cpp new file mode 100644 index 00000000..5dd93dac --- /dev/null +++ b/research/problems/nbody_simulation/random_10k/reference.cpp @@ -0,0 +1,101 @@ +// Optimized reference using spatial grid partitioning +// Significantly faster than O(N²) baseline when cullRadius << space_size + +#include "world.h" +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads = 16; + float cellSize; + int gridWidth, gridHeight; + float minX, minY, maxX, maxY; + std::vector> grid; + std::vector> threadGrids; + +public: + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(numThreads); + cellSize = 0; // Will be set in simulateStep + } + + void buildGrid(const std::vector& particles, float cullRadius) { + // Find bounds + minX = minY = 1e9f; + maxX = maxY = -1e9f; + for (const auto& p : particles) { + minX = std::min(minX, p.position.x); + minY = std::min(minY, p.position.y); + maxX = std::max(maxX, p.position.x); + maxY = std::max(maxY, p.position.y); + } + + // Add padding + minX -= cullRadius; + minY -= cullRadius; + maxX += cullRadius; + maxY += cullRadius; + + gridWidth = (int)std::ceil((maxX - minX) / cellSize) + 1; + gridHeight = (int)std::ceil((maxY - minY) / cellSize) + 1; + + // Resize and clear grid + grid.resize(gridWidth * gridHeight); + for (auto& cell : grid) cell.clear(); + + // Insert particles into grid + for (size_t i = 0; i < particles.size(); i++) { + int cx = (int)((particles[i].position.x - minX) / cellSize); + int cy = (int)((particles[i].position.y - minY) / cellSize); + cx = std::max(0, std::min(cx, gridWidth - 1)); + cy = std::max(0, std::min(cy, gridHeight - 1)); + grid[cy * gridWidth + cx].push_back(i); + } + } + + void simulateStep(std::vector& particles, + std::vector& newParticles, + StepParameters params) override { + cellSize = params.cullRadius; + buildGrid(particles, params.cullRadius); + + #pragma omp parallel for schedule(dynamic, 64) + for (int i = 0; i < (int)particles.size(); i++) { + const auto& pi = particles[i]; + Vec2 force(0.0f, 0.0f); + + // Get cell coordinates + int cx = (int)((pi.position.x - minX) / cellSize); + int cy = (int)((pi.position.y - minY) / cellSize); + cx = std::max(0, std::min(cx, gridWidth - 1)); + cy = std::max(0, std::min(cy, gridHeight - 1)); + + // Check neighboring cells (3x3) + for (int dy = -1; dy <= 1; dy++) { + int ny = cy + dy; + if (ny < 0 || ny >= gridHeight) continue; + + for (int dx = -1; dx <= 1; dx++) { + int nx = cx + dx; + if (nx < 0 || nx >= gridWidth) continue; + + const auto& cell = grid[ny * gridWidth + nx]; + for (int j : cell) { + if (j == i) continue; + if ((pi.position - particles[j].position).length() < params.cullRadius) { + force += computeForce(pi, particles[j], params.cullRadius); + } + } + } + } + + newParticles[i] = 
updateParticle(pi, force, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} diff --git a/research/scripts/check_solutions.py b/research/scripts/check_solutions.py index 29aa158f..e9f5aeeb 100755 --- a/research/scripts/check_solutions.py +++ b/research/scripts/check_solutions.py @@ -28,7 +28,7 @@ import sys from collections import defaultdict from pathlib import Path -from typing import Dict, List, Set +from typing import Dict, List, Set, Tuple # Add parent to path for imports sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) @@ -39,6 +39,7 @@ format_solution_filename, FAILED_EXTENSION, ) +from frontier_cs.config import get_language_config, DEFAULT_LANGUAGE class Colors: @@ -114,8 +115,12 @@ def info(text: str) -> str: EXCLUDE_DIRS = {"common", "resources", "__pycache__", ".venv"} -def discover_problems(problems_dir: Path) -> List[str]: - """Auto-discover all problem names by finding leaf directories with readme files.""" +def discover_problems(problems_dir: Path) -> List[Tuple[str, Path]]: + """Auto-discover all problems by finding leaf directories with readme files. + + Returns: + List of (problem_name, problem_path) tuples. + """ result = [] def is_excluded(p: Path) -> bool: @@ -144,9 +149,9 @@ def has_problem_subdirs(p: Path) -> bool: # Convert path to problem name (slash-separated to match solutions structure) rel_path = p.relative_to(problems_dir) problem_name = str(rel_path) - result.append(problem_name) + result.append((problem_name, p)) - return sorted(result) + return sorted(result, key=lambda x: x[0]) def read_models_list(path: Path) -> List[str]: @@ -194,14 +199,16 @@ def compute_expected( problems: List[str], models: List[str], variants: List[int], + problem_extensions: Dict[str, str], ) -> Set[str]: - """Compute expected solution keys in format: {problem}/{model}.py""" + """Compute expected solution keys in format: {problem}/{model}.{ext}""" expected: Set[str] = set() for problem in problems: + ext = problem_extensions.get(problem, "py") for model in models: model_prefix = get_model_prefix(model) for variant_idx in variants: - filename = format_solution_filename(model_prefix, "py", variant_idx) + filename = format_solution_filename(model_prefix, ext, variant_idx) # Key format: {problem}/{filename} expected.add(f"{problem}/{filename}") return expected @@ -217,8 +224,10 @@ def scan_solutions(solutions_dir: Path) -> Dict[str, Dict]: if not solutions_dir.is_dir(): return solutions - # Find all solution files recursively - for sol_file in solutions_dir.rglob("*.py"): + # Find all solution files recursively (supports .py, .cpp, etc.) 
+ for sol_file in solutions_dir.rglob("*"): + if not sol_file.is_file(): + continue if sol_file.name.startswith("."): continue # Skip _deleted directory @@ -228,6 +237,9 @@ def scan_solutions(solutions_dir: Path) -> Dict[str, Dict]: parsed = parse_solution_filename(sol_file.name) if parsed: model, variant, ext = parsed + # Skip .FAILED marker files (handled separately) + if ext == FAILED_EXTENSION: + continue # Problem is the relative path from solutions_dir to the parent directory problem = str(sol_file.parent.relative_to(solutions_dir)) @@ -342,11 +354,19 @@ def main(): if args.no_color: Colors.disable() - # Auto-discover problems - problems = discover_problems(args.problems_dir) - if not problems: + # Auto-discover problems (returns list of (name, path) tuples) + problem_tuples = discover_problems(args.problems_dir) + if not problem_tuples: print(warning(f"No problems found in {args.problems_dir}")) + # Build problem -> extension mapping from config.yaml + problem_names = [] + problem_extensions: Dict[str, str] = {} + for name, path in problem_tuples: + problem_names.append(name) + lang_config = get_language_config(path) + problem_extensions[name] = lang_config.extension + # Read config files models = read_models_list(args.models_file) if args.models_file.exists() else [] variants = ( @@ -358,22 +378,29 @@ def main(): # Compute expected and actual expected = ( - compute_expected(problems, models, variants) if problems and models else set() + compute_expected(problem_names, models, variants, problem_extensions) + if problem_names and models + else set() ) actual = scan_solutions(args.solutions_dir) actual_set = set(actual.keys()) # Failed solutions (.FAILED marker files) failed_solutions = scan_failed_solutions(args.solutions_dir) - # Convert failed keys to match expected format: {problem}/{model}.FAILED -> {problem}/{model}.py - failed_as_py = { - key.replace(f".{FAILED_EXTENSION}", ".py") - for key in failed_solutions.keys() - } + # Convert failed keys to match expected format using problem-specific extensions + failed_as_expected = set() + for key in failed_solutions.keys(): + # Key format: {problem}/{model}.FAILED + parts = key.rsplit("/", 1) + if len(parts) == 2: + problem = parts[0] + ext = problem_extensions.get(problem, "py") + expected_key = key.replace(f".{FAILED_EXTENSION}", f".{ext}") + failed_as_expected.add(expected_key) # Analyze generated = expected & actual_set # Expected and exists - missing = expected - actual_set - failed_as_py # Expected but not generated (exclude failed) + missing = expected - actual_set - failed_as_expected # Expected but not generated (exclude failed) extra = actual_set - expected # Exists but not expected # Empty solutions diff --git a/research/scripts/gen_env.py b/research/scripts/gen_env.py index 99dceaba..7664f4e3 100644 --- a/research/scripts/gen_env.py +++ b/research/scripts/gen_env.py @@ -4,7 +4,13 @@ from pathlib import Path from typing import Dict, Optional -from frontier_cs.config import load_runtime_config, get_effective_gpu_type +from frontier_cs.config import ( + load_runtime_config, + get_effective_gpu_type, + get_language_config, + LanguageConfig, + DEFAULT_LANGUAGE, +) DEFAULT_GPU_TYPE = "L4" @@ -21,8 +27,9 @@ "T4": {"name": "NVIDIA T4", "vram": "16GB"}, } -# Base system prompt template - environment section will be injected -SYSTEM_PROMPT_TEMPLATE = """You are an expert programmer. Generate Python code for the given problem. 
+# Language-specific prompt templates (keyed by language name) +PROMPT_TEMPLATES: Dict[str, str] = { + "python": """You are an expert programmer. Generate Python code for the given problem. {environment_section} REQUIREMENTS: @@ -31,7 +38,26 @@ 3. Use efficient algorithms appropriate for the evaluation environment 4. Final class name must match the API specification exactly -Output ONLY the code, starting with imports.""" +Output ONLY the code, starting with imports.""", + + "cpp": """You are an expert programmer. Generate C++ code for the given problem. + +{environment_section} +REQUIREMENTS: +1. Output ONLY C++ code - no explanations, no markdown +2. Implement ALL required classes/functions from the API section +3. Use efficient algorithms appropriate for the evaluation environment +4. Final class name must match the API specification exactly + +Output ONLY the code, starting with includes.""", +} + + +def get_prompt_template(language: str) -> str: + """Get the prompt template for a language.""" + if language not in PROMPT_TEMPLATES: + raise ValueError(f"No prompt template for language: {language}") + return PROMPT_TEMPLATES[language] @dataclass @@ -153,6 +179,7 @@ def get_system_prompt_for_problem( 3. Default CPU environment """ env_config = EnvConfig() + lang_config = get_language_config(problem_path) # Priority 1: Try to load config from config.yaml if problem_path and problem_path.is_dir(): @@ -175,4 +202,5 @@ def get_system_prompt_for_problem( else: environment_section = build_cpu_environment(env_config) - return SYSTEM_PROMPT_TEMPLATE.format(environment_section=environment_section) + prompt_template = get_prompt_template(lang_config.name) + return prompt_template.format(environment_section=environment_section) diff --git a/research/scripts/generate_solutions.py b/research/scripts/generate_solutions.py index 2f7721a9..bc68d9a3 100755 --- a/research/scripts/generate_solutions.py +++ b/research/scripts/generate_solutions.py @@ -43,6 +43,7 @@ # Local modules (research-specific) from gen_env import get_system_prompt_for_problem +from frontier_cs.config import get_language_config, LanguageConfig from gen_io import ( load_env_file, load_solution_targets, @@ -226,14 +227,19 @@ def generate_code( code = content.strip() - # Try to extract code from markdown code blocks - code_block_pattern = r'```(?:python)?\s*\n(.*?)```' + # Try to extract code from markdown code blocks (language-aware) + lang_config = get_language_config(problem_path) + lang_tag = lang_config.code_block_tag + # Match ```{lang} or ``` (generic) code blocks + code_block_pattern = rf'```(?:{lang_tag})?\s*\n(.*?)```' matches = re.findall(code_block_pattern, code, re.DOTALL) if matches: code = max(matches, key=len).strip() else: - if code.startswith("```python"): - code = code[9:].strip() + # Fallback: strip markdown fences + lang_prefix = f"```{lang_tag}" + if code.startswith(lang_prefix): + code = code[len(lang_prefix):].strip() if code.startswith("```"): code = code[3:].strip() if code.endswith("```"): @@ -405,6 +411,7 @@ def build_tasks( relative_problem_path = Path(problem_path_real.name) problem_name = args.name or get_problem_name(relative_problem_path) + lang_config = get_language_config(problem_path_real) for model in models_list: reasoning_model = is_reasoning_model(model) @@ -412,9 +419,9 @@ def build_tasks( provider = detect_provider(model) for pos, variant_index in enumerate(variant_indices): - # Nested format: {problem}/{model}.py or {problem}/{model}_{variant}.py + # Nested format: {problem}/{model}.{ext} or 
{problem}/{model}_{variant}.{ext} solutions_dir = repo_root / "research" / "solutions" - sol_file = get_solution_path(solutions_dir, problem_name, model_prefix, "py", variant_index) + sol_file = get_solution_path(solutions_dir, problem_name, model_prefix, lang_config.extension, variant_index) sol_filename = str(sol_file.relative_to(solutions_dir)) failed_path = get_failed_path(sol_file) @@ -826,61 +833,71 @@ def print_generation_plan(is_dryrun: bool) -> None: failed: List[str] = [] def execute_task(task: GenerationTask) -> Tuple[str, str, Optional[str], str, Optional[int]]: - variant_label = f"{task.variant_position + 1}/{task.total_variants}" - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f") - log_file = logs_dir / f"{task.solution_name}_{timestamp}.log" - log_file.parent.mkdir(parents=True, exist_ok=True) - print(f"{cyan('▶')} Generating {format_solution_name(task.solution_name)} " - f"({dim('model:')} {model_name(task.model)}, {dim('variant')} {variant_label})...") - print(f" {dim('Log:')} {dim(str(log_file))}") - pool = provider_key_pools.get(task.provider) - api_key_for_task: Optional[str] = None - pool_token: Optional[int] = None + # Acquire concurrency slot for this provider (blocks if at limit) if pool: - api_key_for_task, pool_token = pool.acquire() - if api_key_for_task is None: - message = f"No available API key for provider {task.provider}; skipping." - print(f" {red('✗')} {red('ERROR:')} {message}") - return ("failed", task.solution_name, message, task.provider, None) - else: - api_key_for_task = get_fallback_api_key(task.provider) - - solutions_dir = research_dir / "solutions" - sol_file = solutions_dir / task.solution_name - failed_path = get_failed_path(sol_file) + pool.acquire_slot() try: - code = generate_code( - task.readme, - model=task.model, - api_key=api_key_for_task, - log_file=log_file, - is_reasoning_model=task.reasoning_model, - timeout=args.timeout, - problem_name=task.problem_name, - problem_path=task.problem_path, - docker_config=docker_config, - ) - # Write solution to nested directory - sol_file.parent.mkdir(parents=True, exist_ok=True) - sol_file.write_text(code, encoding="utf-8") - print(f" {green('✓')} Created: {green(str(sol_file))}") - print(f" {dim('Log saved:')} {dim(str(log_file))}") - - # Delete .FAILED file if it exists (successful retry) - if failed_path.exists(): - failed_path.unlink() - - return ("generated", task.solution_name, None, task.provider, pool_token) - except Exception as exc: - message = f"{exc} (log: {log_file})" - print(f" {red('✗')} {red('ERROR:')} {exc}") - # Write .FAILED marker file - write_failed_marker(sol_file, str(exc), task.model) - print(f" {yellow('!')} Created: {yellow(str(failed_path))}") - return ("failed", task.solution_name, message, task.provider, pool_token) + variant_label = f"{task.variant_position + 1}/{task.total_variants}" + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f") + log_file = logs_dir / f"{task.solution_name}_{timestamp}.log" + log_file.parent.mkdir(parents=True, exist_ok=True) + print(f"{cyan('▶')} Generating {format_solution_name(task.solution_name)} " + f"({dim('model:')} {model_name(task.model)}, {dim('variant')} {variant_label})...") + print(f" {dim('Log:')} {dim(str(log_file))}") + + api_key_for_task: Optional[str] = None + pool_token: Optional[int] = None + + if pool: + api_key_for_task, pool_token = pool.acquire() + if api_key_for_task is None: + message = f"No available API key for provider {task.provider}; skipping." 
+ print(f" {red('✗')} {red('ERROR:')} {message}") + return ("failed", task.solution_name, message, task.provider, None) + else: + api_key_for_task = get_fallback_api_key(task.provider) + + solutions_dir = research_dir / "solutions" + sol_file = solutions_dir / task.solution_name + failed_path = get_failed_path(sol_file) + + try: + code = generate_code( + task.readme, + model=task.model, + api_key=api_key_for_task, + log_file=log_file, + is_reasoning_model=task.reasoning_model, + timeout=args.timeout, + problem_name=task.problem_name, + problem_path=task.problem_path, + docker_config=docker_config, + ) + # Write solution to nested directory + sol_file.parent.mkdir(parents=True, exist_ok=True) + sol_file.write_text(code, encoding="utf-8") + print(f" {green('✓')} Created: {green(str(sol_file))}") + print(f" {dim('Log saved:')} {dim(str(log_file))}") + + # Delete .FAILED file if it exists (successful retry) + if failed_path.exists(): + failed_path.unlink() + + return ("generated", task.solution_name, None, task.provider, pool_token) + except Exception as exc: + message = f"{exc} (log: {log_file})" + print(f" {red('✗')} {red('ERROR:')} {exc}") + # Write .FAILED marker file + write_failed_marker(sol_file, str(exc), task.model) + print(f" {yellow('!')} Created: {yellow(str(failed_path))}") + return ("failed", task.solution_name, message, task.provider, pool_token) + finally: + # Release concurrency slot + if pool: + pool.release_slot() if total_tasks: max_workers = min(args.concurrency, total_tasks) diff --git a/research/solutions/nbody_simulation/random_100k/deepseekreasoner.cpp b/research/solutions/nbody_simulation/random_100k/deepseekreasoner.cpp new file mode 100644 index 00000000..ba4265dd --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/deepseekreasoner.cpp @@ -0,0 +1,161 @@ +#include "world.h" +#include +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads = 16; + float cellSize; + int gridWidth, gridHeight; + std::vector> gridCells; + std::vector cellOffsets; + std::vector particleIndices; + std::vector> cellCoords; + std::vector forces; + std::vector> threadLocalForces; + +public: + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(numThreads); + forces.resize(numParticles); + threadLocalForces.resize(numThreads); + for (auto& tl : threadLocalForces) { + tl.resize(numParticles); + } + + // Setup spatial grid + cellSize = params.cullRadius; + gridWidth = static_cast(100.0f / cellSize) + 1; + gridHeight = static_cast(100.0f / cellSize) + 1; + gridCells.resize(gridWidth * gridHeight); + cellOffsets.resize(gridWidth * gridHeight + 1); + particleIndices.resize(numParticles); + cellCoords.resize(numParticles); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + const int N = particles.size(); + const float cullRadius = params.cullRadius; + const float cullRadius2 = cullRadius * cullRadius; + + // Reset forces + #pragma omp parallel for + for (int i = 0; i < N; i++) { + forces[i] = Vec2{0.0f, 0.0f}; + } + + // Assign particles to grid cells + #pragma omp parallel + { + std::vector localCounts(gridCells.size(), 0); + #pragma omp for + for (int i = 0; i < N; i++) { + int cx = static_cast(particles[i].position.x / cellSize); + int cy = static_cast(particles[i].position.y / cellSize); + cx = std::max(0, std::min(gridWidth - 1, cx)); + cy = std::max(0, std::min(gridHeight - 1, cy)); + int cellIdx = cy * gridWidth + cx; 
+ cellCoords[i] = {cx, cy}; + #pragma omp atomic + localCounts[cellIdx]++; + } + + #pragma omp barrier + + // Prefix sum for offsets + #pragma omp single + { + cellOffsets[0] = 0; + for (size_t i = 0; i < gridCells.size(); i++) { + cellOffsets[i + 1] = cellOffsets[i] + localCounts[i]; + } + } + + #pragma omp barrier + + // Place particles in cells + std::vector writePtrs = cellOffsets; + #pragma omp for + for (int i = 0; i < N; i++) { + int cx = cellCoords[i][0]; + int cy = cellCoords[i][1]; + int cellIdx = cy * gridWidth + cx; + int pos = writePtrs[cellIdx]++; + particleIndices[pos] = i; + } + } + + // Compute forces using spatial grid + #pragma omp parallel + { + int tid = omp_get_thread_num(); + auto& tlForces = threadLocalForces[tid]; + std::fill(tlForces.begin(), tlForces.begin() + N, 0); + + #pragma omp for schedule(dynamic, 32) + for (int cellIdx = 0; cellIdx < static_cast(gridCells.size()); cellIdx++) { + int cx = cellIdx % gridWidth; + int cy = cellIdx / gridWidth; + int start = cellOffsets[cellIdx]; + int end = cellOffsets[cellIdx + 1]; + + for (int dx = -1; dx <= 1; dx++) { + for (int dy = -1; dy <= 1; dy++) { + int nx = cx + dx; + int ny = cy + dy; + if (nx >= 0 && nx < gridWidth && ny >= 0 && ny < gridHeight) { + int neighborIdx = ny * gridWidth + nx; + int nStart = cellOffsets[neighborIdx]; + int nEnd = cellOffsets[neighborIdx + 1]; + + for (int i = start; i < end; i++) { + int piIdx = particleIndices[i]; + const Particle& pi = particles[piIdx]; + Vec2 forceSum{0.0f, 0.0f}; + + for (int j = nStart; j < nEnd; j++) { + int pjIdx = particleIndices[j]; + if (piIdx == pjIdx) continue; + + const Particle& pj = particles[pjIdx]; + Vec2 dir = pj.position - pi.position; + float dist2 = dir.x * dir.x + dir.y * dir.y; + + if (dist2 < cullRadius2 && dist2 > 1e-6f) { + forceSum += computeForce(pi, pj, cullRadius); + tlForces[pjIdx] = 1; + } + } + + #pragma omp atomic + forces[piIdx].x += forceSum.x; + #pragma omp atomic + forces[piIdx].y += forceSum.y; + } + } + } + } + } + + // Update particles + #pragma omp for + for (int i = 0; i < N; i++) { + if (tlForces[i]) { + newParticles[i] = updateParticle(particles[i], forces[i], params.deltaTime); + } else { + newParticles[i] = updateParticle(particles[i], Vec2{0.0f, 0.0f}, params.deltaTime); + } + } + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/deepseekreasoner_1.cpp b/research/solutions/nbody_simulation/random_100k/deepseekreasoner_1.cpp new file mode 100644 index 00000000..d0245ee9 --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/deepseekreasoner_1.cpp @@ -0,0 +1,286 @@ +#include +#include +#include +#include +#include +#include + +struct BoundingBox { + float min_x, max_x, min_y, max_y; + + BoundingBox() : min_x(1e9), max_x(-1e9), min_y(1e9), max_y(-1e9) {} + + void update(float x, float y) { + min_x = std::min(min_x, x); + max_x = std::max(max_x, x); + min_y = std::min(min_y, y); + max_y = std::max(max_y, y); + } + + void expand(float margin) { + min_x -= margin; + max_x += margin; + min_y -= margin; + max_y += margin; + } +}; + +struct QuadNode { + float center_x, center_y, half_width; + int start_idx, end_idx; + int children[4]; + float total_mass; + float com_x, com_y; + bool is_leaf; + + QuadNode() : start_idx(0), end_idx(0), total_mass(0), + com_x(0), com_y(0), is_leaf(true) { + children[0] = children[1] = children[2] = children[3] = -1; + } +}; + +class MySimulator : public Simulator { +private: 
+ int numThreads = 16; + std::vector quad_nodes; + std::vector particle_indices; + std::vector accumulated_forces; + std::vector cell_centers_x; + std::vector cell_centers_y; + std::vector cell_masses; + int grid_size; + float grid_cell_size; + float grid_inv_cell_size; + float domain_min_x, domain_min_y; + float domain_size; + BoundingBox current_bbox; + + static constexpr float THETA = 0.5f; // Barnes-Hut opening criterion + static constexpr float G = 0.01f; + + inline int get_grid_cell(float x, float y) const { + int cx = static_cast((x - domain_min_x) * grid_inv_cell_size); + int cy = static_cast((y - domain_min_y) * grid_inv_cell_size); + cx = std::max(0, std::min(cx, grid_size - 1)); + cy = std::max(0, std::min(cy, grid_size - 1)); + return cy * grid_size + cx; + } + + void build_quadtree(std::vector& particles, + std::vector& indices, int start, int end, + float center_x, float center_y, float half_width, int node_idx) { + if (end - start <= 32) { // Leaf node threshold + quad_nodes[node_idx].start_idx = start; + quad_nodes[node_idx].end_idx = end; + quad_nodes[node_idx].is_leaf = true; + + // Compute center of mass for leaf + float total_mass = 0; + float com_x = 0, com_y = 0; + for (int i = start; i < end; ++i) { + int pidx = indices[i]; + float mass = particles[pidx].mass; + total_mass += mass; + com_x += mass * particles[pidx].position.x; + com_y += mass * particles[pidx].position.y; + } + + if (total_mass > 0) { + quad_nodes[node_idx].total_mass = total_mass; + quad_nodes[node_idx].com_x = com_x / total_mass; + quad_nodes[node_idx].com_y = com_y / total_mass; + } + return; + } + + // Count particles in each quadrant + int counts[4] = {0, 0, 0, 0}; + float child_half = half_width * 0.5f; + float child_centers_x[4], child_centers_y[4]; + + child_centers_x[0] = center_x - child_half; + child_centers_x[1] = center_x + child_half; + child_centers_x[2] = center_x - child_half; + child_centers_x[3] = center_x + child_half; + + child_centers_y[0] = center_y - child_half; + child_centers_y[1] = center_y - child_half; + child_centers_y[2] = center_y + child_half; + child_centers_y[3] = center_y + child_half; + + std::vector child_starts(4, start); + for (int i = start; i < end; ++i) { + int pidx = indices[i]; + float px = particles[pidx].position.x; + float py = particles[pidx].position.y; + + int quadrant = 0; + if (px > center_x) quadrant |= 1; + if (py > center_y) quadrant |= 2; + counts[quadrant]++; + } + + for (int i = 1; i < 4; ++i) { + child_starts[i] = child_starts[i-1] + counts[i-1]; + } + + std::vector temp_indices(end - start); + std::copy(indices.begin() + start, indices.begin() + end, temp_indices.begin()); + + std::vector child_pos(4); + for (int i = 0; i < 4; ++i) child_pos[i] = child_starts[i]; + + for (int idx = 0; idx < end - start; ++idx) { + int pidx = temp_indices[idx]; + float px = particles[pidx].position.x; + float py = particles[pidx].position.y; + + int quadrant = 0; + if (px > center_x) quadrant |= 1; + if (py > center_y) quadrant |= 2; + + indices[child_pos[quadrant]++] = pidx; + } + + // Create child nodes + float total_mass = 0; + float com_x = 0, com_y = 0; + quad_nodes[node_idx].is_leaf = false; + + for (int i = 0; i < 4; ++i) { + if (counts[i] > 0) { + int child_idx = quad_nodes.size(); + quad_nodes[node_idx].children[i] = child_idx; + quad_nodes.emplace_back(); + + build_quadtree(particles, indices, + child_starts[i], child_starts[i] + counts[i], + child_centers_x[i], child_centers_y[i], + child_half, child_idx); + + total_mass += 
quad_nodes[child_idx].total_mass; + com_x += quad_nodes[child_idx].total_mass * quad_nodes[child_idx].com_x; + com_y += quad_nodes[child_idx].total_mass * quad_nodes[child_idx].com_y; + } + } + + if (total_mass > 0) { + quad_nodes[node_idx].total_mass = total_mass; + quad_nodes[node_idx].com_x = com_x / total_mass; + quad_nodes[node_idx].com_y = com_y / total_mass; + } + } + + void compute_force_from_node(const Particle& pi, int node_idx, + float cullRadius, Vec2& force_acc) const { + const QuadNode& node = quad_nodes[node_idx]; + + if (node.total_mass == 0) return; + + float dx = node.com_x - pi.position.x; + float dy = node.com_y - pi.position.y; + float dist_sq = dx*dx + dy*dy; + + if (dist_sq < 1e-6f) return; + + float dist = std::sqrt(dist_sq); + + if (node.is_leaf || (node.half_width * 2.0f / dist < THETA)) { + // Use center of mass approximation + if (dist > cullRadius) return; + + float inv_dist = 1.0f / dist; + dx *= inv_dist; + + if (dist < 0.1f) dist = 0.1f; + float force_mag = pi.mass * node.total_mass * G / (dist * dist); + + if (dist > cullRadius * 0.75f) { + float decay = 1.0f - (dist - cullRadius * 0.75f) / (cullRadius * 0.25f); + force_mag *= decay; + } + + force_acc.x += dx * force_mag; + force_acc.y += dy * inv_dist * force_mag; + } else { + // Recurse into children + for (int i = 0; i < 4; ++i) { + if (node.children[i] != -1) { + compute_force_from_node(pi, node.children[i], cullRadius, force_acc); + } + } + } + } + +public: + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(numThreads); + particle_indices.resize(numParticles); + accumulated_forces.resize(numParticles); + + // Initialize particle indices + #pragma omp parallel for + for (int i = 0; i < numParticles; ++i) { + particle_indices[i] = i; + } + + // Set up spatial grid for broad phase + grid_cell_size = params.cullRadius; + grid_inv_cell_size = 1.0f / grid_cell_size; + + // Assume particles are within reasonable bounds + domain_size = 100.0f; // Based on problem statement + domain_min_x = 0.0f; + domain_min_y = 0.0f; + grid_size = static_cast(domain_size / grid_cell_size) + 1; + + cell_centers_x.resize(grid_size * grid_size); + cell_centers_y.resize(grid_size * grid_size); + cell_masses.resize(grid_size * grid_size); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + int N = particles.size(); + + // Reset forces + #pragma omp parallel for + for (int i = 0; i < N; ++i) { + accumulated_forces[i] = Vec2{0.0f, 0.0f}; + } + + // 1. Build quadtree + quad_nodes.clear(); + quad_nodes.emplace_back(); + quad_nodes[0].center_x = 50.0f; // Center of domain + quad_nodes[0].center_y = 50.0f; + quad_nodes[0].half_width = 50.0f; // Half of domain size + + build_quadtree(particles, particle_indices, 0, N, + 50.0f, 50.0f, 50.0f, 0); + + // 2. Compute forces using quadtree with OpenMP + #pragma omp parallel for schedule(dynamic, 64) + for (int i = 0; i < N; ++i) { + const Particle& pi = particles[i]; + Vec2 force{0.0f, 0.0f}; + + // Compute force from quadtree + compute_force_from_node(pi, 0, params.cullRadius, force); + + // Store accumulated force + accumulated_forces[i] = force; + } + + // 3. 
Update particles + #pragma omp parallel for + for (int i = 0; i < N; ++i) { + newParticles[i] = updateParticle(particles[i], accumulated_forces[i], params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/deepseekreasoner_2.cpp b/research/solutions/nbody_simulation/random_100k/deepseekreasoner_2.cpp new file mode 100644 index 00000000..4c4d8903 --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/deepseekreasoner_2.cpp @@ -0,0 +1,201 @@ +#include "world.h" +#include +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads = 16; + float currentCullRadius; + float currentDeltaTime; + + // Grid-based spatial hashing structure + struct GridCell { + std::vector particleIndices; + float minX, maxX, minY, maxY; + }; + + int gridSize; + float cellSize; + std::vector grid; + std::vector> particleToCells; + std::vector cellOffsets; + std::vector cellParticles; + std::vector posX, posY, mass; + std::vector originalIds; + + void buildGrid(const std::vector& particles, float cullRadius) { + float worldSize = 100.0f; + gridSize = std::max(1, (int)std::ceil(worldSize / cullRadius)); + cellSize = worldSize / gridSize; + + grid.assign(gridSize * gridSize, GridCell()); + particleToCells.assign(particles.size(), std::vector()); + + // Initialize grid bounds + for (int i = 0; i < gridSize; ++i) { + for (int j = 0; j < gridSize; ++j) { + int idx = i * gridSize + j; + grid[idx].minX = i * cellSize; + grid[idx].maxX = (i + 1) * cellSize; + grid[idx].minY = j * cellSize; + grid[idx].maxY = (j + 1) * cellSize; + } + } + + // Assign particles to cells + for (size_t p = 0; p < particles.size(); ++p) { + int cellX = std::min(gridSize - 1, (int)(particles[p].position.x / cellSize)); + int cellY = std::min(gridSize - 1, (int)(particles[p].position.y / cellSize)); + int cellIdx = cellX * gridSize + cellY; + grid[cellIdx].particleIndices.push_back(p); + particleToCells[p].push_back(cellIdx); + + // Also add to neighboring cells for particles near boundaries + if (particles[p].position.x - cellX * cellSize < cullRadius && cellX > 0) { + int neighborIdx = (cellX - 1) * gridSize + cellY; + grid[neighborIdx].particleIndices.push_back(p); + particleToCells[p].push_back(neighborIdx); + } + if ((cellX + 1) * cellSize - particles[p].position.x < cullRadius && cellX < gridSize - 1) { + int neighborIdx = (cellX + 1) * gridSize + cellY; + grid[neighborIdx].particleIndices.push_back(p); + particleToCells[p].push_back(neighborIdx); + } + if (particles[p].position.y - cellY * cellSize < cullRadius && cellY > 0) { + int neighborIdx = cellX * gridSize + (cellY - 1); + grid[neighborIdx].particleIndices.push_back(p); + particleToCells[p].push_back(neighborIdx); + } + if ((cellY + 1) * cellSize - particles[p].position.y < cullRadius && cellY < gridSize - 1) { + int neighborIdx = cellX * gridSize + (cellY + 1); + grid[neighborIdx].particleIndices.push_back(p); + particleToCells[p].push_back(neighborIdx); + } + + // Remove duplicates from particleToCells[p] + std::sort(particleToCells[p].begin(), particleToCells[p].end()); + particleToCells[p].erase(std::unique(particleToCells[p].begin(), + particleToCells[p].end()), + particleToCells[p].end()); + } + + // Flatten grid for better memory access + cellOffsets.resize(gridSize * gridSize + 1); + int totalParticles = 0; + for (size_t i = 0; i < grid.size(); ++i) { + cellOffsets[i] = 
totalParticles; + totalParticles += grid[i].particleIndices.size(); + } + cellOffsets[grid.size()] = totalParticles; + + cellParticles.resize(totalParticles); + for (size_t i = 0; i < grid.size(); ++i) { + std::memcpy(&cellParticles[cellOffsets[i]], + grid[i].particleIndices.data(), + grid[i].particleIndices.size() * sizeof(int)); + } + + // Cache particle data for better memory locality + posX.resize(particles.size()); + posY.resize(particles.size()); + mass.resize(particles.size()); + originalIds.resize(particles.size()); + + #pragma omp parallel for schedule(static) + for (size_t i = 0; i < particles.size(); ++i) { + posX[i] = particles[i].position.x; + posY[i] = particles[i].position.y; + mass[i] = particles[i].mass; + originalIds[i] = particles[i].id; + } + } + + void updateCachedData(const std::vector& particles) { + #pragma omp parallel for schedule(static) + for (size_t i = 0; i < particles.size(); ++i) { + posX[i] = particles[i].position.x; + posY[i] = particles[i].position.y; + } + } + + Vec2 computeForceOptimized(int i, const std::vector& particles, float cullRadius) { + Vec2 force(0.0f, 0.0f); + float cullRadius2 = cullRadius * cullRadius; + float soft2 = 1e-2f; // Softening length squared + + const float pi_x = posX[i]; + const float pi_y = posY[i]; + const Particle& pi_particle = particles[i]; + + // Check all cells that particle i belongs to + for (int cellIdx : particleToCells[i]) { + int start = cellOffsets[cellIdx]; + int end = cellOffsets[cellIdx + 1]; + + for (int k = start; k < end; ++k) { + int j = cellParticles[k]; + if (j == i) continue; + + float dx = posX[j] - pi_x; + float dy = posY[j] - pi_y; + float dist2 = dx * dx + dy * dy; + + if (dist2 > cullRadius2) continue; + if (dist2 < 1e-6f) continue; + + // Vectorized force computation + float invDist = 1.0f / std::sqrt(dist2 + soft2); + float invDist3 = invDist * invDist * invDist; + float G = 0.01f; + float forceMag = G * pi_particle.mass * mass[j] * invDist3; + + if (dist2 > 0.5625f * cullRadius2) { // 0.75^2 + float dist = std::sqrt(dist2); + float decay = 1.0f - (dist - cullRadius * 0.75f) / (cullRadius * 0.25f); + forceMag *= decay; + } + + force.x += dx * forceMag; + force.y += dy * forceMag; + } + } + + return force; + } + +public: + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(numThreads); + currentCullRadius = params.cullRadius; + currentDeltaTime = params.deltaTime; + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + if (params.cullRadius != currentCullRadius) { + buildGrid(particles, params.cullRadius); + currentCullRadius = params.cullRadius; + } else { + updateCachedData(particles); + } + + currentDeltaTime = params.deltaTime; + size_t n = particles.size(); + newParticles.resize(n); + + #pragma omp parallel for schedule(dynamic, 64) + for (size_t i = 0; i < n; ++i) { + Vec2 force = computeForceOptimized(i, particles, params.cullRadius); + newParticles[i] = updateParticle(particles[i], force, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/deepseekreasoner_3.cpp b/research/solutions/nbody_simulation/random_100k/deepseekreasoner_3.cpp new file mode 100644 index 00000000..9b51f497 --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/deepseekreasoner_3.cpp @@ -0,0 +1,121 @@ +#include "world.h" +#include +#include +#include +#include +#include +#include + 
+class MySimulator : public Simulator { +private: + int numThreads = 16; + float cullRadius = 1.0f; + + struct GridCell { + std::vector particleIndices; + void clear() { particleIndices.clear(); } + void add(int idx) { particleIndices.push_back(idx); } + }; + + float gridSize; + int gridDim; + std::vector grid; + std::vector cellOffsets; + + void buildGrid(const std::vector& particles) { + for (auto& cell : grid) cell.clear(); + + #pragma omp parallel for schedule(static) + for (int i = 0; i < (int)particles.size(); i++) { + const auto& p = particles[i]; + int cx = std::clamp((int)(p.position.x / gridSize), 0, gridDim - 1); + int cy = std::clamp((int)(p.position.y / gridSize), 0, gridDim - 1); + int cellIdx = cy * gridDim + cx; + + #pragma omp critical + grid[cellIdx].add(i); + } + + cellOffsets[0] = 0; + for (int i = 1; i <= gridDim * gridDim; i++) { + cellOffsets[i] = cellOffsets[i-1] + grid[i-1].particleIndices.size(); + } + + std::vector flatIndices(cellOffsets.back()); + + #pragma omp parallel for schedule(static) + for (int cell = 0; cell < gridDim * gridDim; cell++) { + int offset = cellOffsets[cell]; + const auto& indices = grid[cell].particleIndices; + std::copy(indices.begin(), indices.end(), flatIndices.begin() + offset); + } + + for (int cell = 0; cell < gridDim * gridDim; cell++) { + grid[cell].particleIndices.clear(); + int start = cellOffsets[cell]; + int end = cellOffsets[cell + 1]; + grid[cell].particleIndices.assign(flatIndices.begin() + start, flatIndices.begin() + end); + } + } + +public: + void init(int numParticles, StepParameters params) override { + cullRadius = params.cullRadius; + gridSize = cullRadius; + gridDim = (int)std::ceil(100.0f / gridSize) + 1; + grid.resize(gridDim * gridDim); + cellOffsets.resize(gridDim * gridDim + 1); + omp_set_num_threads(numThreads); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + cullRadius = params.cullRadius; + + buildGrid(particles); + + float cullRadius2 = cullRadius * cullRadius; + int searchRange = (int)std::ceil(cullRadius / gridSize) + 1; + + #pragma omp parallel for schedule(dynamic, 32) + for (int i = 0; i < (int)particles.size(); i++) { + const auto& pi = particles[i]; + Vec2 force(0.0f, 0.0f); + + int cx = std::clamp((int)(pi.position.x / gridSize), 0, gridDim - 1); + int cy = std::clamp((int)(pi.position.y / gridSize), 0, gridDim - 1); + + for (int dy = -searchRange; dy <= searchRange; dy++) { + int cellY = cy + dy; + if (cellY < 0 || cellY >= gridDim) continue; + + for (int dx = -searchRange; dx <= searchRange; dx++) { + int cellX = cx + dx; + if (cellX < 0 || cellX >= gridDim) continue; + + int cellIdx = cellY * gridDim + cellX; + const auto& indices = grid[cellIdx].particleIndices; + + for (int j : indices) { + if (j == i) continue; + + const auto& pj = particles[j]; + Vec2 dir = pj.position - pi.position; + float dist2 = dir.x * dir.x + dir.y * dir.y; + + if (dist2 < 1e-6f || dist2 > cullRadius2) continue; + + force += computeForce(pi, pj, cullRadius); + } + } + } + + newParticles[i] = updateParticle(pi, force, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/deepseekreasoner_4.cpp b/research/solutions/nbody_simulation/random_100k/deepseekreasoner_4.cpp new file mode 100644 index 00000000..59a6c077 --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/deepseekreasoner_4.cpp @@ -0,0 +1,200 @@ 
+#include "world.h" +#include +#include +#include +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads = 16; + float cellSize; + float invCellSize; + float cullRadius; + float cullRadiusSq; + + struct Cell { + std::vector particles; + float minX, minY, maxX, maxY; + }; + + struct Grid { + int gridX, gridY; + std::vector cells; + + Grid(float worldSize, float cellSize) { + gridX = gridY = static_cast(std::ceil(worldSize / cellSize)); + cells.resize(gridX * gridY); + for (int y = 0; y < gridY; ++y) { + for (int x = 0; x < gridX; ++x) { + auto& cell = cells[y * gridX + x]; + cell.minX = x * cellSize; + cell.minY = y * cellSize; + cell.maxX = (x + 1) * cellSize; + cell.maxY = (y + 1) * cellSize; + } + } + } + + inline int getCellIdx(float x, float y) const { + int cx = static_cast(x * invCellSize); + int cy = static_cast(y * invCellSize); + cx = std::max(0, std::min(cx, gridX - 1)); + cy = std::max(0, std::min(cy, gridY - 1)); + return cy * gridX + cx; + } + + void clear() { + for (auto& cell : cells) { + cell.particles.clear(); + } + } + }; + + std::unique_ptr grid; + std::vector forces; + std::vector cellOffsets; + std::vector cellParticleIndices; + std::vector particleToCell; + + static constexpr float WORLD_SIZE = 100.0f; + static constexpr float G = 0.01f; + +public: + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(numThreads); + cullRadius = params.cullRadius; + cullRadiusSq = cullRadius * cullRadius; + cellSize = cullRadius; + invCellSize = 1.0f / cellSize; + grid = std::make_unique(WORLD_SIZE, cellSize); + forces.resize(numParticles); + particleToCell.resize(numParticles); + } + + void buildGrid(const std::vector& particles) { + grid->clear(); + + #pragma omp parallel + { + std::vector> localCells(grid->cells.size()); + + #pragma omp for schedule(static) + for (int i = 0; i < (int)particles.size(); ++i) { + const auto& p = particles[i]; + int cellIdx = grid->getCellIdx(p.position.x, p.position.y); + localCells[cellIdx].push_back(i); + particleToCell[i] = cellIdx; + } + + #pragma omp for schedule(static) + for (int i = 0; i < (int)grid->cells.size(); ++i) { + for (int t = 0; t < numThreads; ++t) { + grid->cells[i].particles.insert( + grid->cells[i].particles.end(), + localCells[i].begin(), + localCells[i].end() + ); + } + } + } + + cellOffsets.resize(grid->cells.size() + 1); + cellOffsets[0] = 0; + for (size_t i = 0; i < grid->cells.size(); ++i) { + cellOffsets[i + 1] = cellOffsets[i] + (int)grid->cells[i].particles.size(); + } + + cellParticleIndices.resize(particles.size()); + #pragma omp parallel for schedule(static) + for (int i = 0; i < (int)grid->cells.size(); ++i) { + const auto& cell = grid->cells[i]; + int base = cellOffsets[i]; + for (int j = 0; j < (int)cell.particles.size(); ++j) { + cellParticleIndices[base + j] = cell.particles[j]; + } + } + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + cullRadius = params.cullRadius; + cullRadiusSq = cullRadius * cullRadius; + + buildGrid(particles); + + #pragma omp parallel + { + #pragma omp for schedule(dynamic, 64) + for (int cellIdx = 0; cellIdx < (int)grid->cells.size(); ++cellIdx) { + int start = cellOffsets[cellIdx]; + int end = cellOffsets[cellIdx + 1]; + + int cx = cellIdx % grid->gridX; + int cy = cellIdx / grid->gridX; + + for (int i = start; i < end; ++i) { + int pi_idx = cellParticleIndices[i]; + const Particle& pi = particles[pi_idx]; + Vec2 force(0.0f, 
0.0f); + + for (int dy = -1; dy <= 1; ++dy) { + for (int dx = -1; dx <= 1; ++dx) { + int nx = cx + dx; + int ny = cy + dy; + + if (nx >= 0 && nx < grid->gridX && ny >= 0 && ny < grid->gridY) { + int ncellIdx = ny * grid->gridX + nx; + int nstart = cellOffsets[ncellIdx]; + int nend = cellOffsets[ncellIdx + 1]; + + for (int j = nstart; j < nend; ++j) { + int pj_idx = cellParticleIndices[j]; + if (pi_idx == pj_idx) continue; + + const Particle& pj = particles[pj_idx]; + float dx_pos = pj.position.x - pi.position.x; + float dy_pos = pj.position.y - pi.position.y; + float distSq = dx_pos * dx_pos + dy_pos * dy_pos; + + if (distSq < cullRadiusSq && distSq > 1e-6f) { + float dist = std::sqrt(distSq); + if (dist < 1e-3f) continue; + + float invDist = 1.0f / dist; + dx_pos *= invDist; + dy_pos *= invDist; + + float forceMag = pi.mass * pj.mass * G / (distSq); + if (dist > cullRadius * 0.75f) { + float decay = 1.0f - (dist - cullRadius * 0.75f) / (cullRadius * 0.25f); + forceMag *= decay; + } + + force.x += dx_pos * forceMag; + force.y += dy_pos * forceMag; + } + } + } + } + } + + forces[pi_idx] = force; + } + } + + #pragma omp for schedule(static) + for (int i = 0; i < (int)particles.size(); ++i) { + newParticles[i] = updateParticle(particles[i], forces[i], params.deltaTime); + } + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gemini2.5pro.cpp b/research/solutions/nbody_simulation/random_100k/gemini2.5pro.cpp new file mode 100644 index 00000000..a952ee4e --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gemini2.5pro.cpp @@ -0,0 +1,126 @@ +#include "world.h" +#include +#include +#include +#include +#include + +#ifdef __GNUC__ +#include +#define PARALLEL_SORT __gnu_parallel::sort +#else +#define PARALLEL_SORT std::sort +#endif + +class MySimulator : public Simulator { +private: + int numThreads = 16; + std::vector> particle_cell_pairs; + std::vector cell_starts; + +public: + void init(int numParticles, StepParameters params) override { + int max_threads = omp_get_max_threads(); + if (numThreads > max_threads) { + numThreads = max_threads; + } + omp_set_num_threads(numThreads); + particle_cell_pairs.reserve(numParticles); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + const int N = particles.size(); + if (N == 0) return; + + // 1. Find bounding box of particles in parallel + float min_x = particles[0].position.x; + float min_y = particles[0].position.y; + float max_x = particles[0].position.x; + float max_y = particles[0].position.y; + + #pragma omp parallel for reduction(min:min_x, min_y) reduction(max:max_x, max_y) + for (int i = 1; i < N; ++i) { + min_x = std::min(min_x, particles[i].position.x); + min_y = std::min(min_y, particles[i].position.y); + max_x = std::max(max_x, particles[i].position.x); + max_y = std::max(max_y, particles[i].position.y); + } + + // 2. Setup grid parameters + const float cellSize = params.cullRadius; + const float invCellSize = 1.0f / cellSize; + + const int gridWidth = static_cast((max_x - min_x) * invCellSize) + 1; + const int gridHeight = static_cast((max_y - min_y) * invCellSize) + 1; + const int numCells = gridWidth * gridHeight; + + // 3. 
Assign particles to grid cells + particle_cell_pairs.resize(N); + #pragma omp parallel for + for (int i = 0; i < N; ++i) { + const auto& p = particles[i]; + int cx = static_cast((p.position.x - min_x) * invCellSize); + int cy = static_cast((p.position.y - min_y) * invCellSize); + // Clamp to prevent out-of-bounds due to floating point inaccuracies + cx = std::min(gridWidth - 1, cx); + cy = std::min(gridHeight - 1, cy); + particle_cell_pairs[i] = {cy * gridWidth + cx, i}; + } + + // 4. Sort particles based on their cell index to group them + PARALLEL_SORT(particle_cell_pairs.begin(), particle_cell_pairs.end()); + + // 5. Find the start index of each cell in the sorted list + cell_starts.assign(numCells + 1, N); + if (N > 0) { + int last_cell = -1; + for (int i = 0; i < N; ++i) { + int current_cell = particle_cell_pairs[i].first; + if (current_cell != last_cell) { + for (int c = last_cell + 1; c <= current_cell; ++c) { + cell_starts[c] = i; + } + last_cell = current_cell; + } + } + } else { + cell_starts.assign(numCells + 1, 0); + } + + // 6. Compute forces in parallel + #pragma omp parallel for schedule(dynamic, 64) + for (int i = 0; i < N; ++i) { + const auto& pi = particles[i]; + Vec2 total_force = {0.0f, 0.0f}; + + int cx = static_cast((pi.position.x - min_x) * invCellSize); + int cy = static_cast((pi.position.y - min_y) * invCellSize); + cx = std::min(gridWidth - 1, cx); + cy = std::min(gridHeight - 1, cy); + + for (int ny = cy - 1; ny <= cy + 1; ++ny) { + for (int nx = cx - 1; nx <= cx + 1; ++nx) { + if (nx >= 0 && nx < gridWidth && ny >= 0 && ny < gridHeight) { + const int cell_id = ny * gridWidth + nx; + const int start_idx = cell_starts[cell_id]; + const int end_idx = cell_starts[cell_id + 1]; + + for (int k = start_idx; k < end_idx; ++k) { + const int j_idx = particle_cell_pairs[k].second; + if (i != j_idx) { + total_force += computeForce(pi, particles[j_idx], params.cullRadius); + } + } + } + } + } + newParticles[i] = updateParticle(pi, total_force, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gemini2.5pro_1.cpp b/research/solutions/nbody_simulation/random_100k/gemini2.5pro_1.cpp new file mode 100644 index 00000000..5e821c42 --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gemini2.5pro_1.cpp @@ -0,0 +1,135 @@ +#include "world.h" +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + int num_threads_; + std::vector> particle_cells_; + std::vector cell_starts_; + +public: + void init(int numParticles, StepParameters params) override { + num_threads_ = omp_get_max_threads(); + omp_set_num_threads(num_threads_); + + if (particle_cells_.size() < (size_t)numParticles) { + particle_cells_.resize(numParticles); + } + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + const int n = particles.size(); + if (n == 0) { + return; + } + + const float cullRadius = params.cullRadius; + + // Step 1: Compute bounding box of particles + float min_x = particles[0].position.x, max_x = particles[0].position.x; + float min_y = particles[0].position.y, max_y = particles[0].position.y; + + #pragma omp parallel for reduction(min:min_x, min_y) reduction(max:max_x, max_y) + for (int i = 1; i < n; ++i) { + min_x = std::min(min_x, particles[i].position.x); + min_y = std::min(min_y, particles[i].position.y); + max_x = std::max(max_x, 
particles[i].position.x); + max_y = std::max(max_y, particles[i].position.y); + } + + // Add a small buffer to handle particles near the edge + const float buffer = 1.0f; + min_x -= buffer; + min_y -= buffer; + max_x += buffer; + max_y += buffer; + + // Step 2: Setup grid + const float cell_size = cullRadius; + const int grid_dim_x = static_cast((max_x - min_x) / cell_size) + 1; + const int grid_dim_y = static_cast((max_y - min_y) / cell_size) + 1; + const int num_cells = grid_dim_x * grid_dim_y; + + // Step 3: Assign particles to cells + #pragma omp parallel for + for (int i = 0; i < n; ++i) { + int ix = static_cast((particles[i].position.x - min_x) / cell_size); + int iy = static_cast((particles[i].position.y - min_y) / cell_size); + + ix = std::max(0, std::min(grid_dim_x - 1, ix)); + iy = std::max(0, std::min(grid_dim_y - 1, iy)); + + particle_cells_[i] = {iy * grid_dim_x + ix, i}; + } + + // Step 4: Sort particles by cell index. + std::sort(particle_cells_.begin(), particle_cells_.begin() + n); + + // Step 5: Build cell start indices (sequentially) + if (cell_starts_.size() < (size_t)(num_cells + 1)) { + cell_starts_.resize(num_cells + 1); + } + + cell_starts_.assign(num_cells + 1, n); + + if (n > 0) { + cell_starts_[particle_cells_[0].first] = 0; + for (int i = 1; i < n; i++) { + if (particle_cells_[i].first > particle_cells_[i-1].first) { + cell_starts_[particle_cells_[i].first] = i; + } + } + + for (int c = num_cells - 1; c >= 0; c--) { + if (cell_starts_[c] == n) { + cell_starts_[c] = cell_starts_[c + 1]; + } + } + } else { + std::fill(cell_starts_.begin(), cell_starts_.begin() + num_cells + 1, 0); + } + + // Step 6: Compute forces using the grid + #pragma omp parallel for schedule(dynamic, 64) + for (int i = 0; i < n; ++i) { + const Particle& p_i = particles[i]; + Vec2 total_force = {0.0f, 0.0f}; + + int ix = static_cast((p_i.position.x - min_x) / cell_size); + int iy = static_cast((p_i.position.y - min_y) / cell_size); + + ix = std::max(0, std::min(grid_dim_x - 1, ix)); + iy = std::max(0, std::min(grid_dim_y - 1, iy)); + + for (int neighbor_iy = iy - 1; neighbor_iy <= iy + 1; ++neighbor_iy) { + for (int neighbor_ix = ix - 1; neighbor_ix <= ix + 1; ++neighbor_ix) { + if (neighbor_ix >= 0 && neighbor_ix < grid_dim_x && + neighbor_iy >= 0 && neighbor_iy < grid_dim_y) { + + int cell_hash = neighbor_iy * grid_dim_x + neighbor_ix; + int start_idx = cell_starts_[cell_hash]; + int end_idx = cell_starts_[cell_hash + 1]; + + for (int k = start_idx; k < end_idx; ++k) { + int j_idx = particle_cells_[k].second; + if (i == j_idx) continue; + + total_force += computeForce(p_i, particles[j_idx], cullRadius); + } + } + } + } + newParticles[i] = updateParticle(p_i, total_force, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gemini2.5pro_2.cpp b/research/solutions/nbody_simulation/random_100k/gemini2.5pro_2.cpp new file mode 100644 index 00000000..8e78dedd --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gemini2.5pro_2.cpp @@ -0,0 +1,118 @@ +#include "world.h" +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + // Use all available vCPUs on the c7i.4xlarge instance + int numThreads = 16; + + // Persistent state for memory reuse across simulation steps + std::vector> grid; + std::vector particle_to_cell_map; + +public: + void init(int numParticles, StepParameters params) override { + 
omp_set_num_threads(numThreads); + if (numParticles > 0) { + // Pre-allocate to avoid reallocations during the simulation steps + particle_to_cell_map.resize(numParticles); + } + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + const int N = particles.size(); + if (N == 0) { + return; + } + + // 1. Determine simulation domain and grid parameters. + // This is done dynamically each step to adapt to particle movement. + float minX = particles[0].position.x, maxX = minX; + float minY = particles[0].position.y, maxY = minY; + + #pragma omp parallel for reduction(min:minX, minY) reduction(max:maxX, maxY) + for (int i = 1; i < N; ++i) { + minX = std::min(minX, particles[i].position.x); + maxX = std::max(maxX, particles[i].position.x); + minY = std::min(minY, particles[i].position.y); + maxY = std::max(maxY, particles[i].position.y); + } + + // Add a small epsilon to max bounds to handle particles exactly on an edge + // and prevent out-of-bounds cell indices due to floating point inaccuracies. + maxX += 1e-5f; + maxY += 1e-5f; + + const float cellSize = params.cullRadius; + if (cellSize <= 1e-6f) return; // Avoid division by zero or very small numbers + + const int gridDimX = static_cast((maxX - minX) / cellSize) + 1; + const int gridDimY = static_cast((maxY - minY) / cellSize) + 1; + const int numCells = gridDimX * gridDimY; + + // 2. Populate the grid acceleration structure. + // This part is sequential but fast enough (O(N)) compared to force calculation. + grid.assign(numCells, std::vector()); + + // To avoid reallocation in push_back, we count items per cell first and reserve memory. + std::vector cell_counts(numCells, 0); + for (int i = 0; i < N; ++i) { + int cx = static_cast((particles[i].position.x - minX) / cellSize); + int cy = static_cast((particles[i].position.y - minY) / cellSize); + + cx = std::max(0, std::min(cx, gridDimX - 1)); + cy = std::max(0, std::min(cy, gridDimY - 1)); + + int cell_idx = cy * gridDimX + cx; + particle_to_cell_map[i] = cell_idx; + cell_counts[cell_idx]++; + } + + for (int i = 0; i < numCells; ++i) { + grid[i].reserve(cell_counts[i]); + } + + for (int i = 0; i < N; ++i) { + grid[particle_to_cell_map[i]].push_back(i); + } + + // 3. Compute forces and update particles in parallel. 
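The binning above keeps a separate per-cell index vector. Several of the sibling solutions in this diff instead flatten the grid into a CSR-style layout (one offsets array plus one flat index array), which avoids per-cell allocations and keeps each cell's members contiguous. A minimal, illustrative sketch of that layout built with a plain counting sort; the names here are hypothetical and not taken from any of these files:

```
#include <vector>

// Hypothetical helper, for illustration only: given cellOf[i] = cell index of
// particle i, build offsets/indices so that the members of cell c are
// indices[offsets[c] .. offsets[c+1]) in one contiguous array.
void buildCellCSR(const std::vector<int>& cellOf, int numCells,
                  std::vector<int>& offsets, std::vector<int>& indices) {
    offsets.assign(numCells + 1, 0);
    for (int c : cellOf) ++offsets[c + 1];                           // 1. histogram
    for (int c = 0; c < numCells; ++c) offsets[c + 1] += offsets[c]; // 2. prefix sum
    indices.resize(cellOf.size());
    std::vector<int> cursor(offsets.begin(), offsets.end() - 1);     // per-cell write cursors
    for (int i = 0; i < (int)cellOf.size(); ++i)
        indices[cursor[cellOf[i]]++] = i;                            // 3. scatter
}
```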
+ #pragma omp parallel for schedule(dynamic, 64) + for (int i = 0; i < N; i++) { + const Particle& pi = particles[i]; + Vec2 totalForce = {0.0f, 0.0f}; + + const int cell_idx_orig = particle_to_cell_map[i]; + const int cx_orig = cell_idx_orig % gridDimX; + const int cy_orig = cell_idx_orig / gridDimX; + + for (int dy = -1; dy <= 1; ++dy) { + for (int dx = -1; dx <= 1; ++dx) { + int cx = cx_orig + dx; + int cy = cy_orig + dy; + + if (cx >= 0 && cx < gridDimX && cy >= 0 && cy < gridDimY) { + int neighbor_cell_idx = cy * gridDimX + cx; + for (int j_idx : grid[neighbor_cell_idx]) { + if (i == j_idx) continue; + + // computeForce internally handles the cullRadius check + totalForce += computeForce(pi, particles[j_idx], params.cullRadius); + } + } + } + } + newParticles[i] = updateParticle(pi, totalForce, params.deltaTime); + } + } +}; + +// Factory function - must be implemented +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gemini2.5pro_3.cpp b/research/solutions/nbody_simulation/random_100k/gemini2.5pro_3.cpp new file mode 100644 index 00000000..4a59c2ec --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gemini2.5pro_3.cpp @@ -0,0 +1,169 @@ +#include "world.h" +#include +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads; + + // Buffers reused across steps to avoid reallocation + std::vector particle_cell_ids; + std::vector cell_starts; + std::vector particle_indices_in_cells; + + // Raw pointer for atomic counters as std::vector is problematic + std::atomic* atomic_cell_counters = nullptr; + size_t atomic_counters_capacity = 0; + +public: + MySimulator() { + // AWS c7i.4xlarge has 16 vCPUs. Using all of them provides good performance. + numThreads = 16; + } + + ~MySimulator() override { + delete[] atomic_cell_counters; + } + + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(numThreads); + + // Pre-allocate buffers to the number of particles. + if (particle_cell_ids.size() < (size_t)numParticles) { + particle_cell_ids.resize(numParticles); + particle_indices_in_cells.resize(numParticles); + } + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + + const int numParticles = particles.size(); + if (numParticles == 0) return; + + // 1. Determine bounding box of particles in parallel + float min_x = FLT_MAX, min_y = FLT_MAX; + float max_x = -FLT_MAX, max_y = -FLT_MAX; + + #pragma omp parallel for reduction(min:min_x, min_y) reduction(max:max_x, max_y) + for (int i = 0; i < numParticles; ++i) { + min_x = std::min(min_x, particles[i].position.x); + min_y = std::min(min_y, particles[i].position.y); + max_x = std::max(max_x, particles[i].position.x); + max_y = std::max(max_y, particles[i].position.y); + } + Vec2 worldMin = {min_x, min_y}; + Vec2 worldMax = {max_x, max_y}; + + // Add a margin to handle particles near the edge of the grid + worldMin -= Vec2{params.cullRadius, params.cullRadius}; + worldMax += Vec2{params.cullRadius, params.cullRadius}; + + // 2. Setup grid properties + // A smaller cell size is more optimal, balancing the number of cells to check + // against the number of particles per cell. A value of 5.0f was found to be effective. 
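The trade-off described in the comment above can be made concrete with a little arithmetic: with cell size s and cull radius R, each particle scans a square block of (2*ceil(R/s)+1)^2 cells, i.e. a square of side (2*ceil(R/s)+1)*s, while the ideal search region is the circle of area pi*R^2. A small standalone sketch, illustrative only; the 25-unit cull radius is the value quoted in a sibling solution's comments, not read from this file:

```
#include <cmath>
#include <cstdio>

// Area actually scanned per particle when checking the (2*ceil(R/s)+1)^2
// block of cells of size s around it.
static double scannedArea(double s, double R) {
    int k = (int)std::ceil(R / s);
    double side = (2 * k + 1) * s;
    return side * side;
}

int main() {
    const double PI = 3.14159265358979323846;
    const double R = 25.0;  // cull radius quoted elsewhere in this directory
    std::printf("s = R:   %.0f\ns = R/5: %.0f\nideal:   %.0f\n",
                scannedArea(R, R), scannedArea(R / 5.0, R), PI * R * R);
    // Roughly 5625 vs 3025 vs 1963: finer cells approximate the circular cull
    // region better, at the cost of touching more cells per particle.
    return 0;
}
```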
+ const float cellSize = 5.0f; + int gridDimX = static_cast(ceil((worldMax.x - worldMin.x) / cellSize)); + int gridDimY = static_cast(ceil((worldMax.y - worldMin.y) / cellSize)); + if (gridDimX <= 0) gridDimX = 1; + if (gridDimY <= 0) gridDimY = 1; + const int numCells = gridDimX * gridDimY; + + // 3. Build grid in parallel + // 3a. Count particles per cell using per-thread local histograms + cell_starts.assign(numCells + 1, 0); + + #pragma omp parallel + { + std::vector local_counts(numCells, 0); + #pragma omp for nowait + for (int i = 0; i < numParticles; ++i) { + const auto& p = particles[i]; + int cx = static_cast((p.position.x - worldMin.x) / cellSize); + int cy = static_cast((p.position.y - worldMin.y) / cellSize); + cx = std::max(0, std::min(cx, gridDimX - 1)); + cy = std::max(0, std::min(cy, gridDimY - 1)); + int cellId = cx + cy * gridDimX; + particle_cell_ids[i] = cellId; + local_counts[cellId]++; + } + + // Reduce local counts into the global count array (cell_starts is reused for this) + #pragma omp critical + { + for (int i = 0; i < numCells; ++i) { + cell_starts[i + 1] += local_counts[i]; + } + } + } + + // 3b. Serial prefix sum to get cell start indices in the final sorted array + for (int i = 0; i < numCells; ++i) { + cell_starts[i + 1] += cell_starts[i]; + } + + // 3c. Place particle indices into sorted array using atomic counters + if (atomic_counters_capacity < (size_t)numCells) { + delete[] atomic_cell_counters; + atomic_cell_counters = new std::atomic[numCells]; + atomic_counters_capacity = numCells; + } + + #pragma omp parallel for + for (int i = 0; i < numCells; ++i) { + atomic_cell_counters[i].store(0, std::memory_order_relaxed); + } + + #pragma omp parallel for + for (int i = 0; i < numParticles; ++i) { + int cellId = particle_cell_ids[i]; + int local_idx = atomic_cell_counters[cellId].fetch_add(1, std::memory_order_relaxed); + int global_idx = cell_starts[cellId] + local_idx; + particle_indices_in_cells[global_idx] = i; + } + + // 4. 
Compute forces using the grid + const int search_radius_in_cells = static_cast(ceil(params.cullRadius / cellSize)); + + #pragma omp parallel for schedule(dynamic, 64) + for (int i = 0; i < numParticles; ++i) { + const auto& pi = particles[i]; + Vec2 total_force = {0.0f, 0.0f}; + + int cx = static_cast((pi.position.x - worldMin.x) / cellSize); + int cy = static_cast((pi.position.y - worldMin.y) / cellSize); + + for (int dy = -search_radius_in_cells; dy <= search_radius_in_cells; ++dy) { + for (int dx = -search_radius_in_cells; dx <= search_radius_in_cells; ++dx) { + int neighbor_cx = cx + dx; + int neighbor_cy = cy + dy; + + if (neighbor_cx >= 0 && neighbor_cx < gridDimX && + neighbor_cy >= 0 && neighbor_cy < gridDimY) { + + int cellId = neighbor_cx + neighbor_cy * gridDimX; + int start = cell_starts[cellId]; + int end = cell_starts[cellId + 1]; + + for (int k = start; k < end; ++k) { + int j = particle_indices_in_cells[k]; + if (i == j) continue; + const auto& pj = particles[j]; + total_force += computeForce(pi, pj, params.cullRadius); + } + } + } + } + newParticles[i] = updateParticle(pi, total_force, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gemini2.5pro_4.cpp b/research/solutions/nbody_simulation/random_100k/gemini2.5pro_4.cpp new file mode 100644 index 00000000..75377c0a --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gemini2.5pro_4.cpp @@ -0,0 +1,145 @@ +#include "world.h" +#include +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads = 16; + + // Grid properties recalculated each step + Vec2 world_min; + float cell_size; + int grid_dim_x; + int grid_dim_y; + + // Persistent buffers to avoid reallocation + std::vector> particle_cell_map; + std::vector grid_particles; + std::vector grid_cells; + +public: + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(numThreads); + // Pre-allocate memory to avoid reallocations in simulateStep + if (particle_cell_map.capacity() < (size_t)numParticles) { + particle_cell_map.reserve(numParticles); + grid_particles.reserve(numParticles); + } + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + const int n = particles.size(); + if (n == 0) return; + + // 1. Find world bounds in parallel using thread-local reduction + world_min = {FLT_MAX, FLT_MAX}; + Vec2 world_max = {-FLT_MAX, -FLT_MAX}; + + #pragma omp parallel + { + Vec2 local_min = {FLT_MAX, FLT_MAX}; + Vec2 local_max = {-FLT_MAX, -FLT_MAX}; + #pragma omp for nowait + for (int i = 0; i < n; ++i) { + const auto& p_pos = particles[i].position; + local_min.x = std::min(local_min.x, p_pos.x); + local_min.y = std::min(local_min.y, p_pos.y); + local_max.x = std::max(local_max.x, p_pos.x); + local_max.y = std::max(local_max.y, p_pos.y); + } + + #pragma omp critical + { + world_min.x = std::min(world_min.x, local_min.x); + world_min.y = std::min(world_min.y, local_min.y); + world_max.x = std::max(world_max.x, local_max.x); + world_max.y = std::max(world_max.y, local_max.y); + } + } + + // 2. Setup grid properties + cell_size = params.cullRadius; + grid_dim_x = static_cast((world_max.x - world_min.x) / cell_size) + 1; + grid_dim_y = static_cast((world_max.y - world_min.y) / cell_size) + 1; + const int num_cells = grid_dim_x * grid_dim_y; + + // 3. 
Bin particles into cells in parallel + particle_cell_map.resize(n); + #pragma omp parallel for + for (int i = 0; i < n; ++i) { + int cx = static_cast((particles[i].position.x - world_min.x) / cell_size); + int cy = static_cast((particles[i].position.y - world_min.y) / cell_size); + cx = std::max(0, std::min(cx, grid_dim_x - 1)); + cy = std::max(0, std::min(cy, grid_dim_y - 1)); + particle_cell_map[i] = {cy * grid_dim_x + cx, i}; + } + + // 4. Sort particles by cell index to group them + std::sort(particle_cell_map.begin(), particle_cell_map.end()); + + // 5. Build the grid data structure (cell start indices and sorted particle list) + grid_particles.resize(n); + grid_cells.assign(num_cells + 1, 0); + + int last_cell_idx = -1; + for (int i = 0; i < n; ++i) { + int particle_idx = particle_cell_map[i].second; + int cell_idx = particle_cell_map[i].first; + grid_particles[i] = particle_idx; + + if (cell_idx != last_cell_idx) { + for (int c = last_cell_idx + 1; c <= cell_idx; ++c) { + grid_cells[c] = i; + } + last_cell_idx = cell_idx; + } + } + for (int c = last_cell_idx + 1; c <= num_cells; ++c) { + grid_cells[c] = n; + } + + // 6. Compute forces in parallel using the spatial grid + #pragma omp parallel for schedule(dynamic, 16) + for (int i = 0; i < n; ++i) { + const Particle& pi = particles[i]; + Vec2 total_force = {0.0f, 0.0f}; + + int cx = static_cast((pi.position.x - world_min.x) / cell_size); + int cy = static_cast((pi.position.y - world_min.y) / cell_size); + + // Iterate over the 3x3 neighborhood of cells + for (int ny = cy - 1; ny <= cy + 1; ++ny) { + for (int nx = cx - 1; nx <= cx + 1; ++nx) { + if (nx < 0 || nx >= grid_dim_x || ny < 0 || ny >= grid_dim_y) { + continue; + } + + int cell_idx = ny * grid_dim_x + nx; + int start = grid_cells[cell_idx]; + int end = grid_cells[cell_idx + 1]; + + for (int j = start; j < end; ++j) { + int pj_idx = grid_particles[j]; + if (pj_idx == i) continue; + + const Particle& pj = particles[pj_idx]; + if ((pi.position - pj.position).length2() < params.cullRadius * params.cullRadius) { + total_force += computeForce(pi, pj, params.cullRadius); + } + } + } + } + newParticles[i] = updateParticle(pi, total_force, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gemini3pro.cpp b/research/solutions/nbody_simulation/random_100k/gemini3pro.cpp new file mode 100644 index 00000000..2d2d6d05 --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gemini3pro.cpp @@ -0,0 +1,171 @@ +#include "world.h" +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads = 16; + + // Struct to store sort keys and mapping to original particle index + struct SortEntry { + uint32_t cellHash; + int particleId; + + // Sort primarily by hash to group particles in the same grid cell + // Secondarily by ID for stability + bool operator<(const SortEntry& other) const { + if (cellHash != other.cellHash) return cellHash < other.cellHash; + return particleId < other.particleId; + } + }; + + // Persistent buffers to minimize allocation overhead across steps + std::vector entries; + std::vector sortedParticles; + std::vector cellStart; + std::vector cellEnd; + + // Grid settings + // 2^18 = 262144 cells, sufficient for 100k particles to keep load factor low + // while fitting in cache. 
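As a quick sanity check on the sizing comment above: 2^18 buckets for the 100k-particle workload gives a load factor of roughly 0.38, and the power-of-two size is what allows the hash to be reduced with a bitwise AND instead of a modulo. A tiny, illustrative calculation:

```
#include <cstdio>

int main() {
    constexpr unsigned kBuckets   = 1u << 18;     // 262144, matching HASH_SIZE above
    constexpr unsigned kParticles = 100000;       // the random_100k workload
    constexpr unsigned kMask      = kBuckets - 1; // power of two, so 'h & kMask' replaces 'h % kBuckets'
    std::printf("load factor ~= %.2f, mask = 0x%X\n",
                (double)kParticles / kBuckets, kMask);  // ~0.38
    return 0;
}
```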
+ static constexpr int HASH_BITS = 18; + static constexpr int HASH_SIZE = 1 << HASH_BITS; + static constexpr int HASH_MASK = HASH_SIZE - 1; + + // Spatial hash function + // Maps 2D grid coordinates to a hash bucket index + inline uint32_t getHash(int cx, int cy) { + // Large primes to scramble coordinates and reduce collisions + uint32_t x = (uint32_t)cx * 73856093; + uint32_t y = (uint32_t)cy * 19349663; + return (x ^ y) & HASH_MASK; + } + +public: + void init(int numParticles, StepParameters params) override { + // Configure OpenMP threads + numThreads = omp_get_max_threads(); + if (numThreads < 1) numThreads = 1; + omp_set_num_threads(numThreads); + + // Pre-allocate buffers based on particle count + if (entries.size() != (size_t)numParticles) { + entries.resize(numParticles); + sortedParticles.resize(numParticles); + } + + if (cellStart.size() != (size_t)HASH_SIZE) { + cellStart.resize(HASH_SIZE); + cellEnd.resize(HASH_SIZE); + } + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + + int N = (int)particles.size(); + float cullRadius = params.cullRadius; + // Avoid division by zero + if (cullRadius < 1e-5f) cullRadius = 1e-5f; + + float invCellSize = 1.0f / cullRadius; + float cullRadiusSq = cullRadius * cullRadius; + + // 1. Compute Spatial Hash for each particle (Parallel) + // This maps each particle to a grid cell + #pragma omp parallel for schedule(static) + for (int i = 0; i < N; i++) { + // Discretize position to grid coordinates + // Using std::floor handles negative coordinates correctly + int cx = (int)std::floor(particles[i].position.x * invCellSize); + int cy = (int)std::floor(particles[i].position.y * invCellSize); + + entries[i].cellHash = getHash(cx, cy); + entries[i].particleId = i; + } + + // 2. Sort particles based on spatial hash (Serial) + // This groups spatially close particles together in the array. + // Although serial, std::sort is extremely fast for 100k integers/structs (< 5ms). + // The benefit of linear memory access in the force loop outweighs the sort cost. + std::sort(entries.begin(), entries.end()); + + // 3. Reset Grid Index (Parallel) + #pragma omp parallel for schedule(static) + for (int i = 0; i < HASH_SIZE; i++) { + cellStart[i] = -1; + } + + // 4. Build Grid Index (Serial) + // Scan the sorted entries to find start/end indices of each hash bucket + for (int i = 0; i < N; i++) { + uint32_t h = entries[i].cellHash; + if (cellStart[h] == -1) cellStart[h] = i; + cellEnd[h] = i + 1; + } + + // 5. Reorder Particles into contiguous memory (Parallel) + // This step is crucial for cache locality. Reading 'particles' is random access, + // but the subsequent force loop will read 'sortedParticles' sequentially. + #pragma omp parallel for schedule(static) + for (int i = 0; i < N; i++) { + sortedParticles[i] = particles[entries[i].particleId]; + } + + // 6. Compute Forces (Parallel) + // Using dynamic schedule to balance load, as some areas may be denser. 
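Steps 2 and 4 below are an instance of a common pattern: sort particle references by their cell key, then sweep the sorted array once to record each key's [start, end) range, so a cell's members end up as one contiguous, cache-friendly span. A distilled standalone sketch of just that pattern, with illustrative names rather than this file's members:

```
#include <algorithm>
#include <cstdint>
#include <vector>

struct Entry { uint32_t key; int id; };   // key = cell hash, id = particle index

// After this call, the members of bucket k are entries[start[k] .. end[k]);
// start[k] == -1 marks an empty bucket.
void buildBucketRanges(std::vector<Entry>& entries, int numBuckets,
                       std::vector<int>& start, std::vector<int>& end) {
    std::sort(entries.begin(), entries.end(),
              [](const Entry& a, const Entry& b) { return a.key < b.key; });
    start.assign(numBuckets, -1);
    end.assign(numBuckets, -1);
    for (int i = 0; i < (int)entries.size(); ++i) {
        uint32_t k = entries[i].key;
        if (start[k] == -1) start[k] = i;
        end[k] = i + 1;
    }
}
```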
+ #pragma omp parallel for schedule(dynamic, 64) + for (int i = 0; i < N; i++) { + const Particle& pi = sortedParticles[i]; + Vec2 force = {0.0f, 0.0f}; + + // Recompute grid coords for current particle + int cx = (int)std::floor(pi.position.x * invCellSize); + int cy = (int)std::floor(pi.position.y * invCellSize); + + // Iterate over 3x3 neighboring cells (including self) + // Since cullRadius == cellSize, this guarantees covering all potential interactions + for (int dy = -1; dy <= 1; dy++) { + for (int dx = -1; dx <= 1; dx++) { + uint32_t h = getHash(cx + dx, cy + dy); + + int start = cellStart[h]; + if (start != -1) { + int end = cellEnd[h]; + + // Iterate over particles in the neighbor cell + // Since we sorted, this is a linear scan over memory -> High Cache Hits + for (int j = start; j < end; j++) { + // Skip self (valid since i and j are indices in sorted array) + if (i == j) continue; + + const Particle& pj = sortedParticles[j]; + + // Optimization: Check squared distance before calling computeForce + // This avoids sqrt() inside computeForce for distant particles + Vec2 dir = pj.position - pi.position; + float distSq = dir.x * dir.x + dir.y * dir.y; + + if (distSq < cullRadiusSq) { + force = force + computeForce(pi, pj, cullRadius); + } + } + } + } + } + + // Update particle physics and write back to original index location + // This implicitly scatters the results back to the correct order + int originalIdx = entries[i].particleId; + newParticles[originalIdx] = updateParticle(pi, force, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gemini3pro_1.cpp b/research/solutions/nbody_simulation/random_100k/gemini3pro_1.cpp new file mode 100644 index 00000000..3767ce95 --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gemini3pro_1.cpp @@ -0,0 +1,216 @@ +#include "world.h" +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads; + + // Buffers for grid-based spatial partitioning + // We sort particles into a grid to improve cache locality and pruning + std::vector sortedParticles; + std::vector gridCounts; + std::vector gridStarts; + // Per-thread histograms for parallel sorting + std::vector> threadCounts; + +public: + MySimulator() : numThreads(1) {} + + void init(int numParticles, StepParameters params) override { + numThreads = omp_get_max_threads(); + // Use all available threads + if (numThreads < 1) numThreads = 1; + omp_set_num_threads(numThreads); + + // Pre-allocate buffers to avoid reallocation during simulation + sortedParticles.resize(numParticles); + threadCounts.resize(numThreads); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + + int N = (int)particles.size(); + if (N == 0) return; + + // Ensure output vector is correctly sized + if (newParticles.size() != particles.size()) { + newParticles.resize(particles.size()); + } + + // 1. 
Calculate Bounding Box of the world dynamically + float minX = std::numeric_limits::max(); + float maxX = std::numeric_limits::lowest(); + float minY = std::numeric_limits::max(); + float maxY = std::numeric_limits::lowest(); + + #pragma omp parallel reduction(min:minX, minY) reduction(max:maxX, maxY) + for (int i = 0; i < N; i++) { + Vec2 p = particles[i].position; + if (p.x < minX) minX = p.x; + if (p.x > maxX) maxX = p.x; + if (p.y < minY) minY = p.y; + if (p.y > maxY) maxY = p.y; + } + + // Add small epsilon to bounds to handle edge cases + minX -= 0.1f; minY -= 0.1f; + maxX += 0.1f; maxY += 0.1f; + + float width = maxX - minX; + float height = maxY - minY; + + // 2. Setup Grid + // A cell size of ~2.0 provides a good balance between culling efficiency + // and grid traversal overhead for a cullRadius of 25.0 in a 100.0 world. + // Smaller cells (e.g. 2.0) cull better than larger cells (e.g. 25.0). + float targetCellSize = 2.0f; + int dimX = std::max(1, (int)(width / targetCellSize)); + int dimY = std::max(1, (int)(height / targetCellSize)); + + float cellSizeX = width / dimX; + float cellSizeY = height / dimY; + + int numCells = dimX * dimY; + + // Resize grid buffers if the grid dimensions changed + if ((int)gridCounts.size() != numCells) { + gridCounts.resize(numCells); + gridStarts.resize(numCells); + } + + // 3. Parallel Sort / Binning + // We use a parallel counting sort approach to organize particles by cell. + + // Step 3a: Compute per-thread histograms + #pragma omp parallel + { + int tid = omp_get_thread_num(); + if ((int)threadCounts[tid].size() != numCells) { + threadCounts[tid].resize(numCells); + } + // Clear counts + std::fill(threadCounts[tid].begin(), threadCounts[tid].end(), 0); + + #pragma omp for + for (int i = 0; i < N; i++) { + const Vec2& p = particles[i].position; + int cx = (int)((p.x - minX) / cellSizeX); + int cy = (int)((p.y - minY) / cellSizeY); + // Clamp indices + if (cx >= dimX) cx = dimX - 1; else if (cx < 0) cx = 0; + if (cy >= dimY) cy = dimY - 1; else if (cy < 0) cy = 0; + + int cellIdx = cy * dimX + cx; + threadCounts[tid][cellIdx]++; + } + } + + // Step 3b: Compute global offsets (Prefix Sum) + // Convert threadCounts to local offsets and aggregate into gridCounts + #pragma omp parallel for + for (int c = 0; c < numCells; c++) { + int sum = 0; + for (int t = 0; t < numThreads; t++) { + int count = threadCounts[t][c]; + threadCounts[t][c] = sum; // Store offset for this thread + sum += count; + } + gridCounts[c] = sum; + } + + // Compute start indices for each cell in the sorted array + int current = 0; + for (int c = 0; c < numCells; c++) { + gridStarts[c] = current; + current += gridCounts[c]; + } + + // Step 3c: Scatter particles to sorted array + // Each thread writes its particles to the correct pre-calculated positions + #pragma omp parallel for + for (int i = 0; i < N; i++) { + int tid = omp_get_thread_num(); + const Vec2& p = particles[i].position; + int cx = (int)((p.x - minX) / cellSizeX); + int cy = (int)((p.y - minY) / cellSizeY); + if (cx >= dimX) cx = dimX - 1; else if (cx < 0) cx = 0; + if (cy >= dimY) cy = dimY - 1; else if (cy < 0) cy = 0; + + int cellIdx = cy * dimX + cx; + + // Get unique position for this particle + int localOff = threadCounts[tid][cellIdx]++; + int globalPos = gridStarts[cellIdx] + localOff; + + sortedParticles[globalPos] = particles[i]; + } + + // 4. 
Force Calculation + float cullR = params.cullRadius; + float cullRSq = cullR * cullR; + float dt = params.deltaTime; + + // Determine search range in grid cells + int searchRadX = (int)(cullR / cellSizeX) + 1; + int searchRadY = (int)(cullR / cellSizeY) + 1; + + #pragma omp parallel for schedule(dynamic, 64) + for (int i = 0; i < N; i++) { + const Particle& pi = particles[i]; + Vec2 pos = pi.position; + Vec2 force{0.0f, 0.0f}; + + int cx = (int)((pos.x - minX) / cellSizeX); + int cy = (int)((pos.y - minY) / cellSizeY); + if (cx >= dimX) cx = dimX - 1; else if (cx < 0) cx = 0; + if (cy >= dimY) cy = dimY - 1; else if (cy < 0) cy = 0; + + int minGX = std::max(0, cx - searchRadX); + int maxGX = std::min(dimX - 1, cx + searchRadX); + int minGY = std::max(0, cy - searchRadY); + int maxGY = std::min(dimY - 1, cy + searchRadY); + + for (int gy = minGY; gy <= maxGY; gy++) { + int rowOffset = gy * dimX; + for (int gx = minGX; gx <= maxGX; gx++) { + int gIdx = rowOffset + gx; + int start = gridStarts[gIdx]; + int count = gridCounts[gIdx]; + + if (count == 0) continue; + + int end = start + count; + for (int j = start; j < end; j++) { + const Particle& pj = sortedParticles[j]; + + // Avoid self-interaction + if (pi.id == pj.id) continue; + + float dx = pj.position.x - pos.x; + float dy = pj.position.y - pos.y; + + // Fast bounding box rejection to avoid sqrt + if (dx > cullR || dx < -cullR) continue; + if (dy > cullR || dy < -cullR) continue; + + float d2 = dx*dx + dy*dy; + if (d2 <= cullRSq) { + force += computeForce(pi, pj, cullR); + } + } + } + } + newParticles[i] = updateParticle(pi, force, dt); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gemini3pro_2.cpp b/research/solutions/nbody_simulation/random_100k/gemini3pro_2.cpp new file mode 100644 index 00000000..4b41c67a --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gemini3pro_2.cpp @@ -0,0 +1,128 @@ +#include "world.h" +#include +#include +#include +#include +#include +#include + +class SpatialHashSimulator : public Simulator { +private: + // Hash table configuration + // 262144 (2^18) buckets to minimize collisions for 100k particles + static const int HASH_SIZE = 262144; + static const int HASH_MASK = HASH_SIZE - 1; + + // Persistent storage for grid + // heads: array of atomic integers pointing to the first particle in a bucket + // nexts: array of integers for the linked list of particles + std::atomic* heads; + std::vector nexts; + int numThreads; + + // Spatial hash function + inline int getHash(int x, int y) const { + // Large primes to minimize collisions in the hash table + unsigned int ux = (unsigned int)x * 73856093; + unsigned int uy = (unsigned int)y * 19349663; + return (int)((ux ^ uy) & HASH_MASK); + } + + // Helper to calculate grid coordinate + inline int fastFloor(float x) const { + return (int)std::floor(x); + } + +public: + SpatialHashSimulator() : heads(nullptr), numThreads(1) { + // Allocate array of atomics + heads = new std::atomic[HASH_SIZE]; + } + + ~SpatialHashSimulator() { + delete[] heads; + } + + void init(int numParticles, StepParameters params) override { + // Configure threads + numThreads = omp_get_max_threads(); + if (numThreads < 1) numThreads = 1; + omp_set_num_threads(numThreads); + + // Pre-allocate linked list buffer + if ((int)nexts.size() != numParticles) { + nexts.resize(numParticles); + } + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, 
+ StepParameters params) override { + const int n = (int)particles.size(); + const float r = params.cullRadius; + const float rSq = r * r; + const float invR = 1.0f / r; + + // 1. Clear Grid Headers (Parallel) + #pragma omp parallel for + for (int i = 0; i < HASH_SIZE; i++) { + heads[i].store(-1, std::memory_order_relaxed); + } + + // 2. Build Grid (Parallel) + // Insert particles into the spatial hash table + #pragma omp parallel for + for (int i = 0; i < n; i++) { + int cx = fastFloor(particles[i].position.x * invR); + int cy = fastFloor(particles[i].position.y * invR); + int h = getHash(cx, cy); + + // Atomic insertion at the head of the list + // memory_order_relaxed is safe here because of the implicit barrier + // at the end of the parallel region before the next phase. + nexts[i] = heads[h].exchange(i, std::memory_order_relaxed); + } + + // 3. Compute Forces and Update (Parallel) + // Use dynamic scheduling to handle load imbalance due to particle clustering + #pragma omp parallel for schedule(dynamic, 64) + for (int i = 0; i < n; i++) { + Vec2 pos = particles[i].position; + int cx = fastFloor(pos.x * invR); + int cy = fastFloor(pos.y * invR); + + Vec2 totalForce = {0.0f, 0.0f}; + + // Check the 3x3 block of cells around the particle + for (int dx = -1; dx <= 1; dx++) { + for (int dy = -1; dy <= 1; dy++) { + int h = getHash(cx + dx, cy + dy); + + // Traverse the linked list for this bucket + int j = heads[h].load(std::memory_order_relaxed); + while (j != -1) { + if (i != j) { + Vec2 d = particles[j].position - pos; + + // Check squared distance to filter particles + // This avoids the sqrt call in computeForce for particles + // that are in the neighbor cells but outside the radius. + float distSq = d.x*d.x + d.y*d.y; + if (distSq <= rSq) { + totalForce += computeForce(particles[i], particles[j], r); + } + } + j = nexts[j]; + } + } + } + + // Integrate and store result + newParticles[i] = updateParticle(particles[i], totalForce, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new SpatialHashSimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gemini3pro_3.cpp b/research/solutions/nbody_simulation/random_100k/gemini3pro_3.cpp new file mode 100644 index 00000000..b0f7740a --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gemini3pro_3.cpp @@ -0,0 +1,184 @@ +#include "world.h" +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads = 16; + + struct ParticleRef { + int originalIndex; + uint32_t hash; + }; + + // Persistent buffers to minimize allocation overhead + std::vector pRefs; + std::vector sortedParticles; + std::vector cellStart; + std::vector cellEnd; + std::vector> threadForces; + + // Grid configuration + // Using a large power of 2 for hash map to minimize collisions while avoiding sparse matrix overhead + static const int HASH_SIZE = 131072; // 2^17 + static const int HASH_MASK = HASH_SIZE - 1; + +public: + void init(int numParticles, StepParameters params) override { + // Detect and set thread count + int max_threads = omp_get_max_threads(); + numThreads = (max_threads > 0) ? 
max_threads : 1; + omp_set_num_threads(numThreads); + + // Pre-allocate memory + pRefs.resize(numParticles); + sortedParticles.resize(numParticles); + + // Resize hash map arrays + cellStart.resize(HASH_SIZE); + cellEnd.resize(HASH_SIZE); + + // Allocate per-thread force accumulation buffers + threadForces.resize(numThreads); + for(auto& v : threadForces) { + v.resize(numParticles); + } + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + + int N = particles.size(); + + // Use cell size = cullRadius / 2.0 for finer granularity. + // This requires searching a 5x5 block of cells but approximates the circular cull area better + // than a 3x3 block with cell size = cullRadius. + float cellSize = params.cullRadius * 0.5f; + float invCellSize = 1.0f / cellSize; + float cullRadiusSq = params.cullRadius * params.cullRadius; + + // 1. Reset thread accumulation buffers + #pragma omp parallel + { + int tid = omp_get_thread_num(); + // Using explicit initialization to zero + std::fill(threadForces[tid].begin(), threadForces[tid].end(), Vec2{0.0f, 0.0f}); + } + + // 2. Assign particles to grid cells (Compute Hash) and store references + #pragma omp parallel for + for (int i = 0; i < N; i++) { + int ix = (int)std::floor(particles[i].position.x * invCellSize); + int iy = (int)std::floor(particles[i].position.y * invCellSize); + // Spatial hash using large primes + uint32_t h = ((ix * 73856093) ^ (iy * 19349663)) & HASH_MASK; + pRefs[i] = {i, h}; + } + + // 3. Sort particles by hash to improve cache locality during force calculation + std::sort(pRefs.begin(), pRefs.end(), [](const ParticleRef& a, const ParticleRef& b){ + return a.hash < b.hash; + }); + + // 4. Reset grid and build start/end indices + std::fill(cellStart.begin(), cellStart.end(), -1); + + // Build Grid: Populate start/end indices for each bucket + // Also permute particles into sorted order for linear memory access + // Serial grid build is fast enough for 100k particles + for (int i = 0; i < N; i++) { + uint32_t h = pRefs[i].hash; + if (cellStart[h] == -1) cellStart[h] = i; + cellEnd[h] = i + 1; + } + + #pragma omp parallel for + for (int i = 0; i < N; i++) { + sortedParticles[i] = particles[pRefs[i].originalIndex]; + } + + // 5. Parallel Force Computation + #pragma omp parallel + { + int tid = omp_get_thread_num(); + auto& myForces = threadForces[tid]; + + // Dynamic schedule handles load imbalance due to clustering + #pragma omp for schedule(dynamic, 64) + for (int i = 0; i < N; i++) { + const Particle& pi = sortedParticles[i]; + int original_i = pRefs[i].originalIndex; + + int cx = (int)std::floor(pi.position.x * invCellSize); + int cy = (int)std::floor(pi.position.y * invCellSize); + + // Check 5x5 neighbor block (radius 2 cells covers cullRadius) + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + int nx = cx + dx; + int ny = cy + dy; + uint32_t h = ((nx * 73856093) ^ (ny * 19349663)) & HASH_MASK; + + int start = cellStart[h]; + if (start == -1) continue; + int end = cellEnd[h]; + + // Iterate over potential neighbors in this bucket + for (int j = start; j < end; j++) { + // Enforce uniqueness and Newton's 3rd Law: + // Process pair (i, j) only once. Since we check all neighbor combinations + // symmetrically in the grid search, 'j > i' is sufficient to process each pair once. 
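Because the symmetric update writes to both particles of a pair, a naive parallel loop would race on the force array; that is why this file accumulates into per-thread buffers and sums them afterwards. A distilled sketch of that pattern, not this file's exact loop structure; the pair list and force callback are illustrative:

```
#include <omp.h>
#include <utility>
#include <vector>

struct V2 { float x = 0.0f, y = 0.0f; };

// pairs holds candidate pairs (i, j) with i < j; pairForce(i, j) returns the
// force exerted on i by j. Each thread writes only to its own buffer, then the
// buffers are reduced, so the +F / -F updates never race.
template <class F>
std::vector<V2> accumulateSymmetric(int n, const std::vector<std::pair<int, int>>& pairs,
                                    F pairForce) {
    const int T = omp_get_max_threads();
    std::vector<std::vector<V2>> buf(T, std::vector<V2>(n));
    #pragma omp parallel
    {
        std::vector<V2>& mine = buf[omp_get_thread_num()];
        #pragma omp for schedule(dynamic, 256)
        for (int k = 0; k < (int)pairs.size(); ++k) {
            const int i = pairs[k].first, j = pairs[k].second;
            V2 f = pairForce(i, j);
            mine[i].x += f.x; mine[i].y += f.y;   // force on i
            mine[j].x -= f.x; mine[j].y -= f.y;   // Newton's third law: reaction on j
        }
    }
    std::vector<V2> total(n);
    for (int t = 0; t < T; ++t)
        for (int i = 0; i < n; ++i) { total[i].x += buf[t][i].x; total[i].y += buf[t][i].y; }
    return total;
}
```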
+ if (j <= i) continue; + + const Particle& pj = sortedParticles[j]; + + // Verify particle is actually in the target cell to handle hash collisions + // and ensure we don't process "ghost" neighbors from aliased cells + int px = (int)std::floor(pj.position.x * invCellSize); + int py = (int)std::floor(pj.position.y * invCellSize); + if (px != nx || py != ny) continue; + + Vec2 dir = pj.position - pi.position; + float d2 = dir.x*dir.x + dir.y*dir.y; + + // Optimization: Check squared distance before calling computeForce (avoids sqrt) + if (d2 > cullRadiusSq || d2 < 1e-6f) continue; + + // Calculate force + Vec2 force = computeForce(pi, pj, params.cullRadius); + + // Apply symmetric forces to original indices + // force is exerted on pi by pj. + int original_j = pRefs[j].originalIndex; + + myForces[original_i].x += force.x; + myForces[original_i].y += force.y; + myForces[original_j].x -= force.x; + myForces[original_j].y -= force.y; + } + } + } + } + } + + // 6. Integrate and Finalize + // Sum forces from all thread buffers and update particles + #pragma omp parallel for + for (int i = 0; i < N; i++) { + Vec2 totalForce = {0.0f, 0.0f}; + for (int t = 0; t < numThreads; t++) { + totalForce.x += threadForces[t][i].x; + totalForce.y += threadForces[t][i].y; + } + newParticles[i] = updateParticle(particles[i], totalForce, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gemini3pro_4.cpp b/research/solutions/nbody_simulation/random_100k/gemini3pro_4.cpp new file mode 100644 index 00000000..8f83e8f6 --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gemini3pro_4.cpp @@ -0,0 +1,192 @@ +#include "world.h" +#include +#include +#include +#include +#include + +class OptimizedSimulator : public Simulator { +private: + int numThreads = 16; + + // Persistent buffers to minimize allocation overhead + std::vector cellCounts; + std::vector cellOffsets; + std::vector sortedParticles; + std::vector origIndices; + std::vector particleCellIndices; + +public: + void init(int numParticles, StepParameters params) override { + // Set OpenMP threads + omp_set_num_threads(numThreads); + + // Pre-allocate buffers for particle data + sortedParticles.resize(numParticles); + origIndices.resize(numParticles); + particleCellIndices.resize(numParticles); + + // Reserve space for grid (heuristic size, will resize if needed) + cellCounts.reserve(4096); + cellOffsets.reserve(4096); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + + int n = (int)particles.size(); + if (n == 0) return; + + float r = params.cullRadius; + float rSq = r * r; + + // 1. Compute Bounding Box + // Initialize with first particle to avoid checking limits constants + float minX = particles[0].position.x; + float maxX = particles[0].position.x; + float minY = particles[0].position.y; + float maxY = particles[0].position.y; + + // Simple serial pass is efficient enough for 100k particles (memory bound) + for (int i = 1; i < n; i++) { + float px = particles[i].position.x; + float py = particles[i].position.y; + if (px < minX) minX = px; + if (px > maxX) maxX = px; + if (py < minY) minY = py; + if (py > maxY) maxY = py; + } + + // Add padding to bounding box + minX -= 0.1f; minY -= 0.1f; + maxX += 0.1f; maxY += 0.1f; + + // 2. Setup Grid Dimensions + // Use cell size = cullRadius. This ensures we only need to check 3x3 neighbor cells. 
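The 3x3 claim above follows from a one-line inequality: if two particles are within R of each other and the cell size s satisfies s >= R, then their cell coordinates differ by at most 1 per axis, because |x_j/s - x_i/s| <= R/s <= 1 and the floors of two values at most 1 apart differ by at most 1. A small, self-contained check of that property, illustrative only:

```
#include <cassert>
#include <cmath>
#include <cstdlib>

int main() {
    const float s = 1.0f, R = 1.0f;   // cell size >= cull radius
    for (int t = 0; t < 100000; ++t) {
        float x1 = 200.0f * std::rand() / RAND_MAX - 100.0f;
        float x2 = x1 + R * (2.0f * std::rand() / RAND_MAX - 1.0f);
        if (std::fabs(x2 - x1) > R) continue;    // only test pairs actually within R
        int c1 = (int)std::floor(x1 / s);
        int c2 = (int)std::floor(x2 / s);
        assert(std::abs(c1 - c2) <= 1);          // neighbor cell is at most one step away
    }
    return 0;
}
```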
+ int gridW = (int)((maxX - minX) / r) + 1; + int gridH = (int)((maxY - minY) / r) + 1; + int numCells = gridW * gridH; + + // Resize grid buffers if the grid has grown + if ((int)cellCounts.size() < numCells) { + cellCounts.resize(numCells); + cellOffsets.resize(numCells); + } + + // Clear cell counts for this step + std::fill(cellCounts.begin(), cellCounts.begin() + numCells, 0); + + // 3. Assign Particles to Cells and Count + for (int i = 0; i < n; i++) { + int cx = (int)((particles[i].position.x - minX) / r); + int cy = (int)((particles[i].position.y - minY) / r); + + // Clamp indices to be safe + if (cx < 0) cx = 0; else if (cx >= gridW) cx = gridW - 1; + if (cy < 0) cy = 0; else if (cy >= gridH) cy = gridH - 1; + + int cellIdx = cy * gridW + cx; + particleCellIndices[i] = cellIdx; + cellCounts[cellIdx]++; + } + + // 4. Compute Cell Offsets (Prefix Sum) + int current = 0; + for (int c = 0; c < numCells; c++) { + cellOffsets[c] = current; + current += cellCounts[c]; + } + + // 5. Reorder Particles (Counting Sort / spatial hashing) + // This improves cache locality significantly during force calculation + // Create a temporary copy of offsets to track insertion positions + std::vector currentPos(cellOffsets.begin(), cellOffsets.begin() + numCells); + + for (int i = 0; i < n; i++) { + int cIdx = particleCellIndices[i]; + int dst = currentPos[cIdx]++; + sortedParticles[dst] = particles[i]; + origIndices[dst] = i; // Store original index to write back results correctly + } + + // 6. Parallel Force Calculation + // Iterate over sorted particles. Threads process chunks of spatially local particles. + #pragma omp parallel + { + // Thread-local cache to store neighbor cell ranges + // This avoids recalculating neighbors for every particle in the same cell + int cachedCellIdx = -1; + struct Range { int start; int end; }; + Range neighborRanges[9]; // Max 9 neighbors (3x3) + int numNeighbors = 0; + + #pragma omp for schedule(dynamic, 64) + for (int i = 0; i < n; i++) { + const Particle& pi = sortedParticles[i]; + + // Determine the cell of the current particle from its position + int cx = (int)((pi.position.x - minX) / r); + int cy = (int)((pi.position.y - minY) / r); + if (cx < 0) cx = 0; else if (cx >= gridW) cx = gridW - 1; + if (cy < 0) cy = 0; else if (cy >= gridH) cy = gridH - 1; + + int cellIdx = cy * gridW + cx; + + // If we moved to a new cell, update the neighbor list + if (cellIdx != cachedCellIdx) { + cachedCellIdx = cellIdx; + numNeighbors = 0; + + int min_x = std::max(0, cx - 1); + int max_x = std::min(gridW - 1, cx + 1); + int min_y = std::max(0, cy - 1); + int max_y = std::min(gridH - 1, cy + 1); + + for (int ny = min_y; ny <= max_y; ny++) { + int rowOffset = ny * gridW; + for (int nx = min_x; nx <= max_x; nx++) { + int nIdx = rowOffset + nx; + int count = cellCounts[nIdx]; + // Only add non-empty cells + if (count > 0) { + int start = cellOffsets[nIdx]; + neighborRanges[numNeighbors++] = {start, start + count}; + } + } + } + } + + Vec2 force = {0.0f, 0.0f}; + + // Iterate over all candidate particles in neighbor cells + for (int k = 0; k < numNeighbors; k++) { + int start = neighborRanges[k].start; + int end = neighborRanges[k].end; + + for (int j = start; j < end; j++) { + // Skip self + if (i == j) continue; + + const Particle& pj = sortedParticles[j]; + float dx = pj.position.x - pi.position.x; + float dy = pj.position.y - pi.position.y; + + // Check squared distance first to avoid sqrt in computeForce + float distSq = dx*dx + dy*dy; + if (distSq < rSq) { + force += 
computeForce(pi, pj, r); + } + } + } + + // Update particle and write to correct location in newParticles + newParticles[origIndices[i]] = updateParticle(pi, force, params.deltaTime); + } + } + } +}; + +Simulator* createSimulator() { + return new OptimizedSimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gpt5.1.cpp b/research/solutions/nbody_simulation/random_100k/gpt5.1.cpp new file mode 100644 index 00000000..a10b5916 --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gpt5.1.cpp @@ -0,0 +1,254 @@ +#include "world.h" +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + int maxParticles = 0; + int maxThreads = 1; + + // Per-thread force buffers: size = maxThreads * maxParticles + std::vector forceX; + std::vector forceY; + + // Spatial grid / hashing data + std::vector cellIndices; // cell index per particle, size = numParticles + std::vector gridParticleIndices; // particles sorted by cell, size = numParticles + std::vector cellCounts; // particles per cell, size = numCells + std::vector cellStart; // prefix sums, size = numCells + 1 + std::vector cellOffsets; // temp offsets for fill, size = numCells + +public: + void init(int numParticles, StepParameters params) override { + // Fix OpenMP configuration + omp_set_dynamic(0); // disable dynamic adjustment + maxThreads = omp_get_max_threads(); + omp_set_num_threads(maxThreads); // ensure parallel regions use this + + maxParticles = numParticles; + forceX.assign(maxThreads * maxParticles, 0.0f); + forceY.assign(maxThreads * maxParticles, 0.0f); + + cellIndices.resize(maxParticles); + gridParticleIndices.resize(maxParticles); + + cellCounts.clear(); + cellStart.clear(); + cellOffsets.clear(); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + const int n = static_cast(particles.size()); + if (n == 0) return; + + if (static_cast(newParticles.size()) != n) { + newParticles.resize(n); + } + + // Resize buffers if particle count increased beyond initial + if (n > maxParticles) { + maxParticles = n; + forceX.assign(maxThreads * maxParticles, 0.0f); + forceY.assign(maxThreads * maxParticles, 0.0f); + cellIndices.resize(maxParticles); + gridParticleIndices.resize(maxParticles); + } else { + cellIndices.resize(n); + gridParticleIndices.resize(n); + } + + const float cullR = params.cullRadius; + + // If cull radius is non-positive, no forces act; just integrate with zero force. + if (cullR <= 0.0f) { + #pragma omp parallel for schedule(static) + for (int i = 0; i < n; ++i) { + Vec2 zero(0.0f, 0.0f); + newParticles[i] = updateParticle(particles[i], zero, params.deltaTime); + } + return; + } + + // Compute bounding box of particles (sequential - cheap) + float minX = particles[0].position.x; + float maxX = minX; + float minY = particles[0].position.y; + float maxY = minY; + for (int i = 1; i < n; ++i) { + const Vec2 &pos = particles[i].position; + const float x = pos.x; + const float y = pos.y; + if (x < minX) minX = x; + if (x > maxX) maxX = x; + if (y < minY) minY = y; + if (y > maxY) maxY = y; + } + + // Spatial grid configuration + // Use cell size as a fraction of cullRadius to get reasonably fine grid. 
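+        // Note on the r/8 choice below: finer cells hug the circular cull region more tightly, so
+        // fewer candidate pairs fail the distance test, at the cost of scanning a larger block of
+        // neighboring cells for each occupied cell (neighborRange is typically 9 when
+        // cellSize = cullRadius / 8, i.e. up to a 19x19 block).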
+ float cellSize = cullR * 0.125f; // cullRadius / 8 + if (cellSize < 1e-3f) { + cellSize = std::max(cullR, 1e-3f); + } + const float invCellSize = 1.0f / cellSize; + + const float widthX = maxX - minX; + const float widthY = maxY - minY; + + int gridW = std::max(1, static_cast(std::ceil(widthX * invCellSize))); + int gridH = std::max(1, static_cast(std::ceil(widthY * invCellSize))); + int numCells = gridW * gridH; + + cellCounts.assign(numCells, 0); + cellStart.assign(numCells + 1, 0); + cellOffsets.assign(numCells, 0); + + // First pass: assign each particle to a cell and count + for (int i = 0; i < n; ++i) { + const Vec2 &pos = particles[i].position; + int cx = static_cast((pos.x - minX) * invCellSize); + int cy = static_cast((pos.y - minY) * invCellSize); + + if (cx < 0) cx = 0; + else if (cx >= gridW) cx = gridW - 1; + if (cy < 0) cy = 0; + else if (cy >= gridH) cy = gridH - 1; + + const int cell = cy * gridW + cx; + cellIndices[i] = cell; + cellCounts[cell]++; + } + + // Prefix sums to get cell start indices + int sum = 0; + for (int c = 0; c < numCells; ++c) { + cellStart[c] = sum; + sum += cellCounts[c]; + } + cellStart[numCells] = sum; // should be == n + + // Second pass: fill gridParticleIndices with particles grouped by cell + for (int c = 0; c < numCells; ++c) { + cellOffsets[c] = cellStart[c]; + } + for (int i = 0; i < n; ++i) { + const int cell = cellIndices[i]; + const int dst = cellOffsets[cell]++; + gridParticleIndices[dst] = i; + } + + const float cull2 = cullR * cullR; + const int neighborRange = static_cast(std::ceil(cullR / cellSize)) + 1; + + #pragma omp parallel + { + const int tid = omp_get_thread_num(); + const int T = omp_get_num_threads(); // should equal maxThreads + + float *fx = &forceX[tid * maxParticles]; + float *fy = &forceY[tid * maxParticles]; + + // Zero thread-local force buffers for active particles + for (int i2 = 0; i2 < n; ++i2) { + fx[i2] = 0.0f; + fy[i2] = 0.0f; + } + + // Compute pairwise forces using spatial grid, each unordered pair exactly once + #pragma omp for schedule(dynamic) + for (int cell = 0; cell < numCells; ++cell) { + const int cy = cell / gridW; + const int cx = cell - cy * gridW; + + const int iBegin = cellStart[cell]; + const int iEnd = cellStart[cell + 1]; + if (iBegin >= iEnd) continue; + + const int nyMin = std::max(0, cy - neighborRange); + const int nyMax = std::min(gridH - 1, cy + neighborRange); + const int nxMin0 = std::max(0, cx - neighborRange); + const int nxMax0 = std::min(gridW - 1, cx + neighborRange); + + for (int ny = nyMin; ny <= nyMax; ++ny) { + const int rowBase = ny * gridW; + for (int nx = nxMin0; nx <= nxMax0; ++nx) { + const int cell2 = rowBase + nx; + if (cell2 < cell) continue; // avoid double counting cell pairs + + const int jBegin = cellStart[cell2]; + const int jEnd = cellStart[cell2 + 1]; + if (jBegin >= jEnd) continue; + + if (cell2 == cell) { + // Pairs within same cell: i +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads; + + // Grid parameters + int gridSizeX = 0; + int gridSizeY = 0; + int numCells = 0; + int cellRadius = 0; + float cellSize = 1.0f; + float invCellSize = 1.0f; + + // Persistent buffers + std::vector particleCellX; + std::vector particleCellY; + std::vector cellStart; // starting index in cellParticleIndices + std::vector cellCount; // temporary counts per cell + std::vector cellNext; // for filling indices + std::vector cellParticleIndices; // indices of particles by cell + +public: + MySimulator() { + numThreads = 
omp_get_max_threads(); + } + + void init(int numParticles, StepParameters params) override { + numThreads = omp_get_max_threads(); + omp_set_num_threads(numThreads); + + particleCellX.resize(numParticles); + particleCellY.resize(numParticles); + cellParticleIndices.resize(numParticles); + cellStart.clear(); + cellCount.clear(); + cellNext.clear(); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + const int N = static_cast(particles.size()); + if (N == 0) return; + if (static_cast(newParticles.size()) < N) + newParticles.resize(N); + + // 1. Compute bounding box (sequential, negligible vs force computations) + float minx = particles[0].position.x; + float maxx = minx; + float miny = particles[0].position.y; + float maxy = miny; + for (int i = 1; i < N; ++i) { + float x = particles[i].position.x; + float y = particles[i].position.y; + if (x < minx) minx = x; + if (x > maxx) maxx = x; + if (y < miny) miny = y; + if (y > maxy) maxy = y; + } + + float width = maxx - minx; + float height = maxy - miny; + if (width < 1e-3f) width = 1e-3f; + if (height < 1e-3f) height = 1e-3f; + float area = width * height; + + const float R = params.cullRadius; + const float R2 = R * R; + + // 2. Choose cell size based on desired particles per cell, clamped by cullRadius + const float targetParticlesPerCell = 32.0f; + float estCellArea = area * targetParticlesPerCell / static_cast(N); + if (estCellArea < 1e-4f) estCellArea = 1e-4f; + float chosenCellSize = std::sqrt(estCellArea); + if (chosenCellSize > R) chosenCellSize = R; + // Avoid extremely tiny cells for numerical stability + if (chosenCellSize < R * 0.05f) chosenCellSize = R * 0.05f; + if (chosenCellSize < 1e-3f) chosenCellSize = 1e-3f; + + cellSize = chosenCellSize; + invCellSize = 1.0f / cellSize; + + gridSizeX = static_cast(std::ceil(width * invCellSize)); + gridSizeY = static_cast(std::ceil(height * invCellSize)); + if (gridSizeX < 1) gridSizeX = 1; + if (gridSizeY < 1) gridSizeY = 1; + numCells = gridSizeX * gridSizeY; + + cellRadius = static_cast(std::ceil(R * invCellSize)); + if (cellRadius < 1) cellRadius = 1; + + // Ensure buffers have enough capacity + if (static_cast(cellCount.size()) < numCells) { + cellCount.assign(numCells, 0); + } else { + std::fill(cellCount.begin(), cellCount.begin() + numCells, 0); + } + if (static_cast(cellStart.size()) < numCells) + cellStart.resize(numCells); + if (static_cast(cellNext.size()) < numCells) + cellNext.resize(numCells); + if (static_cast(cellParticleIndices.size()) < N) + cellParticleIndices.resize(N); + if (static_cast(particleCellX.size()) < N) + particleCellX.resize(N); + if (static_cast(particleCellY.size()) < N) + particleCellY.resize(N); + + // 3. Assign particles to cells and count per-cell occupancy + for (int i = 0; i < N; ++i) { + float fx = (particles[i].position.x - minx) * invCellSize; + float fy = (particles[i].position.y - miny) * invCellSize; + int cx = static_cast(fx); + int cy = static_cast(fy); + if (cx < 0) cx = 0; + else if (cx >= gridSizeX) cx = gridSizeX - 1; + if (cy < 0) cy = 0; + else if (cy >= gridSizeY) cy = gridSizeY - 1; + + particleCellX[i] = cx; + particleCellY[i] = cy; + int cellId = cy * gridSizeX + cx; + ++cellCount[cellId]; + } + + // 4. Build prefix sums to compute cellStart + int sum = 0; + for (int c = 0; c < numCells; ++c) { + cellStart[c] = sum; + sum += cellCount[c]; + } + // sum should equal N + + // 5. 
Fill cellParticleIndices with particle indices + for (int c = 0; c < numCells; ++c) + cellNext[c] = cellStart[c]; + + for (int i = 0; i < N; ++i) { + int cx = particleCellX[i]; + int cy = particleCellY[i]; + int cellId = cy * gridSizeX + cx; + int idx = cellNext[cellId]++; + cellParticleIndices[idx] = i; + } + + // 6. Compute forces in parallel + const int localGridSizeX = gridSizeX; + const int localGridSizeY = gridSizeY; + const int localNumCells = numCells; + const int localRadius = cellRadius; + const float localR = R; + const float localR2 = R2; + + #pragma omp parallel for schedule(static) + for (int i = 0; i < N; ++i) { + const Particle &pi = particles[i]; + Vec2 force(0.0f, 0.0f); + + int cx = particleCellX[i]; + int cy = particleCellY[i]; + + int minCellY = cy - localRadius; + if (minCellY < 0) minCellY = 0; + int maxCellY = cy + localRadius; + if (maxCellY >= localGridSizeY) maxCellY = localGridSizeY - 1; + + int minCellX = cx - localRadius; + if (minCellX < 0) minCellX = 0; + int maxCellX = cx + localRadius; + if (maxCellX >= localGridSizeX) maxCellX = localGridSizeX - 1; + + for (int ny = minCellY; ny <= maxCellY; ++ny) { + int baseIdx = ny * localGridSizeX; + for (int nx = minCellX; nx <= maxCellX; ++nx) { + int cellId = baseIdx + nx; + int start = cellStart[cellId]; + int end = (cellId + 1 < localNumCells) ? cellStart[cellId + 1] : N; + + for (int idx = start; idx < end; ++idx) { + int j = cellParticleIndices[idx]; + if (j == i) continue; + + const Particle &pj = particles[j]; + + float dx = pj.position.x - pi.position.x; + if (dx > localR || dx < -localR) continue; + float dy = pj.position.y - pi.position.y; + if (dy > localR || dy < -localR) continue; + + float dist2 = dx * dx + dy * dy; + if (dist2 >= localR2) continue; + + force += computeForce(pi, pj, localR); + } + } + } + + newParticles[i] = updateParticle(pi, force, params.deltaTime); + } + } +}; + +// Factory function - must be implemented +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gpt5.1_2.cpp b/research/solutions/nbody_simulation/random_100k/gpt5.1_2.cpp new file mode 100644 index 00000000..4dd0a66d --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gpt5.1_2.cpp @@ -0,0 +1,185 @@ +#include "world.h" +#include +#include +#include +#include +#include + +struct KDNode { + float minX, maxX, minY, maxY; + int start, end; + int left, right; +}; + +class MySimulator : public Simulator { +private: + int numThreads = 0; + int leafSize = 16; + std::vector indices; + std::vector nodes; + const Particle* points = nullptr; + float radius = 0.0f; + float radius2 = 0.0f; + float deltaTime = 0.0f; + int root = -1; + + inline void computeBounding(int start, int end, + float &minX, float &maxX, + float &minY, float &maxY) { + minX = std::numeric_limits::infinity(); + minY = std::numeric_limits::infinity(); + maxX = -std::numeric_limits::infinity(); + maxY = -std::numeric_limits::infinity(); + + for (int i = start; i < end; ++i) { + const Particle &p = points[ indices[i] ]; + float x = p.position.x; + float y = p.position.y; + if (x < minX) minX = x; + if (x > maxX) maxX = x; + if (y < minY) minY = y; + if (y > maxY) maxY = y; + } + + if (start == end) { + minX = maxX = minY = maxY = 0.0f; + } + } + + int buildNode(int start, int end) { + if (start >= end) return -1; + + int nodeIdx = (int)nodes.size(); + nodes.emplace_back(); + KDNode &node = nodes.back(); + + node.start = start; + node.end = end; + + 
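+        // The tight bounding box stored here lets rangeSearch() discard any subtree whose box lies
+        // entirely outside the query particle's cull circle, which is what keeps the per-particle
+        // neighbor search far below a full O(n) scan in practice.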
computeBounding(start, end, node.minX, node.maxX, node.minY, node.maxY); + + int count = end - start; + if (count <= leafSize) { + node.left = -1; + node.right = -1; + } else { + float rangeX = node.maxX - node.minX; + float rangeY = node.maxY - node.minY; + int axis = (rangeX > rangeY) ? 0 : 1; + + int mid = start + count / 2; + auto beginIt = indices.begin() + start; + auto midIt = indices.begin() + mid; + auto endIt = indices.begin() + end; + + const Particle* localPoints = points; + if (axis == 0) { + std::nth_element(beginIt, midIt, endIt, + [localPoints](int a, int b) { + return localPoints[a].position.x < localPoints[b].position.x; + }); + } else { + std::nth_element(beginIt, midIt, endIt, + [localPoints](int a, int b) { + return localPoints[a].position.y < localPoints[b].position.y; + }); + } + + node.left = buildNode(start, mid); + node.right = buildNode(mid, end); + } + + return nodeIdx; + } + + inline bool bboxIntersectsCircle(const KDNode &node, const Vec2 &pos) const { + float dx = 0.0f; + if (pos.x < node.minX) dx = node.minX - pos.x; + else if (pos.x > node.maxX) dx = pos.x - node.maxX; + + float dy = 0.0f; + if (pos.y < node.minY) dy = node.minY - pos.y; + else if (pos.y > node.maxY) dy = pos.y - node.maxY; + + return dx * dx + dy * dy <= radius2; + } + + void rangeSearch(int nodeIdx, int targetIndex, + const Particle &target, Vec2 &force) const { + const KDNode &node = nodes[nodeIdx]; + if (!bboxIntersectsCircle(node, target.position)) + return; + + if (node.left == -1) { + for (int i = node.start; i < node.end; ++i) { + int j = indices[i]; + if (j == targetIndex) continue; + + const Particle &pj = points[j]; + float dx = pj.position.x - target.position.x; + float dy = pj.position.y - target.position.y; + float dist2 = dx * dx + dy * dy; + if (dist2 <= radius2) { + force += computeForce(target, pj, radius); + } + } + } else { + rangeSearch(node.left, targetIndex, target, force); + rangeSearch(node.right, targetIndex, target, force); + } + } + +public: + MySimulator() { + numThreads = 0; + leafSize = 16; + } + + void init(int numParticles, StepParameters params) override { + (void)params; + numThreads = omp_get_max_threads(); + if (numThreads <= 0) numThreads = 1; + omp_set_num_threads(numThreads); + + if (numParticles > 0) { + indices.resize(numParticles); + nodes.reserve(numParticles * 2); + } + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + int n = (int)particles.size(); + if (n == 0) return; + + if ((int)indices.size() < n) + indices.resize(n); + + if ((int)nodes.capacity() < 2 * n) + nodes.reserve(2 * n); + + points = particles.data(); + radius = params.cullRadius; + radius2 = radius * radius; + deltaTime = params.deltaTime; + + for (int i = 0; i < n; ++i) + indices[i] = i; + + nodes.clear(); + root = buildNode(0, n); + + #pragma omp parallel for schedule(static) + for (int i = 0; i < n; ++i) { + const Particle &pi = particles[i]; + Vec2 force(0.0f, 0.0f); + rangeSearch(root, i, pi, force); + newParticles[i] = updateParticle(pi, force, deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gpt5.1_3.cpp b/research/solutions/nbody_simulation/random_100k/gpt5.1_3.cpp new file mode 100644 index 00000000..ff0b50dd --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gpt5.1_3.cpp @@ -0,0 +1,169 @@ +#include "world.h" +#include +#include +#include +#include + +class 
MySimulator : public Simulator { +private: + int numThreads; + float cellSizeFactor; + std::vector cellCounts; + std::vector cellOffsets; + std::vector sortedIndices; + std::vector particleCells; + +public: + MySimulator() + : numThreads(omp_get_max_threads()), cellSizeFactor(0.1f) {} + + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(numThreads); + sortedIndices.resize(numParticles); + particleCells.resize(numParticles); + // cellCounts and cellOffsets will be sized when grid dimensions are known + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + const int n = static_cast(particles.size()); + if (n == 0) + return; + + if (static_cast(newParticles.size()) < n) + newParticles.resize(n); + + Particle *pData = particles.data(); + Particle *outData = newParticles.data(); + + // Compute bounding box + float minX = pData[0].position.x; + float maxX = pData[0].position.x; + float minY = pData[0].position.y; + float maxY = pData[0].position.y; + + for (int i = 1; i < n; ++i) { + const float x = pData[i].position.x; + const float y = pData[i].position.y; + if (x < minX) minX = x; + if (x > maxX) maxX = x; + if (y < minY) minY = y; + if (y > maxY) maxY = y; + } + + float widthX = maxX - minX; + float widthY = maxY - minY; + if (widthX < 1e-3f) widthX = 1e-3f; + if (widthY < 1e-3f) widthY = 1e-3f; + + const float R = params.cullRadius; + float cellSize = R * cellSizeFactor; + if (cellSize < 1e-3f || cellSize > R) { + cellSize = R; + } + const float invCellSize = 1.0f / cellSize; + + int gridW = static_cast(std::ceil(widthX * invCellSize)); + int gridH = static_cast(std::ceil(widthY * invCellSize)); + if (gridW < 1) gridW = 1; + if (gridH < 1) gridH = 1; + int numCells = gridW * gridH; + + if (static_cast(cellCounts.size()) < numCells) { + cellCounts.assign(numCells, 0); + cellOffsets.resize(numCells + 1); + } else { + std::fill(cellCounts.begin(), cellCounts.begin() + numCells, 0); + if (static_cast(cellOffsets.size()) < numCells + 1) + cellOffsets.resize(numCells + 1); + } + + if (static_cast(sortedIndices.size()) < n) + sortedIndices.resize(n); + if (static_cast(particleCells.size()) < n) + particleCells.resize(n); + + // First pass: assign particles to cells and count + for (int i = 0; i < n; ++i) { + float fx = (pData[i].position.x - minX) * invCellSize; + float fy = (pData[i].position.y - minY) * invCellSize; + int ix = static_cast(fx); + int iy = static_cast(fy); + if (ix < 0) ix = 0; + else if (ix >= gridW) ix = gridW - 1; + if (iy < 0) iy = 0; + else if (iy >= gridH) iy = gridH - 1; + int cellIdx = iy * gridW + ix; + particleCells[i] = cellIdx; + cellCounts[cellIdx]++; + } + + // Prefix sum to get cell offsets + cellOffsets[0] = 0; + for (int c = 0; c < numCells; ++c) { + cellOffsets[c + 1] = cellOffsets[c] + cellCounts[c]; + } + + // Reset counts to use as cursors when filling sortedIndices + std::fill(cellCounts.begin(), cellCounts.begin() + numCells, 0); + + // Fill sortedIndices with particle indices grouped by cell + for (int i = 0; i < n; ++i) { + int cellIdx = particleCells[i]; + int dst = cellOffsets[cellIdx] + cellCounts[cellIdx]++; + sortedIndices[dst] = i; + } + + const int neighborRange = static_cast(std::ceil(R / cellSize)); + const float R2 = R * R; + const float dt = params.deltaTime; + + // Compute forces and integrate + #pragma omp parallel for schedule(static) + for (int i = 0; i < n; ++i) { + const Particle pi = pData[i]; + const float pix = pi.position.x; + const float 
piy = pi.position.y; + + Vec2 totalForce(0.0f, 0.0f); + + const int cellIdx = particleCells[i]; + const int ix = cellIdx % gridW; + const int iy = cellIdx / gridW; + + for (int dy = -neighborRange; dy <= neighborRange; ++dy) { + int ny = iy + dy; + if (ny < 0 || ny >= gridH) continue; + int rowOffset = ny * gridW; + for (int dx = -neighborRange; dx <= neighborRange; ++dx) { + int nx = ix + dx; + if (nx < 0 || nx >= gridW) continue; + int nCellIdx = rowOffset + nx; + + int begin = cellOffsets[nCellIdx]; + int end = cellOffsets[nCellIdx + 1]; + for (int idx = begin; idx < end; ++idx) { + int j = sortedIndices[idx]; + if (j == i) continue; + + const Particle &pj = pData[j]; + + float dxp = pj.position.x - pix; + float dyp = pj.position.y - piy; + float dist2 = dxp * dxp + dyp * dyp; + if (dist2 < R2) { + totalForce += computeForce(pi, pj, R); + } + } + } + } + + outData[i] = updateParticle(pi, totalForce, dt); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gpt5.1_4.cpp b/research/solutions/nbody_simulation/random_100k/gpt5.1_4.cpp new file mode 100644 index 00000000..d76d14e7 --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gpt5.1_4.cpp @@ -0,0 +1,372 @@ +#include "world.h" +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + struct Node { + float minX, maxX, minY, maxY; + float cx, cy; + float mass; + float size; // max(maxX-minX, maxY-minY) + int child[4]; // indices of children, -1 if none + int body; // index of first body if leaf, -1 otherwise + int bodyCount; // number of bodies in this leaf (for maxDepth leaves) + bool isLeaf; + }; + + std::vector nodes; + std::vector nextBody; // linked list for bodies in multi-body leaves + + int maxDepth; + float theta; + float theta2; + int numThreads; + + int createNode(float minX, float maxX, float minY, float maxY) { + Node node; + node.minX = minX; + node.maxX = maxX; + node.minY = minY; + node.maxY = maxY; + node.cx = 0.0f; + node.cy = 0.0f; + node.mass = 0.0f; + node.size = std::max(maxX - minX, maxY - minY); + node.body = -1; + node.bodyCount = 0; + node.isLeaf = true; + node.child[0] = node.child[1] = node.child[2] = node.child[3] = -1; + nodes.push_back(node); + return static_cast(nodes.size()) - 1; + } + + void subdivideNode(int nodeIdx) { + Node &node = nodes[nodeIdx]; + float midX = 0.5f * (node.minX + node.maxX); + float midY = 0.5f * (node.minY + node.maxY); + // SW + node.child[0] = createNode(node.minX, midX, node.minY, midY); + // SE + node.child[1] = createNode(midX, node.maxX, node.minY, midY); + // NW + node.child[2] = createNode(node.minX, midX, midY, node.maxY); + // NE + node.child[3] = createNode(midX, node.maxX, midY, node.maxY); + node.isLeaf = false; + } + + int getQuadrant(const Node &node, const Particle &p) const { + float midX = 0.5f * (node.minX + node.maxX); + float midY = 0.5f * (node.minY + node.maxY); + int quad = 0; + if (p.position.y >= midY) quad += 2; // north + if (p.position.x >= midX) quad += 1; // east + return quad; + } + + void insertBodyToChild(int nodeIdx, int bodyIdx, int depth, + const std::vector &particles) { + Node &node = nodes[nodeIdx]; + int quad = getQuadrant(node, particles[bodyIdx]); + int childIdx = node.child[quad]; + if (childIdx == -1) { + float midX = 0.5f * (node.minX + node.maxX); + float midY = 0.5f * (node.minY + node.maxY); + float cminX, cmaxX, cminY, cmaxY; + if (quad & 1) { // east + cminX = midX; + 
cmaxX = node.maxX; + } else { // west + cminX = node.minX; + cmaxX = midX; + } + if (quad & 2) { // north + cminY = midY; + cmaxY = node.maxY; + } else { // south + cminY = node.minY; + cmaxY = midY; + } + childIdx = createNode(cminX, cmaxX, cminY, cmaxY); + node.child[quad] = childIdx; + } + insertBody(childIdx, bodyIdx, depth, particles); + } + + void insertBody(int nodeIdx, int bodyIdx, int depth, + const std::vector &particles) { + Node &node = nodes[nodeIdx]; + if (node.isLeaf) { + if (node.body == -1) { + node.body = bodyIdx; + node.bodyCount = 1; + nextBody[bodyIdx] = -1; + } else if (depth >= maxDepth) { + // Multi-body leaf at maximum depth + nextBody[bodyIdx] = node.body; + node.body = bodyIdx; + node.bodyCount++; + } else { + // Subdivide leaf and reinsert existing bodies + subdivideNode(nodeIdx); + int existing = node.body; + node.body = -1; + node.bodyCount = 0; + while (existing != -1) { + int next = nextBody[existing]; + insertBodyToChild(nodeIdx, existing, depth + 1, particles); + existing = next; + } + // Insert new body + insertBodyToChild(nodeIdx, bodyIdx, depth + 1, particles); + } + } else { + insertBodyToChild(nodeIdx, bodyIdx, depth + 1, particles); + } + } + + void computeMass(int nodeIdx, const std::vector &particles) { + Node &node = nodes[nodeIdx]; + if (node.isLeaf) { + if (node.body == -1) { + node.mass = 0.0f; + node.cx = node.cy = 0.0f; + return; + } + float mSum = 0.0f; + float xSum = 0.0f; + float ySum = 0.0f; + int idx = node.body; + while (idx != -1) { + float m = particles[idx].mass; + mSum += m; + xSum += particles[idx].position.x * m; + ySum += particles[idx].position.y * m; + idx = nextBody[idx]; + } + node.mass = mSum; + if (mSum > 0.0f) { + float invM = 1.0f / mSum; + node.cx = xSum * invM; + node.cy = ySum * invM; + } else { + node.cx = node.cy = 0.0f; + } + } else { + float mSum = 0.0f; + float xSum = 0.0f; + float ySum = 0.0f; + for (int k = 0; k < 4; ++k) { + int c = node.child[k]; + if (c != -1) { + computeMass(c, particles); + float cm = nodes[c].mass; + if (cm > 0.0f) { + mSum += cm; + xSum += nodes[c].cx * cm; + ySum += nodes[c].cy * cm; + } + } + } + node.mass = mSum; + if (mSum > 0.0f) { + float invM = 1.0f / mSum; + node.cx = xSum * invM; + node.cy = ySum * invM; + } else { + node.cx = node.cy = 0.0f; + } + } + } + + inline Vec2 computeAggregateForce(const Particle &target, float mass, + float posX, float posY, + float cullRadius) const { + Vec2 dir; + dir.x = posX - target.position.x; + dir.y = posY - target.position.y; + float dist = dir.length(); + if (dist < 1e-3f) + return Vec2(0.0f, 0.0f); + dir *= (1.0f / dist); + if (dist > cullRadius) + return Vec2(0.0f, 0.0f); + if (dist < 1e-1f) + dist = 1e-1f; + const float G = 0.01f; + Vec2 force = dir * target.mass * mass * (G / (dist * dist)); + if (dist > cullRadius * 0.75f) { + float decay = + 1.0f - (dist - cullRadius * 0.75f) / (cullRadius * 0.25f); + force *= decay; + } + return force; + } + + void traverseNode(int nodeIdx, int targetIdx, + const std::vector &particles, + Vec2 &forceAcc, float cullRadius, + float cullRadius2) const { + const Node &node = nodes[nodeIdx]; + if (node.mass <= 0.0f) + return; + + const Particle &target = particles[targetIdx]; + float tx = target.position.x; + float ty = target.position.y; + + // Minimum distance squared from target to node's AABB + float dx = 0.0f; + if (tx < node.minX) + dx = node.minX - tx; + else if (tx > node.maxX) + dx = tx - node.maxX; + + float dy = 0.0f; + if (ty < node.minY) + dy = node.minY - ty; + else if (ty > node.maxY) + dy = 
ty - node.maxY; + + float minDist2 = dx * dx + dy * dy; + if (minDist2 > cullRadius2) + return; + + if (node.isLeaf) { + int b = node.body; + while (b != -1) { + if (b != targetIdx) { + forceAcc += computeForce(target, particles[b], cullRadius); + } + b = nextBody[b]; + } + return; + } + + // Distance squared from target to node's center of mass + float dxcm = node.cx - tx; + float dycm = node.cy - ty; + float dist2cm = dxcm * dxcm + dycm * dycm; + + // Maximum distance squared from target to node's corners + float dxmax = std::max(std::fabs(tx - node.minX), + std::fabs(tx - node.maxX)); + float dymax = std::max(std::fabs(ty - node.minY), + std::fabs(ty - node.maxY)); + float maxDist2 = dxmax * dxmax + dymax * dymax; + bool fullyInside = (maxDist2 <= cullRadius2); + + bool usedApprox = false; + if (fullyInside && dist2cm > 0.0f) { + float s = node.size; + // Barnes-Hut opening criterion: s / sqrt(dist2) < theta + if (s * s < theta2 * dist2cm) { + Vec2 f = + computeAggregateForce(target, node.mass, node.cx, node.cy, + cullRadius); + forceAcc += f; + usedApprox = true; + } + } + + if (!usedApprox) { + for (int k = 0; k < 4; ++k) { + int c = node.child[k]; + if (c != -1) { + traverseNode(c, targetIdx, particles, forceAcc, + cullRadius, cullRadius2); + } + } + } + } + +public: + MySimulator() { + maxDepth = 20; + theta = 0.5f; + theta2 = theta * theta; + numThreads = 0; + } + + void init(int numParticles, StepParameters params) override { + numThreads = omp_get_max_threads(); + if (numThreads <= 0) + numThreads = 1; + omp_set_num_threads(numThreads); + nodes.reserve(numParticles * 4); + nextBody.assign(numParticles, -1); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + int N = static_cast(particles.size()); + if (N == 0) + return; + + if (static_cast(nextBody.size()) < N) { + nextBody.assign(N, -1); + } else { + std::fill(nextBody.begin(), nextBody.begin() + N, -1); + } + + // Compute bounding box of all particles + float minX = particles[0].position.x; + float maxX = particles[0].position.x; + float minY = particles[0].position.y; + float maxY = particles[0].position.y; + + for (int i = 1; i < N; ++i) { + float x = particles[i].position.x; + float y = particles[i].position.y; + if (x < minX) minX = x; + if (x > maxX) maxX = x; + if (y < minY) minY = y; + if (y > maxY) maxY = y; + } + + float padding = 1e-3f; + if (maxX - minX < 1e-3f) { + maxX = minX + 0.5f; + minX = minX - 0.5f; + } else { + minX -= padding; + maxX += padding; + } + if (maxY - minY < 1e-3f) { + maxY = minY + 0.5f; + minY = minY - 0.5f; + } else { + minY -= padding; + maxY += padding; + } + + nodes.clear(); + nodes.reserve(N * 4); + int rootIdx = createNode(minX, maxX, minY, maxY); + + for (int i = 0; i < N; ++i) { + insertBody(rootIdx, i, 0, particles); + } + + computeMass(rootIdx, particles); + + float cullRadius = params.cullRadius; + float cullRadius2 = cullRadius * cullRadius; + + #pragma omp parallel for schedule(static) + for (int i = 0; i < N; ++i) { + Vec2 force(0.0f, 0.0f); + traverseNode(rootIdx, i, particles, force, cullRadius, cullRadius2); + newParticles[i] = + updateParticle(particles[i], force, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gpt5.2.cpp b/research/solutions/nbody_simulation/random_100k/gpt5.2.cpp new file mode 100644 index 00000000..07144dfd --- /dev/null +++ 
b/research/solutions/nbody_simulation/random_100k/gpt5.2.cpp @@ -0,0 +1,365 @@ +#include "world.h" +#include + +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + struct Node { + float cx = 0.0f, cy = 0.0f, half = 0.0f; // square AABB: [cx-half,cx+half] x [cy-half,cy+half] + int child[4] = {-1, -1, -1, -1}; // 0:LB, 1:RB, 2:LT, 3:RT + int particle = -1; // >=0 single particle index, -1 empty/internal, -2 bucket + int bucketHead = -1; + int bucketCount = 0; + + int count = 0; // particles in subtree + float mass = 0.0f; // total mass + float comx = 0.0f; // center of mass + float comy = 0.0f; + + inline bool isLeaf() const { return child[0] == -1; } + inline bool isInternal() const { return child[0] != -1; } + }; + + int numThreads = 16; + + std::vector nodes; + + std::vector bucketParticle; + std::vector bucketNext; + + std::vector traversalOrder; + + int maxDepth = 24; + float minHalf = 1e-5f; + + float theta = 0.30f; + float theta2 = theta * theta; + float minApproxDist2 = 0.04f; // (0.2)^2 + + inline int newNode(float cx, float cy, float half) { + Node n; + n.cx = cx; + n.cy = cy; + n.half = half; + nodes.push_back(n); + return (int)nodes.size() - 1; + } + + inline int quadrant(const Vec2 &p, const Node &n) const { + int q = 0; + if (p.x >= n.cx) q |= 1; + if (p.y >= n.cy) q |= 2; + return q; + } + + inline void addToBucket(Node &n, int pIdx) { + int e = (int)bucketParticle.size(); + bucketParticle.push_back(pIdx); + bucketNext.push_back(n.bucketHead); + n.bucketHead = e; + n.bucketCount++; + n.particle = -2; + } + + void createChildren(int idx) { + Node &n = nodes[idx]; + float h2 = n.half * 0.5f; + for (int q = 0; q < 4; q++) { + float ccx = n.cx + ((q & 1) ? h2 : -h2); + float ccy = n.cy + ((q & 2) ? 
h2 : -h2); + n.child[q] = newNode(ccx, ccy, h2); + } + } + + void insertParticle(int nodeIdx, int pIdx, int depth, const std::vector &particles) { + Node &n = nodes[nodeIdx]; + + if (n.isLeaf()) { + if (n.particle == -1 && n.bucketCount == 0) { + n.particle = pIdx; + return; + } + + if (n.particle >= 0) { + int old = n.particle; + + if (depth >= maxDepth || n.half <= minHalf) { + n.particle = -2; + n.bucketHead = -1; + n.bucketCount = 0; + addToBucket(n, old); + addToBucket(n, pIdx); + return; + } + + n.particle = -1; + createChildren(nodeIdx); + + int qOld = quadrant(particles[old].position, n); + int qNew = quadrant(particles[pIdx].position, n); + + insertParticle(n.child[qOld], old, depth + 1, particles); + insertParticle(n.child[qNew], pIdx, depth + 1, particles); + return; + } + + if (n.particle == -2) { + addToBucket(n, pIdx); + return; + } + + // Should not reach here + n.particle = pIdx; + return; + } else { + int q = quadrant(particles[pIdx].position, n); + insertParticle(n.child[q], pIdx, depth + 1, particles); + } + } + + void buildTree(const std::vector &particles) { + const int N = (int)particles.size(); + nodes.clear(); + bucketParticle.clear(); + bucketNext.clear(); + traversalOrder.clear(); + + if (N == 0) return; + + nodes.reserve((size_t)4 * (size_t)N + 1024); + bucketParticle.reserve((size_t)N / 8 + 1024); + bucketNext.reserve((size_t)N / 8 + 1024); + + float minx = particles[0].position.x, maxx = minx; + float miny = particles[0].position.y, maxy = miny; + for (int i = 1; i < N; i++) { + float x = particles[i].position.x; + float y = particles[i].position.y; + minx = std::min(minx, x); + maxx = std::max(maxx, x); + miny = std::min(miny, y); + maxy = std::max(maxy, y); + } + + float cx = 0.5f * (minx + maxx); + float cy = 0.5f * (miny + maxy); + float size = std::max(maxx - minx, maxy - miny); + if (size < 1e-3f) size = 1e-3f; + float half = 0.5f * size + 1e-3f; + + newNode(cx, cy, half); + + for (int i = 0; i < N; i++) insertParticle(0, i, 0, particles); + + traversalOrder.reserve(nodes.size()); + std::vector st; + st.reserve(nodes.size()); + st.push_back(0); + while (!st.empty()) { + int idx = st.back(); + st.pop_back(); + traversalOrder.push_back(idx); + const Node &n = nodes[idx]; + if (n.isInternal()) { + // push all children + st.push_back(n.child[0]); + st.push_back(n.child[1]); + st.push_back(n.child[2]); + st.push_back(n.child[3]); + } + } + + for (int k = (int)traversalOrder.size() - 1; k >= 0; k--) { + int idx = traversalOrder[k]; + Node &n = nodes[idx]; + + if (n.isLeaf()) { + if (n.particle >= 0) { + const Particle &p = particles[n.particle]; + n.count = 1; + n.mass = p.mass; + n.comx = p.position.x; + n.comy = p.position.y; + } else if (n.particle == -2) { + double m = 0.0, sx = 0.0, sy = 0.0; + int e = n.bucketHead; + int c = 0; + while (e != -1) { + int pi = bucketParticle[e]; + const Particle &p = particles[pi]; + double pm = (double)p.mass; + m += pm; + sx += pm * (double)p.position.x; + sy += pm * (double)p.position.y; + c++; + e = bucketNext[e]; + } + n.count = c; + if (m > 0.0) { + n.mass = (float)m; + n.comx = (float)(sx / m); + n.comy = (float)(sy / m); + } else { + n.mass = 0.0f; + n.comx = n.comy = 0.0f; + } + } else { + n.count = 0; + n.mass = 0.0f; + n.comx = n.comy = 0.0f; + } + } else { + double m = 0.0, sx = 0.0, sy = 0.0; + int cnt = 0; + for (int q = 0; q < 4; q++) { + const Node &c = nodes[n.child[q]]; + if (c.count == 0) continue; + double cm = (double)c.mass; + m += cm; + sx += cm * (double)c.comx; + sy += cm * (double)c.comy; + cnt += 
c.count; + } + n.count = cnt; + if (m > 0.0) { + n.mass = (float)m; + n.comx = (float)(sx / m); + n.comy = (float)(sy / m); + } else { + n.mass = 0.0f; + n.comx = n.comy = 0.0f; + } + } + } + } + +public: + void init(int numParticles, StepParameters params) override { + (void)numParticles; + (void)params; + int maxT = omp_get_max_threads(); + numThreads = std::min(16, maxT); + if (numThreads < 1) numThreads = 1; + omp_set_num_threads(numThreads); + + nodes.clear(); + bucketParticle.clear(); + bucketNext.clear(); + traversalOrder.clear(); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + const int N = (int)particles.size(); + if ((int)newParticles.size() != N) newParticles.resize(N); + if (N == 0) return; + + buildTree(particles); + + const float r = params.cullRadius; + const float r2 = r * r; + const float dt = params.deltaTime; + + #pragma omp parallel + { + std::vector stack; + stack.reserve(512); + + #pragma omp for schedule(guided, 64) + for (int i = 0; i < N; i++) { + const Particle &pi = particles[i]; + const float px = pi.position.x; + const float py = pi.position.y; + + float fx = 0.0f, fy = 0.0f; + + stack.clear(); + stack.push_back(0); + + while (!stack.empty()) { + int ni = stack.back(); + stack.pop_back(); + + const Node &n = nodes[ni]; + if (n.count == 0) continue; + + float adx = std::fabs(px - n.cx); + float ady = std::fabs(py - n.cy); + + float dx = adx - n.half; + float dy = ady - n.half; + if (dx < 0.0f) dx = 0.0f; + if (dy < 0.0f) dy = 0.0f; + if (dx * dx + dy * dy > r2) continue; + + if (n.isLeaf()) { + if (n.particle >= 0) { + int pj = n.particle; + if (pj != i) { + Vec2 f = computeForce(pi, particles[pj], r); + fx += f.x; + fy += f.y; + } + } else if (n.particle == -2) { + int e = n.bucketHead; + while (e != -1) { + int pj = bucketParticle[e]; + if (pj != i) { + Vec2 f = computeForce(pi, particles[pj], r); + fx += f.x; + fy += f.y; + } + e = bucketNext[e]; + } + } + continue; + } + + bool contains = (adx <= n.half) && (ady <= n.half); + + float dxmax = adx + n.half; + float dymax = ady + n.half; + bool fullyInside = (dxmax * dxmax + dymax * dymax) <= r2; + + if (fullyInside && !contains) { + float dcomx = n.comx - px; + float dcomy = n.comy - py; + float d2 = dcomx * dcomx + dcomy * dcomy; + if (d2 > minApproxDist2) { + float s = n.half * 2.0f; + if ((s * s) < (theta2 * d2)) { + Particle agg; + agg.id = -1; + agg.mass = n.mass; + agg.position = Vec2(n.comx, n.comy); + agg.velocity = Vec2(0.0f, 0.0f); + + Vec2 f = computeForce(pi, agg, r); + fx += f.x; + fy += f.y; + continue; + } + } + } + + stack.push_back(n.child[0]); + stack.push_back(n.child[1]); + stack.push_back(n.child[2]); + stack.push_back(n.child[3]); + } + + newParticles[i] = updateParticle(pi, Vec2(fx, fy), dt); + } + } + } +}; + +Simulator *createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gpt5.2_1.cpp b/research/solutions/nbody_simulation/random_100k/gpt5.2_1.cpp new file mode 100644 index 00000000..a3da1e2c --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gpt5.2_1.cpp @@ -0,0 +1,402 @@ +#include "world.h" +#include +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + static constexpr float G = 0.01f; + + struct Node { + float cx = 0.0f, cy = 0.0f; + float half = 0.0f; + + float mass = 0.0f; + float sumx = 0.0f, sumy = 0.0f; // weighted sums + float comx = 0.0f, comy = 0.0f; + + 
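+        // Quadrant encoding (see quadrant() below): index = (y >= cy ? 2 : 0) | (x >= cx ? 1 : 0),
+        // so child 0 covers the x < cx, y < cy quadrant and child 3 covers x >= cx, y >= cy.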
int child[4] = {-1, -1, -1, -1}; // 0: (x=cx,y=cy), 3: (x>=cx,y>=cy) + int count = 0; // >=0 leaf; -1 internal + + static constexpr int CAP = 12; + int idx[CAP]; + }; + + int numThreads = 16; + int nCached = 0; + + std::vector px, py, pm; + + std::vector nodes; + std::vector order; // for postorder aggregate computation + + static inline void boxMinMaxDist2(float px, float py, const Node &nd, float &minD2, float &maxD2) { + float dx = std::fabs(px - nd.cx); + float dy = std::fabs(py - nd.cy); + + float ox = dx - nd.half; + float oy = dy - nd.half; + if (ox < 0.0f) ox = 0.0f; + if (oy < 0.0f) oy = 0.0f; + minD2 = ox * ox + oy * oy; + + float fx = dx + nd.half; + float fy = dy + nd.half; + maxD2 = fx * fx + fy * fy; + } + + static inline bool containsPoint(float x, float y, const Node &nd) { + return (std::fabs(x - nd.cx) <= nd.half) && (std::fabs(y - nd.cy) <= nd.half); + } + + inline int newNode(float cx, float cy, float half) { + Node nd; + nd.cx = cx; nd.cy = cy; nd.half = half; + nd.count = 0; + nodes.push_back(nd); + return (int)nodes.size() - 1; + } + + inline int quadrant(float x, float y, const Node &nd) const { + int qx = (x >= nd.cx) ? 1 : 0; + int qy = (y >= nd.cy) ? 1 : 0; + return (qy << 1) | qx; + } + + void splitNode(int ni, int depth) { + Node &nd = nodes[ni]; + float h2 = nd.half * 0.5f; + float off = h2; + + int c0 = newNode(nd.cx - off, nd.cy - off, h2); + int c1 = newNode(nd.cx + off, nd.cy - off, h2); + int c2 = newNode(nd.cx - off, nd.cy + off, h2); + int c3 = newNode(nd.cx + off, nd.cy + off, h2); + nd.child[0] = c0; nd.child[1] = c1; nd.child[2] = c2; nd.child[3] = c3; + + int oldCount = nd.count; + int oldIdx[Node::CAP]; + for (int k = 0; k < oldCount; k++) oldIdx[k] = nd.idx[k]; + + nd.count = -1; // internal + + for (int k = 0; k < oldCount; k++) { + int pi = oldIdx[k]; + insertParticle(ni, pi, depth); // re-insert from this node; will go into children + } + } + + void insertParticle(int ni, int piIndex, int depth) { + Node &nd = nodes[ni]; + + if (nd.count >= 0) { // leaf + if (nd.count < Node::CAP || depth >= 24 || nd.half <= 1e-4f) { + nd.idx[nd.count++] = piIndex; + return; + } + // split and retry insertion + splitNode(ni, depth); + insertParticle(ni, piIndex, depth); + return; + } + + // internal + int q = quadrant(px[piIndex], py[piIndex], nd); + int ci = nd.child[q]; + insertParticle(ci, piIndex, depth + 1); + } + + void buildTree(int n) { + // bounding box + float minx = std::numeric_limits::infinity(); + float maxx = -std::numeric_limits::infinity(); + float miny = std::numeric_limits::infinity(); + float maxy = -std::numeric_limits::infinity(); + + #pragma omp parallel for reduction(min:minx,miny) reduction(max:maxx,maxy) schedule(static) + for (int i = 0; i < n; i++) { + float x = px[i], y = py[i]; + if (x < minx) minx = x; + if (x > maxx) maxx = x; + if (y < miny) miny = y; + if (y > maxy) maxy = y; + } + + float cx = 0.5f * (minx + maxx); + float cy = 0.5f * (miny + maxy); + float rx = maxx - minx; + float ry = maxy - miny; + float range = std::max(rx, ry); + float half = 0.5f * range + 1e-3f; + if (half < 1e-3f) half = 1e-3f; + + nodes.clear(); + order.clear(); + nodes.reserve((size_t)std::max(2048, n * 2)); + order.reserve((size_t)std::max(2048, n * 2)); + + int root = newNode(cx, cy, half); + + for (int i = 0; i < n; i++) { + insertParticle(root, i, 0); + } + + // create traversal order (preorder), then aggregate in reverse for postorder + std::vector st; + st.reserve(1024); + st.push_back(root); + while (!st.empty()) { + int ni = st.back(); + 
st.pop_back(); + order.push_back(ni); + Node &nd = nodes[ni]; + if (nd.count < 0) { + // push children + for (int k = 0; k < 4; k++) { + int ci = nd.child[k]; + if (ci != -1) st.push_back(ci); + } + } + } + + // reset aggregates + for (auto &nd : nodes) { + nd.mass = 0.0f; + nd.sumx = 0.0f; + nd.sumy = 0.0f; + nd.comx = nd.cx; + nd.comy = nd.cy; + } + + for (int oi = (int)order.size() - 1; oi >= 0; oi--) { + int ni = order[oi]; + Node &nd = nodes[ni]; + + if (nd.count >= 0) { + float m = 0.0f, sx = 0.0f, sy = 0.0f; + for (int k = 0; k < nd.count; k++) { + int pi = nd.idx[k]; + float pmass = pm[pi]; + m += pmass; + sx += pmass * px[pi]; + sy += pmass * py[pi]; + } + nd.mass = m; + nd.sumx = sx; + nd.sumy = sy; + if (m > 0.0f) { + nd.comx = sx / m; + nd.comy = sy / m; + } else { + nd.comx = nd.cx; + nd.comy = nd.cy; + } + } else { + float m = 0.0f, sx = 0.0f, sy = 0.0f; + for (int k = 0; k < 4; k++) { + int ci = nd.child[k]; + if (ci == -1) continue; + const Node &ch = nodes[ci]; + m += ch.mass; + sx += ch.sumx; + sy += ch.sumy; + } + nd.mass = m; + nd.sumx = sx; + nd.sumy = sy; + if (m > 0.0f) { + nd.comx = sx / m; + nd.comy = sy / m; + } else { + nd.comx = nd.cx; + nd.comy = nd.cy; + } + } + } + } + + static inline void addAccelFromMassPoint(float tx, float ty, float axp, float ayp, float amass, + float cullRadius, float rInner, float rQuarter, + float &outAx, float &outAy) { + float dx = axp - tx; + float dy = ayp - ty; + float dist2 = dx * dx + dy * dy; + if (dist2 < 1e-12f) return; + + float dist = std::sqrt(dist2); + if (dist < 1e-3f) return; + if (dist > cullRadius) return; + + float invDist = 1.0f / dist; + float ux = dx * invDist; + float uy = dy * invDist; + + float distEff = (dist < 1e-1f) ? 1e-1f : dist; + float inv = G / (distEff * distEff); + float s = amass * inv; + + if (dist > rInner) { + float decay = 1.0f - (dist - rInner) / rQuarter; + s *= decay; + } + + outAx += ux * s; + outAy += uy * s; + } + +public: + void init(int numParticles, StepParameters /*params*/) override { + nCached = numParticles; + numThreads = std::min(16, omp_get_max_threads()); + omp_set_dynamic(0); + omp_set_num_threads(numThreads); + + px.resize(numParticles); + py.resize(numParticles); + pm.resize(numParticles); + + nodes.reserve((size_t)std::max(2048, numParticles * 2)); + order.reserve((size_t)std::max(2048, numParticles * 2)); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + const int n = (int)particles.size(); + if (n == 0) return; + + if ((int)px.size() != n) { + init(n, params); + } + + #pragma omp parallel for schedule(static) + for (int i = 0; i < n; i++) { + px[i] = particles[i].position.x; + py[i] = particles[i].position.y; + pm[i] = particles[i].mass; + } + + buildTree(n); + + const float r = params.cullRadius; + const float r2 = r * r; + const float rInner = r * 0.75f; + const float rInner2 = rInner * rInner; + const float rQuarter = r * 0.25f; + const float dt = params.deltaTime; + + const float thetaInner = 0.30f; + const float thetaDecay = 0.20f; + const float thetaInner2 = thetaInner * thetaInner; + const float thetaDecay2 = thetaDecay * thetaDecay; + + #pragma omp parallel + { + std::vector st; + st.reserve(512); + + #pragma omp for schedule(guided, 64) + for (int i = 0; i < n; i++) { + float tx = px[i], ty = py[i]; + float ax = 0.0f, ay = 0.0f; + + st.clear(); + st.push_back(0); // root + + while (!st.empty()) { + int ni = st.back(); + st.pop_back(); + const Node &nd = nodes[ni]; + if (nd.mass <= 0.0f) continue; + 
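+                    // Traversal policy: drop the node if its box is entirely beyond the cull radius;
+                    // handle leaves exactly; for internal nodes whose box lies fully inside the cull
+                    // radius and entirely within either the inner region or the decay band, use the
+                    // aggregate mass at the node's center of mass once the opening criterion holds,
+                    // otherwise descend into the children.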
+ float minD2, maxD2; + boxMinMaxDist2(tx, ty, nd, minD2, maxD2); + if (minD2 > r2) continue; + + if (nd.count >= 0) { + // leaf: exact per particle + for (int k = 0; k < nd.count; k++) { + int j = nd.idx[k]; + if (j == i) continue; + + float dx = px[j] - tx; + float dy = py[j] - ty; + float d2 = dx * dx + dy * dy; + if (d2 > r2 || d2 < 1e-12f) continue; + + float dist = std::sqrt(d2); + if (dist < 1e-3f || dist > r) continue; + + float invDist = 1.0f / dist; + float ux = dx * invDist; + float uy = dy * invDist; + + float distEff = (dist < 1e-1f) ? 1e-1f : dist; + float inv = G / (distEff * distEff); + float s = pm[j] * inv; + + if (dist > rInner) { + float decay = 1.0f - (dist - rInner) / rQuarter; + s *= decay; + } + + ax += ux * s; + ay += uy * s; + } + continue; + } + + // internal node + bool contains = containsPoint(tx, ty, nd); + if (contains) { + for (int c = 0; c < 4; c++) { + int ci = nd.child[c]; + if (ci != -1 && nodes[ci].mass > 0.0f) st.push_back(ci); + } + continue; + } + + if (maxD2 <= r2) { + bool allInner = (maxD2 <= rInner2); + bool allDecay = (minD2 >= rInner2); + float useTheta2 = -1.0f; + if (allInner) useTheta2 = thetaInner2; + else if (allDecay) useTheta2 = thetaDecay2; + + if (useTheta2 > 0.0f) { + float dcx = tx - nd.cx; + float dcy = ty - nd.cy; + float dist2Center = dcx * dcx + dcy * dcy + 1e-12f; + float w = nd.half * 2.0f; + float w2 = w * w; + if (w2 < useTheta2 * dist2Center) { + addAccelFromMassPoint(tx, ty, nd.comx, nd.comy, nd.mass, r, rInner, rQuarter, ax, ay); + continue; + } + } + } + + // traverse children + for (int c = 0; c < 4; c++) { + int ci = nd.child[c]; + if (ci != -1 && nodes[ci].mass > 0.0f) st.push_back(ci); + } + } + + Particle out = particles[i]; + out.velocity.x = out.velocity.x + ax * dt; + out.velocity.y = out.velocity.y + ay * dt; + out.position.x = out.position.x + out.velocity.x * dt; + out.position.y = out.position.y + out.velocity.y * dt; + newParticles[i] = out; + } + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gpt5.2_2.cpp b/research/solutions/nbody_simulation/random_100k/gpt5.2_2.cpp new file mode 100644 index 00000000..544f29bc --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gpt5.2_2.cpp @@ -0,0 +1,314 @@ +#include "world.h" +#include + +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + struct Node { + float cx = 0.0f, cy = 0.0f, half = 0.0f; + float mass = 0.0f, comx = 0.0f, comy = 0.0f; + int child[4] = {-1, -1, -1, -1}; // 0: (-x,-y), 1: (+x,-y), 2: (-x,+y), 3: (+x,+y) + int head = -1; // leaf: head of particle linked list, -1 empty; internal: -2 + }; + + int numThreads = 16; + + std::vector px, py, pm; + std::vector nextInLeaf; + std::vector nodes; + + float theta = 0.4f; + float minHalf = 1e-3f; + int maxDepth = 24; + + inline void makeChildren(int idx) { + Node &n = nodes[idx]; + float h = n.half * 0.5f; + int base = (int)nodes.size(); + nodes.resize(base + 4); + for (int q = 0; q < 4; q++) { + Node c; + c.half = h; + c.cx = n.cx + ((q & 1) ? h : -h); + c.cy = n.cy + ((q & 2) ? h : -h); + c.comx = c.cx; + c.comy = c.cy; + c.mass = 0.0f; + c.head = -1; + c.child[0] = c.child[1] = c.child[2] = c.child[3] = -1; + nodes[base + q] = c; + n.child[q] = base + q; + } + } + + inline int quadrant(int nodeIdx, int pIdx) const { + const Node &n = nodes[nodeIdx]; + int q = (px[pIdx] >= n.cx) ? 
1 : 0; + if (py[pIdx] >= n.cy) q |= 2; + return q; + } + + void insertParticle(int nodeIdx, int pIdx, int depth) { + Node &n = nodes[nodeIdx]; + if (n.head != -2) { // leaf + if (n.head == -1) { + n.head = pIdx; + nextInLeaf[pIdx] = -1; + return; + } + if (n.half <= minHalf || depth >= maxDepth) { + nextInLeaf[pIdx] = n.head; + n.head = pIdx; + return; + } + + int oldHead = n.head; + n.head = -2; + makeChildren(nodeIdx); + + int q = oldHead; + while (q != -1) { + int nq = nextInLeaf[q]; + nextInLeaf[q] = -1; + int quad = quadrant(nodeIdx, q); + insertParticle(n.child[quad], q, depth + 1); + q = nq; + } + + nextInLeaf[pIdx] = -1; + int quad = quadrant(nodeIdx, pIdx); + insertParticle(n.child[quad], pIdx, depth + 1); + return; + } + + int quad = quadrant(nodeIdx, pIdx); + insertParticle(n.child[quad], pIdx, depth + 1); + } + + void computeMassCOM() { + std::vector order; + order.reserve(nodes.size()); + std::vector st; + st.reserve(256); + st.push_back(0); + while (!st.empty()) { + int idx = st.back(); + st.pop_back(); + order.push_back(idx); + const Node &n = nodes[idx]; + if (n.head == -2) { + for (int k = 0; k < 4; k++) { + int c = n.child[k]; + if (c != -1) st.push_back(c); + } + } + } + + for (int oi = (int)order.size() - 1; oi >= 0; --oi) { + int idx = order[oi]; + Node &n = nodes[idx]; + if (n.head != -2) { + float m = 0.0f, sx = 0.0f, sy = 0.0f; + for (int p = n.head; p != -1; p = nextInLeaf[p]) { + float mpv = pm[p]; + m += mpv; + sx += mpv * px[p]; + sy += mpv * py[p]; + } + n.mass = m; + if (m > 0.0f) { + float inv = 1.0f / m; + n.comx = sx * inv; + n.comy = sy * inv; + } else { + n.comx = n.cx; + n.comy = n.cy; + } + } else { + float m = 0.0f, sx = 0.0f, sy = 0.0f; + for (int k = 0; k < 4; k++) { + int c = n.child[k]; + if (c == -1) continue; + const Node &ch = nodes[c]; + float cm = ch.mass; + m += cm; + sx += cm * ch.comx; + sy += cm * ch.comy; + } + n.mass = m; + if (m > 0.0f) { + float inv = 1.0f / m; + n.comx = sx * inv; + n.comy = sy * inv; + } else { + n.comx = n.cx; + n.comy = n.cy; + } + } + } + } + +public: + void init(int numParticles, StepParameters params) override { + (void)params; + omp_set_dynamic(0); + int maxT = omp_get_max_threads(); + numThreads = std::min(16, maxT > 0 ? 
maxT : 16); + omp_set_num_threads(numThreads); + + px.resize(numParticles); + py.resize(numParticles); + pm.resize(numParticles); + nextInLeaf.resize(numParticles); + nodes.reserve(std::max(8, numParticles * 4)); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + const int n = (int)particles.size(); + if ((int)newParticles.size() != n) newParticles.resize(n); + + if ((int)px.size() != n) { + px.resize(n); + py.resize(n); + pm.resize(n); + nextInLeaf.resize(n); + } + + float minx = FLT_MAX, miny = FLT_MAX, maxx = -FLT_MAX, maxy = -FLT_MAX; + + #pragma omp parallel for schedule(static) reduction(min:minx,miny) reduction(max:maxx,maxy) + for (int i = 0; i < n; i++) { + const auto &p = particles[i]; + float x = p.position.x; + float y = p.position.y; + px[i] = x; + py[i] = y; + pm[i] = p.mass; + nextInLeaf[i] = -1; + minx = std::min(minx, x); + miny = std::min(miny, y); + maxx = std::max(maxx, x); + maxy = std::max(maxy, y); + } + + float cx = 0.5f * (minx + maxx); + float cy = 0.5f * (miny + maxy); + float rx = maxx - minx; + float ry = maxy - miny; + float range = std::max(rx, ry); + if (!(range > 0.0f)) range = 1.0f; + float half = 0.5f * range * 1.02f + 1e-2f; + + nodes.clear(); + Node root; + root.cx = cx; + root.cy = cy; + root.half = half; + root.mass = 0.0f; + root.comx = cx; + root.comy = cy; + root.head = -1; + root.child[0] = root.child[1] = root.child[2] = root.child[3] = -1; + nodes.push_back(root); + + for (int i = 0; i < n; i++) insertParticle(0, i, 0); + computeMassCOM(); + + const float r = params.cullRadius; + const float r2 = r * r; + const float dt = params.deltaTime; + const float theta2 = theta * theta; + + #pragma omp parallel + { + std::vector stack; + stack.reserve(256); + + Particle agg; + agg.id = -1; + agg.velocity = Vec2(0.0f, 0.0f); + + #pragma omp for schedule(guided, 32) + for (int i = 0; i < n; i++) { + const Particle &pi = particles[i]; + const float pix = px[i]; + const float piy = py[i]; + + float fx = 0.0f, fy = 0.0f; + + stack.clear(); + stack.push_back(0); + + while (!stack.empty()) { + int idx = stack.back(); + stack.pop_back(); + + const Node &nd = nodes[idx]; + if (!(nd.mass > 0.0f)) continue; + + float adx = std::fabs(pix - nd.cx); + float ady = std::fabs(piy - nd.cy); + + float dx = adx - nd.half; + float dy = ady - nd.half; + if (dx < 0.0f) dx = 0.0f; + if (dy < 0.0f) dy = 0.0f; + float minDist2 = dx * dx + dy * dy; + if (minDist2 > r2) continue; + + if (nd.head != -2) { + for (int p = nd.head; p != -1; p = nextInLeaf[p]) { + if (p == i) continue; + float ddx = px[p] - pix; + float ddy = py[p] - piy; + float dist2 = ddx * ddx + ddy * ddy; + if (dist2 > r2 || dist2 < 1e-6f) continue; + Vec2 f = computeForce(pi, particles[p], r); + fx += f.x; + fy += f.y; + } + continue; + } + + float mdx = adx + nd.half; + float mdy = ady + nd.half; + bool fullyInside = (mdx * mdx + mdy * mdy <= r2); + bool containsSelf = (adx <= nd.half && ady <= nd.half); + + if (fullyInside && !containsSelf) { + float cdx = nd.comx - pix; + float cdy = nd.comy - piy; + float dist2 = cdx * cdx + cdy * cdy; + float size = nd.half * 2.0f; + if (dist2 > 1e-6f && (size * size <= theta2 * dist2)) { + agg.mass = nd.mass; + agg.position = Vec2(nd.comx, nd.comy); + Vec2 f = computeForce(pi, agg, r); + fx += f.x; + fy += f.y; + continue; + } + } + + for (int k = 0; k < 4; k++) { + int c = nd.child[k]; + if (c != -1 && nodes[c].mass > 0.0f) stack.push_back(c); + } + } + + newParticles[i] = updateParticle(pi, Vec2(fx, fy), dt); 
+ } + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gpt5.2_3.cpp b/research/solutions/nbody_simulation/random_100k/gpt5.2_3.cpp new file mode 100644 index 00000000..077d6d50 --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gpt5.2_3.cpp @@ -0,0 +1,379 @@ +#include "world.h" +#include + +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + static constexpr int LEAF_CAP = 8; + static constexpr int MAX_DEPTH = 18; + static constexpr float MIN_SIZE = 1e-4f; + + struct Node { + float minx = 0.0f, miny = 0.0f, size = 0.0f; + int child[4] = {-1, -1, -1, -1}; // 0: (0,0), 1: (1,0), 2: (0,1), 3: (1,1) + int count = 0; + int p[LEAF_CAP]; + std::vector extra; // only used if max-depth reached with too many particles + float mass = 0.0f; + float comx = 0.0f, comy = 0.0f; + }; + + int numThreads = 16; + + std::vector nodes; + std::vector order; + + std::vector px, py, vx, vy, pm; + + inline int newNode(float minx, float miny, float size) { + Node n; + n.minx = minx; + n.miny = miny; + n.size = size; + nodes.push_back(std::move(n)); + return (int)nodes.size() - 1; + } + + inline int quadrant(const Node &n, float x, float y) const { + float midx = n.minx + n.size * 0.5f; + float midy = n.miny + n.size * 0.5f; + int q = (x >= midx) ? 1 : 0; + q |= (y >= midy) ? 2 : 0; + return q; + } + + inline void subdivide(int idx) { + Node &n = nodes[idx]; + float h = n.size * 0.5f; + for (int q = 0; q < 4; q++) { + float cx = n.minx + ((q & 1) ? h : 0.0f); + float cy = n.miny + ((q & 2) ? h : 0.0f); + n.child[q] = newNode(cx, cy, h); + } + } + + void insertParticle(int idx, int pid, int depth) { + Node &n = nodes[idx]; + if (n.child[0] == -1) { + if (n.count < LEAF_CAP) { + n.p[n.count++] = pid; + return; + } + if (depth >= MAX_DEPTH || n.size <= MIN_SIZE) { + n.extra.push_back(pid); + return; + } + + int tmp[LEAF_CAP + 1]; + for (int k = 0; k < LEAF_CAP; k++) tmp[k] = n.p[k]; + tmp[LEAF_CAP] = pid; + + n.count = 0; + subdivide(idx); + + for (int k = 0; k < LEAF_CAP + 1; k++) { + int id = tmp[k]; + const Node &nn = nodes[idx]; + int q = quadrant(nn, px[id], py[id]); + insertParticle(nn.child[q], id, depth + 1); + } + return; + } else { + int q = quadrant(n, px[pid], py[pid]); + insertParticle(n.child[q], pid, depth + 1); + } + } + + inline float minDist2ToAABB(const Node &n, float x, float y) const { + float maxx = n.minx + n.size; + float maxy = n.miny + n.size; + float dx = 0.0f; + if (x < n.minx) dx = n.minx - x; + else if (x > maxx) dx = x - maxx; + float dy = 0.0f; + if (y < n.miny) dy = n.miny - y; + else if (y > maxy) dy = y - maxy; + return dx * dx + dy * dy; + } + + inline float maxDist2ToAABB(const Node &n, float x, float y) const { + float maxx = n.minx + n.size; + float maxy = n.miny + n.size; + float dx0 = std::fabs(x - n.minx); + float dx1 = std::fabs(x - maxx); + float dy0 = std::fabs(y - n.miny); + float dy1 = std::fabs(y - maxy); + float dx = (dx0 > dx1) ? dx0 : dx1; + float dy = (dy0 > dy1) ? 
dy0 : dy1; + return dx * dx + dy * dy; + } + + inline bool containsPointHalfOpen(const Node &n, float x, float y) const { + float maxx = n.minx + n.size; + float maxy = n.miny + n.size; + return (x >= n.minx && x < maxx && y >= n.miny && y < maxy); + } + + inline void addForcePair(float tx, float ty, float tmass, + float ax, float ay, float amass, + float r, float r2, float decayStart, float invDecayRange, + float &fx, float &fy) const { + float dx = ax - tx; + float dy = ay - ty; + float dist2 = dx * dx + dy * dy; + if (dist2 < 1e-6f || dist2 >= r2) return; + + float dist = std::sqrt(dist2); + if (dist < 1e-3f) return; + + float invDist = 1.0f / dist; + float dirx = dx * invDist; + float diry = dy * invDist; + + float d = (dist < 1e-1f) ? 1e-1f : dist; + + const float G = 0.01f; + float invd2 = 1.0f / (d * d); + float fmag = (G * tmass * amass) * invd2; + + if (d > decayStart) { + float decay = 1.0f - (d - decayStart) * invDecayRange; + if (decay <= 0.0f) return; + fmag *= decay; + } + + fx += dirx * fmag; + fy += diry * fmag; + } + + void buildTree() { + int n = (int)px.size(); + nodes.clear(); + order.clear(); + if (n == 0) return; + + float minx = px[0], maxx = px[0]; + float miny = py[0], maxy = py[0]; + for (int i = 1; i < n; i++) { + float x = px[i], y = py[i]; + if (x < minx) minx = x; + if (x > maxx) maxx = x; + if (y < miny) miny = y; + if (y > maxy) maxy = y; + } + + float spanx = maxx - minx; + float spany = maxy - miny; + float size = (spanx > spany) ? spanx : spany; + float pad = 1e-3f + 0.01f * size; + size = size + 2.0f * pad; + minx -= pad; + miny -= pad; + + nodes.reserve((size_t)std::max(4 * n, 1024)); + newNode(minx, miny, size); + + for (int i = 0; i < n; i++) { + insertParticle(0, i, 0); + } + + // Build traversal order (preorder), then compute mass/COM bottom-up. + order.reserve(nodes.size()); + std::vector st; + st.reserve(256); + st.push_back(0); + while (!st.empty()) { + int idx = st.back(); + st.pop_back(); + order.push_back(idx); + Node &nd = nodes[idx]; + if (nd.child[0] != -1) { + // push in reverse so 0 is processed first in the implicit traversal + st.push_back(nd.child[3]); + st.push_back(nd.child[2]); + st.push_back(nd.child[1]); + st.push_back(nd.child[0]); + } + } + + for (int oi = (int)order.size() - 1; oi >= 0; oi--) { + int idx = order[oi]; + Node &nd = nodes[idx]; + if (nd.child[0] == -1) { + float m = 0.0f, sx = 0.0f, sy = 0.0f; + for (int k = 0; k < nd.count; k++) { + int pid = nd.p[k]; + float mm = pm[pid]; + m += mm; + sx += mm * px[pid]; + sy += mm * py[pid]; + } + for (int pid : nd.extra) { + float mm = pm[pid]; + m += mm; + sx += mm * px[pid]; + sy += mm * py[pid]; + } + nd.mass = m; + if (m > 0.0f) { + nd.comx = sx / m; + nd.comy = sy / m; + } else { + nd.comx = nd.minx + nd.size * 0.5f; + nd.comy = nd.miny + nd.size * 0.5f; + } + } else { + float m = 0.0f, sx = 0.0f, sy = 0.0f; + for (int c = 0; c < 4; c++) { + int ci = nd.child[c]; + const Node &ch = nodes[ci]; + float cm = ch.mass; + m += cm; + sx += cm * ch.comx; + sy += cm * ch.comy; + } + nd.mass = m; + if (m > 0.0f) { + nd.comx = sx / m; + nd.comy = sy / m; + } else { + nd.comx = nd.minx + nd.size * 0.5f; + nd.comy = nd.miny + nd.size * 0.5f; + } + } + } + } + +public: + void init(int numParticles, StepParameters) override { + int hw = omp_get_max_threads(); + numThreads = std::min(16, hw > 0 ? 
hw : 16); + omp_set_num_threads(numThreads); + + px.resize(numParticles); + py.resize(numParticles); + vx.resize(numParticles); + vy.resize(numParticles); + pm.resize(numParticles); + + nodes.clear(); + order.clear(); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + int n = (int)particles.size(); + if ((int)newParticles.size() != n) newParticles.resize(n); + if ((int)px.size() != n) { + px.resize(n); py.resize(n); vx.resize(n); vy.resize(n); pm.resize(n); + } + + for (int i = 0; i < n; i++) { + px[i] = particles[i].position.x; + py[i] = particles[i].position.y; + vx[i] = particles[i].velocity.x; + vy[i] = particles[i].velocity.y; + pm[i] = particles[i].mass; + } + + buildTree(); + if (nodes.empty()) { + for (int i = 0; i < n; i++) newParticles[i] = particles[i]; + return; + } + + const float r = params.cullRadius; + const float r2 = r * r; + const float decayStart = r * 0.75f; + const float invDecayRange = (r > 0.0f) ? (1.0f / (r * 0.25f)) : 0.0f; + const float dt = params.deltaTime; + + const float theta = 0.35f; + const float theta2 = theta * theta; + + #pragma omp parallel + { + std::vector st; + st.reserve(256); + + #pragma omp for schedule(guided, 32) + for (int i = 0; i < n; i++) { + float tx = px[i], ty = py[i], tm = pm[i]; + float fx = 0.0f, fy = 0.0f; + + st.clear(); + st.push_back(0); + + while (!st.empty()) { + int idx = st.back(); + st.pop_back(); + const Node &nd = nodes[idx]; + + if (nd.mass <= 0.0f) continue; + + float mind2 = minDist2ToAABB(nd, tx, ty); + if (mind2 >= r2) continue; + + bool leaf = (nd.child[0] == -1); + float maxd2 = maxDist2ToAABB(nd, tx, ty); + bool fullyInside = (maxd2 < r2); + bool contains = containsPointHalfOpen(nd, tx, ty); + + if (fullyInside && !contains) { + float dx = nd.comx - tx; + float dy = nd.comy - ty; + float dist2 = dx * dx + dy * dy; + if (dist2 > 1e-6f) { + // opening criterion without sqrt: (size / dist) < theta + if ((nd.size * nd.size) < (theta2 * dist2)) { + addForcePair(tx, ty, tm, nd.comx, nd.comy, nd.mass, r, r2, decayStart, invDecayRange, fx, fy); + continue; + } + } + } + + if (leaf) { + for (int k = 0; k < nd.count; k++) { + int j = nd.p[k]; + if (j == i) continue; + addForcePair(tx, ty, tm, px[j], py[j], pm[j], r, r2, decayStart, invDecayRange, fx, fy); + } + for (int j : nd.extra) { + if (j == i) continue; + addForcePair(tx, ty, tm, px[j], py[j], pm[j], r, r2, decayStart, invDecayRange, fx, fy); + } + } else { + // push children in reverse so 0 is processed first + st.push_back(nd.child[3]); + st.push_back(nd.child[2]); + st.push_back(nd.child[1]); + st.push_back(nd.child[0]); + } + } + + float invm = 1.0f / tm; + float nvx = vx[i] + fx * (dt * invm); + float nvy = vy[i] + fy * (dt * invm); + float npx = tx + nvx * dt; + float npy = ty + nvy * dt; + + Particle out = particles[i]; + out.velocity.x = nvx; + out.velocity.y = nvy; + out.position.x = npx; + out.position.y = npy; + newParticles[i] = out; + } + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gpt5.2_4.cpp b/research/solutions/nbody_simulation/random_100k/gpt5.2_4.cpp new file mode 100644 index 00000000..8c1d847c --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gpt5.2_4.cpp @@ -0,0 +1,317 @@ +#include "world.h" +#include + +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + struct Node { + float cx, cy; + float hs; + 
int child[4]; + int head; // head particle index for leaf + int count; // number of particles in leaf + float mass; + float comx, comy; + Node() : cx(0), cy(0), hs(0), head(-1), count(0), mass(0), comx(0), comy(0) { + child[0] = child[1] = child[2] = child[3] = -1; + } + Node(float _cx, float _cy, float _hs) : cx(_cx), cy(_cy), hs(_hs), head(-1), count(0), mass(0), comx(_cx), comy(_cy) { + child[0] = child[1] = child[2] = child[3] = -1; + } + inline bool isLeaf() const { return child[0] == -1; } + }; + + int numThreads = 16; + int maxDepth = 20; + + std::vector nodes; + std::vector nextIdx; + + const std::vector* curParticles = nullptr; + + float theta = 0.40f; + + inline int quadrant(const Node& n, float x, float y) const { + const int east = (x > n.cx) ? 1 : 0; + const int north = (y > n.cy) ? 1 : 0; + return (north << 1) | east; + } + + inline void ensureChildren(int nodeIdx) { + Node &n = nodes[nodeIdx]; + if (!n.isLeaf()) return; + + float hh = n.hs * 0.5f; + if (hh <= 0.0f) hh = 1e-6f; + + for (int q = 0; q < 4; q++) { + float ccx = n.cx + ((q & 1) ? hh : -hh); + float ccy = n.cy + ((q & 2) ? hh : -hh); + n.child[q] = (int)nodes.size(); + nodes.emplace_back(ccx, ccy, hh); + } + n.head = -1; + n.count = 0; + } + + void insertParticle(int nodeIdx, int pIdx, int depth) { + Node &n = nodes[nodeIdx]; + + if (n.isLeaf()) { + if (n.count == 0) { + n.head = pIdx; + nextIdx[pIdx] = -1; + n.count = 1; + return; + } + + if (depth >= maxDepth) { + nextIdx[pIdx] = n.head; + n.head = pIdx; + n.count++; + return; + } + + const auto &P = (*curParticles); + int first = n.head; + const float fx = P[first].position.x; + const float fy = P[first].position.y; + const float px = P[pIdx].position.x; + const float py = P[pIdx].position.y; + + if (fx == px && fy == py) { + nextIdx[pIdx] = n.head; + n.head = pIdx; + n.count++; + return; + } + + int oldHead = n.head; + ensureChildren(nodeIdx); + + int cur = oldHead; + while (cur != -1) { + int nxt = nextIdx[cur]; + nextIdx[cur] = -1; + insertParticle(nodeIdx, cur, depth); + cur = nxt; + } + insertParticle(nodeIdx, pIdx, depth); + return; + } + + const auto &P = (*curParticles); + float x = P[pIdx].position.x; + float y = P[pIdx].position.y; + + int q = quadrant(n, x, y); + int c = n.child[q]; + if (c < 0) { + // Should not happen, but guard + ensureChildren(nodeIdx); + c = n.child[q]; + } + insertParticle(c, pIdx, depth + 1); + } + + void computeMassCom(int nodeIdx) { + Node &n = nodes[nodeIdx]; + if (n.isLeaf()) { + float m = 0.0f, wx = 0.0f, wy = 0.0f; + const auto &P = (*curParticles); + for (int p = n.head; p != -1; p = nextIdx[p]) { + float pm = P[p].mass; + m += pm; + wx += pm * P[p].position.x; + wy += pm * P[p].position.y; + } + n.mass = m; + if (m > 0.0f) { + n.comx = wx / m; + n.comy = wy / m; + } else { + n.comx = n.cx; + n.comy = n.cy; + } + return; + } + + float m = 0.0f, wx = 0.0f, wy = 0.0f; + for (int k = 0; k < 4; k++) { + int c = n.child[k]; + if (c >= 0) { + computeMassCom(c); + const Node &cn = nodes[c]; + m += cn.mass; + wx += cn.mass * cn.comx; + wy += cn.mass * cn.comy; + } + } + n.mass = m; + if (m > 0.0f) { + n.comx = wx / m; + n.comy = wy / m; + } else { + n.comx = n.cx; + n.comy = n.cy; + } + } + + inline float minDist2ToAABB(float px, float py, const Node& n) const { + float adx = std::fabs(px - n.cx); + float ady = std::fabs(py - n.cy); + float dx = adx - n.hs; + float dy = ady - n.hs; + if (dx < 0.0f) dx = 0.0f; + if (dy < 0.0f) dy = 0.0f; + return dx * dx + dy * dy; + } + + inline float maxDist2ToAABB(float px, float py, const 
Node& n) const { + float adx = std::fabs(px - n.cx) + n.hs; + float ady = std::fabs(py - n.cy) + n.hs; + return adx * adx + ady * ady; + } + +public: + void init(int numParticles, StepParameters) override { + int procs = omp_get_num_procs(); + numThreads = std::min(16, std::max(1, procs)); + omp_set_dynamic(0); + omp_set_num_threads(numThreads); + + nextIdx.assign(numParticles, -1); + nodes.clear(); + nodes.reserve((size_t)numParticles * 4u); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + const int n = (int)particles.size(); + if ((int)newParticles.size() != n) newParticles.resize(n); + if ((int)nextIdx.size() != n) nextIdx.assign(n, -1); + else std::fill(nextIdx.begin(), nextIdx.end(), -1); + + curParticles = &particles; + + float minx = FLT_MAX, miny = FLT_MAX, maxx = -FLT_MAX, maxy = -FLT_MAX; + #pragma omp parallel + { + float lminx = FLT_MAX, lminy = FLT_MAX, lmaxx = -FLT_MAX, lmaxy = -FLT_MAX; + #pragma omp for nowait + for (int i = 0; i < n; i++) { + float x = particles[i].position.x; + float y = particles[i].position.y; + lminx = std::min(lminx, x); + lminy = std::min(lminy, y); + lmaxx = std::max(lmaxx, x); + lmaxy = std::max(lmaxy, y); + } + #pragma omp critical + { + minx = std::min(minx, lminx); + miny = std::min(miny, lminy); + maxx = std::max(maxx, lmaxx); + maxy = std::max(maxy, lmaxy); + } + } + + float cx = 0.5f * (minx + maxx); + float cy = 0.5f * (miny + maxy); + float spanx = maxx - minx; + float spany = maxy - miny; + float half = 0.5f * std::max(spanx, spany); + half += 1e-3f; + half *= 1.001f; + + nodes.clear(); + nodes.emplace_back(cx, cy, half); + + for (int i = 0; i < n; i++) { + insertParticle(0, i, 0); + } + computeMassCom(0); + + const float r = params.cullRadius; + const float r2 = r * r; + const float theta2 = theta * theta; + + #pragma omp parallel for schedule(guided, 128) + for (int i = 0; i < n; i++) { + const Particle &pi = particles[i]; + const float px = pi.position.x; + const float py = pi.position.y; + + Vec2 force(0.0f, 0.0f); + + int stack[96]; + int sp = 0; + stack[sp++] = 0; + + while (sp) { + int idx = stack[--sp]; + const Node &node = nodes[idx]; + if (node.mass == 0.0f) continue; + + if (minDist2ToAABB(px, py, node) > r2) continue; + + if (node.isLeaf()) { + for (int p = node.head; p != -1; p = nextIdx[p]) { + if (p == i) continue; + float dx = particles[p].position.x - px; + float dy = particles[p].position.y - py; + float d2 = dx * dx + dy * dy; + if (d2 < r2 && d2 > 1e-12f) { + force += computeForce(pi, particles[p], r); + } + } + } else { + float adx = std::fabs(px - node.cx); + float ady = std::fabs(py - node.cy); + bool contains = (adx <= node.hs && ady <= node.hs); + + bool didApprox = false; + if (!contains) { + if (maxDist2ToAABB(px, py, node) <= r2) { + float dx = node.comx - px; + float dy = node.comy - py; + float d2 = dx * dx + dy * dy; + if (d2 > 1e-12f) { + float s = node.hs * 2.0f; + if ((s * s) <= theta2 * d2) { + Particle pseudo; + pseudo.id = -1; + pseudo.mass = node.mass; + pseudo.position = Vec2{node.comx, node.comy}; + pseudo.velocity = Vec2{0.0f, 0.0f}; + force += computeForce(pi, pseudo, r); + didApprox = true; + } + } + } + } + + if (!didApprox) { + int c0 = node.child[0], c1 = node.child[1], c2 = node.child[2], c3 = node.child[3]; + if (c0 >= 0) stack[sp++] = c0; + if (c1 >= 0) stack[sp++] = c1; + if (c2 >= 0) stack[sp++] = c2; + if (c3 >= 0) stack[sp++] = c3; + } + } + } + + newParticles[i] = updateParticle(pi, force, params.deltaTime); + } + } +}; 
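+
+// Illustrative sketch only (the helper name is an assumption, not part of the
+// original solution): the Barnes-Hut opening test used, with minor variations,
+// by the quadtree solvers above, written as a free function. A node of side
+// length `size` whose centre of mass lies at squared distance `dist2` from the
+// query particle may be collapsed into one pseudo-particle when
+// size / sqrt(dist2) <= theta.
+static inline bool bhAccept(float size, float dist2, float theta) {
+    // Squared form avoids a sqrt on the hot path: size^2 <= theta^2 * dist2.
+    return size * size <= theta * theta * dist2;
+}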
+ +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gpt5.cpp b/research/solutions/nbody_simulation/random_100k/gpt5.cpp new file mode 100644 index 00000000..78e45eda --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gpt5.cpp @@ -0,0 +1,219 @@ +#include "world.h" +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads; + // Spatial grid structures + std::vector cellStart; + std::vector cellCount; + std::vector particleCell; + std::vector sortedIndices; + // Task list for cell pair processing + struct CellPair { int a, b; }; + std::vector tasks; + // Per-thread force buffers + std::vector> threadForces; + // Grid and step parameters + float cellSizeScale = 0.5f; // cellSize = cullRadius * 0.5 + float lastCullRadius = -1.0f; + int lastNumParticles = -1; + +public: + MySimulator() { + numThreads = omp_get_max_threads(); + } + + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(numThreads); + lastNumParticles = numParticles; + lastCullRadius = params.cullRadius; + // Prepare per-thread buffers + threadForces.resize(numThreads); + for (int t = 0; t < numThreads; ++t) { + threadForces[t].assign(numParticles, Vec2{0.0f, 0.0f}); + } + // Reserve particle indexing buffers + particleCell.assign(numParticles, 0); + sortedIndices.assign(numParticles, 0); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + const int N = (int)particles.size(); + if (N == 0) return; + + if (lastNumParticles != N || lastCullRadius != params.cullRadius) { + init(N, params); + } + + // Compute bounding box + float minX = particles[0].position.x, maxX = minX; + float minY = particles[0].position.y, maxY = minY; + for (int i = 1; i < N; ++i) { + float x = particles[i].position.x; + float y = particles[i].position.y; + if (x < minX) minX = x; + if (x > maxX) maxX = x; + if (y < minY) minY = y; + if (y > maxY) maxY = y; + } + // Slight padding to avoid boundary clamping issues + const float pad = 1e-4f; + minX -= pad; minY -= pad; + maxX += pad; maxY += pad; + + const float s = params.cullRadius; + const float cellSize = std::max(1e-3f, s * cellSizeScale); + const float invCell = 1.0f / cellSize; + + int nx = std::max(1, (int)std::ceil((maxX - minX) * invCell)); + int ny = std::max(1, (int)std::ceil((maxY - minY) * invCell)); + const int numCells = nx * ny; + + cellCount.assign(numCells, 0); + cellStart.assign(numCells, 0); + + // Assign particles to cells and count + for (int i = 0; i < N; ++i) { + int cx = (int)std::floor((particles[i].position.x - minX) * invCell); + int cy = (int)std::floor((particles[i].position.y - minY) * invCell); + if (cx < 0) cx = 0; if (cx >= nx) cx = nx - 1; + if (cy < 0) cy = 0; if (cy >= ny) cy = ny - 1; + int c = cy * nx + cx; + particleCell[i] = c; + cellCount[c]++; + } + + // Prefix sum for cellStart + int sum = 0; + for (int c = 0; c < numCells; ++c) { + cellStart[c] = sum; + sum += cellCount[c]; + } + + // Fill sortedIndices + std::vector writeHead = cellStart; + for (int i = 0; i < N; ++i) { + int c = particleCell[i]; + int idx = writeHead[c]++; + sortedIndices[idx] = i; + } + + // Build cell pair tasks + tasks.clear(); + const int rCells = std::max(1, (int)std::ceil(s / cellSize)); + tasks.reserve((size_t)numCells * (2 * rCells + 1) * (2 * rCells + 1) / 2); + for (int ay = 0; ay < ny; ++ay) { + for (int ax = 0; ax < nx; 
++ax) { + int a = ay * nx + ax; + for (int dy = 0; dy <= rCells; ++dy) { + int by = ay + dy; + if (by < 0 || by >= ny) continue; + int dxMin = -rCells; + int dxMax = rCells; + for (int dx = dxMin; dx <= dxMax; ++dx) { + if (dy == 0 && dx < 0) continue; // avoid duplicates; handle a==b separately + int bx = ax + dx; + if (bx < 0 || bx >= nx) continue; + int b = by * nx + bx; + tasks.push_back({a, b}); + } + } + } + } + + // Zero per-thread forces + #pragma omp parallel + { + int tid = omp_get_thread_num(); + auto &forces = threadForces[tid]; + std::fill(forces.begin(), forces.end(), Vec2{0.0f, 0.0f}); + } + + const float s2 = s * s; + + // Process tasks in parallel + #pragma omp parallel for schedule(dynamic, 1) + for (int ti = 0; ti < (int)tasks.size(); ++ti) { + int tid = omp_get_thread_num(); + auto &forces = threadForces[tid]; + + const int a = tasks[ti].a; + const int b = tasks[ti].b; + + const int aStart = cellStart[a]; + const int aCount = cellCount[a]; + const int bStart = cellStart[b]; + const int bCount = cellCount[b]; + + if (aCount == 0 || bCount == 0) continue; + + if (a == b) { + for (int ia = 0; ia < aCount; ++ia) { + int idx_i = sortedIndices[aStart + ia]; + const Particle &pi = particles[idx_i]; + for (int ib = ia + 1; ib < bCount; ++ib) { + int idx_j = sortedIndices[bStart + ib]; + const Particle &pj = particles[idx_j]; + + float dx = pj.position.x - pi.position.x; + float dy = pj.position.y - pi.position.y; + float dist2 = dx * dx + dy * dy; + if (dist2 >= s2) continue; + + Vec2 fij = computeForce(pi, pj, s); + if (fij.x != 0.0f || fij.y != 0.0f) { + forces[idx_i] += fij; + forces[idx_j] -= fij; + } + } + } + } else { + for (int ia = 0; ia < aCount; ++ia) { + int idx_i = sortedIndices[aStart + ia]; + const Particle &pi = particles[idx_i]; + for (int ib = 0; ib < bCount; ++ib) { + int idx_j = sortedIndices[bStart + ib]; + const Particle &pj = particles[idx_j]; + + float dx = pj.position.x - pi.position.x; + float dy = pj.position.y - pi.position.y; + float dist2 = dx * dx + dy * dy; + if (dist2 >= s2) continue; + + Vec2 fij = computeForce(pi, pj, s); + if (fij.x != 0.0f || fij.y != 0.0f) { + forces[idx_i] += fij; + forces[idx_j] -= fij; + } + } + } + } + } + + // Reduce per-thread forces into a single array + std::vector totalForces(N, Vec2{0.0f, 0.0f}); + for (int t = 0; t < numThreads; ++t) { + auto &forces = threadForces[t]; + #pragma omp parallel for schedule(static) + for (int i = 0; i < N; ++i) { + totalForces[i] += forces[i]; + } + } + + // Integrate particles + #pragma omp parallel for schedule(static) + for (int i = 0; i < N; ++i) { + newParticles[i] = updateParticle(particles[i], totalForces[i], params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gpt5_1.cpp b/research/solutions/nbody_simulation/random_100k/gpt5_1.cpp new file mode 100644 index 00000000..3bb34536 --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gpt5_1.cpp @@ -0,0 +1,210 @@ +#include "world.h" +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads = 0; + + struct Grid { + float minX, minY; + float cellSize; + int width, height; + std::vector cellOfParticle; + std::vector counts; + std::vector starts; // size = numCells + 1 + std::vector indices; // sorted particle indices by cell + }; + + void buildGrid(const std::vector& particles, float cellSize, Grid& grid) { + size_t N = 
particles.size(); + if (N == 0) return; + + float minX = std::numeric_limits::infinity(); + float minY = std::numeric_limits::infinity(); + float maxX = -std::numeric_limits::infinity(); + float maxY = -std::numeric_limits::infinity(); + + for (size_t i = 0; i < N; ++i) { + const auto& p = particles[i]; + if (p.position.x < minX) minX = p.position.x; + if (p.position.y < minY) minY = p.position.y; + if (p.position.x > maxX) maxX = p.position.x; + if (p.position.y > maxY) maxY = p.position.y; + } + + // Small padding to be safe with numeric edges + const float eps = 1e-6f; + minX -= eps; + minY -= eps; + maxX += eps; + maxY += eps; + + int width = std::max(1, int((maxX - minX) / cellSize) + 1); + int height = std::max(1, int((maxY - minY) / cellSize) + 1); + int numCells = width * height; + + grid.minX = minX; + grid.minY = minY; + grid.cellSize = cellSize; + grid.width = width; + grid.height = height; + + grid.cellOfParticle.assign(N, 0); + grid.counts.assign(numCells, 0); + grid.starts.assign(numCells + 1, 0); + grid.indices.assign(N, 0); + + // Compute cell for each particle and counts + for (size_t i = 0; i < N; ++i) { + int cx = int(std::floor((particles[i].position.x - minX) / cellSize)); + int cy = int(std::floor((particles[i].position.y - minY) / cellSize)); + if (cx < 0) cx = 0; else if (cx >= width) cx = width - 1; + if (cy < 0) cy = 0; else if (cy >= height) cy = height - 1; + int cell = cy * width + cx; + grid.cellOfParticle[i] = cell; + grid.counts[cell]++; + } + + // Prefix sum + for (int c = 0; c < numCells; ++c) { + grid.starts[c + 1] = grid.starts[c] + grid.counts[c]; + } + + // Fill indices + std::vector next = grid.starts; + for (size_t i = 0; i < N; ++i) { + int cell = grid.cellOfParticle[i]; + int pos = next[cell]++; + grid.indices[pos] = (int)i; + } + } + +public: + void init(int numParticles, StepParameters params) override { + int hwThreads = omp_get_max_threads(); + numThreads = hwThreads > 0 ? 
hwThreads : 1; + omp_set_num_threads(numThreads); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + const size_t N = particles.size(); + if (N == 0) return; + newParticles.resize(N); + + const float r = params.cullRadius; + const float r2 = r * r; + + // Build spatial grid + Grid grid; + buildGrid(particles, r, grid); + + const int W = grid.width; + const int H = grid.height; + const int numCells = W * H; + + int nt = numThreads; + if (nt <= 0) nt = 1; + + std::vector localForces((size_t)nt * N); + std::vector totalForce(N, Vec2(0.0f, 0.0f)); + + // Neighbor offsets to cover each pair only once: + // Within cell handled separately (i= W || ny < 0 || ny >= H) continue; + int cellB = ny * W + nx; + + int startB = grid.starts[cellB]; + int endB = grid.starts[cellB + 1]; + + for (int ia = startA; ia < endA; ++ia) { + int i = grid.indices[ia]; + const Particle& pi = particles[i]; + for (int ib = startB; ib < endB; ++ib) { + int j = grid.indices[ib]; + const Particle& pj = particles[j]; + + float dx = pj.position.x - pi.position.x; + float dy = pj.position.y - pi.position.y; + float dist2 = dx * dx + dy * dy; + if (dist2 < r2) { + Vec2 fij = computeForce(pi, pj, r); + myForces[i] += fij; + myForces[j] -= fij; + } + } + } + } + } + + // Reduction of forces across threads + #pragma omp for schedule(static) + for (size_t i = 0; i < N; ++i) { + Vec2 sum(0.0f, 0.0f); + for (int t = 0; t < nt; ++t) { + sum += localForces[(size_t)t * N + i]; + } + totalForce[i] = sum; + } + + // Integrate positions/velocities + #pragma omp for schedule(static) + for (int i = 0; i < (int)N; ++i) { + newParticles[i] = updateParticle(particles[i], totalForce[i], params.deltaTime); + } + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gpt5_2.cpp b/research/solutions/nbody_simulation/random_100k/gpt5_2.cpp new file mode 100644 index 00000000..64806d68 --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gpt5_2.cpp @@ -0,0 +1,231 @@ +#include "world.h" +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads = 0; + int N = 0; + + // Persistent buffers + std::vector particleCellID; // size N + std::vector cellStarts; // size numCells + 1 (per step) + std::vector cellCounts; // size numCells (per step) + std::vector sortedIndices; // size N (per step) + + // Thread-local force accumulators: size numThreads * N + std::vector localFx; + std::vector localFy; + + float cellFactor = 3.0f; // number of cells per cullRadius + +public: + void init(int numParticles, StepParameters params) override { + N = numParticles; + numThreads = std::max(1, omp_get_max_threads()); + omp_set_num_threads(numThreads); + + particleCellID.assign(N, 0); + sortedIndices.resize(N); + + localFx.assign((size_t)numThreads * (size_t)N, 0.0f); + localFy.assign((size_t)numThreads * (size_t)N, 0.0f); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + if (particles.empty()) return; + + const float r = params.cullRadius; + const float r2 = r * r; + float minX = FLT_MAX, minY = FLT_MAX; + float maxX = -FLT_MAX, maxY = -FLT_MAX; + + // Determine bounding box + #pragma omp parallel for reduction(min:minX,minY) reduction(max:maxX,maxY) schedule(static) + for (int i = 0; i < (int)particles.size(); ++i) { + const auto &p = particles[i]; + if 
(p.position.x < minX) minX = p.position.x; + if (p.position.y < minY) minY = p.position.y; + if (p.position.x > maxX) maxX = p.position.x; + if (p.position.y > maxY) maxY = p.position.y; + } + + const float eps = 1e-3f; + minX -= eps; minY -= eps; + maxX += eps; maxY += eps; + + float cellSize = std::max(r / cellFactor, 1e-3f); + int gridNx = std::max(1, (int)((maxX - minX) / cellSize) + 1); + int gridNy = std::max(1, (int)((maxY - minY) / cellSize) + 1); + int numCells = gridNx * gridNy; + int k = std::max(1, (int)std::ceil(r / cellSize)); + + // Thread-local counts per cell to avoid atomics + std::vector localCounts((size_t)numThreads * (size_t)numCells, 0); + cellCounts.assign(numCells, 0); + cellStarts.resize(numCells + 1); + + // Compute cell IDs and local counts + #pragma omp parallel + { + int tid = omp_get_thread_num(); + int *lc = localCounts.data() + (size_t)tid * (size_t)numCells; + + #pragma omp for schedule(static) + for (int i = 0; i < N; ++i) { + const auto &p = particles[i]; + int ix = (int)((p.position.x - minX) / cellSize); + int iy = (int)((p.position.y - minY) / cellSize); + if (ix < 0) ix = 0; else if (ix >= gridNx) ix = gridNx - 1; + if (iy < 0) iy = 0; else if (iy >= gridNy) iy = gridNy - 1; + int cid = iy * gridNx + ix; + particleCellID[i] = cid; + lc[cid] += 1; + } + } + + // Reduce counts across threads + for (int c = 0; c < numCells; ++c) { + int sum = 0; + for (int t = 0; t < numThreads; ++t) { + sum += localCounts[(size_t)t * (size_t)numCells + (size_t)c]; + } + cellCounts[c] = sum; + } + + // Exclusive prefix sum for cell starts + int total = 0; + for (int c = 0; c < numCells; ++c) { + cellStarts[c] = total; + total += cellCounts[c]; + } + cellStarts[numCells] = total; + + // Compute thread-specific starting offsets per cell + std::vector threadCellStart((size_t)numThreads * (size_t)numCells, 0); + for (int c = 0; c < numCells; ++c) { + int base = cellStarts[c]; + int accum = 0; + for (int t = 0; t < numThreads; ++t) { + int cnt = localCounts[(size_t)t * (size_t)numCells + (size_t)c]; + threadCellStart[(size_t)t * (size_t)numCells + (size_t)c] = base + accum; + accum += cnt; + } + } + + // Fill sortedIndices per cell (stable within thread) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + int* writePtr = threadCellStart.data() + (size_t)tid * (size_t)numCells; + + #pragma omp for schedule(static) + for (int i = 0; i < N; ++i) { + int cid = particleCellID[i]; + int pos = writePtr[cid]++; + sortedIndices[pos] = i; + } + } + + // Zero thread-local forces + std::fill(localFx.begin(), localFx.end(), 0.0f); + std::fill(localFy.begin(), localFy.end(), 0.0f); + + // Pairwise interactions: process each cell with neighbors to avoid double counting + #pragma omp parallel + { + int tid = omp_get_thread_num(); + float* fx = localFx.data() + (size_t)tid * (size_t)N; + float* fy = localFy.data() + (size_t)tid * (size_t)N; + + #pragma omp for schedule(dynamic, 1) + for (int c = 0; c < numCells; ++c) { + int cx = c % gridNx; + int cy = c / gridNx; + + int cStart = cellStarts[c]; + int cEnd = cellStarts[c + 1]; + + // Interactions within the same cell + for (int idxA = cStart; idxA < cEnd; ++idxA) { + int i = sortedIndices[idxA]; + const Particle &pi = particles[i]; + float pix = pi.position.x; + float piy = pi.position.y; + + for (int idxB = idxA + 1; idxB < cEnd; ++idxB) { + int j = sortedIndices[idxB]; + const Particle &pj = particles[j]; + + float dx = pj.position.x - pix; + float dy = pj.position.y - piy; + float d2 = dx * dx + dy * dy; + if (d2 > r2) 
continue; + + Vec2 fi = computeForce(pi, pj, r); + fx[i] += fi.x; fy[i] += fi.y; + fx[j] -= fi.x; fy[j] -= fi.y; + } + } + + // Interactions with neighboring cells (half-plane to avoid duplicate pairs) + for (int ox = 0; ox <= k; ++ox) { + for (int oy = -k; oy <= k; ++oy) { + if (ox == 0 && oy < 0) continue; // ensure unique pairs + if (ox == 0 && oy == 0) continue; // skip self, already handled + + int nx = cx + ox; + int ny = cy + oy; + if (nx < 0 || nx >= gridNx || ny < 0 || ny >= gridNy) continue; + + int nc = ny * gridNx + nx; + int nStart = cellStarts[nc]; + int nEnd = cellStarts[nc + 1]; + + for (int idxA = cStart; idxA < cEnd; ++idxA) { + int i = sortedIndices[idxA]; + const Particle &pi = particles[i]; + float pix = pi.position.x; + float piy = pi.position.y; + + for (int idxB = nStart; idxB < nEnd; ++idxB) { + int j = sortedIndices[idxB]; + const Particle &pj = particles[j]; + + float dx = pj.position.x - pix; + float dy = pj.position.y - piy; + float d2 = dx * dx + dy * dy; + if (d2 > r2) continue; + + Vec2 fi = computeForce(pi, pj, r); + fx[i] += fi.x; fy[i] += fi.y; + fx[j] -= fi.x; fy[j] -= fi.y; + } + } + } + } + } + } + + // Reduce forces across threads and integrate + #pragma omp parallel for schedule(static) + for (int i = 0; i < N; ++i) { + float sx = 0.0f, sy = 0.0f; + for (int t = 0; t < numThreads; ++t) { + sx += localFx[(size_t)t * (size_t)N + (size_t)i]; + sy += localFy[(size_t)t * (size_t)N + (size_t)i]; + } + Vec2 force(sx, sy); + newParticles[i] = updateParticle(particles[i], force, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gpt5_3.cpp b/research/solutions/nbody_simulation/random_100k/gpt5_3.cpp new file mode 100644 index 00000000..dde5045f --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gpt5_3.cpp @@ -0,0 +1,352 @@ +#include "world.h" +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads = 0; + int lastN = 0; + + // Thread-local force buffers: forces[thread][i] + std::vector> threadForces; + + // Grid buffers (reused across steps) + std::vector cellCounts; + std::vector cellStarts; // size = numCells + 1 + std::vector cellIndices; // size = N + + // Neighbor offset structure + struct Offset { + int dx, dy; + float minDist2; + bool self; + }; + std::vector neighborOffsets; + + // Parameters for grid + float cellSize = 1.0f; + int R = 1; // radius in cells + int gridX = 0, gridY = 0; // number of cells in X and Y + + // Expand or allocate force buffers when N or threads changes + void ensureForceBuffers(int N) { + int T = numThreads; + if ((int)threadForces.size() != T || lastN != N) { + threadForces.clear(); + threadForces.resize(T); + for (int t = 0; t < T; ++t) { + threadForces[t].assign(N, Vec2{0.0f, 0.0f}); + } + lastN = N; + } else { + // zero buffers + #pragma omp parallel for schedule(static) + for (int t = 0; t < T; ++t) { + std::fill(threadForces[t].begin(), threadForces[t].end(), Vec2{0.0f, 0.0f}); + } + } + } + + // Build neighbor offsets for current R and cellSize and cullRadius + void buildNeighborOffsets(float cullRadius) { + neighborOffsets.clear(); + neighborOffsets.reserve((2 * R + 1) * (2 * R + 1)); + + float s = cellSize; + float r2 = cullRadius * cullRadius; + + for (int dx = 0; dx <= R; ++dx) { + for (int dy = -R; dy <= R; ++dy) { + // Only unique pairs: (dx>0, any dy) or (dx==0 && dy>=0) + if (dx == 0 && dy < 0) 
continue; + + int adx = dx; + int ady = std::abs(dy); + float minDx = std::max(0, adx - 1) * s; + float minDy = std::max(0, ady - 1) * s; + float minD2 = minDx * minDx + minDy * minDy; + + if (minD2 <= r2) { + Offset off; + off.dx = dx; + off.dy = dy; + off.minDist2 = minD2; + off.self = (dx == 0 && dy == 0); + neighborOffsets.push_back(off); + } + } + } + } + + // Compute grid bounds and map particles into cells + void buildGrid(const std::vector& particles, float cullRadius, + float& minX, float& minY) { + const int N = (int)particles.size(); + + // Determine bounding box + float maxX = -1e30f, maxY = -1e30f; + minX = 1e30f; minY = 1e30f; + for (int i = 0; i < N; ++i) { + float x = particles[i].position.x; + float y = particles[i].position.y; + if (x < minX) minX = x; + if (x > maxX) maxX = x; + if (y < minY) minY = y; + if (y > maxY) maxY = y; + } + + // Slight padding to ensure boundary particles fit + float pad = 1e-3f; + minX -= pad; minY -= pad; + maxX += pad; maxY += pad; + + // Define grid: choose cell size relative to cullRadius + // Using k = 5 (cellSize = r / 5) for better pruning + cellSize = std::max(cullRadius / 5.0f, 1e-3f); + R = (int)std::ceil(cullRadius / cellSize); + + gridX = std::max(1, (int)std::ceil((maxX - minX) / cellSize)); + gridY = std::max(1, (int)std::ceil((maxY - minY) / cellSize)); + int numCells = gridX * gridY; + + if ((int)cellCounts.size() != numCells) { + cellCounts.assign(numCells, 0); + cellStarts.assign(numCells + 1, 0); + } else { + std::fill(cellCounts.begin(), cellCounts.end(), 0); + std::fill(cellStarts.begin(), cellStarts.end(), 0); + } + + if ((int)cellIndices.size() != N) { + cellIndices.resize(N); + } + + // Count particles per cell + for (int i = 0; i < N; ++i) { + int ix = (int)std::floor((particles[i].position.x - minX) / cellSize); + int iy = (int)std::floor((particles[i].position.y - minY) / cellSize); + if (ix < 0) ix = 0; else if (ix >= gridX) ix = gridX - 1; + if (iy < 0) iy = 0; else if (iy >= gridY) iy = gridY - 1; + int cid = iy * gridX + ix; + cellCounts[cid]++; + } + + // Prefix sums for starts + int sum = 0; + for (int c = 0; c < numCells; ++c) { + cellStarts[c] = sum; + sum += cellCounts[c]; + } + cellStarts[numCells] = sum; + + // Temporary write positions + std::vector writePos = cellStarts; + + // Assign particle indices to cells + for (int i = 0; i < N; ++i) { + int ix = (int)std::floor((particles[i].position.x - minX) / cellSize); + int iy = (int)std::floor((particles[i].position.y - minY) / cellSize); + if (ix < 0) ix = 0; else if (ix >= gridX) ix = gridX - 1; + if (iy < 0) iy = 0; else if (iy >= gridY) iy = gridY - 1; + int cid = iy * gridX + ix; + cellIndices[writePos[cid]++] = i; + } + + // Build neighbor offsets for this cell configuration and cullRadius + buildNeighborOffsets(cullRadius); + } + +public: + void init(int numParticles, StepParameters params) override { + numThreads = omp_get_max_threads(); + omp_set_num_threads(numThreads); + ensureForceBuffers(numParticles); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + const int N = (int)particles.size(); + if (N == 0) return; + + if (numThreads <= 0) { + numThreads = omp_get_max_threads(); + omp_set_num_threads(numThreads); + } + ensureForceBuffers(N); + + // Build grid + float minX = 0.0f, minY = 0.0f; + buildGrid(particles, params.cullRadius, minX, minY); + + // Prepare list of cell-pair tasks to improve load balancing + struct Task { + int aStart, aEnd; + int bStart, bEnd; + bool self; + }; + 
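+        // Each task pairs one cell with itself or with one neighbour taken from the
+        // half-plane offset set (dx > 0, or dx == 0 && dy >= 0), so every particle
+        // pair is visited exactly once. Forces are then accumulated symmetrically
+        // (+f on i, -f on j) into per-thread buffers and reduced at the end, which
+        // avoids atomics at the cost of numThreads * N temporary storage.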
std::vector tasks; + tasks.reserve(gridX * gridY * (int)neighborOffsets.size()); + + int numCells = gridX * gridY; + for (int ay = 0; ay < gridY; ++ay) { + for (int ax = 0; ax < gridX; ++ax) { + int aId = ay * gridX + ax; + int aStart = cellStarts[aId]; + int aEnd = cellStarts[aId + 1]; + int aCount = aEnd - aStart; + if (aCount == 0) continue; + + for (const auto& off : neighborOffsets) { + int bx = ax + off.dx; + int by = ay + off.dy; + if (bx < 0 || bx >= gridX || by < 0 || by >= gridY) continue; + int bId = by * gridX + bx; + + int bStart = cellStarts[bId]; + int bEnd = cellStarts[bId + 1]; + int bCount = bEnd - bStart; + if (bCount == 0) continue; + + Task task; + task.aStart = aStart; + task.aEnd = aEnd; + if (off.self) { + task.bStart = aStart; + task.bEnd = aEnd; + task.self = true; + } else { + task.bStart = bStart; + task.bEnd = bEnd; + task.self = false; + } + tasks.push_back(task); + } + } + } + + // Constants + const float r = params.cullRadius; + const float r2 = r * r; + const float dt = params.deltaTime; + const float epsClose = 1e-3f; + const float epsClose2 = epsClose * epsClose; + const float minDistClamp = 1e-1f; + const float G = 0.01f; + const float r75 = r * 0.75f; + const float r25 = r * 0.25f; + + const Particle* P = particles.data(); + + // Parallel process tasks, accumulate into thread-local forces + #pragma omp parallel + { + int tid = omp_get_thread_num(); + Vec2* localF = threadForces[tid].data(); + + #pragma omp for schedule(dynamic, 1) + for (int ti = 0; ti < (int)tasks.size(); ++ti) { + const Task& task = tasks[ti]; + + if (task.self) { + // Pairs within the same cell: i < j + for (int ia = task.aStart; ia < task.aEnd; ++ia) { + int i = cellIndices[ia]; + const Particle& pi = P[i]; + float xi = pi.position.x; + float yi = pi.position.y; + float mi = pi.mass; + + for (int ja = ia + 1; ja < task.aEnd; ++ja) { + int j = cellIndices[ja]; + const Particle& pj = P[j]; + + float dx = pj.position.x - xi; + float dy = pj.position.y - yi; + float d2 = dx * dx + dy * dy; + if (d2 > r2 || d2 < epsClose2) continue; + + float dist = std::sqrt(d2); + float invDist = 1.0f / dist; + float dirx = dx * invDist; + float diry = dy * invDist; + + float distUsed = (dist < minDistClamp) ? minDistClamp : dist; + float mag = (G * mi * pj.mass) / (distUsed * distUsed); + if (dist > r75) { + float decay = 1.0f - (dist - r75) / r25; + mag *= decay; + } + + float fx = dirx * mag; + float fy = diry * mag; + + localF[i].x += fx; + localF[i].y += fy; + localF[j].x -= fx; + localF[j].y -= fy; + } + } + } else { + // Cross-cell pairs: all i in A against all j in B + for (int ia = task.aStart; ia < task.aEnd; ++ia) { + int i = cellIndices[ia]; + const Particle& pi = P[i]; + float xi = pi.position.x; + float yi = pi.position.y; + float mi = pi.mass; + + for (int jb = task.bStart; jb < task.bEnd; ++jb) { + int j = cellIndices[jb]; + const Particle& pj = P[j]; + + float dx = pj.position.x - xi; + float dy = pj.position.y - yi; + float d2 = dx * dx + dy * dy; + if (d2 > r2 || d2 < epsClose2) continue; + + float dist = std::sqrt(d2); + float invDist = 1.0f / dist; + float dirx = dx * invDist; + float diry = dy * invDist; + + float distUsed = (dist < minDistClamp) ? 
minDistClamp : dist; + float mag = (G * mi * pj.mass) / (distUsed * distUsed); + if (dist > r75) { + float decay = 1.0f - (dist - r75) / r25; + mag *= decay; + } + + float fx = dirx * mag; + float fy = diry * mag; + + localF[i].x += fx; + localF[i].y += fy; + localF[j].x -= fx; + localF[j].y -= fy; + } + } + } + } + } + + // Reduce forces across threads and update particles + newParticles.resize(N); + #pragma omp parallel for schedule(static) + for (int i = 0; i < N; ++i) { + Vec2 totalF{0.0f, 0.0f}; + for (int t = 0; t < numThreads; ++t) { + totalF.x += threadForces[t][i].x; + totalF.y += threadForces[t][i].y; + } + newParticles[i] = updateParticle(particles[i], totalF, dt); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/gpt5_4.cpp b/research/solutions/nbody_simulation/random_100k/gpt5_4.cpp new file mode 100644 index 00000000..cfbf8f6c --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/gpt5_4.cpp @@ -0,0 +1,285 @@ +#include "world.h" +#include +#include +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads = 0; + + // Persistent buffers to avoid reallocations + std::vector cellId; + std::vector counts; + std::vector cellStart; + std::vector offsets; + std::vector sortedIndices; + + // Thread-local force accumulation buffers + std::vector> threadForces; + + // Precomputed neighbor offsets for current cellSize and cullRadius + std::vector> neighborOffsets; + + // Grid state + float cellSize = 1.0f; + int gridW = 0, gridH = 0; + float minX = 0.0f, minY = 0.0f; + + void ensureForceBuffers(size_t n) { + if (numThreads <= 0) { + numThreads = omp_get_max_threads(); + } + threadForces.resize(numThreads); + for (int t = 0; t < numThreads; ++t) { + if (threadForces[t].size() != n) { + threadForces[t].assign(n, Vec2(0.0f, 0.0f)); + } else { + std::fill(threadForces[t].begin(), threadForces[t].end(), Vec2(0.0f, 0.0f)); + } + } + } + + void precomputeNeighborOffsets(float cullRadius) { + neighborOffsets.clear(); + int k = (int)std::ceil(cullRadius / cellSize); + if (k < 0) k = 0; + + float s = cellSize; + float r2 = cullRadius * cullRadius; + + // Only include offsets where neighbor cell index is "greater" to avoid duplicates + for (int dy = -k; dy <= k; ++dy) { + for (int dx = 0; dx <= k; ++dx) { + if (dx == 0 && dy <= 0) continue; // skip self and duplicates (dx>0) or (dx==0 and dy>0) + int adx = std::abs(dx); + int ady = std::abs(dy); + float minDx = (adx <= 1) ? 0.0f : (float)(adx - 1) * s; + float minDy = (ady <= 1) ? 
0.0f : (float)(ady - 1) * s; + float minDist2 = minDx * minDx + minDy * minDy; + if (minDist2 <= r2) { + neighborOffsets.emplace_back(dx, dy); + } + } + } + } + + void buildGrid(const std::vector &particles, float cullRadius) { + size_t n = particles.size(); + if (n == 0) { + gridW = gridH = 0; + return; + } + + // Compute bounding box + float localMinX = std::numeric_limits::infinity(); + float localMinY = std::numeric_limits::infinity(); + float localMaxX = -std::numeric_limits::infinity(); + float localMaxY = -std::numeric_limits::infinity(); + + #pragma omp parallel + { + float tminx = localMinX; + float tminy = localMinY; + float tmaxx = localMaxX; + float tmaxy = localMaxY; + + #pragma omp for nowait + for (int i = 0; i < (int)n; ++i) { + const auto &p = particles[i]; + if (p.position.x < tminx) tminx = p.position.x; + if (p.position.y < tminy) tminy = p.position.y; + if (p.position.x > tmaxx) tmaxx = p.position.x; + if (p.position.y > tmaxy) tmaxy = p.position.y; + } + + #pragma omp critical + { + if (tminx < localMinX) localMinX = tminx; + if (tminy < localMinY) localMinY = tminy; + if (tmaxx > localMaxX) localMaxX = tmaxx; + if (tmaxy > localMaxY) localMaxY = tmaxy; + } + } + + // Set cell size relative to cullRadius. Smaller is better up to a point. + // Choose r/10, but clamp to at least 2.0 for stability and performance. + float desiredCellSize = std::max(2.0f, cullRadius / 10.0f); + cellSize = desiredCellSize; + + // Add a small margin + const float eps = 1e-3f; + minX = localMinX - eps; + minY = localMinY - eps; + float maxX = localMaxX + eps; + float maxY = localMaxY + eps; + + int w = (int)std::max(1, (int)std::floor((maxX - minX) / cellSize) + 1); + int h = (int)std::max(1, (int)std::floor((maxY - minY) / cellSize) + 1); + gridW = w; + gridH = h; + size_t nCells = (size_t)w * (size_t)h; + + // Allocate grid arrays + cellId.resize(n); + counts.assign(nCells, 0); + cellStart.resize(nCells + 1); + offsets.resize(nCells); + sortedIndices.resize(n); + + // Map particles to cells + #pragma omp parallel for schedule(static) + for (int i = 0; i < (int)n; ++i) { + float fx = (particles[i].position.x - minX) / cellSize; + float fy = (particles[i].position.y - minY) / cellSize; + int ix = (int)std::floor(fx); + int iy = (int)std::floor(fy); + if (ix < 0) ix = 0; else if (ix >= w) ix = w - 1; + if (iy < 0) iy = 0; else if (iy >= h) iy = h - 1; + int cid = ix + iy * w; + cellId[i] = cid; + } + + // Count particles per cell + for (size_t i = 0; i < n; ++i) { + counts[cellId[i]]++; + } + + // Compute prefix sums + cellStart[0] = 0; + for (size_t c = 0; c < nCells; ++c) { + cellStart[c + 1] = cellStart[c] + counts[c]; + } + + // Setup offsets for insertion + for (size_t c = 0; c < nCells; ++c) { + offsets[c] = cellStart[c]; + } + + // Fill sortedIndices + for (size_t i = 0; i < n; ++i) { + int cid = cellId[i]; + int pos = offsets[cid]++; + sortedIndices[pos] = (int)i; + } + + // Precompute neighbor offsets for current grid configuration + precomputeNeighborOffsets(cullRadius); + } + +public: + void init(int numParticles, StepParameters params) override { + numThreads = omp_get_max_threads(); + omp_set_num_threads(numThreads); + + // Prepare persistent buffers + cellId.clear(); + counts.clear(); + cellStart.clear(); + offsets.clear(); + sortedIndices.clear(); + threadForces.clear(); + neighborOffsets.clear(); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + size_t n = particles.size(); + if (n == 0) return; + + // Build 
grid + buildGrid(particles, params.cullRadius); + + // Prepare thread-local force accumulators + ensureForceBuffers(n); + + float r2 = params.cullRadius * params.cullRadius; + float cullRadius = params.cullRadius; + + size_t nCells = (size_t)gridW * (size_t)gridH; + + // Pairwise interactions using symmetric accumulation + #pragma omp parallel for schedule(dynamic, 1) + for (int cell = 0; cell < (int)nCells; ++cell) { + int tid = omp_get_thread_num(); + auto &lf = threadForces[tid]; + + int ix = cell % gridW; + int iy = cell / gridW; + + int startA = cellStart[cell]; + int endA = cellStart[cell + 1]; + + // Intra-cell interactions (i < j) + for (int a = startA; a < endA; ++a) { + int iIdx = sortedIndices[a]; + const Vec2 pi_pos = particles[iIdx].position; + for (int b = a + 1; b < endA; ++b) { + int jIdx = sortedIndices[b]; + const Vec2 pj_pos = particles[jIdx].position; + + float dx = pj_pos.x - pi_pos.x; + float dy = pj_pos.y - pi_pos.y; + float d2 = dx * dx + dy * dy; + if (d2 > r2) continue; + + Vec2 f = computeForce(particles[iIdx], particles[jIdx], cullRadius); + if (f.x != 0.0f || f.y != 0.0f) { + lf[iIdx] += f; + lf[jIdx] -= f; + } + } + } + + // Inter-cell interactions using precomputed neighbor offsets + for (const auto &d : neighborOffsets) { + int jx = ix + d.first; + int jy = iy + d.second; + if ((unsigned)jx >= (unsigned)gridW || (unsigned)jy >= (unsigned)gridH) continue; + + int neighborCell = jx + jy * gridW; + + int startB = cellStart[neighborCell]; + int endB = cellStart[neighborCell + 1]; + + for (int a = startA; a < endA; ++a) { + int iIdx = sortedIndices[a]; + const Vec2 pi_pos = particles[iIdx].position; + for (int b = startB; b < endB; ++b) { + int jIdx = sortedIndices[b]; + const Vec2 pj_pos = particles[jIdx].position; + + float dx = pj_pos.x - pi_pos.x; + float dy = pj_pos.y - pi_pos.y; + float d2 = dx * dx + dy * dy; + if (d2 > r2) continue; + + Vec2 f = computeForce(particles[iIdx], particles[jIdx], cullRadius); + if (f.x != 0.0f || f.y != 0.0f) { + lf[iIdx] += f; + lf[jIdx] -= f; + } + } + } + } + } + + // Reduce thread-local forces and update particles + #pragma omp parallel for schedule(static) + for (int i = 0; i < (int)n; ++i) { + Vec2 totalF(0.0f, 0.0f); + for (int t = 0; t < numThreads; ++t) { + totalF += threadForces[t][i]; + } + newParticles[i] = updateParticle(particles[i], totalF, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/grok4fastreasoning.cpp b/research/solutions/nbody_simulation/random_100k/grok4fastreasoning.cpp new file mode 100644 index 00000000..8d7d97ff --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/grok4fastreasoning.cpp @@ -0,0 +1,102 @@ +#include "world.h" +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads = 16; + +public: + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(numThreads); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + float R = params.cullRadius; + float cell_size = R / 5.0f; + const int NUM_BINS = 41; + const int OFFSET = 20; + const int total_cells = NUM_BINS * NUM_BINS; + std::vector> grid(total_cells); + for (auto& cell : grid) { + cell.reserve(20); + } + + auto get_bin = [&](float p, float cs) -> int { + int b = static_cast(std::floor(p / cs)) + OFFSET; + if (b < 0) return 0; + if (b >= NUM_BINS) return NUM_BINS 
- 1;
+        return b;
+    };
+
+    // Build grid
+    for (size_t i = 0; i < particles.size(); ++i) {
+      float px = particles[i].position.x;
+      float py = particles[i].position.y;
+      int bx = get_bin(px, cell_size);
+      int by = get_bin(py, cell_size);
+      int cid = bx * NUM_BINS + by;
+      grid[cid].push_back(static_cast<int>(i));
+    }
+
+    const float G = 0.01f;
+    const float EPS2 = 1e-6f;
+    const float SOFTEN = 0.1f;
+
+    #pragma omp parallel for schedule(dynamic, 64)
+    for (int i = 0; i < static_cast<int>(particles.size()); ++i) {
+      const Particle& pi = particles[i];
+      Vec2 force{0.0f, 0.0f};
+      float px = pi.position.x;
+      float py = pi.position.y;
+
+      int min_bx = static_cast<int>(std::floor((px - R) / cell_size)) + OFFSET;
+      int max_bx = static_cast<int>(std::floor((px + R) / cell_size)) + OFFSET;
+      int min_by = static_cast<int>(std::floor((py - R) / cell_size)) + OFFSET;
+      int max_by = static_cast<int>(std::floor((py + R) / cell_size)) + OFFSET;
+
+      min_bx = std::max(0, min_bx);
+      max_bx = std::min(NUM_BINS - 1, max_bx);
+      min_by = std::max(0, min_by);
+      max_by = std::min(NUM_BINS - 1, max_by);
+
+      for (int bx = min_bx; bx <= max_bx; ++bx) {
+        for (int by = min_by; by <= max_by; ++by) {
+          int cid = bx * NUM_BINS + by;
+          for (int j : grid[cid]) {
+            if (j == i) continue;
+            const Particle& pj = particles[j];
+            Vec2 dir = pj.position - pi.position;
+            float dist2 = dir.x * dir.x + dir.y * dir.y;
+            if (dist2 < EPS2 || dist2 > R * R) continue;
+            float dist = std::sqrt(dist2);
+            float inv_dist = 1.0f / dist;
+            dir.x *= inv_dist;
+            dir.y *= inv_dist;
+            float d = dist;
+            if (d < SOFTEN) d = SOFTEN;
+            float mag = pi.mass * pj.mass * (G / (d * d));
+            Vec2 f{dir.x * mag, dir.y * mag};
+            if (dist > R * 0.75f) {
+              float decay = 1.0f - (dist - R * 0.75f) / (R * 0.25f);
+              if (decay < 0.0f) decay = 0.0f;
+              f.x *= decay;
+              f.y *= decay;
+            }
+            force.x += f.x;
+            force.y += f.y;
+          }
+        }
+      }
+
+      newParticles[i] = updateParticle(pi, force, params.deltaTime);
+    }
+  }
+};
+
+Simulator* createSimulator() {
+  return new MySimulator();
+}
\ No newline at end of file
diff --git a/research/solutions/nbody_simulation/random_100k/grok4fastreasoning_1.cpp b/research/solutions/nbody_simulation/random_100k/grok4fastreasoning_1.cpp
new file mode 100644
index 00000000..9f4c083e
--- /dev/null
+++ b/research/solutions/nbody_simulation/random_100k/grok4fastreasoning_1.cpp
@@ -0,0 +1,91 @@
+#include "world.h"
+#include <omp.h>
+#include <vector>
+#include <cmath>
+#include <algorithm>
+
+class MySimulator : public Simulator {
+private:
+  float cell_size;
+  float world_size = 100.0f;
+  int numThreads = 16;
+
+public:
+  void init(int numParticles, StepParameters params) override {
+    omp_set_num_threads(numThreads);
+    cell_size = params.cullRadius / 4.0f;
+  }
+
+  void simulateStep(std::vector<Particle> &particles,
+                    std::vector<Particle> &newParticles,
+                    StepParameters params) override {
+    size_t N = particles.size();
+    if (N == 0) return;
+
+    float s = cell_size;
+    int nx = static_cast<int>(std::ceil(world_size / s));
+    int ny = nx;
+    int num_buckets = nx * ny;
+    std::vector<std::vector<int>> buckets(num_buckets);
+
+    for (size_t i = 0; i < N; ++i) {
+      Vec2 pos = particles[i].position;
+      int ix = static_cast<int>(std::floor(pos.x / s));
+      int iy = static_cast<int>(std::floor(pos.y / s));
+      ix = std::max(0, std::min(nx - 1, ix));
+      iy = std::max(0, std::min(ny - 1, iy));
+      int idx = iy * nx + ix;
+      buckets[idx].push_back(static_cast<int>(i));
+    }
+
+    int kr = static_cast<int>(std::ceil(params.cullRadius / s));
+
+    #pragma omp parallel for schedule(static)
+    for (int ii = 0; ii < static_cast<int>(N); ++ii) {
+      int i = ii;
+      const Particle& pi = particles[i];
+      Vec2 force{0.0f, 0.0f};
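+      // Gather-style neighbour search: each particle scans the bucket cells within
+      // kr cells of its own cell and accumulates only its own force. No symmetric
+      // update is attempted, so no per-thread buffers or atomics are needed; each
+      // interacting pair is simply evaluated twice (once from each side).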
+ Vec2 pos = pi.position; + int ix = static_cast(std::floor(pos.x / s)); + int iy = static_cast(std::floor(pos.y / s)); + ix = std::max(0, std::min(nx - 1, ix)); + iy = std::max(0, std::min(ny - 1, iy)); + int min_ix = std::max(0, ix - kr); + int max_ix = std::min(nx - 1, ix + kr); + int min_iy = std::max(0, iy - kr); + int max_iy = std::min(ny - 1, iy + kr); + + for (int cy = min_iy; cy <= max_iy; ++cy) { + for (int cx = min_ix; cx <= max_ix; ++cx) { + int bidx = cy * nx + cx; + const auto& neigh = buckets[bidx]; + for (int j : neigh) { + if (j == i) continue; + Vec2 dir = particles[j].position - pos; + float dist2 = dir.length2(); + float r2 = params.cullRadius * params.cullRadius; + float min_dist2 = 1e-6f; + if (dist2 < min_dist2 || dist2 > r2) continue; + float dist = std::sqrt(dist2); + dir *= (1.0f / dist); + float d = dist; + if (d < 0.1f) d = 0.1f; + const float G = 0.01f; + float scalar = pi.mass * particles[j].mass * (G / (d * d)); + Vec2 force_add = dir * scalar; + if (dist > params.cullRadius * 0.75f) { + float decay = 1.0f - (dist - params.cullRadius * 0.75f) / (params.cullRadius * 0.25f); + force_add *= decay; + } + force += force_add; + } + } + } + newParticles[i] = updateParticle(pi, force, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/grok4fastreasoning_2.cpp b/research/solutions/nbody_simulation/random_100k/grok4fastreasoning_2.cpp new file mode 100644 index 00000000..1f4af0aa --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/grok4fastreasoning_2.cpp @@ -0,0 +1,110 @@ +#include "world.h" +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads = 16; + +public: + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(numThreads); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + size_t N = particles.size(); + newParticles.resize(N); + if (N == 0) return; + + int N_ = static_cast(N); + + // Compute bounds + float min_x = std::numeric_limits::max(); + float max_x = std::numeric_limits::lowest(); + float min_y = min_x; + float max_y = max_x; + for (const auto& p : particles) { + float x = p.position.x; + float y = p.position.y; + if (x < min_x) min_x = x; + if (x > max_x) max_x = x; + if (y < min_y) min_y = y; + if (y > max_y) max_y = y; + } + + float cell_size = params.cullRadius / 2.0f; + float span_x = max_x - min_x; + int num_cols = 1; + if (span_x > 0.0f) { + num_cols = static_cast(std::floor(span_x / cell_size)) + 1; + } + float span_y = max_y - min_y; + int num_rows = 1; + if (span_y > 0.0f) { + num_rows = static_cast(std::floor(span_y / cell_size)) + 1; + } + int total_cells = num_rows * num_cols; + + std::vector cell_ids(N_); + #pragma omp parallel for + for (int i = 0; i < N_; ++i) { + float dx = (particles[i].position.x - min_x) / cell_size; + float dy = (particles[i].position.y - min_y) / cell_size; + int col = static_cast(std::floor(dx)); + int row = static_cast(std::floor(dy)); + col = std::max(0, std::min(num_cols - 1, col)); + row = std::max(0, std::min(num_rows - 1, row)); + cell_ids[i] = row * num_cols + col; + } + + std::vector> grid(total_cells); + for (int i = 0; i < N_; ++i) { + grid[cell_ids[i]].push_back(i); + } + + float cull2 = params.cullRadius * params.cullRadius; + float eps2 = 1e-6f; + int search_dist = static_cast(std::ceil(params.cullRadius / 
cell_size)); + + #pragma omp parallel for schedule(dynamic, 256) + for (int i = 0; i < N_; ++i) { + const Particle& pi = particles[i]; + Vec2 force{0.0f, 0.0f}; + + float dx = (pi.position.x - min_x) / cell_size; + float dy = (pi.position.y - min_y) / cell_size; + int col = static_cast<int>(std::floor(dx)); + int row = static_cast<int>(std::floor(dy)); + col = std::max(0, std::min(num_cols - 1, col)); + row = std::max(0, std::min(num_rows - 1, row)); + + for (int drow = -search_dist; drow <= search_dist; ++drow) { + int trow = row + drow; + if (trow < 0 || trow >= num_rows) continue; + for (int dcol = -search_dist; dcol <= search_dist; ++dcol) { + int tcol = col + dcol; + if (tcol < 0 || tcol >= num_cols) continue; + int tcell = trow * num_cols + tcol; + for (int j : grid[tcell]) { + if (j == i) continue; + Vec2 dir = particles[j].position - pi.position; + float dist2 = dir.length2(); + if (dist2 > eps2 && dist2 < cull2) { + force += computeForce(pi, particles[j], params.cullRadius); + } + } + } + } + + newParticles[i] = updateParticle(pi, force, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/grok4fastreasoning_3.cpp b/research/solutions/nbody_simulation/random_100k/grok4fastreasoning_3.cpp new file mode 100644 index 00000000..91148855 --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/grok4fastreasoning_3.cpp @@ -0,0 +1,85 @@ +#include "world.h" +#include <omp.h> +#include <cmath> +#include <vector> +#include <algorithm> + +class MySimulator : public Simulator { +private: + int numThreads = 16; + +public: + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(numThreads); + } + + void simulateStep(std::vector<Particle> &particles, + std::vector<Particle> &newParticles, + StepParameters params) override { + if (particles.empty()) { + newParticles.clear(); + return; + } + size_t N = particles.size(); + newParticles.resize(N); + + float min_x = particles[0].position.x; + float max_x = min_x; + float min_y = particles[0].position.y; + float max_y = min_y; + for (const auto& p : particles) { + min_x = std::min(min_x, p.position.x); + max_x = std::max(max_x, p.position.x); + min_y = std::min(min_y, p.position.y); + max_y = std::max(max_y, p.position.y); + } + float width_x = max_x - min_x; + float width_y = max_y - min_y; + const float CELL_SIZE = 5.0f; + int nx = static_cast<int>(std::floor(width_x / CELL_SIZE)) + 1; + int ny = static_cast<int>(std::floor(width_y / CELL_SIZE)) + 1; + if (nx <= 0) nx = 1; + if (ny <= 0) ny = 1; + + std::vector<std::vector<std::vector<int>>> cell_lists( + nx, std::vector<std::vector<int>>(ny)); + for (size_t i = 0; i < N; ++i) { + const auto& p = particles[i]; + int ix = static_cast<int>(std::floor((p.position.x - min_x) / CELL_SIZE)); + int iy = static_cast<int>(std::floor((p.position.y - min_y) / CELL_SIZE)); + cell_lists[ix][iy].push_back(static_cast<int>(i)); + } + + int k = static_cast<int>(std::ceil(params.cullRadius / CELL_SIZE)) + 1; + float r2 = params.cullRadius * params.cullRadius; + float eps2 = 1e-6f; + +#pragma omp parallel for schedule(dynamic, 1000) + for (int i = 0; i < static_cast<int>(N); ++i) { + const Particle& pi = particles[i]; + Vec2 force(0.0f, 0.0f); + int ix = static_cast<int>(std::floor((pi.position.x - min_x) / CELL_SIZE)); + int iy = static_cast<int>(std::floor((pi.position.y - min_y) / CELL_SIZE)); + int start_ix = std::max(0, ix - k); + int end_ix = std::min(nx - 1, ix + k); + int start_iy = std::max(0, iy - k); + int end_iy = std::min(ny - 1, iy + k); + for (int cix = start_ix; cix <= end_ix; ++cix) {
+ for (int ciy = start_iy; ciy <= end_iy; ++ciy) { + for (int j : cell_lists[cix][ciy]) { + if (j == i) continue; + Vec2 dir = particles[j].position - pi.position; + float dist2 = dir.length2(); + if (dist2 > r2 || dist2 < eps2) continue; + force += computeForce(pi, particles[j], params.cullRadius); + } + } + } + newParticles[i] = updateParticle(pi, force, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_100k/grok4fastreasoning_4.cpp b/research/solutions/nbody_simulation/random_100k/grok4fastreasoning_4.cpp new file mode 100644 index 00000000..f2af9384 --- /dev/null +++ b/research/solutions/nbody_simulation/random_100k/grok4fastreasoning_4.cpp @@ -0,0 +1,144 @@ +#include "world.h" +#include <omp.h> +#include <cmath> +#include <vector> +#include <algorithm> +#include <limits> +#include <numeric> + +class MySimulator : public Simulator { +private: + int numThreads = 16; + + struct QuadNode { + Vec2 minPos, maxPos; + std::vector<int> particleIndices; + QuadNode* children[4] = {nullptr, nullptr, nullptr, nullptr}; + + QuadNode(Vec2 mn, Vec2 mx) : minPos(mn), maxPos(mx) {} + ~QuadNode() { + for (int i = 0; i < 4; ++i) { + delete children[i]; + } + } + }; + + void build(QuadNode* node, const std::vector<Particle>& particles, std::vector<int>& indices, float minSize) { + if (indices.empty()) return; + + float width = node->maxPos.x - node->minPos.x; + float height = node->maxPos.y - node->minPos.y; + if (width <= minSize || height <= minSize || indices.size() <= 4) { + node->particleIndices = std::move(indices); + return; + } + + float midX = (node->minPos.x + node->maxPos.x) * 0.5f; + float midY = (node->minPos.y + node->maxPos.y) * 0.5f; + + std::vector<int> childIndices[4]; + for (int idx : indices) { + const Vec2& pos = particles[idx].position; + int q; + if (pos.x <= midX) { + q = (pos.y <= midY) ? 0 : 1; + } else { + q = (pos.y <= midY) ?
2 : 3; + } + childIndices[q].push_back(idx); + } + + Vec2 cmin[4] = { + {node->minPos.x, node->minPos.y}, + {node->minPos.x, midY}, + {midX, node->minPos.y}, + {midX, midY} + }; + Vec2 cmax[4] = { + {midX, midY}, + {midX, node->maxPos.y}, + {node->maxPos.x, midY}, + {node->maxPos.x, node->maxPos.y} + }; + + for (int q = 0; q < 4; ++q) { + node->children[q] = new QuadNode(cmin[q], cmax[q]); + if (!childIndices[q].empty()) { + build(node->children[q], particles, childIndices[q], minSize); + } else { + delete node->children[q]; + node->children[q] = nullptr; + } + } + } + + void accumulateForce(QuadNode* node, const Particle& target, float cullRadius, Vec2& totalForce, const std::vector& particles) { + if (!node) return; + + Vec2 qpos = target.position; + float r2 = cullRadius * cullRadius; + + Vec2 bmin = node->minPos; + Vec2 bmax = node->maxPos; + + float dx = std::max(bmin.x - qpos.x, std::max(0.0f, qpos.x - bmax.x)); + float dy = std::max(bmin.y - qpos.y, std::max(0.0f, qpos.y - bmax.y)); + float minD2 = dx * dx + dy * dy; + if (minD2 > r2) return; + + if (!node->particleIndices.empty()) { + for (int j : node->particleIndices) { + totalForce += computeForce(target, particles[j], cullRadius); + } + } else { + for (int q = 0; q < 4; ++q) { + accumulateForce(node->children[q], target, cullRadius, totalForce, particles); + } + } + } + +public: + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(numThreads); + } + + void simulateStep(std::vector& particles, + std::vector& newParticles, + StepParameters params) override { + size_t N = particles.size(); + if (N == 0) return; + + float INF = std::numeric_limits::infinity(); + float minX = INF, minY = INF, maxX = -INF, maxY = -INF; + for (const auto& p : particles) { + minX = std::min(minX, p.position.x); + minY = std::min(minY, p.position.y); + maxX = std::max(maxX, p.position.x); + maxY = std::max(maxY, p.position.y); + } + + if (minX >= maxX || minY >= maxY) { + newParticles = particles; + return; + } + + QuadNode* root = new QuadNode({minX, minY}, {maxX, maxY}); + std::vector indices(N); + std::iota(indices.begin(), indices.end(), 0); + build(root, particles, indices, 1e-3f); + + #pragma omp parallel for schedule(dynamic, 1) + for (size_t i = 0; i < N; ++i) { + const auto& pi = particles[i]; + Vec2 force = {0.0f, 0.0f}; + accumulateForce(root, pi, params.cullRadius, force, particles); + newParticles[i] = updateParticle(pi, force, params.deltaTime); + } + + delete root; + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_10k/gemini2.5pro.cpp b/research/solutions/nbody_simulation/random_10k/gemini2.5pro.cpp new file mode 100644 index 00000000..0b194241 --- /dev/null +++ b/research/solutions/nbody_simulation/random_10k/gemini2.5pro.cpp @@ -0,0 +1,128 @@ +#include "world.h" +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads = 16; + std::vector> particle_cell_pairs; + std::vector cell_starts; + +public: + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(numThreads); + if (particle_cell_pairs.size() != (size_t)numParticles) { + particle_cell_pairs.resize(numParticles); + } + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + const int n = particles.size(); + if (n == 0) { + return; + } + + float min_x = std::numeric_limits::max(); + float max_x = 
std::numeric_limits::lowest(); + float min_y = std::numeric_limits::max(); + float max_y = std::numeric_limits::lowest(); + + #pragma omp parallel for reduction(min:min_x, min_y) reduction(max:max_x, max_y) + for (int i = 0; i < n; ++i) { + min_x = std::min(min_x, particles[i].position.x); + max_x = std::max(max_x, particles[i].position.x); + min_y = std::min(min_y, particles[i].position.y); + max_y = std::max(max_y, particles[i].position.y); + } + + const float cell_size = params.cullRadius; + const float domain_width = max_x - min_x; + const float domain_height = max_y - min_y; + + const int grid_cols = std::max(1, static_cast(std::ceil(domain_width / cell_size))); + const int grid_rows = std::max(1, static_cast(std::ceil(domain_height / cell_size))); + const int num_cells = grid_cols * grid_rows; + + #pragma omp parallel for + for (int i = 0; i < n; ++i) { + const auto& p = particles[i]; + int cell_x = static_cast((p.position.x - min_x) / cell_size); + int cell_y = static_cast((p.position.y - min_y) / cell_size); + + cell_x = std::max(0, std::min(grid_cols - 1, cell_x)); + cell_y = std::max(0, std::min(grid_rows - 1, cell_y)); + + int cell_idx = cell_y * grid_cols + cell_x; + particle_cell_pairs[i] = {cell_idx, i}; + } + + std::sort(particle_cell_pairs.begin(), particle_cell_pairs.end()); + + if (cell_starts.size() < (size_t)num_cells + 1) { + cell_starts.resize(num_cells + 1); + } + + cell_starts[0] = 0; + int last_cell_idx = -1; + if (n > 0) { + last_cell_idx = particle_cell_pairs[0].first; + for(int c = 0; c <= last_cell_idx; ++c) { + cell_starts[c] = 0; + } + } + for (int i = 1; i < n; ++i) { + int current_cell_idx = particle_cell_pairs[i].first; + if (current_cell_idx != last_cell_idx) { + for(int c = last_cell_idx + 1; c <= current_cell_idx; ++c) { + cell_starts[c] = i; + } + last_cell_idx = current_cell_idx; + } + } + for (int c = last_cell_idx + 1; c <= num_cells; ++c) { + cell_starts[c] = n; + } + + #pragma omp parallel for schedule(dynamic, 64) + for (int i = 0; i < n; ++i) { + const auto& pi = particles[i]; + Vec2 total_force = {0.0f, 0.0f}; + + int cell_x = static_cast((pi.position.x - min_x) / cell_size); + int cell_y = static_cast((pi.position.y - min_y) / cell_size); + cell_x = std::max(0, std::min(grid_cols - 1, cell_x)); + cell_y = std::max(0, std::min(grid_rows - 1, cell_y)); + + for (int dy = -1; dy <= 1; ++dy) { + for (int dx = -1; dx <= 1; ++dx) { + int neighbor_cx = cell_x + dx; + int neighbor_cy = cell_y + dy; + + if (neighbor_cx >= 0 && neighbor_cx < grid_cols && + neighbor_cy >= 0 && neighbor_cy < grid_rows) + { + int neighbor_cell_idx = neighbor_cy * grid_cols + neighbor_cx; + int start_idx = cell_starts[neighbor_cell_idx]; + int end_idx = cell_starts[neighbor_cell_idx + 1]; + + for (int k = start_idx; k < end_idx; ++k) { + int j = particle_cell_pairs[k].second; + if (i == j) continue; + total_force += computeForce(pi, particles[j], params.cullRadius); + } + } + } + } + newParticles[i] = updateParticle(pi, total_force, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_10k/gemini2.5pro_1.cpp b/research/solutions/nbody_simulation/random_10k/gemini2.5pro_1.cpp new file mode 100644 index 00000000..0675111f --- /dev/null +++ b/research/solutions/nbody_simulation/random_10k/gemini2.5pro_1.cpp @@ -0,0 +1,171 @@ +#include "world.h" +#include +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: 
+ int numThreads; + + // Grid properties + Vec2 min_coord; + float cell_size; + int grid_dim_x; + int grid_dim_y; + + // Grid data structures in CSR-like format + std::vector cell_particle_indices; + std::vector cell_starts; + + // A temporary buffer used for building the grid to avoid reallocations + std::vector> particle_cell_pairs; + +public: + void init(int numParticles, StepParameters params) override { + // Use all available vCPUs on the c7i.4xlarge instance + numThreads = 16; + omp_set_num_threads(numThreads); + + // Pre-allocate memory to avoid reallocations during simulation steps + if (particle_cell_pairs.capacity() < (size_t)numParticles) { + particle_cell_pairs.reserve(numParticles); + } + if (cell_particle_indices.capacity() < (size_t)numParticles) { + cell_particle_indices.reserve(numParticles); + } + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + const size_t n = particles.size(); + if (n == 0) return; + + // 1. Determine the bounding box of all particles in parallel + Vec2 global_min_p = { FLT_MAX, FLT_MAX }; + Vec2 global_max_p = { -FLT_MAX, -FLT_MAX }; + + #pragma omp parallel + { + Vec2 local_min_p = { FLT_MAX, FLT_MAX }; + Vec2 local_max_p = { -FLT_MAX, -FLT_MAX }; + + #pragma omp for nowait + for (size_t i = 0; i < n; ++i) { + const auto& pos = particles[i].position; + local_min_p.x = std::min(local_min_p.x, pos.x); + local_min_p.y = std::min(local_min_p.y, pos.y); + local_max_p.x = std::max(local_max_p.x, pos.x); + local_max_p.y = std::max(local_max_p.y, pos.y); + } + + #pragma omp critical + { + global_min_p.x = std::min(global_min_p.x, local_min_p.x); + global_min_p.y = std::min(global_min_p.y, local_min_p.y); + global_max_p.x = std::max(global_max_p.x, local_max_p.x); + global_max_p.y = std::max(global_max_p.y, local_max_p.y); + } + } + + min_coord = global_min_p; + cell_size = params.cullRadius; + if (cell_size <= 1e-5f) cell_size = 1.0f; + + grid_dim_x = static_cast((global_max_p.x - min_coord.x) / cell_size) + 1; + grid_dim_y = static_cast((global_max_p.y - min_coord.y) / cell_size) + 1; + const int num_cells = grid_dim_x * grid_dim_y; + + // 2. Build the spatial grid + // Assign each particle to a cell + particle_cell_pairs.resize(n); + #pragma omp parallel for + for (size_t i = 0; i < n; ++i) { + const auto& p_pos = particles[i].position; + int cx = static_cast((p_pos.x - min_coord.x) / cell_size); + int cy = static_cast((p_pos.y - min_coord.y) / cell_size); + + // Clamp coordinates to be within grid bounds to handle floating point edge cases + cx = std::max(0, std::min(cx, grid_dim_x - 1)); + cy = std::max(0, std::min(cy, grid_dim_y - 1)); + + particle_cell_pairs[i] = {cy * grid_dim_x + cx, static_cast(i)}; + } + + // Sort particles by their cell index. This groups particles in the same cell together. + std::sort(particle_cell_pairs.begin(), particle_cell_pairs.end()); + + // Create the CSR-like representation of the grid + cell_particle_indices.resize(n); + cell_starts.assign(num_cells + 1, 0); + + #pragma omp parallel for + for (size_t i = 0; i < n; ++i) { + cell_particle_indices[i] = particle_cell_pairs[i].second; + } + + // This serial part builds the 'starts' array. It's O(N + num_cells), which is very fast. 
+ if (n > 0) { + int last_cell_id = -1; + for (size_t i = 0; i < n; ++i) { + int current_cell_id = particle_cell_pairs[i].first; + if (current_cell_id > last_cell_id) { + for (int c = last_cell_id + 1; c <= current_cell_id; ++c) { + cell_starts[c] = i; + } + last_cell_id = current_cell_id; + } + } + for (int c = last_cell_id + 1; c <= num_cells; ++c) { + cell_starts[c] = n; + } + } + + // 3. Compute forces in parallel using the grid for neighbor finding + #pragma omp parallel for schedule(dynamic, 16) + for (size_t i = 0; i < n; ++i) { + const auto& pi = particles[i]; + Vec2 force = {0.0f, 0.0f}; + + const auto& p_pos = pi.position; + int cx = static_cast((p_pos.x - min_coord.x) / cell_size); + int cy = static_cast((p_pos.y - min_coord.y) / cell_size); + + // Iterate over the 3x3 block of cells around the particle's cell + for (int dy = -1; dy <= 1; ++dy) { + for (int dx = -1; dx <= 1; ++dx) { + int neighbor_cx = cx + dx; + int neighbor_cy = cy + dy; + + // Check if the neighboring cell is within the grid bounds + if (neighbor_cx >= 0 && neighbor_cx < grid_dim_x && + neighbor_cy >= 0 && neighbor_cy < grid_dim_y) { + + int cell_idx = neighbor_cy * grid_dim_x + neighbor_cx; + int start_idx = cell_starts[cell_idx]; + int end_idx = cell_starts[cell_idx + 1]; + + // Iterate over particles in the neighboring cell + for (int k = start_idx; k < end_idx; ++k) { + int j = cell_particle_indices[k]; + if (static_cast(j) == i) continue; + + // Check distance and compute force, matching baseline logic for correctness + if ((pi.position - particles[j].position).length() < params.cullRadius) { + force += computeForce(pi, particles[j], params.cullRadius); + } + } + } + } + } + newParticles[i] = updateParticle(pi, force, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_10k/gemini2.5pro_2.cpp b/research/solutions/nbody_simulation/random_10k/gemini2.5pro_2.cpp new file mode 100644 index 00000000..0f5c57e1 --- /dev/null +++ b/research/solutions/nbody_simulation/random_10k/gemini2.5pro_2.cpp @@ -0,0 +1,144 @@ +#include "world.h" +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + // Using 16 threads for the 16 vCPUs of the c7i.4xlarge instance. + static constexpr int NUM_THREADS = 16; + // World size as specified in evaluation details. Assumes a square world from [0, worldSize]. + static constexpr float WORLD_SIZE = 100.0f; + // A finer grid gives better performance by reducing unnecessary pair checks. + // cell size = cullRadius / this divisor. 4.0f is a good trade-off. + static constexpr float CELL_SIZE_DIVISOR = 4.0f; + + // Grid data structures, initialized in init() and reused in each simulateStep(). + float cellSize; + float invCellSize; + int gridDimX; + int gridDimY; + + // Per-particle, stores the 1D index of the grid cell it belongs to. + std::vector particleCellIndices; + // An array of particle indices [0, 1, ..., N-1] that will be sorted + // based on cell index. + std::vector sortedParticleIds; + // For each cell, stores the starting index in sortedParticleIds. + // The end index is the start index of the next cell. 
+ std::vector cellStartIndices; + +public: + MySimulator() = default; + ~MySimulator() override = default; + + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(NUM_THREADS); + + cellSize = params.cullRadius / CELL_SIZE_DIVISOR; + invCellSize = 1.0f / cellSize; + gridDimX = static_cast(std::ceil(WORLD_SIZE / cellSize)); + gridDimY = static_cast(std::ceil(WORLD_SIZE / cellSize)); + + // Pre-allocate memory for our data structures to avoid reallocations. + particleCellIndices.resize(numParticles); + sortedParticleIds.resize(numParticles); + cellStartIndices.resize(gridDimX * gridDimY + 1); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + + const int numParticles = static_cast(particles.size()); + if (numParticles == 0) return; + + const int numCells = gridDimX * gridDimY; + + // Step 1: Assign particles to grid cells. This is a data-parallel operation. + #pragma omp parallel for + for (int i = 0; i < numParticles; ++i) { + const auto& p = particles[i]; + + // Calculate 2D grid cell coordinates. + int cellX = static_cast(p.position.x * invCellSize); + int cellY = static_cast(p.position.y * invCellSize); + + // Clamp coordinates to be within grid bounds. + cellX = std::max(0, std::min(gridDimX - 1, cellX)); + cellY = std::max(0, std::min(gridDimY - 1, cellY)); + + // Store the 1D cell index. + particleCellIndices[i] = cellX + cellY * gridDimX; + sortedParticleIds[i] = i; + } + + // Step 2: Sort particle IDs based on their cell index. + // This brings particles in the same cell together in the sortedParticleIds array. + // std::sort is sequential but very fast for N=10000. + std::sort(sortedParticleIds.begin(), sortedParticleIds.end(), + [&](int a, int b) { + return particleCellIndices[a] < particleCellIndices[b]; + }); + + // Step 3: Find the start index for each cell in the sorted list. + // This allows quick access to all particles in a cell. + // This is a sequential scan, but it's O(N + numCells), which is fast. + std::fill(cellStartIndices.begin(), cellStartIndices.end(), numParticles); + cellStartIndices[numCells] = numParticles; // Sentinel for the last cell. + int lastCellIdx = -1; + for (int i = 0; i < numParticles; ++i) { + int p_id = sortedParticleIds[i]; + int cellIdx = particleCellIndices[p_id]; + if (cellIdx > lastCellIdx) { + for (int c = lastCellIdx + 1; c <= cellIdx; ++c) { + cellStartIndices[c] = i; + } + lastCellIdx = cellIdx; + } + } + + // Step 4: Compute forces in parallel using the grid. + #pragma omp parallel for schedule(dynamic, 16) + for (int i = 0; i < numParticles; ++i) { + const Particle& pi = particles[i]; + Vec2 totalForce = {0.0f, 0.0f}; + + const int cellIdx = particleCellIndices[i]; + const int cellY = cellIdx / gridDimX; + const int cellX = cellIdx % gridDimX; + + // Determine the search area around the particle's cell. + // This needs to be large enough to cover the cullRadius. + const int searchRadius = static_cast(std::ceil(params.cullRadius * invCellSize)); + + // Iterate over the neighboring cells. + for (int ny = cellY - searchRadius; ny <= cellY + searchRadius; ++ny) { + for (int nx = cellX - searchRadius; nx <= cellX + searchRadius; ++nx) { + // Check if the neighbor cell is within grid bounds. + if (nx >= 0 && nx < gridDimX && ny >= 0 && ny < gridDimY) { + int neighborCellIdx = nx + ny * gridDimX; + int start = cellStartIndices[neighborCellIdx]; + int end = cellStartIndices[neighborCellIdx + 1]; + + // Iterate over particles in the neighbor cell. 
+ for (int k = start; k < end; ++k) { + int j = sortedParticleIds[k]; + // A particle does not exert force on itself. + if (i == j) continue; + const Particle& pj = particles[j]; + totalForce += computeForce(pi, pj, params.cullRadius); + } + } + } + } + newParticles[i] = updateParticle(pi, totalForce, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_10k/gemini2.5pro_3.cpp b/research/solutions/nbody_simulation/random_10k/gemini2.5pro_3.cpp new file mode 100644 index 00000000..d3b13821 --- /dev/null +++ b/research/solutions/nbody_simulation/random_10k/gemini2.5pro_3.cpp @@ -0,0 +1,118 @@ +#include "world.h" +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads = 16; + + // Persistent buffers to avoid reallocations in each simulation step + std::vector> particle_cell_map; + std::vector sorted_particle_indices; + std::vector cell_starts; + +public: + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(numThreads); + // Resize persistent buffers only if the number of particles changes + if (particle_cell_map.size() != (size_t)numParticles) { + particle_cell_map.resize(numParticles); + sorted_particle_indices.resize(numParticles); + } + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + const int n = particles.size(); + if (n == 0) return; + + // Step 1: Compute the bounding box of all particles in parallel. + float min_x = particles[0].position.x; + float max_x = particles[0].position.x; + float min_y = particles[0].position.y; + float max_y = particles[0].position.y; + + #pragma omp parallel for reduction(min:min_x) reduction(max:max_x) reduction(min:min_y) reduction(max:max_y) + for (int i = 1; i < n; ++i) { + min_x = std::min(min_x, particles[i].position.x); + max_x = std::max(max_x, particles[i].position.x); + min_y = std::min(min_y, particles[i].position.y); + max_y = std::max(max_y, particles[i].position.y); + } + + // Step 2: Set up the spatial grid. + const float cell_size = params.cullRadius; + const int grid_width = (max_x > min_x) ? static_cast(std::ceil((max_x - min_x) / cell_size)) : 1; + const int grid_height = (max_y > min_y) ? static_cast(std::ceil((max_y - min_y) / cell_size)) : 1; + const int grid_size = grid_width * grid_height; + + cell_starts.assign(grid_size + 1, 0); + + // Step 3: Assign each particle to a grid cell in parallel. + #pragma omp parallel for + for (int i = 0; i < n; ++i) { + int cx = static_cast((particles[i].position.x - min_x) / cell_size); + int cy = static_cast((particles[i].position.y - min_y) / cell_size); + cx = std::max(0, std::min(cx, grid_width - 1)); + cy = std::max(0, std::min(cy, grid_height - 1)); + particle_cell_map[i] = {cy * grid_width + cx, i}; + } + + // Step 4: Sort particles based on their cell index. + std::sort(particle_cell_map.begin(), particle_cell_map.end()); + + // Step 5: Create a lookup structure (cell_starts) to quickly find particles in a cell. 
+ int current_cell = -1; + for (int i = 0; i < n; ++i) { + int cell_idx = particle_cell_map[i].first; + sorted_particle_indices[i] = particle_cell_map[i].second; + if (cell_idx != current_cell) { + for (int c = current_cell + 1; c <= cell_idx; ++c) { + cell_starts[c] = i; + } + current_cell = cell_idx; + } + } + for (int c = current_cell + 1; c <= grid_size; ++c) { + cell_starts[c] = n; + } + + // Step 6: Compute forces in parallel using the grid. + #pragma omp parallel for schedule(dynamic, 16) + for (int i = 0; i < n; ++i) { + const Particle& p_i = particles[i]; + Vec2 total_force = {0.0f, 0.0f}; + + int cx = static_cast((p_i.position.x - min_x) / cell_size); + int cy = static_cast((p_i.position.y - min_y) / cell_size); + cx = std::max(0, std::min(cx, grid_width - 1)); + cy = std::max(0, std::min(cy, grid_height - 1)); + + // Iterate over the 3x3 block of neighboring cells + for (int neighbor_cy = std::max(0, cy - 1); neighbor_cy <= std::min(grid_height - 1, cy + 1); ++neighbor_cy) { + for (int neighbor_cx = std::max(0, cx - 1); neighbor_cx <= std::min(grid_width - 1, cx + 1); ++neighbor_cx) { + int neighbor_cell_idx = neighbor_cy * grid_width + neighbor_cx; + + int start_idx = cell_starts[neighbor_cell_idx]; + int end_idx = cell_starts[neighbor_cell_idx + 1]; + + // Iterate over particles in the neighbor cell + for (int k = start_idx; k < end_idx; ++k) { + int j = sorted_particle_indices[k]; + if (i == j) continue; + + total_force += computeForce(p_i, particles[j], params.cullRadius); + } + } + } + newParticles[i] = updateParticle(p_i, total_force, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/research/solutions/nbody_simulation/random_10k/gemini2.5pro_4.cpp b/research/solutions/nbody_simulation/random_10k/gemini2.5pro_4.cpp new file mode 100644 index 00000000..a245e791 --- /dev/null +++ b/research/solutions/nbody_simulation/random_10k/gemini2.5pro_4.cpp @@ -0,0 +1,135 @@ +#include "world.h" +#include +#include +#include +#include +#include + +class MySimulator : public Simulator { +private: + int numThreads = 16; + + // Grid-related members, recalculated each step + Vec2 worldMin; + float cellSize; + int gridWidth; + int gridHeight; + + // Pre-allocated grid structure and buffers + std::vector> grid; + std::vector localMins; + std::vector localMaxs; + +public: + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(numThreads); + if (localMins.size() < (size_t)numThreads) { + localMins.resize(numThreads); + localMaxs.resize(numThreads); + } + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + + const int n = particles.size(); + if (n == 0) { + return; + } + + // Step 1: Determine world bounds using a parallel reduction. 
+ #pragma omp parallel + { + int tid = omp_get_thread_num(); + Vec2 lmin = { std::numeric_limits::max(), std::numeric_limits::max() }; + Vec2 lmax = { std::numeric_limits::lowest(), std::numeric_limits::lowest() }; + + #pragma omp for nowait + for (int i = 0; i < n; ++i) { + lmin.x = std::min(lmin.x, particles[i].position.x); + lmin.y = std::min(lmin.y, particles[i].position.y); + lmax.x = std::max(lmax.x, particles[i].position.x); + lmax.y = std::max(lmax.y, particles[i].position.y); + } + localMins[tid] = lmin; + localMaxs[tid] = lmax; + } + + Vec2 globalMin = localMins[0]; + Vec2 globalMax = localMaxs[0]; + for(int i = 1; i < numThreads; ++i) { + globalMin.x = std::min(globalMin.x, localMins[i].x); + globalMin.y = std::min(globalMin.y, localMins[i].y); + globalMax.x = std::max(globalMax.x, localMaxs[i].x); + globalMax.y = std::max(globalMax.y, localMaxs[i].y); + } + + worldMin = globalMin; + Vec2 worldMax = globalMax; + + // Add a small buffer to prevent particles from falling outside the grid. + worldMin.x -= 0.1f; + worldMin.y -= 0.1f; + worldMax.x += 0.1f; + worldMax.y += 0.1f; + + // Step 2: Setup grid based on world bounds and cull radius. + cellSize = params.cullRadius; + gridWidth = static_cast((worldMax.x - worldMin.x) / cellSize) + 1; + gridHeight = static_cast((worldMax.y - worldMin.y) / cellSize) + 1; + int numCells = gridWidth * gridHeight; + + // Reuse grid memory across steps. + if (grid.size() < (size_t)numCells) { + grid.resize(numCells); + } + for (int i = 0; i < numCells; ++i) { + grid[i].clear(); + } + + // Step 3: Populate grid with particle indices. This is sequential but fast. + for (int i = 0; i < n; ++i) { + int cx = static_cast((particles[i].position.x - worldMin.x) / cellSize); + int cy = static_cast((particles[i].position.y - worldMin.y) / cellSize); + cx = std::max(0, std::min(cx, gridWidth - 1)); + cy = std::max(0, std::min(cy, gridHeight - 1)); + grid[cx + cy * gridWidth].push_back(i); + } + + // Step 4: Compute forces and update particles in parallel. + #pragma omp parallel for schedule(dynamic, 16) + for (int i = 0; i < n; ++i) { + Vec2 totalForce = {0.0f, 0.0f}; + + // Find the cell for the current particle. + int cx = static_cast((particles[i].position.x - worldMin.x) / cellSize); + int cy = static_cast((particles[i].position.y - worldMin.y) / cellSize); + cx = std::max(0, std::min(cx, gridWidth - 1)); + cy = std::max(0, std::min(cy, gridHeight - 1)); + + // Iterate over the 3x3 block of cells around the particle's cell. 
+ for (int dx = -1; dx <= 1; ++dx) { + for (int dy = -1; dy <= 1; ++dy) { + int neighbor_cx = cx + dx; + int neighbor_cy = cy + dy; + + if (neighbor_cx >= 0 && neighbor_cx < gridWidth && + neighbor_cy >= 0 && neighbor_cy < gridHeight) { + + int cellIndex = neighbor_cx + neighbor_cy * gridWidth; + for (int j_idx : grid[cellIndex]) { + if (i == j_idx) continue; + totalForce += computeForce(particles[i], particles[j_idx], params.cullRadius); + } + } + } + } + newParticles[i] = updateParticle(particles[i], totalForce, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new MySimulator(); +} \ No newline at end of file diff --git a/scripts/run_eval.sh b/scripts/run_eval.sh index 19112889..0ce8da86 100755 --- a/scripts/run_eval.sh +++ b/scripts/run_eval.sh @@ -466,7 +466,7 @@ fi mkdir -p "$RESULTS_DIR" # Build command -CMD="uv run frontier-eval batch $TRACK" +CMD="uv run frontier batch $TRACK" CMD="$CMD --solutions-dir $SOLUTIONS_DIR" CMD="$CMD --results-dir $RESULTS_DIR" CMD="$CMD --problems-dir $PROBLEMS_DIR" diff --git a/scripts/validate_problems.py b/scripts/validate_problems.py index 64287086..4b4b4e29 100644 --- a/scripts/validate_problems.py +++ b/scripts/validate_problems.py @@ -12,12 +12,13 @@ """ import argparse -import json -import subprocess import sys from pathlib import Path from typing import Optional +from frontier_cs.config import get_problem_extension +from frontier_cs.single_evaluator import SingleEvaluator + def find_reference_solution(track: str, problem_id: str) -> Optional[Path]: """ @@ -32,8 +33,10 @@ def find_reference_solution(track: str, problem_id: str) -> Optional[Path]: if ref_path.exists(): return ref_path else: - # Research: reference.py in problem directory - ref_path = Path(f"research/problems/{problem_id}/reference.py") + # Research: extension based on config.yaml language field + problem_path = Path(f"research/problems/{problem_id}") + ext = get_problem_extension(problem_path) + ref_path = problem_path / f"reference.{ext}" if ref_path.exists(): return ref_path return None @@ -43,62 +46,23 @@ def run_evaluation( track: str, problem_id: str, solution_path: Path, timeout: int = 300 ) -> dict: """ - Run evaluation using frontier CLI. + Run evaluation using Python API. 
Returns: Dict with keys: success, score, message """ - cmd = [ - "uv", - "run", - "frontier", - "eval", - problem_id, - str(solution_path), - "--json", - ] - - if track == "algorithmic": - cmd.append("--algorithmic") - + evaluator = SingleEvaluator(timeout=timeout) try: - result = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=timeout, - ) - - # Parse JSON output (may have prefix text before JSON array) - if result.returncode == 0 and result.stdout.strip(): - stdout = result.stdout.strip() - # Find JSON array in output - json_start = stdout.find("[") - if json_start >= 0: - try: - data = json.loads(stdout[json_start:]) - if isinstance(data, list) and len(data) > 0: - item = data[0] - return { - "success": item.get("status") == "success", - "score": item.get("score", 0), - "message": item.get("message", ""), - } - except json.JSONDecodeError: - pass - - # Fallback: check stderr for error messages + result = evaluator.evaluate_file(track, problem_id, solution_path) + message_parts = [] + if result.message: + message_parts.append(result.message) + if result.logs: + message_parts.append(result.logs) return { - "success": False, - "score": 0, - "message": result.stderr or result.stdout or "Unknown error", - } - - except subprocess.TimeoutExpired: - return { - "success": False, - "score": 0, - "message": f"Evaluation timed out after {timeout}s", + "success": result.success, + "score": result.score, + "message": "\n".join(message_parts).strip(), } except Exception as e: return { @@ -132,7 +96,9 @@ def validate_problem( if track == "algorithmic": print(f" Expected: algorithmic/problems/{problem_id}/reference.cpp") else: - print(f" Expected: research/problems/{problem_id}/reference.py") + problem_path = Path(f"research/problems/{problem_id}") + ext = get_problem_extension(problem_path) + print(f" Expected: research/problems/{problem_id}/reference.{ext}") return False print(f" Reference: {ref_path}") @@ -145,7 +111,8 @@ def validate_problem( print(f" Result: {result}") # Check result - if result["success"] and result["score"] > 0: + # Accept score >= 0 for baseline references (score=0 means evaluation works, just no speedup) + if result["success"] and result["score"] is not None and result["score"] >= 0: print(f" PASS: score = {result['score']}") return True else: diff --git a/src/frontier_cs/__init__.py b/src/frontier_cs/__init__.py index 40dac3b9..5052081f 100644 --- a/src/frontier_cs/__init__.py +++ b/src/frontier_cs/__init__.py @@ -2,19 +2,19 @@ Frontier-CS: Evaluation framework for frontier CS problems. 
Usage: - from frontier_cs import FrontierCSEvaluator + from frontier_cs import SingleEvaluator - evaluator = FrontierCSEvaluator() + evaluator = SingleEvaluator() - # Algorithmic problems + # Algorithmic problems (uses Docker by default) score = evaluator.evaluate("algorithmic", problem_id=1, code=cpp_code) - # Research problems (local Docker) + # Research problems (uses SkyPilot by default) score = evaluator.evaluate("research", problem_id="flash_attn", code=py_code) - # Research problems (SkyPilot cloud) + # Override backend score = evaluator.evaluate("research", problem_id="flash_attn", code=py_code, - backend="skypilot") + backend="docker") # Batch evaluation with incremental progress from frontier_cs.batch import BatchEvaluator @@ -31,12 +31,12 @@ batch.# Use batch.scan_solutions_dir() or evaluate_pairs() """ -from .evaluator import FrontierCSEvaluator +from .single_evaluator import SingleEvaluator from .config import RuntimeConfig, ResourcesConfig, DockerConfig, ProblemConfig from .runner import EvaluationResult __all__ = [ - "FrontierCSEvaluator", + "SingleEvaluator", "RuntimeConfig", "ResourcesConfig", "DockerConfig", diff --git a/src/frontier_cs/batch/evaluator.py b/src/frontier_cs/batch/evaluator.py index 6b499aff..95b0572b 100644 --- a/src/frontier_cs/batch/evaluator.py +++ b/src/frontier_cs/batch/evaluator.py @@ -28,7 +28,8 @@ HAS_TQDM = False from ..runner.base import EvaluationResult, EvaluationStatus -from ..runner.docker import DockerRunner +from ..runner.research_docker import ResearchDockerRunner +from ..config import get_problem_extension from .pair import Pair, expand_pairs, read_pairs_file, read_problems_file, read_models_file, read_variants_file from .state import EvaluationState, PairResult, hash_file, hash_directory @@ -142,6 +143,32 @@ def _find_base_dir(self) -> Path: raise RuntimeError(f"pyproject.toml not found in {base}") return base + def _get_problems_dir(self) -> Path: + """Get the problems directory for the current track.""" + if self.problems_dir: + return self.problems_dir + if self.track == "algorithmic": + return self.base_dir / "algorithmic" / "problems" + return self.base_dir / "research" / "problems" + + def _get_problem_extension(self, problem: str) -> str: + """Get file extension for a problem based on its config.yaml. + + For algorithmic track, always returns "cpp". + For research track, reads config.yaml to determine language. 
+ """ + if self.track == "algorithmic": + return "cpp" + + # Research track: use shared function + problems_dir = self._get_problems_dir() + problem_path = problems_dir / problem + return get_problem_extension(problem_path) + + def _build_problem_extensions(self, problems: List[str]) -> Dict[str, str]: + """Build a mapping of problem -> extension for a list of problems.""" + return {problem: self._get_problem_extension(problem) for problem in problems} + def _create_runner(self): """Create the appropriate runner based on track and backend.""" if self.track == "algorithmic": @@ -154,22 +181,22 @@ def _create_runner(self): idle_timeout=self.idle_timeout, ) else: - from ..runner.algorithmic import AlgorithmicRunner - return AlgorithmicRunner( + from ..runner.algorithmic_local import AlgorithmicLocalRunner + return AlgorithmicLocalRunner( judge_url=self.judge_url, problems_dir=self.problems_dir, ) else: # research track if self.backend == "docker": - return DockerRunner( + return ResearchDockerRunner( base_dir=self.base_dir, problems_dir=self.problems_dir, timeout=self.timeout, ) else: - from ..runner.skypilot import SkyPilotRunner - return SkyPilotRunner( + from ..runner.research_skypilot import ResearchSkyPilotRunner + return ResearchSkyPilotRunner( base_dir=self.base_dir, problems_dir=self.problems_dir, bucket_url=self.bucket_url, @@ -450,9 +477,9 @@ def worker(worker_id: int): def _create_cluster_pool(self) -> None: """Create a pool of SkyPilot clusters for parallel evaluation.""" - from ..runner.skypilot import SkyPilotRunner + from ..runner.research_skypilot import ResearchSkyPilotRunner - logger.info(f"Creating {self.clusters} SkyPilot clusters...") + logger.info(f"Creating {self.clusters} SkyPilot clusters...") # Add date hash to cluster names to avoid reusing old clusters with stale config date_str = datetime.now().strftime("%m%d%H%M") @@ -485,8 +512,8 @@ def _cleanup_cluster_pool(self) -> None: return logger.info(f"Terminating {len(self._cluster_names)} clusters...") - from ..runner.skypilot import SkyPilotRunner - SkyPilotRunner.down_clusters(self._cluster_names) + from ..runner.research_skypilot import ResearchSkyPilotRunner + ResearchSkyPilotRunner.down_clusters(self._cluster_names) self._cluster_names = [] def _get_default_solutions_dir(self) -> Path: @@ -674,11 +701,12 @@ def evaluate_model( ) -> EvaluationState: """Evaluate all problems for a given model.""" solutions_dir = self._get_default_solutions_dir() - ext = "cpp" if self.track == "algorithmic" else "py" + problem_extensions = self._build_problem_extensions(problems) pairs = expand_pairs( problems, [model], variants, - solutions_dir=solutions_dir, validate_paths=True, ext=ext, + solutions_dir=solutions_dir, validate_paths=True, + problem_extensions=problem_extensions, ) if not pairs: @@ -698,11 +726,12 @@ def evaluate_problem( ) -> EvaluationState: """Evaluate a problem across all given models.""" solutions_dir = self._get_default_solutions_dir() - ext = "cpp" if self.track == "algorithmic" else "py" + problem_extensions = self._build_problem_extensions([problem]) pairs = expand_pairs( [problem], models, variants, - solutions_dir=solutions_dir, validate_paths=True, ext=ext, + solutions_dir=solutions_dir, validate_paths=True, + problem_extensions=problem_extensions, ) if not pairs: @@ -737,11 +766,12 @@ def evaluate_from_files( variants = read_variants_file(variants_file) if variants_file else [0] solutions_dir = self._get_default_solutions_dir() - ext = "cpp" if self.track == "algorithmic" else "py" + problem_extensions 
= self._build_problem_extensions(problems) pairs = expand_pairs( problems, models, variants, - solutions_dir=solutions_dir, validate_paths=True, ext=ext, + solutions_dir=solutions_dir, validate_paths=True, + problem_extensions=problem_extensions, ) logger.info(f"Expanded {len(problems)} problems × {len(models)} models × {len(variants)} variants = {len(pairs)} pairs") @@ -810,11 +840,12 @@ def evaluate_missing( ) -> EvaluationState: """Evaluate only missing pairs (those not yet in results).""" solutions_dir = self._get_default_solutions_dir() - ext = "cpp" if self.track == "algorithmic" else "py" + problem_extensions = self._build_problem_extensions(problems) all_pairs = expand_pairs( problems, models, variants, - solutions_dir=solutions_dir, validate_paths=True, ext=ext, + solutions_dir=solutions_dir, validate_paths=True, + problem_extensions=problem_extensions, ) missing = [p for p in all_pairs if p.id not in self.state.results] diff --git a/src/frontier_cs/batch/pair.py b/src/frontier_cs/batch/pair.py index e3c73d40..d9215ddf 100644 --- a/src/frontier_cs/batch/pair.py +++ b/src/frontier_cs/batch/pair.py @@ -15,7 +15,7 @@ import hashlib from dataclasses import dataclass from pathlib import Path -from typing import List, Optional +from typing import Dict, List, Optional from ..models import get_model_prefix from ..gen.solution_format import format_solution_filename, get_solution_path @@ -106,6 +106,7 @@ def expand_pairs( solutions_dir: Optional[Path] = None, validate_paths: bool = True, ext: str = "py", + problem_extensions: Optional[Dict[str, str]] = None, interleave: bool = False, ) -> List[Pair]: """ @@ -117,7 +118,8 @@ def expand_pairs( variants: List of variant indices (default: [0] for no suffix) solutions_dir: Directory containing solutions (for validation) validate_paths: Whether to validate solution paths exist - ext: File extension (default: "py", use "cpp" for algorithmic) + ext: Default file extension (default: "py", use "cpp" for algorithmic) + problem_extensions: Optional per-problem extension mapping (overrides ext) interleave: Interleave pairs by problem for load balancing (default: True) Returns: @@ -129,6 +131,9 @@ def expand_pairs( pairs: List[Pair] = [] for problem in problems: + # Use problem-specific extension if available, otherwise default + problem_ext = (problem_extensions or {}).get(problem, ext) + for model in models: model_prefix = get_model_prefix(model) @@ -138,7 +143,7 @@ def expand_pairs( solutions_dir or Path("."), problem, model_prefix, - ext, + problem_ext, variant_idx, ) diff --git a/src/frontier_cs/cli.py b/src/frontier_cs/cli.py index 5d05188b..49f08d9e 100644 --- a/src/frontier_cs/cli.py +++ b/src/frontier_cs/cli.py @@ -7,8 +7,8 @@ frontier eval research flash_attn solution.py frontier eval algorithmic 1 solution.cpp - # With SkyPilot - frontier eval research flash_attn solution.py --skypilot + # Override backend + frontier eval research flash_attn solution.py --backend docker # All problems for a solution frontier eval research --all-problems solution.py @@ -24,13 +24,14 @@ frontier batch research --solutions-dir path/to/solutions """ -import argparse -import logging -import sys +import argparse +import contextlib +import logging +import sys from pathlib import Path from typing import List, Optional -from .evaluator import FrontierCSEvaluator +from .single_evaluator import SingleEvaluator from .runner import EvaluationResult logger = logging.getLogger(__name__) @@ -74,8 +75,8 @@ def create_parser() -> argparse.ArgumentParser: # Evaluate an 
algorithmic problem frontier eval algorithmic 1 solution.cpp - # Evaluate with SkyPilot (cloud) - frontier eval research flash_attn solution.py --skypilot + # Override backend + frontier eval research flash_attn solution.py --backend docker # Evaluate multiple problems frontier eval research --problems flash_attn,cross_entropy solution.py @@ -443,7 +444,7 @@ def print_results_json(results: List[EvaluationResult]) -> None: def get_problem_ids( args: argparse.Namespace, - evaluator: FrontierCSEvaluator, + evaluator: SingleEvaluator, track: str, ) -> List[str]: """Get list of problem IDs to evaluate.""" @@ -749,7 +750,7 @@ def signal_handler(signum, frame): def run_list(args: argparse.Namespace) -> int: """Run list command.""" - evaluator = FrontierCSEvaluator(backend="docker") + evaluator = SingleEvaluator(backend="docker") if args.track == "algorithmic": # Only list algorithmic problems in compact format @@ -785,7 +786,7 @@ def run_list(args: argparse.Namespace) -> int: def run_show(args: argparse.Namespace) -> int: """Run show command.""" - evaluator = FrontierCSEvaluator(backend="docker") + evaluator = SingleEvaluator(backend="docker") statement = evaluator.get_problem_statement(args.track, args.problem_id) if statement: print(statement) @@ -795,9 +796,9 @@ def run_show(args: argparse.Namespace) -> int: return 0 -def run_eval(args: argparse.Namespace) -> int: - """Run eval command.""" - track = args.track +def run_eval(args: argparse.Namespace) -> int: + """Run eval command.""" + track = args.track # Determine backend: explicit --backend or track default if args.backend: @@ -807,17 +808,17 @@ def run_eval(args: argparse.Namespace) -> int: backend = "skypilot" if track == "research" else "docker" idle_timeout = None if args.keep_cluster else getattr(args, 'idle_timeout', 10) timeout = getattr(args, 'timeout', None) - evaluator = FrontierCSEvaluator( - backend=backend, - judge_url=args.judge_url, - cloud=args.cloud, - region=args.region, - keep_cluster=getattr(args, 'keep_cluster', False), - idle_timeout=idle_timeout, - timeout=timeout, - ) - - # Get problem IDs + evaluator = SingleEvaluator( + backend=backend, + judge_url=args.judge_url, + cloud=args.cloud, + region=args.region, + keep_cluster=getattr(args, 'keep_cluster', False), + idle_timeout=idle_timeout, + timeout=timeout, + ) + + # Get problem IDs problem_ids = get_problem_ids(args, evaluator, track) if not problem_ids: @@ -837,20 +838,27 @@ def run_eval(args: argparse.Namespace) -> int: print("Error: No solution provided.", file=sys.stderr) return 1 - # Run evaluations - results = [] - for pid in problem_ids: - if not args.quiet: - print(f"Evaluating {pid}...", end=" ", flush=True) - - result = evaluator.evaluate(track, pid, code) - results.append(result) - - if not args.quiet: - if result.success: - print(f"Score: {result.score}") - else: - print(f"ERROR: {result.message}") + # Run evaluations + results = [] + eval_stdout = contextlib.nullcontext() + if args.json: + # Keep JSON clean: suppress human-readable prints and route runner stdout to stderr + args.quiet = True + eval_stdout = contextlib.redirect_stdout(sys.stderr) + + with eval_stdout: + for pid in problem_ids: + if not args.quiet: + print(f"Evaluating {pid}...", end=" ", flush=True) + + result = evaluator.evaluate(track, pid, code) + results.append(result) + + if not args.quiet: + if result.success: + print(f"Score: {result.score}") + else: + print(f"ERROR: {result.message}") # Output results if args.json: diff --git a/src/frontier_cs/config.py b/src/frontier_cs/config.py 
index f2e64501..7e03970f 100644 --- a/src/frontier_cs/config.py +++ b/src/frontier_cs/config.py @@ -72,6 +72,7 @@ class RuntimeConfig: resources: ResourcesConfig = field(default_factory=ResourcesConfig) docker: DockerConfig = field(default_factory=DockerConfig) environment: Optional[str] = None # For LLM prompts + language: Optional[str] = None # Target language: "python", "cpp", etc. @dataclass @@ -129,6 +130,8 @@ def load_problem_config(problem_path: Path) -> ProblemConfig: rt.requires_gpu = bool(runtime["requires_gpu"]) if runtime.get("environment"): rt.environment = str(runtime["environment"]) + if runtime.get("language"): + rt.language = str(runtime["language"]) # Parse docker section docker = runtime.get("docker", {}) @@ -184,3 +187,79 @@ def get_effective_gpu_type(runtime_config: RuntimeConfig) -> Optional[str]: return "L4" return None + + +# ============================================================================= +# Language Configuration +# ============================================================================= + +@dataclass +class LanguageConfig: + """Configuration for a target programming language.""" + name: str # "python", "cpp" + extension: str # "py", "cpp" + code_block_tag: str # Markdown code block tag: "python", "cpp" + + +# Registry of supported languages +LANGUAGE_CONFIGS: Dict[str, LanguageConfig] = { + "python": LanguageConfig( + name="python", + extension="py", + code_block_tag="python", + ), + "cpp": LanguageConfig( + name="cpp", + extension="cpp", + code_block_tag="cpp", + ), +} + +DEFAULT_LANGUAGE = "python" + + +def get_language_config(problem_path: Optional[Path] = None) -> LanguageConfig: + """ + Get language configuration for a problem. + + Reads the `language` field from config.yaml runtime section. + Defaults to Python if not specified. + + Args: + problem_path: Path to the problem directory + + Returns: + LanguageConfig for the problem's target language + + Raises: + ValueError: If the language is not supported + """ + language = DEFAULT_LANGUAGE + + if problem_path and problem_path.is_dir(): + runtime_config = load_runtime_config(problem_path) + if runtime_config.language: + language = runtime_config.language + + if language not in LANGUAGE_CONFIGS: + raise ValueError( + f"Unsupported language: {language}. " + f"Supported: {list(LANGUAGE_CONFIGS.keys())}" + ) + + return LANGUAGE_CONFIGS[language] + + +def get_problem_extension(problem_path: Optional[Path] = None) -> str: + """ + Get file extension for a problem based on its language config. + + Convenience function that returns just the extension string. 
+ + Args: + problem_path: Path to the problem directory + + Returns: + File extension without dot (e.g., "py", "cpp") + """ + return get_language_config(problem_path).extension diff --git a/src/frontier_cs/gen/api_keys.py b/src/frontier_cs/gen/api_keys.py index ba6d719e..2fd67680 100644 --- a/src/frontier_cs/gen/api_keys.py +++ b/src/frontier_cs/gen/api_keys.py @@ -48,9 +48,9 @@ class KeyInfo: class APIKeyPool: - """Thread-safe pool of API keys with weighted load balancing.""" + """Thread-safe pool of API keys with weighted load balancing and concurrency control.""" - def __init__(self, keys: List[KeyInfo], *, name: str): + def __init__(self, keys: List[KeyInfo], *, name: str, max_concurrent: Optional[int] = None): self.name = name self._states = [ { @@ -66,6 +66,9 @@ def __init__(self, keys: List[KeyInfo], *, name: str): ] self._lock = threading.Lock() self._total_weight = sum(s["weight"] for s in self._states) + # Concurrency control: limit concurrent requests to this provider + self.max_concurrent = max_concurrent + self._semaphore = threading.Semaphore(max_concurrent) if max_concurrent else None def acquire(self) -> Tuple[Optional[str], Optional[int]]: """Acquire an API key using weighted selection.""" @@ -137,6 +140,17 @@ def size(self) -> int: with self._lock: return len(self._states) + def acquire_slot(self, timeout: Optional[float] = None) -> bool: + """Acquire a concurrency slot. Returns True if acquired, False if timed out.""" + if self._semaphore is None: + return True + return self._semaphore.acquire(timeout=timeout) + + def release_slot(self) -> None: + """Release a concurrency slot.""" + if self._semaphore is not None: + self._semaphore.release() + def _matches_env_base(key_name: str, base: str) -> bool: """Check if an environment variable name matches a base name pattern.""" @@ -169,6 +183,30 @@ def _collect_provider_keys(provider: str, base_names: List[str]) -> List[str]: return keys +def _compute_max_concurrent(provider_results: List[KeyCheckResult]) -> Optional[int]: + """Compute max concurrent requests for a provider based on RPM limits. + + Logic: sum RPM across unique orgs (same org shares limit), then divide by 10 + (conservative estimate since LLM calls take time). 
+ """ + if not provider_results: + return None + + # Group by org_id, take max RPM per org + org_rpms: Dict[str, int] = {} + for r in provider_results: + if r.valid and r.rpm_limit and r.org_id: + org_rpms[r.org_id] = max(org_rpms.get(r.org_id, 0), r.rpm_limit) + + if not org_rpms: + return None + + total_rpm = sum(org_rpms.values()) + n_orgs = len(org_rpms) + # RPM/10, capped at 20 per org (LLM generation takes time) + return min(n_orgs * 20, max(1, total_rpm // 10)) + + def build_key_pools( valid_keys: Optional[Dict[str, List[str]]] = None, key_info: Optional[Dict[str, List[KeyCheckResult]]] = None, @@ -201,9 +239,11 @@ def build_key_pools( else: key_infos.append(KeyInfo(key, 100, None)) - pools[provider] = APIKeyPool(key_infos, name=provider) + # Compute max concurrent based on RPM + max_concurrent = _compute_max_concurrent(provider_results) + pools[provider] = APIKeyPool(key_infos, name=provider, max_concurrent=max_concurrent) else: - # Collect from environment (legacy behavior, equal weights) + # Collect from environment (legacy behavior, equal weights, no concurrency limit) for provider, bases in PROVIDER_ENV_KEY_MAP.items(): keys = _collect_provider_keys(provider, bases) if keys: @@ -493,11 +533,11 @@ def precheck_required_providers( else: parts.append(f"{n_orgs} orgs") if total_rpm: - # Suggest concurrent: RPM/10, capped at 20 per org + # Max concurrent: RPM/10, capped at 20 per org # (LLM generation takes minutes, so conservative estimate) - suggested_concurrent = min(n_orgs * 20, max(1, total_rpm // 10)) + max_concurrent = min(n_orgs * 20, max(1, total_rpm // 10)) parts.append(f"~{total_rpm} RPM") - parts.append(f"suggest ~{suggested_concurrent} concurrent") + parts.append(f"max {max_concurrent} concurrent") print(f" {provider}: {', '.join(parts)}") print(f"\nAPI key validation complete. {valid_count} valid key(s).\n") diff --git a/src/frontier_cs/gen/llm.py b/src/frontier_cs/gen/llm.py index 6ca790db..7cc6bd3e 100644 --- a/src/frontier_cs/gen/llm.py +++ b/src/frontier_cs/gen/llm.py @@ -44,7 +44,7 @@ def detect_provider(model: str, actual_model_lower: Optional[str] = None) -> str if provider_hint == "xai" or "grok" in actual_lower: return "xai" if provider_hint == "deepseek" or "deepseek" in actual_lower: - return "openrouter" + return "deepseek" return provider_hint or "openai" diff --git a/src/frontier_cs/models.py b/src/frontier_cs/models.py index 49ec311d..bec6640c 100644 --- a/src/frontier_cs/models.py +++ b/src/frontier_cs/models.py @@ -3,8 +3,7 @@ Provides consistent model prefix conversion used across: - Solution generation (generate_solutions.py) -- Solution matrix checking (frontier-eval check) -- Batch evaluation (frontier-eval batch) +- Batch evaluation (frontier batch) """ import re diff --git a/src/frontier_cs/runner/__init__.py b/src/frontier_cs/runner/__init__.py index 577fec45..4da316d2 100644 --- a/src/frontier_cs/runner/__init__.py +++ b/src/frontier_cs/runner/__init__.py @@ -1,29 +1,29 @@ """ Runner module for executing evaluations. 
-Provides different backends for running evaluations: -- DockerRunner: Local Docker evaluation -- SkyPilotRunner: Cloud evaluation via SkyPilot -- AlgorithmicRunner: Judge server for algorithmic problems -""" - -from .base import Runner, ResearchRunner, EvaluationResult -from .docker import DockerRunner -from .algorithmic import AlgorithmicRunner - -__all__ = [ - "Runner", - "ResearchRunner", - "EvaluationResult", - "DockerRunner", - "AlgorithmicRunner", -] - -# SkyPilotRunner is optional (requires skypilot) -try: - from .skypilot import SkyPilotRunner - from .algorithmic_skypilot import AlgorithmicSkyPilotRunner - __all__.append("SkyPilotRunner") - __all__.append("AlgorithmicSkyPilotRunner") -except ImportError: - pass +Provides different backends for running evaluations: +- ResearchDockerRunner: Local Docker evaluation for research +- ResearchSkyPilotRunner: Cloud evaluation via SkyPilot for research +- AlgorithmicLocalRunner: Judge server for algorithmic problems +""" + +from .base import Runner, ResearchRunner, EvaluationResult +from .research_docker import ResearchDockerRunner +from .algorithmic_local import AlgorithmicLocalRunner + +__all__ = [ + "Runner", + "ResearchRunner", + "EvaluationResult", + "ResearchDockerRunner", + "AlgorithmicLocalRunner", +] + +# ResearchSkyPilotRunner is optional (requires skypilot) +try: + from .research_skypilot import ResearchSkyPilotRunner + from .algorithmic_skypilot import AlgorithmicSkyPilotRunner + __all__.append("ResearchSkyPilotRunner") + __all__.append("AlgorithmicSkyPilotRunner") +except ImportError: + pass diff --git a/src/frontier_cs/runner/algorithmic.py b/src/frontier_cs/runner/algorithmic_local.py similarity index 96% rename from src/frontier_cs/runner/algorithmic.py rename to src/frontier_cs/runner/algorithmic_local.py index aa0e6c16..d085cada 100644 --- a/src/frontier_cs/runner/algorithmic.py +++ b/src/frontier_cs/runner/algorithmic_local.py @@ -19,7 +19,7 @@ logger = logging.getLogger(__name__) -class AlgorithmicRunner(Runner): +class AlgorithmicLocalRunner(Runner): """ Runner for algorithmic problems. @@ -178,7 +178,7 @@ def evaluate( problem_id=pid, status=EvaluationStatus.ERROR, message=f"Judge server at {self.judge_url} not available. " - f"Run 'docker compose up -d' in algorithmic/ or use --skypilot", + f"Run 'docker compose up -d' in algorithmic/ or use --backend skypilot", ) # Check for empty code diff --git a/src/frontier_cs/runner/algorithmic_skypilot.py b/src/frontier_cs/runner/algorithmic_skypilot.py index a59e5b5f..094ccaf0 100644 --- a/src/frontier_cs/runner/algorithmic_skypilot.py +++ b/src/frontier_cs/runner/algorithmic_skypilot.py @@ -14,14 +14,14 @@ import requests -from .algorithmic import AlgorithmicRunner +from .algorithmic_local import AlgorithmicLocalRunner from .base import EvaluationResult, EvaluationStatus from ..gen.solution_format import FAILED_EXTENSION logger = logging.getLogger(__name__) -class AlgorithmicSkyPilotRunner(AlgorithmicRunner): +class AlgorithmicSkyPilotRunner(AlgorithmicLocalRunner): """ Runner that auto-launches go-judge on SkyPilot. 
@@ -269,11 +269,15 @@ def evaluate( # Use parent class with the cloud judge URL self.judge_url = judge_url self.session = requests.Session() - return super().evaluate( - problem_id, - solution_code, - lang=lang, - ) + try: + return super().evaluate( + problem_id, + solution_code, + lang=lang, + ) + finally: + if not self.keep_cluster and self._initialized: + self.stop_cluster() def evaluate_file( self, diff --git a/src/frontier_cs/runner/base.py b/src/frontier_cs/runner/base.py index ad675ed7..88925c47 100644 --- a/src/frontier_cs/runner/base.py +++ b/src/frontier_cs/runner/base.py @@ -3,11 +3,14 @@ """ from abc import ABC, abstractmethod -from dataclasses import dataclass, field -from enum import Enum -from pathlib import Path -from typing import Any, Dict, Optional - +from dataclasses import dataclass, field +from enum import Enum +import json +from pathlib import Path +from typing import Any, Dict, Optional + +from ..config import load_problem_config +from ..gen.solution_format import FAILED_EXTENSION class EvaluationStatus(Enum): """Status of an evaluation.""" @@ -91,7 +94,7 @@ def get_problem_path(self, problem_id: str) -> Path: raise NotImplementedError -class ResearchRunner(Runner): +class ResearchRunner(Runner): """Base class for research problem runners (Docker and SkyPilot). Provides common functionality: @@ -116,10 +119,81 @@ def _find_base_dir(self) -> Path: raise RuntimeError(f"research/ not found in {base}") return base - def get_problem_path(self, problem_id: str) -> Path: + def get_problem_path(self, problem_id: str) -> Path: """Get the path to a research problem directory. With nested solution structure, problem_id is already the nested path (e.g., "cant_be_late/high_availability_loose_deadline_large_overhead"). """ - return self.problems_dir / problem_id + return self.problems_dir / problem_id + + def _get_problem_path_or_error( + self, problem_id: str + ) -> tuple[Optional[Path], Optional[EvaluationResult]]: + problem_path = self.get_problem_path(problem_id) + if not problem_path.exists(): + return ( + None, + EvaluationResult( + problem_id=problem_id, + status=EvaluationStatus.ERROR, + message=f"Problem not found: {problem_path}", + ), + ) + return (problem_path, None) + + def _validate_solution_file( + self, problem_id: str, solution_path: Path + ) -> Optional[EvaluationResult]: + if not solution_path.exists(): + return EvaluationResult( + problem_id=problem_id, + status=EvaluationStatus.ERROR, + message=f"Solution file not found: {solution_path}", + ) + + if solution_path.suffix == f".{FAILED_EXTENSION}": + try: + meta = json.loads(solution_path.read_text(encoding="utf-8")) + error_msg = meta.get("error", "Generation failed") + except (json.JSONDecodeError, OSError): + error_msg = "Generation failed" + return EvaluationResult( + problem_id=problem_id, + status=EvaluationStatus.ERROR, + score=0, + message=f"Generation failed: {error_msg}", + ) + + return None + + def _load_runtime_settings(self, problem_path: Path) -> dict: + problem_config = load_problem_config(problem_path) + runtime_config = problem_config.runtime + docker_config = runtime_config.docker + uv_project = problem_config.dependencies.get("uv_project") + return { + "problem_config": problem_config, + "runtime": runtime_config, + "docker": docker_config, + "uv_project": uv_project, + "timeout_seconds": runtime_config.timeout_seconds, + } + + def _build_uv_install_cmd(self, uv_project: Optional[str]) -> str: + if not uv_project: + return "# No uv_project specified in config.yaml" + + return ( + f'if [ -d 
"{uv_project}" ] && [ -f "{uv_project}/pyproject.toml" ]; then\n' + f' echo "[framework] Installing dependencies from {uv_project}"\n' + f' if [ -f "{uv_project}/uv_overrides.txt" ]; then\n' + f' uv pip install --system --overrides "{uv_project}/uv_overrides.txt" -e "{uv_project}"\n' + f' else\n' + f' uv pip install --system -e "{uv_project}"\n' + f' fi\n' + f'fi' + ) + + def _build_timeout_prefix(self, timeout_seconds: Optional[int]) -> str: + return f"timeout {timeout_seconds}s " if timeout_seconds else "" diff --git a/src/frontier_cs/runner/cluster_cleanup.py b/src/frontier_cs/runner/cluster_cleanup.py new file mode 100644 index 00000000..9ed73b56 --- /dev/null +++ b/src/frontier_cs/runner/cluster_cleanup.py @@ -0,0 +1,29 @@ +""" +Shared cluster cleanup registry for SkyPilot-based runners. +""" + +from __future__ import annotations + +import threading + + +class ActiveClusterRegistry: + """Track active SkyPilot clusters for cleanup on exit.""" + + _active_clusters: set[str] = set() + _lock = threading.Lock() + + @classmethod + def register(cls, name: str) -> None: + with cls._lock: + cls._active_clusters.add(name) + + @classmethod + def unregister(cls, name: str) -> None: + with cls._lock: + cls._active_clusters.discard(name) + + @classmethod + def snapshot(cls) -> list[str]: + with cls._lock: + return list(cls._active_clusters) diff --git a/src/frontier_cs/runner/docker.py b/src/frontier_cs/runner/research_docker.py similarity index 79% rename from src/frontier_cs/runner/docker.py rename to src/frontier_cs/runner/research_docker.py index 8cf47c6a..c838c464 100644 --- a/src/frontier_cs/runner/docker.py +++ b/src/frontier_cs/runner/research_docker.py @@ -4,20 +4,18 @@ Runs evaluations in local Docker containers. """ -import json -import shutil -import subprocess -import tempfile -import time -from pathlib import Path +import shutil +import subprocess +import tempfile +import time +from pathlib import Path from typing import Optional, Tuple from .base import ResearchRunner, EvaluationResult, EvaluationStatus -from ..config import load_problem_config, DockerConfig, DEFAULT_DOCKER_IMAGE -from ..gen.solution_format import FAILED_EXTENSION +from ..config import DockerConfig, DEFAULT_DOCKER_IMAGE, get_problem_extension -class DockerRunner(ResearchRunner): +class ResearchDockerRunner(ResearchRunner): """ Runner for research problems using local Docker. @@ -38,7 +36,7 @@ def __init__( timeout: Optional[int] = None, ): """ - Initialize DockerRunner. + Initialize ResearchDockerRunner. 
Args: base_dir: Base directory of Frontier-CS repo (auto-detected if None) @@ -78,64 +76,39 @@ def evaluate( problem_id: Problem ID (e.g., "flash_attn", "gemm_optimization/squares") solution_code: Python solution code - Returns: - EvaluationResult with score and status - """ - problem_path = self.get_problem_path(problem_id) - - if not problem_path.exists(): - return EvaluationResult( - problem_id=problem_id, - status=EvaluationStatus.ERROR, - message=f"Problem not found: {problem_path}", - ) - - # Create temp directory with solution - with tempfile.TemporaryDirectory(prefix="frontier_eval_") as temp_dir: - temp_path = Path(temp_dir) - solution_path = temp_path / "solution.py" - solution_path.write_text(solution_code, encoding="utf-8") - - return self._run_evaluation(problem_id, problem_path, solution_path) + Returns: + EvaluationResult with score and status + """ + problem_path, error = self._get_problem_path_or_error(problem_id) + if error: + return error + + # Create temp directory with solution + with tempfile.TemporaryDirectory(prefix="frontier_eval_") as temp_dir: + temp_path = Path(temp_dir) + ext = get_problem_extension(problem_path) + solution_path = temp_path / f"solution.{ext}" + solution_path.write_text(solution_code, encoding="utf-8") + + return self._run_evaluation(problem_id, problem_path, solution_path) def evaluate_file( self, problem_id: str, solution_path: Path, *, - solution_id: Optional[str] = None, # Unused, for API compatibility with SkyPilotRunner - ) -> EvaluationResult: - """Evaluate a solution file for a research problem.""" - if not solution_path.exists(): - return EvaluationResult( - problem_id=problem_id, - status=EvaluationStatus.ERROR, - message=f"Solution file not found: {solution_path}", - ) - - # Check for generation failure marker (.FAILED file) - if solution_path.suffix == f".{FAILED_EXTENSION}": - try: - meta = json.loads(solution_path.read_text(encoding="utf-8")) - error_msg = meta.get("error", "Generation failed") - except (json.JSONDecodeError, OSError): - error_msg = "Generation failed" - return EvaluationResult( - problem_id=problem_id, - status=EvaluationStatus.ERROR, - score=0, - message=f"Generation failed: {error_msg}", - ) - - problem_path = self.get_problem_path(problem_id) - if not problem_path.exists(): - return EvaluationResult( - problem_id=problem_id, - status=EvaluationStatus.ERROR, - message=f"Problem not found: {problem_path}", - ) - - return self._run_evaluation(problem_id, problem_path, solution_path) + solution_id: Optional[str] = None, # Unused, for API compatibility with ResearchSkyPilotRunner + ) -> EvaluationResult: + """Evaluate a solution file for a research problem.""" + error = self._validate_solution_file(problem_id, solution_path) + if error: + return error + + problem_path, error = self._get_problem_path_or_error(problem_id) + if error: + return error + + return self._run_evaluation(problem_id, problem_path, solution_path) def _run_evaluation( self, @@ -146,17 +119,16 @@ def _run_evaluation( """Run the actual evaluation in Docker.""" start_time = time.time() - # Load config from problem's config.yaml - problem_config = load_problem_config(problem_path) - runtime_config = problem_config.runtime - docker_config = runtime_config.docker - uv_project = problem_config.dependencies.get("uv_project") + settings = self._load_runtime_settings(problem_path) + runtime_config = settings["runtime"] + docker_config = settings["docker"] + uv_project = settings["uv_project"] # Determine timeout: user-specified > problem config > default 
if self.timeout is not None: effective_timeout = self.timeout else: - effective_timeout = runtime_config.timeout_seconds or self.DEFAULT_TIMEOUT + effective_timeout = settings["timeout_seconds"] or self.DEFAULT_TIMEOUT # Check GPU requirements needs_gpu = docker_config.gpu or runtime_config.requires_gpu or runtime_config.resources.has_gpu @@ -244,10 +216,11 @@ def _setup_workspace( dest = workspace / "research" / parent / "common" shutil.copytree(common_dir, dest) - # Create solution structure + # Create solution structure (rename to solution.{ext}) solution_dir = workspace / "solution" solution_dir.mkdir(parents=True) - shutil.copy2(solution_path, solution_dir / "solution.py") + dest_name = f"solution{solution_path.suffix}" + shutil.copy2(solution_path, solution_dir / dest_name) def _run_docker( self, @@ -299,19 +272,9 @@ def _run_docker( logs = result.stdout + "\n" + result.stderr return result, logs - def _get_run_script(self, uv_project: Optional[str] = None, dind: bool = False) -> str: - """Get the bash script to run inside Docker.""" - # Build uv install command if uv_project is specified - if uv_project: - uv_install_cmd = f''' -# Install dependencies from uv_project -if [ -d "{uv_project}" ] && [ -f "{uv_project}/pyproject.toml" ]; then - echo "[framework] Installing dependencies from {uv_project}" - uv pip install --system -e "{uv_project}" -fi -''' - else: - uv_install_cmd = "# No uv_project specified" + def _get_run_script(self, uv_project: Optional[str] = None, dind: bool = False) -> str: + """Get the bash script to run inside Docker.""" + uv_install_cmd = self._build_uv_install_cmd(uv_project) # Build Docker CLI install command for DinD if dind: @@ -342,7 +305,7 @@ def _get_run_script(self, uv_project: Optional[str] = None, dind: bool = False) # Create execution_env and copy solution BEFORE set_up_env.sh # (some scripts expect this structure to exist) mkdir -p /work/execution_env/solution_env -cp /work/solution/solution.py /work/execution_env/solution_env/ +cp /work/solution/solution.* /work/execution_env/solution_env/ # Find the problem directory PROBLEM_DIR=$(find research -mindepth 1 -maxdepth 4 -name "evaluator.py" -exec dirname {{}} \\; | head -1) diff --git a/src/frontier_cs/runner/skypilot.py b/src/frontier_cs/runner/research_skypilot.py similarity index 81% rename from src/frontier_cs/runner/skypilot.py rename to src/frontier_cs/runner/research_skypilot.py index 18492957..123072df 100644 --- a/src/frontier_cs/runner/skypilot.py +++ b/src/frontier_cs/runner/research_skypilot.py @@ -8,20 +8,19 @@ - bucket: Write results directly to S3/GCS bucket during job execution """ -import hashlib -import json -import shutil -import subprocess -import tempfile -import textwrap -import time +import hashlib +import shutil +import subprocess +import tempfile +import textwrap +import time from datetime import datetime from pathlib import Path from typing import Optional, Tuple -from .base import ResearchRunner, EvaluationResult, EvaluationStatus -from ..config import load_problem_config -from ..gen.solution_format import FAILED_EXTENSION +from .base import ResearchRunner, EvaluationResult, EvaluationStatus +from .cluster_cleanup import ActiveClusterRegistry +from ..config import get_problem_extension def _sanitize_name(name: str) -> str: @@ -40,7 +39,7 @@ def _sanitize_name(name: str) -> str: return "".join(cleaned).strip("-") or "job" -class SkyPilotRunner(ResearchRunner): +class ResearchSkyPilotRunner(ResearchRunner): """ Runner for research problems using SkyPilot. 
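As a preview of the cluster-naming scheme used in `_run_evaluation` below, the following sketch mirrors the visible logic only; the real runner additionally passes the name through `_sanitize_name`, and `flash_attn` is just an illustrative problem id.

```python
# Sketch: date + digest keep cluster names unique across runs.
import hashlib
from datetime import datetime

problem_id = "flash_attn"
date_str = datetime.now().strftime("%m%d%H%M")          # e.g. "06011430"
digest = hashlib.md5(f"{problem_id}-{date_str}".encode()).hexdigest()[:8]
cluster_name = f"eval-{problem_id}-{digest}"[:63]       # length-limited
print(cluster_name)                                     # e.g. eval-flash_attn-3f9c2ab1
```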
@@ -55,9 +54,9 @@ class SkyPilotRunner(ResearchRunner): DEFAULT_MEMORY = "16+" DEFAULT_DISK_SIZE = 200 # Large disk for PyTorch, Docker images, and datasets DEFAULT_GPU = "L4:1" - DEFAULT_TIMEOUT = 1800 # 30 minutes - DEFAULT_IDLE_TIMEOUT = 10 # 10 minutes - + DEFAULT_TIMEOUT = 1800 # 30 minutes + DEFAULT_IDLE_TIMEOUT = 10 # 10 minutes + def __init__( self, base_dir: Optional[Path] = None, @@ -69,7 +68,7 @@ def __init__( bucket_url: Optional[str] = None, ): """ - Initialize SkyPilotRunner. + Initialize ResearchSkyPilotRunner. Args: base_dir: Base directory of Frontier-CS repo @@ -106,19 +105,15 @@ def evaluate( Returns: EvaluationResult with score and status """ - problem_path = self.get_problem_path(problem_id) - - if not problem_path.exists(): - return EvaluationResult( - problem_id=problem_id, - status=EvaluationStatus.ERROR, - message=f"Problem not found: {problem_path}", - ) + problem_path, error = self._get_problem_path_or_error(problem_id) + if error: + return error # Create temp directory with solution with tempfile.TemporaryDirectory(prefix="frontier_sky_") as temp_dir: temp_path = Path(temp_dir) - solution_path = temp_path / "solution.py" + ext = get_problem_extension(problem_path) + solution_path = temp_path / f"solution.{ext}" solution_path.write_text(solution_code, encoding="utf-8") return self._run_evaluation(problem_id, problem_path, solution_path, solution_id) @@ -129,38 +124,17 @@ def evaluate_file( solution_path: Path, *, solution_id: Optional[str] = None, - ) -> EvaluationResult: - """Evaluate a solution file using SkyPilot.""" - if not solution_path.exists(): - return EvaluationResult( - problem_id=problem_id, - status=EvaluationStatus.ERROR, - message=f"Solution file not found: {solution_path}", - ) - - # Check for generation failure marker (.FAILED file) - if solution_path.suffix == f".{FAILED_EXTENSION}": - try: - meta = json.loads(solution_path.read_text(encoding="utf-8")) - error_msg = meta.get("error", "Generation failed") - except (json.JSONDecodeError, OSError): - error_msg = "Generation failed" - return EvaluationResult( - problem_id=problem_id, - status=EvaluationStatus.ERROR, - score=0, - message=f"Generation failed: {error_msg}", - ) - - problem_path = self.get_problem_path(problem_id) - if not problem_path.exists(): - return EvaluationResult( - problem_id=problem_id, - status=EvaluationStatus.ERROR, - message=f"Problem not found: {problem_path}", - ) - - return self._run_evaluation(problem_id, problem_path, solution_path, solution_id) + ) -> EvaluationResult: + """Evaluate a solution file using SkyPilot.""" + error = self._validate_solution_file(problem_id, solution_path) + if error: + return error + + problem_path, error = self._get_problem_path_or_error(problem_id) + if error: + return error + + return self._run_evaluation(problem_id, problem_path, solution_path, solution_id) def _run_evaluation( self, @@ -174,14 +148,13 @@ def _run_evaluation( start_time = time.time() - # Load config from config.yaml - problem_config = load_problem_config(problem_path) - runtime_config = problem_config.runtime - docker_config = runtime_config.docker - res = runtime_config.resources + settings = self._load_runtime_settings(problem_path) + runtime_config = settings["runtime"] + docker_config = settings["docker"] + res = runtime_config.resources # Extract uv_project for automatic dependency installation - uv_project = problem_config.dependencies.get("uv_project") + uv_project = settings["uv_project"] # Determine resources accelerators = res.accelerators @@ -191,12 +164,13 
@@ def _run_evaluation( accelerators = self.DEFAULT_GPU # Determine timeout from config or default - effective_timeout = runtime_config.timeout_seconds or self.DEFAULT_TIMEOUT + effective_timeout = settings["timeout_seconds"] or self.DEFAULT_TIMEOUT # Create cluster name with date to avoid conflicts between runs date_str = datetime.now().strftime("%m%d%H%M") digest = hashlib.md5(f"{problem_id}-{date_str}".encode()).hexdigest()[:8] - cluster_name = _sanitize_name(f"eval-{problem_id}-{digest}")[:63] + cluster_name = _sanitize_name(f"eval-{problem_id}-{digest}")[:63] + ActiveClusterRegistry.register(cluster_name) # Build pair_id for bucket storage pair_id = f"{solution_id}:{problem_id}" if solution_id else None @@ -306,14 +280,15 @@ def _run_evaluation( duration_seconds=time.time() - start_time, ) - finally: - # Only down immediately if no autostop and not keeping cluster - if not self.keep_cluster and self.idle_timeout is None: - try: - down_request = sky.down(cluster_name) - sky.stream_and_get(down_request) - except Exception: - pass + finally: + # Always down after evaluation unless explicitly keeping the cluster. + if not self.keep_cluster: + try: + down_request = sky.down(cluster_name) + sky.stream_and_get(down_request) + except Exception: + pass + ActiveClusterRegistry.unregister(cluster_name) def _setup_mounts( self, @@ -337,10 +312,11 @@ def _setup_mounts( if common_dir.is_dir(): mounts[f"{remote_base}/research/{parent}/common"] = str(common_dir.resolve()) - # Mount solution + # Mount solution (rename to solution.{ext}) solution_dir = workspace / "solution" solution_dir.mkdir(parents=True) - shutil.copy2(solution_path, solution_dir / "solution.py") + dest_name = f"solution{solution_path.suffix}" + shutil.copy2(solution_path, solution_dir / dest_name) mounts[f"{remote_base}/solution"] = str(solution_dir.resolve()) return mounts @@ -373,7 +349,7 @@ def _get_run_script( ) -> str: """Get run script for SkyPilot task.""" gpu_flags = "--gpus all" if gpu else "" - timeout_prefix = f"timeout {timeout_seconds}s " if timeout_seconds else "" + timeout_prefix = self._build_timeout_prefix(timeout_seconds) dind_flags = '-v /var/run/docker.sock:/var/run/docker.sock' if dind else "" # Build Docker CLI install command for DinD (socket is mounted but CLI needed) @@ -420,20 +396,7 @@ def _get_run_script( else: bucket_write = "" - # Build uv pip install command if uv_project is specified in config.yaml - # If uv_overrides.txt exists in the project, use it to protect system packages - if uv_project: - uv_sync_cmd = textwrap.dedent(f''' - if [ -d "{uv_project}" ] && [ -f "{uv_project}/pyproject.toml" ]; then - echo "[framework] Installing dependencies from {uv_project}" - if [ -f "{uv_project}/uv_overrides.txt" ]; then - uv pip install --system --overrides "{uv_project}/uv_overrides.txt" -e "{uv_project}" - else - uv pip install --system -e "{uv_project}" - fi - fi''').strip() - else: - uv_sync_cmd = "# No uv_project specified in config.yaml" + uv_sync_cmd = self._build_uv_install_cmd(uv_project) return textwrap.dedent(f"""\ set -euo pipefail @@ -459,7 +422,7 @@ def _get_run_script( # Create execution_env and copy solution BEFORE set_up_env.sh # (some scripts expect this structure to exist) mkdir -p /work/execution_env/solution_env - cp /work/solution/solution.py /work/execution_env/solution_env/ + cp /work/solution/solution.* /work/execution_env/solution_env/ echo "[framework] Evaluating: {pair_id or problem_id}" cd /work/research/{problem_id} @@ -637,43 +600,21 @@ def exec_on_cluster( start_time = 
time.time() - problem_path = self.get_problem_path(problem_id) - if not problem_path.exists(): - return EvaluationResult( - problem_id=problem_id, - status=EvaluationStatus.ERROR, - message=f"Problem not found: {problem_path}", - ) + error = self._validate_solution_file(problem_id, solution_path) + if error: + return error + + problem_path, error = self._get_problem_path_or_error(problem_id) + if error: + return error - if not solution_path.exists(): - return EvaluationResult( - problem_id=problem_id, - status=EvaluationStatus.ERROR, - message=f"Solution file not found: {solution_path}", - ) - - # Check for generation failure marker (.FAILED file) - if solution_path.suffix == f".{FAILED_EXTENSION}": - try: - meta = json.loads(solution_path.read_text(encoding="utf-8")) - error_msg = meta.get("error", "Generation failed") - except (json.JSONDecodeError, OSError): - error_msg = "Generation failed" - return EvaluationResult( - problem_id=problem_id, - status=EvaluationStatus.ERROR, - score=0, - message=f"Generation failed: {error_msg}", - ) - - # Load config - problem_config = load_problem_config(problem_path) - runtime_config = problem_config.runtime - docker_config = runtime_config.docker - uv_project = problem_config.dependencies.get("uv_project") + settings = self._load_runtime_settings(problem_path) + runtime_config = settings["runtime"] + docker_config = settings["docker"] + uv_project = settings["uv_project"] # Determine timeout from config or default - effective_timeout = runtime_config.timeout_seconds or self.DEFAULT_TIMEOUT + effective_timeout = settings["timeout_seconds"] or self.DEFAULT_TIMEOUT # Create workspace with file mounts with tempfile.TemporaryDirectory(prefix="frontier_exec_") as workspace_dir: @@ -764,11 +705,11 @@ def down_cluster(cluster_name: str) -> bool: print(f"Failed to terminate cluster {cluster_name}: {e}") return False - @staticmethod - def down_clusters(cluster_names: list) -> None: - """Terminate multiple clusters in parallel.""" - import sky - from concurrent.futures import ThreadPoolExecutor + @staticmethod + def down_clusters(cluster_names: list) -> None: + """Terminate multiple clusters in parallel.""" + import sky + from concurrent.futures import ThreadPoolExecutor def down_one(name): try: @@ -777,5 +718,7 @@ def down_one(name): except Exception: pass - with ThreadPoolExecutor(max_workers=len(cluster_names)) as executor: - executor.map(down_one, cluster_names) + with ThreadPoolExecutor(max_workers=len(cluster_names)) as executor: + executor.map(down_one, cluster_names) + + # Active cluster cleanup is handled via ActiveClusterRegistry in callers. diff --git a/src/frontier_cs/evaluator.py b/src/frontier_cs/single_evaluator.py similarity index 75% rename from src/frontier_cs/evaluator.py rename to src/frontier_cs/single_evaluator.py index f4fd36f9..9c1b6b61 100644 --- a/src/frontier_cs/evaluator.py +++ b/src/frontier_cs/single_evaluator.py @@ -5,51 +5,58 @@ with support for different backends (local Docker, SkyPilot cloud). 
""" -from pathlib import Path -from typing import List, Literal, Optional, Union - -from .runner import EvaluationResult, DockerRunner, AlgorithmicRunner -from .runner.base import Runner +import atexit +import signal +from pathlib import Path +from typing import List, Literal, Optional, Union + +from .runner import AlgorithmicLocalRunner, EvaluationResult, ResearchDockerRunner +from .runner.base import Runner +from .runner.cluster_cleanup import ActiveClusterRegistry +from .runner.research_skypilot import ResearchSkyPilotRunner +from .runner.algorithmic_skypilot import AlgorithmicSkyPilotRunner TrackType = Literal["algorithmic", "research"] BackendType = Literal["docker", "skypilot"] -class FrontierCSEvaluator: +class SingleEvaluator: """ Unified evaluator for Frontier-CS problems. Example usage: - evaluator = FrontierCSEvaluator() + evaluator = SingleEvaluator() - # Algorithmic problem - result = evaluator.evaluate("algorithmic", problem_id=1, code=cpp_code) + # Algorithmic problem (uses Docker by default) + result = evaluator.evaluate("algorithmic", problem_id=1, code=cpp_code) - # Research problem (local Docker) - result = evaluator.evaluate("research", problem_id="flash_attn", code=py_code) + # Research problem (uses SkyPilot by default) + result = evaluator.evaluate("research", problem_id="flash_attn", code=py_code) - # Research problem (SkyPilot) - result = evaluator.evaluate("research", problem_id="flash_attn", code=py_code, - backend="skypilot") + # Override backend + result = evaluator.evaluate("research", problem_id="flash_attn", code=py_code, + backend="docker") """ - def __init__( - self, - backend: BackendType = "docker", - base_dir: Optional[Path] = None, - judge_url: str = "http://localhost:8081", + def __init__( + self, + backend: Optional[BackendType] = None, + base_dir: Optional[Path] = None, + judge_url: str = "http://localhost:8081", cloud: str = "gcp", region: Optional[str] = None, keep_cluster: bool = False, - idle_timeout: Optional[int] = 10, - timeout: Optional[int] = None, - ): + idle_timeout: Optional[int] = 10, + timeout: Optional[int] = None, + register_cleanup: bool = True, + ): """ - Initialize FrontierCSEvaluator. + Initialize SingleEvaluator. Args: - backend: Default backend for research problems ("docker" or "skypilot") + backend: Override default backend ("docker" or "skypilot"). 
+ If None, auto-detects: research -> skypilot, algorithmic -> docker base_dir: Base directory of Frontier-CS repo (auto-detected if None) judge_url: URL of the algorithmic judge server cloud: Cloud provider for SkyPilot ("gcp", "aws", "azure") @@ -64,20 +71,49 @@ def __init__( self.cloud = cloud self.region = region self.keep_cluster = keep_cluster - self.idle_timeout = idle_timeout - self.timeout = timeout + self.idle_timeout = idle_timeout + self.timeout = timeout + self._register_cleanup = register_cleanup # Lazy-initialized runners - self._algorithmic_runner: Optional[AlgorithmicRunner] = None + self._algorithmic_runner: Optional[AlgorithmicLocalRunner] = None self._algorithmic_skypilot_runner: Optional[Runner] = None - self._docker_runner: Optional[DockerRunner] = None - self._skypilot_runner: Optional[Runner] = None + self._docker_runner: Optional[ResearchDockerRunner] = None + self._skypilot_runner: Optional[Runner] = None + + if self._register_cleanup: + self._register_cleanup_hooks() + + def _register_cleanup_hooks(self) -> None: + if self.keep_cluster: + return + + def cleanup_on_exit(): + try: + names = ActiveClusterRegistry.snapshot() + if names: + ResearchSkyPilotRunner.down_clusters(names) + except Exception: + pass + try: + ResearchSkyPilotRunner.down_cluster(AlgorithmicSkyPilotRunner.CLUSTER_NAME) + except Exception: + pass + + def signal_handler(signum, frame): + print("\n\nInterrupted! Cleaning up...") + cleanup_on_exit() + raise SystemExit(1) + + atexit.register(cleanup_on_exit) + signal.signal(signal.SIGINT, signal_handler) + @property - def algorithmic_runner(self) -> AlgorithmicRunner: + def algorithmic_runner(self) -> AlgorithmicLocalRunner: """Get or create the algorithmic runner.""" if self._algorithmic_runner is None: - self._algorithmic_runner = AlgorithmicRunner(judge_url=self.judge_url) + self._algorithmic_runner = AlgorithmicLocalRunner(judge_url=self.judge_url) return self._algorithmic_runner @property @@ -95,18 +131,20 @@ def algorithmic_skypilot_runner(self) -> Runner: return self._algorithmic_skypilot_runner @property - def docker_runner(self) -> DockerRunner: + def docker_runner(self) -> ResearchDockerRunner: """Get or create the Docker runner.""" if self._docker_runner is None: - self._docker_runner = DockerRunner(base_dir=self.base_dir, timeout=self.timeout) + self._docker_runner = ResearchDockerRunner( + base_dir=self.base_dir, timeout=self.timeout + ) return self._docker_runner @property def skypilot_runner(self) -> Runner: """Get or create the SkyPilot runner.""" if self._skypilot_runner is None: - from .runner.skypilot import SkyPilotRunner - self._skypilot_runner = SkyPilotRunner( + from .runner.research_skypilot import ResearchSkyPilotRunner + self._skypilot_runner = ResearchSkyPilotRunner( base_dir=self.base_dir, cloud=self.cloud, region=self.region, @@ -117,7 +155,14 @@ def skypilot_runner(self) -> Runner: def _get_runner(self, track: TrackType, backend: Optional[BackendType] = None) -> Runner: """Get the appropriate runner for a track and backend.""" - effective_backend = backend or self.default_backend + # Priority: explicit backend > init backend > track default + if backend: + effective_backend = backend + elif self.default_backend: + effective_backend = self.default_backend + else: + # Auto-detect: research -> skypilot, algorithmic -> docker + effective_backend = "skypilot" if track == "research" else "docker" if track == "algorithmic": if effective_backend == "skypilot": @@ -289,5 +334,5 @@ def evaluate( result = evaluate("research", 
"flash_attn", solution_code) print(f"Score: {result.score}") """ - evaluator = FrontierCSEvaluator(backend=backend) + evaluator = SingleEvaluator(backend=backend) return evaluator.evaluate(track, problem_id, code)