diff --git a/src/arch/riscv64.zig b/src/arch/riscv64.zig
index 4d40434..39296fd 100644
--- a/src/arch/riscv64.zig
+++ b/src/arch/riscv64.zig
@@ -12,6 +12,10 @@ extern fn __rv64_enter_task(cx: *arch().Context) callconv(.C) noreturn;
 extern fn __rv64_switch_task(dcx: *arch().Context, scx: *arch().Context) callconv(.C) void;
 extern fn __rv64_task_enter_kernel() callconv(.C) noreturn;
 
+fn idleFunction() callconv(.naked) noreturn { // Body of the idle context; naked, so the compiler emits no prologue/epilogue around the asm.
+    asm volatile ("j ."); // `.` is the current address: jump-to-self, i.e. spin forever
+}
+
 pub fn arch() type {
     return struct {
         pub threadlocal var tHartId: u32 = 0;
@@ -22,6 +26,11 @@ pub fn arch() type {
             // Has to be exactly at offset 0x00, used in assembly
             kstack: thread.KStack(STACK_SIZE),
 
+            pub fn idle() @This() { // Builds the context that spins in idleFunction.
+                const entry = @intFromPtr(&idleFunction); // address of the spin loop, passed as the entry pc
+                return Context.kernel(entry, 0); // arg 0 is a placeholder: idleFunction never reads its argument
+            }
+
             pub fn kernel(pc: usize, arg: usize) @This() {
                 var ks = thread.KStack(STACK_SIZE).create();
                 const entry = @intFromPtr(&__rv64_task_enter_kernel);
diff --git a/src/arch/riscv64/boot.zig b/src/arch/riscv64/boot.zig
index bb463d1..2ec9efd 100644
--- a/src/arch/riscv64/boot.zig
+++ b/src/arch/riscv64/boot.zig
@@ -4,15 +4,15 @@ const kernel = @import("../../kernel.zig");
 const vmm = @import("vmm.zig");
 const regs = @import("regs.zig");
 const dtb = @import("../../util/dtb.zig");
-const physMemory = @import("../../mem/phys.zig");
+const mem = @import("../../mem.zig");
 const arena = @import("../../arena.zig");
 const exception = @import("exception.zig");
 
+const physMemory = mem.phys;
+const PAGE_SIZE = mem.vmm.PAGE_SIZE;
 const log = debug.log;
 const arch = kernel.arch;
 
-extern const __rela_start: u8;
-extern const __rela_end: u8;
 extern const __rv64_bsp_stack_top: u8;
 extern const __kernel_start: u8;
 extern const __kernel_end: u8;
@@ -20,7 +20,94 @@ extern const __kernel_end: u8;
 var gDtbAddress: usize = 0;
 var gBspHartId: u32 = 0;
 
-pub export fn rv64RelocateKernel(imageBase: usize, relaStart: usize, relaEnd: usize) void {
+fn bspUpperEntry(realAddress: usize, unused: usize) callconv(.C) noreturn { // Second boot stage; rv64BspLowerEntry long-jumps here with (realAddress, 0).
+    _ = unused;
+
+    arch.barrier(.acq_rel);
+
+    // Relocate the kernel yet again, this time to another base
+    const relaStart = @intFromPtr(&__rela_start);
+    const relaEnd = @intFromPtr(&__rela_end);
+    const relOffset = vmm.KERNEL_VIRTUAL_BASE + vmm.L1.offset(realAddress); // new image base: virtual base plus realAddress's L1 offset
+
+    arch.barrier(.acq_rel);
+    rv64RelocateKernel(relOffset, relaStart, relaEnd);
+    vmm.unmapEarly(); // counterpart of the vmm.mapEarly() call made in rv64BspLowerEntry
+
+    // Set up exception handling
+    exception.init();
+
+    debug.log.setWriteFn(&sbi.debugPrintByte); // set again here, after the relocation above
+    kernel.mem.PhysicalAddress.gVirtualizeBase = 0;
+    kernel.mem.PhysicalAddress.gVirtualizeSize = vmm.virtualizeRange();
+
+    // Set up physical memory management
+    setupMemoryFromFdt(realAddress);
+
+    setupPerCpu(); // allocates the TLS block and sets the thread pointer
+    arch.tHartId = gBspHartId; // tHartId is threadlocal; written after setupPerCpu() has installed the thread pointer
+
+    kernel.kernel_main();
+}
+
+pub export fn rv64BspLowerEntry(realAddress: usize, bspHartId: usize, dtbAddress: usize) callconv(.C) noreturn { // First boot stage: records boot parameters, maps the kernel early and jumps to bspUpperEntry at its virtual address.
+    debug.log.setWriteFn(&sbi.debugPrintByte);
+
+    gDtbAddress = dtbAddress;
+    gBspHartId = @truncate(bspHartId); // gBspHartId is u32: keep the low 32 bits
+
+    vmm.mapEarly(realAddress);
+
+    // &bspUpperEntry will yield a pointer like: X + P, where
+    // * X is symbol's raw address,
+    // * P is the physical load base of the image (0x80200000 on rv64 usually)
+    //
+    // Relocate the address to point to Y + P, where Y is the virtual load base
+    // const kernelL1Offset = realAddress & ((1 << 30) - 1);
+    const realAddressL1Offset = vmm.L1.offset(realAddress);
+    const virtualEntry = @intFromPtr(&bspUpperEntry) + vmm.KERNEL_VIRTUAL_BASE - realAddress + realAddressL1Offset; // swap the realAddress base for KERNEL_VIRTUAL_BASE + L1 offset
+    const virtualSp = @intFromPtr(&__rv64_bsp_stack_top) + vmm.KERNEL_VIRTUAL_BASE - realAddress + realAddressL1Offset; // same rebasing applied to the boot stack top
+
+    longJump(virtualEntry, virtualSp, realAddress, 0); // bspUpperEntry receives (realAddress, 0)
+
+    arch.halt(); // not reached: longJump is declared noreturn
+}
+
+// Functions used by the boot process
+
+extern const __rela_start: u8;
+extern const __rela_end: u8;
+extern var __tdata_start: u8;
+extern var __tdata_end: u8;
+extern var __tbss_start: u8;
+extern var __tbss_end: u8;
+
+fn setupPerCpu() void { // Builds this CPU's TLS block (copy of .tdata followed by zeroed .tbss) and installs it as the thread pointer.
+    // Assume .tbss follows .tdata
+    const tdataStart = @intFromPtr(&__tdata_start);
+    const tdataEnd = @intFromPtr(&__tdata_end);
+    const tdataSize = tdataEnd - tdataStart;
+    const tbssStart = @intFromPtr(&__tbss_start);
+    const tbssEnd = @intFromPtr(&__tbss_end);
+    const tbssSize = tbssEnd - tbssStart;
+
+    const tdataData = @as([*]u8, @ptrFromInt(tdataStart))[0..tdataSize]; // initialisation image for thread-locals
+
+    const tlsSize = tdataSize + tbssSize; // NOTE(review): ignores any alignment padding between .tdata and .tbss — confirm against the linker script
+    const tlsPageCount = (tlsSize + PAGE_SIZE - 1) / PAGE_SIZE; // round up to whole pages
+    // Variant I: TLS block 0 follows TP after a certain displacement
+    const tlsAddress = (physMemory.alloc_pages(tlsPageCount) orelse @panic("OOM: could not allocate TLS")).virtualize(); // explicit panic rather than `.?`, matching KStack.create's OOM handling
+    const tlsData = @as([*]u8, @ptrFromInt(tlsAddress))[0..tlsSize];
+
+    log.info("Allocated TLS @ {*}", .{ tlsData });
+
+    @memcpy(tlsData[0..tdataSize], tdataData); // initialised thread-locals
+    @memset(tlsData[tdataSize..], 0); // zero-initialised thread-locals
+
+    arch.setThreadPointer(tlsAddress); // the thread pointer is set to the very start of the block
+}
+
+export fn rv64RelocateKernel(imageBase: usize, relaStart: usize, relaEnd: usize) void {
     const elf = @import("std").elf;
 
     const relaTablePtr = @as([*]elf.Rela, @ptrFromInt(relaStart));
@@ -74,36 +161,6 @@ fn setupMemoryFromFdt(realAddress: usize) void {
     physMemory.init();
 }
 
-fn bspUpperEntry(realAddress: usize, unused: usize) callconv(.C) noreturn {
-    _ = unused;
-
-    arch.barrier(.acq_rel);
-
-    // Relocate the kernel yet again, this time to another base
-    const relaStart = @intFromPtr(&__rela_start);
-    const relaEnd = @intFromPtr(&__rela_end);
-    const relOffset = vmm.KERNEL_VIRTUAL_BASE + vmm.L1.offset(realAddress);
-
-    arch.barrier(.acq_rel);
-    rv64RelocateKernel(relOffset, relaStart, relaEnd);
-    vmm.unmapEarly();
-
-    // Setup exception handling
-    exception.init();
-
-    debug.log.setWriteFn(&sbi.debugPrintByte);
-    kernel.mem.PhysicalAddress.gVirtualizeBase = 0;
-    kernel.mem.PhysicalAddress.gVirtualizeSize = vmm.virtualizeRange();
-
-    // Setup physical memory management
-    setupMemoryFromFdt(realAddress);
-
-    kernel.thread.setupCurrentCpu();
-    arch.tHartId = gBspHartId;
-
-    kernel.kernel_main();
-}
-
 inline fn longJump(pc: usize, sp: usize, a0: usize, a1: usize) noreturn {
     asm volatile (
         \\ mv sp, %[sp]
@@ -117,26 +174,3 @@ inline fn longJump(pc: usize, sp: usize, a0: usize, a1: usize) noreturn {
     );
     unreachable;
 }
-
-pub export fn rv64BspLowerEntry(realAddress: usize, bspHartId: usize, dtbAddress: usize) callconv(.C) noreturn {
-    debug.log.setWriteFn(&sbi.debugPrintByte);
-
-    gDtbAddress = dtbAddress;
-    gBspHartId = @truncate(bspHartId);
-
-    vmm.mapEarly(realAddress);
-
-    // &bspUpperEntry will yield a pointer like: X + P, where
-    // * X is symbol's raw address,
-    // * P is the physical load base of the image (0x80200000 on rv64 usually)
-    //
-    // Relocate the address to point to Y + P, where Y is the virtual load base
-    // const kernelL1Offset = realAddress & ((1 << 30) - 1);
-    const realAddressL1Offset = vmm.L1.offset(realAddress);
-    const virtualEntry = @intFromPtr(&bspUpperEntry) + vmm.KERNEL_VIRTUAL_BASE - realAddress + realAddressL1Offset;
-    const virtualSp = @intFromPtr(&__rv64_bsp_stack_top) + vmm.KERNEL_VIRTUAL_BASE - realAddress + realAddressL1Offset;
-
-    longJump(virtualEntry, virtualSp, realAddress, 0);
-
-    arch.halt();
-}
diff --git a/src/kernel.zig b/src/kernel.zig
index ba0ee4d..708071d 100644
--- a/src/kernel.zig
+++ b/src/kernel.zig
@@ -17,6 +17,7 @@ fn f0(arg: usize) callconv(.C) noreturn {
     while (true) {
         f1(arg, c);
         c += 1;
+        thread.yield();
     }
 }
 
@@ -25,18 +26,17 @@ noinline fn f1(arg: usize, c: usize) void {
 }
 
 pub export fn kernel_main() callconv(.C) noreturn {
+    log.write("\x1B[2J", .{}); // ANSI "erase display" escape sequence
     var a = arena.Arena.setup(256 * 0x1000) orelse @panic("Could not setup kernel arena");
-    const pc = @intFromPtr(&f0);
+    thread.Queue.initThisCpu(&a); // must run before thread.enqueue()/thread.enter(), which unwrap Queue.thisCpu
 
-    // log.write("\x1B[2J", .{});
-    for (0..32) |i| {
+    const pc = @intFromPtr(&f0); // every demo thread starts in f0
+    for (0..4) |i| { // the loop index doubles as the thread's argument
         const t = thread.Thread.create(&a, pc, i);
-        thread.addThread(t);
+        thread.enqueue(t); // link into this CPU's run queue
     }
 
     thread.enter();
-
-    arch.halt();
 }
 
 pub fn panic(msg: []const u8, error_return_trace: ?*std.builtin.StackTrace, return_address: ?usize) noreturn {
diff --git a/src/thread.zig b/src/thread.zig
index 23dbfde..73bfd16 100644
--- a/src/thread.zig
+++ b/src/thread.zig
@@ -5,6 +5,70 @@ const arch = @import("kernel.zig").arch;
 const log = @import("debug.zig").log;
 const mem = @import("mem.zig");
 
+pub const Queue = struct { // Run queue: a circular doubly linked list of threads plus an idle context; one instance is published per CPU via thisCpu.
+    idle: arch.Context, // context entered when there is no thread to run
+    current: ?*Thread = null, // thread currently running; null while the idle context runs
+    head: ?*Thread = null, // first enqueued thread; null while the queue is empty
+
+    pub threadlocal var thisCpu: ?*Queue = null; // set by initThisCpu()
+
+    pub fn initThisCpu(a: *arena.Arena) void { // Allocates a queue from `a` and publishes it as thisCpu.
+        const idle = arch.Context.idle();
+        const q = a.create(Queue);
+        q.* = .{ .idle = idle }; // current and head keep their null defaults
+        thisCpu = q;
+    }
+
+    pub fn enter(self: *@This()) noreturn { // Starts scheduling: enters the head thread, or the idle context if the queue is empty.
+        if (self.head) |gt| {
+            self.current = gt;
+            gt.enter();
+        } else {
+            self.current = null;
+            self.idle.enter();
+        }
+    }
+
+    pub fn yield(self: *@This()) void { // Switches to the next context if there is one; otherwise returns without switching.
+        if (self.current) |curr| {
+            // Switching from thread
+            if (curr.next) |next| {
+                // ... to thread
+                if (next != curr) { // a lone thread links to itself: nothing to switch to
+                    self.current = next;
+                    next.switchFrom(curr);
+                }
+            } else {
+                // ... to idle
+                self.current = null; // enqueue() always sets `next`, so this branch implies curr was never enqueued — TODO confirm nothing else clears `next`
+                self.idle.switchFrom(&curr.archContext);
+            }
+        } else {
+            // Switching from idle
+            if (self.head) |gt| {
+                // ... to thread
+                self.current = gt; // always resumes at the head of the ring
+                gt.archContext.switchFrom(&self.idle);
+                return;
+            }
+            // ... back to idle
+        }
+    }
+
+    pub fn enqueue(self: *@This(), t: *Thread) void { // Links `t` in just before head, i.e. at the tail of the ring.
+        if (self.head) |gt| {
+            t.next = gt;
+            t.prev = gt.prev;
+            gt.prev.?.next = t; // every linked thread has a non-null prev (see the else branch)
+            gt.prev = t;
+        } else {
+            self.head = t;
+            t.next = t; // first thread forms a one-element ring
+            t.prev = t;
+        }
+    }
+};
+
 pub const Thread = struct {
     allocator: *arena.Arena,
     archContext: arch.Context,
@@ -12,11 +76,11 @@ pub const Thread = struct {
     next: ?*Thread = null,
     prev: ?*Thread = null,
 
-    pub fn create(a: *arena.Arena, pc: usize, sp: usize) *Thread {
+    pub fn create(a: *arena.Arena, pc: usize, arg: usize) *Thread { // Allocates a Thread from `a` whose kernel context is built from entry `pc` and argument `arg`.
         const thread = a.create(Thread);
         thread.* = .{
             .allocator = a,
-            .archContext = arch.Context.kernel(pc, sp),
+            .archContext = arch.Context.kernel(pc, arg), // second parameter of Context.kernel is `arg`, not a stack pointer
         };
         return thread;
     }
@@ -42,11 +106,7 @@ pub fn KStack(comptime SIZE: usize) type {
             const physicalBase = mem.phys.alloc_pages(SIZE * @sizeOf(usize) / 0x1000) orelse @panic("OOM");
             const ptr = @as(*[SIZE]usize, @ptrFromInt(physicalBase.virtualize()));
 
-            return .{
-                .data = ptr,
-                .physicalBase = physicalBase,
-                .sp = @ptrFromInt(@intFromPtr(&ptr[0]) + SIZE * @sizeOf(usize))
-            };
+            return .{ .data = ptr, .physicalBase = physicalBase, .sp = @ptrFromInt(@intFromPtr(&ptr[0]) + SIZE * @sizeOf(usize)) };
         }
 
         pub fn push(self: *@This(), value: usize) void {
@@ -59,66 +119,14 @@ pub fn KStack(comptime SIZE: usize) type {
     };
 }
 
-var gThreadHead: ?*Thread = null;
-var gCurrent: ?*Thread = null;
-
-pub fn addThread(t: *Thread) void {
-    if (gThreadHead) |gt| {
-        t.next = gt;
-        t.prev = gt.prev;
-        gt.prev.?.next = t;
-        gt.prev = t;
-    } else {
-        gThreadHead = t;
-        t.next = t;
-        t.prev = t;
-    }
+pub fn enqueue(t: *Thread) void { // Adds `t` to the current CPU's run queue.
+    Queue.thisCpu.?.enqueue(t); // `.?` asserts Queue.initThisCpu() has already run on this CPU
 }
 
 pub fn enter() noreturn {
-    if (gThreadHead) |gt| {
-        gCurrent = gt;
-        gt.enter();
-    }
-    @panic("Unreachable");
+    Queue.thisCpu.?.enter(); // `.?` asserts Queue.initThisCpu() has already run on this CPU; an empty queue now enters the idle context
 }
 
 pub fn yield() void {
-    const curr = gCurrent orelse @panic("No current thread");
-    const next = curr.next orelse @panic("No next thread");
-
-    if (curr != next) {
-        gCurrent = next;
-        next.switchFrom(curr);
-    }
-}
-
-extern var __tdata_start: u8;
-extern var __tdata_end: u8;
-extern var __tbss_start: u8;
-extern var __tbss_end: u8;
-
-pub fn setupCurrentCpu() void {
-    // Assume .tbss follows .tdata
-    const tdataStart = @intFromPtr(&__tdata_start);
-    const tdataEnd = @intFromPtr(&__tdata_end);
-    const tdataSize = tdataEnd - tdataStart;
-    const tbssStart = @intFromPtr(&__tbss_start);
-    const tbssEnd = @intFromPtr(&__tbss_end);
-    const tbssSize = tbssEnd - tbssStart;
-
-    const tdataData = @as([*]u8, @ptrFromInt(tdataStart))[0..tdataSize];
-
-    const tlsSize = tdataSize + tbssSize;
-    const tlsPageCount = (tlsSize + mem.vmm.PAGE_SIZE - 1) / mem.vmm.PAGE_SIZE;
-    // Variant I: TLS block 0 follows TP after a certain displacement
-    const tlsAddress = mem.phys.alloc_pages(tlsPageCount).?.virtualize();
-    const tlsData = @as([*]u8, @ptrFromInt(tlsAddress))[0..tlsSize];
-
-    log.info("Allocated TLS @ {*}", .{ tlsData });
-
-    @memcpy(tlsData[0..tdataSize], tdataData);
-    @memset(tlsData[tdataSize..], 0);
-
-    arch.setThreadPointer(tlsAddress);
+    Queue.thisCpu.?.yield(); // `.?` asserts Queue.initThisCpu() has already run on this CPU
 }