From 5225018397297fcae91cd96eb7c4af6e098bede1 Mon Sep 17 00:00:00 2001 From: Abhijit Kar Date: Thu, 6 Nov 2025 17:54:02 +0530 Subject: [PATCH 1/4] Added Bunnymark example, written using SDL3 GPU. It can teach a lot of things, for instance: - [x] GPU Instancing - [x] Bit Packing - [x] Fixed Update Loop - [x] Multithreading - [x] AoS - [x] SoA - [ ] SIMD --- Currently the SIMD implementation is not performant. (i.e. `simulate_soa_simd`) Also, the comments are not currently explaining all the things done in the code. --- .gitignore | 4 +- sdl3/bunnymark/assets/bunnys.png | Bin 0 -> 1887 bytes sdl3/bunnymark/assets/wabbit_alpha.png | Bin 0 -> 449 bytes sdl3/bunnymark/bunnymark.odin | 691 +++++++++++++++++++++++++ sdl3/bunnymark/shader.hlsl.frag | 19 + sdl3/bunnymark/shader.hlsl.vert | 87 ++++ 6 files changed, 800 insertions(+), 1 deletion(-) create mode 100644 sdl3/bunnymark/assets/bunnys.png create mode 100644 sdl3/bunnymark/assets/wabbit_alpha.png create mode 100644 sdl3/bunnymark/bunnymark.odin create mode 100644 sdl3/bunnymark/shader.hlsl.frag create mode 100644 sdl3/bunnymark/shader.hlsl.vert diff --git a/.gitignore b/.gitignore index fa72eef..6c944df 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,6 @@ ols.json # orca samples orca_output -module.wasm \ No newline at end of file +module.wasm + +.DS_Store \ No newline at end of file diff --git a/sdl3/bunnymark/assets/bunnys.png b/sdl3/bunnymark/assets/bunnys.png new file mode 100644 index 0000000000000000000000000000000000000000..7010eb274c53d0108799bcadb14ac743afd767fe GIT binary patch literal 1887 zcmV-l2cY0003CP)t-s00000 z006-<)50>;hhG3?hX4SDctCE40O|h#w9){x(PoDL0P6pUU;saEhCgkF09|JQUS|ME zROLMSyM+LYy8!7y0RR902{Zoyxu^iSsQ`v}P>88OZia_*{)ck@hjagkT$-&i zx`Y6Vx&Vs1006kC09;h+K>$#RsZfZhQHiOVbN`U{|Bv?nhg<5wGSu#=0JHA^0PNKO zT~rD){{V$|UZTi^4gmiq0GV_DkN5xOJ^SW8`|hX!v+e+Ye}4dTeFR=<|3Lq=?*QsT z008UN_Oku%Mi9qYUox@dIFXdaw&d86FXRJilQx97HsBl(0u6<>-~W^J 
zKf6*aYbP%4qmNM{P4>5Qt~BrE(WCDsLdk=ySS*&yWp;-k0!&xg;DR7=DJ2ww1;aQ?Y@4r?Ttz>g8Kb>3>V zs?}zz^{kb`J3ITWX6vYR)NHos8gItl**O4cwbU^ zi0t1S!Fr=Dj|sQmYU|$-dw;X(I|s+$$H%lmrB>WLGO}(Te|Y=ht;QntinX5;|MZFO z?RMg&6Pau#Ym0XK`EmOeO;_Nr$OxB!Sg4e&_xZO}TXfRzYrdSq>DKsN<2S=@N ztfZ{BV_JPL68lW-JdA@t296_y<3M3j>wpFj+#JR^C^$Vj941@h{OaAXW~7CSLs1g!No=eujnI^PYp zI^PYV;Gh9lHx6=vPlet#E^|j8w!>QIz&`5JVGDBtI$6j^#u|mig9~Xe)`HX5nF~&5 zaqeH@A3C_)+45pW!{cJPoGp#aT^uE?iY}Cyf#X%B@YV{utTnCnL?TKGyRzGLg>ZYW zbUEDXxwyGq+!^Pt0%`|qjc#;%ZVwO68p~2|l7h!VV*`6VD|45#S>!_GAthRK(Kp2C;-WtigR4}E8ipHP>A{uH16?dH zmNU3;jm!xz)%QJHx;+z@+>2;B%zCDEDeul=?6UaYy357xZL`iNop-s^{=vFCiz#P~ zM$TsVMyPe1bJbRAY_H39%~gk!ciFnTZi!(2ONp)9jRzM_kFgesJWO0l;4h4G!Rg29%rX+=*Jo%Y^^aT+{>=TMh@lTA{NVry`U;TK&VUl{ij{6ml=`?81ReEq~ zQ?x`oxk?PK0;H~&rL1c$I=>hTVSJERF{kTEJkD2(%p+zAyfvgDX6R(C|3@`c)~L z*uzoY5FSO!CLUt-G{(HO!lZmPaER5>797gZ^MnXTp7a>(jl9takQ6gDz+njc5Ep~R z%d?mIZBUAV&(6*?R?5THe{j}_I`SaM!3&wLji@XV>^n^lF3%f!VsFp4EMJU9qnpIw zvMg&4Zo~LRM&;qnO?+&?70aiZDe}F9USe=r6-tTPg4lEU literal 0 HcmV?d00001 diff --git a/sdl3/bunnymark/assets/wabbit_alpha.png b/sdl3/bunnymark/assets/wabbit_alpha.png new file mode 100644 index 0000000000000000000000000000000000000000..79c31675083b7ffc272a6370bc189360bde484d0 GIT binary patch literal 449 zcmV;y0Y3hTP)SOJFMz`0@qn1GK$9W|wOohG3Lai}FkWLtiT<2<{JbM8>W7|R$! zUuNqg#gmKmsTnILy&K;zJjd^_40xy)%EA1>FWWk~LFpr>Et|!Hv(V>Hs9!2xEQJ>~!qz76HLsxFXdG zs5~GjUDeD6dXvG$_#HtoUok|M-X@BSTy{VK4P?TJZcz}O&FVzD0fZu8Yajr@?2aYL r_Q@_>J=ju&>AqoA-=wzwF98MsNX4 (texture: ^sdl.GPUTexture) { + // Load the "./assets/bunnys.png", to see multicolored bunnies. + // N.B. You must uncomment some lines in `shader.hlsl.vert` file to see those bunnies properly. 
+ surface := img.Load("./assets/wabbit_alpha.png"); assert(surface != nil) + premultiply_surface_alpha_bitwise(surface) + defer sdl.DestroySurface(surface) + + pixels_byte_size := surface.w * surface.h * 4 + + texture = sdl.CreateGPUTexture(gpu, { + type = .D2, + format = .R8G8B8A8_UNORM, + usage = {.SAMPLER}, + width = u32(surface.w), + height = u32(surface.h), + layer_count_or_depth = 1, + num_levels = 1, + }) + + tex_transfer_buf := sdl.CreateGPUTransferBuffer(gpu, { + usage = .UPLOAD, + size = u32(pixels_byte_size) + }) + + tex_transfer_mem := sdl.MapGPUTransferBuffer(gpu, tex_transfer_buf, false) + mem.copy(tex_transfer_mem, surface.pixels, int(pixels_byte_size)) + sdl.UnmapGPUTransferBuffer(gpu, tex_transfer_buf) + + copy_cmd_buf := sdl.AcquireGPUCommandBuffer(gpu) + copy_pass := sdl.BeginGPUCopyPass(copy_cmd_buf) + + sdl.UploadToGPUTexture(copy_pass, + { transfer_buffer = tex_transfer_buf }, + { texture = texture, w = u32(surface.w), h = u32(surface.h), d = 1 }, + false + ) + + sdl.EndGPUCopyPass(copy_pass) + ok := sdl.SubmitGPUCommandBuffer(copy_cmd_buf); assert(ok) + + sdl.ReleaseGPUTransferBuffer(gpu, tex_transfer_buf) + + return texture +} + +populate_bunnies :: proc(sprites_soa: ^#soa[]SpriteSoA, sprites_aos: ^[]SpriteAoS) -> Buffers { + for i in 0..= len(data.sprites_aos) do break + + packed := data.sprites_aos[idx].position_and_color + + x := i32(packed >> 21 & 0x7FF) + y := i32(packed >> 11 & 0x3FF) + s := i32(packed >> 7 & 0xF ) + + // Apply velocity + x += i32(vxs[idx]) + y += i32(vys[idx]) + + // Bounce X + if x < 0 { + x = -x + vxs[idx] = -vxs[idx] + } else if x > SCREEN_SIZE.x { + x = 2 * SCREEN_SIZE.x - x + vxs[idx] = -vxs[idx] + } + + // Bounce Y + if y < 0 { + y = -y + vys[idx] = -vys[idx] + } else if y > SCREEN_SIZE.y { + y = 2 * SCREEN_SIZE.y - y + vys[idx] = -vys[idx] + } + + packed = u32((i32(x) << 21) | (i32(y) << 11) | (i32(s) << 8)) + + data.sprites_aos[idx].position_and_color = packed + } + } +} + +simulate_soa :: proc(t: thread.Task) 
{ + data := cast(^ThreadTask)t.data + + // Already reaches AoS perf, by looping over arrays separately (x, y: u32, vx, vy: f32) + xs := data.sprites_soa.x + ys := data.sprites_soa.y + vxs := data.sprites_soa.vx + vys := data.sprites_soa.vy + + dt := data.dt + n := len(data.sprites_soa) + + for i in 0.. f32(SCREEN_SIZE.x) { + xs[i] = 2 * f32(SCREEN_SIZE.x) - xs[i] + vxs[i] = -vxs[i] + } + } + for i in 0.. f32(SCREEN_SIZE.y) { + ys[i] = 2 * f32(SCREEN_SIZE.y) - ys[i] + vys[i] = -vys[i] + } + } + for i in 0..> 7 & 0xF) + data.sprites_aos[i].position_and_color = (u32(xs[i]) << 21) | (u32(ys[i]) << 11) | (u32(s) << 7) + } +} + +simulate_soa_simd :: proc(t: thread.Task) { + data := cast(^ThreadTask)t.data + + xs := data.sprites_soa.x + ys := data.sprites_soa.y + vxs := data.sprites_soa.vx + vys := data.sprites_soa.vy + + n := len(data.sprites_soa) + + screen_x := Vecf32(SCREEN_SIZE.x) + screen_y := Vecf32(SCREEN_SIZE.y) + zero := Vecf32(0.0) + two := Vecf32(2.0) + + index := iota(Vecf32) + mask := simd.lanes_lt(index, Vecf32(n)) + + // X Axis + i := 0 + for ; i+4 <= n; i += 4 { + // load lanes from slices + x := simd.from_slice(Vecf32, xs[i : i + W]) + vx := simd.from_slice(Vecf32, vxs[i : i + W]) + + x = simd.add(x, vx) + + // compute bounce masks (per-lane) + mask_lt := simd.lanes_lt(x, zero) // x < 0 + vx = simd.select(mask_lt, +simd.abs(vx), vx) + + mask_gt := simd.lanes_gt(x, screen_x) // x > screen_x + vx = simd.select(mask_gt, -simd.abs(vx), vx) + + mask_any := mask_lt | mask_gt // lanes that bounced + + simd.masked_store(&xs [i], x, mask) + simd.masked_store(&vxs[i], vx, mask_any) + } + + // Tail for X + for ; i < n; i += 1 { + xs[i] += vxs[i] + + if xs[i] < 0 { + xs[i] = -xs[i] + vxs[i] = -vxs[i] + } else if xs[i] > f32(SCREEN_SIZE.x) { + xs[i] = 2 * f32(SCREEN_SIZE.x) - xs[i] + vxs[i] = -vxs[i] + } + } + + // Y Axis + i = 0 + for ; i+4 <= n; i += 4 { + y := simd.from_slice(Vecf32, ys[i : i + W]) + vy := simd.from_slice(Vecf32, vys[i : i + W]) + + y = 
simd.add(y, vy) + + // compute bounce masks (per-lane) + mask_lt := simd.lanes_lt(y, zero) // y < 0 + vy = simd.select(mask_lt, +simd.abs(vy), vy) + + mask_gt := simd.lanes_gt(y, screen_y) // y > screen_y + vy = simd.select(mask_gt, -simd.abs(vy), vy) + + mask_any := mask_lt | mask_gt // lanes that bounced + + simd.masked_store(&ys [i], y, mask) + simd.masked_store(&vys[i], vy, mask_any) + } + + // Tail for Y + for ; i < n; i += 1 { + ys[i] += vys[i] + + if ys[i] < 0 { + ys[i] = -ys[i] + vys[i] = -vys[i] + } else if ys[i] > f32(SCREEN_SIZE.y) { + ys[i] = 2 * f32(SCREEN_SIZE.y) - ys[i] + vys[i] = -vys[i] + } + } + + // Pack-Em-up + for i in 0..= FIXED_DELTA_TIME && updates < MAX_FRAME_SKIP; accumulator -= FIXED_DELTA_TIME { + updates += 1 + fixed_updates += 1 + + for i in 0..= BUNNIES do break + + data = new(ThreadTask) + data.dt = f32(FIXED_DELTA_TIME) * 20 + data.sprites_soa = sprites_soa[start_id:end_id] + data.sprites_aos = sprites_aos[start_id:end_id] + + // thread.pool_add_task(&pool, context.allocator, simulate, data, i) + thread.pool_add_task(&pool, context.allocator, simulate_soa, data, i) + // thread.pool_add_task(&pool, context.allocator, simulate_soa_simd, data, i) + } + + thread.pool_finish(&pool) + + // This freezes the screen while trying to quit, even though it runs a bit faster. + // Adding boundary collision slows it down to the same speed as Multi Threaded one. + // for i := 0; i < BUNNIES; i += 100 { + // #unroll for j in 0..<100 { + // idx := i + j + // if idx >= BUNNIES do break + + // packed = sprites_instances[idx].position_and_color + + // x = packed >> 21 & 0x7FF + // y = packed >> 11 & 0x3FF + // c = packed >> 8 & 0x7 + // x = x < u32(SCREEN_SIZE.x) ? x + u32(sdl.rand(20)) : 0 + // y = y < u32(SCREEN_SIZE.y) ? 
y + u32(sdl.rand(20)) : 0 + + // packed = (x << 21) | (y << 11) | (c << 8) + + // sprites_instances[idx].position_and_color = packed + // } + // } + + mem.copy(buffers.transfer_mem, raw_data(sprites_aos), buffers.sprites_instances_byte_size) + + copy_cmd_buf = sdl.AcquireGPUCommandBuffer(gpu) + copy_pass = sdl.BeginGPUCopyPass(copy_cmd_buf) + sdl.UploadToGPUBuffer(copy_pass, + { transfer_buffer = buffers.transfer_buf }, + { buffer = buffers.sprites_instances_buffer, + size = u32(buffers.sprites_instances_byte_size) }, + true + ) + sdl.EndGPUCopyPass(copy_pass) + ok = sdl.SubmitGPUCommandBuffer(copy_cmd_buf); assert(ok) + } + + if updates >= MAX_FRAME_SKIP { + accumulator = 0.0 + } + alpha = accumulator / FIXED_DELTA_TIME + + cmd_buf = sdl.AcquireGPUCommandBuffer(gpu) + ok = sdl.WaitAndAcquireGPUSwapchainTexture( + cmd_buf, + window, + &swapchain_tex, + nil, + nil, + ); assert(ok) + + if swapchain_tex != nil { + color_target = sdl.GPUColorTargetInfo { + texture = swapchain_tex, + load_op = .CLEAR, + clear_color = {1, 1, 1, 1}, + store_op = .STORE, + cycle = false, + } + render_pass = sdl.BeginGPURenderPass(cmd_buf, &color_target, 1, nil) + + sdl.BindGPUGraphicsPipeline (render_pass, pipeline) + sdl.BindGPUVertexStorageBuffers(render_pass, 0, &buffers.sprites_instances_buffer, 1) + sdl.BindGPUFragmentSamplers (render_pass, 0, &(sdl.GPUTextureSamplerBinding { + texture = bunny_texture, + sampler = sampler, + }), 1) + sdl.DrawGPUPrimitives(render_pass, 4, BUNNIES, 0, 0) + sdl.EndGPURenderPass (render_pass) + } + + ok = sdl.SubmitGPUCommandBuffer(cmd_buf); assert(ok) + + frame_count += 1 + time_accumulator += dt + if time_accumulator >= 1 { + current_fps = f64(frame_count) / time_accumulator + fps_smoothed = 0.9 * fps_smoothed + 0.1 * current_fps + updates_per_sec = f64(fixed_updates) / time_accumulator + + text = fmt.caprintf("Bunnymark | FPS: %.2f (%.2f ms) | Fixed Updates: %.2f Hz", current_fps, dt, updates_per_sec) + sdl.SetWindowTitle(window, text) + + frame_count 
= 0 + fixed_updates = 0 + time_accumulator = 0 + } + } +} + +premultiply_surface_alpha_bitwise :: proc(surf: ^sdl.Surface) { + pixels := cast(^u32) surf.pixels; assert(surf.format == .ABGR8888) + + for i in 0..< surf.w * surf.h { + p := mem.ptr_offset(pixels, i) + + a := p^ >> 24 & 0xFF + b := p^ >> 16 & 0xFF + g := p^ >> 8 & 0xFF + r := p^ & 0xFF + + unchanged_alpha := a + a /= 255.0 + + p^ = unchanged_alpha << 24 | b * a << 16 | g * a << 8 | r * a + } +} + +iota :: proc ($V: typeid/#simd[$N]$E) -> (result: V) { + for i in 0.. sprites : register(t1, space0); + +struct VSOutput +{ + float4 position : SV_Position; + float2 uv : TEXCOORD0; + uint sprite_index : COLOR0; +}; + +// --------- Constants --------- +static const float2 vertex_pos[4] = { + float2(0.0, 0.0), + float2(0.0, 1.0), + float2(1.0, 0.0), + float2(1.0, 1.0) +}; + +// ---- Convert from pixel coordinates to NDC (-1..1) ---- +static const float2 sprite_size = float2( 52.0, 74.0); +static const float2 screen_size = float2(1280.0, 720.0); + +// ---- Quad Size in NDC ---- +static const float2 sprite_size_ndc = sprite_size / screen_size; + +static const float num_cols = 1.0; +static const float num_rows = 5.0; + +// ---- Precomputed SpriteSheet UVs ---- +// Uncomment to use the spritesheet, with multi colored bunnies. 
+// struct SpriteUV +// { +// float2 uv_min; +// float2 uv_max; +// }; + +// static const SpriteUV uvs[5] = { +// { float2( 0.0, 0.01 ), float2( 0.9, 0.22 ) }, +// { float2( 0.0, 0.236 ), float2( 0.96, 0.4 ) }, +// { float2( 0.0, 0.42 ), float2( 1.0, 0.6 ) }, +// { float2( 0.0, 0.62 ), float2( 1.0, 0.8 ) }, +// { float2( 0.0, 0.8 ), float2( 1.0, 1.0 ) }, +// }; + +VSOutput main(uint vertexID : SV_VertexID, uint instanceID : SV_InstanceID) +{ + VSOutput output; + + // ---- Indexing ---- + uint instance_index = instanceID; + uint vertex_index = vertexID % 4; + + // ---- Load packed sprite data once ---- + uint packed = sprites[instance_index].position_and_color; + + // ---- Decode packed fields ---- + // [x, x, x, x, x, x, x, x, x, x, x, y, y, y, y, y, y, y, y, y, y, s, s, s, s, 0, 0, 0, 0, 0, 0, 0] (11x, 10y, 4s) 7 bit padding + uint px = (packed >> 21) & 0x7FFu; // 11 bits + uint py = (packed >> 11) & 0x3FFu; // 10 bits + // uint si = (packed >> 7) & 0xFu ; // 4 bits + + // ---- Position in NDC ---- + float2 position = float2(px, py); + float2 ndc_pos = (position / screen_size) * 2.0f - 1.0f; + + float2 vertex_coord = vertex_pos[vertex_index]; + float2 offset = (vertex_coord - 0.5f) * sprite_size_ndc; + + float2 world_pos = ndc_pos + offset; + + output.position = float4(world_pos.x, -world_pos.y, 0.0, 1.0); + + // For Sprite Sheet + // Uncomment to see multicolored bunnies, instead of a single colored. + // SpriteUV uv = uvs[si]; + // output.uv = lerp(uv.uv_min, uv.uv_max, vertex_coord); + + // Single Sprite + output.uv = vertex_coord; + + return output; +} \ No newline at end of file From c4e5c7f610e57b49ebc2f178d8e000ac0d302c75 Mon Sep 17 00:00:00 2001 From: Abhijit Kar Date: Thu, 6 Nov 2025 18:10:03 +0530 Subject: [PATCH 2/4] Fixed the issues with the code, by using: `-vet -strict-style -vet-tabs -disallow-do -warnings-as-errors` for compilation. 
--- sdl3/bunnymark/bunnymark.odin | 176 +++++++++++++++++----------------- 1 file changed, 89 insertions(+), 87 deletions(-) diff --git a/sdl3/bunnymark/bunnymark.odin b/sdl3/bunnymark/bunnymark.odin index 737b796..857b717 100644 --- a/sdl3/bunnymark/bunnymark.odin +++ b/sdl3/bunnymark/bunnymark.odin @@ -1,6 +1,5 @@ package bunnymark -import "core:log" import "core:mem" import "core:os" import "core:thread" @@ -84,14 +83,14 @@ SpriteAoS :: struct { // by the power of #soa, by Odin. SpriteSoA :: struct #align(16) { x , y : f32, - vx, vy: f32 + vx, vy: f32, } // Used to send slices out to threads in the thread pool, so that they can loop of 100,000 entities in parallel. ThreadTask :: struct { dt : f32, sprites_aos : []SpriteAoS, - sprites_soa : #soa []SpriteSoA + sprites_soa : #soa []SpriteSoA, } // Reusable resources, returned by populate bunnies, so we don't keep re instanciating them. Buffers :: struct { @@ -136,7 +135,7 @@ setup_pipeline :: proc() { num_samplers = 0, num_storage_buffers = 1, // for SSBO num_storage_textures = 0, - props = 0 + props = 0, }, ) frag_shader := sdl.CreateGPUShader( @@ -151,7 +150,7 @@ setup_pipeline :: proc() { num_samplers = 1, num_storage_buffers = 0, num_storage_textures = 0, - props = 0 + props = 0, }, ) @@ -187,10 +186,10 @@ setup_pipeline :: proc() { src_alpha_blendfactor = sdl.GPUBlendFactor.ONE, dst_color_blendfactor = sdl.GPUBlendFactor.ONE_MINUS_SRC_ALPHA, dst_alpha_blendfactor = sdl.GPUBlendFactor.ONE_MINUS_SRC_ALPHA, - } + }, }), has_depth_stencil_target = false, - depth_stencil_format = .INVALID // ✅ no depth buffer + depth_stencil_format = .INVALID, // ✅ no depth buffer }, }, ) @@ -229,7 +228,7 @@ load_bunny_texture :: proc() -> (texture: ^sdl.GPUTexture) { tex_transfer_buf := sdl.CreateGPUTransferBuffer(gpu, { usage = .UPLOAD, - size = u32(pixels_byte_size) + size = u32(pixels_byte_size), }) tex_transfer_mem := sdl.MapGPUTransferBuffer(gpu, tex_transfer_buf, false) @@ -242,7 +241,7 @@ load_bunny_texture :: proc() -> 
(texture: ^sdl.GPUTexture) { sdl.UploadToGPUTexture(copy_pass, { transfer_buffer = tex_transfer_buf }, { texture = texture, w = u32(surface.w), h = u32(surface.h), d = 1 }, - false + false, ) sdl.EndGPUCopyPass(copy_pass) @@ -258,23 +257,27 @@ populate_bunnies :: proc(sprites_soa: ^#soa[]SpriteSoA, sprites_aos: ^[]SpriteAo x := rand.float32_range(0, f32(SCREEN_SIZE.x)) y := rand.float32_range(0, f32(SCREEN_SIZE.y)) - sprites_aos[i].position_and_color = 0 | u32(x) << 21 | u32(y) << 11 | u32(sdl.rand(5)) << 7 + sprites_aos[i] = { + position_and_color = 0 | u32(x) << 21 | u32(y) << 11 | u32(sdl.rand(5)) << 7, + } - sprites_soa.x [i] = x - sprites_soa.y [i] = y - sprites_soa.vx[i] = rand.float32_range(-18, 20) + 2 - sprites_soa.vy[i] = rand.float32_range(-16, 20) + 4 + sprites_soa[i] = { + x = x, + y = y, + vx = rand.float32_range(-18, 20) + 2, + vy = rand.float32_range(-16, 20) + 4, + } } sprites_instances_byte_size := BUNNIES * size_of(SpriteAoS) sprites_instances_buffer := sdl.CreateGPUBuffer(gpu, { usage = {.GRAPHICS_STORAGE_READ}, - size = u32(sprites_instances_byte_size) + size = u32(sprites_instances_byte_size), }) transfer_buf := sdl.CreateGPUTransferBuffer(gpu, { usage = .UPLOAD, - size = u32(sprites_instances_byte_size) + size = u32(sprites_instances_byte_size), }) transfer_mem := sdl.MapGPUTransferBuffer(gpu, transfer_buf, true) mem.copy(transfer_mem, raw_data(sprites_aos[:]), sprites_instances_byte_size) @@ -285,7 +288,7 @@ populate_bunnies :: proc(sprites_soa: ^#soa[]SpriteSoA, sprites_aos: ^[]SpriteAo { transfer_buffer = transfer_buf }, { buffer = sprites_instances_buffer, size = u32(sprites_instances_byte_size) }, - true + true, ) sdl.EndGPUCopyPass(copy_pass) ok := sdl.SubmitGPUCommandBuffer(copy_cmd_buf); assert(ok) @@ -307,7 +310,7 @@ simulate :: proc(t: thread.Task) { for i := 0; i < len(data.sprites_aos); i += 100 { #unroll for j in 0..<100 { idx := i + j - if idx >= len(data.sprites_aos) do break + if idx >= len(data.sprites_aos) { break } packed 
:= data.sprites_aos[idx].position_and_color @@ -316,26 +319,26 @@ simulate :: proc(t: thread.Task) { s := i32(packed >> 7 & 0xF ) // Apply velocity - x += i32(vxs[idx]) - y += i32(vys[idx]) + x += i32(vxs[idx]) + y += i32(vys[idx]) // Bounce X - if x < 0 { - x = -x - vxs[idx] = -vxs[idx] + if x < 0 { + x = -x + vxs[idx] = -vxs[idx] } else if x > SCREEN_SIZE.x { - x = 2 * SCREEN_SIZE.x - x - vxs[idx] = -vxs[idx] + x = 2 * SCREEN_SIZE.x - x + vxs[idx] = -vxs[idx] } // Bounce Y - if y < 0 { - y = -y - vys[idx] = -vys[idx] - } else if y > SCREEN_SIZE.y { - y = 2 * SCREEN_SIZE.y - y - vys[idx] = -vys[idx] - } + if y < 0 { + y = -y + vys[idx] = -vys[idx] + } else if y > SCREEN_SIZE.y { + y = 2 * SCREEN_SIZE.y - y + vys[idx] = -vys[idx] + } packed = u32((i32(x) << 21) | (i32(y) << 11) | (i32(s) << 8)) @@ -360,22 +363,22 @@ simulate_soa :: proc(t: thread.Task) { xs[i] += vxs[i] * dt if xs[i] < 0 { - xs[i] = -xs[i] - vxs[i] = -vxs[i] + xs[i] = -xs[i] + vxs[i] = -vxs[i] } else if xs[i] > f32(SCREEN_SIZE.x) { - xs[i] = 2 * f32(SCREEN_SIZE.x) - xs[i] - vxs[i] = -vxs[i] + xs[i] = 2 * f32(SCREEN_SIZE.x) - xs[i] + vxs[i] = -vxs[i] } } for i in 0.. f32(SCREEN_SIZE.y) { - ys[i] = 2 * f32(SCREEN_SIZE.y) - ys[i] - vys[i] = -vys[i] + ys[i] = 2 * f32(SCREEN_SIZE.y) - ys[i] + vys[i] = -vys[i] } } for i in 0.. 
screen_x - vx = simd.select(mask_gt, -simd.abs(vx), vx) + mask_gt := simd.lanes_gt(x, screen_x) // x > screen_x + vx = simd.select(mask_gt, -simd.abs(vx), vx) - mask_any := mask_lt | mask_gt // lanes that bounced + mask_any := mask_lt | mask_gt // lanes that bounced - simd.masked_store(&xs [i], x, mask) - simd.masked_store(&vxs[i], vx, mask_any) + simd.masked_store(&xs [i], x, mask) + simd.masked_store(&vxs[i], vx, mask_any) } // Tail for X - for ; i < n; i += 1 { + for ;i < n; i += 1 { xs[i] += vxs[i] if xs[i] < 0 { @@ -440,27 +442,27 @@ simulate_soa_simd :: proc(t: thread.Task) { // Y Axis i = 0 - for ; i+4 <= n; i += 4 { + for ;i+4 <= n; i += 4 { y := simd.from_slice(Vecf32, ys[i : i + W]) - vy := simd.from_slice(Vecf32, vys[i : i + W]) + vy := simd.from_slice(Vecf32, vys[i : i + W]) - y = simd.add(y, vy) + y = simd.add(y, vy) // compute bounce masks (per-lane) - mask_lt := simd.lanes_lt(y, zero) // y < 0 - vy = simd.select(mask_lt, +simd.abs(vy), vy) + mask_lt := simd.lanes_lt(y, zero) // y < 0 + vy = simd.select(mask_lt, +simd.abs(vy), vy) - mask_gt := simd.lanes_gt(y, screen_y) // y > screen_y - vy = simd.select(mask_gt, -simd.abs(vy), vy) + mask_gt := simd.lanes_gt(y, screen_y) // y > screen_y + vy = simd.select(mask_gt, -simd.abs(vy), vy) - mask_any := mask_lt | mask_gt // lanes that bounced + mask_any := mask_lt | mask_gt // lanes that bounced - simd.masked_store(&ys [i], y, mask) - simd.masked_store(&vys[i], vy, mask_any) + simd.masked_store(&ys [i], y, mask) + simd.masked_store(&vys[i], vy, mask_any) } // Tail for Y - for ; i < n; i += 1 { + for ;i < n; i += 1 { ys[i] += vys[i] if ys[i] < 0 { @@ -473,8 +475,8 @@ simulate_soa_simd :: proc(t: thread.Task) { } // Pack-Em-up - for i in 0..= BUNNIES do break + if start_id >= BUNNIES { break } data = new(ThreadTask) data.dt = f32(FIXED_DELTA_TIME) * 20 @@ -575,7 +577,7 @@ main :: proc() { // thread.pool_add_task(&pool, context.allocator, simulate_soa_simd, data, i) } - thread.pool_finish(&pool) + 
thread.pool_finish(&pool) // This freezes the screen while trying to quit, even though it runs a bit faster. // Adding boundary collision slows it down to the same speed as Multi Threaded one. @@ -606,7 +608,7 @@ main :: proc() { { transfer_buffer = buffers.transfer_buf }, { buffer = buffers.sprites_instances_buffer, size = u32(buffers.sprites_instances_byte_size) }, - true + true, ) sdl.EndGPUCopyPass(copy_pass) ok = sdl.SubmitGPUCommandBuffer(copy_cmd_buf); assert(ok) @@ -666,20 +668,20 @@ main :: proc() { } premultiply_surface_alpha_bitwise :: proc(surf: ^sdl.Surface) { - pixels := cast(^u32) surf.pixels; assert(surf.format == .ABGR8888) + pixels := cast(^u32) surf.pixels; assert(surf.format == .ABGR8888) - for i in 0..< surf.w * surf.h { - p := mem.ptr_offset(pixels, i) + for i in 0..< surf.w * surf.h { + p := mem.ptr_offset(pixels, i) - a := p^ >> 24 & 0xFF - b := p^ >> 16 & 0xFF - g := p^ >> 8 & 0xFF - r := p^ & 0xFF + a := p^ >> 24 & 0xFF + b := p^ >> 16 & 0xFF + g := p^ >> 8 & 0xFF + r := p^ & 0xFF - unchanged_alpha := a - a /= 255.0 + unchanged_alpha := a + a /= 255.0 - p^ = unchanged_alpha << 24 | b * a << 16 | g * a << 8 | r * a + p^ = unchanged_alpha << 24 | b * a << 16 | g * a << 8 | r * a } } From 049570ac9f974fcb6531d9d15ee166f90ff025cf Mon Sep 17 00:00:00 2001 From: Abhijit Kar Date: Thu, 6 Nov 2025 18:12:27 +0530 Subject: [PATCH 3/4] Added bunnymark to `check.yml`. 
--- .github/workflows/check.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index a11d7ca..450fd29 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -133,6 +133,8 @@ jobs: odin check sdl2/hellope $FLAGS odin check sdl2/microui $FLAGS + odin check sdl3/bunnymark $FLAGS + odin check simd/approaches $FLAGS odin check simd/basic-sum $FLAGS odin check simd/motion $FLAGS From bc48143a220b674a683b031c398743ad2d573d19 Mon Sep 17 00:00:00 2001 From: Abhijit Kar Date: Sat, 15 Nov 2025 15:03:00 +0530 Subject: [PATCH 4/4] Added waitgroup --- sdl3/bunnymark/bunnymark.odin | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/sdl3/bunnymark/bunnymark.odin b/sdl3/bunnymark/bunnymark.odin index 857b717..516ee44 100644 --- a/sdl3/bunnymark/bunnymark.odin +++ b/sdl3/bunnymark/bunnymark.odin @@ -5,6 +5,7 @@ import "core:os" import "core:thread" import "core:simd" import "core:fmt" +import "core:sync" import "core:math" import "core:math/rand" @@ -41,7 +42,7 @@ SoA ✅ SIMD 🚧 */ -BUNNIES :: 6_00_000 +BUNNIES :: 6_50_000 // If you want to change the resolution, you must update the same // in the shader.hlsl.vert. @@ -91,6 +92,7 @@ ThreadTask :: struct { dt : f32, sprites_aos : []SpriteAoS, sprites_soa : #soa []SpriteSoA, + wg : ^sync.Wait_Group, } // Reusable resources, returned by populate bunnies, so we don't keep re instanciating them. 
Buffers :: struct { @@ -257,7 +259,7 @@ populate_bunnies :: proc(sprites_soa: ^#soa[]SpriteSoA, sprites_aos: ^[]SpriteAo x := rand.float32_range(0, f32(SCREEN_SIZE.x)) y := rand.float32_range(0, f32(SCREEN_SIZE.y)) - sprites_aos[i] = { + sprites_aos[i] = { position_and_color = 0 | u32(x) << 21 | u32(y) << 11 | u32(sdl.rand(5)) << 7, } @@ -345,6 +347,8 @@ simulate :: proc(t: thread.Task) { data.sprites_aos[idx].position_and_color = packed } } + + sync.wait_group_done(data.wg) } simulate_soa :: proc(t: thread.Task) { @@ -386,6 +390,8 @@ simulate_soa :: proc(t: thread.Task) { s := i32(packed >> 7 & 0xF) data.sprites_aos[i].position_and_color = (u32(xs[i]) << 21) | (u32(ys[i]) << 11) | (u32(s) << 7) } + + sync.wait_group_done(data.wg) } simulate_soa_simd :: proc(t: thread.Task) { @@ -479,6 +485,8 @@ simulate_soa_simd :: proc(t: thread.Task) { data.sprites_aos[p].position_and_color = (u32(xs[i]) << 21) | (u32(ys[i]) << 11) | (u32(3) << 8) } + + sync.wait_group_done(data.wg) } main :: proc() { @@ -511,6 +519,7 @@ main :: proc() { chunks := int(math.ceil(f64(BUNNIES) / f64(thread_count))) data : ^ThreadTask + wg : sync.Wait_Group pool: thread.Pool thread.pool_init(&pool, context.allocator, thread_count) thread.pool_start(&pool) @@ -548,9 +557,13 @@ main :: proc() { for sdl.PollEvent(&ev) { #partial switch ev.type { case .QUIT: + thread.pool_finish(&pool) break main_loop case .KEY_DOWN: - if ev.key.scancode == .ESCAPE { break main_loop } + if ev.key.scancode == .ESCAPE { + thread.pool_finish(&pool) + break main_loop + } } } @@ -567,17 +580,21 @@ main :: proc() { end_id = math.min(start_id + chunks, BUNNIES) if start_id >= BUNNIES { break } + sync.wait_group_add(&wg, 1) + data = new(ThreadTask) data.dt = f32(FIXED_DELTA_TIME) * 20 data.sprites_soa = sprites_soa[start_id:end_id] data.sprites_aos = sprites_aos[start_id:end_id] + data.wg = &wg // thread.pool_add_task(&pool, context.allocator, simulate, data, i) thread.pool_add_task(&pool, context.allocator, simulate_soa, data, 
i) // thread.pool_add_task(&pool, context.allocator, simulate_soa_simd, data, i) } - thread.pool_finish(&pool) + // thread.pool_finish(&pool) + sync.wait_group_wait(&wg) // This freezes the screen while trying to quit, even though it runs a bit faster. // Adding boundary collision slows it down to the same speed as Multi Threaded one.