wgpu_examples/repeated_compute/
mod.rs

//! See the hello-compute example's main.rs for more details,
//! as similar items are not explained again here.
//!
//! This example does, however, elaborate on some things that the
//! hello-compute example does not, such as mapping buffers
//! and why async channels are used.
7
8use nanorand::Rng;
9
/// Sentinel result value that `run` prints as `(overflow)` instead of a number.
///
/// NOTE(review): presumably written by `shader.wgsl` when its computation overflows —
/// confirm against the shader source.
const OVERFLOW: u32 = u32::MAX;
11
12async fn run() {
13    let mut numbers = [0u32; 256];
14    let context = WgpuContext::new(size_of_val(&numbers)).await;
15
16    let mut rand = nanorand::WyRand::new();
17
18    for _ in 0..10 {
19        for p in numbers.iter_mut() {
20            *p = rand.generate::<u16>() as u32;
21        }
22
23        compute(&mut numbers, &context).await;
24
25        let printed_numbers = numbers
26            .iter()
27            .map(|n| match n {
28                &OVERFLOW => "(overflow)".to_string(),
29                n => n.to_string(),
30            })
31            .collect::<Vec<String>>();
32        log::info!("Results: {printed_numbers:?}");
33    }
34}
35
36async fn compute(local_buffer: &mut [u32], context: &WgpuContext) {
37    log::info!("Beginning GPU compute on data {local_buffer:?}.");
38    // Local buffer contents -> GPU storage buffer
39    // Adds a write buffer command to the queue. This command is more complicated
40    // than it appears.
41    context.queue.write_buffer(
42        &context.storage_buffer,
43        0,
44        bytemuck::cast_slice(local_buffer),
45    );
46    log::info!("Wrote to buffer.");
47
48    let mut command_encoder = context
49        .device
50        .create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
51
52    {
53        let mut compute_pass = command_encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
54            label: None,
55            timestamp_writes: None,
56        });
57        compute_pass.set_pipeline(&context.pipeline);
58        compute_pass.set_bind_group(0, &context.bind_group, &[]);
59        compute_pass.dispatch_workgroups(local_buffer.len() as u32, 1, 1);
60    }
61    // We finish the compute pass by dropping it.
62
63    // Entire storage buffer -> staging buffer.
64    command_encoder.copy_buffer_to_buffer(
65        &context.storage_buffer,
66        0,
67        &context.output_staging_buffer,
68        0,
69        context.storage_buffer.size(),
70    );
71
72    // Finalize the command encoder, add the contained commands to the queue and flush.
73    context.queue.submit(Some(command_encoder.finish()));
74    log::info!("Submitted commands.");
75
76    // Finally time to get our results.
77    // First we get a buffer slice which represents a chunk of the buffer (which we
78    // can't access yet).
79    // We want the whole thing so use unbounded range.
80    let buffer_slice = context.output_staging_buffer.slice(..);
81    // Now things get complicated. WebGPU, for safety reasons, only allows either the GPU
82    // or CPU to access a buffer's contents at a time. We need to "map" the buffer which means
83    // flipping ownership of the buffer over to the CPU and making access legal. We do this
84    // with `BufferSlice::map_async`.
85    //
86    // The problem is that map_async is not an async function so we can't await it. What
87    // we need to do instead is pass in a closure that will be executed when the slice is
88    // either mapped or the mapping has failed.
89    //
90    // The problem with this is that we don't have a reliable way to wait in the main
91    // code for the buffer to be mapped and even worse, calling get_mapped_range or
92    // get_mapped_range_mut prematurely will cause a panic, not return an error.
93    //
94    // Using channels solves this as awaiting the receiving of a message from
95    // the passed closure will force the outside code to wait. It also doesn't hurt
96    // if the closure finishes before the outside code catches up as the message is
97    // buffered and receiving will just pick that up.
98    //
99    // It may also be worth noting that although on native, the usage of asynchronous
100    // channels is wholly unnecessary, for the sake of portability to WASM (std channels
101    // don't work on WASM,) we'll use async channels that work on both native and WASM.
102    let (sender, receiver) = flume::bounded(1);
103    buffer_slice.map_async(wgpu::MapMode::Read, move |r| sender.send(r).unwrap());
104    // In order for the mapping to be completed, one of three things must happen.
105    // One of those can be calling `Device::poll`. This isn't necessary on the web as devices
106    // are polled automatically but natively, we need to make sure this happens manually.
107    // `PollType::Wait` will cause the thread to wait on native but not on WebGpu.
108    context
109        .device
110        .poll(wgpu::PollType::wait_indefinitely())
111        .unwrap();
112    log::info!("Device polled.");
113    // Now we await the receiving and panic if anything went wrong because we're lazy.
114    receiver.recv_async().await.unwrap().unwrap();
115    log::info!("Result received.");
116    // NOW we can call get_mapped_range.
117    {
118        let view = buffer_slice.get_mapped_range();
119        local_buffer.copy_from_slice(bytemuck::cast_slice(&view));
120    }
121    log::info!("Results written to local buffer.");
122    // We need to make sure all `BufferView`'s are dropped before we do what we're about
123    // to do.
124    // Unmap so that we can copy to the staging buffer in the next iteration.
125    context.output_staging_buffer.unmap();
126}
127
128pub fn main() {
129    #[cfg(not(target_arch = "wasm32"))]
130    {
131        env_logger::builder()
132            .filter_level(log::LevelFilter::Info)
133            .format_timestamp_nanos()
134            .init();
135        pollster::block_on(run());
136    }
137    #[cfg(target_arch = "wasm32")]
138    {
139        std::panic::set_hook(Box::new(console_error_panic_hook::hook));
140        console_log::init_with_level(log::Level::Info).expect("could not initialize logger");
141
142        crate::utils::add_web_nothing_to_see_msg();
143
144        wasm_bindgen_futures::spawn_local(run());
145    }
146}
147
/// A convenient way to hold all the useful wgpu stuff together.
struct WgpuContext {
    /// Device used to create command encoders and polled to complete buffer mapping.
    device: wgpu::Device,
    /// Queue that buffer writes and finished command buffers are submitted to.
    queue: wgpu::Queue,
    /// Compute pipeline built from the `main` entry point of `shader.wgsl`.
    pipeline: wgpu::ComputePipeline,
    /// Bind group that exposes `storage_buffer` at binding 0.
    bind_group: wgpu::BindGroup,
    /// Storage buffer the compute pass is bound to; also the source of the readback copy.
    storage_buffer: wgpu::Buffer,
    /// Mappable buffer the storage buffer is copied into so results can be read back.
    output_staging_buffer: wgpu::Buffer,
}
157
158impl WgpuContext {
159    async fn new(buffer_size: usize) -> WgpuContext {
160        let instance = wgpu::Instance::default();
161        let adapter = instance
162            .request_adapter(&wgpu::RequestAdapterOptions::default())
163            .await
164            .unwrap();
165        let (device, queue) = adapter
166            .request_device(&wgpu::DeviceDescriptor {
167                label: None,
168                required_features: wgpu::Features::empty(),
169                required_limits: wgpu::Limits::downlevel_defaults(),
170                experimental_features: wgpu::ExperimentalFeatures::disabled(),
171                memory_hints: wgpu::MemoryHints::Performance,
172                trace: wgpu::Trace::Off,
173            })
174            .await
175            .unwrap();
176
177        // Our shader, kindly compiled with Naga.
178        let shader = device.create_shader_module(wgpu::include_wgsl!("shader.wgsl"));
179
180        // This is where the GPU will read from and write to.
181        let storage_buffer = device.create_buffer(&wgpu::BufferDescriptor {
182            label: None,
183            size: buffer_size as wgpu::BufferAddress,
184            usage: wgpu::BufferUsages::STORAGE
185                | wgpu::BufferUsages::COPY_DST
186                | wgpu::BufferUsages::COPY_SRC,
187            mapped_at_creation: false,
188        });
189        // For portability reasons, WebGPU draws a distinction between memory that is
190        // accessible by the CPU and memory that is accessible by the GPU. Only
191        // buffers accessible by the CPU can be mapped and accessed by the CPU and
192        // only buffers visible to the GPU can be used in shaders. In order to get
193        // data from the GPU, we need to use CommandEncoder::copy_buffer_to_buffer
194        // (which we will later) to copy the buffer modified by the GPU into a
195        // mappable, CPU-accessible buffer which we'll create here.
196        let output_staging_buffer = device.create_buffer(&wgpu::BufferDescriptor {
197            label: None,
198            size: buffer_size as wgpu::BufferAddress,
199            usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
200            mapped_at_creation: false,
201        });
202
203        // This can be though of as the function signature for our CPU-GPU function.
204        let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
205            label: None,
206            entries: &[wgpu::BindGroupLayoutEntry {
207                binding: 0,
208                visibility: wgpu::ShaderStages::COMPUTE,
209                ty: wgpu::BindingType::Buffer {
210                    ty: wgpu::BufferBindingType::Storage { read_only: false },
211                    has_dynamic_offset: false,
212                    // Going to have this be None just to be safe.
213                    min_binding_size: None,
214                },
215                count: None,
216            }],
217        });
218        // This ties actual resources stored in the GPU to our metaphorical function
219        // through the binding slots we defined above.
220        let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
221            label: None,
222            layout: &bind_group_layout,
223            entries: &[wgpu::BindGroupEntry {
224                binding: 0,
225                resource: storage_buffer.as_entire_binding(),
226            }],
227        });
228
229        let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
230            label: None,
231            bind_group_layouts: &[&bind_group_layout],
232            push_constant_ranges: &[],
233        });
234        let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
235            label: None,
236            layout: Some(&pipeline_layout),
237            module: &shader,
238            entry_point: Some("main"),
239            compilation_options: Default::default(),
240            cache: None,
241        });
242
243        WgpuContext {
244            device,
245            queue,
246            pipeline,
247            bind_group,
248            storage_buffer,
249            output_staging_buffer,
250        }
251    }
252}
wgpu_examples/repeated_compute/mod.rs

wgpu_examples/repeated_compute/
mod.rs