// wgpu/util/belt.rs

use crate::{
    util::align_to, Buffer, BufferAddress, BufferDescriptor, BufferSize, BufferSlice, BufferUsages,
    BufferViewMut, CommandEncoder, Device, MapMode,
};
use alloc::vec::Vec;
use core::fmt;
use std::sync::mpsc;

use crate::COPY_BUFFER_ALIGNMENT;

/// Efficiently performs many buffer writes by sharing and reusing temporary buffers.
///
/// Internally it uses a ring-buffer of staging buffers that are sub-allocated.
/// Its advantage over [`Queue::write_buffer_with()`] is that the individual allocations
/// are cheaper; `StagingBelt` is most useful when you are writing very many small pieces
/// of data. It can be understood as a sort of arena allocator.
///
/// Using a staging belt is slightly complicated, and generally goes as follows:
/// 1. Use [`StagingBelt::write_buffer()`] or [`StagingBelt::allocate()`] to allocate
///    buffer slices, then write your data to them.
/// 2. Call [`StagingBelt::finish()`].
/// 3. Submit all command encoders that were used in step 1.
/// 4. Call [`StagingBelt::recall()`].
///
/// [`Queue::write_buffer_with()`]: crate::Queue::write_buffer_with
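///
/// # Examples
///
/// A minimal sketch of the full lifecycle, assuming a `device`, `queue`, and a
/// destination `buffer` (created with `BufferUsages::COPY_DST`) already exist;
/// the chunk size and data are illustrative:
///
/// ```no_run
/// # fn example(device: wgpu::Device, queue: wgpu::Queue, buffer: wgpu::Buffer) {
/// use wgpu::util::StagingBelt;
///
/// let mut belt = StagingBelt::new(device.clone(), 1024);
/// let mut encoder =
///     device.create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
///
/// // 1. Allocate a belt slice targeting `buffer` and write data into it.
/// belt.write_buffer(&mut encoder, &buffer, 0, wgpu::BufferSize::new(4).unwrap())
///     .copy_from_slice(&[1, 2, 3, 4]);
///
/// // 2. Close the belt's staging buffers for this submission.
/// belt.finish();
///
/// // 3. Submit the command encoder(s) used in step 1.
/// queue.submit(Some(encoder.finish()));
///
/// // 4. Make the staging buffers available for reuse.
/// belt.recall();
/// # }
/// ```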
pub struct StagingBelt {
    device: Device,
    chunk_size: BufferAddress,
    /// Chunks into which we are accumulating data to be transferred.
    active_chunks: Vec<Chunk>,
    /// Chunks that have scheduled transfers already; they are unmapped and some
    /// command encoder has one or more commands with them as source.
    closed_chunks: Vec<Chunk>,
    /// Chunks that are back from the GPU and ready to be mapped for write and put
    /// into `active_chunks`.
    free_chunks: Vec<Chunk>,
    /// When closed chunks are mapped again, the map callback sends them here.
    sender: Exclusive<mpsc::Sender<Chunk>>,
    /// Free chunks are received here to be put on `self.free_chunks`.
    receiver: Exclusive<mpsc::Receiver<Chunk>>,
}

impl StagingBelt {
    /// Create a new staging belt.
    ///
    /// The `chunk_size` is the unit of internal buffer allocation; writes will be
    /// sub-allocated within each chunk. Therefore, for optimal use of memory, the
    /// chunk size should be:
    ///
    /// * larger than the largest single [`StagingBelt::write_buffer()`] operation;
    /// * no larger than the total amount of data uploaded per submission
    ///   (per [`StagingBelt::finish()`]), and ideally at least a quarter of it; and
    /// * bigger is better, within these bounds.
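    ///
    /// # Examples
    ///
    /// A minimal sketch; the 64 KiB chunk size here is illustrative only, and
    /// should be tuned per the guidelines above:
    ///
    /// ```no_run
    /// # fn example(device: wgpu::Device) {
    /// let belt = wgpu::util::StagingBelt::new(device, 64 * 1024);
    /// # }
    /// ```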
    pub fn new(device: Device, chunk_size: BufferAddress) -> Self {
        let (sender, receiver) = mpsc::channel();
        StagingBelt {
            device,
            chunk_size,
            active_chunks: Vec::new(),
            closed_chunks: Vec::new(),
            free_chunks: Vec::new(),
            sender: Exclusive::new(sender),
            receiver: Exclusive::new(receiver),
        }
    }

    /// Allocate a staging belt slice of `size` to be copied into the `target` buffer
    /// at the specified offset.
    ///
    /// `offset` and `size` must be multiples of [`COPY_BUFFER_ALIGNMENT`]
    /// (as is required by the underlying buffer operations).
    ///
    /// The upload will be placed into the provided command encoder. This encoder
    /// must be submitted after [`StagingBelt::finish()`] is called and before
    /// [`StagingBelt::recall()`] is called.
    ///
    /// If the `size` is greater than the size of any free internal buffer, a new buffer
    /// will be allocated for it. Therefore, the `chunk_size` passed to [`StagingBelt::new()`]
    /// should ideally be larger than every such size.
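    ///
    /// # Examples
    ///
    /// A minimal sketch, assuming `target` was created with
    /// `BufferUsages::COPY_DST` and is at least 4 bytes long:
    ///
    /// ```no_run
    /// # fn example(
    /// #     belt: &mut wgpu::util::StagingBelt,
    /// #     encoder: &mut wgpu::CommandEncoder,
    /// #     target: &wgpu::Buffer,
    /// # ) {
    /// let mut view = belt.write_buffer(encoder, target, 0, wgpu::BufferSize::new(4).unwrap());
    /// view.copy_from_slice(&[1, 2, 3, 4]);
    /// // `view` drops here, before `StagingBelt::finish()` is called.
    /// # }
    /// ```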
    #[track_caller]
    pub fn write_buffer(
        &mut self,
        encoder: &mut CommandEncoder,
        target: &Buffer,
        offset: BufferAddress,
        size: BufferSize,
    ) -> BufferViewMut {
        // Asserting this explicitly gives a more specific and more prompt error than
        // leaving it to regular API validation.
        // We check only `offset`, not `size`, because `self.allocate()` will check the size.
        assert!(
            offset.is_multiple_of(COPY_BUFFER_ALIGNMENT),
            "StagingBelt::write_buffer() offset {offset} must be a multiple of `COPY_BUFFER_ALIGNMENT`"
        );

        let slice_of_belt = self.allocate(
            size,
            const { BufferSize::new(crate::COPY_BUFFER_ALIGNMENT).unwrap() },
        );
        encoder.copy_buffer_to_buffer(
            slice_of_belt.buffer(),
            slice_of_belt.offset(),
            target,
            offset,
            size.get(),
        );
        slice_of_belt.get_mapped_range_mut()
    }

    /// Allocate a staging belt slice with the given `size` and `alignment` and return it.
    ///
    /// `size` must be a multiple of [`COPY_BUFFER_ALIGNMENT`]
    /// (as is required by the underlying buffer operations).
    ///
    /// To use this slice, call [`BufferSlice::get_mapped_range_mut()`] and write your data into
    /// that [`BufferViewMut`].
    /// (The view must be dropped before [`StagingBelt::finish()`] is called.)
    ///
    /// You can then record your own GPU commands to perform with the slice,
    /// such as copying it to a texture or executing a compute shader that reads it (whereas
    /// [`StagingBelt::write_buffer()`] can only write to other buffers).
    /// All commands involving this slice must be submitted after
    /// [`StagingBelt::finish()`] is called and before [`StagingBelt::recall()`] is called.
    ///
    /// If the `size` is greater than the space available in any free internal buffer, a new buffer
    /// will be allocated for it. Therefore, the `chunk_size` passed to [`StagingBelt::new()`]
    /// should ideally be larger than every such size.
    ///
    /// The chosen slice will be positioned within the buffer at a multiple of `alignment`,
    /// which may be used to meet alignment requirements for the operation you wish to perform
    /// with the slice. This does not necessarily affect the alignment of the [`BufferViewMut`].
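    ///
    /// # Examples
    ///
    /// A minimal sketch: stage 256 bytes of data, aligned as might suit a
    /// subsequent buffer-to-texture copy recorded by the caller:
    ///
    /// ```no_run
    /// # fn example(belt: &mut wgpu::util::StagingBelt, data: &[u8; 256]) {
    /// let slice = belt.allocate(
    ///     wgpu::BufferSize::new(256).unwrap(),
    ///     wgpu::BufferSize::new(u64::from(wgpu::COPY_BYTES_PER_ROW_ALIGNMENT)).unwrap(),
    /// );
    /// slice.get_mapped_range_mut().copy_from_slice(data);
    /// // ...then record e.g. `CommandEncoder::copy_buffer_to_texture()` reading
    /// // from `slice.buffer()` at `slice.offset()`.
    /// # }
    /// ```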
    #[track_caller]
    pub fn allocate(&mut self, size: BufferSize, alignment: BufferSize) -> BufferSlice<'_> {
        assert!(
            size.get().is_multiple_of(COPY_BUFFER_ALIGNMENT),
            "StagingBelt allocation size {size} must be a multiple of `COPY_BUFFER_ALIGNMENT`"
        );
        assert!(
            alignment.get().is_power_of_two(),
            "alignment must be a power of two, not {alignment}"
        );
        // At minimum, we must have alignment sufficient to map the buffer.
        let alignment = alignment.get().max(crate::MAP_ALIGNMENT);

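        // Pick a chunk to allocate from: prefer an already-active chunk with
        // room, then a recycled free chunk, and only create a new buffer if
        // neither has space.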
        let mut chunk = if let Some(index) = self
            .active_chunks
            .iter()
            .position(|chunk| chunk.can_allocate(size, alignment))
        {
            self.active_chunks.swap_remove(index)
        } else {
            self.receive_chunks(); // ensure self.free_chunks is up to date

            if let Some(index) = self
                .free_chunks
                .iter()
                .position(|chunk| chunk.can_allocate(size, alignment))
            {
                self.free_chunks.swap_remove(index)
            } else {
                Chunk {
                    buffer: self.device.create_buffer(&BufferDescriptor {
                        label: Some("(wgpu internal) StagingBelt staging buffer"),
                        size: self.chunk_size.max(size.get()),
                        usage: BufferUsages::MAP_WRITE | BufferUsages::COPY_SRC,
                        mapped_at_creation: true,
                    }),
                    offset: 0,
                }
            }
        };

        let allocation_offset = chunk.allocate(size, alignment);

        self.active_chunks.push(chunk);
        let chunk = self.active_chunks.last().unwrap();

        chunk
            .buffer
            .slice(allocation_offset..allocation_offset + size.get())
    }

    /// Prepare currently mapped buffers for use in a submission.
    ///
    /// This must be called before the command encoder(s) provided to
    /// [`StagingBelt::write_buffer()`] are submitted.
    ///
    /// At this point, all the partially used staging buffers are closed (cannot be used for
    /// further writes) until after [`StagingBelt::recall()`] is called *and* the GPU is done
    /// copying the data from them.
    pub fn finish(&mut self) {
        for chunk in self.active_chunks.drain(..) {
            chunk.buffer.unmap();
            self.closed_chunks.push(chunk);
        }
    }

    /// Recall all of the closed buffers back to be reused.
    ///
    /// This must only be called after the command encoder(s) provided to
    /// [`StagingBelt::write_buffer()`] are submitted. Additional calls are harmless.
    /// Not calling this as soon as possible may result in increased buffer memory usage.
    pub fn recall(&mut self) {
        self.receive_chunks();

        for chunk in self.closed_chunks.drain(..) {
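            // Clone the buffer handle so that `chunk` itself can be moved into
            // the map callback; once the GPU is done and the buffer is mapped
            // again, the chunk is sent back to be reused.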
            let sender = self.sender.get_mut().clone();
            chunk
                .buffer
                .clone()
                .slice(..)
                .map_async(MapMode::Write, move |_| {
                    let _ = sender.send(chunk);
                });
        }
    }

    /// Move all chunks that the GPU is done with (and are now mapped again)
    /// from `self.receiver` to `self.free_chunks`.
    fn receive_chunks(&mut self) {
        while let Ok(mut chunk) = self.receiver.get_mut().try_recv() {
            chunk.offset = 0;
            self.free_chunks.push(chunk);
        }
    }
}

impl fmt::Debug for StagingBelt {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            device,
            chunk_size,
            active_chunks,
            closed_chunks,
            free_chunks,
            sender: _,
            receiver: _,
        } = self;
        f.debug_struct("StagingBelt")
            .field("device", device)
            .field("chunk_size", chunk_size)
            .field("active_chunks", &active_chunks.len())
            .field("closed_chunks", &closed_chunks.len())
            .field("free_chunks", &free_chunks.len())
            .finish_non_exhaustive()
    }
}

struct Chunk {
    buffer: Buffer,
    offset: BufferAddress,
}

impl Chunk {
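    /// Whether an allocation of `size` bytes, aligned to `alignment`, fits in the
    /// remaining space of this chunk's buffer. For example, with `offset = 20`,
    /// `alignment = 16`, and `size = 32`, the slice would occupy bytes `32..64`,
    /// so the buffer must be at least 64 bytes long.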
    fn can_allocate(&self, size: BufferSize, alignment: BufferAddress) -> bool {
        let alloc_start = align_to(self.offset, alignment);
        let alloc_end = alloc_start + size.get();

        alloc_end <= self.buffer.size()
    }

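    /// Reserve `size` bytes at the next `alignment`-aligned offset, advance
    /// `self.offset` past the reservation, and return the reservation's starting
    /// offset within the buffer. Panics if the allocation does not fit; callers
    /// are expected to check [`Chunk::can_allocate()`] first.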
    fn allocate(&mut self, size: BufferSize, alignment: BufferAddress) -> BufferAddress {
        let alloc_start = align_to(self.offset, alignment);
        let alloc_end = alloc_start + size.get();

        assert!(alloc_end <= self.buffer.size());
        self.offset = alloc_end;
        alloc_start
    }
}

use exclusive::Exclusive;
mod exclusive {
    /// `Sync` wrapper that works by providing only exclusive access.
    ///
    /// This lets `StagingBelt` be `Sync` even though `mpsc::Receiver` is not:
    /// every use goes through `&mut self`, so there are no shared-reference
    /// operations to synchronize.
    ///
    /// See <https://doc.rust-lang.org/nightly/std/sync/struct.Exclusive.html>
    pub(super) struct Exclusive<T>(T);

    /// Safety: `&Exclusive` has no operations.
    unsafe impl<T> Sync for Exclusive<T> {}

    impl<T> Exclusive<T> {
        pub fn new(value: T) -> Self {
            Self(value)
        }

        pub fn get_mut(&mut self) -> &mut T {
            &mut self.0
        }
    }
}