From e2ceb2b1f7cd0fd7778b53aaf8ba1caa71b2f7f5 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Fri, 11 Jul 2025 17:38:42 +0800 Subject: otg: Improve IN write performance chunks_exact() can be handled by the compiler more efficiently. Previous code was making a memcpy call for each 4-byte chunk slice. Hoisting the fifo out of the loop avoids recalculating the pointer each time. In my benchmark I see a jump from ~13 MB/s to ~25 MB/s after this change (opt-level=3). opt-level = "z" goes from 9 MB/s to 18 MB/s. The benchmark was on an STM32H7S3L8, 600 MHz clock, 512-byte bulk writes, data in DTCM. The benchmark isn't just USB writes; it also has some unrelated memcpys for packet construction. --- embassy-usb-synopsys-otg/src/lib.rs | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'embassy-usb-synopsys-otg') diff --git a/embassy-usb-synopsys-otg/src/lib.rs b/embassy-usb-synopsys-otg/src/lib.rs index fc4428b54..3f6531813 100644 --- a/embassy-usb-synopsys-otg/src/lib.rs +++ b/embassy-usb-synopsys-otg/src/lib.rs @@ -1210,10 +1210,23 @@ impl<'d> embassy_usb_driver::EndpointIn for Endpoint<'d, In> { }); // Write data to FIFO - for chunk in buf.chunks(4) { + let chunks = buf.chunks_exact(4); + // Stash the last partial chunk + let rem = chunks.remainder(); + let last_chunk = (!rem.is_empty()).then(|| { let mut tmp = [0u8; 4]; - tmp[0..chunk.len()].copy_from_slice(chunk); - self.regs.fifo(index).write_value(regs::Fifo(u32::from_ne_bytes(tmp))); + tmp[0..rem.len()].copy_from_slice(rem); + u32::from_ne_bytes(tmp) + }); + + let fifo = self.regs.fifo(index); + for chunk in chunks { + let val = u32::from_ne_bytes(chunk.try_into().unwrap()); + fifo.write_value(regs::Fifo(val)); + } + // Write any last chunk + if let Some(val) = last_chunk { + fifo.write_value(regs::Fifo(val)); } }); -- cgit