From e2ceb2b1f7cd0fd7778b53aaf8ba1caa71b2f7f5 Mon Sep 17 00:00:00 2001
From: Matt Johnston <matt@codeconstruct.com.au>
Date: Fri, 11 Jul 2025 17:38:42 +0800
Subject: otg: Improve IN write performance

chunks_exact() can be handled by the compiler more efficiently than
chunks(): the previous code made a memcpy call for each 4-byte chunk
slice. Hoisting the fifo register lookup out of the loop also avoids
recalculating the pointer on every iteration.

In my benchmark, throughput jumps from ~13 MB/s to ~25 MB/s after this
change with opt-level = 3, and from 9 MB/s to 18 MB/s with
opt-level = "z".

The benchmark ran on an STM32H7S3L8 with a 600 MHz clock, using 512-byte
bulk writes with the data in DTCM. It does not measure USB writes alone;
it also includes some unrelated memcpys for packet construction.
---
 embassy-usb-synopsys-otg/src/lib.rs | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'embassy-usb-synopsys-otg')

diff --git a/embassy-usb-synopsys-otg/src/lib.rs b/embassy-usb-synopsys-otg/src/lib.rs
index fc4428b54..3f6531813 100644
--- a/embassy-usb-synopsys-otg/src/lib.rs
+++ b/embassy-usb-synopsys-otg/src/lib.rs
@@ -1210,10 +1210,23 @@ impl<'d> embassy_usb_driver::EndpointIn for Endpoint<'d, In> {
             });
 
             // Write data to FIFO
-            for chunk in buf.chunks(4) {
+            let chunks = buf.chunks_exact(4);
+            // Stash the last partial chunk
+            let rem = chunks.remainder();
+            let last_chunk = (!rem.is_empty()).then(|| {
                 let mut tmp = [0u8; 4];
-                tmp[0..chunk.len()].copy_from_slice(chunk);
-                self.regs.fifo(index).write_value(regs::Fifo(u32::from_ne_bytes(tmp)));
+                tmp[0..rem.len()].copy_from_slice(rem);
+                u32::from_ne_bytes(tmp)
+            });
+
+            let fifo = self.regs.fifo(index);
+            for chunk in chunks {
+                let val = u32::from_ne_bytes(chunk.try_into().unwrap());
+                fifo.write_value(regs::Fifo(val));
+            }
+            // Write any last chunk
+            if let Some(val) = last_chunk {
+                fifo.write_value(regs::Fifo(val));
             }
         });
 
-- 
cgit