diff options
| author | pennae <[email protected]> | 2023-04-16 19:39:32 +0200 |
|---|---|---|
| committer | pennae <[email protected]> | 2023-04-16 19:45:18 +0200 |
| commit | 7a682ec02af50026d31296ce5cac6383580f5e55 (patch) | |
| tree | 9ebb9c3e648fe1aca7ffc2e7dd6730bc656fe66d | |
| parent | 1fdce6e52a51de89f48f002d5c92139f58029575 (diff) | |
rp: add division intrinsics
rp2040-hal adds division intrinsics using the hardware divider unit in
the SIO, as does the pico-sdk itself. using the hardware is faster than
the compiler_rt implementations, and more compact too.
| -rw-r--r-- | embassy-rp/src/intrinsics.rs | 198 |
1 files changed, 198 insertions, 0 deletions
diff --git a/embassy-rp/src/intrinsics.rs b/embassy-rp/src/intrinsics.rs index 3e75fb7fc..3b63846d4 100644 --- a/embassy-rp/src/intrinsics.rs +++ b/embassy-rp/src/intrinsics.rs | |||
| @@ -274,3 +274,201 @@ macro_rules! intrinsics { | |||
| 274 | intrinsics!($($rest)*); | 274 | intrinsics!($($rest)*); |
| 275 | }; | 275 | }; |
| 276 | } | 276 | } |
| 277 | |||
| 278 | // Credit: taken from `rp-hal` (also licensed Apache+MIT) | ||
| 279 | // https://github.com/rp-rs/rp-hal/blob/main/rp2040-hal/src/sio.rs | ||
| 280 | |||
| 281 | // This takes advantage of how AAPCS defines a 64-bit return on 32-bit registers | ||
| 282 | // by packing it into r0[0:31] and r1[32:63]. So all we need to do is put | ||
| 283 | // the remainder in the high order 32 bits of a 64 bit result. We can also | ||
| 284 | // alias the division operators to these for a similar reason: r0 is the | ||
| 285 | // result either way and r1 is a scratch register, so the caller can't assume it | ||
| 286 | // retains the argument value. | ||
| 287 | #[cfg(target_arch = "arm")] | ||
| 288 | core::arch::global_asm!( | ||
| 289 | ".macro hwdivider_head", | ||
| 290 | "ldr r2, =(0xd0000000)", // SIO_BASE | ||
| 291 | // Check the DIRTY state of the divider by shifting it into the C | ||
| 292 | // status bit. | ||
| 293 | "ldr r3, [r2, #0x078]", // DIV_CSR | ||
| 294 | "lsrs r3, #2", // DIRTY is bit 1, so shift right by 2 to move it into C | ||
| 295 | // We only need to save the state when DIRTY, otherwise we can just do the | ||
| 296 | // division directly. | ||
| 297 | "bcs 2f", | ||
| 298 | "1:", | ||
| 299 | // Do the actual division now, we're either not DIRTY, or we've saved the | ||
| 300 | // state and branched back here so it's safe now. | ||
| 301 | ".endm", | ||
| 302 | ".macro hwdivider_tail", | ||
| 303 | // 8 cycle delay to wait for the result. Each branch takes two cycles | ||
| 304 | // and fits into a 2-byte Thumb instruction, so this is smaller than | ||
| 305 | // 8 NOPs. | ||
| 306 | "b 3f", | ||
| 307 | "3: b 3f", | ||
| 308 | "3: b 3f", | ||
| 309 | "3: b 3f", | ||
| 310 | "3:", | ||
| 311 | // Read the quotient last, since that's what clears the dirty flag. | ||
| 312 | "ldr r1, [r2, #0x074]", // DIV_REMAINDER | ||
| 313 | "ldr r0, [r2, #0x070]", // DIV_QUOTIENT | ||
| 314 | // Either return to the caller or back to the state restore. | ||
| 315 | "bx lr", | ||
| 316 | "2:", | ||
| 317 | // Since we can't save the signed-ness of the calculation, we have to make | ||
| 318 | // sure that there's at least an 8 cycle delay before we read the result. | ||
| 319 | // The push takes 5 cycles, and we've already spent at least 7 checking | ||
| 320 | // the DIRTY state to get here. | ||
| 321 | "push {{r4-r6, lr}}", | ||
| 322 | // Read the quotient last, since that's what clears the dirty flag. | ||
| 323 | "ldr r3, [r2, #0x060]", // DIV_UDIVIDEND | ||
| 324 | "ldr r4, [r2, #0x064]", // DIV_UDIVISOR | ||
| 325 | "ldr r5, [r2, #0x074]", // DIV_REMAINDER | ||
| 326 | "ldr r6, [r2, #0x070]", // DIV_QUOTIENT | ||
| 327 | // If we get interrupted here (before a write sets the DIRTY flag) it's | ||
| 328 | // fine, since we have the full state, so the interruptor doesn't have to | ||
| 329 | // restore it. Once the write happens and the DIRTY flag is set, the | ||
| 330 | // interruptor becomes responsible for restoring our state. | ||
| 331 | "bl 1b", | ||
| 332 | // If we are interrupted here, then the interruptor will start an incorrect | ||
| 333 | // calculation using a wrong divisor, but we'll restore the divisor and | ||
| 334 | // result ourselves correctly. This sets DIRTY, so any interruptor will | ||
| 335 | // save the state. | ||
| 336 | "str r3, [r2, #0x060]", // DIV_UDIVIDEND | ||
| 337 | // If we are interrupted here, the interruptor may start the | ||
| 338 | // calculation using incorrectly signed inputs, but we'll restore the | ||
| 339 | // result ourselves. This sets DIRTY, so any interruptor will save the | ||
| 340 | // state. | ||
| 341 | "str r4, [r2, #0x064]", // DIV_UDIVISOR | ||
| 342 | // If we are interrupted here, the interruptor will have restored | ||
| 343 | // everything but the quotient may be wrongly signed. If the calculation | ||
| 344 | // started by the above writes is still ongoing it is stopped, so it won't | ||
| 345 | // replace the result we're restoring. DIRTY and READY are set, but only | ||
| 346 | // DIRTY matters to make the interruptor save the state. | ||
| 347 | "str r5, [r2, #0x074]", // DIV_REMAINDER | ||
| 348 | // State fully restored after the quotient write. This sets both DIRTY | ||
| 349 | // and READY, so whatever we may have interrupted can read the result. | ||
| 350 | "str r6, [r2, #0x070]", // DIV_QUOTIENT | ||
| 351 | "pop {{r4-r6, pc}}", | ||
| 352 | ".endm", | ||
| 353 | ); | ||
| 354 | |||
| 355 | macro_rules! division_function { | ||
| 356 | ( | ||
| 357 | $name:ident $($intrinsic:ident)* ( $argty:ty ) { | ||
| 358 | $($begin:literal),+ | ||
| 359 | } | ||
| 360 | ) => { | ||
| 361 | #[cfg(all(target_arch = "arm", feature = "intrinsics"))] | ||
| 362 | core::arch::global_asm!( | ||
| 363 | // Mangle the name slightly, since this is a global symbol. | ||
| 364 | concat!(".global _rphal_", stringify!($name)), | ||
| 365 | concat!(".type _rphal_", stringify!($name), ", %function"), | ||
| 366 | ".align 2", | ||
| 367 | concat!("_rphal_", stringify!($name), ":"), | ||
| 368 | $( | ||
| 369 | concat!(".global ", stringify!($intrinsic)), | ||
| 370 | concat!(".type ", stringify!($intrinsic), ", %function"), | ||
| 371 | concat!(stringify!($intrinsic), ":"), | ||
| 372 | )* | ||
| 373 | |||
| 374 | "hwdivider_head", | ||
| 375 | $($begin),+ , | ||
| 376 | "hwdivider_tail", | ||
| 377 | ); | ||
| 378 | |||
| 379 | #[cfg(all(target_arch = "arm", not(feature = "intrinsics")))] | ||
| 380 | core::arch::global_asm!( | ||
| 381 | // Mangle the name slightly, since this is a global symbol. | ||
| 382 | concat!(".global _rphal_", stringify!($name)), | ||
| 383 | concat!(".type _rphal_", stringify!($name), ", %function"), | ||
| 384 | ".align 2", | ||
| 385 | concat!("_rphal_", stringify!($name), ":"), | ||
| 386 | |||
| 387 | "hwdivider_head", | ||
| 388 | $($begin),+ , | ||
| 389 | "hwdivider_tail", | ||
| 390 | ); | ||
| 391 | |||
| 392 | #[cfg(target_arch = "arm")] | ||
| 393 | extern "aapcs" { | ||
| 394 | // Connect a local name to global symbol above through FFI. | ||
| 395 | #[link_name = concat!("_rphal_", stringify!($name)) ] | ||
| 396 | fn $name(n: $argty, d: $argty) -> u64; | ||
| 397 | } | ||
| 398 | |||
| 399 | #[cfg(not(target_arch = "arm"))] | ||
| 400 | #[allow(unused_variables)] | ||
| 401 | unsafe fn $name(n: $argty, d: $argty) -> u64 { 0 } | ||
| 402 | }; | ||
| 403 | } | ||
| 404 | |||
| 405 | division_function! { | ||
| 406 | unsigned_divmod __aeabi_uidivmod __aeabi_uidiv ( u32 ) { | ||
| 407 | "str r0, [r2, #0x060]", // DIV_UDIVIDEND | ||
| 408 | "str r1, [r2, #0x064]" // DIV_UDIVISOR | ||
| 409 | } | ||
| 410 | } | ||
| 411 | |||
| 412 | division_function! { | ||
| 413 | signed_divmod __aeabi_idivmod __aeabi_idiv ( i32 ) { | ||
| 414 | "str r0, [r2, #0x068]", // DIV_SDIVIDEND | ||
| 415 | "str r1, [r2, #0x06c]" // DIV_SDIVISOR | ||
| 416 | } | ||
| 417 | } | ||
| 418 | |||
| 419 | fn divider_unsigned(n: u32, d: u32) -> DivResult<u32> { | ||
| 420 | let packed = unsafe { unsigned_divmod(n, d) }; | ||
| 421 | DivResult { | ||
| 422 | quotient: packed as u32, | ||
| 423 | remainder: (packed >> 32) as u32, | ||
| 424 | } | ||
| 425 | } | ||
| 426 | |||
| 427 | fn divider_signed(n: i32, d: i32) -> DivResult<i32> { | ||
| 428 | let packed = unsafe { signed_divmod(n, d) }; | ||
| 429 | // Double casts to avoid sign extension | ||
| 430 | DivResult { | ||
| 431 | quotient: packed as u32 as i32, | ||
| 432 | remainder: (packed >> 32) as u32 as i32, | ||
| 433 | } | ||
| 434 | } | ||
| 435 | |||
| 436 | /// Result of divide/modulo operation | ||
| 437 | struct DivResult<T> { | ||
| 438 | /// The quotient of divide/modulo operation | ||
| 439 | pub quotient: T, | ||
| 440 | /// The remainder of divide/modulo operation | ||
| 441 | pub remainder: T, | ||
| 442 | } | ||
| 443 | |||
| 444 | intrinsics! { | ||
| 445 | extern "C" fn __udivsi3(n: u32, d: u32) -> u32 { | ||
| 446 | divider_unsigned(n, d).quotient | ||
| 447 | } | ||
| 448 | |||
| 449 | extern "C" fn __umodsi3(n: u32, d: u32) -> u32 { | ||
| 450 | divider_unsigned(n, d).remainder | ||
| 451 | } | ||
| 452 | |||
| 453 | extern "C" fn __udivmodsi4(n: u32, d: u32, rem: Option<&mut u32>) -> u32 { | ||
| 454 | let quo_rem = divider_unsigned(n, d); | ||
| 455 | if let Some(rem) = rem { | ||
| 456 | *rem = quo_rem.remainder; | ||
| 457 | } | ||
| 458 | quo_rem.quotient | ||
| 459 | } | ||
| 460 | |||
| 461 | extern "C" fn __divsi3(n: i32, d: i32) -> i32 { | ||
| 462 | divider_signed(n, d).quotient | ||
| 463 | } | ||
| 464 | |||
| 465 | extern "C" fn __modsi3(n: i32, d: i32) -> i32 { | ||
| 466 | divider_signed(n, d).remainder | ||
| 467 | } | ||
| 468 | |||
| 469 | extern "C" fn __divmodsi4(n: i32, d: i32, rem: &mut i32) -> i32 { | ||
| 470 | let quo_rem = divider_signed(n, d); | ||
| 471 | *rem = quo_rem.remainder; | ||
| 472 | quo_rem.quotient | ||
| 473 | } | ||
| 474 | } | ||
