From 11095bc15e4f9bd7a990d4b8fc5d41b0b246c8bd Mon Sep 17 00:00:00 2001
From: "J. Nick Koston" <nick@koston.org>
Date: Sun, 22 Feb 2026 22:41:22 -0600
Subject: [PATCH] [core] Avoid expensive modulo in LockFreeQueue for non-power-of-2 sizes

Replace modulo-based ring buffer index advancement with constexpr-dispatched
approach: power-of-2 sizes keep modulo (compiler emits single mask
instruction), non-power-of-2 sizes use comparison+branch instead of
multiply-shift sequences.

Benchmarked on real ESP32-C3 (RISC-V) hardware (100k iterations):
- SIZE=88 (BLE): 24,120 us -> 6,493 us (3.7x faster)
- SIZE=30 (MQTT): 22,816 us -> 6,879 us (3.3x faster)
- SIZE=32 (pow2): identical (both use mask)

Flash savings on ESP32 Xtensa (BLE proxy build):
- push(): 149 -> 133 bytes (-16 B)
- pop(): 78 -> 67 bytes (-11 B)
---
 esphome/core/lock_free_queue.h | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/esphome/core/lock_free_queue.h b/esphome/core/lock_free_queue.h
index e96b739b58..a589087162 100644
--- a/esphome/core/lock_free_queue.h
+++ b/esphome/core/lock_free_queue.h
@@ -38,13 +38,27 @@ template<class T, uint8_t SIZE> class LockFreeQueue {
   }
 
  protected:
+  // Advance ring buffer index by one, wrapping at SIZE.
+  // Power-of-2 sizes use modulo (compiler emits single mask instruction).
+  // Non-power-of-2 sizes use comparison to avoid expensive multiply-shift sequences.
+  static constexpr uint8_t next_index_(uint8_t index) {
+    if constexpr ((SIZE & (SIZE - 1)) == 0) {
+      return (index + 1) % SIZE;
+    } else {
+      uint8_t next = index + 1;
+      if (next >= SIZE) [[unlikely]]
+        next = 0;
+      return next;
+    }
+  }
+
   // Internal push that reports queue state - for use by derived classes
   bool push_internal_(T *element, bool &was_empty, uint8_t &old_tail) {
     if (element == nullptr)
       return false;
 
     uint8_t current_tail = tail_.load(std::memory_order_relaxed);
-    uint8_t next_tail = (current_tail + 1) % SIZE;
+    uint8_t next_tail = next_index_(current_tail);
 
     // Read head before incrementing tail
     uint8_t head_before = head_.load(std::memory_order_acquire);
@@ -73,14 +87,21 @@ template<class T, uint8_t SIZE> class LockFreeQueue {
     }
 
     T *element = buffer_[current_head];
-    head_.store((current_head + 1) % SIZE, std::memory_order_release);
+    head_.store(next_index_(current_head), std::memory_order_release);
     return element;
   }
 
   size_t size() const {
     uint8_t tail = tail_.load(std::memory_order_acquire);
     uint8_t head = head_.load(std::memory_order_acquire);
-    return (tail - head + SIZE) % SIZE;
+    if constexpr ((SIZE & (SIZE - 1)) == 0) {
+      return (tail - head + SIZE) % SIZE;
+    } else {
+      int diff = static_cast<int>(tail) - static_cast<int>(head);
+      if (diff < 0)
+        diff += SIZE;
+      return static_cast<size_t>(diff);
+    }
   }
 
   uint16_t get_and_reset_dropped_count() { return dropped_count_.exchange(0, std::memory_order_relaxed); }
@@ -90,7 +111,7 @@ template<class T, uint8_t SIZE> class LockFreeQueue {
   bool empty() const { return head_.load(std::memory_order_acquire) == tail_.load(std::memory_order_acquire); }
 
   bool full() const {
-    uint8_t next_tail = (tail_.load(std::memory_order_relaxed) + 1) % SIZE;
+    uint8_t next_tail = next_index_(tail_.load(std::memory_order_relaxed));
     return next_tail == head_.load(std::memory_order_acquire);
   }