[core] Avoid expensive modulo in LockFreeQueue for non-power-of-2 sizes

Replace modulo-based ring buffer index advancement with constexpr-dispatched
approach: power-of-2 sizes keep modulo (compiler emits single mask instruction),
non-power-of-2 sizes use comparison+branch instead of multiply-shift sequences.

Benchmarked on real ESP32-C3 (RISC-V) hardware (100k iterations):
- SIZE=88 (BLE):  24,120 us -> 6,493 us (3.7x faster)
- SIZE=30 (MQTT): 22,816 us -> 6,879 us (3.3x faster)
- SIZE=32 (pow2): identical (both use mask)

Flash savings on ESP32 Xtensa (BLE proxy build):
- push(): 149 -> 133 bytes (-16 B)
- pop():   78 ->  67 bytes (-11 B)
This commit is contained in:
J. Nick Koston
2026-02-22 22:41:22 -06:00
parent ee94bc4715
commit 11095bc15e

View File

@@ -38,13 +38,27 @@ template<class T, uint8_t SIZE> class LockFreeQueue {
}
protected:
// Advance ring buffer index by one, wrapping at SIZE.
// Power-of-2 sizes use modulo (compiler emits single mask instruction).
// Non-power-of-2 sizes use comparison to avoid expensive multiply-shift sequences.
// NOTE: (SIZE & (SIZE - 1)) == 0 is the standard power-of-two test; the
// `if constexpr` resolves at compile time, so each instantiation carries
// exactly one of the two branches with zero runtime dispatch cost.
static constexpr uint8_t next_index_(uint8_t index) {
if constexpr ((SIZE & (SIZE - 1)) == 0) {
// SIZE == 2^k: (index + 1) % SIZE lowers to (index + 1) & (SIZE - 1).
return (index + 1) % SIZE;
} else {
uint8_t next = index + 1;
// Wrap branch is taken once every SIZE calls, hence the [[unlikely]] hint.
if (next >= SIZE) [[unlikely]]
next = 0;
return next;
}
}
// Internal push that reports queue state - for use by derived classes
bool push_internal_(T *element, bool &was_empty, uint8_t &old_tail) {
if (element == nullptr)
return false;
uint8_t current_tail = tail_.load(std::memory_order_relaxed);
uint8_t next_tail = (current_tail + 1) % SIZE;
uint8_t next_tail = next_index_(current_tail);
// Read head before incrementing tail
uint8_t head_before = head_.load(std::memory_order_acquire);
@@ -73,14 +87,21 @@ template<class T, uint8_t SIZE> class LockFreeQueue {
}
T *element = buffer_[current_head];
head_.store((current_head + 1) % SIZE, std::memory_order_release);
head_.store(next_index_(current_head), std::memory_order_release);
return element;
}
size_t size() const {
uint8_t tail = tail_.load(std::memory_order_acquire);
uint8_t head = head_.load(std::memory_order_acquire);
return (tail - head + SIZE) % SIZE;
if constexpr ((SIZE & (SIZE - 1)) == 0) {
return (tail - head + SIZE) % SIZE;
} else {
int diff = static_cast<int>(tail) - static_cast<int>(head);
if (diff < 0)
diff += SIZE;
return static_cast<size_t>(diff);
}
}
uint16_t get_and_reset_dropped_count() { return dropped_count_.exchange(0, std::memory_order_relaxed); }
@@ -90,7 +111,7 @@ template<class T, uint8_t SIZE> class LockFreeQueue {
// True when head == tail, i.e. the queue holds no elements. Both indices are
// read with acquire ordering, pairing with the release stores done on the
// producer/consumer side when head_/tail_ are advanced.
bool empty() const { return head_.load(std::memory_order_acquire) == tail_.load(std::memory_order_acquire); }
bool full() const {
uint8_t next_tail = (tail_.load(std::memory_order_relaxed) + 1) % SIZE;
uint8_t next_tail = next_index_(tail_.load(std::memory_order_relaxed));
return next_tail == head_.load(std::memory_order_acquire);
}