From 11095bc15e4f9bd7a990d4b8fc5d41b0b246c8bd Mon Sep 17 00:00:00 2001
From: "J. Nick Koston" <nick@koston.org>
Date: Sun, 22 Feb 2026 22:41:22 -0600
Subject: [PATCH] [core] Avoid expensive modulo in LockFreeQueue for non-power-of-2 sizes

Replace modulo-based ring buffer index advancement with constexpr-dispatched
approach: power-of-2 sizes keep modulo (compiler emits single mask
instruction), non-power-of-2 sizes use comparison+branch instead of
multiply-shift sequences.

Benchmarked on real ESP32-C3 (RISC-V) hardware (100k iterations):
- SIZE=88 (BLE): 24,120 us -> 6,493 us (3.7x faster)
- SIZE=30 (MQTT): 22,816 us -> 6,879 us (3.3x faster)
- SIZE=32 (pow2): identical (both use mask)

Flash savings on ESP32 Xtensa (BLE proxy build):
- push(): 149 -> 133 bytes (-16 B)
- pop(): 78 -> 67 bytes (-11 B)
---
 esphome/core/lock_free_queue.h | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/esphome/core/lock_free_queue.h b/esphome/core/lock_free_queue.h
index e96b739b58..a589087162 100644
--- a/esphome/core/lock_free_queue.h
+++ b/esphome/core/lock_free_queue.h
@@ -38,13 +38,27 @@ template<class T, uint8_t SIZE> class LockFreeQueue {
   }
 
  protected:
+  // Advance ring buffer index by one, wrapping at SIZE.
+  // Power-of-2 sizes use modulo (compiler emits single mask instruction).
+  // Non-power-of-2 sizes use comparison to avoid expensive multiply-shift sequences.
+  static constexpr uint8_t next_index_(uint8_t index) {
+    if constexpr ((SIZE & (SIZE - 1)) == 0) {
+      return (index + 1) % SIZE;
+    } else {
+      uint8_t next = index + 1;
+      if (next >= SIZE) [[unlikely]]
+        next = 0;
+      return next;
+    }
+  }
+
   // Internal push that reports queue state - for use by derived classes
   bool push_internal_(T *element, bool &was_empty, uint8_t &old_tail) {
     if (element == nullptr)
       return false;
 
     uint8_t current_tail = tail_.load(std::memory_order_relaxed);
-    uint8_t next_tail = (current_tail + 1) % SIZE;
+    uint8_t next_tail = next_index_(current_tail);
 
     // Read head before incrementing tail
     uint8_t head_before = head_.load(std::memory_order_acquire);
@@ -73,14 +87,21 @@ template<class T, uint8_t SIZE> class LockFreeQueue {
     }
 
     T *element = buffer_[current_head];
-    head_.store((current_head + 1) % SIZE, std::memory_order_release);
+    head_.store(next_index_(current_head), std::memory_order_release);
     return element;
   }
 
   size_t size() const {
     uint8_t tail = tail_.load(std::memory_order_acquire);
     uint8_t head = head_.load(std::memory_order_acquire);
-    return (tail - head + SIZE) % SIZE;
+    if constexpr ((SIZE & (SIZE - 1)) == 0) {
+      return (tail - head + SIZE) % SIZE;
+    } else {
+      int diff = static_cast<int>(tail) - static_cast<int>(head);
+      if (diff < 0)
+        diff += SIZE;
+      return static_cast<size_t>(diff);
+    }
   }
 
   uint16_t get_and_reset_dropped_count() { return dropped_count_.exchange(0, std::memory_order_relaxed); }
@@ -90,7 +111,7 @@ template<class T, uint8_t SIZE> class LockFreeQueue {
   bool empty() const { return head_.load(std::memory_order_acquire) == tail_.load(std::memory_order_acquire); }
 
   bool full() const {
-    uint8_t next_tail = (tail_.load(std::memory_order_relaxed) + 1) % SIZE;
+    uint8_t next_tail = next_index_(tail_.load(std::memory_order_relaxed));
     return next_tail == head_.load(std::memory_order_acquire);
   }