[core] Avoid expensive modulo in LockFreeQueue for non-power-of-2 sizes

Replace modulo-based ring buffer index advancement with constexpr-dispatched
approach: power-of-2 sizes keep modulo (compiler emits single mask instruction),
non-power-of-2 sizes use comparison+branch instead of multiply-shift sequences.

Benchmarked on real ESP32-C3 (RISC-V) hardware (100k iterations):
- SIZE=88 (BLE):  24,120 us -> 6,493 us (3.7x faster)
- SIZE=30 (MQTT): 22,816 us -> 6,879 us (3.3x faster)
- SIZE=32 (pow2): identical (both use mask)

Flash savings on ESP32 Xtensa (BLE proxy build):
- push(): 149 -> 133 bytes (-16 B)
- pop():   78 ->  67 bytes (-11 B)
This commit is contained in:
J. Nick Koston
2026-02-22 22:41:22 -06:00
parent ee94bc4715
commit 11095bc15e

View File

@@ -38,13 +38,27 @@ template<class T, uint8_t SIZE> class LockFreeQueue {
}
protected:
// Advance ring buffer index by one, wrapping at SIZE.
// Power-of-2 sizes use modulo (compiler emits single mask instruction).
// Non-power-of-2 sizes use comparison to avoid expensive multiply-shift sequences.
// NOTE: (SIZE & (SIZE - 1)) == 0 is the standard power-of-two test; the
// `if constexpr` resolves at compile time, so each instantiation carries
// exactly one of the two branches with zero runtime dispatch cost.
static constexpr uint8_t next_index_(uint8_t index) {
if constexpr ((SIZE & (SIZE - 1)) == 0) {
// SIZE == 2^k: (index + 1) % SIZE lowers to (index + 1) & (SIZE - 1).
return (index + 1) % SIZE;
} else {
uint8_t next = index + 1;
// Wrap branch is taken once every SIZE calls, hence the [[unlikely]] hint.
if (next >= SIZE) [[unlikely]]
next = 0;
return next;
}
}
// Internal push that reports queue state - for use by derived classes
bool push_internal_(T *element, bool &was_empty, uint8_t &old_tail) {
if (element == nullptr)
return false;
uint8_t current_tail = tail_.load(std::memory_order_relaxed);
uint8_t next_tail = (current_tail + 1) % SIZE;
uint8_t next_tail = next_index_(current_tail);
// Read head before incrementing tail
uint8_t head_before = head_.load(std::memory_order_acquire);
@@ -73,14 +87,21 @@ template<class T, uint8_t SIZE> class LockFreeQueue {
}
T *element = buffer_[current_head];
head_.store((current_head + 1) % SIZE, std::memory_order_release);
head_.store(next_index_(current_head), std::memory_order_release);
return element;
}
size_t size() const {
uint8_t tail = tail_.load(std::memory_order_acquire);
uint8_t head = head_.load(std::memory_order_acquire);
return (tail - head + SIZE) % SIZE;
if constexpr ((SIZE & (SIZE - 1)) == 0) {
return (tail - head + SIZE) % SIZE;
} else {
int diff = static_cast<int>(tail) - static_cast<int>(head);
if (diff < 0)
diff += SIZE;
return static_cast<size_t>(diff);
}
}
uint16_t get_and_reset_dropped_count() { return dropped_count_.exchange(0, std::memory_order_relaxed); }
@@ -90,7 +111,7 @@ template<class T, uint8_t SIZE> class LockFreeQueue {
// True when head == tail, i.e. the queue holds no elements. Both indices are
// read with acquire ordering, pairing with the release stores done on the
// producer/consumer side when head_/tail_ are advanced.
bool empty() const { return head_.load(std::memory_order_acquire) == tail_.load(std::memory_order_acquire); }
bool full() const {
uint8_t next_tail = (tail_.load(std::memory_order_relaxed) + 1) % SIZE;
uint8_t next_tail = next_index_(tail_.load(std::memory_order_relaxed));
return next_tail == head_.load(std::memory_order_acquire);
}