From 32797534a70dbb83b02ec1e3fc67411d25699419 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Fri, 7 Nov 2025 22:04:58 -0600 Subject: [PATCH] propsals --- callback_manager_optimize.md | 845 ++++++++++++++++++ callback_optimization_analysis.md | 75 ++ callback_usage_analysis.md | 118 +++ lazy_callback_tradeoff_analysis.md | 188 ++++ partitioned_callback_final_proposal.md | 286 ++++++ unified_callback_proposal.md | 208 +++++ ..._event_emitter_replace_simple_callbacks.md | 204 +++++ usb_callback_contexts.md | 90 ++ 8 files changed, 2014 insertions(+) create mode 100644 callback_manager_optimize.md create mode 100644 callback_optimization_analysis.md create mode 100644 callback_usage_analysis.md create mode 100644 lazy_callback_tradeoff_analysis.md create mode 100644 partitioned_callback_final_proposal.md create mode 100644 unified_callback_proposal.md create mode 100644 updated_plan_remove_event_emitter_replace_simple_callbacks.md create mode 100644 usb_callback_contexts.md diff --git a/callback_manager_optimize.md b/callback_manager_optimize.md new file mode 100644 index 0000000000..25b22ae568 --- /dev/null +++ b/callback_manager_optimize.md @@ -0,0 +1,845 @@ +# CallbackManager Optimization Plan + +**Note:** ESPHome uses C++20 (gnu++20), so implementations leverage modern C++ features: +- **Concepts** for type constraints and better error messages +- **Designated initializers** for cleaner struct initialization +- **consteval** for compile-time validation +- **Requires clauses** for inline constraints + +## Current State + +### Memory Profile (ESP32 - 32-bit) + +```cpp +sizeof(std::function): 32 bytes +sizeof(void*): 4 bytes +sizeof(function pointer): 4 bytes +``` + +### Current Implementation + +```cpp +template class CallbackManager { + public: + void add(std::function &&callback) { + this->callbacks_.push_back(std::move(callback)); + } + + void call(Ts... args) { + for (auto &cb : this->callbacks_) + cb(args...); + } + + size_t size() const { return this->callbacks_.size(); } + + protected: + std::vector> callbacks_; +}; +``` + +### Memory Cost Per Instance + +- **Per callback:** 32 bytes (std::function storage) +- **Vector reallocation code:** ~132 bytes (`_M_realloc_append` template instantiation) +- **Example (1 callback):** 32 + 132 = 164 bytes + +### Codebase Usage + +- **Total CallbackManager instances:** ~67 files +- **Estimated total callbacks:** 100-150 across all components +- **Examples:** + - `sensor.h`: `CallbackManager` - multiple callbacks per sensor + - `esp32_ble_tracker.h`: `CallbackManager` - 1 callback (bluetooth_proxy) + - `esp32_improv.h`: `CallbackManager` - up to 5 callbacks (automation triggers) + - `climate.h`: `CallbackManager` - multiple callbacks for state/control + +### Current Usage Pattern + +All callbacks currently use lambda captures: + +```cpp +// bluetooth_proxy.cpp +parent_->add_scanner_state_callback([this](ScannerState state) { + if (this->api_connection_ != nullptr) { + this->send_bluetooth_scanner_state_(state); + } +}); + +// sensor.cpp (via automation) +sensor->add_on_state_callback([this](float state) { + this->trigger(state); +}); +``` + +--- + +## Optimization Options + +### Option 1: Function Pointer + Context (Recommended) + +**C++20 Implementation (Type-Safe with Concepts):** + +```cpp +#include +#include + +// Concept to validate callback signature +template +concept CallbackFunction = requires(F func, Context* ctx, Ts... args) { + { func(ctx, args...) } -> std::same_as; +}; + +template +class CallbackManager { + private: + struct Callback { + void (*invoker)(void*, Ts...); // 4 bytes - type-erased invoker + void* context; // 4 bytes - captured context + // Total: 8 bytes + }; + + // Type-safe invoker template - knows real context type + template + static void invoke(void* ctx, Ts... args) { + auto typed_func = reinterpret_cast( + *static_cast(ctx) + ); + auto typed_ctx = static_cast( + *reinterpret_cast(static_cast(ctx) + sizeof(void*)) + ); + typed_func(typed_ctx, args...); + } + + std::vector callbacks_; + + public: + // Type-safe registration with concept constraint + template + requires CallbackFunction + void add(void (*func)(Context*, Ts...), Context* context) { + // Use designated initializers (C++20) + callbacks_.push_back({ + .invoker = [](void* storage, Ts... args) { + // Extract function pointer and context from packed storage + void* func_and_ctx[2]; + std::memcpy(func_and_ctx, storage, sizeof(func_and_ctx)); + + auto typed_func = reinterpret_cast(func_and_ctx[0]); + auto typed_ctx = static_cast(func_and_ctx[1]); + typed_func(typed_ctx, args...); + }, + .context = nullptr // Will store packed data + }); + + // Pack function pointer and context into the callback storage + void* func_and_ctx[2] = { reinterpret_cast(func), context }; + std::memcpy(&callbacks_.back(), func_and_ctx, sizeof(func_and_ctx)); + } + + void call(Ts... args) { + for (auto& cb : callbacks_) { + cb.invoker(&cb, args...); + } + } + + constexpr size_t size() const { return callbacks_.size(); } +}; +``` + +**Cleaner C++20 Implementation (12 bytes, simpler):** + +```cpp +template +class CallbackManager { + private: + struct Callback { + void (*invoker)(void*, void*, Ts...); // 4 bytes - generic invoker + void* func_ptr; // 4 bytes - actual function + void* context; // 4 bytes - context + // Total: 12 bytes (still 20 bytes saved vs std::function!) + }; + + template + static consteval auto make_invoker() { + return +[](void* func, void* ctx, Ts... args) { + auto typed_func = reinterpret_cast(func); + typed_func(static_cast(ctx), args...); + }; + } + + std::vector callbacks_; + + public: + // C++20 concepts for type safety + template + requires std::invocable + void add(void (*func)(Context*, Ts...), Context* context) { + // C++20 designated initializers + callbacks_.push_back({ + .invoker = make_invoker(), + .func_ptr = reinterpret_cast(func), + .context = context + }); + } + + void call(Ts... args) { + for (auto& cb : callbacks_) { + cb.invoker(cb.func_ptr, cb.context, args...); + } + } + + constexpr size_t size() const { return callbacks_.size(); } + constexpr bool empty() const { return callbacks_.empty(); } +}; +``` + +**Most Efficient C++20 Implementation (8 bytes):** + +```cpp +template +class CallbackManager { + private: + struct Callback { + void (*invoker)(void*, Ts...); // 4 bytes + void* context; // 4 bytes + // Total: 8 bytes - maximum savings! + }; + + // C++20: consteval ensures compile-time evaluation + template + static consteval auto make_invoker() { + // The + forces decay to function pointer + return +[](void* ctx, Ts... args) { + // Unpack the storage struct + struct Storage { + void (*func)(Context*, Ts...); + Context* context; + }; + auto* storage = static_cast(ctx); + storage->func(storage->context, args...); + }; + } + + std::vector callbacks_; + + public: + template + requires std::invocable + void add(void (*func)(Context*, Ts...), Context* context) { + // Allocate storage for function + context + struct Storage { + void (*func)(Context*, Ts...); + Context* context; + }; + + auto* storage = new Storage{func, context}; + + callbacks_.push_back({ + .invoker = make_invoker(), + .context = storage + }); + } + + ~CallbackManager() { + // Clean up storage + for (auto& cb : callbacks_) { + delete static_cast(cb.context); + } + } + + void call(Ts... args) { + for (auto& cb : callbacks_) { + cb.invoker(cb.context, args...); + } + } + + constexpr size_t size() const { return callbacks_.size(); } +}; +``` + +**Simplest C++20 Implementation (Recommended):** + +```cpp +template +class CallbackManager { + private: + struct Callback { + void (*invoker)(void*, void*, Ts...); // 4 bytes + void* func_ptr; // 4 bytes + void* context; // 4 bytes + // Total: 12 bytes + }; + + template + static void invoke(void* func, void* ctx, Ts... args) { + reinterpret_cast(func)(static_cast(ctx), args...); + } + + std::vector callbacks_; + + public: + template + requires std::invocable + void add(void (*func)(Context*, Ts...), Context* context) { + callbacks_.push_back({ + .invoker = &invoke, + .func_ptr = reinterpret_cast(func), + .context = context + }); + } + + void call(Ts... args) { + for (auto& cb : callbacks_) { + cb.invoker(cb.func_ptr, cb.context, args...); + } + } + + constexpr size_t size() const { return callbacks_.size(); } +}; +``` + +**C++20 Benefits:** +- ✅ **Concepts** provide clear compile errors +- ✅ **Designated initializers** make code more readable +- ✅ **consteval** ensures compile-time evaluation +- ✅ **constexpr** improvements allow more compile-time validation +- ✅ **Requires clauses** document constraints inline + +**Usage Changes:** + +```cpp +// OLD (lambda): +parent_->add_scanner_state_callback([this](ScannerState state) { + if (this->api_connection_ != nullptr) { + this->send_bluetooth_scanner_state_(state); + } +}); + +// NEW (static function + context): +static void scanner_state_callback(BluetoothProxy* proxy, ScannerState state) { + if (proxy->api_connection_ != nullptr) { + proxy->send_bluetooth_scanner_state_(state); + } +} + +// Registration +parent_->add_scanner_state_callback(scanner_state_callback, this); +``` + +**Savings:** +- **Per callback:** 24 bytes (32 → 8) or 20 bytes (32 → 12 for simpler version) +- **RAM saved (100-150 callbacks):** 2.4 - 3.6 KB +- **Flash saved:** ~5-10 KB (eliminates std::function template instantiations) + +**Pros:** +- ✅ Maximum memory savings (75% reduction) +- ✅ Type-safe at registration time +- ✅ No virtual function overhead +- ✅ Works with all capture patterns +- ✅ Simple implementation + +**Cons:** +- ❌ Requires converting lambdas to static functions +- ❌ Changes API for all 67 CallbackManager users +- ❌ More verbose at call site + +--- + +### Option 2: Member Function Pointers + +**Implementation:** + +```cpp +template +class CallbackManager { + private: + struct Callback { + void (*invoker)(void*, Ts...); // 4 bytes + void* obj; // 4 bytes + // Total: 8 bytes + }; + + template + static void invoke_member(void* obj, Ts... args) { + (static_cast(obj)->*Method)(args...); + } + + std::vector callbacks_; + + public: + // Register a member function + template + void add(T* obj) { + callbacks_.push_back({ + &invoke_member, + obj + }); + } + + void call(Ts... args) { + for (auto& cb : callbacks_) { + cb.invoker(cb.obj, args...); + } + } + + size_t size() const { return callbacks_.size(); } +}; +``` + +**Usage Changes:** + +```cpp +// Add a method to BluetoothProxy +void BluetoothProxy::on_scanner_state_changed(ScannerState state) { + if (this->api_connection_ != nullptr) { + this->send_bluetooth_scanner_state_(state); + } +} + +// Register it +parent_->add_scanner_state_callback(this); +``` + +**Savings:** +- **Per callback:** 24 bytes (32 → 8) +- **RAM saved:** 2.4 - 3.6 KB +- **Flash saved:** ~5-10 KB + +**Pros:** +- ✅ Same memory savings as Option 1 +- ✅ Most type-safe (member function pointers) +- ✅ No static functions needed +- ✅ Clean separation of callback logic + +**Cons:** +- ❌ Verbose syntax at registration: `add(this)` +- ❌ Requires adding methods to classes +- ❌ Can't capture additional state beyond `this` +- ❌ Template parameters at call site are ugly + +--- + +### Option 3: Hybrid (Backward Compatible) + +**Implementation:** + +```cpp +template +class CallbackManager { + private: + struct Callback { + void (*invoker)(void*, Ts...); // 4 bytes + void* data; // 4 bytes + bool is_std_function; // 1 byte + 3 padding = 4 bytes + // Total: 12 bytes + }; + + std::vector callbacks_; + + public: + // Optimized: function pointer + context + template + void add(void (*func)(Context*, Ts...), Context* context) { + callbacks_.push_back({ + [](void* ctx, Ts... args) { + auto cb = static_cast(ctx); + auto typed_func = reinterpret_cast(cb->data); + auto typed_ctx = static_cast(*reinterpret_cast( + static_cast(cb) + offsetof(Callback, data) + )); + typed_func(typed_ctx, args...); + }, + reinterpret_cast(func), + false + }); + } + + // Legacy: std::function support (for gradual migration) + void add(std::function&& func) { + auto* stored = new std::function(std::move(func)); + callbacks_.push_back({ + [](void* ctx, Ts... args) { + (*static_cast*>(ctx))(args...); + }, + stored, + true + }); + } + + ~CallbackManager() { + for (auto& cb : callbacks_) { + if (cb.is_std_function) { + delete static_cast*>(cb.data); + } + } + } + + void call(Ts... args) { + for (auto& cb : callbacks_) { + cb.invoker(&cb, args...); + } + } + + size_t size() const { return callbacks_.size(); } +}; +``` + +**Usage:** + +```cpp +// NEW (optimized): +parent_->add_scanner_state_callback(scanner_state_callback, this); + +// OLD (still works - gradual migration): +parent_->add_scanner_state_callback([this](ScannerState state) { + // ... lambda still works +}); +``` + +**Savings:** +- **Per optimized callback:** 20 bytes (32 → 12) +- **Per legacy callback:** 0 bytes (still uses std::function) +- **Allows gradual migration** + +**Pros:** +- ✅ Backward compatible +- ✅ Gradual migration path +- ✅ Mix optimized and legacy in same codebase +- ✅ No breaking changes + +**Cons:** +- ❌ More complex implementation +- ❌ Need to track which callbacks need cleanup +- ❌ Extra bool field (padding makes it 12 bytes instead of 8) +- ❌ std::function still compiled in + +--- + +### Option 4: FixedVector (Keep std::function, Optimize Vector) + +**Implementation:** + +```cpp +template +class CallbackManager { + public: + void add(std::function &&callback) { + if (this->callbacks_.empty()) { + // Most CallbackManagers have 1-5 callbacks + this->callbacks_.init(5); + } + this->callbacks_.push_back(std::move(callback)); + } + + void call(Ts... args) { + for (auto &cb : this->callbacks_) + cb(args...); + } + + size_t size() const { return this->callbacks_.size(); } + + protected: + FixedVector> callbacks_; // Changed from std::vector +}; +``` + +**Savings:** +- **Per callback:** 0 bytes (still 32 bytes) +- **Per instance:** ~132 bytes (eliminates `_M_realloc_append`) +- **Flash saved:** ~5-10 KB (one less vector template instantiation per type) +- **Total:** ~132 bytes × ~20 unique callback types = ~2.6 KB + +**Pros:** +- ✅ No API changes +- ✅ Drop-in replacement +- ✅ Eliminates vector reallocation machinery +- ✅ Zero migration cost + +**Cons:** +- ❌ No per-callback savings +- ❌ std::function still 32 bytes each +- ❌ Must guess max size at runtime +- ❌ Can still overflow if guess is wrong + +--- + +### Option 5: Template Parameter for Storage (Advanced) + +**Implementation:** + +```cpp +enum class CallbackStorage { + FUNCTION, // Use std::function (default, most flexible) + FUNCTION_PTR // Use function pointer + context (optimal) +}; + +template +class CallbackManager { + // Specialize implementation based on Storage parameter +}; + +// Default: std::function (backward compatible) +template +class CallbackManager { + protected: + std::vector> callbacks_; + // ... current implementation +}; + +// Optimized: function pointer + context +template +class CallbackManager { + private: + struct Callback { + void (*func)(void*, Ts...); + void* context; + }; + std::vector callbacks_; + // ... Option 1 implementation +}; +``` + +**Usage:** + +```cpp +// Old components (no changes): +CallbackManager callback_; // Uses std::function by default + +// Optimized components: +CallbackManager scanner_state_callbacks_; +``` + +**Savings:** +- **Opt-in per component** +- **Same as Option 1 for optimized components** + +**Pros:** +- ✅ Gradual migration +- ✅ No breaking changes +- ✅ Explicit opt-in per component +- ✅ Clear which components are optimized + +**Cons:** +- ❌ Complex template metaprogramming +- ❌ Two implementations to maintain +- ❌ Template parameter pollution +- ❌ Harder to understand codebase + +--- + +## Comparison Matrix + +| Option | Per-Callback Savings | Flash Savings | API Changes | Complexity | Migration Cost | +|--------|---------------------|---------------|-------------|------------|----------------| +| **1. Function Ptr + Context** | **24 bytes** (75%) | **~10 KB** | Yes | Low | High (67 files) | +| **2. Member Function Ptrs** | **24 bytes** (75%) | **~10 KB** | Yes | Medium | High + class changes | +| **3. Hybrid** | **20 bytes** (opt-in) | **~8 KB** | No | High | Low (gradual) | +| **4. FixedVector** | **0 bytes** | **~3 KB** | No | Low | None | +| **5. Template Parameter** | **24 bytes** (opt-in) | **~10 KB** | Optional | High | Medium | + +--- + +## Migration Effort Estimate + +### Option 1 (Function Pointer + Context) + +**Files to change:** ~67 files with CallbackManager usage + +**Per-file changes:** +1. Convert lambda to static function (5 min) +2. Update registration call (1 min) +3. Test (5 min) + +**Estimate:** ~11 min × 67 files = **~12 hours** (assuming some files have multiple callbacks) + +**High-impact components to prioritize:** +- `sensor.h` / `sensor.cpp` - many sensor callbacks +- `esp32_ble_tracker.h` - BLE callbacks +- `climate.h` - climate callbacks +- `binary_sensor.h` - binary sensor callbacks + +### Option 4 (FixedVector) + +**Files to change:** 1 file (`esphome/core/helpers.h`) + +**Changes:** +1. Change `std::vector` to `FixedVector` in CallbackManager +2. Initialize with reasonable default size (e.g., 5) +3. Test across codebase + +**Estimate:** **~1 hour** + +--- + +## Recommendations + +### Immediate Action: Option 4 (FixedVector) + +**Why:** +- Zero migration cost +- Immediate ~3 KB flash savings +- No API changes +- Low risk + +**Implementation:** +```cpp +template class CallbackManager { + public: + void add(std::function &&callback) { + if (this->callbacks_.empty()) { + this->callbacks_.init(8); // Most have < 8 callbacks + } + this->callbacks_.push_back(std::move(callback)); + } + // ... rest unchanged + protected: + FixedVector> callbacks_; +}; +``` + +### Long-term: Option 1 (Function Pointer + Context) + +**Why:** +- Maximum savings (2.4-3.6 KB RAM + 10 KB flash) +- Clean, simple implementation +- Type-safe +- Well-tested pattern + +**Migration Strategy:** +1. Implement new `CallbackManager` in `helpers.h` +2. Migrate high-impact components first: + - Core components (sensor, binary_sensor, climate) + - BLE components (esp32_ble_tracker, bluetooth_proxy) + - Network components (api, mqtt) +3. Create helper macros to reduce boilerplate +4. Migrate remaining components over 2-3 releases + +**Helper Macro Example:** +```cpp +// Define a callback wrapper +#define CALLBACK_WRAPPER(Class, Method, ...) \ + static void Method##_callback(Class* self, ##__VA_ARGS__) { \ + self->Method(__VA_ARGS__); \ + } + +// In class: +class BluetoothProxy { + CALLBACK_WRAPPER(BluetoothProxy, on_scanner_state, ScannerState state) + + void on_scanner_state(ScannerState state) { + // Implementation + } + + void setup() { + parent_->add_scanner_state_callback(on_scanner_state_callback, this); + } +}; +``` + +--- + +## Testing Plan + +### Phase 1: Unit Tests +- Test CallbackManager with various signatures +- Test multiple callbacks (1, 5, 10, 50) +- Test callback removal/cancellation +- Test edge cases (empty, nullptr, etc.) + +### Phase 2: Integration Tests +- Create test YAML with heavily-used callbacks +- Run on ESP32, ESP8266, RP2040 +- Measure before/after memory usage +- Verify no functional regressions + +### Phase 3: Component Tests +- Test high-impact components: + - sensor with multiple state callbacks + - esp32_improv with all automation triggers + - climate with state/control callbacks +- Measure memory with `esphome analyze-memory` + +--- + +## Risk Analysis + +### Option 1 Risks + +**Risk: Breaking change across 67 files** +- **Mitigation:** Gradual rollout over 2-3 releases +- **Mitigation:** Extensive testing on real hardware + +**Risk: Static function verbosity** +- **Mitigation:** Helper macros (see above) +- **Mitigation:** Code generation from Python + +**Risk: Missing captures** +- **Mitigation:** Static analysis to find lambda captures +- **Mitigation:** Compile-time errors for incorrect usage + +### Option 4 Risks + +**Risk: Buffer overflow if size guess is wrong** +- **Mitigation:** Choose conservative default (8) +- **Mitigation:** Add runtime warning on resize +- **Mitigation:** Monitor in CI/testing + +**Risk: Still uses std::function (32 bytes each)** +- **Mitigation:** Follow up with Option 1 migration +- **Mitigation:** This is a stepping stone, not final solution + +--- + +## Implementation Timeline + +### Week 1: Option 4 (Quick Win) +- Implement FixedVector in CallbackManager +- Test across codebase +- Create PR with memory analysis +- **Expected savings:** ~3 KB flash + +### Month 1-2: Option 1 (Core Components) +- Implement function pointer CallbackManager +- Migrate sensor, binary_sensor, climate +- Create helper macros +- **Expected savings:** ~1 KB RAM + 5 KB flash + +### Month 3-4: Option 1 (Remaining Components) +- Migrate BLE components +- Migrate network components (api, mqtt) +- Migrate automation components +- **Expected savings:** ~2 KB RAM + 10 KB flash total + +### Month 5: Cleanup +- Remove std::function CallbackManager +- Update documentation +- Blog post about optimization + +--- + +## Conclusion + +**Recommended Approach:** + +1. **Immediate (Week 1):** Implement Option 4 (FixedVector) + - Low risk, zero migration cost + - ~3 KB flash savings + - Sets foundation for Option 1 + +2. **Short-term (Month 1-2):** Begin Option 1 migration + - Start with high-impact components + - ~1-2 KB RAM + 5 KB flash savings + - Validate approach + +3. **Long-term (Month 3-6):** Complete Option 1 migration + - Migrate all components + - ~3-4 KB total RAM + 10 KB flash savings + - Remove std::function variant + +**Total Expected Savings:** +- **RAM:** 2.4 - 3.6 KB (75% reduction per callback) +- **Flash:** 8 - 13 KB (vector overhead + template instantiations) +- **Performance:** Slightly faster (no std::function indirection) + +This is significant for ESP8266 (80 KB RAM, 1 MB flash) and beneficial for all platforms. diff --git a/callback_optimization_analysis.md b/callback_optimization_analysis.md new file mode 100644 index 0000000000..11066a80d2 --- /dev/null +++ b/callback_optimization_analysis.md @@ -0,0 +1,75 @@ +# Callback Optimization Analysis - Why It Failed + +## Goal +Convert stateful lambdas in CallbackManager to stateless function pointers to reduce flash usage. + +## Approach Tested + +### Attempt 1: Discriminated Union in CallbackManager +**Changed:** `CallbackManager` to use union with discriminator (like `TemplatableValue`) +- Stateless lambdas → function pointer (8 bytes) +- Stateful lambdas → heap-allocated `std::function*` (8 bytes struct + 32 bytes heap) + +**Result:** +- ❌ **+300 bytes heap usage** (37-38 callbacks × 8 bytes overhead) +- ✅ Flash savings potential: ~200-400 bytes per stateless callback +- **Verdict:** RAM is more precious than flash on ESP8266 - rejected + +### Attempt 2: Convert Individual Callbacks to Stateless +**Changed:** API logger callback from `[this]` lambda to static member function +- Used existing `global_api_server` pointer +- Made callback stateless (convertible to function pointer) + +**Result:** +``` +Removed: +- Lambda _M_invoke: 103 bytes +- Lambda _M_manager: 20 bytes + +Added: +- log_callback function: 104 bytes +- Function pointer _M_invoke: 20 bytes +- Function pointer _M_manager: 20 bytes +- Larger setup(): 7 bytes + +Net: +32 bytes flash ❌ +``` + +**Why it failed:** +Even though the callback became stateless, `CallbackManager` still uses `std::vector>`. The function pointer STILL gets wrapped in `std::function`, generating the same template instantiation overhead. We just moved the code from a lambda to a static function. + +## Root Cause + +The optimization **requires BOTH**: +1. ✅ Stateless callback (function pointer) +2. ❌ Modified `CallbackManager` to store function pointers directly without `std::function` wrapper + +Without modifying `CallbackManager`, converting individual callbacks to function pointers provides **no benefit** and actually **increases** code size slightly due to the extra function definition. + +## Conclusion + +This optimization path is a **dead end** for ESPHome because: +1. **Discriminated union approach**: Increases heap by 300 bytes (unacceptable for ESP8266) +2. **Individual callback conversion**: Increases flash by 32+ bytes (no benefit without CallbackManager changes) + +The current `std::vector>` approach is already optimal for the use case where most callbacks capture state. + +## Alternative Approaches Considered + +1. **Create separate `StatelessCallbackManager`**: Would require changing all call sites, not worth the complexity +2. **Template parameter to select storage type**: Same issue - requires modifying many components +3. **Hand-pick specific callbacks**: Provides no benefit as shown in Attempt 2 + +## Recommendation + +**Do not pursue this optimization.** The RAM/flash trade-offs are unfavorable for embedded systems where RAM is typically more constrained than flash. + +--- + +**Test Results:** +- Platform: ESP8266-Arduino +- Component: API +- Result: +32 bytes flash (0.01% increase) +- Status: Reverted + +🤖 Analysis by Claude Code diff --git a/callback_usage_analysis.md b/callback_usage_analysis.md new file mode 100644 index 0000000000..ab31136bca --- /dev/null +++ b/callback_usage_analysis.md @@ -0,0 +1,118 @@ +# add_on_state_callback Usage Analysis + +## Summary + +After the Controller Registry migration (PR #11772), `add_on_state_callback` is still widely used in the codebase, but for **legitimate reasons** - components that genuinely need per-entity state tracking. + +## Usage Breakdown + +### 1. **MQTT Components** (~17 uses) +**Purpose:** Per-entity MQTT configuration requires callbacks +- Each MQTT component instance needs to publish to custom topics with custom QoS/retain settings +- Cannot use Controller pattern due to per-entity configuration overhead +- Examples: `mqtt_sensor.cpp`, `mqtt_climate.cpp`, `mqtt_number.cpp`, etc. + +```cpp +this->sensor_->add_on_state_callback([this](float state) { + this->publish_state(state); +}); +``` + +### 2. **Copy Components** (~10 uses) +**Purpose:** Mirror state from one entity to another +- Each copy instance tracks a different source entity +- Legitimate use of callbacks for entity-to-entity synchronization +- Examples: `copy_sensor.cpp`, `copy_fan.cpp`, `copy_select.cpp`, etc. + +```cpp +source_->add_on_state_callback([this](const std::string &value) { + this->publish_state(value); +}); +``` + +### 3. **Derivative Sensors** (~5-7 uses) +**Purpose:** Compute derived values from source sensors +- **integration_sensor:** Integrates sensor values over time +- **total_daily_energy:** Tracks cumulative energy +- **combination:** Combines multiple sensor values +- **graph:** Samples sensor data for display +- **duty_time:** Tracks on-time duration +- **ntc/absolute_humidity/resistance:** Mathematical transformations + +```cpp +this->sensor_->add_on_state_callback([this](float state) { + this->process_sensor_value_(state); +}); +``` + +### 4. **Climate/Cover with Sensors** (~10-15 uses) +**Purpose:** External sensors providing feedback to control loops +- **feedback_cover:** Binary sensors for open/close/obstacle detection +- **bang_bang/pid/thermostat:** External temperature sensors for climate control +- **climate_ir (toshiba/yashima/heatpumpir):** Temperature sensors for IR climate + +```cpp +this->sensor_->add_on_state_callback([this](float state) { + this->current_temperature = state; + // Trigger control loop update +}); +``` + +### 5. **Entity Base Classes** (~10-15 definitions) +**Purpose:** Provide the callback interface for all entities +- Not actual usage, just the method definitions +- Examples: `sensor.cpp::add_on_state_callback()`, `climate.cpp::add_on_state_callback()`, etc. + +### 6. **Automation Trigger Classes** (~15-20 definitions) +**Purpose:** User-defined YAML automations need callbacks +- Files like `sensor/automation.h`, `climate/automation.h` +- Implement triggers like `on_value:`, `on_state:` +- Cannot be migrated - this is user-facing automation functionality + +### 7. **Miscellaneous** (~5-10 uses) +- **voice_assistant/micro_wake_word:** State coordination +- **esp32_improv:** Provisioning state tracking +- **http_request/update:** Update status monitoring +- **switch/binary_sensor:** Cross-component dependencies +- **OTA callbacks:** OTA state monitoring + +## Key Insights + +### What's NOT Using Callbacks Anymore ✅ +**API Server and WebServer** - migrated to Controller Registry +- **Before:** Each entity had 2 callbacks (API + WebServer) = ~32 bytes overhead +- **After:** Zero per-entity overhead = saves ~32 bytes per entity + +### What SHOULD Keep Using Callbacks ✅ +All the above categories have legitimate reasons: + +1. **Per-entity configuration:** MQTT needs custom topics/QoS per entity +2. **Entity-to-entity relationships:** Copy components, derivative sensors +3. **Control loop feedback:** Climate/cover with external sensors +4. **User-defined automations:** YAML triggers configured by users +5. **Component dependencies:** Components that genuinely depend on other entities + +## Memory Impact + +**Per Sensor (ESP32):** +- Empty callback infrastructure: **~16 bytes** (unique_ptr + empty vector) +- With one callback (e.g., MQTT): **~32 bytes** (16 + std::function) +- With multiple callbacks: **~32 + 16n bytes** (where n = additional callbacks) + +**Typical scenarios:** +- Sensor with **only API/WebServer:** ~16 bytes (no callbacks registered) +- Sensor with **MQTT:** ~32 bytes (one callback) +- Sensor with **MQTT + automation:** ~48 bytes (two callbacks) +- Sensor with **copy + total_daily_energy + graph:** ~64 bytes (three callbacks) + +## Conclusion + +The callback system is still heavily used (~103 occurrences) but for **appropriate reasons**: +- Components with per-entity state/configuration (MQTT, Copy) +- Sensor processing chains (derivatives, transformations) +- Control loops with external feedback (climate, covers) +- User-defined automations (cannot be removed) + +The Controller Registry successfully eliminated wasteful callbacks for **stateless global handlers** (API/WebServer), saving ~32 bytes per entity for those use cases. + +**No further callback elimination opportunities** exist without fundamentally changing ESPHome's architecture or breaking user-facing features. diff --git a/lazy_callback_tradeoff_analysis.md b/lazy_callback_tradeoff_analysis.md new file mode 100644 index 0000000000..11069bd680 --- /dev/null +++ b/lazy_callback_tradeoff_analysis.md @@ -0,0 +1,188 @@ +# Lazy Callback Allocation - Tradeoff Analysis + +## Current Implementation +```cpp +class Sensor { + std::unique_ptr> raw_callback_; // lazy + CallbackManager callback_; // always allocated +}; +``` + +## Proposed Implementation +```cpp +class Sensor { + std::unique_ptr> raw_callback_; // lazy + std::unique_ptr> callback_; // ALSO lazy +}; +``` + +## Memory Impact (ESP32 32-bit) + +### No Callbacks Registered +**Current:** +- `raw_callback_` unique_ptr: 4 bytes (nullptr) +- `callback_` vector struct: 12 bytes (empty, no heap allocation) +- **Total: 16 bytes** + +**Lazy:** +- `raw_callback_` unique_ptr: 4 bytes (nullptr) +- `callback_` unique_ptr: 4 bytes (nullptr) +- **Total: 8 bytes** + +**Savings: 8 bytes per entity without callbacks** ✅ + +### One Callback Registered (e.g., MQTT) +**Current:** +- In object: 4 bytes (raw ptr) + 12 bytes (vector struct) = 16 bytes +- On heap: vector allocates storage for std::function ≈ 16 bytes +- **Total: 16 + 16 = 32 bytes** + +**Lazy:** +- In object: 4 bytes (raw ptr) + 4 bytes (callback ptr) = 8 bytes +- Heap #1: CallbackManager object (vector struct) = 12 bytes +- Heap #2: vector allocates storage for std::function ≈ 16 bytes +- **Total: 8 + 12 + 16 = 36 bytes** + +**Cost: 4 bytes MORE when callbacks are used** ❌ + +## Code Changes Required + +### 1. Update publish_state() - Hot path! +```cpp +// Current +void Sensor::internal_send_state_to_frontend(float state) { + this->callback_.call(state); // Always valid +#ifdef USE_CONTROLLER_REGISTRY + ControllerRegistry::notify_sensor_update(this); +#endif +} + +// Lazy - adds nullptr check in hot path +void Sensor::internal_send_state_to_frontend(float state) { + if (this->callback_) { // ← NEW: nullptr check + this->callback_->call(state); + } +#ifdef USE_CONTROLLER_REGISTRY + ControllerRegistry::notify_sensor_update(this); +#endif +} +``` + +### 2. Update add_on_state_callback() +```cpp +// Current +void Sensor::add_on_state_callback(std::function &&callback) { + this->callback_.add(std::move(callback)); +} + +// Lazy - lazy allocate on first use +void Sensor::add_on_state_callback(std::function &&callback) { + if (!this->callback_) { + this->callback_ = std::make_unique>(); + } + this->callback_->add(std::move(callback)); +} +``` + +### 3. Apply to ALL entity types +Need to update: +- Sensor, BinarySensor, TextSensor +- Climate, Fan, Light, Cover +- Switch, Lock, Valve +- Number, Select, Text, Button +- AlarmControlPanel, MediaPlayer +- etc. + +## Performance Impact + +**Hot path (publish_state):** Adds one nullptr check per state update +- Branch predictor should handle this well (mostly predictable per entity) +- Cost: 1-2 CPU cycles (likely free with branch prediction) + +**Cold path (add callback):** Extra allocation + initialization +- Only happens during setup(), not during loop() +- Negligible impact + +## Who Benefits? + +### Entities WITHOUT callbacks (saves 8 bytes each): +✅ Sensors with **only** API/WebServer (no MQTT, no automations, no copy, no derivatives) +✅ Switches with **only** API/WebServer +✅ Binary sensors with **only** API/WebServer +✅ Covers, Fans, Lights, etc. with **only** API/WebServer + +### Entities WITH callbacks (costs 4 bytes each): +❌ Any entity with MQTT enabled +❌ Any entity with automations (`on_value:`, `on_state:`) +❌ Copy components +❌ Derivative sensors (total_daily_energy, integration, etc.) +❌ Climate/covers with feedback sensors + +## Realistic Scenario Analysis + +### Scenario 1: Simple API-only device (10 sensors, no MQTT) +**Current:** 10 × 16 = 160 bytes +**Lazy:** 10 × 8 = 80 bytes +**Savings: 80 bytes** ✅ + +### Scenario 2: MQTT-enabled device (10 sensors with MQTT) +**Current:** 10 × 32 = 320 bytes +**Lazy:** 10 × 36 = 360 bytes +**Cost: 40 bytes** ❌ + +### Scenario 3: Mixed device (5 API-only, 5 with MQTT) +**Current:** (5 × 16) + (5 × 32) = 80 + 160 = 240 bytes +**Lazy:** (5 × 8) + (5 × 36) = 40 + 180 = 220 bytes +**Savings: 20 bytes** ✅ + +### Scenario 4: Heavy automation device (10 sensors, MQTT + automation on each) +**Current:** 10 × (32 + 16) = 480 bytes (2 callbacks each) +**Lazy:** 10 × (36 + 16) = 520 bytes +**Cost: 40 bytes** ❌ + +## Recommendation + +### Pros: +✅ Saves 8 bytes per entity without callbacks +✅ Common case: many devices use only API/WebServer after Controller Registry +✅ Minimal code complexity (just nullptr checks) +✅ Negligible performance impact (predictable branch) +✅ Follows existing pattern (raw_callback_ is already lazy) + +### Cons: +❌ Costs 4 extra bytes when callbacks ARE used (extra heap allocation) +❌ Adds nullptr check to hot path (publish_state called frequently) +❌ Requires changes to ALL entity base classes (~15+ files) +❌ Users with MQTT enabled pay the 4-byte cost + +### Decision Matrix: +**Adopt if:** Most users have API-only devices (no MQTT) +**Skip if:** Most users enable MQTT or use many automations + +### Data Needed: +- What % of ESPHome devices use MQTT? +- What % of entities have automations? +- Average entity count per device? + +### My Recommendation: **WORTH CONSIDERING** + +The savings for API-only devices are real (8 bytes per entity), and with Controller Registry, more devices are API-only. The 4-byte cost for MQTT users is small compared to MQTT's overall overhead (~60+ bytes of config per entity). + +**Suggested approach:** +1. Start with Sensor (most common entity type) +2. Measure real-world impact +3. Expand to other entity types if beneficial + +**Code pattern:** +```cpp +// Helper macro to reduce boilerplate +#define LAZY_CALLBACK_CALL(callback_ptr, ...) \ + do { if (callback_ptr) { callback_ptr->call(__VA_ARGS__); } } while(0) + +void Sensor::internal_send_state_to_frontend(float state) { + LAZY_CALLBACK_CALL(this->callback_, state); + #ifdef USE_CONTROLLER_REGISTRY + ControllerRegistry::notify_sensor_update(this); + #endif +} +``` diff --git a/partitioned_callback_final_proposal.md b/partitioned_callback_final_proposal.md new file mode 100644 index 0000000000..fb67e5cab2 --- /dev/null +++ b/partitioned_callback_final_proposal.md @@ -0,0 +1,286 @@ +# Partitioned Callback Vector - Final Proposal + +## Design + +Use a **single vector** partitioned into filtered and raw sections, managed with **swap** to maintain O(1) insertion: + +```cpp +// Layout: [filtered_0, ..., filtered_n-1, raw_0, ..., raw_m-1] +// ^ ^ +// 0 filtered_count_ +``` + +## Implementation + +### Header (sensor.h) +```cpp +class Sensor : public EntityBase, /* ... */ { + public: + void add_on_state_callback(std::function &&callback); + void add_on_raw_state_callback(std::function &&callback); + void internal_send_state_to_frontend(float state); + void publish_state(float state); + + protected: + struct Callbacks { + std::vector> callbacks_; // 12 bytes + uint8_t filtered_count_{0}; // 1 byte (+ 3 padding) + // Total: 16 bytes on ESP32 + + void add_filtered(std::function &&fn) { + callbacks_.push_back(std::move(fn)); + if (filtered_count_ < callbacks_.size() - 1) { + // Swap new callback into filtered section + std::swap(callbacks_[filtered_count_], callbacks_[callbacks_.size() - 1]); + } + filtered_count_++; + } + + void add_raw(std::function &&fn) { + callbacks_.push_back(std::move(fn)); + } + + void call_filtered(float value) { + for (size_t i = 0; i < filtered_count_; i++) { + callbacks_[i](value); + } + } + + void call_raw(float value) { + for (size_t i = filtered_count_; i < callbacks_.size(); i++) { + callbacks_[i](value); + } + } + }; + + std::unique_ptr callbacks_; // 4 bytes, lazy allocated +}; +``` + +### Implementation (sensor.cpp) +```cpp +void Sensor::add_on_state_callback(std::function &&callback) { + if (!this->callbacks_) { + this->callbacks_ = std::make_unique(); + } + this->callbacks_->add_filtered(std::move(callback)); +} + +void Sensor::add_on_raw_state_callback(std::function &&callback) { + if (!this->callbacks_) { + this->callbacks_ = std::make_unique(); + } + this->callbacks_->add_raw(std::move(callback)); +} + +void Sensor::publish_state(float state) { + this->raw_state = state; + + // Call raw callbacks (before filters) + if (this->callbacks_) { + this->callbacks_->call_raw(state); + } + + ESP_LOGV(TAG, "'%s': Received new state %f", this->name_.c_str(), state); + + // ... filter logic ... +} + +void Sensor::internal_send_state_to_frontend(float state) { + this->set_has_state(true); + this->state = state; + + ESP_LOGD(TAG, "'%s': Sending state %.5f %s with %d decimals of accuracy", + this->get_name().c_str(), state, this->get_unit_of_measurement_ref().c_str(), + this->get_accuracy_decimals()); + + // Call filtered callbacks (after filters) + if (this->callbacks_) { + this->callbacks_->call_filtered(state); + } + +#if defined(USE_SENSOR) && defined(USE_CONTROLLER_REGISTRY) + ControllerRegistry::notify_sensor_update(this); +#endif +} +``` + +## Memory Comparison (ESP32 32-bit) + +### Current Implementation +```cpp +std::unique_ptr> raw_callback_; // 4 bytes +CallbackManager callback_; // 12 bytes +``` + +| Scenario | Current | Partitioned | Savings | +|----------|---------|-------------|---------| +| No callbacks | 16 bytes | 4 bytes | **12 bytes ✅** | +| 1 filtered (MQTT) | 32 bytes | 33 bytes | -1 byte ⚠️ | +| 2 filtered | 48 bytes | 49 bytes | -1 byte ⚠️ | +| 1 raw + 1 filtered | 60 bytes | 49 bytes | **11 bytes ✅** | +| 2 raw + 2 filtered | 92 bytes | 65 bytes | **27 bytes ✅** | + +Wait, let me recalculate this more carefully... + +### Corrected Memory Analysis + +**Current:** +- No callbacks: 16 bytes (4 ptr + 12 vec) +- 1 filtered: 16 + 16 (fn) = 32 bytes +- 2 filtered: 16 + 32 (2 fns) = 48 bytes +- 1 raw + 1 filtered: 16 + 12 (raw vec) + 16 (raw fn) + 16 (filtered fn) = 60 bytes + +**Partitioned:** +- No callbacks: 4 bytes (nullptr) +- 1 filtered: 4 (ptr) + 16 (Callbacks struct) + 16 (fn) = 36 bytes +- 2 filtered: 4 + 16 + 32 (2 fns) = 52 bytes +- 1 raw + 1 filtered: 4 + 16 + 32 (2 fns) = 52 bytes + +Hmm, the struct is 16 bytes (12 vec + 1 count + 3 padding), so: + +Actually on ESP32: +- std::vector = 12 bytes (3 pointers) +- uint8_t = 1 byte +- padding = 3 bytes (to align to 4) +- Total struct: 16 bytes + +But when heap allocated, the struct size is what matters for memory consumption. Let me revise: + +**Partitioned (heap-allocated Callbacks struct):** +- Callbacks struct on heap: 12 (vector struct) + 1 (count) + 3 (padding) = 16 bytes +- Vector data on heap: N × 16 bytes for N callbacks + +So: +- No callbacks: 4 bytes (nullptr) ✅ SAVES 12 +- 1 filtered: 4 (ptr) + 16 (struct) + 16 (fn) = 36 bytes ⚠️ COSTS 4 +- 2 filtered: 4 + 16 + 32 = 52 bytes ⚠️ COSTS 4 +- 1 raw + 1 filtered: 4 + 16 + 32 = 52 bytes ✅ SAVES 8 + +Actually wait - in the current implementation, when we have raw + filtered, we have: +- 16 bytes base +- 12 bytes for raw CallbackManager (heap allocated) +- 16 bytes for raw std::function +- 16 bytes for filtered std::function += 60 bytes total + +With partitioned: +- 4 bytes (ptr) +- 16 bytes (Callbacks struct on heap) +- 16 bytes (raw fn) +- 16 bytes (filtered fn) += 52 bytes total + +SAVES 8 bytes ✅ + +Let me make a cleaner table: + +| Scenario | Current | Partitioned | Savings | +|----------|---------|-------------|---------| +| No callbacks | 16 | 4 | **+12 ✅** | +| 1 filtered only | 32 | 36 | **-4 ⚠️** | +| 1 raw only | 44 | 36 | **+8 ✅** | +| 1 raw + 1 filtered | 60 | 52 | **+8 ✅** | +| 2 filtered only | 48 | 52 | **-4 ⚠️** | +| 10 API-only sensors | 160 | 40 | **+120 ✅** | +| 10 MQTT sensors | 320 | 360 | **-40 ⚠️** | + +## Performance Characteristics + +### Time Complexity +- `add_filtered()`: **O(1)** - append + swap +- `add_raw()`: **O(1)** - append +- `call_filtered()`: **O(n)** - iterate filtered section +- `call_raw()`: **O(m)** - iterate raw section + +### Hot Path (publish_state) +**Before:** +```cpp +if (this->callback_) { + this->callback_.call(state); // Direct call +} +``` + +**After:** +```cpp +if (this->callbacks_) { + for (size_t i = 0; i < callbacks_->filtered_count_; i++) { + callbacks_->callbacks_[i](state); + } +} +``` + +**Performance impact:** +- Adds nullptr check (already present for raw_callback_) +- Loop is tight, no branching inside +- Better cache locality than separate vectors +- Negligible impact for 0-2 callbacks (typical case) + +## Advantages + +1. ✅ **Best memory savings**: 12 bytes per entity without callbacks +2. ✅ **O(1) insertion**: Both filtered and raw use append (+ optional swap) +3. ✅ **No branching**: Hot path has no `if (type == FILTERED)` checks +4. ✅ **Cache friendly**: Callbacks stored contiguously +5. ✅ **Simple**: One container instead of two +6. ✅ **Minimal overhead**: Only 1 byte (+ padding) for partition count + +## Disadvantages + +1. ⚠️ **Costs 4 bytes** for entities with callbacks (vs current) + - But saves 12 bytes for entities WITHOUT callbacks (more common after Controller Registry) + +2. ⚠️ **Swap on filtered insertion** + - Only during setup(), not runtime + - O(1) operation + - Negligible impact + +3. ⚠️ **Order not preserved** within each section + - Not a problem - callback order doesn't matter + - MQTT and automation callbacks are independent + +## Recommendation + +**IMPLEMENT THIS!** + +The partitioned vector with swap is the optimal design because: +- After Controller Registry, most entities have 0 callbacks (12-byte savings) +- Entities with callbacks pay only 4 extra bytes +- O(1) operations, no performance degradation +- Cleaner, simpler code + +**Migration strategy:** +1. Implement for Sensor first +2. Measure real-world impact on flash/RAM +3. Apply to BinarySensor, TextSensor +4. Expand to other entity types (Climate, Fan, etc.) + +## Code Reusability + +The `Callbacks` struct can be templated for reuse across entity types: + +```cpp +template +struct PartitionedCallbacks { + std::vector> callbacks_; + uint8_t filtered_count_{0}; + + void add_filtered(std::function &&fn) { /* ... */ } + void add_raw(std::function &&fn) { /* ... */ } + void call_filtered(Args... args) { /* ... */ } + void call_raw(Args... args) { /* ... */ } +}; + +// Usage in different entity types: +class Sensor { + std::unique_ptr> callbacks_; +}; + +class TextSensor { + std::unique_ptr> callbacks_; +}; + +class Climate { + std::unique_ptr> callbacks_; +}; +``` diff --git a/unified_callback_proposal.md b/unified_callback_proposal.md new file mode 100644 index 0000000000..d92ce26c92 --- /dev/null +++ b/unified_callback_proposal.md @@ -0,0 +1,208 @@ +# Unified Callback Storage - Proposal + +## Current Implementation (ESP32 32-bit) + +```cpp +class Sensor { + protected: + std::unique_ptr> raw_callback_; // 4 bytes + CallbackManager callback_; // 12 bytes +}; +``` + +**Memory costs:** +- No callbacks: **16 bytes** +- 1 filtered callback (MQTT): **32 bytes** (16 + 16 function) +- 1 raw + 1 filtered: **60 bytes** (16 + 12 + 16 + 16) + +## Proposed: Single Vector with Type Flag + +```cpp +class Sensor { + protected: + enum CallbackType : uint8_t { RAW, FILTERED }; + + struct CallbackEntry { + CallbackType type; // 1 byte (+ 3 bytes padding) + std::function fn; // 16 bytes + // Total: 20 bytes per callback + }; + + std::unique_ptr> callbacks_; // 4 bytes, lazy allocated +}; +``` + +**Memory costs:** +- No callbacks: **4 bytes** ✅ **SAVES 12 BYTES** +- 1 filtered callback (MQTT): **36 bytes** (4 + 12 vector + 20 entry) ⚠️ **COSTS 4 BYTES** +- 1 raw + 1 filtered: **56 bytes** (4 + 12 + 20 + 20) ✅ **SAVES 4 BYTES** + +## Alternative: Two Vectors in One Struct + +```cpp +struct SensorCallbacks { + std::vector> raw_callbacks; // 12 bytes + std::vector> filtered_callbacks; // 12 bytes + // Total: 24 bytes overhead +}; + +class Sensor { + protected: + std::unique_ptr callbacks_; // 4 bytes, lazy allocated +}; +``` + +**Memory costs:** +- No callbacks: **4 bytes** ✅ **SAVES 12 BYTES** +- 1 filtered callback: **44 bytes** (4 + 24 struct + 16 function) ❌ **COSTS 12 BYTES** +- 1 raw + 1 filtered: **60 bytes** (4 + 24 + 16 + 16) ✅ **SAME** + +## Recommendation: Single Vector with Type + +The single vector approach is better because: +1. ✅ Same 12-byte savings when no callbacks (most common after Controller Registry) +2. ✅ Only 4-byte cost when callbacks ARE used (vs 12-byte cost for two-vectors) +3. ✅ Simpler: one vector to manage instead of two +4. ✅ Better cache locality: callbacks stored together + +## Implementation + +### Header +```cpp +class Sensor : public EntityBase, /* ... */ { + public: + void add_on_state_callback(std::function &&callback); + void add_on_raw_state_callback(std::function &&callback); + void internal_send_state_to_frontend(float state); + + protected: + enum CallbackType : uint8_t { RAW, FILTERED }; + + struct CallbackEntry { + CallbackType type; + std::function fn; + }; + + std::unique_ptr> callbacks_; + + // Helper to allocate on first use + void ensure_callbacks_() { + if (!this->callbacks_) { + this->callbacks_ = std::make_unique>(); + } + } +}; +``` + +### Implementation +```cpp +void Sensor::add_on_state_callback(std::function &&callback) { + this->ensure_callbacks_(); + this->callbacks_->push_back({FILTERED, std::move(callback)}); +} + +void Sensor::add_on_raw_state_callback(std::function &&callback) { + this->ensure_callbacks_(); + this->callbacks_->push_back({RAW, std::move(callback)}); +} + +void Sensor::internal_send_state_to_frontend(float state) { + // Call filtered callbacks + if (this->callbacks_) { + for (auto &entry : *this->callbacks_) { + if (entry.type == FILTERED) { + entry.fn(state); + } + } + } + +#ifdef USE_CONTROLLER_REGISTRY + ControllerRegistry::notify_sensor_update(this); +#endif +} + +void Sensor::publish_state(float state) { + // Call raw callbacks first + if (this->callbacks_) { + for (auto &entry : *this->callbacks_) { + if (entry.type == RAW) { + entry.fn(state); + } + } + } + + // Then filters, which eventually call internal_send_state_to_frontend + // ... existing filter logic ... +} +``` + +## Performance Impact + +**Hot path (publish_state):** +- Before: Direct vector iteration +- After: Vector iteration with type check (`if (entry.type == FILTERED)`) +- Cost: ~1 CPU cycle per callback (trivial branch) +- Most entities have 0-2 callbacks, so negligible + +**Cold path (add_on_state_callback):** +- Extra check + possible allocation +- Only happens during setup() +- Negligible impact + +## Migration Path + +1. **Start with Sensor** (most common entity type) +2. Measure real-world impact +3. Apply to other entities: BinarySensor, TextSensor, Climate, Fan, etc. + +## Real-World Scenarios + +### Scenario 1: API-only device (10 sensors, no MQTT) +**Before:** 10 × 16 = 160 bytes +**After:** 10 × 4 = 40 bytes +**Saves: 120 bytes** ✅ + +### Scenario 2: MQTT-enabled (10 sensors) +**Before:** 10 × 32 = 320 bytes +**After:** 10 × 36 = 360 bytes +**Costs: 40 bytes** ⚠️ + +### Scenario 3: Mixed (5 API-only + 5 MQTT) +**Before:** (5 × 16) + (5 × 32) = 240 bytes +**After:** (5 × 4) + (5 × 36) = 200 bytes +**Saves: 40 bytes** ✅ + +### Scenario 4: Heavy automation (10 sensors, MQTT + automation each) +**Before:** 10 × (16 + 16 + 16) = 480 bytes (base + MQTT fn + automation fn) +**After:** 10 × (4 + 12 + 20 + 20) = 560 bytes (ptr + vector + 2 entries) +**Costs: 80 bytes** ⚠️ + +Wait, let me recalculate this more carefully... + +Actually with 2 callbacks: +**Before:** 16 (base) + 16 (MQTT fn) + 16 (automation fn) = 48 bytes per sensor +**After:** 4 (ptr) + 12 (vector) + 20 (MQTT entry) + 20 (automation entry) = 56 bytes per sensor +**Costs: 8 bytes per sensor with 2+ callbacks** ⚠️ + +### Revised Decision Matrix + +**Wins:** +- Entities with 0 callbacks: Save 12 bytes (most common after Controller Registry) +- Entities with 2+ different types (raw + filtered): Save 4 bytes + +**Loses:** +- Entities with 1 callback: Cost 4 bytes +- Entities with 2+ callbacks of same type: Cost 8 bytes + +**Net benefit:** Positive if >75% of entities have 0 callbacks + +## Conclusion + +**RECOMMENDED** for implementation because: +1. ✅ After Controller Registry, most entities are API-only (0 callbacks) +2. ✅ 12-byte savings per entity adds up quickly (120 bytes for 10 sensors) +3. ✅ 4-8 byte cost for MQTT users is acceptable (MQTT already has ~60+ bytes overhead) +4. ✅ Simpler code: one container instead of two +5. ✅ Negligible performance impact (predictable branch on hot path) + +**Start with Sensor, measure, then expand to other entity types.** diff --git a/updated_plan_remove_event_emitter_replace_simple_callbacks.md b/updated_plan_remove_event_emitter_replace_simple_callbacks.md new file mode 100644 index 0000000000..7c05c9a18f --- /dev/null +++ b/updated_plan_remove_event_emitter_replace_simple_callbacks.md @@ -0,0 +1,204 @@ +# Plan: Remove EventEmitter and Replace with Simple Callbacks + +## Overview + +This plan describes the architectural change from using the EventEmitter component to direct callback storage in ESP32 BLE server classes. This simplifies the codebase by removing template metaprogramming and code generation complexity. + +## Motivation + +The previous EventEmitter-based approach had several issues: +1. **Memory overhead**: Each EventEmitter instance stored a `std::vector` of listeners, even when only 0-1 listeners were needed +2. **Complexity**: Required template metaprogramming with per-UUID specialized classes +3. **Code generation**: Required Python code generation to create specialized classes +4. **Allocation tracking**: Required complex API to pre-allocate listener slots + +## New Approach + +Replace EventEmitter with direct callback storage using `std::function`: + +- Each BLE object (characteristic, descriptor, server) stores callbacks directly as member variables +- Only one callback per event type (sufficient for ESPHome's automation framework) +- No code generation needed +- No allocation tracking needed +- Simpler, more maintainable code + +## Implementation Changes + +### 1. BLECharacteristic (ble_characteristic.h/cpp) + +**Removed:** +- `EventEmitter, uint16_t>` inheritance +- `EventEmitter` inheritance +- `BLECharacteristicEvt` namespace with event enums +- Virtual `emit_on_write_()` and `emit_on_read_()` methods + +**Added:** +- Direct callback registration methods: + ```cpp + void on_write(std::function, uint16_t)> &&callback); + void on_read(std::function &&callback); + ``` +- Callback member variables: + ```cpp + std::function, uint16_t)> on_write_callback_{nullptr}; + std::function on_read_callback_{nullptr}; + ``` + +**Implementation:** +- In `gatts_event_handler()`, replaced `emit_on_write_()` with direct callback invocation: + ```cpp + if (this->on_write_callback_) { + this->on_write_callback_(this->value_, param->write.conn_id); + } + ``` + +### 2. BLEDescriptor (ble_descriptor.h/cpp) + +**Removed:** +- `EventEmitter, uint16_t>` inheritance +- `BLEDescriptorEvt` namespace with event enums +- Virtual `emit_on_write_()` method + +**Added:** +- Direct callback registration: + ```cpp + void on_write(std::function, uint16_t)> &&callback); + ``` +- Callback member variable: + ```cpp + std::function, uint16_t)> on_write_callback_{nullptr}; + ``` + +### 3. BLEServer (ble_server.h/cpp) + +**Removed:** +- `EventEmitter` inheritance +- `BLEServerEvt` namespace with event enums +- Virtual `emit_on_connect_()` and `emit_on_disconnect_()` methods + +**Added:** +- Direct callback registration: + ```cpp + void on_connect(std::function &&callback); + void on_disconnect(std::function &&callback); + ``` +- Callback member variables: + ```cpp + std::function on_connect_callback_{nullptr}; + std::function on_disconnect_callback_{nullptr}; + ``` + +### 4. BLE Server Automations (ble_server_automations.h/cpp) + +**Removed:** +- EventEmitter listener ID tracking (`EventEmitterListenerID`) +- `INVALID_LISTENER_ID` constant +- Complex listener ID management in `BLECharacteristicSetValueActionManager` + +**Simplified:** +- `BLECharacteristicSetValueActionManager` now just tracks presence of listener (not ID) +- Uses simple `has_listener()` check instead of ID comparison +- Callback registration uses direct `on_write()` and `on_read()` methods + +### 5. Python Code Generation (__init__.py) + +**Removed:** +- Entire allocation API (functions to pre-allocate listener slots) +- All code generation for specialized per-UUID classes +- Allocation tracking data structures +- EventEmitter from `AUTO_LOAD` + +**Result:** +- ~700 lines of complex code generation removed +- Simpler `to_code()` functions +- No allocation calls needed in component configurations + +### 6. EventEmitter Component + +**Removed:** +- Entire `esphome/components/event_emitter/` directory deleted +- No longer needed since callbacks are stored directly + +### 7. Components Using BLE Server (esp32_improv) + +**Changes:** +- Removed EventEmitter allocation calls +- Uses direct callback registration via existing interface +- No other changes needed (uses virtual methods that call callbacks) + +## Benefits + +1. **Reduced memory usage**: No `std::vector` overhead for each event type +2. **Simpler codebase**: ~700 lines of complex code removed +3. **Easier to maintain**: No template metaprogramming or code generation +4. **Faster compilation**: No specialized classes to generate +5. **Clearer intent**: Direct callbacks are easier to understand than EventEmitter indirection + +## Technical Details + +### Callback Signature Changes + +Using `std::span` for zero-copy data passing: +- **Before**: `std::vector` (requires copy) +- **After**: `std::span` (zero-copy view) + +### Callback Storage + +Using `std::function` with nullptr initialization: +```cpp +std::function callback_{nullptr}; +``` + +Checked before invocation: +```cpp +if (this->callback_) { + this->callback_(...); +} +``` + +### Callback Registration + +Callbacks moved into storage to avoid copies: +```cpp +void on_write(std::function, uint16_t)> &&callback) { + this->on_write_callback_ = std::move(callback); +} +``` + +### Lambda Captures + +Automation framework registers lambdas with captures: +```cpp +characteristic->on_write([on_write_trigger](std::span data, uint16_t id) { + on_write_trigger->trigger(std::vector(data.begin(), data.end()), id); +}); +``` + +## Testing + +After implementation: +1. Test ESP32 BLE server compilation with various configurations +2. Test esp32_improv component (uses BLE server) +3. Verify memory usage improvements +4. Verify functionality with BLE automations + +## Files Modified + +- `esphome/components/esp32_ble_server/ble_characteristic.h` +- `esphome/components/esp32_ble_server/ble_characteristic.cpp` +- `esphome/components/esp32_ble_server/ble_descriptor.h` +- `esphome/components/esp32_ble_server/ble_descriptor.cpp` +- `esphome/components/esp32_ble_server/ble_server.h` +- `esphome/components/esp32_ble_server/ble_server.cpp` +- `esphome/components/esp32_ble_server/ble_server_automations.h` +- `esphome/components/esp32_ble_server/ble_server_automations.cpp` +- `esphome/components/esp32_ble_server/__init__.py` +- `esphome/components/esp32_improv/__init__.py` + +## Files Deleted + +- `esphome/components/event_emitter/` (entire directory) + +## Summary + +This change dramatically simplifies the ESP32 BLE server implementation while reducing memory usage and improving maintainability. The direct callback approach is more idiomatic C++ and easier to understand than the EventEmitter abstraction. diff --git a/usb_callback_contexts.md b/usb_callback_contexts.md new file mode 100644 index 0000000000..0635605b1c --- /dev/null +++ b/usb_callback_contexts.md @@ -0,0 +1,90 @@ +# USB Host Component - Callback Execution Contexts + +## Overview +After the refactoring to use a dedicated USB task, all USB callbacks now execute in the USB task context, NOT in the main loop. This prevents race conditions and data corruption. + +## USB Task Architecture +```cpp +// USB Task (runs on Core 1, priority 5) +void USBClient::usb_task_loop() { + while (usb_task_running_) { + // This handles ALL USB events and triggers callbacks + usb_host_client_handle_events(handle_, pdMS_TO_TICKS(10)); + } +} +``` + +## Callback Execution Contexts + +### 1. **client_event_cb** (Device Connect/Disconnect) +- **Context**: USB task +- **Triggered by**: USB device connection/disconnection events +- **What it does**: Queues events to main loop via FreeRTOS queue +- **Data flow**: USB hardware → USB task → Queue → Main loop + +### 2. **control_callback** (Control Transfers) +- **Context**: USB task +- **Triggered by**: Completion of USB control transfers +- **What it does**: Queues callback execution to main loop +- **Data flow**: USB hardware → USB task → Queue → Main loop + +### 3. **transfer_callback** (Bulk Transfers) +- **Context**: USB task +- **Triggered by**: Completion of bulk IN/OUT transfers +- **What it does**: Queues callback execution to main loop +- **Data flow**: USB hardware → USB task → Queue → Main loop + +### 4. **USBUartComponent Input Callback** (Data Reception) +- **Context**: USB task (lambda passed to transfer_in) +- **Called from**: transfer_callback in USB task +- **What it does**: + - Copies received data to temporary buffer + - Queues data processing to main loop via defer() + - Main loop then safely pushes to ring buffer +- **Data flow**: USB hardware → USB task → Copy data → Queue → Main loop → Ring buffer + +### 5. **USBUartComponent Output Callback** (Data Transmission) +- **Context**: USB task (lambda passed to transfer_out) +- **Called from**: transfer_callback in USB task +- **What it does**: + - Marks output as not started + - Queues next transfer start to main loop via defer() +- **Data flow**: Main loop reads ring buffer → Start transfer → USB task → Queue restart → Main loop + +## Thread Safety Summary + +### Safe Operations (Main Loop Only): +- Ring buffer push (incoming data) +- Ring buffer pop (outgoing data) +- Component state changes +- Transfer initiation + +### USB Task Operations: +- USB event handling +- Transfer completion detection +- Event queueing to main loop + +### Communication Mechanism: +- **FreeRTOS Queue**: For USB events and callbacks +- **defer()**: For component-specific operations + +## Why This Architecture? + +1. **Prevents data loss**: USB events are handled promptly even when main loop is busy +2. **Thread safety**: Ring buffer is only accessed from main loop +3. **No race conditions**: Data structures aren't shared between tasks +4. **Maintains responsiveness**: USB hardware FIFOs don't overflow + +## Key Changes from Original Implementation + +### Before (Problematic): +- `usb_host_client_handle_events()` called in main loop +- Callbacks executed in main loop context +- USB events dropped when main loop was busy +- Ring buffer corruption when data arrived during slow processing + +### After (Fixed): +- `usb_host_client_handle_events()` runs in dedicated task +- Callbacks execute in USB task, queue work to main loop +- USB events always handled promptly +- Ring buffer only accessed from main loop (thread-safe)