// RL-Tuned MPC for Multi-Zone HVAC using ESP32 + Wokwi
// Components: 2x Servo, 2x DHT22, DIP Switch 8, I2C 20x4 LCD
// Version 2: Corrected control logic and simulation physics
#include <Wire.h>
#include <LiquidCrystal_I2C.h>
#include <DHT.h>
#include <ESP32Servo.h>
// ==== Constants ====
// GPIOs handed to the two DHT sensor objects (dht1, dht2).
#define DHTPIN1 25
#define DHTPIN2 26
// Sensor model constant passed to the DHT constructor.
#define DHTTYPE DHT22
// GPIOs the two servos are attached to in setup().
#define SERVO1_PIN 18
#define SERVO2_PIN 19
// I2C address passed to the LiquidCrystal_I2C constructor.
#define I2C_ADDR 0x27
// Character geometry of the LCD: 20 columns x 4 rows.
#define LCD_COLS 20
#define LCD_ROWS 4
// Number of candidate lambda values; sizes lambdas[], Q[] and N[].
#define NUM_LAMBDAS 5
// Number of simulated zones; sizes zoneTemp[].
#define NUM_ZONES 2
// Probability of picking a random lambda in chooseLambdaIndex().
#define EPSILON 0.2
// Learning rate (step size) used by updateQ().
#define ALPHA 0.1
// Setpoint in degC when the 2-bit DIP setpoint value is 0.
#define SETPOINT_BASE 22.0
// degC added to the setpoint per unit of the 2-bit DIP setpoint value.
#define SETPOINT_STEP 1.0
// DIP switch GPIOs, indexed by switch position. As used in the visible code:
//   [0],[1]  setpoint bits              (getSetpoint)
//   [2]      manual / fixed-lambda mode (useFixedLambda)
//   [3],[4]  fixed-lambda index bits    (getFixedLambdaIndex)
//   [5]      RL memory reset            (resetRL)
//   [6],[7]  configured as inputs but not read by any function shown here
// NOTE(review): on the classic ESP32, GPIO34 and GPIO35 are input-only and
// have no internal pull-up, so the INPUT_PULLUP requested in
// initDIPSwitches() presumably has no effect on them and external resistors
// would be needed on real hardware - confirm for the target board/simulator.
// NOTE(review): GPIO12 is a boot strapping pin on the classic ESP32; verify
// that the switch wired to it cannot disturb boot.
const int dipPins[8] = {32, 33, 35, 34, 27, 12, 13, 14};
// ==== Globals ====
// DHT sensor objects. They are started in setup(); the loop() shown in this
// file does not read them and uses the simulated zoneTemp[] instead.
DHT dht1(DHTPIN1, DHTTYPE);
DHT dht2(DHTPIN2, DHTTYPE);
// One servo per zone; each is driven from applyActuation().
Servo servo1, servo2;
// Character LCD used for the splash screen and the per-cycle status display.
LiquidCrystal_I2C lcd(I2C_ADDR, LCD_COLS, LCD_ROWS);
// Candidate lambda values the controller can choose between.
float lambdas[NUM_LAMBDAS] = {0.1, 0.5, 1.0, 2.0, 5.0};
// Running value estimate for each lambda (same indexing as lambdas[]).
float Q[NUM_LAMBDAS] = {0};
// How many times each lambda has been updated via updateQ(); written there
// and cleared on reset, but not read elsewhere in the visible code.
int N[NUM_LAMBDAS] = {0};
// Simulated temperature of each zone in degC. Both zones start at 15.0,
// below the lowest selectable setpoint (SETPOINT_BASE = 22.0).
float zoneTemp[NUM_ZONES] = {15.0, 15.0};
// ==== Functions ====
// Target temperature selected by DIP switches 0 and 1.
// The two pin levels form a 2-bit number (switch 0 = bit 0, switch 1 = bit 1);
// that number times SETPOINT_STEP is added to SETPOINT_BASE, which yields
// 22, 23, 24 or 25 degC with the constants defined in this file.
// NOTE(review): the pins are configured INPUT_PULLUP in initDIPSwitches(), so
// an open input reads 1 here - confirm the switch wiring polarity.
float getSetpoint() {
  const int bit0 = digitalRead(dipPins[0]);
  const int bit1 = digitalRead(dipPins[1]);
  const int steps = bit0 + 2 * bit1;
  return SETPOINT_BASE + steps * SETPOINT_STEP;
}
// Manual-mode selector: returns true when DIP switch 2 reads HIGH, in which
// case loop() uses the DIP-selected lambda instead of epsilon-greedy choice.
bool useFixedLambda() {
  const int modeLevel = digitalRead(dipPins[2]);
  return modeLevel == HIGH;
}
// Index into lambdas[] requested by DIP switches 3 (bit 0) and 4 (bit 1),
// clamped to the valid range [0, NUM_LAMBDAS - 1].
// NOTE(review): two bits encode only 0..3, so with NUM_LAMBDAS == 5 the last
// lambda cannot be selected manually and the clamp never alters the value.
int getFixedLambdaIndex() {
  const int lowBit = digitalRead(dipPins[3]);
  const int highBit = digitalRead(dipPins[4]);
  const int requested = lowBit + 2 * highBit;
  return constrain(requested, 0, NUM_LAMBDAS - 1);
}
// Reset request: returns true while DIP switch 5 reads HIGH. loop() clears
// Q[] and N[] for as long as this stays true.
// NOTE(review): the pin is configured INPUT_PULLUP in initDIPSwitches(); if
// the idle switch position leaves it HIGH the controller would stay in reset
// - confirm the wiring polarity.
bool resetRL() {
  const int resetLevel = digitalRead(dipPins[5]);
  return resetLevel == HIGH;
}
// Configures every GPIO listed in dipPins[] as an input with the internal
// pull-up requested.
// NOTE(review): GPIO34/35 on the classic ESP32 have no internal pull-up, so
// the request presumably has no effect on those two pins - confirm.
void initDIPSwitches() {
  for (const int pin : dipPins) {
    pinMode(pin, INPUT_PULLUP);
  }
}
// One-time initialisation: opens the serial port and emits the CSV header,
// starts the DHT sensors, attaches the servos, brings up the LCD, configures
// the DIP switch inputs and shows a two-line splash screen for two seconds.
void setup() {
  // Serial log: the header names the columns that loop() prints each cycle.
  Serial.begin(115200);
  Serial.println("Time,T1,T2,U1,U2,Lambda,Reward,Q0,Q1,Q2,Q3,Q4");

  // Sensors and actuators.
  dht1.begin();
  dht2.begin();
  servo1.attach(SERVO1_PIN);
  servo2.attach(SERVO2_PIN);

  // Display and switch inputs.
  lcd.init();
  lcd.backlight();
  initDIPSwitches();

  // Splash screen: one entry per LCD row, starting at row 0.
  const char *const splash[] = {"RL HVAC Controller", "V2 - Corrected"};
  lcd.clear();
  for (int row = 0; row < 2; ++row) {
    lcd.setCursor(0, row);
    lcd.print(splash[row]);
  }
  delay(2000);
}
// Epsilon-greedy selection of an index into lambdas[].
// With probability EPSILON (drawn at 1/1000 resolution) a uniformly random
// index is returned; otherwise the index with the largest Q[] entry is
// returned, the lowest index winning ties.
int chooseLambdaIndex() {
  // Exploration draw happens first so the random() call order is fixed.
  const bool explore = (random(1000) / 1000.0) < EPSILON;
  if (explore) {
    return random(NUM_LAMBDAS);
  }

  // Exploitation: linear scan for the best estimate. The sentinel is far
  // below any estimate expected here; index 0 is the fallback if nothing
  // exceeds it.
  int winner = 0;
  float winnerQ = -999999.0;
  for (int k = 0; k < NUM_LAMBDAS; ++k) {
    if (Q[k] > winnerQ) {
      winner = k;
      winnerQ = Q[k];
    }
  }
  return winner;
}
// Simplified stand-in for an MPC step: a heating-only proportional law.
//   currentTemp - zone temperature in degC
//   lambda      - damping weight; a larger lambda gives a smaller output
// Returns the heater command in [0, 1]: 0 when the zone is at or above the
// current setpoint, otherwise (setpoint - currentTemp) / (1 + lambda)
// clamped to that range.
float computeMPC(float currentTemp, float lambda) {
  const float tempDeficit = getSetpoint() - currentTemp;

  // No heating demand once the setpoint has been reached or exceeded.
  if (tempDeficit <= 0) {
    return 0.0;
  }

  const float demand = tempDeficit / (1.0 + lambda);
  return constrain(demand, 0, 1);
}
// Drives one zone's servo and advances that zone's simulated temperature.
//   control - heater command, expected in [0, 1]
//   servo   - servo to position for this zone
//   zoneIdx - index into zoneTemp[] (not range-checked here)
// Returns the zone's new simulated temperature, which is also stored back
// into zoneTemp[zoneIdx].
//
// Simulated heating model, applied once per call:
//   T_next = 0.98 * T + 4.0 * control + noise
// The 0.98 factor is the thermal persistence of the previous temperature,
// 4.0 is the heater gain (positive, so heating raises the temperature), and
// noise is a random value in [-0.10, +0.10] in steps of 0.01.
float applyActuation(float control, Servo &servo, int zoneIdx) {
  // Servo position: command as a percentage, mapped linearly to 0..180 degrees.
  const int angle = map(control * 100, 0, 100, 0, 180);
  servo.write(angle);

  const float persistence = 0.98;
  const float heaterGain = 4.0;
  const float jitter = random(-10, 11) * 0.01;

  float &temp = zoneTemp[zoneIdx];
  temp = persistence * temp + heaterGain * control + jitter;
  return temp;
}
// Reward for one zone after a control step (always <= 0).
//   temp    - zone temperature after the step, degC
//   control - heater command that was applied
//   lambda  - weight on the energy term
// Returns -((temp - setpoint)^2 + lambda * control^2): the squared comfort
// error penalises large deviations more heavily, and the energy term grows
// with both the command and lambda.
float computeReward(float temp, float control, float lambda) {
  const float deviation = temp - getSetpoint();
  // Squares are formed in double precision and then narrowed to float.
  const float comfortCost = static_cast<double>(deviation) * deviation;
  const float energyCost = lambda * (static_cast<double>(control) * control);
  return -(comfortCost + energyCost);
}
// Moves the value estimate of the chosen lambda a fraction ALPHA of the way
// towards the reward just observed (an exponential recency-weighted average;
// there is no next-state term), and counts the update in N[].
//   lambdaIdx - index into Q[] / N[] (not range-checked here)
//   reward    - reward obtained with that lambda
void updateQ(int lambdaIdx, float reward) {
  float &estimate = Q[lambdaIdx];
  estimate += ALPHA * (reward - estimate);
  ++N[lambdaIdx];
}
// Redraws the whole LCD with the current status.
//   row 0 / row 1: zone temperature and heater command (as a percentage)
//   row 2:         lambda in use and the reward for this cycle
//   row 3:         current setpoint and operating mode (MAN / AUTO)
// t1, t2 are zone temperatures in degC; u1, u2 are heater commands in [0, 1].
void displayStatus(float t1, float t2, float u1, float u2, float lambda, float reward) {
  const float duty1Pct = u1 * 100;
  const float duty2Pct = u2 * 100;

  lcd.clear();

  lcd.setCursor(0, 0);
  lcd.printf("Z1=%.1fC u1=%.0f%%", t1, duty1Pct);

  lcd.setCursor(0, 1);
  lcd.printf("Z2=%.1fC u2=%.0f%%", t2, duty2Pct);

  lcd.setCursor(0, 2);
  lcd.printf("lambda=%.1f r=%.2f", lambda, reward);

  lcd.setCursor(0, 3);
  const char *modeLabel = useFixedLambda() ? "MAN" : "AUTO";
  lcd.printf("SP=%.1f Mode:%s", getSetpoint(), modeLabel);
}
// One control cycle, repeated about once per second:
//   reset check -> choose lambda -> compute heater commands -> simulate the
//   zones -> compute the reward -> learn (AUTO mode only) -> LCD + CSV log.
// Temperatures come from the simulated zoneTemp[] state, not from the DHT
// sensors.
void loop() {
  // The CSV row at the end of this function prints Q[0]..Q[4] explicitly.
  static_assert(NUM_LAMBDAS == 5, "Serial log prints exactly Q0..Q4");

  // Reset request from the DIP switch: clear all learned values and skip
  // the rest of this cycle.
  if (resetRL()) {
    for (int i = 0; i < NUM_LAMBDAS; i++) {
      Q[i] = 0;
      N[i] = 0;
    }
    lcd.setCursor(0, 2);
    lcd.print("RL Memory Reset!");
    // The banner is 16 characters on a 20-column row; blank the remaining
    // columns so the tail of the previous status line does not show through.
    lcd.print("    ");
    delay(1500);
    return;
  }

  // Sample the mode switch once per cycle so that action selection and the
  // learning step below always agree, even if the switch is toggled while
  // this cycle is running.
  const bool manualMode = useFixedLambda();

  // 1. CHOOSE ACTION: DIP-selected lambda in manual mode, epsilon-greedy otherwise.
  const int lambdaIdx = manualMode ? getFixedLambdaIndex() : chooseLambdaIndex();
  const float lambda = lambdas[lambdaIdx];

  // Current (simulated) zone temperatures.
  const float t1 = zoneTemp[0];
  const float t2 = zoneTemp[1];

  // 2. COMPUTE CONTROL: heater command for each zone.
  const float u1 = computeMPC(t1, lambda);
  const float u2 = computeMPC(t2, lambda);

  // 3. APPLY & SIMULATE: move the servos and advance the simulated temperatures.
  const float newT1 = applyActuation(u1, servo1, 0);
  const float newT2 = applyActuation(u2, servo2, 1);

  // 4. GET REWARD: sum of the per-zone rewards for this step.
  const float reward = computeReward(newT1, u1, lambda) + computeReward(newT2, u2, lambda);

  // 5. LEARN: only when the lambda was chosen by the learner itself.
  if (!manualMode) {
    updateQ(lambdaIdx, reward);
  }

  // 6. DISPLAY: refresh the LCD.
  displayStatus(newT1, newT2, u1, u2, lambda, reward);

  // CSV log row; columns match the header printed in setup().
  static int timeStep = 0;
  Serial.printf("%d,%.2f,%.2f,%.2f,%.2f,%.1f,%.2f,%.2f,%.2f,%.2f,%.2f,%.2f\n",
                timeStep++,
                newT1,
                newT2,
                u1,
                u2,
                lambda,
                reward,
                Q[0], Q[1], Q[2], Q[3], Q[4]);

  delay(1000);
}