diff --git a/MARL.xmind b/MARL.xmind
new file mode 100644
index 0000000..5ecfe55
Binary files /dev/null and b/MARL.xmind differ
diff --git a/Reinforcement Learning.md b/Reinforcement Learning.md
new file mode 100644
index 0000000..7d896b3
--- /dev/null
+++ b/Reinforcement Learning.md
@@ -0,0 +1,208 @@
# Reinforcement Learning

## value-based

## policy-based

### General approach

- Objective function

$$
J(\theta)=\mathbb{E}_{S}[V_{\pi}(S)]
$$
- Policy gradient theorem

$$
\frac{\partial J(\theta)}{\partial \theta}=\nabla_{\theta}J(\theta)=\mathbb{E}_{S}\Big[\mathbb{E}_{A\sim \pi(\cdot | S;\theta)}\Big[\frac{\partial \ln\pi(A|S;\theta)}{\partial \theta}\cdot Q_{\pi}(S,A)\Big]\Big]
$$
- Stochastic gradient (an unbiased estimate of the policy gradient)

$$
g(s,a;\theta)\triangleq Q_\pi(s,a)\cdot \nabla_{\theta}\ln\pi(a|s;\theta)
$$
- Policy network update

$$
\theta\gets\theta+\beta\cdot g(s,a;\theta)
$$

### Policy gradient with a baseline

- Policy gradient theorem with a baseline, where b is any function that does not depend on A

$$
\nabla_{\theta}J(\theta)=\mathbb{E}_{S}\big[\mathbb{E}_{A\sim \pi(\cdot | S;\theta)}[(Q_{\pi}(S,A)-b)\cdot \nabla_{\theta}\ln\pi(A|S;\theta)]\big]
$$
- Stochastic gradient

$$
g_b(s,a;\theta)=[Q_{\pi}(s,a)-b]\cdot\nabla_{\theta}\ln\pi(a|s;\theta)
$$

### REINFORCE

- Discounted return

$$
U_t=\sum_{k=t}^{n}\gamma^{k-t}\cdot R_k
$$
- The action value is the expected discounted return

$$
Q_\pi(s_t,a_t)=\mathbb{E}[U_t|S_t=s_t,A_t=a_t]
$$
- Monte Carlo approximation of the action value with the observed discounted return

$$
\tilde{g}(s_t,a_t;\theta)=u_t\cdot\nabla_{\theta}\ln\pi(a_t|s_t;\theta)
$$
- Policy network update

$$
\theta_{\text{new}}\gets\theta_{\text{now}}+\beta\cdot\sum_{t=1}^{n}\gamma^{t-1}\cdot\tilde{g}(s_t,a_t;\theta_{\text{now}})
$$

### REINFORCE with a baseline

(A code sketch of the full update appears right after this section.)

- Policy network

  - Discounted return

$$
u_t=\sum_{k=t}^{n}\gamma^{k-t}\cdot r_k
$$
  - Baseline: the value network's prediction

$$
\hat{v}_t=v(s_t;\omega)
$$
  - Policy gradient with the baseline

$$
\tilde{g}(s_t,a_t;\theta)=(u_t-\hat{v}_t)\cdot\nabla_{\theta}\ln\pi(a_t|s_t;\theta)
$$
  - Gradient ascent

$$
\theta\gets\theta+\beta\cdot\tilde{g}(s_t,a_t;\theta)
$$

- Value network

  - Loss function

$$
L(\omega)=\frac{1}{2n}\sum_{t=1}^{n}[v(s_t;\omega)-u_t]^2
$$
  - Gradient of the loss function

$$
\nabla_{\omega}L(\omega)=\frac{1}{n}\sum_{t=1}^{n}[v(s_t;\omega)-u_t]\cdot\nabla_\omega v(s_t;\omega)
$$
  - Gradient descent

$$
\omega\gets\omega-\alpha\cdot\nabla_{\omega}L(\omega)
$$
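The following is a minimal sketch of the REINFORCE-with-baseline updates above, applied once per completed episode. It assumes PyTorch, a discrete action space, and illustrative shapes (4-dimensional states, 2 actions); the network architectures, learning rates, and the `reinforce_baseline_update` helper are assumptions added for illustration, not part of the original notes.

```python
# Illustrative sketch of REINFORCE with a learned baseline (assumed PyTorch setup;
# network shapes and hyperparameters are placeholders, not from the original notes).
import torch
import torch.nn as nn
from torch.distributions import Categorical

policy = nn.Sequential(nn.Linear(4, 64), nn.Tanh(), nn.Linear(64, 2))  # pi(a|s; theta)
value = nn.Sequential(nn.Linear(4, 64), nn.Tanh(), nn.Linear(64, 1))   # baseline v(s; omega)
policy_opt = torch.optim.SGD(policy.parameters(), lr=1e-3)  # step size beta
value_opt = torch.optim.SGD(value.parameters(), lr=1e-2)    # step size alpha

def reinforce_baseline_update(states, actions, rewards, gamma=0.99):
    """One update from a single completed episode (lists of states, actions, rewards)."""
    # u_t = sum_{k=t}^{n} gamma^{k-t} * r_k, computed backwards through the episode
    returns, u = [], 0.0
    for r in reversed(rewards):
        u = r + gamma * u
        returns.insert(0, u)

    states = torch.as_tensor(states, dtype=torch.float32)    # (n, 4)
    actions = torch.as_tensor(actions)                        # (n,)
    returns = torch.as_tensor(returns, dtype=torch.float32)  # (n,)

    v_hat = value(states).squeeze(-1)  # baseline v_hat_t = v(s_t; omega)

    # Policy gradient with baseline: (u_t - v_hat_t) * grad log pi(a_t|s_t; theta),
    # averaged over the episode; gradient ascent is done by minimizing the negated
    # objective. The baseline is detached so this step only moves theta.
    log_pi = Categorical(logits=policy(states)).log_prob(actions)
    policy_loss = -((returns - v_hat.detach()) * log_pi).mean()
    policy_opt.zero_grad()
    policy_loss.backward()
    policy_opt.step()

    # Value network: L(omega) = 1/(2n) * sum_t [v(s_t; omega) - u_t]^2, gradient descent on omega.
    value_loss = 0.5 * ((v_hat - returns) ** 2).mean()
    value_opt.zero_grad()
    value_loss.backward()
    value_opt.step()
```

Detaching the baseline in the policy term keeps the roles separate, as in the notes: here the value network only serves as a baseline and receives no gradient from the policy objective.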
### Actor-Critic

- Policy network

  - Approximate the action value with a value network q(s,a;ω)

$$
\hat{g}(s,a;\theta)\triangleq q(s,a;\omega)\cdot\nabla_{\theta}\ln\pi(a|s;\theta)
$$
  - Policy network update

$$
\theta\gets\theta+\beta\cdot\hat{g}(s,a;\theta)
$$

- Value network

  - TD target

$$
\hat{y}_t\triangleq r_t+\gamma\cdot q(s_{t+1},a_{t+1};\omega)
$$
  - Loss function

$$
L(\omega)\triangleq \frac{1}{2}[q(s_t,a_t;\omega)-\hat{y}_t]^2
$$
  - Gradient of the loss function

$$
\nabla_{\omega}L(\omega)=[q(s_t,a_t;\omega)-\hat{y}_t]\cdot\nabla_{\omega}q(s_t,a_t;\omega)
$$
  - Gradient descent

$$
\omega\gets\omega-\alpha\cdot\nabla_{\omega}L(\omega)
$$

### Advantage Actor-Critic (A2C)

(A code sketch of the full A2C update appears at the end of this document.)

- Policy network

  - Bellman equation

$$
Q_\pi(s_t,a_t)=\mathbb{E}_{S_{t+1}\sim p(\cdot|s_t,a_t)}[R_t+\gamma\cdot V_\pi(S_{t+1})]
$$
  - Advantage function

$$
Q_\pi(s,a)-V_\pi(s)
$$
  - Approximate policy gradient

$$
\begin{aligned}
g(s_t,a_t;\theta)&=[Q_\pi(s_t,a_t)-V_\pi(s_t)]\cdot\nabla_\theta\ln\pi(a_t|s_t;\theta)\\
&=\big[\mathbb{E}_{S_{t+1}}[R_t+\gamma\cdot V_\pi(S_{t+1})]-V_\pi(s_t)\big]\cdot\nabla_\theta\ln\pi(a_t|s_t;\theta)\\
&\approx[r_t+\gamma\cdot V_\pi(s_{t+1})-V_\pi(s_t)]\cdot\nabla_\theta\ln\pi(a_t|s_t;\theta) &&\text{(Monte Carlo approximation)}\\
\tilde{g}(s_t,a_t;\theta)&\triangleq[r_t+\gamma\cdot v(s_{t+1};\omega)-v(s_t;\omega)]\cdot\nabla_\theta\ln\pi(a_t|s_t;\theta) &&\text{(replace }V_\pi(s)\text{ with the value network }v(s;\omega)\text{)}
\end{aligned}
$$
  - Policy network update

$$
\theta\gets\theta+\beta\cdot\tilde{g}(s,a;\theta)
$$

- Value network

  - Bellman equation

$$
V_\pi(s_t)=\mathbb{E}_{A_t\sim\pi(\cdot|s_t;\theta)}\big[\mathbb{E}_{S_{t+1}\sim p(\cdot|s_t,A_t)}[R_t+\gamma\cdot V_\pi(S_{t+1})]\big]
$$
  - TD target

$$
\hat{y}_t\triangleq r_t+\gamma\cdot v(s_{t+1};\omega)
$$
  - Loss function

$$
L(\omega)\triangleq\frac{1}{2}[v(s_t;\omega)-\hat{y}_t]^2
$$
  - Gradient of the loss function

$$
\nabla_\omega L(\omega)=[v(s_t;\omega)-\hat{y}_t]\cdot\nabla_\omega v(s_t;\omega)
$$
  - Gradient descent

$$
\omega\gets\omega-\alpha\cdot\nabla_{\omega}L(\omega)
$$

## Multi-agent

### Subtopic 1

diff --git a/Reinforcement Learning.pdf b/Reinforcement Learning.pdf
new file mode 100644
index 0000000..848b2d7
Binary files /dev/null and b/Reinforcement Learning.pdf differ
diff --git a/Reinforcement Learning.svg b/Reinforcement Learning.svg
new file mode 100644
index 0000000..c8c452f
--- /dev/null
+++ b/Reinforcement Learning.svg
@@ -0,0 +1,281 @@
(SVG export of the mind map. Its text labels duplicate the outline in Reinforcement Learning.md, with two extra annotations: in REINFORCE with a baseline the value network only serves as a baseline, whereas in Actor-Critic it takes part in the policy update directly.)
diff --git a/Reinforcement Learning.xmind b/Reinforcement Learning.xmind
new file mode 100644
index 0000000..ce639c4
Binary files /dev/null and b/Reinforcement Learning.xmind differ
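To tie together the A2C equations in the Reinforcement Learning.md section above, here is a minimal sketch of a single-transition A2C update. It again assumes PyTorch; the network shapes, learning rates, and the `a2c_update` helper are illustrative assumptions rather than part of the original notes.

```python
# Illustrative sketch of one A2C update for a single transition (s_t, a_t, r_t, s_{t+1}),
# following the policy and value update rules in the A2C section above. PyTorch,
# network shapes, and hyperparameters are assumptions for illustration.
import torch
import torch.nn as nn
from torch.distributions import Categorical

policy = nn.Sequential(nn.Linear(4, 64), nn.Tanh(), nn.Linear(64, 2))  # pi(a|s; theta)
value = nn.Sequential(nn.Linear(4, 64), nn.Tanh(), nn.Linear(64, 1))   # v(s; omega)
policy_opt = torch.optim.SGD(policy.parameters(), lr=1e-3)  # step size beta
value_opt = torch.optim.SGD(value.parameters(), lr=1e-2)    # step size alpha

def a2c_update(s_t, a_t, r_t, s_next, gamma=0.99):
    """One A2C update from a single observed transition."""
    s_t = torch.as_tensor(s_t, dtype=torch.float32)
    s_next = torch.as_tensor(s_next, dtype=torch.float32)

    # TD target: y_hat_t = r_t + gamma * v(s_{t+1}; omega), treated as a constant.
    with torch.no_grad():
        y_hat = r_t + gamma * value(s_next).squeeze(-1)

    v_t = value(s_t).squeeze(-1)        # v(s_t; omega)
    advantage = (y_hat - v_t).detach()  # TD error, approximating Q_pi - V_pi

    # Policy: theta <- theta + beta * advantage * grad log pi(a_t|s_t; theta),
    # implemented as gradient descent on the negated objective.
    log_pi = Categorical(logits=policy(s_t)).log_prob(torch.as_tensor(a_t))
    policy_loss = -advantage * log_pi
    policy_opt.zero_grad()
    policy_loss.backward()
    policy_opt.step()

    # Value: L(omega) = 1/2 * [v(s_t; omega) - y_hat_t]^2, gradient descent on omega.
    value_loss = 0.5 * (v_t - y_hat) ** 2
    value_opt.zero_grad()
    value_loss.backward()
    value_opt.step()
```

Unlike the baseline variant of REINFORCE, here the value network enters the policy update directly through the TD-error advantage estimate, which is the distinction the mind map draws between the two methods.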