% Beamer presentation: Reinforcement Learning — Principles and Algorithms.
% Preamble: theme, math packages, and TikZ libraries used by the diagrams below
% (arrows.meta for arrow tips, positioning for relative node placement).
\documentclass{beamer}
\usetheme{Madrid}
\usepackage{amsmath, amsfonts, tikz}
\usetikzlibrary{arrows.meta, positioning, shapes.geometric}

\title[Reinforcement Learning]{Reinforcement Learning: Principles and Algorithms}
\author{Morten Hjorth-Jensen}
\institute{Preliminary notes}
\date{May 2024}
\begin{document}

% Title slide
\begin{frame}
  \titlepage
\end{frame}

% Outline — populated automatically from the \section commands below
\begin{frame}{Outline}
  \tableofcontents
\end{frame}
\section{Introduction}
\begin{frame}{Motivation and Applications}
  \begin{itemize}
    \item RL is used in robotics, game playing (AlphaGo), finance, and recommendation systems.
    \item Learning from interaction with an environment.
    \item Optimal control and decision making.
  \end{itemize}
\end{frame}

\begin{frame}{What is Reinforcement Learning?}
  \begin{itemize}
    \item An area of machine learning concerned with how agents ought to take actions to maximize cumulative reward.
    \item Trial-and-error learning.
    \item Feedback is delayed and sparse.
  \end{itemize}
\end{frame}
\section{RL Framework}
% Agent--environment interaction loop (Sutton & Barto style): the agent emits
% action a_t; the environment returns reward r_{t+1} and next state s_{t+1}
% (indexing consistent with the return G_t = sum_k gamma^k r_{t+k+1} used later).
\begin{frame}{Agent--Environment Interaction}
  \centering
  % "right=... of ..." is the positioning-library syntax; the old "right of="
  % key is deprecated. Arrow tips come from arrows.meta.
  \begin{tikzpicture}[every node/.style={align=center}]
    \node[draw, rectangle, minimum width=2.5cm, minimum height=1cm] (agent) {Agent};
    \node[draw, rectangle, minimum width=3.5cm, minimum height=1cm,
          right=4cm of agent] (env) {Environment};

    \draw[-{Stealth}, thick] (agent) -- node[above] {Action $a_t$} (env);
    \draw[-{Stealth}, thick] (env) to[out=270, in=270, looseness=2]
      node[below] {Reward $r_{t+1}$, State $s_{t+1}$} (agent);
  \end{tikzpicture}
\end{frame}
| 51 | + |
| 52 | +\begin{frame}{Markov Decision Processes (MDPs)} |
| 53 | + \begin{itemize} |
| 54 | + \item MDP is defined by $(\mathcal{S}, \mathcal{A}, P, R, \gamma)$. |
| 55 | + \item $\mathcal{S}$: set of states. |
| 56 | + \item $\mathcal{A}$: set of actions. |
| 57 | + \item $P(s'|s,a)$: transition probabilities. |
| 58 | + \item $R(s,a)$: reward function. |
| 59 | + \item $\gamma \in [0,1]$: discount factor. |
| 60 | + \end{itemize} |
| 61 | +\end{frame} |
| 62 | + |
| 63 | +\begin{frame}{MDP Transition Diagram (Example)} |
| 64 | + \centering |
| 65 | + \begin{tikzpicture}[->, >=stealth', shorten >=1pt, auto, node distance=3.5cm, |
| 66 | + thick, main node/.style={circle, draw, font=\sffamily\Large\bfseries}] |
| 67 | + |
| 68 | + \node[main node] (1) {$s_1$}; |
| 69 | + \node[main node] (2) [right of=1] {$s_2$}; |
| 70 | + \node[main node] (3) [below of=1, yshift=1cm] {$s_3$}; |
| 71 | + |
| 72 | + \path[every node/.style={font=\sffamily\small}] |
| 73 | + (1) edge[bend left] node[above] {$a_1$} (2) |
| 74 | + edge[bend right] node[left] {$a_2$} (3) |
| 75 | + (2) edge[loop right] node {$a_1$} (2) |
| 76 | + (3) edge[bend left] node[right] {$a_1$} (1); |
| 77 | + \end{tikzpicture} |
| 78 | +\end{frame} |
| 79 | + |
| 80 | +\section{Value Functions} |
| 81 | +\begin{frame}{Reward Signal and Return} |
| 82 | + \begin{itemize} |
| 83 | + \item Return $G_t = \sum_{k=0}^{\infty} \gamma^k r_{t+k+1}$. |
| 84 | + \item Objective: Maximize expected return. |
| 85 | + \item Reward hypothesis: All goals can be framed as the maximization of expected cumulative reward. |
| 86 | + \end{itemize} |
| 87 | +\end{frame} |
| 88 | + |
| 89 | +\begin{frame}{Value Functions} |
| 90 | + \begin{itemize} |
| 91 | + \item State-value: $V^\pi(s) = \mathbb{E}_\pi[G_t | s_t = s]$. |
| 92 | + \item Action-value: $Q^\pi(s,a) = \mathbb{E}_\pi[G_t | s_t = s, a_t = a]$. |
| 93 | + \end{itemize} |
| 94 | +\end{frame} |
| 95 | + |
| 96 | +\begin{frame}{Bellman Equations} |
| 97 | + \begin{itemize} |
| 98 | + \item $V^\pi(s) = \sum_a \pi(a|s) \sum_{s'} P(s'|s,a)[R(s,a) + \gamma V^\pi(s')]$. |
| 99 | + \item $V^*(s) = \max_a \sum_{s'} P(s'|s,a)[R(s,a) + \gamma V^*(s')]$. |
| 100 | + \end{itemize} |
| 101 | +\end{frame} |
| 102 | + |
| 103 | +\section{Learning Algorithms} |
| 104 | +\begin{frame}{Dynamic Programming} |
| 105 | + \begin{itemize} |
| 106 | + \item Requires a model of the environment. |
| 107 | + \item Policy evaluation and improvement. |
| 108 | + \item Value iteration and policy iteration. |
| 109 | + \end{itemize} |
| 110 | +\end{frame} |
| 111 | + |
| 112 | +\begin{frame}{Monte Carlo Methods} |
| 113 | + \begin{itemize} |
| 114 | + \item No model needed. |
| 115 | + \item Learn from complete episodes. |
| 116 | + \item First-visit and every-visit MC methods. |
| 117 | + \end{itemize} |
| 118 | +\end{frame} |
| 119 | + |
| 120 | +\begin{frame}{Temporal-Difference (TD) Learning} |
| 121 | + \begin{itemize} |
| 122 | + \item Learn from incomplete episodes. |
| 123 | + \item TD(0): $V(s_t) \leftarrow V(s_t) + \alpha [r_{t+1} + \gamma V(s_{t+1}) - V(s_t)]$. |
| 124 | + \end{itemize} |
| 125 | +\end{frame} |
| 126 | + |
| 127 | +\begin{frame}{SARSA and Q-Learning} |
| 128 | + \begin{itemize} |
| 129 | + \item SARSA: on-policy. |
| 130 | + \item Q-Learning: off-policy. |
| 131 | + \item Both update $Q(s,a)$ from experience. |
| 132 | + \end{itemize} |
| 133 | +\end{frame} |
| 134 | + |
| 135 | +\section{Deep Reinforcement Learning} |
| 136 | +\begin{frame}{Exploration vs Exploitation} |
| 137 | + \begin{itemize} |
| 138 | + \item Need to balance exploring new actions and exploiting known rewards. |
| 139 | + \item $\epsilon$-greedy strategy. |
| 140 | + \end{itemize} |
| 141 | +\end{frame} |
| 142 | + |
| 143 | +\begin{frame}{Function Approximation} |
| 144 | + \begin{itemize} |
| 145 | + \item Use neural networks to approximate $Q(s,a)$ or $\pi(a|s)$. |
| 146 | + \item Generalizes across states and actions. |
| 147 | + \end{itemize} |
| 148 | +\end{frame} |
| 149 | + |
| 150 | +\begin{frame}{Deep Q-Networks (DQN)} |
| 151 | + \begin{itemize} |
| 152 | + \item Neural net to approximate $Q(s,a)$. |
| 153 | + \item Uses experience replay and target networks. |
| 154 | + \end{itemize} |
| 155 | +\end{frame} |
| 156 | + |
| 157 | +\begin{frame}{Policy Gradient Methods} |
| 158 | + \begin{itemize} |
| 159 | + \item Directly parameterize and optimize the policy. |
| 160 | + \item Use gradient ascent: $\nabla_\theta J(\theta)$. |
| 161 | + \end{itemize} |
| 162 | +\end{frame} |
| 163 | + |
| 164 | +\begin{frame}{Actor-Critic Methods} |
| 165 | + \begin{itemize} |
| 166 | + \item Combine value function (critic) with policy (actor). |
| 167 | + \item Advantage Actor-Critic (A2C), A3C. |
| 168 | + \end{itemize} |
| 169 | +\end{frame} |
| 170 | + |
| 171 | +\begin{frame}{Advanced Algorithms} |
| 172 | + \begin{itemize} |
| 173 | + \item PPO: stable updates via clipping. |
| 174 | + \item SAC: maximum entropy RL. |
| 175 | + \end{itemize} |
| 176 | +\end{frame} |
\section{Conclusion}
\begin{frame}{Challenges in RL}
  \begin{itemize}
    \item Sample inefficiency.
    \item Exploration in sparse reward environments.
    \item Stability of training with function approximation.
  \end{itemize}
\end{frame}

\begin{frame}{Applications}
  \begin{itemize}
    \item Games: AlphaGo, StarCraft.
    \item Robotics.
    \item Healthcare, finance.
  \end{itemize}
\end{frame}

\begin{frame}{Summary}
  \begin{itemize}
    \item RL is about learning through interaction.
    \item MDPs, value functions, and policy learning are key concepts.
    \item Algorithms evolve from tabular to deep methods.
  \end{itemize}
\end{frame}

\end{document}
% NOTE(review): removed web-scrape residue ("0 commit comments") — not part of the LaTeX source.