Skip to content

Commit f7c6966

Browse files
committed
update
1 parent 90b109a commit f7c6966

File tree

2 files changed

+204
-0
lines changed

2 files changed

+204
-0
lines changed
6 KB
Binary file not shown.
Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
\documentclass{beamer}
2+
\usetheme{Madrid}
3+
\usepackage{amsmath, amsfonts, tikz}
4+
\usetikzlibrary{arrows.meta, positioning, shapes.geometric}
5+
6+
\title[Reinforcement Learning]{Reinforcement Learning: Principles and Algorithms}
7+
\author{Morten Hjorth-Jensen}
8+
\institute{Preliminary notes}
9+
\date{May 2024}
10+
11+
\begin{document}
12+
13+
% Title Slide
14+
\begin{frame}
15+
\titlepage
16+
\end{frame}
17+
18+
% Outline
19+
\begin{frame}{Outline}
20+
\tableofcontents
21+
\end{frame}
22+
23+
\section{Introduction}
24+
\begin{frame}{Motivation and Applications}
25+
\begin{itemize}
26+
\item RL is used in robotics, game playing (AlphaGo), finance, recommendation systems.
27+
\item Learn from interaction with environment.
28+
\item Optimal control and decision making.
29+
\end{itemize}
30+
\end{frame}
31+
32+
\begin{frame}{What is Reinforcement Learning?}
33+
\begin{itemize}
34+
\item An area of machine learning concerned with how agents ought to take actions to maximize cumulative reward.
35+
\item Trial-and-error learning.
36+
\item Feedback is delayed and sparse.
37+
\end{itemize}
38+
\end{frame}
39+
40+
\section{RL Framework}
41+
\begin{frame}{Agent-Environment Interaction}
42+
\centering
43+
\begin{tikzpicture}[node distance=2.5cm, every node/.style={align=center}]
\node[draw, rectangle, minimum width=2.5cm, minimum height=1cm] (agent) {Agent};
% 'right=4cm of agent' (positioning library) replaces the deprecated
% 'right of=agent, xshift=4cm' idiom; the gap is measured border-to-border.
\node[draw, rectangle, minimum width=3.5cm, minimum height=1cm, right=4cm of agent] (env) {Environment};
46+
47+
\draw[->, thick] (agent) -- node[above] {Action $a_t$} (env);
48+
\draw[->, thick] (env) to[out=270,in=270,looseness=2] node[below] {Reward $r_t$, State $s_{t+1}$} (agent);
49+
\end{tikzpicture}
50+
\end{frame}
51+
52+
\begin{frame}{Markov Decision Processes (MDPs)}
53+
\begin{itemize}
54+
\item MDP is defined by $(\mathcal{S}, \mathcal{A}, P, R, \gamma)$.
55+
\item $\mathcal{S}$: set of states.
56+
\item $\mathcal{A}$: set of actions.
57+
\item $P(s'|s,a)$: transition probabilities.
58+
\item $R(s,a)$: reward function.
59+
\item $\gamma \in [0,1]$: discount factor.
60+
\end{itemize}
61+
\end{frame}
62+
63+
\begin{frame}{MDP Transition Diagram (Example)}
64+
\centering
65+
% '>={Stealth}' comes from the arrows.meta library loaded in the preamble;
% the old "stealth'" tip belongs to the legacy 'arrows' library, which is
% not loaded here and would cause a compile error.
\begin{tikzpicture}[->, >={Stealth}, shorten >=1pt, auto, node distance=3.5cm,
thick, main node/.style={circle, draw, font=\sffamily\Large\bfseries}]
67+
68+
\node[main node] (1) {$s_1$};
% Use the 'positioning' library (loaded in the preamble) instead of the
% deprecated 'right of=' / 'below of= + yshift' syntax. 'below=1.5cm of 1'
% reproduces the original net vertical offset (3.5cm node distance - 1cm shift).
\node[main node] (2) [right=of 1] {$s_2$};
\node[main node] (3) [below=1.5cm of 1] {$s_3$};
71+
72+
\path[every node/.style={font=\sffamily\small}]
73+
(1) edge[bend left] node[above] {$a_1$} (2)
74+
edge[bend right] node[left] {$a_2$} (3)
75+
(2) edge[loop right] node {$a_1$} (2)
76+
(3) edge[bend left] node[right] {$a_1$} (1);
77+
\end{tikzpicture}
78+
\end{frame}
79+
80+
\section{Value Functions}
81+
\begin{frame}{Reward Signal and Return}
82+
\begin{itemize}
83+
\item Return $G_t = \sum_{k=0}^{\infty} \gamma^k r_{t+k+1}$.
84+
\item Objective: Maximize expected return.
85+
\item Reward hypothesis: All goals can be framed as the maximization of expected cumulative reward.
86+
\end{itemize}
87+
\end{frame}
88+
89+
\begin{frame}{Value Functions}
90+
\begin{itemize}
91+
\item State-value: $V^\pi(s) = \mathbb{E}_\pi[G_t | s_t = s]$.
92+
\item Action-value: $Q^\pi(s,a) = \mathbb{E}_\pi[G_t | s_t = s, a_t = a]$.
93+
\end{itemize}
94+
\end{frame}
95+
96+
\begin{frame}{Bellman Equations}
97+
\begin{itemize}
98+
\item $V^\pi(s) = \sum_a \pi(a|s) \sum_{s'} P(s'|s,a)[R(s,a) + \gamma V^\pi(s')]$.
99+
\item $V^*(s) = \max_a \sum_{s'} P(s'|s,a)[R(s,a) + \gamma V^*(s')]$.
100+
\end{itemize}
101+
\end{frame}
102+
103+
\section{Learning Algorithms}
104+
\begin{frame}{Dynamic Programming}
105+
\begin{itemize}
106+
\item Requires a model of the environment.
107+
\item Policy evaluation and improvement.
108+
\item Value iteration and policy iteration.
109+
\end{itemize}
110+
\end{frame}
111+
112+
\begin{frame}{Monte Carlo Methods}
113+
\begin{itemize}
114+
\item No model needed.
115+
\item Learn from complete episodes.
116+
\item First-visit and every-visit MC methods.
117+
\end{itemize}
118+
\end{frame}
119+
120+
\begin{frame}{Temporal-Difference (TD) Learning}
121+
\begin{itemize}
122+
\item Learn from incomplete episodes.
123+
\item TD(0): $V(s_t) \leftarrow V(s_t) + \alpha [r_{t+1} + \gamma V(s_{t+1}) - V(s_t)]$.
124+
\end{itemize}
125+
\end{frame}
126+
127+
\begin{frame}{SARSA and Q-Learning}
128+
\begin{itemize}
129+
\item SARSA: on-policy.
130+
\item Q-Learning: off-policy.
131+
\item Both update $Q(s,a)$ from experience.
132+
\end{itemize}
133+
\end{frame}
134+
135+
\section{Deep Reinforcement Learning}
136+
\begin{frame}{Exploration vs Exploitation}
137+
\begin{itemize}
138+
\item Need to balance exploring new actions and exploiting known rewards.
139+
\item $\epsilon$-greedy strategy.
140+
\end{itemize}
141+
\end{frame}
142+
143+
\begin{frame}{Function Approximation}
144+
\begin{itemize}
145+
\item Use neural networks to approximate $Q(s,a)$ or $\pi(a|s)$.
146+
\item Generalizes across states and actions.
147+
\end{itemize}
148+
\end{frame}
149+
150+
\begin{frame}{Deep Q-Networks (DQN)}
151+
\begin{itemize}
152+
\item Neural net to approximate $Q(s,a)$.
153+
\item Uses experience replay and target networks.
154+
\end{itemize}
155+
\end{frame}
156+
157+
\begin{frame}{Policy Gradient Methods}
158+
\begin{itemize}
159+
\item Directly parameterize and optimize the policy.
160+
\item Use gradient ascent: $\nabla_\theta J(\theta)$.
161+
\end{itemize}
162+
\end{frame}
163+
164+
\begin{frame}{Actor-Critic Methods}
165+
\begin{itemize}
166+
\item Combine value function (critic) with policy (actor).
167+
\item Advantage Actor-Critic (A2C), A3C.
168+
\end{itemize}
169+
\end{frame}
170+
171+
\begin{frame}{Advanced Algorithms}
172+
\begin{itemize}
173+
\item PPO: stable updates via clipping.
174+
\item SAC: maximum entropy RL.
175+
\end{itemize}
176+
\end{frame}
177+
178+
\section{Conclusion}
179+
\begin{frame}{Challenges in RL}
180+
\begin{itemize}
181+
\item Sample inefficiency.
182+
\item Exploration in sparse reward environments.
183+
\item Stability of training with function approximation.
184+
\end{itemize}
185+
\end{frame}
186+
187+
\begin{frame}{Applications}
188+
\begin{itemize}
189+
\item Games: AlphaGo, StarCraft.
190+
\item Robotics.
191+
\item Healthcare, Finance.
192+
\end{itemize}
193+
\end{frame}
194+
195+
\begin{frame}{Summary}
196+
\begin{itemize}
197+
\item RL is about learning through interaction.
198+
\item MDPs, value functions, policy learning are key concepts.
199+
\item Algorithms evolve from tabular to deep methods.
200+
\end{itemize}
201+
\end{frame}
202+
203+
204+
\end{document}

0 commit comments

Comments
 (0)