Skip to content

Commit 28fc14b

Browse files
committed
addd files
1 parent bb1d6e2 commit 28fc14b

File tree

2 files changed

+279
-0
lines changed

2 files changed

+279
-0
lines changed

doc/pub/week7/pdf/rnn.pdf

249 KB
Binary file not shown.

doc/src/week7/Latexfiles/rnn.tex

Lines changed: 279 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,97 @@ \section{Motivation}
4040
Standard feed-forward networks assume independent samples.
4141
\end{frame}
4242

43+
%================================================
44+
\section{Motivation and Background}
45+
%================================================
46+
47+
\begin{frame}{Why sequential models?}
48+
Many datasets are \emph{ordered} and exhibit temporal/causal structure:
49+
\begin{itemize}
50+
\item time series in physics and engineering (signals, sensors, trajectories)
51+
\item language (sentences), speech, audio
52+
\item dynamical systems and control
53+
\end{itemize}
54+
55+
A feed-forward network typically assumes:
56+
\begin{itemize}
57+
\item fixed-size input vectors,
58+
\item exchangeability (ordering does not matter),
59+
\item no internal memory.
60+
\end{itemize}
61+
62+
RNNs introduce a \textbf{state} that evolves with the sequence.
63+
\end{frame}
64+
65+
\begin{frame}{Sequential data notation}
66+
We represent an input sequence as
67+
\[
68+
x_{1:T} \equiv (x_1,x_2,\dots,x_T), \qquad x_t\in\mathbb{R}^{d}.
69+
\]
70+
Typical prediction settings:
71+
\begin{itemize}
72+
\item \textbf{many-to-one:} predict a label from the whole sequence
73+
\item \textbf{one-to-many:} generate a sequence from one input
74+
\item \textbf{many-to-many:} sequence tagging or forecasting
75+
\end{itemize}
76+
77+
Core challenge: capture dependencies across time steps \(t\).
78+
\end{frame}
79+
80+
\begin{frame}{Sources and pointers (for students)}
81+
Standard references:
82+
\begin{itemize}
83+
\item I. Goodfellow, Y. Bengio, A. Courville, \emph{Deep Learning} (RNN chapter)
84+
\item C. Bishop, \emph{Pattern Recognition and Machine Learning} (sequence models background)
85+
\item S. Hochreiter \& J. Schmidhuber (1997): LSTM
86+
\item K. Cho et al. (2014): GRU
87+
\end{itemize}
88+
\end{frame}
89+
90+
%================================================
91+
\section{Recurrent Neural Networks: Concepts}
92+
%================================================
93+
94+
\begin{frame}{What is an RNN?}
95+
An RNN maintains a \textbf{hidden state} \(h_t\in\mathbb{R}^m\) that summarizes the past:
96+
\[
97+
h_t = f_\theta(x_t,h_{t-1}).
98+
\]
99+
Key idea: parameters \(\theta\) are \emph{shared} across time steps.
100+
\begin{itemize}
101+
\item compact parameterization for long sequences
102+
\item natural inductive bias for temporal structure
103+
\end{itemize}
104+
\end{frame}
105+
106+
\begin{frame}{Why RNNs?}
107+
RNNs are designed to learn:
108+
\begin{itemize}
109+
\item \textbf{short-range dependencies} (local patterns)
110+
\item \textbf{long-range dependencies} (delayed effects, memory)
111+
\end{itemize}
112+
113+
In various applications this resembles:
114+
\begin{itemize}
115+
\item state-space models
116+
\item discrete-time dynamical systems
117+
\item Markovian updates with learnable transition maps
118+
\end{itemize}
119+
\end{frame}
120+
121+
\begin{frame}{Limitations of vanilla RNNs}
122+
Training a basic RNN can be difficult due to:
123+
\begin{itemize}
124+
\item \textbf{vanishing gradients} (long-range information fades)
125+
\item \textbf{exploding gradients} (instability)
126+
\item optimization challenges in deep unrolled computation graphs
127+
\end{itemize}
128+
129+
Modern gated architectures (LSTM/GRU) mitigate these issues.
130+
\end{frame}
131+
132+
133+
43134
%------------------------------------------------
44135
\begin{frame}{Sequential Data Structure}
45136

@@ -106,6 +197,55 @@ \section{Recurrent Neural Networks}
106197
\]
107198
\end{frame}
108199

200+
201+
\subsection{Architecture and forward propagation}
202+
203+
\begin{frame}{Vanilla RNN architecture (vector form)}
204+
Let \(x_t\in\mathbb{R}^d\), hidden state \(h_t\in\mathbb{R}^m\), output pre-activation \(o_t\in\mathbb{R}^k\).
205+
Define
206+
\[
207+
a_t = W_{xh}x_t + W_{hh}h_{t-1} + b_h,\qquad h_t = \sigma(a_t),
208+
\]
209+
\[
210+
o_t = W_{hy}h_t + b_y,\qquad y_t = \phi(o_t).
211+
\]
212+
Parameters:
213+
\[
214+
W_{xh}\in\mathbb{R}^{m\times d},\;
215+
W_{hh}\in\mathbb{R}^{m\times m},\;
216+
W_{hy}\in\mathbb{R}^{k\times m}.
217+
\]
218+
\end{frame}
219+
220+
\begin{frame}{Unrolling through time}
221+
The recurrence couples time steps:
222+
\[
223+
h_t = \sigma(W_{xh}x_t + W_{hh}h_{t-1}+b_h).
224+
\]
225+
Unrolling yields a depth-\(T\) computation graph with weight sharing:
226+
\begin{itemize}
227+
\item forward pass computes \(h_1,\dots,h_T\) sequentially
228+
\item backward pass propagates sensitivities from \(T\) back to \(1\)
229+
\end{itemize}
230+
231+
Initialization: \(h_0=0\) (or learn \(h_0\)).
232+
\end{frame}
233+
234+
\begin{frame}{Loss functions for sequences}
235+
Common objective:
236+
\[
237+
\mathcal{L} = \sum_{t=1}^T \ell_t(y_t,\hat y_t).
238+
\]
239+
Examples:
240+
\begin{itemize}
241+
\item regression: \(\ell_t=\frac12\|y_t-\hat y_t\|^2\)
242+
\item classification: \(\ell_t=-\sum_i \hat y_{t,i}\log y_{t,i}\)
243+
\end{itemize}
244+
We now derive gradients for training via \textbf{backpropagation through time} (BPTT).
245+
\end{frame}
246+
247+
248+
109249
%------------------------------------------------
110250
\section{Forward Propagation}
111251

@@ -700,3 +840,142 @@ \section{Applications in physical sciences}
700840

701841

702842

843+
844+
845+
846+
%================================================
847+
\subsection{Backpropagation Through Time (BPTT)}
848+
%================================================
849+
850+
\begin{frame}{BPTT: core idea}
851+
BPTT is ordinary backpropagation applied to the \emph{unrolled} graph.
852+
We compute:
853+
\[
854+
\nabla_\theta \mathcal{L} = \frac{\partial \mathcal{L}}{\partial \theta},
855+
\]
856+
with \(\theta=\{W_{xh},W_{hh},b_h,W_{hy},b_y\}\).
857+
858+
Two coupled chains:
859+
\begin{itemize}
860+
\item output chain: \(o_t\to y_t\to \ell_t\)
861+
\item recurrent chain: \(h_t\to a_{t+1}\to h_{t+1}\to \cdots\)
862+
\end{itemize}
863+
\end{frame}
864+
865+
\begin{frame}{Output-layer backpropagation}
866+
Define output error signal:
867+
\[
868+
\delta_t^{o} \equiv \frac{\partial \mathcal{L}}{\partial o_t}\in\mathbb{R}^k
869+
= \left(\frac{\partial \ell_t}{\partial y_t}\right)\odot \phi'(o_t).
870+
\]
871+
Then
872+
\[
873+
\frac{\partial \mathcal{L}}{\partial W_{hy}} = \sum_{t=1}^T \delta_t^{o}\,h_t^\top,\qquad
874+
\frac{\partial \mathcal{L}}{\partial b_y} = \sum_{t=1}^T \delta_t^{o}.
875+
\]
876+
Contribution to hidden state:
877+
\[
878+
\left.\frac{\partial \mathcal{L}}{\partial h_t}\right|_{\text{out}}=W_{hy}^\top \delta_t^{o}.
879+
\]
880+
\end{frame}
881+
882+
\begin{frame}{Hidden-state sensitivities and recursion}
883+
Define
884+
\[
885+
\delta_t^{h} \equiv \frac{\partial \mathcal{L}}{\partial h_t}\in\mathbb{R}^{m},\qquad
886+
\delta_t^{a} \equiv \frac{\partial \mathcal{L}}{\partial a_t}\in\mathbb{R}^{m}.
887+
\]
888+
Elementwise nonlinearity gives
889+
\[
890+
\delta_t^{a} = \delta_t^{h}\odot \sigma'(a_t).
891+
\]
892+
Since \(a_{t+1}=W_{xh}x_{t+1}+W_{hh}h_t+b_h\),
893+
\[
894+
\left.\frac{\partial \mathcal{L}}{\partial h_t}\right|_{\text{future}}
895+
=
896+
\left(\frac{\partial a_{t+1}}{\partial h_t}\right)^\top \delta_{t+1}^{a}
897+
= W_{hh}^\top \delta_{t+1}^{a}.
898+
\]
899+
Therefore, the BPTT recursion (backwards in time) is
900+
\[
901+
\boxed{\delta_t^{h}=W_{hy}^\top \delta_t^{o} + W_{hh}^\top \delta_{t+1}^{a}},\qquad
902+
\boxed{\delta_t^{a}=\delta_t^{h}\odot \sigma'(a_t)},
903+
\]
904+
with terminal condition \(\delta_{T+1}^{a}=0\).
905+
\end{frame}
906+
907+
\begin{frame}{Gradients for recurrent core parameters}
908+
Using \(a_t=W_{xh}x_t+W_{hh}h_{t-1}+b_h\),
909+
matrix calculus yields:
910+
\[
911+
\boxed{\frac{\partial \mathcal{L}}{\partial W_{xh}} = \sum_{t=1}^{T}\delta_t^{a}\,x_t^\top},\qquad
912+
\boxed{\frac{\partial \mathcal{L}}{\partial W_{hh}} = \sum_{t=1}^{T}\delta_t^{a}\,h_{t-1}^\top},
913+
\]
914+
\[
915+
\boxed{\frac{\partial \mathcal{L}}{\partial b_h} = \sum_{t=1}^{T}\delta_t^{a}}.
916+
\]
917+
These, together with output-layer gradients, complete training for a vanilla RNN.
918+
\end{frame}
919+
920+
\begin{frame}{Vanishing and exploding gradients (mathematical origin)}
921+
Unrolling the recursion reveals repeated Jacobian products:
922+
\[
923+
\delta_t^{h} \sim (W_{hh}^\top D_{t+1})(W_{hh}^\top D_{t+2})\cdots (W_{hh}^\top D_{t+k})\,(\cdots),
924+
\]
925+
where \(D_t=\mathrm{diag}(\sigma'(a_t))\).
926+
927+
Heuristic criterion using operator norms:
928+
\[
929+
\|W_{hh}\|\cdot \max_t \|D_t\| < 1 \Rightarrow \text{vanishing gradients},
930+
\]
931+
\[
932+
\|W_{hh}\|\cdot \max_t \|D_t\| > 1 \Rightarrow \text{exploding gradients}.
933+
\]
934+
\end{frame}
935+
936+
\begin{frame}{Mitigation strategies}
937+
Common practical fixes:
938+
\begin{itemize}
939+
\item \textbf{Gradient clipping:} enforce \(\|\nabla\|\le \tau\).
940+
\item \textbf{Initialization:} orthogonal/unitary \(W_{hh}\), careful scaling.
941+
\item \textbf{Truncated BPTT:} backpropagate only \(K\) steps.
942+
\item \textbf{Gating:} LSTM/GRU architectures.
943+
\end{itemize}
944+
\end{frame}
945+
946+
%================================================
947+
\section{Algorithmic Summary}
948+
%================================================
949+
950+
\begin{frame}{BPTT algorithm (single sequence)}
951+
\textbf{Forward pass:}
952+
\begin{itemize}
953+
\item for \(t=1\) to \(T\): compute \(a_t,h_t,o_t,y_t\) and store them
954+
\end{itemize}
955+
956+
\textbf{Backward pass:}
957+
\begin{itemize}
958+
\item set \(\delta_{T+1}^{a}=0\)
959+
\item for \(t=T\) down to \(1\):
960+
\begin{align*}
961+
\delta_t^{o} &\leftarrow \left(\frac{\partial \ell_t}{\partial y_t}\right)\odot \phi'(o_t),\\
962+
\delta_t^{h} &\leftarrow W_{hy}^\top \delta_t^{o} + W_{hh}^\top \delta_{t+1}^{a},\\
963+
\delta_t^{a} &\leftarrow \delta_t^{h}\odot \sigma'(a_t),
964+
\end{align*}
965+
and accumulate gradients.
966+
\end{itemize}
967+
\end{frame}
968+
969+
\begin{frame}{Summary}
970+
\begin{itemize}
971+
\item RNNs are state-space models with learnable transitions and shared parameters.
972+
\item Vanilla RNN forward dynamics:
973+
\[
974+
h_t=\sigma(W_{xh}x_t+W_{hh}h_{t-1}+b_h).
975+
\]
976+
\item Training uses BPTT on the unrolled graph with a backward recursion.
977+
\item Long sequences lead to vanishing/exploding gradients; LSTM/GRU and practical tricks help.
978+
\end{itemize}
979+
\end{frame}
980+
981+
\end{document}

0 commit comments

Comments
 (0)