@@ -40,6 +40,97 @@ \section{Motivation}
4040Standard feed-forward networks assume independent samples.
4141\end {frame }
4242
43+ % ================================================
44+ \section {Motivation and Background }
45+ % ================================================
46+
47+ \begin {frame }{Why sequential models?}
48+ Many datasets are \emph {ordered } and exhibit temporal/causal structure:
49+ \begin {itemize }
50+ \item time series in physics and engineering (signals, sensors, trajectories)
51+ \item language (sentences), speech, audio
52+ \item dynamical systems and control
53+ \end {itemize }
54+
55+ A feed-forward network typically assumes:
56+ \begin {itemize }
57+ \item fixed-size input vectors,
58+ \item exchangeability (ordering does not matter),
59+ \item no internal memory.
60+ \end {itemize }
61+
62+ RNNs introduce a \textbf {state } that evolves with the sequence.
63+ \end {frame }
64+
65+ \begin {frame }{Sequential data notation}
66+ We represent an input sequence as
67+ \[
68+ x_{1:T} \equiv (x_1,x_2,\dots ,x_T), \qquad x_t\in\mathbb {R}^{d}.
69+ \]
70+ Typical prediction settings:
71+ \begin {itemize }
72+ \item \textbf {many-to-one: } predict a label from the whole sequence
73+ \item \textbf {one-to-many: } generate a sequence from one input
74+ \item \textbf {many-to-many: } sequence tagging or forecasting
75+ \end {itemize }
76+
77+ Core challenge: capture dependencies across time steps \(t\).
78+ \end {frame }
79+
80+ \begin {frame }{Sources and pointers (for students)}
81+ Standard references:
82+ \begin {itemize }
83+ \item I. Goodfellow, Y. Bengio, A. Courville, \emph {Deep Learning } (RNN chapter)
84+ \item C. Bishop, \emph {Pattern Recognition and Machine Learning } (sequence models background)
85+ \item S. Hochreiter \& J. Schmidhuber (1997): LSTM
86+ \item K. Cho et al. (2014): GRU
87+ \end {itemize }
88+ \end {frame }
89+
90+ % ================================================
91+ \section {Recurrent Neural Networks: Concepts }
92+ % ================================================
93+
94+ \begin {frame }{What is an RNN?}
95+ An RNN maintains a \textbf {hidden state } \( h_t\in\mathbb {R}^m\) that summarizes the past:
96+ \[
97+ h_t = f_\theta (x_t,h_{t-1}).
98+ \]
99+ Key idea: parameters \( \theta \) are \emph {shared } across time steps.
100+ \begin {itemize }
101+ \item compact parameterization for long sequences
102+ \item natural inductive bias for temporal structure
103+ \end {itemize }
104+ \end {frame }
105+
106+ \begin {frame }{Why RNNs?}
107+ RNNs are designed to learn:
108+ \begin {itemize }
109+ \item \textbf {short-range dependencies } (local patterns)
110+ \item \textbf {long-range dependencies } (delayed effects, memory)
111+ \end {itemize }
112+
113+ In various applications, this resembles:
114+ \begin {itemize }
115+ \item state-space models
116+ \item discrete-time dynamical systems
117+ \item Markovian updates with learnable transition maps
118+ \end {itemize }
119+ \end {frame }
120+
121+ \begin {frame }{Limitations of vanilla RNNs}
122+ Training a basic RNN can be difficult due to:
123+ \begin {itemize }
124+ \item \textbf {vanishing gradients } (long-range information fades)
125+ \item \textbf {exploding gradients } (instability)
126+ \item optimization challenges in deep unrolled computation graphs
127+ \end {itemize }
128+
129+ Modern gated architectures (LSTM/GRU) mitigate these issues.
130+ \end {frame }
131+
132+
133+
43134% ------------------------------------------------
44135\begin {frame }{Sequential Data Structure}
45136
@@ -106,6 +197,55 @@ \section{Recurrent Neural Networks}
106197\]
107198\end {frame }
108199
200+
201+ \subsection {Architecture and forward propagation }
202+
203+ \begin {frame }{Vanilla RNN architecture (vector form)}
204+ Let \( x_t\in\mathbb {R}^d\) , hidden state \( h_t\in\mathbb {R}^m\) , output pre-activation \( o_t\in\mathbb {R}^k\) .
205+ Define
206+ \[
207+ a_t = W_{xh}x_t + W_{hh}h_{t-1} + b_h,\qquad h_t = \sigma (a_t),
208+ \]
209+ \[
210+ o_t = W_{hy}h_t + b_y,\qquad y_t = \phi (o_t).
211+ \]
212+ Parameters:
213+ \[
214+ W_{xh}\in\mathbb {R}^{m\times d},\;
215+ W_{hh}\in\mathbb {R}^{m\times m},\;
216+ W_{hy}\in\mathbb {R}^{k\times m}.
217+ \]
218+ \end {frame }
219+
220+ \begin {frame }{Unrolling through time}
221+ The recurrence couples time steps:
222+ \[
223+ h_t = \sigma (W_{xh}x_t + W_{hh}h_{t-1}+b_h).
224+ \]
225+ Unrolling yields a depth-\( T\) computation graph with weight sharing:
226+ \begin {itemize }
227+ \item forward pass computes \( h_1,\dots ,h_T\) sequentially
228+ \item backward pass propagates sensitivities from \( T\) back to \( 1\)
229+ \end {itemize }
230+
231+ Initialization: \( h_0=0\) (or learn \( h_0\) ).
232+ \end {frame }
233+
234+ \begin {frame }{Loss functions for sequences}
235+ Common objective:
236+ \[
237+ \mathcal {L} = \sum _{t=1}^T \ell _t(y_t,\hat y_t).
238+ \]
239+ Examples:
240+ \begin {itemize }
241+ \item regression: \(\ell_t=\frac{1}{2}\|y_t-\hat y_t\|^2\)
242+ \item classification: \( \ell _t=-\sum _i \hat y_{t,i}\log y_{t,i}\)
243+ \end {itemize }
244+ We now derive gradients for training via \textbf {backpropagation through time } (BPTT).
245+ \end {frame }
246+
247+
248+
109249% ------------------------------------------------
110250\section {Forward Propagation }
111251
@@ -700,3 +840,142 @@ \section{Applications in physical sciences}
700840
701841
702842
843+
844+
845+
846+ % ================================================
847+ \subsection {Backpropagation Through Time (BPTT) }
848+ % ================================================
849+
850+ \begin {frame }{BPTT: core idea}
851+ BPTT is ordinary backpropagation applied to the \emph {unrolled } graph.
852+ We compute:
853+ \[
854+ \nabla _\theta \mathcal {L} = \frac {\partial \mathcal {L}}{\partial \theta },
855+ \]
856+ with \( \theta =\{ W_{xh},W_{hh},b_h,W_{hy},b_y\} \) .
857+
858+ Two coupled chains:
859+ \begin {itemize }
860+ \item output chain: \( o_t\to y_t\to \ell _t\)
861+ \item recurrent chain: \( h_t\to a_{t+1}\to h_{t+1}\to \cdots \)
862+ \end {itemize }
863+ \end {frame }
864+
865+ \begin {frame }{Output-layer backpropagation}
866+ Define output error signal:
867+ \[
868+ \delta _t^{o} \equiv \frac {\partial \mathcal {L}}{\partial o_t}\in\mathbb {R}^k
869+ = \left (\frac {\partial \ell _t}{\partial y_t}\right )\odot \phi '(o_t).
870+ \]
871+ Then
872+ \[
873+ \frac {\partial \mathcal {L}}{\partial W_{hy}} = \sum _{t=1}^T \delta _t^{o}\, h_t^\top ,\qquad
874+ \frac {\partial \mathcal {L}}{\partial b_y} = \sum _{t=1}^T \delta _t^{o}.
875+ \]
876+ Contribution to hidden state:
877+ \[
878+ \left .\frac {\partial \mathcal {L}}{\partial h_t}\right |_{\text {out}}=W_{hy}^\top \delta _t^{o}.
879+ \]
880+ \end {frame }
881+
882+ \begin {frame }{Hidden-state sensitivities and recursion}
883+ Define
884+ \[
885+ \delta _t^{h} \equiv \frac {\partial \mathcal {L}}{\partial h_t}\in\mathbb {R}^{m},\qquad
886+ \delta _t^{a} \equiv \frac {\partial \mathcal {L}}{\partial a_t}\in\mathbb {R}^{m}.
887+ \]
888+ Elementwise nonlinearity gives
889+ \[
890+ \delta _t^{a} = \delta _t^{h}\odot \sigma '(a_t).
891+ \]
892+ Since \( a_{t+1}=W_{xh}x_{t+1}+W_{hh}h_t+b_h\) ,
893+ \[
894+ \left .\frac {\partial \mathcal {L}}{\partial h_t}\right |_{\text {future}}
895+ =
896+ \left (\frac {\partial a_{t+1}}{\partial h_t}\right )^\top \delta _{t+1}^{a}
897+ = W_{hh}^\top \delta _{t+1}^{a}.
898+ \]
899+ Therefore, the BPTT recursion (backwards in time) is
900+ \[
901+ \boxed {\delta _t^{h}=W_{hy}^\top \delta _t^{o} + W_{hh}^\top \delta _{t+1}^{a}},\qquad
902+ \boxed {\delta _t^{a}=\delta _t^{h}\odot \sigma '(a_t)},
903+ \]
904+ with terminal condition \( \delta _{T+1}^{a}=0\) .
905+ \end {frame }
906+
907+ \begin {frame }{Gradients for recurrent core parameters}
908+ Using \( a_t=W_{xh}x_t+W_{hh}h_{t-1}+b_h\) ,
909+ matrix calculus yields:
910+ \[
911+ \boxed {\frac {\partial \mathcal {L}}{\partial W_{xh}} = \sum _{t=1}^{T}\delta _t^{a}\, x_t^\top },\qquad
912+ \boxed {\frac {\partial \mathcal {L}}{\partial W_{hh}} = \sum _{t=1}^{T}\delta _t^{a}\, h_{t-1}^\top },
913+ \]
914+ \[
915+ \boxed {\frac {\partial \mathcal {L}}{\partial b_h} = \sum _{t=1}^{T}\delta _t^{a}}.
916+ \]
917+ These, together with output-layer gradients, complete training for a vanilla RNN.
918+ \end {frame }
919+
920+ \begin {frame }{Vanishing and exploding gradients (mathematical origin)}
921+ Unrolling the recursion reveals repeated Jacobian products:
922+ \[
923+ \delta _t^{h} \sim (W_{hh}^\top D_{t+1})(W_{hh}^\top D_{t+2})\cdots (W_{hh}^\top D_{t+k})\, (\cdots ),
924+ \]
925+ where \( D_t=\mathrm {diag}(\sigma '(a_t))\) .
926+
927+ Heuristic criterion using operator norms:
928+ \[
929+ \| W_{hh}\| \cdot \max _t \| D_t\| < 1 \Rightarrow \text {vanishing gradients},
930+ \]
931+ \[
932+ \| W_{hh}\| \cdot \max _t \| D_t\| > 1 \Rightarrow \text {exploding gradients}.
933+ \]
934+ \end {frame }
935+
936+ \begin {frame }{Mitigation strategies}
937+ Common practical fixes:
938+ \begin {itemize }
939+ \item \textbf {Gradient clipping: } enforce \( \| \nabla \| \le \tau \) .
940+ \item \textbf {Initialization: } orthogonal/unitary \( W_{hh}\) , careful scaling.
941+ \item \textbf {Truncated BPTT: } backpropagate only \( K\) steps.
942+ \item \textbf {Gating: } LSTM/GRU architectures.
943+ \end {itemize }
944+ \end {frame }
945+
946+ % ================================================
947+ \section {Algorithmic Summary }
948+ % ================================================
949+
950+ \begin {frame }{BPTT algorithm (single sequence)}
951+ \textbf {Forward pass: }
952+ \begin {itemize }
953+ \item for \( t=1\) to \( T\) : compute \( a_t,h_t,o_t,y_t\) and store them
954+ \end {itemize }
955+
956+ \textbf {Backward pass: }
957+ \begin {itemize }
958+ \item set \( \delta _{T+1}^{a}=0\)
959+ \item for \( t=T\) down to \( 1\) :
960+ \begin {align* }
961+ \delta _t^{o} &\leftarrow \left (\frac {\partial \ell _t}{\partial y_t}\right )\odot \phi '(o_t),\\
962+ \delta _t^{h} &\leftarrow W_{hy}^\top \delta _t^{o} + W_{hh}^\top \delta _{t+1}^{a},\\
963+ \delta _t^{a} &\leftarrow \delta _t^{h}\odot \sigma '(a_t),
964+ \end {align* }
965+ and accumulate gradients.
966+ \end {itemize }
967+ \end {frame }
968+
969+ \begin {frame }{Summary}
970+ \begin {itemize }
971+ \item RNNs are state-space models with learnable transitions and shared parameters.
972+ \item Vanilla RNN forward dynamics:
973+ \[
974+ h_t=\sigma (W_{xh}x_t+W_{hh}h_{t-1}+b_h).
975+ \]
976+ \item Training uses BPTT on the unrolled graph with a backward recursion.
977+ \item Long sequences lead to vanishing/exploding gradients; LSTM/GRU and practical tricks help.
978+ \end {itemize }
979+ \end {frame }
980+
981+ \end {document }
% (scraped page residue, not LaTeX source: "0 commit comments")