From: Francois Fleuret Date: Fri, 26 Nov 2021 23:21:00 +0000 (+0100) Subject: Initial commit X-Git-Url: https://ant.fleuret.org/cgi-bin/gitweb/gitweb.cgi?a=commitdiff_plain;h=b94c6b870ce9282f924b043bedac4c62920e7c6a;p=tex.git Initial commit --- b94c6b870ce9282f924b043bedac4c62920e7c6a diff --git a/attention.tex b/attention.tex new file mode 100644 index 0000000..b6f15dd --- /dev/null +++ b/attention.tex @@ -0,0 +1,153 @@ +% -*- mode: latex; mode: reftex; mode: auto-fill; mode: flyspell; -*- + +\documentclass[c,8pt]{beamer} + +\usepackage{tikz} +\newcommand{\transpose}{^{\top}} +\def\softmax{\operatorname{softmax}} + +\setbeamertemplate{navigation symbols}{} + +\begin{document} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\begin{frame}[fragile] + +Given a query sequence $Q$, a key sequence $K$, and a value sequence +$V$, compute an attention matrix $A$ by matching $Q$s to $K$s, and +weight $V$ with it to get $Y$. + +\medskip + +\[ +\uncover<2,4,6->{ + A_i = \softmax \left( \frac{Q_i \, K\transpose}{\sqrt{d}} \right) +} +% +\quad \quad \quad +% +\uncover<3,5->{ + Y_i = A_i V +} +\] + +\medskip + +\makebox[\textwidth][c]{ +\begin{tikzpicture} + + \node[cm={0.5, 0.5, 0.0, 1.0, (0.0, 0.0)}] (V) at (-2, 2.35) { + \begin{tikzpicture} + \draw[fill=green!20] (0, 0) rectangle (4, 1.4); + \uncover<3,5>{\draw[fill=yellow] (0, 0) rectangle (4, 1.4);} + \foreach \x in { 0.2, 0.4, ..., 3.8 } \draw (\x, 0) -- ++(0, 1.4); + \end{tikzpicture} + }; + + \node[cm={1.0, 0.0, 0.5, 0.5, (0.0, 0.0)}] (A) at (0.5, 1.6) { + \begin{tikzpicture} + \draw (0, 0) rectangle ++(3, 4); + \end{tikzpicture} + }; + + \uncover<2-3>{ + \node[cm={0.5, 0.5, 0.0, 1.0, (0.0, 0.0)}] (a1) at (-0.9, 2.1) { + \begin{tikzpicture} + \draw[draw=none] (0, 0) rectangle (4, 1); + \foreach \x/\y in { + 0.00/0.03, 0.20/0.04, 0.40/0.07, 0.60/0.35, 0.80/0.52, + 1.00/1.00, 1.20/0.82, 1.40/0.25, 1.60/0.08, 1.80/0.03, + 2.00/0.15, 2.20/0.24, 2.40/0.70, 2.60/0.05, 2.80/0.03, + 3.00/0.03, 
3.20/0.03, 3.40/0.00, 3.60/0.03, 3.80/0.00 }{ + \uncover<2>{\draw[black,fill=red] (\x, 0) rectangle ++(0.2, \y);} + \uncover<3>{\draw[black,fill=yellow] (\x, 0) rectangle ++(0.2, \y);} + }; + \end{tikzpicture} + }; + } + + \uncover<4-5>{ + \node[cm={0.5, 0.5, 0.0, 1.0, (0.0, 0.0)}] (a2) at (-0.7, 2.1) { + \begin{tikzpicture} + \draw[draw=none] (0, 0) rectangle (4, 1); + \foreach \x/\y in { + 0.00/0.03, 0.20/0.04, 0.40/0.07, 0.60/0.03, 0.80/0.03, + 1.00/0.05, 1.20/0.02, 1.40/0.08, 1.60/0.35, 1.80/0.85, + 2.00/0.05, 2.20/0.04, 2.40/0.03, 2.60/0.05, 2.80/0.03, + 3.00/0.03, 3.20/0.03, 3.40/0.00, 3.60/0.03, 3.80/0.00 }{ + \uncover<4>{\draw[black,fill=red] (\x, 0) rectangle ++(0.2, \y);} + \uncover<5>{\draw[black,fill=yellow] (\x, 0) rectangle ++(0.2, \y);} + }; + \end{tikzpicture} + }; + } + + \node[cm={1.0, 0.0, 0.0, 1.0, (0.0, 0.0)}] (Q) at (-0.5, -0.05) { + \begin{tikzpicture} + \draw[fill=green!20] (0, 0) rectangle (3, 1.0); + \foreach \x in { 0.2, 0.4, ..., 2.8 } \draw (\x, 0) -- ++(0, 1.0); + \uncover<2>{\draw[fill=yellow] (0.0, 0) rectangle ++(0.2, 1);} + \uncover<4>{\draw[fill=yellow] (0.2, 0) rectangle ++(0.2, 1);} + \end{tikzpicture} + }; + + \node[cm={1.0, 0.0, 0.0, 1.0, (0.0, 0.0)}] (Y) at (1.5, 3.45) { + \begin{tikzpicture} + \uncover<3>{\draw[fill=red] (0.0, 0) rectangle ++(0.2, 1.4);} + \uncover<4->{\draw[fill=green!20] (0.0, 0) rectangle ++(0.2, 1.4);} + \uncover<6->{\draw[fill=green!20] (0.0, 0) rectangle ++(3, 1.4);} + \uncover<5>{\draw[fill=red] (0.2, 0) rectangle ++(0.2, 1.4);} + \draw (0, 0) rectangle (3, 1.4); + \foreach \x in { 0.2, 0.4, ..., 2.8 } \draw (\x, 0) -- ++(0, 1.4); + \end{tikzpicture} + }; + + \node[cm={0.5, 0.5, 0.0, 1.0, (0.0, 0.0)}] (K) at (3, 1.1) { + \begin{tikzpicture} + \draw[fill=green!20] (0, 0) rectangle (4, 1); + \uncover<2,4>{\draw[fill=yellow] (0, 0) rectangle (4, 1);} + \foreach \x in { 0.2, 0.4, ..., 3.8 } \draw (\x, 0) -- ++(0, 1); + \end{tikzpicture} + }; + + \node[left of=V,xshift=0.5cm,yshift=0.7cm] (Vl) {V}; + 
\node[left of=Q,xshift=-0.8cm] (Ql) {Q}; + \node (Al) at (A) {A}; + \node[right of=K,xshift=-0.6cm,yshift=-0.6cm] (Kl) {K}; + \node[right of=Y,xshift=0.8cm] (Yl) {Y}; + +\end{tikzpicture} +} + +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\begin{frame}[fragile] + +A standard attention layer takes as input two sequences $X$ and $X'$ +and computes +% +\begin{align*} +K & = W^K X \\ +V & = W^V X \\ +Q & = W^Q X' \\ +Y & = \underbrace{\softmax_{\mathrm{row}} \left( \frac{Q K\transpose}{\sqrt{d}} \right)}_{A} V +\end{align*} + +When $X = X'$, this is \textbf{self attention}, otherwise \textbf{cross + attention}. + +\pause + +\bigskip + +Several such processes can be combined, in which case $Y$ is the +concatenation of the separate results. This is \textbf{multi-head + attention}. + +\end{frame} + + +\end{document}