-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbigo.tex
420 lines (389 loc) · 13.6 KB
/
bigo.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
\PassOptionsToPackage{bookmarks={true}}{hyperref}
\documentclass[mathserif,xcolor={dvipsnames,table}]{beamer}
\mode<presentation>{\usetheme{Warsaw}\usecolortheme{crane}}
\usepackage{centernot}
\usepackage{upgreek}
\usepackage{graphicx}
\usepackage{framed,color}
\usepackage{geometry}
\usepackage{transparent}
\usepackage{tikz}
\usetikzlibrary{shadows}
%\usepackage[utf8]{inputenc}
%\usepackage[english]{babel}
%\usepackage[T1]{fontenc}
\usepackage{lmodern}
%\usepackage[babel=true]{microtype}
\usepackage{amsmath}
\beamertemplatenavigationsymbolsempty
\renewcommand{\sfdefault}{cmr}
\definecolor{shadecolor}{rgb}{1,0.8,0.3}
\title{\textbf{Big-$\boldsymbol{\mathcal{O}}$ Ain't What it Used to Be}}
\date{}
\author{CS4803UWS at the\\
Georgia Institute of Technology
}
\begin{document}
{
\setbeamertemplate{background canvas}{%
\includegraphics[width=\paperwidth,height=\paperheight]{images/gt2.jpeg}
}%
\begin{frame}[plain]
\textcolor{white}{
\transparent{0.5}%
\colorbox{black}{\textbf{Big-$\boldsymbol{\mathcal{O}}$ Ain't What it Used to Be}}
}
\vspace{2.7in}
\\
\hfill\includegraphics[scale=.25]{images/cc-logo.pdf}
\includegraphics[scale=.25]{images/cc-new.pdf}
\includegraphics[scale=.25]{images/cc-share.pdf}
\textcolor{white}{
\\
\hfill \tiny{CC3.0 share-alike attribution}\\
}
\textcolor{white}{
\hfill \scriptsize{copyright \copyright\ 2013 nick black}\\
}
\end{frame}
}
\begin{frame}{Asymptotic notation review I}
Asymptotic analysis gives us a means of speaking of arbitrarily large growth,
independently of arbitrarily (but finitely) large costs not associated with
problem size.
\scriptsize{
\begin{center}
\begin{tikzpicture}
\node[drop shadow,fill=white,inner sep=0pt]
{\rowcolors{1}{ForestGreen!20}{ForestGreen!5}
\begin{tabular}{|l|l|p{4cm}|p{2.25cm}|}
\hline
\textbf{Notation} & \textbf{Name} & \textbf{Definition} & \textbf{Introduced} \\
\hline\hline
$f(n) \in \mathcal{O}(g(n))$ & Big O & $\exists k>0,\exists n_0,\forall n$
\newline
$n>n_0 \implies f(n) \le g(n)*k$ & Paul Bachmann\newline (1894) \\
\hline
$f(n) \in o(g(n))$ & Small O & $\forall k>0,\exists n_0,\forall n$
\newline
$n>n_0 \implies |f(n)| \le |g(n)|*k$ & Edmund Landau\newline (1909) \\
\hline
$f(n) \in \Theta(g(n))$ & Big Theta & $\exists k_1>0,\exists k_2>0,\exists n_0,\forall n$
\newline
$n>n_0 \implies g(n)*k_1 \le f(n),
\newline
f(n) \le g(n)*k_2$ & Donald Knuth\newline (1976) \\
\hline
$f(n) \in \omega(g(n))$ &Small Omega& $\forall k>0,\exists n_0,\forall n$
\newline
$n>n_0 \implies |f(n)| \ge |g(n)|*k$ & Donald Knuth\newline(1976) \\
\hline
$f(n) \in \Omega(g(n))$ & Big Omega & $\exists k>0,\exists n_0,\forall n$
\newline
$n>n_0 \implies f(n) > g(n)*k$ & Donald Knuth\newline (1976) \\
\hline
\end{tabular}%
};
\end{tikzpicture}
\end{center}
\vfill
Advances in (finite) computing technology can only reduce these
ignored costs. Wrap the earth with your register file, and still there will be
numbers so large that their addition is $\Theta(n)$.
}
\end{frame}
\begin{frame}{Asymptotic notation review II}
\scriptsize{
\begin{center}
\begin{tikzpicture}
\node[drop shadow,fill=white,inner sep=0pt]
{\rowcolors{1}{ForestGreen!20}{ForestGreen!5}
\begin{tabular}{|l|l|l}
\hline
\textbf{Class (all $c_*>1$)} & \textbf{Name} & \textbf{Example} \\
\hline\hline
$\mathcal{O}(1)$ & Constant & Is word-sized unsigned int $n$ a power of 2?\\
\hline
$\mathcal{O}(\lg_{c_1}\lg_{c_2} n)$ & Double-log & Interpolative search on uniform distribution\\
\hline
$\mathcal{O}(\lg_c n)$ & Logarithmic & Binary search\\
\hline
$\mathcal{O}(n\lg n) = \mathcal{O}(\lg n!)$ & Linearithmic & FFT\\
\hline
$\mathcal{O}(n^{c})$ & Polynomial & Primality testing\\
\hline
$\mathcal{O}(c^{n})$ & Exponential & Brute-force Boolean equivalence \\
\hline
$\mathcal{O}(n!)$ & Factorial & Unrestricted permutations of a poset\\
\hline
$\mathcal{O}(c^{c^{n}_2}_1)$ & Double-exp & Presburger arithmetic decision best case\\
\hline
\end{tabular}%
};
\end{tikzpicture}
\end{center}
\vfill
Speaking of still faster growth rates\footnote{\scriptsize{Check out ``fast-growing hierarchies'' and the L\"ob–Wainer hierarchy.}} (hyper-exponential, $\mathcal{A}$)
is mostly zoology.%\footnote{\scriptsize{Forgive the pun on busy beavers.}}.
}
\end{frame}
\begin{frame}{It's the constants, stupid}
Algorithmic choices can dominate performance, especially at scale. By the
definition of Big O, it should also be obvious that an asymptotically superior
algorithm can be slower for small inputs\footnote{We will see that small inputs can be surprisingly large.}.\\
\vfill
That said, no one's going to think implementing a routing table with a linked list is a good idea.\\
\vfill
Furthermore, asymptotic analysis speaks of performance as problem size grows.
It doesn't speak of real-time. It doesn't speak of bounded memories.
We rarely speak of piecewise asymptotics.
\vfill
But, by all means, do ensure you're not doing linear searches on sorted data etc.
\end{frame}
\begin{frame}{What's hiding behind $\mathcal{O}$?}
Naive square (nXn X nXn) matrix multiplication is $\Theta(n^{3})$.
\vfill
\begin{equation}
C = AB \implies C_{ij} = \sum\limits_{m=1}^{k} A_{im}B_{mj}
\end{equation}
\vfill
Counting the explicit additions and multiplications, there are
precisely $2n^{3}$ ``operations''.
\end{frame}
\begin{frame}{Fused multiply-add}
\small{
IEEE 754-2008 floating point support requires FMA, fused multiply-add.
Let $rn()$ denote a rounding operation. Typically, a multiply-add chain
requires two instructions, and rounds twice:
\begin{equation}
MAC(A,B,C) = rn(rn(A * B) + C)
\end{equation}
Fused multiply-add rounds only once, preserving the fully precise product in
an internal register:
\begin{equation}
FMA(A,B,C) = rn(A * B + C)
\end{equation}
AMD's FMA4 (Bulldozer) implements a fully general SIMD FP FMA. Intel's FMA3
(Haswell, as part of AVX2; also in AMD's Piledriver) implements a destructive
SIMD FP FMA\footnote{AMD's XOP further implements an SIMD integer FMA.}.
The throughput and latency are equivalent to standard single SIMD FP adds and
multiplies. NVIDIA's Fermi likewise introduced a full-throughput FMA.
\vfill
There are now precisely $n^{3}$ ``operations''.
}
\end{frame}
{
\setbeamertemplate{background canvas}{%
\includegraphics[scale=.33]{images/haswellexec.png}
}%
\begin{frame}[b]{Wide issue}
\scriptsize{
Haswell can issue and retire 2 VFMADD* instructions per cycle.\\
There are now precisely $n^{3}/2$ ``operations''\footnote{\tiny{Assuming that two operations are available every cycle.}}.
\vspace{.1in}
}
\end{frame}
}
{
\setbeamertemplate{background canvas}{%
\includegraphics[scale=1]{images/256bit.png}
}%
\begin{frame}[b]{SIMD}
\scriptsize{
AVX uses the 16 256-bit YMM registers. There are 8 32-bit IEEE 754-2008
single-precision values in a 256-bit input.\\
There are now precisely $n^{3}/16$ ``operations''\footnote{Assuming that values
are usable in 256-bit chunks.}.
\vspace{.1in}
}
\end{frame}
}
{
\setbeamertemplate{background canvas}{%
\includegraphics[scale=1]{images/Multicore.jpg}
}%
\begin{frame}[b]{Multicore}
\scriptsize{
\begin{block}{}
Haswell will likely debut in a quadcore physical package.\\
There are now precisely $n^{3}/64$ ``operations''\footnote{\tiny{Ignoring communication
costs, and assuming perfect parallelism.}}.
\end{block}
\vspace{.1in}
}
\end{frame}
}
\begin{frame}{Memory accesses}
\begin{enumerate}
\item $\forall i, 0\le i \le n:$ Read row $i$ from $A$ into $a$
\item $\forall j, 0\le j \le n:$ Read column $j$ from $B$ into $b$, read $C_{ij}$ into $c$
\item $\forall k, 0\le k \le n:$ Store $a_k * b_k + c$ into $C_{ij}$
\end{enumerate}
\vfill
\begin{enumerate}
\item $n^{2}$ loads from A
\item $n^{3}$ loads from B
\item $n^{2}$ loads from C
\item $n^{2}$ stores to C
\end{enumerate}
Counting the loads and stores, there are precisely $n^{3} + 3n^{2}$ memory
accesses. There are now precisely $\frac{65n^{3}}{64} + 3n^{2}$ ``operations''.
\vfill
\begin{alertblock}{Arithmetic intensity}
\center$\lim\limits_{n \to \infty} \frac{\text{Arithmetic}}{\text{Memory}} = 2$
\end{alertblock}
\vfill
\end{frame}
\begin{frame}{Loads and stores}
Hong and Kung proved in 1981 that any schedule of conventional matrix
multiplication must transfer $\Omega(\frac{n^{3}}{\sqrt{Z}}), Z<\frac{n^{2}}{6}$
words between slow and fast memory. \textit{Tiling} is the optimal strategy.
\vfill
Of course, AVX's VMOVAPS moves 256 bits, or 8 32-bit single precision floating
point words, at a time. And there's 4 cores. With two load/store pipes each. So that's $\Omega_{HK}/64$\footnote{
\tiny{Assuming that the values are located in aligned, contiguous 256-bite chunks in memory.\\
\hspace{.6cm}Wait\ldots we can use VMOVUPS if they're unsuitably aligned.}}.
\vfill
Of course, we're not going to be able to pack two VFMADDPS and two VMOVAPS
instructions into every 16B/\textbf{c} I\$ fetch\footnote{\tiny{VEX-encoded
VMOVAPS tends to run \textasciitilde 5 bytes.}}.
\vfill
How does this interact with register banking? Multilevel caching? TLBs? Page
cache? Prefetching? DRAM banking? Multilevel disk? Logical cores? Other physical cores?
NUMA?
\end{frame}
{
\setbeamertemplate{background canvas}{%
\includegraphics[scale=1]{images/rage.jpg}
}%
\begin{frame}{Argh}
\begin{block}{FFFFFFFFFFFFFFFFFFFFFFUUUUUUUUUUUUUUUUUUU}
We've not yet mentioned branching.
\end{block}
\end{frame}
}
\begin{frame}{CS4803 Spring 2010 Lab 3---All $\mathcal{O}(n^{3})$}
\includegraphics[width=.8\paperwidth]{images/lab3.jpg}
\end{frame}
\begin{frame}{ATLAS Unleashed---All $\mathcal{O}(n^{3})$\hfill\tiny{Image source: Richard Vuduc's CSE6230}}
\includegraphics[width=.8\paperwidth]{images/vuduc.jpg}
\end{frame}
\begin{frame}{RAPTORIAL-file\hfill\tiny{Image source: Dirty South Supercomputing and Waffles}}
\begin{center}
In the pure systems space, $\mathcal{O}$ makes still less sense.\\
What's $\mathcal{O}$ of multithreaded file lexing?\\
\includegraphics[width=.675\paperwidth]{images/raptorial-file-fullsize.png}
\end{center}
\end{frame}
\begin{frame}{RAPT-show-version\hfill\tiny{Image source: Dirty South Supercomputing and Waffles}}
\begin{center}
Nonetheless, optimization can be very fruitful.
\includegraphics[width=.675\paperwidth]{images/rapt-show-versions-fullsize.png}
\end{center}
\end{frame}
{
\setbeamertemplate{background canvas}{%
\includegraphics[scale=.5]{images/perf1.jpg}
}%
\begin{frame}[b]{Analysis-driven optimization I}
\begin{block}{High-level (``Macroanalysis'')}
Coarse tools and algorithmic reasoning, e.g.:
\begin{itemize}
\item Ensure sufficient task-level parallelism
\item Ensure cores aren't overutilized
\item Profiler-driven hotspot location
\item High-level memory and I/O flow
\end{itemize}
\end{block}
\end{frame}
}
{
\setbeamertemplate{background canvas}{%
\includegraphics[width=\paperwidth]{images/cudaprof.png}
}%
\begin{frame}[b]{Analysis-driven optimization II}
\begin{block}{Low-level (``Microanalysis'')}
Performance counters and throughput-based reasoning, e.g.:
\begin{itemize}
\item Vectorize
\item Tune for caches
\item Watch for $\upmu$architectural stalls
\end{itemize}
\end{block}
\end{frame}
}
\begin{frame}{Complexity finds a way}
\begin{center}
\includegraphics[width=\textwidth]{images/peano.jpg}\\
\tiny{A fractal having Hausdorff dimension 2,\\
the space-filling Peano curve is used in cache-oblivious algorithms.}
\end{center}
\vfill
\large{\textbf{Mastering the complex modern design space requires deftly
switching between theoretical analysis, guided experiment, and pure
exploration.\\
\vspace{.25in}
This class is about doing so.}}
\end{frame}
{
\setbeamertemplate{background canvas}{%
\includegraphics[width=\paperwidth]{images/bigohw1.png}
}%
\begin{frame}{Insert 1/3}
\end{frame}
}
{
\setbeamertemplate{background canvas}{%
\includegraphics[width=\paperwidth]{images/bigohw2.png}
}%
\begin{frame}{Insert 2/3}
\end{frame}
}
{
\setbeamertemplate{background canvas}{%
\includegraphics[width=\paperwidth]{images/bigohw3.png}
}%
\begin{frame}{Insert 3/3}
\end{frame}
}
%In terms of pure growth, we have $ijk$, so we can set
%\begin{equation}
%n=\frac{i}{3} + \frac{j}{3} + \frac{k}{3}
%\end{equation}
%and this is a very accurate analysis.
%Do you see $n$?\\
%\vfill
%I see $k$ additions of $k$ multiplications, done $i * j$ times. So $n$ is the
%average of our three sizes. But really, $k$ contributes twice as much as either
%$i$ or $j$. So
%\begin{equation}
%n = \frac{k}{2} + \frac{i}{4} + \frac{j}{4}
%\end{equation}
%
%naively, a bit at a time on $b$-bit inputs, I see $ijk\Theta(b) + ijk\mathcal{O}(b^{2})
%\implies ijk\mathcal{O}(b^{2})$. Fix $b$. I see $ijkb + ijkb^{2}$. What, we can add
%$b$-bit words in a single operation? I see $ijk + ijkb$
%\footnote{\tiny{Note that whatever $b$ we pick, so long as we pick a finite $b$, the
%asymptotic analysis holds---whether $b$ or $b^{2}$, it's just a constant factor
%as our problem grows. Accurately multiplying matrices of fixed size, but having arbitrarily
%large elements, is a different analysis.}}
%. Ho ho, we have a hardware
%multiplier? $ijk + ijk$ it is!
\begin{frame}{Recommended reading}
\tiny{
\begin{itemize}
\item Hong and Kung. ``I/O complexity: The red-blue pebble game'' (1981).
\item Yotov et al. ``An experimental comparison of cache-oblivious and cache-conscious programs'' (2007).
\item Irony et al. ``Communication Lower Bounds for Distributed-Memory Matrix Mul'' (2004).
\item Goto et al. ``Anatomy of High-Performance Matrix Multiplication'' (2008).
\item Kalyanasundaram et al. ``Improved Simulation of NTMs'' (2011).
\item Fran\c{c}ois Le Gall. ``Faster Algorithms for Rectangular Matrix Multiplication'' (2012).
\item Eric Quinnell. ``Floating-Point Fused Multiply-Add Architectures'' (2007).
\item Tom Leighton. ``Better Master Theorems for Divide-and-Conquer Recurrences'' (1996).
\item \textit{Intel Instruction Set Extensions Programming Reference}
\end{itemize}
}
\end{frame}
%\begin{frame}{Hack on!}
%\end{frame}
\end{document}