% Commit bb3d03d8 authored by Antonin Dudermel
%
% adding resources from an old presentation
%
% parent 13e031e7
% !TEX TS-program = pdflatex
% !TEX encoding = UTF-8 Unicode
\documentclass[slidetop,11pt]{beamer}
%\usepackage[french]{babel}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{amsmath,amsfonts,amssymb, amsthm}
\usepackage{hyperref}
\usepackage{graphicx}
\usepackage{subfig}
\usepackage{shadethm}
\usepackage{colortbl,xcolor}
% \usepackage{listings}
\usepackage{eurosym}
\usepackage{multirow}
\usepackage{pgfplots}
\usepackage{cancel}
\pgfplotsset{compat=newest}
\usepackage{tikz}
\usetikzlibrary{calc, patterns, arrows, shapes, shapes.geometric, decorations.markings, positioning, shadows, shapes.gates.logic.US}
\input{./2020-ICERM/Figures/fpexparch.tex}
\input{./2020-ICERM/defs}
\usetheme{metropolis}
% \usecolortheme{beaver}
% Deck-wide override: every \textsc{...} is typeset in italics instead of small caps.
\renewcommand{\textsc}[1]{\textit{#1}}
% \switch{A}{B}: show A only on overlay 1 and B only on overlay 2 of the current frame.
\newcommand{\switch}[2]{\only<1>{#1}\only<2>{#2}}
\title{Report: Floating-Point Exponential for DSP-Enabled FPGA}
\subtitle{}
\author{De Dinechin \and Pasca}
\date{}
% Neutralise \@@magyar@captionfix by making it an alias of \relax (a no-op).
% \makeatletter/\makeatother are needed because the macro name contains '@'.
% NOTE(review): presumably a workaround for a caption-related error when
% subfig is loaded under beamer --- confirm it is still needed.
\makeatletter
\let\@@magyar@captionfix\relax
\makeatother
\begin{document}
%\theoremstyle{plain}
% Shaded theorem-like environments (shadethm package): defin, theo and prop,
% printed as "Definition", "Theorem" and "Property" respectively.
\newshadetheorem{defin}{Definition}
\newshadetheorem{theo}{Theorem}
\newshadetheorem{prop}{Property}
\frame{\titlepage}
\begin{frame}{Table of contents}
\tableofcontents
\end{frame}
\section{Fully-Parametric last-bit-accurate arithmetic core generator}
\label{sec:fully-param-last}
\begin{frame}{Arithmetic on CPU \only<2>{VS FPGA}}
% \begin{columns}
% \begin{column}{0.5\textwidth}
% Arithmetic with CPU must
% \begin{itemize}
% \item rely on a fixed circuit
% \item be versatile
% \item have a fixed format
% \end{itemize}
% Software model
% \end{column}
% \pause
% \begin{column}{0.5\textwidth}
% Arithmetic with FPGA can
% \begin{itemize}
% \item be reprogrammed at no cost
% \item be very specific
% \item have any format
% \end{itemize}
% Circuit model
% \end{column}
% \end{columns}
Arithmetic on CPU:
\begin{itemize}
\item one once-for-all fixed circuit
\begin{itemize}
\item versatile format
\item common operators
\end{itemize}
\item problems
\begin{itemize}
\item overaccurate/noise computations
\item software emulation of specific operators
\end{itemize}
\end{itemize}
\pause
Arithmetic on FPGA: \(\approx 10\times\) slower, but
\begin{itemize}
\item fully-reprogrammable circuit
\begin{itemize}
\item very specific formats
\item very specific operators
\end{itemize}
\item opportunities
\begin{itemize}
\item exactly the accuracy needed
\item circuit-based model
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}{FPGA thinking}
FPGAs are not only reprogrammable circuits; they also include
\begin{itemize}
\item small memories (16kB)
\item small FMA (DSP blocks)
\item optimizations for adders
\end{itemize}
A great advantage for arithmetic
\end{frame}
\begin{frame}{Overwhelming Freedom}
\begin{figure}
\parbox{0.48\textwidth}{\centering\includegraphics[scale=0.1]{interruptor}\caption{float/double}}\hfill
\parbox{0.48\textwidth}{\centering\includegraphics[scale=0.8]{console}\caption{anything you want}}
\label{fig:ux}
\end{figure}
Infinite number of formats \(\Longrightarrow\) infinite number of circuits.

Circuit \(\Longrightarrow\) circuit generator, parameterized by format.
\end{frame}
\begin{frame}{FloPoCo}{}
{\hfill\includegraphics[scale=0.4]{FloPoCoSmall}\hfill}
An open-source arithmetic core generator that is
\begin{itemize}
\item Fully parameterized in precision
\item Accurate to last bit
\end{itemize}
\end{frame}
\section{Floating-Point exponential}
\label{sec:float-point-expon}
\begin{frame}{A well-known mathematical function}
\[
\begin{array}{rcl}
\mathbb{R}&\rightarrow&\mathbb{R}\\
x&\mapsto&e^x \uncover<2>{= 2^{x/\ln(2)} = 1 + x + \frac{x^2}{2} + R_2(x)}
\end{array}
\]
\begin{itemize}
\item is strictly increasing
\item is bijective
\item is \(\mathcal{C}^\infty\)
\item is a group homomorphism
\item has limits in \(-\infty\) and \(+\infty\)
\item (is its own derivative)
\end{itemize}
\end{frame}
\begin{frame}
\centering
\scalebox{0.5}{
\begin{tikzpicture}[x=1ex, y=1ex]
\fill[color=cyan, opacity=0.7] (4, 10) rectangle (40, -10);
\fpexparch{1}
\end{tikzpicture}
}
\end{frame}
\begin{frame}{Easy parts}
\begin{center}
\begin{tikzpicture}[scale=0.5]
\draw[->] (-4.5,0) -- (2.5,0) node[right] {$x$};
\draw[->] (0,-0.5) -- (0,5) node[above] {$e^x$};
\draw[color=blue,domain=-4:2,samples=100] plot (\x,{exp(\x)});
\fill[color=blue, opacity=0.3] (-2, 0.1353) rectangle (-4.5, -0.5);
\fill[color=blue, opacity=0.3] (1.5,4.4817) rectangle (2.5,7.5);
\uncover<2->{
\fill[color=red,opacity=0.3] (-0.2, 0.8) rectangle (0.2, 1.2);
}
\end{tikzpicture}
\end{center}
\only<1>{
As \(\exp\) is increasing and grows really fast, under/overflows
\begin{itemize}
\item happen very often
\item are easy to detect
\end{itemize}}
\only<2>{
We also have \(e^0 = 1\), so for small inputs \(\epsilon\), \(\mathsf{RTN}(e^\epsilon) = 1\)
}
\only<3>{
All interesting inputs of reasonable size \(\to\) exponent is useless
}
\end{frame}
\begin{frame}
\centering
\scalebox{0.5}{
\begin{tikzpicture}[x=1ex, y=1ex]
\fill[color=cyan, opacity=0.7] (10, -12) rectangle (45, -38);
\fpexparch{1}
\end{tikzpicture}
}
\end{frame}
\begin{frame}{Computing the exponent}
With the formula \(e^X = 2^{X/\ln(2)}\) we could compute: \[E = \lfloor X/\ln(2)\rfloor, \quad Y = X - E\ln(2)\]
and have \[e^X = 2^E e^Y \]
with \[0 \le Y \le \ln(2)\]
\pause
\begin{itemize}
\item \(0 \le Y\) costs a lot
\item \( Y\le \ln(2) \) is not very interesting
\end{itemize}
\end{frame}
\begin{frame}{Computing roughly the exponent}
One can be satisfied by an approximation \(E\) of the exponent, provided that
\begin{itemize}
\item \(Y = X - E\ln(2)\) is accurate: \(e^X = 2^E e^Y \) will still be accurate
\item the error can easily be corrected downstream
\end{itemize}
\[E = \lfloor X/\ln(2)\rceil, Y = X - E\ln(2)\]
with
\[|Y| \le 1/2\]
\end{frame}
\begin{frame}
\centering
\scalebox{0.5}{
\begin{tikzpicture}[x=1ex, y=1ex]
\uncover<2>{\fill[color=cyan, opacity=0.7] (10, -37) rectangle (50, -78);}
\fpexparch{1}
\end{tikzpicture}
}
\end{frame}
\begin{frame}{The exponential, at last}
Have it done in advance by software and put every possible result in a big table.
What if the table is too big for the FPGA?
\pause
Use divide and conquer: split \(Y\) into its upper and lower bits,
\(Y = A + Z \text{ with } Z < 2^{-k} \Rightarrow e^Y = e^{A+Z} = e^A\times e^Z = e^A(e^Z - 1) + e^A\)
\pause
What if the table is too big for the FPGA?
\pause
Compute \(e^Z - 1\) as \([e^Z - Z - 1] + Z\), which needs fewer input bits.
\pause
What if the table is too big for the FPGA?
\pause
Use a polynomial approximator (FloPoCo primitive)
\end{frame}
\begin{frame}
\centering
\scalebox{0.5}{
\begin{tikzpicture}[x=1ex, y=1ex]
\fpexparch{1}
\end{tikzpicture}
}
\end{frame}
\section{Accuracy Proofs}
\label{sec:accuracy-proofs}
\begin{frame}{The old model}
Method
\begin{itemize}
\item Estimate the accuracy of each component in terms of ulps
\item Adding guard bits makes the ulp smaller
\item Deduce the number of guard bits needed to ensure accuracy
\end{itemize}
\pause
Problems
\begin{itemize}
\item More speech than maths
\item Error amplification is harder to explain in words than to write as equations
\item Error-prone
\item Barely systematic
\end{itemize}
\end{frame}
\begin{frame}{The new model}
\begin{columns}
\begin{column}{0.5\textwidth}
First step:
\begin{align*}
\delta_M &= M - e^Y \\
&= M - e^Ae^Z \\
&= M - Te^Z + Te^Z - e^Ae^Z\\
&= M - Te^Z + (T - e^A)e^Z \\
&= M - Te^Z + \delta_T e^Z
\end{align*}
\end{column}
\begin{column}{0.5\textwidth}
\centering
\scalebox{0.5}{
\begin{tikzpicture}[x=1ex, y=1ex]
\fpexparch{1}
\end{tikzpicture}
}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Testing}
\begin{center}
\begin{tikzpicture}
\node (flopoco) at (0,0) {FloPoCo};
\node (mpfr) at (3,2) {MPFR};
\node (exe) at (7,2) {EXE};
\node (vhdl) at (3,-2) {VHDL};
\node (gtkw) at (7,-2) {GTKWave};
\node (tc) at (5, 0) {test cases};
\node (diff) at (9,0) {\(\delta\)};
\draw[->] (flopoco) -- node[above left] {emulate} (mpfr);
\draw[->] (mpfr) -- (exe);
\draw[->] (exe) -- (diff);
\draw[->] (flopoco) -- node[below left] {generate} (vhdl) ;
\draw[->] (vhdl) -- (gtkw);
\draw[->] (gtkw) -- (diff);
\draw[->] (flopoco) -- node[above] {build} (tc);
\draw[->] (tc) -- (exe);
\draw[->] (tc) -- (gtkw);
\end{tikzpicture}
\end{center}
\end{frame}
\begin{frame}{Proof assistant and automatic prover}
FloPoCo operators rely mostly on handmade proofs.
Proof assistants and automatic provers are
\begin{itemize}
\item either too complex to express FloPoCo in them
\item or too simple to handle parameterized numeric formats
\end{itemize}
Best of both worlds: generate a proof per circuit
\end{frame}
\begin{frame}{FloPoCo in FloPoCo}
Create a handful of carefully proved primitives in FloPoCo and reuse them
in the rest of the code.

This is already the case for BitHeaps, which can handle \(n\)-ary additions (and much more):
they are used in multiplications by constants, which are in turn used in polynomial evaluation.
\end{frame}
\begin{frame}
\centering
\scalebox{0.5}{
\begin{tikzpicture}[x=1ex, y=1ex]
\fill[color=cyan, opacity=0.7] (25, -52) rectangle (50, -78);
\fpexparch{1}
\end{tikzpicture}
}
\end{frame}
\begin{frame}{Conclusion}
{\hfill\includegraphics[scale=0.4]{FloPoCoSmall}\hfill}
FloPoCo:
\begin{itemize}
\item Fully parameterized in precision
\item Accurate to last bit
\end{itemize}
Leads to small and efficient operators.

Proofs remain tedious.
\end{frame}
\end{document}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment