\documentclass[a4paper,12pt,oneside]{report}
\usepackage[T1]{fontenc} \usepackage{lmodern} \usepackage[utf8]{inputenc}
\usepackage[english]{babel} \usepackage{csquotes}
\usepackage{float} \usepackage{graphicx,subcaption}
\usepackage{amssymb,amsmath} \usepackage{siunitx}
\usepackage[style=numeric,backend=biber]{biblatex} \bibliography{refs}
\usepackage[top=4cm,bottom=4cm,right=3.5cm,left=3.5cm,headheight=15pt]{geometry}
%\usepackage[top=4cm,bottom=4cm,outer=3cm,inner=4cm]{geometry}
\usepackage{fancyhdr} \pagestyle{fancy} %\usepackage{lastpage}
%\usepackage{parskip} \setlength{\parindent}{0em} \setlength{\parskip}{1em}
\usepackage{fixltx2e} % For \textsubscript{}.
\usepackage[colorlinks=true,allcolors=black]{hyperref} \hypersetup{
pdfauthor={Michaël Defferrard},
pdftitle={Structured Auto-Encoder},
pdfsubject={Master Thesis in Information Technologies}
}
\fancyhead[L]{\nouppercase{\leftmark}}
\fancyhead[R]{}
\fancyfoot[C]{\thepage}
%\lhead{Audio Classification with Structured Deep Learning} \chead{} \rhead{}
%\cfoot{-- \thepage --}
\usepackage[acronym]{glossaries} \makeglossaries
\newacronym{EPFL}{EPFL}{École Polytechnique Fédérale de Lausanne}
\newacronym{ICASSP}{ICASSP}{International Conference on Acoustics, Speech and Signal Processing}
\newacronym{CS}{CS}{Compressed Sensing}
\newacronym{ANNs}{ANNs}{Artificial Neural Networks}
\newacronym{CNN}{CNN}{Convolutional Neural Network}
\newacronym{RNN}{RNN}{Recurrent Neural Network}
\newacronym{LSTM}{LSTM}{Long Short-Term Memory}
\newacronym{MLP}{MLP}{Multi-Layer Perceptron}
\newacronym{RBM}{RBM}{Restricted Boltzmann Machine}
\newacronym{DBN}{DBN}{Deep Belief Network}
\newacronym{MIR}{MIR}{Music Information Retrieval}
\newacronym{MGR}{MGR}{Music Genre Recognition}
\newacronym{MIREX}{MIREX}{Music Information Retrieval Evaluation eXchange}
\newacronym{PSD}{PSD}{Predictive Sparse Decomposition}
\newacronym{CQT}{CQT}{Constant-Q Transform}
\newacronym{LCN}{LCN}{Local Contrast Normalization}
\newacronym{SVM}{SVM}{Support Vector Machine}
\newacronym{RIP}{RIP}{Restricted Isometry Property}
\newacronym{MP}{MP}{Matching Pursuit}
\newacronym{BP}{BP}{Basis Pursuit}
\newacronym{FISTA}{FISTA}{Fast Iterative Shrinkage-Thresholding Algorithm}
\newacronym{ADMM}{ADMM}{Alternating Direction Method of Multipliers}
\newacronym{PD}{PD}{Primal-Dual}
\newacronym{SGD}{SGD}{Stochastic Gradient Descent}
\newacronym{GPU}{GPU}{Graphics Processing Unit}
\newacronym{LASSO}{LASSO}{Least Absolute Shrinkage and Selection Operator}
\newacronym{AGC}{AGC}{Automatic Gain Control}
\newacronym{kNN}{kNN}{k-Nearest Neighbors}
\newacronym{PCA}{PCA}{Principal Component Analysis}
\newacronym{NLDR}{NLDR}{Non-Linear Dimensionality Reduction}
\newacronym{SOM}{SOM}{Self-Organizing Map}
\newacronym{LLE}{LLE}{Locally-Linear Embedding}
\newacronym{OLS}{OLS}{Ordinary Least Squares}
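% Usage sketch for the acronyms above (hypothetical sentence): \gls{...}
% prints the long form followed by the short form on first use, and the
% short form alone afterwards. For instance,
%   A \gls{CNN} is typically trained with \gls{SGD}.
% typesets on first use as ``A Convolutional Neural Network (CNN) is
% typically trained with Stochastic Gradient Descent (SGD).''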
\newcommand{\partref}[1]{Part~\ref{part:#1}}
\newcommand{\chapref}[1]{Chapter~\ref{chap:#1}}
\newcommand{\secref}[1]{Section~\ref{sec:#1}}
\newcommand{\eqnref}[1]{(\ref{eqn:#1})} % Eqn~(\ref{#1})
\newcommand{\figref}[1]{Figure~\ref{fig:#1}}
\newcommand{\tabref}[1]{Table~\ref{tab:#1}}
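% Usage sketch (hypothetical labels and numbers): given \label{sec:model}
% and \label{eqn:z_exact} in the body, \secref{model} typesets as
% ``Section~2.3'' and \eqnref{z_exact} as ``(2.7)''.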
\newcommand{\HRule}{\rule{\linewidth}{0.5mm}}
\newcommand{\R}{\mathbb{R}}
\DeclareMathOperator*{\argminop}{arg\,min}
\DeclareMathOperator*{\minimizeop}{minimize}
\DeclareMathOperator*{\proxop}{prox}
\DeclareMathOperator*{\sign}{sign}
\DeclareMathOperator*{\tr}{tr}
\DeclareMathOperator*{\spanning}{span}
\newcommand{\argmin}[1]{\argminop\limits_{#1}}
\newcommand{\minimize}[1]{\minimizeop\limits_{#1} \ } % or \min
\newcommand{\prox}[1]{\text{prox}_{#1}}
\newcommand{\normT}[1]{\| #1 \|_2^2}
\newcommand{\normO}[1]{\| #1 \|_1}
\newcommand{\normZ}[1]{\| #1 \|_0}
\newcommand{\normF}[1]{\| #1 \|_\text{F}^2}
\newcommand{\inner}[2]{\langle #1 , #2 \rangle}
\renewcommand{\L}{\mathbf{L}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\A}{\mathbf{A}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\E}{\mathbf{E}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\Z}{\mathbf{Z}}
\renewcommand{\d}{\mathbf{d}}
\newcommand{\e}{\mathbf{e}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\newcommand{\z}{\mathbf{z}}
\newcommand{\eps}{\boldsymbol{\epsilon}} % \mathbf has no effect on lowercase Greek letters.
\newcommand{\gam}{\mathbf{\Gamma}}
\newcommand{\Eone}{E_1(\Z, \D)}
\newcommand{\Etwo}{E_2(\Z, \D)}
\newcommand{\Ethree}{E_3(\Z, \D, \E)}
\newcommand{\Xx}{\mathcal{X}}
\newcommand{\G}{\mathcal{G}}
\newcommand{\V}{\mathcal{V}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\set}[2]{\{#1_i\}_{i=1}^#2}
\newcommand{\st}{\text{ s.t. }}
\newcommand{\cst}[2]{\|#1_#2\|_2 \leq 1}
%\newcommand{\forallx}[2]{\forall #1 \in \{1,\ldots,#2\}}
\newcommand{\forallx}[2]{#1 = 1, \ldots, #2}
%\newcommand{\forallx}[2]{\|#1_#2\|_2 \leq 1 \text{ for } #2 = 1,\ldots,#3}
\newcommand{\cstd}{\cst{\d}{i}, \ \forallx{i}{m}}
\newcommand{\cste}{\cst{\e}{k}, \ \forallx{k}{n}}
\newcommand{\pd}[2]{\frac{\partial #1}{\partial #2}}
\newcommand{\fd}{f_d(\Z, \D)}
\newcommand{\fz}{f_z(\Z)}
\newcommand{\fg}{f_g(\Z)}
\newcommand{\fe}{f_e(\Z, \E)}
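% Illustration of how the shorthands above compose (a hypothetical
% sparse-coding energy for demonstration only; the actual definitions of
% \fd, \fz, \fg and \fe are given in algorithm.tex):
%   \begin{equation*}
%   \minimize{\Z,\D} \normF{\X - \D\Z} + \lambda \normO{\Z} \st \cstd
%   \end{equation*}
% expands to ``minimize over Z and D the energy ||X - DZ||_F^2 +
% lambda ||Z||_1 subject to ||d_i||_2 <= 1 for i = 1, ..., m''.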
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}
%\hypersetup{pageanchor=false}
\begin{titlepage}
\includegraphics[height=2.5cm]{img/logo_epfl}
\hfill
\includegraphics[height=2.5cm]{img/logo_lts2}
\vspace{1.2cm}
\begin{center}
\textsc{\LARGE MASTER THESIS}\\
\vspace{0.5cm}
\large Electrical and Electronic Section\\
\large Major in Information Technologies\\
\vspace{0.8cm}
\HRule
\vspace{0.9cm}
\textsc{\Huge Structured Auto-Encoder}\\
\vspace{0.5cm}
\textsc{\Large with application to}\\
\vspace{0.2cm}
\textsc{\Large Music Genre Recognition}\\
\vspace{0.6cm}
\HRule
\vspace{1.3cm}
\begin{minipage}{0.45\textwidth}
\begin{flushleft} \large
\textbf{Student}\\ \vspace{0.5ex}
Michaël \textsc{Defferrard} \\ \vspace{2.5ex}
\textbf{Professor} \\ \vspace{0.5ex}
Pierre \textsc{Vandergheynst}
\end{flushleft}
\end{minipage}
\begin{minipage}{0.45\textwidth}
\begin{flushright} \large
\textbf{Supervisors} \\ \vspace{0.5ex}
Xavier \textsc{Bresson} \\ \vspace{0.2ex}
Johan \textsc{Paratte}
\end{flushright}
\end{minipage}
\vspace{1.5cm}
\large EPFL LTS2 Laboratory\\
\vspace{0.8cm}
\large June 19, 2015
\end{center}
\end{titlepage}
%\hypersetup{pageanchor=true}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% No new page for the abstract in report class.
\def\abstract{
\vfil
\begin{center}
{\bfseries \abstractname\vspace{-.5em}}
\end{center}
\quotation
}
\def\endabstract{\par
\endquotation
}
\pagestyle{empty}
\begin{abstract}
In this work, we present a technique that learns discriminative audio features for Music Information Retrieval (MIR). The novelty of the proposed technique is to design auto-encoders that make use of the structure of the data to learn enhanced sparse data representations. The notion of structure is borrowed from the field of Manifold Learning: the data are assumed to be sampled from smooth manifolds, which are here represented by graphs of proximities between the input data points. As a consequence, the proposed auto-encoder finds sparse data representations that are robust to perturbations. The model is formulated as a non-convex optimization problem. It can however be decomposed into convex sub-optimization problems, for which well-posed iterative schemes are provided in the context of the Fast Iterative Shrinkage-Thresholding Algorithm (FISTA) framework. Our numerical experiments show two main results. First, our graph-based auto-encoders improve the classification accuracy by 2\% over auto-encoders without graph structure on the popular GTZAN music dataset. Second, our model is significantly more robust: it is 8\% more accurate than the standard model in the presence of 10\% perturbations.
\end{abstract}
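% For reference, the generic FISTA iteration mentioned in the abstract (a
% sketch of the standard scheme of Beck and Teboulle, not the exact
% updates derived in the optimization chapter), for minimizing f + g with
% f smooth with L-Lipschitz gradient and g = \lambda \normO{\cdot}:
%   \z^k = \prox{g/L} \left( \y^k - \tfrac{1}{L} \nabla f(\y^k) \right),
%   t_{k+1} = \tfrac{1}{2} \left( 1 + \sqrt{1 + 4t_k^2} \right),
%   \y^{k+1} = \z^k + \tfrac{t_k - 1}{t_{k+1}} (\z^k - \z^{k-1}).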
\renewcommand{\abstractname}{Acknowledgements}
\begin{abstract}
I would first like to thank Dr. Xavier Bresson for his dedicated day-to-day supervision, which included a weekly meeting during which we discussed the achieved results and debated new ideas. He further provided me with many inputs, intuitions and references, which were very helpful in clarifying some concepts. Finally, I want to thank him for his review of parts of this manuscript.

I would also like to thank Johan Paratte for his helpful intuitions at the beginning of the project and his advice on how to structure the thesis.

Finally, I would like to thank Prof. Pierre Vandergheynst, who made this very interesting project possible. Thanks to him and this project, I have learned much more than what this thesis contains.
\end{abstract}
\tableofcontents
\pagestyle{fancy}
%\printglossaries
%\addcontentsline{toc}{chapter}{Glossary}
%\addcontentsline{toc}{chapter}{Acronyms}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter*{Introduction}
\addcontentsline{toc}{chapter}{Introduction}
%{\color{red} Introduction to the matter.}
%\chapter{Machine learning}
% What's the purpose of learning.
% Why learned features instead of hand-crafted features ?
%\section{Supervised vs unsupervised}
% Explain the difference. Why do we want unsupervised ?
% What will be presented. Design goal.
This thesis introduces a \textit{structured auto-encoder}, an auto-encoder variant which preserves the structure of the data while transforming it into a sparse representation. That is, it learns sparse representations that explicitly take into account the local manifold structure of the data. The primary goal of the proposed algorithm is unsupervised representation learning for feature extraction, while being robust to noisy data and fast at extracting features (after the training phase). As the discriminative power of learned representations cannot be evaluated directly, the proposed algorithm shall be evaluated through a classification task. We propose an application in \gls{MIR} which consists in retrieving the genre of unknown musical clips.
%As we shall see, it has the properties of a hybrid between a sparse auto-encoder \cite{lecun2006sparseAutoencoders, ranzato2007stackedSparseAutoencoders} and a denoising auto-encoder \cite{bengio2008denoisingAutoencoders}.
% Plan.
This thesis report is divided into two parts. In \partref{algorithm},
%the proposed model is introduced by a motivation in \chapref{motivation}
some background information is given in \chapref{background}, and the proposed model is constructed step by step in \chapref{model}, which is the core of this work. Well-posed iterative schemes to solve the resulting convex sub-optimization problems are then proposed in \chapref{optimization}.% while \chapref{related_work} compares it to existing models.
As a testbed for our algorithm's performance, an application to \gls{MGR} is proposed in \partref{application}, with a presentation of the application in \chapref{MGR} and of the proposed system in \chapref{system}.
% along with some implementation details in \chapref{implementation}.
\chapref{results} is dedicated to the presentation of the experimental results.
This work was accomplished as a Master thesis, which is an integral part of the curriculum of the major in Information Technologies of the Electrical and Electronic Section of the \gls{EPFL}. It was conducted at the LTS2 laboratory\footnote{The laboratory homepage is accessible at \url{http://lts2www.epfl.ch/}.}.
While the project was ongoing, I continuously published my thoughts, observations, findings, experiments, results and plans, as well as summaries of the weekly meetings with my supervisors, on an online blog\footnote{My research blog is available at \url{https://lts2research.epfl.ch/blog/mdeff/}.}, in the format of an open laboratory notebook. In the spirit of open research, the code as well as all the results, this report and the ongoing paper are versioned with git and available online through GitHub\footnote{My GitHub account can be found at \url{https://github.com/mdeff}.}. A continuation of this work shall be submitted to the 41st IEEE \gls{ICASSP}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\input{algorithm}
\input{application}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter*{Conclusion}
\addcontentsline{toc}{chapter}{Conclusion}
% Discussion of our results. Strengths and weaknesses of our method.
In this work we introduced a novel auto-encoder architecture whose main characteristic is to learn sparse and structured representations of the input data, borrowing ideas from sparse coding and manifold learning. Precisely, it learns sparse representations that explicitly take into account the local manifold structure of the data. The experimental results on music genre recognition showed that our model extracts more discriminative features than sparse auto-encoders.
%\section*{Future work}
%\addcontentsline{toc}{section}{Future work}
% Split future work on the algorithm and on the application ?
% Only about algorithm ?
%\paragraph{Direct encoder.}
% Verify the hypothesis
%Our model's third assumption about the existence of a direct encoder (\secref{assumptions}) has to be verified experimentally. It remains to be shown that the features extracted with the approximate scheme \eqnref{z_direct} are equivalent to those extracted with the exact scheme \eqnref{z_exact}. Previous work in the direction however suggests that this assumption is reasonable and that such an encoder should exist.
%\paragraph{TV norm.}
% TV norm on the graph instead of Dirichlet energy
%\paragraph{Distributed implementation.}
% Details ?
% Train chunks of D / Z via independent threads.
% Directly in the PyUNLocBoX ?
% Useful only if we are not memory bandwidth limited.
% Why ATLAS does not do it by default ? OpenBLAS does it.
% Move to GPU.
%\paragraph{Multiple layers.}
%In the spirit of stacked auto-encoders
% Multiple layers of auto-encoders
% In spite of deep learning methods
% Reference to DBN: stacks of RBM, Stacked auto-encoders.
%\subsection{Deep learning}
% Rebranded as deep learning, achieve now astonishing results in a variety of fields (computer vision, NLP, machine translation, knowledge representation, automatic control)
% Deep learning: hierarchical feature learning --> we are not really using that (yet)
% Deep architectures can approximate any function (RNN any program, i.e. Turing complete)
% First fall of NN (success of kernel methods / SVM) (first AI winter ?) because perceptron was proved to not be sufficient, multi-layer was too hard (computation and training)
% Computationally expensive to train
% Needed to invent backpropagation to train multiple layers
% Hard to train: problem of vanishing gradient
% Objective function is highly non-convex.
% popularized by CNN.
%\paragraph{End-to-end learning.}
% For the application, not the algorithm.
% Learn features right from raw audio. If spectrograms are good features, we should retrieve them.
%\paragraph{Domain knowledge.}
% For the application, not the algorithm.
% Incorporate some domain knowledge about the dictionary via priors. Then it's not supervised anymore.
% Priors on the dictionary who are not necessarily domain specific, i.e. are general enough to apply to other kind of data.
\addcontentsline{toc}{chapter}{References}
\printbibliography
%\chapter*{Annexes}
%\addcontentsline{toc}{chapter}{Annexes}
%\section*{Conference paper}
%\addcontentsline{toc}{section}{Conference paper}
% Draft of ISMIR / ICASSP paper
%\section*{Code}
%\addcontentsline{toc}{section}{Code}
% IPython notebooks exported as LaTex.
% Available on GitHub.
\end{document}