<TeXmacs|1.99.7>

<style|<tuple|ieeetran|std-latex>>

<\body>
  <\hide-preamble>
    <new-theorem|prop|Proposition>

    <new-theorem|rem|Remark>

    <assign|algorithmicrequire|<macro|<with|font-series|bold|Input:>>>

    <assign|algorithmicensure|<macro|<with|font-series|bold|Output:>>>

    <assign|vect|<macro|1|<with|math-font-series|bold|<arg|1>>>>

    <assign|mat|<macro|1|<with|math-font-series|bold|<arg|1>>>>

    <assign|E|<macro|1|<math-ss|E><left|{><arg|1><right|}>>>

    <assign|cov|<macro|1|<math-up|cov><left|{><arg|1><right|}>>>

    <assign|Tr|<macro|1|<math-up|Tr><left|{><arg|1><right|}>>>

    <assign|ex|<macro|1|exp <left|{><arg|1><right|}>>>

    <assign|range|<macro|1|\<cal-R\><left|(><arg|1><right|)>>>

    <assign|etr|<macro|1|<math-up|etr><left|{><arg|1><right|}>>>

    <assign|diag|<macro|1|<math-up|diag><left|(><arg|1><right|)>>>

    <assign|bc|<macro|<vect|c>>>

    <assign|bd|<macro|<vect|d>>>

    <assign|q|<macro|<vect|q>>>

    <assign|bs|<macro|<vect|s>>>

    <assign|x|<macro|<vect|x>>>

    <assign|y|<macro|<vect|y>>>

    <assign|bm|<macro|<vect|m>>>

    <assign|A|<macro|<mat|A>>>

    <assign|B|<macro|<mat|B>>>

    <assign|C|<macro|<mat|C>>>

    <assign|D|<macro|<mat|D>>>

    <assign|Dbar|<macro|<wide|<mat|D>|\<bar\>>>>

    <assign|bE|<macro|<mat|E>>>

    <assign|F|<macro|<mat|F>>>

    <assign|Y|<macro|<mat|Y>>>

    <assign|Qy|<macro|<mat|Q><rsub|y>>>

    <assign|Ry|<macro|<mat|R><rsub|y>>>

    <assign|Qyorth|<macro|<mat|Q><rsup|\<perp\>><rsub|y>>>

    <assign|I|<macro|<mat|I>>>

    <assign|M|<macro|<mat|M>>>

    <assign|N|<macro|<mat|N>>>

    <assign|Null|<macro|<mat|Q><rsub|\<perp\>>>>

    <assign|bP|<macro|<mat|P>>>

    <assign|Pbar|<macro|<wide|<mat|P>|\<bar\>>>>

    <assign|Q|<macro|<mat|Q>>>

    <assign|R|<macro|<mat|R>>>

    <assign|iR|<macro|<mat|R><rsup|-1>>>

    <assign|Rs|<macro|<mat|R><rsub|s>>>

    <assign|iRs|<macro|<mat|R><rsub|s><rsup|-1>>>

    <assign|bS|<macro|<mat|S>>>

    <assign|T|<macro|<mat|T>>>

    <assign|bu|<macro|<vect|u>>>

    <assign|U|<macro|<mat|U>>>

    <assign|Ummsd|<macro|<wide|<mat|U>|^><rsub|mmsd>>>

    <assign|Ummsdlmbgh|<macro|<wide|<mat|U>|^><rsub|mmsd-LM-B>>>

    <assign|Ummsdlmvmf|<macro|<wide|<mat|U>|^><rsub|mmsd-LM-vMF>>>

    <assign|Ummsdcmbgh|<macro|<wide|<mat|U>|^><rsub|mmsd-CM-B>>>

    <assign|Ummsdcmvmf|<macro|<wide|<mat|U>|^><rsub|mmsd-CM-vMF>>>

    <assign|Umap|<macro|<wide|<mat|U>|^><rsub|map>>>

    <assign|Uorth|<macro|<mat|U><rsub|\<perp\>>>>

    <assign|Us|<macro|<mat|U><rsub|s>>>

    <assign|Ubar|<macro|<wide|<mat|U>|\<bar\>>>>

    <assign|Vbar|<macro|<wide|<mat|V>|\<bar\>>>>

    <assign|bv|<macro|<vect|v>>>

    <assign|w|<macro|<vect|w>>>

    <assign|X|<macro|<mat|X>>>

    <assign|z|<macro|<vect|z>>>

    <assign|Z|<macro|<mat|Z>>>

    <assign|bgamma|<macro|<vect|\<gamma\>>>>

    <assign|bGamma|<macro|<mat|\<Gamma\>>>>

    <assign|blambda|<macro|<vect|\<lambda\>>>>

    <assign|bLambda|<macro|<mat|\<Lambda\>>>>

    <assign|btheta|<macro|<vect|\<theta\>>>>

    <assign|bthetammsd|<macro|<wide|<vect|\<theta\>>|^><rsub|mmsd>>>

    <assign|varn|<macro|\<sigma\><rsub|n><rsup|2>>>

    <assign|ivarn|<macro|\<sigma\><rsub|n><rsup|-2>>>

    <assign|pdfN|<macro|1|2|\<cal-N\><left|(> <arg|1>,<arg|2><right|)>>>

    <assign|vMF|<macro|1|<math-up|vMF><left|(> <arg|1><right|)>>>

    <assign|Bgh|<macro|1|<math-up|B><left|(> <arg|1><right|)>>>

    <assign|BMF|<macro|1|2|3|<math-up|BMF><left|(>
    <arg|1>,<arg|2>,<arg|3><right|)>>>

    <assign|mB|<macro|1|2|3|4|<wide|<math-up|B>|~><left|(>
    <arg|1>,<arg|2>,<arg|3>,<arg|4><right|)>>>

    <assign|vBMF|<macro|1|2|<math-up|vBMF><left|(> <arg|1>,<arg|2><right|)>>>

    <assign|pdfU|<macro|1|2|\<cal-U\><left|(><left|[><arg|1>,<arg|2><right|]><right|)>>>

    <assign|pdfG|<macro|1|2|\<cal-G\><left|(><arg|1>,<arg|2><right|)>>>

    <assign|pdftG|<macro|1|2|3|4|\<cal-G\><rsub|t><left|(><arg|1>,<arg|2>,<arg|3>,<arg|4><right|)>>>

    <assign|oneFone|<macro|1|2|3|<space|0.17em><rsub|1>
    F<rsub|1><left|(><arg|1>,<arg|2>;<arg|3><right|)>>>

    <assign|zeroFone|<macro|1|2|<space|0.17em><rsub|0>
    F<rsub|1><left|(><arg|1>;<arg|2><right|)>>>
  </hide-preamble>

  <doc-data|<doc-title|Minimum mean square distance<next-line>estimation of a
  subspace>|<doc-author|<author-data|<author-name|Olivier
  Besson<rsup|<math|(1)>>, Nicolas Dobigeon<rsup|<math|(2)>> and Jean-Yves
  Tourneret<rsup|<math|(2)>><next-line><with|font-size|1|<rsup|<math|(1)>>
  University of Toulouse, ISAE, Department Electronics Optronics Signal,
  Toulouse, France<next-line><rsup|<math|(2)>> University of Toulouse,
  IRIT/INP-ENSEEIHT/TSA, Toulouse, France<next-line><with|font-size|0.84|font-family|tt|olivier.besson@isae.fr,{Nicolas.Dobigeon,Jean-Yves.Tourneret}@enseeiht.fr>>>>>|<doc-date|<date|>>>

  <abstract-data|<\abstract>
    We consider the problem of subspace estimation in a Bayesian setting.
    Since we are operating in the Grassmann manifold, the usual approach
    which consists of minimizing the mean square error (MSE) between the true
    subspace <math|<U>> and its estimate <math|<wide|<U>|^>> may not be
    adequate as the MSE is not the natural metric in the Grassmann manifold.
    As an alternative, we propose to carry out subspace estimation by
    minimizing the mean square distance (MSD) between <math|<U>> and its
    estimate, where the considered distance is a natural metric in the
    Grassmann manifold, viz. the distance between the projection matrices. We
    show that the resulting estimator is no longer the posterior mean of
    <math|<U>> but entails computing the principal eigenvectors of the
    posterior mean of <math|<U><U><rsup|T>>. Derivation of the MMSD estimator
    is carried out in a few illustrative examples including a linear Gaussian
    model for the data and a Bingham or von Mises Fisher prior distribution
    for <math|<U>>. In all scenarios, posterior distributions are derived and
    the MMSD estimator is obtained either analytically or implemented via a
    Markov chain Monte Carlo simulation method. The method is shown to
    provide accurate estimates even when the number of samples is lower than
    the dimension of <math|<U>>. An application to hyperspectral imagery is
    finally investigated.
  </abstract>>

  <new-page>

  <section|Problem statement>

  In many signal processing applications, the signals of interest do not span
  the entire observation space and a relevant and frequently used assumption
  is that they evolve in a low-dimensional subspace <cite|Scharf91>. Subspace
  modeling is accurate when the signals consist of a linear combination of
  <math|p> modes in an <math|N>-dimensional space, and constitutes a good
  approximation, for example, when the signal covariance matrix is close to
  rank-deficient. As a consequence, subspace estimation plays a central role
  in recovering these signals with maximum accuracy. A ubiquitous solution
  to this problem is to resort to the singular value decomposition (SVD) of
  the data matrix. The SVD emerges naturally as the maximum likelihood
  estimator in the classical model <math|<Y>=<U><bS>+<N>>, where <math|<Y>>
  stands for the <math|N\<times\>K> observation matrix, <math|<U>> is the
  (deterministic) <math|N\<times\>p> matrix, with <math|p\<less\>N>, whose
  columns span the <math|p>-dimensional subspace of interest, <math|<bS>> is
  the <math|p\<times\>K> (deterministic) waveform matrix and <math|<N>> is
  the additive noise. The <math|p> principal left singular vectors of
  <math|<Y>> provide very accurate estimates of a basis for the range space
  <math|\<cal-R\><around*|(|<U>|)>> of <math|<U>>, and have been used successfully, e.g., in estimating
  the frequencies of damped exponentials or the directions of arrival of
  multiple plane waves, see <cite|Kumaresan82|Kumaresan83> among others.
  However, the SVD can incur some performance loss in two main cases, namely
  when the signal-to-noise ratio (SNR) is very low and, therefore, the
  probability of a subspace swap or subspace leakage is high
  <cite|Thomas95|Hawkes01|Johnson08|Nadakuditi10>. A second case occurs when
  the number of samples <math|K> is lower than the subspace dimension
  <math|p>: indeed, <math|<Y>> is at most of rank <math|K> and information is
  lacking about how to complement <math|\<cal-R\><around*|(|<Y>|)>> in order to estimate <math|\<cal-R\><around*|(|<U>|)>>.

  Under such circumstances, a Bayesian approach might be helpful as it
  enables one to assist estimation by providing some statistical information
  about <math|<U>>. We investigate such an approach herein and assign to the
  unknown matrix <math|<U>> an appropriate prior distribution, taking into
  account the specific structure of <math|<U>>. The paper is organized as
  follows. In section <reference|section:MMSD>, we propose an approach based
  on minimizing a natural distance on the Grassmann manifold, which yields a
  new estimator of <math|<U>>. The theory is illustrated in section
  <reference|section:examples> where the new estimator is derived for some
  specific examples. In section <reference|section:simulations> its
  performance is assessed through numerical simulations, and compared with
  conventional approaches. Section <reference|section:appli> studies an
  application to the analysis of interactions between pure materials
  contained in hyperspectral images.

  <section|Minimum mean square distance estimation><label|section:MMSD>

  In this section, we introduce an alternative to the conventional minimum
  mean square error (MMSE) estimator, in the case where a subspace is to be
  estimated. Let us consider that we wish to estimate the range space of
  <math|<U>> from the joint distribution <math|p<around*|(|<Y>,<U>|)>> where
  <math|<Y>> stands for the available data matrix. Usually, one is not
  interested in <math|<U>> <em|per se> but rather in its range space <math|\<cal-R\><around*|(|<U>|)>>,
  and thus we are operating in the Grassmann manifold <math|G<rsub|N,p>>,
  i.e., the set of <math|p>-dimensional subspaces in <math|\<bbb-R\><rsup|N>>
  <cite|Edelman98>. It is thus natural to wonder whether the MMSE estimator
  (which is the chief systematic approach in Bayesian estimation
  <cite|Kay93>) is suitable in <math|G<rsub|N,p>>. The MMSE estimator
  <math|<wide|<btheta>|^>> of a vector <math|<btheta>> minimizes the average
  Euclidean distance between <math|<wide|<btheta>|^>> and <math|<btheta>>,
  i.e., <math|<E|<around*|\||<wide|<btheta>|^>-<btheta>|\|><rsub|2><rsup|2>>>.
  Despite the fact that this distance is natural in a Euclidean space, it
  may not be the most natural metric in <math|G<rsub|N,p>>. In fact, the
  natural distance between two subspaces <math|\<cal-R\><around*|(|<U><rsub|1>|)>> and <math|\<cal-R\><around*|(|<U><rsub|2>|)>> is given by
  <math|<around*|(|<big|sum><rsub|k=1><rsup|p>\<theta\><rsub|k><rsup|2>|)><rsup|1/2>>
  <cite|Edelman98> where <math|\<theta\><rsub|k>> are the principal angles
  between these subspaces, which can be obtained by SVD of
  <math|<U><rsub|2><rsup|T><U><rsub|1>> where <math|<U><rsub|1>> and
  <math|<U><rsub|2>> denote orthonormal bases for these subspaces
  <cite|Golub96>. The SVD of <math|<U><rsub|2><rsup|T><U><rsub|1>> is defined
  as <math|<U><rsub|2><rsup|T><U><rsub|1>=<X><diag|cos
  \<theta\><rsub|1>,\<cdots\>,cos \<theta\><rsub|p>><Z><rsup|T>>, where
  <math|<X>> and <math|<Z>> are two <math|p\<times\>p> unitary matrices.
  Therefore, it seems more adequate, rather than minimizing
  <math|<around*|\||<wide|<U>|^>-<U>|\|><rsub|F><rsup|2>> as the MMSE
  estimator does, to minimize the natural distance between the subspaces
  spanned by <math|<wide|<U>|^>> and <math|<U>>. Although this is the most
  intuitively appealing method, it faces the drawback that the cosines of the
  angles and not the angles themselves emerge naturally from the SVD.
  Therefore, we consider minimizing the sum of the squared sine of the angles
  between <math|<wide|<U>|^>> and <math|<U>>, since for small
  <math|\<theta\><rsub|k>>, <math|sin \<theta\><rsub|k>\<simeq\>\<theta\><rsub|k>>.
  As argued in <cite|Edelman98|Golub96>, this cost function is natural in the
  Grassmann manifold since it corresponds to the Frobenius norm of the
  difference between the projection matrices on the two subspaces, viz
  <math|<big|sum><rsub|k=1><rsup|p>sin<rsup|2>
  \<theta\><rsub|k>=<around*|\||<wide|<U>|^><wide|<U>|^><rsup|T>-<U><U><rsup|T>|\|><rsub|F><rsup|2>\<triangleq\>d<rsup|2><around*|(|<wide|<U>|^>,<U>|)>>.
  It should be mentioned that our approach follows along the same principles
  as in <cite|Srivastava00> where a Bayesian framework is proposed for
  subspace estimation, and where the author considers minimizing
  <math|d<around*|(|<wide|<U>|^>,<U>|)>>. Hence the theory presented in this
  section is similar to that of <cite|Srivastava00>, with some exceptions.
  Indeed the parameterization of the problem in <cite|Srivastava00> differs
  from ours and the application of the theory is also very different, see the
  next section.

  Given that <math|d<rsup|2><around*|(|<wide|<U>|^>,<U>|)>=2*<around*|(|p-<Tr|<wide|<U>|^><rsup|T><U><U><rsup|T><wide|<U>|^>>|)>>,
  we define the minimum mean-square distance (MMSD) estimator of <math|<U>>
  as

  <\equation>
    <Ummsd>=arg max<rsub|<wide|<U>|^>><E|<Tr|<wide|<U>|^><rsup|T><U><U><rsup|T><wide|<U>|^>>>.
  </equation>

  Since

  <\equation>
    <E|<Tr|<wide|<U>|^><rsup|T><U><U><rsup|T><wide|<U>|^>>>=<next-line><big|int><around*|[|<big|int><Tr|<wide|<U>|^><rsup|T><U><U><rsup|T><wide|<U>|^>>p<around*|(|<U>\|<Y>|)>*d<U>|]>*p<around*|(|<Y>|)>*d<Y>
  </equation>

  it follows that

  <align|<tformat|<table|<row|<cell|<Ummsd>>|<cell|=arg
  max<rsub|<wide|<U>|^>> <big|int><Tr|<wide|<U>|^><rsup|T><U><U><rsup|T><wide|<U>|^>>p<around*|(|<U>\|<Y>|)>*d<U><no-number>>>|<row|<cell|>|<cell|=arg
  max<rsub|<wide|<U>|^>><Tr|<wide|<U>|^><rsup|T><around*|[|<big|int><U><U><rsup|T>p<around*|(|<U>\|<Y>|)>*d<U>|]><wide|<U>|^>>.>>>>>

  Therefore, the MMSD estimate of the subspace spanned by <math|<U>> is given
  by the <math|p> principal eigenvectors of the matrix
  <math|<big|int><U><U><rsup|T>p<around*|(|<U>\|<Y>|)>*d<U>>, which we denote
  as

  <\equation>
    <label|Ummsd><Ummsd>=\<cal-P\><rsub|p><around*|{|<big|int><U><U><rsup|T>p<around*|(|<U>\|<Y>|)>*d<U>|}>.
  </equation>

  In other words, MMSD estimation amounts to finding the best rank-<math|p>
  approximation to the posterior mean of the projection matrix
  <math|<U><U><rsup|T>> on <math|\<cal-R\><around*|(|<U>|)>>. For notational convenience, let us denote
  <math|<M><around*|(|<Y>|)>=<big|int><U><U><rsup|T>p<around*|(|<U>\|<Y>|)>*d<U>>.
  Except for a few cases where this matrix can be derived in closed-form (an
  example is given in the next section), there usually does not exist any
  analytical expression for <math|<M><around*|(|<Y>|)>>. In such situations,
  an efficient way to approximate the matrix <math|<M><around*|(|<Y>|)>> is
  to use a Markov chain Monte Carlo simulation method whose goal is to
  generate random matrices <math|<U>> drawn from the posterior distribution
  <math|p<around*|(|<U>\|<Y>|)>>, and to approximate the integral in
  <eqref|Ummsd> by a finite sum. This aspect will be further elaborated in
  the next section. Let <math|<M><around*|(|<Y>|)>=<U><rsub|M><around|(|<Y>|)><mat|L><rsub|M><around|(|<Y>|)><U><rsub|M><rsup|T><around|(|<Y>|)>>
  denote the eigenvalue decomposition of <math|<M><around*|(|<Y>|)>> with
  <math|<mat|L><rsub|M><around|(|<Y>|)>=<diag|\<ell\><rsub|1><around|(|<Y>|)>,\<ell\><rsub|2><around|(|<Y>|)>,\<cdots\>,\<ell\><rsub|N><around|(|<Y>|)>>>
  and <math|\<ell\><rsub|1><around|(|<Y>|)>\<geq\>\<ell\><rsub|2><around|(|<Y>|)>\<geq\>\<cdots\>\<geq\>\<ell\><rsub|N><around|(|<Y>|)>>.
  Then the average distance between <math|<Ummsd>> and <math|<U>> is given by

  <align|<tformat|<table|<row|<cell|<label|HSbound><E|d<rsup|2><around*|(|<Ummsd>,<U>|)>>>|<cell|=2*p-2*<big|int><around*|[|<big|int><Tr|<Ummsd><rsup|T><U><U><rsup|T><Ummsd>>p<around*|(|<U>\|<Y>|)>*d<U>|]>*p<around*|(|<Y>|)>*d<Y><no-number>>>|<row|<cell|>|<cell|=2*p-2*<big|int><Tr|<Ummsd><rsup|T><M><around*|(|<Y>|)><Ummsd>>p<around*|(|<Y>|)>*d<Y><no-number>>>|<row|<cell|>|<cell|=2*p-2*<big|sum><rsub|k=1><rsup|p><big|int>\<ell\><rsub|k><around|(|<Y>|)>*p<around*|(|<Y>|)>*d<Y>.>>>>>

  The latter expression constitutes a lower bound on
  <math|<E|d<rsup|2><around*|(|<wide|<U>|^>,<U>|)>>> and is referred to as
  the Hilbert-Schmidt bound in <cite|Srivastava00|Grenander98>. As indicated
  in these references, and similarly to <math|<M><around*|(|<Y>|)>>, this
  lower bound may be difficult to obtain analytically.

  The MMSD approach can be extended to the mixed case where, in addition to
  <math|<U>>, a parameter vector <math|<btheta>> which can take arbitrary
  values in <math|\<bbb-R\><rsup|q>> needs to be estimated jointly with
  <math|<U>>. Under such circumstances, one can estimate <math|<U>> and
  <math|<btheta>> as

  <align|<tformat|<table|<row|<cell|<around*|(|<Ummsd>,<bthetammsd>|)>=arg
  min<rsub|<wide|<U>|^>,<wide|<btheta>|^>><math-ss|E><around*|{|-<Tr|<wide|<U>|^><rsup|T><U><U><rsup|T><wide|<U>|^>>+<around*|(|<wide|<btheta>|^>-<btheta>|)><rsup|T>*<around*|(|<wide|<btheta>|^>-<btheta>|)>|}>.>>>>>

  Doing so, the MMSD estimator of <math|<U>> is still given by
  <eqref|Ummsd> while the MMSD and MMSE estimators of <math|<btheta>>
  coincide.

  <\rem>
    <label|rem:MMSD>The MMSD approach differs from an MMSE approach which
    would entail calculating the posterior mean of <math|<U>>, viz
    <math|<big|int><U>p<around*|(|<U>\|<Y>|)>*d<U>>. Note that the latter may
    not be meaningful, in particular when the posterior distribution
    <math|p<around*|(|<U>\|<Y>|)>> depends on <math|<U>> only through
    <math|<U><U><rsup|T>>, see next section for an example. In such a case,
    post-multiplication of <math|<U>> by any <math|p\<times\>p> unitary
    matrix <math|<Q>> yields the same value of
    <math|p<around*|(|<U>\|<Y>|)>>. Therefore averaging <math|<U>> over
    <math|p<around*|(|<U>\|<Y>|)>> does not make sense while computing
    <eqref|Ummsd> is relevant. On the other hand, if
    <math|p<around*|(|<U>\|<Y>|)>> depends on <math|<U>> directly, then
    computing the posterior mean of <math|<U>> can be investigated: an
    example where this situation occurs will be presented in the next
    section. As a final comment, observe that
    <math|<big|int><U>p<around*|(|<U>\|<Y>|)>*d<U>> is not necessarily
    unitary but its range space can be used to estimate <math|\<cal-R\><around*|(|<U>|)>>.
  </rem>

  <\rem>
    We open a parenthesis here regarding the framework of this paper.
    Although it is not directly related to this paper (we do not address
    optimization problems here) it is interesting to note the recent growing
    interest in optimization problems on special manifolds, especially on the
    Stiefel manifold (the set of <math|N\<times\>p> matrices <math|<U>> such
    that <math|<U><rsup|T><U>=<I>>) and the Grassmann manifold, see the
    excellent tutorial paper by Edelman <em|et al.> <cite|Edelman98> as well
    as <cite|Absil08|Absil09>, and <cite|Abrudan08|Abrudan09|Fiori09b> for
    signal processing applications. These references show the interest of
    taking into account the underlying geometry of the problem, as we attempt
    to do herein.
  </rem>

  <section|Illustration examples><label|section:examples>

  In this section we illustrate the previous theory on some examples,
  including the conventional linear Gaussian model (conditioned on
  <math|<U>>) and a model involving the eigenvalue decomposition of the data
  covariance matrix. As a first step, we address the issue of selecting prior
  distributions for <math|<U>> and then move on to the derivation of the MMSD
  estimator.

  <subsection|Prior distributions>

  A crucial step in any Bayesian estimation scheme consists of selecting the
  prior distribution for the variables to be estimated. We focus here on
  distributions on the Stiefel or Grassmann manifold, depending on whether we
  consider the matrix <math|<U>> itself or its range space. There exist only
  a few distributions on the Stiefel or Grassmann manifolds, the most widely
  accepted being the Bingham or von Mises Fisher (vMF) distributions
  <cite|Mardia99|Chikuse03>, which are given respectively by

  <align|<tformat|<table|<row|<cell|p<rsub|<math-up|B>><around|(|<U>|)>>|<cell|=<frac|1|<oneFone|<frac|1|2>*p|<frac|1|2>*N|<A>>><etr|<U><rsup|T><A><U>>>>|<row|<cell|p<rsub|<math-up|vMF>><around|(|<U>|)>>|<cell|=<frac|1|<zeroFone|<frac|1|2>*N|<frac|1|4><F><rsup|T><F>>><etr|<F><rsup|T><U>>>>>>>

  where <math|<etr|.>> stands for the exponential of the trace of the matrix
  between braces, <math|<A>> is an <math|N\<times\>N> symmetric matrix,
  <math|<F>> is an <math|N\<times\>p> arbitrary matrix, and
  <math|<zeroFone|a|<X>>>, <math|<oneFone|a|b|<X>>> are hypergeometric
  functions of matrix arguments, see e.g. <cite|Chikuse03> for their
  definitions. We will denote these distributions as <math|<Bgh|<A>>> and
  <math|<vMF|<F>>>, respectively. Observe that the Bingham distribution
  depends on <math|<U><U><rsup|T>> only, and can thus be viewed as a
  distribution on the Grassmann manifold <cite|Mardia99|Chikuse03> while the
  vMF distribution depends on <math|<U>> and is a distribution on the Stiefel
  manifold. In our case, in order to introduce some knowledge about
  <math|<U>>, we assume that it is \Pclose\Q to a given subspace spanned by
  the columns of an orthonormal matrix <math|<Ubar>>, and hence we consider
  two possible prior distributions for <math|<U>>, namely

  <align|<tformat|<table|<row|<cell|\<pi\><rsub|<math-up|B>><around*|(|<U>|)>>|<cell|\<propto\><etr|\<kappa\><U><rsup|T><Ubar><Ubar><rsup|T><U>><label|Bingham>>>|<row|<cell|\<pi\><rsub|<math-up|vMF>><around*|(|<U>|)>>|<cell|\<propto\><etr|\<kappa\><U><rsup|T><Ubar>><label|vMF>>>>>>

  where <math|\<propto\>> means \Pproportional to\Q. The distribution in
  <eqref|Bingham> is proportional to the sum of the squared cosine angles
  between <math|\<cal-R\><around*|(|<U>|)>> and
  <math|\<cal-R\><around*|(|<Ubar>|)>> while <math|\<pi\><rsub|<math-up|vMF>><around*|(|<U>|)>>
  is proportional to the sum of the cosine angles between
  <math|\<cal-R\><around*|(|<U>|)>> and <math|\<cal-R\><around*|(|<Ubar>|)>>.
  Note that <math|\<kappa\>> is a concentration parameter: the
  larger <math|\<kappa\>> the more concentrated around <math|<Ubar>> are the
  subspaces <math|<U>>. The difference between the two distributions is the
  following. In the Bingham distribution only <math|\<cal-R\><around*|(|<U>|)>> and <math|\<cal-R\><around*|(|<Ubar>|)>> are close
  (at least for large values of <math|\<kappa\>>) since
  <math|\<pi\><rsub|<math-up|B>><around*|(|<U>|)>> is invariant to
  post-multiplication of <math|<U>> by any <math|p\<times\>p> unitary matrix
  <math|<Q>>. Hence <math|<U>> is not necessarily close to <math|<Ubar>>. In
  contrast, under the vMF prior distribution, <math|<U>> and <math|<Ubar>>
  are close. For illustration purposes, Figure <reference|fig:mean> displays
  the average fraction of energy of <math|<U>> in <math|\<cal-R\><around*|(|<Ubar>|)>> defined as

  <\equation>
    <math-up|AFE><around*|(|<U>,<Ubar>|)>=<E|<Tr|<U><rsup|T><Ubar><Ubar><rsup|T><U>>/p>.
  </equation>

  As can be observed from these figures, both distributions allow the
  distance between <math|<U>> and <math|<Ubar>> to be set in a rather
  flexible way. Their AFE is shown to be identical for small values of the
  concentration parameter but, when <math|\<kappa\>> increases, the AFE of
  the vMF distribution increases faster.

  <big-figure|<with|par-mode|center|<image|mean_energy_N=20_p=5.eps|8cm|||><label|fig:mean>>|Average
  fraction of energy of <math|<U>> in <math|\<cal-R\><around*|(|<Ubar>|)>> versus <math|\<kappa\>>.
  <math|N=20>, <math|p=5>.>

  Additionally, even if the AFE are close for small values of
  <math|\<kappa\>>, the distributions of the angles between
  <math|\<cal-R\><around*|(|<U>|)>> and <math|\<cal-R\><around*|(|<Ubar>|)>>
  exhibit some differences, as shown in Figures
  <reference|fig:distrib-bingham> and <reference|fig:distrib-vmf> which
  display the probability density functions of these angles for
  <math|\<kappa\>=20>.

  <big-figure|<with|par-mode|center|<image|distrib_angle_bingham_N=20_p=5_kappa=20.eps|8cm|||><label|fig:distrib-bingham>>|Distribution
  of the angles between <math|\<cal-R\><around*|(|<U>|)>> and
  <math|\<cal-R\><around*|(|<Ubar>|)>> for a Bingham distribution.
  <math|N=20>, <math|p=5> and <math|\<kappa\>=20>.>

  <big-figure|<with|par-mode|center|<image|distrib_angle_vmf_N=20_p=5_kappa=20.eps|8cm|||><label|fig:distrib-vmf>>|Distribution
  of the angles between <math|\<cal-R\><around*|(|<U>|)>> and
  <math|\<cal-R\><around*|(|<Ubar>|)>> for a von Mises Fisher
  distribution. <math|N=20>, <math|p=5> and <math|\<kappa\>=20>.>

  <subsection|Linear model>

  In order to illustrate how the previous theory can be used in practice, we
  first consider a simple example, namely a linear Gaussian model
  (conditioned on <math|<U>>), i.e., we assume that the data follows the
  model <math|<Y>=<U><bS>+<N>> where the columns of <math|<N>> are
  independent and identically distributed (i.i.d.) Gaussian vectors with
  zero-mean and (known) covariance matrix <math|<varn><I>>. We assume that no
  knowledge about <math|<bS>> is available and hence its prior distribution
  is set to <math|\<pi\><around*|(|<bS>|)>\<propto\>1>. Therefore,
  conditioned on <math|<U>> we have

  <align|<tformat|<table|<row|<cell|<label|p(Y\|U)>>|<cell|p<around*|(|<Y>\|<U>|)>=<big|int>p<around*|(|<Y>\|<U>,<bS>|)>*\<pi\><around*|(|<bS>|)>*d<bS><no-number>>>|<row|<cell|>|<cell|\<propto\><big|int><etr|-<frac|1|2<varn>>*<around*|(|<Y>-<U><bS>|)><rsup|T>*<around*|(|<Y>-<U><bS>|)>>d<bS><no-number>>>|<row|<cell|>|<cell|\<propto\><etr|-<frac|1|2<varn>><Y><rsup|T><Y>+<frac|1|2<varn>><Y><rsup|T><U><U><rsup|T><Y>>.>>>>>

  When <math|<U>> follows the Bingham prior distribution, the posterior
  distribution of <math|<U>>, conditioned on <math|<Y>> is given by

  <\equation>
    <label|p(U\|Y)-Bingham>p<around*|(|<U>\|<Y>|)>\<propto\><etr|<U><rsup|T><around*|[|\<kappa\><Ubar><Ubar><rsup|T>+<frac|1|2<varn>><Y><Y><rsup|T>|]><U>>
  </equation>

  which is recognized as a Bingham distribution with parameter matrix
  <math|\<kappa\><Ubar><Ubar><rsup|T>+<frac|1|2<varn>><Y><Y><rsup|T>>, i.e.,
  <math|<U>\|<Y>\<sim\><Bgh|\<kappa\><Ubar><Ubar><rsup|T>+<frac|1|2<varn>><Y><Y><rsup|T>>>.
  For such a Bingham distribution, it turns out that the eigenvectors of
  <math|<big|int><U><U><rsup|T>p<around*|(|<U>\|<Y>|)>*d<U>> coincide with
  those of <math|\<kappa\><Ubar><Ubar><rsup|T>+<frac|1|2<varn>><Y><Y><rsup|T>>,
  with the same ordering of their eigenvalues, see Appendix
  <reference|app:eigBingham> for a proof. Therefore the MMSD estimator is
  obtained in <with|font-series|bold|closed-form> as

  <\equation>
    <label|Ummsd-LM-Bingham><Ummsdlmbgh>=\<cal-P\><rsub|p>*<around*|{|\<kappa\><Ubar><Ubar><rsup|T>+<frac|1|2<varn>><Y><Y><rsup|T>|}>.
  </equation>

  Therefore, the MMSD estimator has a very simple form in this case. It
  consists of the principal subspace of a (weighted) combination of the a
  priori projection matrix <math|<Ubar><Ubar><rsup|T>> and the information
  brought by the data through <math|<Y><Y><rsup|T>>. Observe that, in this
  particular case of a Bingham posterior, the MMSD estimator coincides with
  the maximum a posteriori (MAP) estimator.

  Let us now consider the case where the prior distribution of <math|<U>> is
  vMF, and contrast it with the previous example. Using
  <eqref|p(Y\|U)> along with <eqref|vMF>, it follows that
  the posterior distribution now writes

  <\equation>
    <label|p(U\|Y)>p<around*|(|<U>\|<Y>|)>\<propto\><etr|\<kappa\><U><rsup|T><Ubar>+<frac|1|2<varn>><U><rsup|T><Y><Y><rsup|T><U>>
  </equation>

  which is referred to as the Bingham-von-Mises-Fisher (BMF) distribution
  with parameter matrices <math|<Y><Y><rsup|T>>, <math|<frac|1|2<varn>><I>>
  and <math|\<kappa\><Ubar>> respectively<footnote|The matrix <math|<X>> is
  said to have a <math|<BMF|<A>|<B>|<C>>> distribution -where <math|<A>> is
  an <math|N\<times\>N> symmetric matrix, <math|<B>> is a <math|p\<times\>p>
  diagonal matrix and <math|<C>> is an <math|N\<times\>p> matrix- if
  <math|p<around|(|<X>|)>\<propto\><etr|<C><rsup|T><X>+<B><X><rsup|T><A><X>>>.>.
  Although this distribution is known <cite|Chikuse03>, to our knowledge,
  there does not exist any analytic expression for the integral in
  <eqref|Ummsd> when <math|<U>\|<Y>> has the BMF distribution
  <eqref|p(U\|Y)>. Therefore, the MMSD estimator cannot be
  computed in closed-form. In order to remedy this problem, a Markov chain
  Monte Carlo simulation method can be advocated <cite|Robert04|Robert07> to
  generate a large number of matrices <math|<U><rsup|<around|(|n|)>>> drawn
  from <eqref|p(U\|Y)>, and to approximate <eqref|Ummsd>
  as

  <\equation>
    <label|Uiam><Ummsdlmvmf>\<simeq\>\<cal-P\><rsub|p>*<around*|{|<frac|1|N<rsub|r>>*<big|sum><rsub|n=N<rsub|<with|font-family|rm|b*i>>+1><rsup|N<rsub|<with|font-family|rm|b*i>>+N<rsub|r>><U><rsup|<around|(|n|)>><U><rsup|<around|(|n|)><rsup|T>>|}>.
  </equation>

  In <eqref|Uiam>, <math|N<rsub|<with|font-family|rm|b*i>>> is the number of
  burn-in samples and <math|N<rsub|r>> is the number of samples used to
  approximate the estimator. An efficient Gibbs sampling scheme to generate
  random unitary matrices drawn from a <math|<BMF|<A>|<B>|<C>>> distribution
  with arbitrary full-rank matrix <math|<A>> was proposed in <cite|Hoff09>.
  It amounts to sampling successively each column of <math|<U>> by generating
  a random unit norm vector drawn from a (vector) BMF distribution. In our
  case, <math|<A>=<Y><Y><rsup|T>> whose rank is <math|min <around|(|K,N|)>>
  and hence <math|<A>> is rank-deficient whenever <math|K\<less\>N>, a case
  of most interest to us. Note also that to generate matrices <math|<U>>
  drawn from the Bingham distribution in <eqref|Bingham>, we need to consider
  <math|<A>=<Ubar><Ubar><rsup|T>> which has rank <math|p\<less\>N>.
  Therefore, the scheme of <cite|Hoff09> needs to be adapted in order to
  generate random matrices drawn from <eqref|p(U\|Y)>. In
  Appendix <reference|app:BMF>, we review the method of <cite|Hoff09> and
  show how it can be modified to handle the case of a rank-deficient matrix
  <math|<A>>.

  <\rem>
    <label|rem:iam>Interestingly enough, the above estimator in <eqref|Uiam>
    is the so-called induced arithmetic mean (IAM) <cite|Sarlette09> of the
    set of unitary matrices <math|<U><rsup|<around|(|n|)>>>,
    <math|n=N<rsub|<with|font-family|rm|b*i>>+1,\<cdots\>,N<rsub|<with|font-family|rm|b*i>>+N<rsub|r>>.
    It differs from the Karcher mean of the set
    <math|<U><rsup|<around|(|n|)>>>, <math|n=N<rsub|<with|font-family|rm|b*i>>+1,\<cdots\>,N<rsub|<with|font-family|rm|b*i>>+N<rsub|r>>,
    which truly minimizes the sum of the distances to all
    <math|<U><rsup|<around|(|n|)>>>. However, the Karcher mean may not exist
    and requires iterative schemes to be computed <cite|Begelfor06> while the
    IAM is straightforward to compute.
  </rem>

  <\rem>
    <label|rem:map>In the particular case where <math|<U>> has a Bingham
    prior distribution, the MAP estimator of <math|<U>> and its MMSD
    estimator are equal. This is no longer true when <math|<U>> has a vMF
    prior distribution, and hence a BMF posterior distribution. The mode of
    the latter is not known in closed-form either. However, it can be
    approximated by selecting, among the matrices generated by the Gibbs
    sampler, the matrix which results in the largest value of the posterior
    distribution.
  </rem>

  <subsection|Covariance matrix model>

  We now consider a more complicated case where <math|<Y>>, conditioned on
  <math|<U>> and <math|<bLambda>>, is Gaussian distributed with zero-mean and
  covariance matrix

  <\equation>
    <R>=<E|<Y><Y><rsup|T>>=<U><bLambda><U><rsup|T>+<varn><I>
  </equation>

  where <math|<U>> is an orthonormal basis for the signal subspace,
  <math|<bLambda>> is the diagonal matrix of the eigenvalues and
  <math|<varn>> stands for the white noise power which is assumed to be known
  here. As it will be more convenient and more intuitively appealing, we
  re-parametrize the covariance matrix as follows. The inverse of <math|<R>>
  can be written as

  <align|<tformat|<table|<row|<cell|<iR>>|<cell|=<U><around*|[|<around*|(|<bLambda>+<varn><I>|)><rsup|-1>-<ivarn><I>|]><U><rsup|T>+<ivarn><I><no-number>>>|<row|<cell|>|<cell|=<ivarn><I>-<ivarn><U><bLambda><around*|(|<bLambda>+<varn><I>|)><rsup|-1><U><rsup|T><no-number>>>|<row|<cell|>|<cell|\<triangleq\>\<nu\><I>-\<nu\><U><around*|(|<I>-<bGamma>|)><U><rsup|T>>>>>>

  where <math|\<nu\>\<triangleq\><ivarn>>,
  <math|<bGamma>\<triangleq\><diag|<bgamma>>> with
  <math|<bgamma>=<around*|[|<tabular*|<tformat|<table|<row|<cell|\<gamma\><rsub|1>>|<cell|\<gamma\><rsub|2>>|<cell|\<cdots\>>|<cell|\<gamma\><rsub|p>>>>>>|]><rsup|T>>
  and

  <\equation>
    0\<less\>\<gamma\><rsub|k>\<triangleq\><frac|<varn>|<varn>+\<lambda\><rsub|k>>\<less\>1.
  </equation>

  The idea is to parametrize the problem in terms of <math|<U>> and
  <math|<bGamma>> rather than <math|<U>> and <math|<bLambda>>. The interest
  of this transformation is twofold. First, it enables one to express all
  eigenvalues with respect to the white noise level. Indeed, one has
  <math|<R>=\<nu\><rsup|-1><Uorth><Uorth><rsup|T>+\<nu\><rsup|-1><U><bGamma><rsup|-1><U><rsup|T>>
  where <math|<Uorth>> is an orthonormal basis for <math|<range|<U>><rsup|\<perp\>>> and
  hence the <math|\<gamma\><rsub|k>>s are representative of the scaling
  between the \Psignal\Q eigenvalues and the noise eigenvalues. In fact, they
  carry information about the signal-to-noise ratio since
  <math|\<gamma\><rsub|k>=<around*|(|1+<frac|\<lambda\><rsub|k>|\<sigma\><rsup|2>>|)><rsup|-1>>
  and <math|<frac|\<lambda\><rsub|k>|\<sigma\><rsup|2>>> represents the SNR
  of the <math|k>-th signal component. Second, this new parametrization will
  facilitate derivation of the conditional distributions required for the
  Gibbs sampler.

  Since <math|<Y>> conditioned on <math|<U>> and <math|<bgamma>> is Gaussian,
  it follows that

  <\equation>
    p<around*|(|<Y>\|<U>,<bgamma>|)>=<around*|(|2*\<pi\>|)><rsup|-N*K/2><around*|\||<R>|\|><rsup|-K/2><etr|-<frac|1|2><Y><rsup|T><iR><Y>>.
  </equation>

  From <math|<iR>=\<nu\><Uorth><Uorth><rsup|T>+\<nu\><U><bGamma><U><rsup|T>>,
  it ensues that <math|<around*|\||<iR>|\|>=\<nu\><rsup|N><around*|\||<bGamma>|\|>>
  and hence

  <\equation>
    <label|p(Y\|Ugamma)>p<around*|(|<Y>\|<U>,<bgamma>|)>\<propto\><around*|\||<bGamma>|\|><rsup|K/2><etr|-<frac|1|2><Y><rsup|T><around*|[|\<nu\><I>-\<nu\><U><around*|(|<I>-<bGamma>|)><U><rsup|T>|]><Y>>.
  </equation>

  Let us now consider the prior distributions for <math|<U>> and
  <math|<bgamma>>. We will consider either a Bingham or vMF distribution for
  <math|<U>>. As for <math|<bgamma>>, we assume that <math|\<gamma\><rsub|k>>
  are a priori independent random variables uniformly distributed in the
  interval <math|<around*|[|\<gamma\><rsub|->,\<gamma\><rsub|+>|]>>, i.e.,

  <\equation>
    <label|p(gamma)>\<pi\><around|(|<bgamma>|)>=<big|prod><rsub|k=1><rsup|p><around*|(|\<gamma\><rsub|+>-\<gamma\><rsub|->|)><rsup|-1>*\<bbb-I\><rsub|<around*|[|\<gamma\><rsub|->,\<gamma\><rsub|+>|]>><around|(|\<gamma\><rsub|k>|)>.
  </equation>

  The value of <math|\<gamma\><rsub|+>> [respectively
  <math|\<gamma\><rsub|->>] can be set to <math|1> [respectively <math|0>] if
  a non-informative prior is desired. Otherwise, if some information is
  available about the SNR, <math|\<gamma\><rsub|->> and
  <math|\<gamma\><rsub|+>> can be chosen so as to reflect this knowledge
  since <math|\<gamma\><rsub|k>=<around*|(|1+S*N*R<rsub|k>|)><rsup|-1>>:
  <math|\<gamma\><rsub|+>> [resp. <math|\<gamma\><rsub|->>] rules the lowest
  [resp. highest] value of the SNR, say <math|S*N*R<rsub|->> [resp.
  <math|S*N*R<rsub|+>>].

  With the Bingham assumption for <math|\<pi\><around*|(|<U>|)>>, the joint
  posterior distribution of <math|<U>> and <math|<bgamma>> is

  <align|<tformat|<table|<row|<cell|<label|joint>>|<cell|p<around*|(|<U>,<bgamma>\|<Y>|)>\<propto\>p<around*|(|<Y>\|<U>,<bgamma>|)>*\<pi\><around*|(|<U>|)>*\<pi\><around*|(|<bgamma>|)><no-number>>>|<row|<cell|>|<cell|\<propto\><around*|\||<bGamma>|\|><rsup|K/2><around*|(|<big|prod><rsub|k=1><rsup|p>\<bbb-I\><rsub|<around*|[|\<gamma\><rsub|->,\<gamma\><rsub|+>|]>><around|(|\<gamma\><rsub|k>|)>|)><no-number>>>|<row|<cell|>|<cell|\<times\><etr|\<kappa\><U><rsup|T><Ubar><Ubar><rsup|T><U>+<frac|\<nu\>|2><Y><rsup|T><U><around*|(|<I>-<bGamma>|)><U><rsup|T><Y>>.>>>>>

  In order to come up with the posterior distribution of <math|<U>> only, we
  need to marginalize <eqref|joint> with
  respect to <math|<bgamma>>. Let <math|<Z>=<Y><rsup|T><U>=<around*|[|<tabular*|<tformat|<table|<row|<cell|<z><rsub|1>>|<cell|<z><rsub|2>>|<cell|\<cdots\>>|<cell|<z><rsub|p>>>>>>|]>>.
  Then, from <eqref|joint> one has

  <align|<tformat|<table|<row|<cell|p<around*|(|<U>\|<Y>|)>>|<cell|=<big|int>p<around*|(|<U>,<bgamma>\|<Y>|)>*d<space|0.17em><bgamma><no-number>>>|<row|<cell|>|<cell|\<propto\><etr|\<kappa\><U><rsup|T><Ubar><Ubar><rsup|T><U>+<frac|\<nu\>|2><U><rsup|T><Y><Y><rsup|T><U>><no-number>>>|<row|<cell|>|<cell|\<times\><big|prod><rsub|k=1><rsup|p><big|int><rsub|\<gamma\><rsub|->><rsup|\<gamma\><rsub|+>>\<gamma\><rsub|k><rsup|K/2><ex|-<frac|\<nu\>|2>*\<gamma\><rsub|k><around*|\||<z><rsub|k>|\|><rsup|2>>d*\<gamma\><rsub|k>*<no-number>>>|<row|<cell|>|<cell|\<propto\><etr|\<kappa\><U><rsup|T><Ubar><Ubar><rsup|T><U>+<frac|\<nu\>|2><U><rsup|T><Y><Y><rsup|T><U>><no-number>>>|<row|<cell|>|<cell|\<times\><big|prod><rsub|k=1><rsup|p><around*|\||<z><rsub|k>|\|><rsup|-2*<around|(|1+K/2|)>>*<around*|[|\<gamma\>*<around*|(|<frac|\<nu\>|2>*\<gamma\><rsub|+><around*|\||<z><rsub|k>|\|><rsup|2>,1+<frac|K|2>|)>-\<gamma\>*<around*|(|<frac|\<nu\>|2>*\<gamma\><rsub|-><around*|\||<z><rsub|k>|\|><rsup|2>,1+<frac|K|2>|)>|]>>>>>>

  where <math|\<gamma\><around|(|x,a|)>=<big|int><rsub|0><rsup|x>t<rsup|a-1>*e<rsup|-t>*d*t>
  is the lower incomplete gamma function. Unfortunately, the above distribution
  does not belong to any known family and it is thus problematic to generate
  samples drawn from it. Instead, in order to sample according to
  <eqref|joint>, we propose to use a Gibbs
  sampler drawing samples according to <math|p<around*|(|<U>\|<Y>,<bgamma>|)>>
  and <math|p<around*|(|\<gamma\><rsub|k>\|<Y>,<U>|)>> for
  <math|k=1,\<cdots\>,p>. From <eqref|joint>,
  the conditional distribution of <math|<U>> is

  <\equation>
    <label|p(U\|Ygamma)>p<around*|(|<U>\|<Y>,<bgamma>|)>\<propto\><etr|\<kappa\><U><rsup|T><Ubar><Ubar><rsup|T><U>+<frac|\<nu\>|2>*<around*|(|<I>-<bGamma>|)><U><rsup|T><Y><Y><rsup|T><U>>
  </equation>

  which is recognized as a (modified) Bingham
  distribution<footnote|<math|<X>\<sim\><mB|<A><rsub|1>|<B><rsub|1>|<A><rsub|2>|<B><rsub|2>>\<Leftrightarrow\>p<around|(|<X>|)>\<propto\><etr|<B><rsub|1><X><rsup|T><A><rsub|1><X>+<B><rsub|2><X><rsup|T><A><rsub|2><X>>>>

  <\equation>
    <U>\|<Y>,<bgamma>\<sim\><mB|<Ubar><Ubar><rsup|T>|\<kappa\><I>|<Y><Y><rsup|T>|<frac|\<nu\>|2>*<around*|(|<I>-<bGamma>|)>>.
  </equation>

  Let us now turn to the conditional distribution of
  <math|<bgamma>\|<Y>,<U>>. From <eqref|joint>
  one has

  <align|<tformat|<table|<row|<cell|<label|p(gamma\|Uy)>>|<cell|p<around*|(|<bgamma>\|<Y>,<U>|)>\<propto\><around*|\||<bGamma>|\|><rsup|K/2><etr|-<frac|\<nu\>|2><Z><bGamma><Z><rsup|T>><around*|(|<big|prod><rsub|k=1><rsup|p>\<bbb-I\><rsub|<around*|[|\<gamma\><rsub|->,\<gamma\><rsub|+>|]>><around|(|\<gamma\><rsub|k>|)>|)><no-number>>>|<row|<cell|>|<cell|\<propto\><big|prod><rsub|k=1><rsup|p><around*|[|\<gamma\><rsub|k><rsup|K/2><ex|-<frac|\<nu\>|2><around*|\||<z><rsub|k>|\|><rsup|2>*\<gamma\><rsub|k>>\<bbb-I\><rsub|<around*|[|\<gamma\><rsub|->,\<gamma\><rsub|+>|]>><around|(|\<gamma\><rsub|k>|)>|]>>>>>>

  which is the product of independent gamma distributions with parameters
  <math|<frac|K|2>+1> and <math|<frac|\<nu\>|2><around*|\||<z><rsub|k>|\|><rsup|2>>,
  truncated in the interval <math|<around*|[|\<gamma\><rsub|->,\<gamma\><rsub|+>|]>>.
  We denote this distribution as <math|\<gamma\><rsub|k>\<sim\><pdftG|<frac|K|2>+1|<frac|\<nu\>|2><around*|\||<z><rsub|k>|\|><rsup|2>|\<gamma\><rsub|->|\<gamma\><rsub|+>>>.
  Random variables with such a distribution can be efficiently generated
  using the accept-reject scheme of <cite|Chung98>.

  The above conditional distributions can now be used in a Gibbs sampler, as
  described in Table <reference|table:gibbs>. When <math|<U>> has a vMF prior
  distribution, it is straightforward to show that <math|<U>>, conditioned on
  <math|<Y>> and <math|<bgamma>>, follows a BMF distribution
  <math|<U>\|<Y>,<bgamma>\<sim\><BMF|<Y><Y><rsup|T>|<frac|\<nu\>|2>*<around*|(|<I>-<bGamma>|)>|\<kappa\><Ubar>>>
  while the posterior distribution of <math|<bgamma>\|<Y>,<U>> is still given
  by <eqref|p(gamma\|Uy)>. Therefore, only line
  <reference|line:U\|Ygamma> of the Gibbs sampler in Table
  <reference|table:gibbs> needs to be modified in order to handle this
  case.

  <vspace|2fn>

  <\big-table>
    \ <algo-require|initial values <math|<U><rsup|<around|(|0|)>>>,
    <math|<bgamma><rsup|<around|(|0|)>>>>

    <\algo-for|<math|n=1,\<cdots\>,N<rsub|<with|font-family|rm|b*i>>+N<rsub|r>>>
      <algo-state|sample <math|<U><rsup|<around|(|n|)>>> from
      <math|<mB|<Ubar><Ubar><rsup|T>|\<kappa\><I>|<Y><Y><rsup|T>|<frac|\<nu\>|2>*<around*|(|<I>-<bGamma><rsup|<around|(|n-1|)>>|)>>>
      in <eqref|p(U\|Ygamma)>.<label|line:U\|Ygamma>>

      <algo-state|for <math|k=1,\<cdots\>,p>, sample
      <math|\<gamma\><rsub|k><rsup|<around|(|n|)>>> from
      <math|<pdftG|<frac|K|2>+1|<frac|\<nu\>|2><around*|\||<Y><rsup|T><bu><rsub|k><rsup|<around|(|n|)>>|\|><rsup|2>|\<gamma\><rsub|->|\<gamma\><rsub|+>>>
      in <eqref|p(gamma\|Uy)>.>
    </algo-for>

    <algo-ensure|sequence of random variables <math|<U><rsup|<around|(|n|)>>>
    and <math|<bgamma><rsup|<around|(|n|)>>>>

    <label|table:gibbs>
  </big-table|Gibbs sampler>

  <section|Simulations><label|section:simulations>

  In this section we illustrate the performance of the approach developed
  above through Monte Carlo simulations. In all simulations <math|N=20>,
  <math|p=5> and <math|\<kappa\>=20>. The matrix <math|<bS>> is generated
  from a Gaussian distribution with zero-mean and covariance matrix
  <math|\<sigma\><rsub|s><rsup|2><I>> and the signal-to-noise ratio is
  defined as <math|S*N*R=10*log<rsub|10><around*|(|\<sigma\><rsub|s><rsup|2>/<varn>|)>>.
  The matrix <math|<U>> is generated from the Bingham distribution
  <eqref|Bingham> or the vMF distribution <eqref|vMF> and, for the sake of
  simplicity, <math|<Ubar>=<around*|[|<tabular*|<tformat|<table|<row|<cell|<I><rsub|p>>|<cell|<mat|0>>>>>>|]><rsup|T>>.
  The number of burn-in iterations in the Gibbs sampler is set to
  <math|N<rsub|<with|font-family|rm|b*i>>=10> and <math|N<rsub|r>=1000>. The
  MMSD estimator <eqref|Ummsd> is compared with the MAP estimator, the MMSE
  estimator, the usual SVD-based estimator and the estimator
  <math|<wide|<U>|^>=<Ubar>> that discards the available data and uses only
  the a priori knowledge. The latter is referred to as \PUbar\Q in the
  figures. The estimators are evaluated in terms of the fraction of energy of
  <math|<wide|<U>|^>> in <math|<range|<U>>>, i.e., <math|<math-up|AFE><around*|(|<wide|<U>|^>,<U>|)>>.

  <subsection|Linear model>

  We begin with the linear model. Figures <reference|fig:mmsd> to
  <reference|fig:mmsd> investigate the influence of <math|K> and <math|S*N*R>
  onto the performance of the estimators. Figures <reference|fig:mmsd> and
  <reference|fig:mmsd> concern the Bingham prior while the vMF prior has been
  used to obtain Figures <reference|fig:mmsd> and <reference|fig:mmsd>. From
  inspection of these figures, the following conclusions can be drawn:

  <\itemize>
    <item>the MMSD estimator performs better than the estimator
    <math|<wide|<U>|^>=<Ubar>>, even at low SNR. The improvement is all the
    more pronounced as <math|K> increases. Therefore, the MMSD estimator
    makes a sound use of the data to improve accuracy compared to using the
    prior knowledge only.

    <item>the MMSD estimator performs better than the SVD, especially at low
    SNR. Moreover, and this is a distinctive feature of this Bayesian
    approach, it enables one to estimate the subspace even when the number of
    snapshots <math|K> is less than the size of the subspace <math|p>.

    <item>for a Bingham prior, the MMSE performs very poorly since the
    posterior distribution of <math|<U>> conditioned on <math|<Y>> depends on
    <math|<U><U><rsup|T>> only. Hence, averaging the matrix <math|<U>> itself
    does not make sense, see our remark <reference|rem:MMSD>. In contrast,
    when <math|<U>> has a vMF prior, the posterior depends on both <math|<U>>
    and <math|<U><U><rsup|T>>: in this case, the MMSE performs well and is
    close to the MMSD. Note however that the vMF prior is more restrictive
    than the Bingham prior.

    <item>the MMSD estimator also outperforms the MAP estimator.
  </itemize>

  As a conclusion, the MMSD estimator performs better than most other
  estimators in the large majority of cases.

  <subsection|Covariance matrix model>

  We now conduct simulations with the covariance matrix model. The simulation
  parameters are essentially the same as in the previous section, except for
  the SNR. More precisely, the random variables <math|\<gamma\><rsub|k>> are
  drawn from the uniform distribution in <eqref|p(gamma)> where
  <math|\<gamma\><rsub|->> and <math|\<gamma\><rsub|+>> are selected such
  that <math|S*N*R<rsub|->=5>dB and <math|S*N*R<rsub|+>=10>dB. The results
  are shown in Fig. <reference|fig:mmsd> for the Bingham prior and Fig.
  <reference|fig:mmsd> for the vMF prior. They corroborate the previous
  observations made on the linear model, viz. that the MMSD estimator offers
  the best performance over all methods.

  <section|Application to hyperspectral imagery><label|section:appli>

  In this section, we show how the proposed subspace estimation procedure can
  be efficiently used for an application to multi-band image analysis. For
  several decades, hyperspectral imagery has received considerable attention
  because of its great interest for various purposes: agriculture monitoring,
  mineral mapping, military concerns, etc. One of the crucial issues when
  analyzing such images is spectral unmixing, which aims to decompose an
  observed pixel <math|<y><rsub|\<ell\>>> into a collection of <math|R=p+1>
  reference signatures, <math|<bm><rsub|1>,\<ldots\>,<bm><rsub|R>> (called
  <em|endmembers>) and to retrieve the respective proportions of these
  signatures (or <em|abundances>) <math|a<rsub|1,\<ell\>>,\<ldots\>,a<rsub|R,\<ell\>>>
  in this pixel <cite|Keshava2002>. To describe the physical process that
  links the endmembers and their abundances to the measurements, the most
  widely admitted mixing model is linear

  <\equation>
    <label|eq:LMM><y><rsub|\<ell\>>=<big|sum><rsub|r=1><rsup|R>a<rsub|r,\<ell\>><bm><rsub|r>
  </equation>

  where <math|<y><rsub|\<ell\>>\<in\>\<bbb-R\><rsup|N>> is the pixel spectrum
  measured in <math|N> spectral bands, <math|<bm><rsub|r>\<in\>\<bbb-R\><rsup|N>>
  (<math|r=1,\<ldots\>,R>) are the <math|R> endmember spectra and
  <math|a<rsub|r,\<ell\>>> (<math|r=1,\<ldots\>,R>) are their corresponding
  abundances. Due to obvious physical considerations, the abundances obey two
  kinds of constraints. Since they represent proportions, they must satisfy
  the following positivity and additivity constraints

  <\equation>
    <label|eq:constraints><choice|<tformat|<table|<row|<cell|a<rsub|r,\<ell\>>\<geq\>0,<space|1em>r=1,\<ldots\>,R,>>|<row|<cell|<big|sum><rsub|r=1><rsup|R>a<rsub|r,\<ell\>>=1.>>>>>
  </equation>

  Let us now consider <math|L> pixels <math|<y><rsub|1>,\<ldots\>,<y><rsub|L>>
  of a hyperspectral image induced by the linear mixing model (LMM) in
  <eqref|eq:LMM> with the abundance constraints <eqref|eq:constraints>. It is
  clear that the dataset formed by these <math|L> pixels lies in a
  lower-dimensional subspace <math|\<cal-U\>\<subset\>\<bbb-R\><rsup|p>>.
  More precisely, in this subspace <math|\<cal-U\>>, the dataset belongs to a
  simplex whose vertices are the endmembers
  <math|<bm><rsub|1>,\<ldots\>,<bm><rsub|R>> to be recovered. Most of the
  unmixing strategies developed in the hyperspectral imagery literature are
  based on this underlying geometrical formulation of the LMM. Indeed, the
  estimation of the endmembers is generally conducted in the
  lower-dimensional space <math|\<cal-U\>>, previously identified by a
  standard dimension reduction technique such as the principal component
  analysis (PCA) <cite|Keshava2002>. However, it is well known that the model
  linearity is a simplifying assumption and does not hold anymore in several
  contexts, circumventing the standard unmixing algorithms. Specifically,
  non-linearities are known to occur for scenes including mixtures of
  minerals or vegetation. As a consequence, evaluating the suitability of the
  LMM assumption for a given hyperspectral image is a capital question that
  can be conveniently addressed by the approach introduced above.

  <subsection|Synthetic data>

  First, we investigate the estimation of the subspace <math|\<cal-U\>> when
  the image pixels are non-linear functions of the abundances. For this
  purpose, a <math|50\<times\>50> synthetic hyperspectral image is generated
  following a recently introduced non-linear model referred to as generalized
  bilinear model (GBM). As indicated in <cite|Halimi2011>, the GBM is notably
  well adapted to describe non-linearities due to multipath effects. It
  assumes that the observed pixel spectrum <math|<y><rsub|\<ell\>>> can be
  written

  <\equation>
    <label|eq:GBM><y><rsub|\<ell\>>=<big|sum><rsub|r=1><rsup|R>a<rsub|r,\<ell\>><bm><rsub|r>+<big|sum><rsub|i=1><rsup|R-1><big|sum><rsub|j=i+1><rsup|R>\<gamma\><rsub|i,j,\<ell\>>*a<rsub|i,\<ell\>>*a<rsub|j,\<ell\>><bm><rsub|i>\<odot\><bm><rsub|j>
  </equation>

  where <math|\<odot\>> stands for the Hadamard (termwise) product and the
  abundances <math|a<rsub|r,\<ell\>>> (<math|r=1,\<ldots\>,R>) satisfy the
  constraints in <eqref|eq:constraints>. In <eqref|eq:GBM>, the parameters
  <math|\<gamma\><rsub|i,j,\<ell\>>> (which belong to
  <math|<around|[|0,1|]>>) characterize the importance of non-linear
  interactions between the endmembers <math|<bm><rsub|i>> and
  <math|<bm><rsub|j>> in the <math|\<ell\>>-th pixel. In particular, when
  <math|\<gamma\><rsub|i,j,\<ell\>>=0> (<math|\<forall\>i,j>), the GBM
  reduces to the standard LMM <eqref|eq:LMM>. Moreover, when
  <math|\<gamma\><rsub|i,j,\<ell\>>=1> (<math|\<forall\>i,j>), the GBM leads
  to the non-linear model introduced by Fan <em|et al.> in <cite|Fan2009>. In
  this simulation, the synthetic image has been generated using the GBM with
  <math|R=3> endmember signatures extracted from a spectral library. The
  corresponding abundances have been uniformly drawn in the set defined by
  the constraints <eqref|eq:constraints>. We have assumed that there is no
  interaction between endmembers <math|<bm><rsub|1>> and <math|<bm><rsub|3>>,
  and between endmembers <math|<bm><rsub|2>> and <math|<bm><rsub|3>>
  resulting in <math|\<gamma\><rsub|1,3,\<ell\>>=\<gamma\><rsub|2,3,\<ell\>>=0>,
  <math|\<forall\>\<ell\>>. Moreover, the interactions between endmembers
  <math|<bm><rsub|1>> and <math|<bm><rsub|2>> are defined by the map of
  coefficients <math|\<gamma\><rsub|1,2,\<ell\>>> displayed in Fig.
  <reference|fig:dist><nbsp>(top, left panel) where a black (resp. white)
  pixel represents the lowest (resp. highest) degree of non-linearity. As can
  be seen in this figure, <math|75%> of the pixels (located in the bottom and
  upper right squares of the image) are mixed according to the LMM resulting
  in <math|\<gamma\><rsub|1,2,\<ell\>>=0>. The <math|25%> remaining image
  pixels (located in the upper left square of the image) are mixed according
  to the GBM with nonlinearity coefficients
  <math|\<gamma\><rsub|1,2,\<ell\>>> radially increasing from <math|0> to
  <math|1> (<math|\<gamma\><rsub|1,2,\<ell\>>=0> in the image center and
  <math|\<gamma\><rsub|1,2,\<ell\>>=1> in the upper left corner of the
  image). Note that this image contains a majority of pixels that are mixed
  linearly and belong to a common subspace of <math|\<bbb-R\><rsup|2>>.
  Conversely, the non-linearly mixed pixels do not belong to this
  subspace<footnote|Assuming there is a majority of image pixels that are
  mixed linearly is a reasonable assumption for most hyperspectral images.>.
  We propose here to estimate the local subspace
  <math|\<cal-U\><rsub|\<ell\>>> where a given image pixel
  <math|<y><rsub|\<ell\>>> and its nearest spectral neighbors
  <math|\<cal-V\><rsub|\<ell\>><rsup|<around|(|K-1|)>>> live
  (<math|\<cal-V\><rsub|\<ell\>><rsup|<around|(|K-1|)>>> denotes the set of
  the (<math|K-1>)-nearest neighbors of <math|<y><rsub|\<ell\>>>).

  Assuming as a first approximation that all the image pixels are linearly
  mixed, all these pixels are approximately contained in a common
  <math|2>-dimensional subspace <math|<wide|\<cal-U\>|\<bar\>>> that can be
  determined by performing a PCA of <math|<y><rsub|1>,\<ldots\>,<y><rsub|L>>
  (see <cite|Dobigeon2009sp> for more details). The corresponding principal
  vectors spanning <math|<wide|\<cal-U\>|\<bar\>>> are gathered in a matrix
  <math|<Ubar>>. This matrix <math|<Ubar>> is used as <em|a priori> knowledge
  regarding the <math|2>-dimensional subspace containing
  <math|<around*|{|<y><rsub|\<ell\>>,\<cal-V\><rsub|\<ell\>><rsup|<around|(|K-1|)>>|}><rsub|\<ell\>=1,\<ldots\>,L>>.
  However, this crude estimation can be refined by the Bayesian estimation
  strategy developed in the previous sections. More precisely, for each pixel
  <math|<y><rsub|\<ell\>>>, we compute the MMSD estimator of the
  <math|N\<times\>p> matrix <math|<U><rsub|\<ell\>>>, whose columns are
  supposed to span the subspace <math|\<cal-U\><rsub|\<ell\>>> containing
  <math|<y><rsub|\<ell\>>> and its <math|K-1>-nearest neighbors
  <math|\<cal-V\><rsub|\<ell\>><rsup|<around|(|K-1|)>>>. The Bayesian
  estimator <math|<wide|<U>|^><rsub|\<ell\>>> is computed from its
  closed-form expression <eqref|Ummsd>, i.e., using
  the Bingham prior where <math|<Ubar>> has been introduced above. Then, for
  each pixel, we evaluate the distance between the two projection matrices
  <math|<wide|<U>|^><rsub|\<ell\>><wide|<U>|^><rsub|\<ell\>><rsup|T>> and
  <math|<Ubar><Ubar><rsup|T>> onto the subspaces
  <math|<wide|\<cal-U\>|^><rsub|\<ell\>>=<range|<wide|<U>|^><rsub|\<ell\>>>> and
  <math|<wide|\<cal-U\>|\<bar\>>=<range|<Ubar>>>, respectively. As stated in
  Section<nbsp><reference|section:MMSD>, the natural distance between these
  two projection matrices is given by <math|d<rsup|2><around*|(|<wide|<U>|^><rsub|\<ell\>>,<Ubar>|)>=2*<around*|(|p-<Tr|<wide|<U>|^><rsub|\<ell\>><rsup|T><Ubar><Ubar><rsup|T><wide|<U>|^><rsub|\<ell\>>>|)>>.
  The resulting distance maps are depicted in Fig. <reference|fig:dist>
  (bottom panels) for <math|2> non-zero values of
  <math|\<eta\>\<triangleq\>2<varn>\<kappa\>> (as it can be noticed in
  <eqref|Ummsd>, this hyperparameter <math|\<eta\>>
  balances the quantity of <em|a priori> knowledge <math|<Ubar>> included in
  the estimation with respect to the information brought by the data). For
  comparison purposes, the subspace <math|<wide|\<cal-U\>|^><rsub|\<ell\>>>
  has been also estimated by a crude SVD of
  <math|<around*|{|<y><rsub|\<ell\>>,\<cal-V\><rsub|\<ell\>><rsup|<around|(|K-1|)>>|}>>
  (top right panel). In this case, <math|<wide|<U>|^><rsub|\<ell\>>> simply
  reduces to the associated principal singular vectors and can be considered
  as the MMSD estimator of <math|<U><rsub|\<ell\>>> obtained for
  <math|\<eta\>=0>.

  These figures show that, for the <math|75%> of the pixels generated using
  the LMM (bottom and right parts of the image), the subspace
  <math|<wide|\<cal-U\>|\<bar\>>> estimated by an SVD of the whole dataset
  <math|<y><rsub|1>,\<ldots\>,<y><rsub|L>> is very close to the hyperplanes
  <math|<wide|\<cal-U\>|^><rsub|\<ell\>>> locally estimated from
  <math|<around*|{|<y><rsub|\<ell\>>,\<cal-V\><rsub|\<ell\>><rsup|<around|(|K-1|)>>|}>>
  through the proposed approach (for any value of <math|\<eta\>>). Regarding
  the remaining <math|25%> pixels resulting from the GBM (top left part of
  the image), the following comments can be made. When a crude SVD of
  <math|<around*|{|<y><rsub|\<ell\>>,\<cal-V\><rsub|\<ell\>><rsup|<around|(|K-1|)>>|}>>
  is conducted, i.e., when no prior knowledge is taken into account to
  compute the MMSD (<math|\<eta\>=0>, top right panel), the distance between
  the locally estimated subspace <math|<wide|\<cal-U\>|^><rsub|\<ell\>>> and
  the <em|a priori> assumed hyperplane <math|<wide|\<cal-U\>|\<bar\>>> does
  not reflect the non-linearities contained in the image. Conversely, when
  this crude SVD is regularized by incorporating prior knowledge with
  <math|\<eta\>=0.5> and <math|\<eta\>=50> (bottom left and right panels,
  respectively), leading to the MMSD estimator, the larger the degree of
  non-linearity, the larger the distance between <math|<Ubar>> and
  <math|<wide|<U>|^><rsub|\<ell\>>>. To summarize, evaluating the distance
  between the MMSD estimator <math|<wide|<U>|^><rsub|\<ell\>>> and the <em|a
  priori> given matrix <math|<Ubar>> allows the degree of non-linearity to be
  quantified. This interesting property is exploited on a real hyperspectral
  image in the following section.

  <subsection|Real data>

  The real hyperspectral image considered in this section has been acquired
  in 1997 over Moffett Field, CA, by the NASA spectro-imager AVIRIS. This
  image, depicted with composite true colors in Fig. <reference|fig:dist>
  (top, left panel), has been minutely studied in <cite|Dobigeon2009sp>
  assuming a linear mixing model. The scene consists of a large part of a
  lake (black pixels, top) and a coastal area (bottom) composed of soil
  (brown pixels) and vegetation (green pixels), leading to <math|R=3>
  endmembers whose spectra and abundance maps can be found in
  <cite|Dobigeon2009sp>. A simple estimation of a lower-dimensional space
  <math|<wide|\<cal-U\>|\<bar\>>> where the pixels live can be conducted
  through a direct SVD of the whole dataset, providing the <em|a priori>
  matrix <math|<Ubar>>. As in the previous section, this crude estimation can
  be refined by computing locally the MMSD estimators
  <math|<wide|<U>|^><rsub|\<ell\>>> spanning the subspaces
  <math|<wide|\<cal-U\>|^><rsub|\<ell\>>> (bottom panels). These estimators
  have been also computed with <math|\<eta\>=0>, corresponding to an SVD of
  <math|<around*|{|<y><rsub|\<ell\>>,\<cal-V\><rsub|\<ell\>><rsup|<around|(|K-1|)>>|}>>
  (top, right figure). The distances between <math|<Ubar>> and
  <math|<wide|<U>|^><rsub|\<ell\>>> have been reported in the maps of Fig.
  <reference|fig:dist>. Again, for <math|\<eta\>=0> (top, right panel), a
  simple local SVD is unable to locate possible non-linearities in the scene.
  However, for two<footnote|Additional results obtained with other values of
  <math|\<eta\>> are available online at <slink|http://dobigeon.perso.enseeiht.fr/app_MMSD.html>.>
  non-zero values <math|\<eta\>=0.5> and <math|\<eta\>=50> (bottom left and
  right panels, respectively), the distances between the <em|a priori>
  recovered subspace <math|<wide|\<cal-U\>|\<bar\>>> and the MMSD-based
  subspace <math|<wide|\<cal-U\>|^><rsub|\<ell\>>> clearly indicate that some
  non-linear effects occur in specific parts of the image, especially in the
  lake shore. Note that the non-linearities identified by the proposed
  algorithm are very similar to the ones highlighted in <cite|Halimi2011>
  where the unmixing procedure was conducted by using the GBM defined in
  <eqref|eq:GBM>. This shows the accuracy of the proposed MMSD estimator in
  localizing the non-linearities occurring in the scene, which is interesting
  for the analysis of hyperspectral images.

  <section|Conclusions>

  This paper considered the problem of estimating a subspace using some
  available a priori information. Towards this end, a Bayesian framework was
  advocated, where the subspace <math|<U>> is assumed to be drawn from an
  appropriate prior distribution. However, since we operate in a Grassmann
  manifold, the conventional MMSE approach is questionable as it amounts to
  minimizing a distance which is not the most meaningful on the Grassmann
  manifold. Consequently, we revisited the MMSE approach and proposed, as an
  alternative, to minimize a natural distance on the Grassmann manifold. A
  general framework was formulated resulting in a novel estimator which
  entails computing the principal eigenvectors of the posterior mean of
  <math|<U><U><rsup|T>>. The theory was exemplified on a few simple examples,
  where the MMSD estimator can either be obtained in closed-form or requires
  resorting to an MCMC simulation method. The new approach enables one to
  combine efficiently the prior knowledge and the data information, resulting
  in a method that performs well at low SNR or with very small sample
  support. A successful application to the analysis of non-linearities
  contained in hyperspectral images was also presented.

  <appendices>

  <section|The eigenvalue decomposition of
  <math|<big|int><U><U><rsup|T>p<rsub|<math-up|B>><around|(|<U>|)>*d<U>>><label|app:eigBingham>

  The purpose of this appendix is to prove the following proposition which
  can be invoked to obtain the MMSD estimator whenever the posterior
  distribution <math|p<around|(|<U>\|<Y>|)>> is a Bingham distribution.

  <\prop>
    <label|prop:eigBingham>Let <math|<U>\<in\>\<bbb-R\><rsup|N\<times\>p>> be
    an orthogonal matrix (<math|<U><rsup|T><U>=<I>>) drawn from a Bingham
    distribution with parameter matrix <math|<A>>

    <\equation>
      <label|p>p<rsub|<math-up|B>><around|(|<U>|)>=<ex|-\<kappa\><rsub|<math-up|B>><around|(|<A>|)>><etr|<U><rsup|T><A><U>>
    </equation>

    with <math|\<kappa\><rsub|<math-up|B>><around|(|<A>|)>=ln
    <oneFone|<frac|1|2>*p|<frac|1|2>*N|<A>>>. Let
    <math|<A>=<U><rsub|a><bLambda><rsub|a><U><rsub|a><rsup|T>> denote the
    eigenvalue decomposition of <math|<A>> where the eigenvalues are ordered
    in descending order. Let us define <math|<M>=<big|int><U><U><rsup|T>p<rsub|<math-up|B>><around|(|<U>|)>*d<U>>.
    Then the eigenvalue decomposition of <math|<M>> reads

    <\equation*>
      <M>=<ex|-\<kappa\><rsub|<math-up|B>><around|(|<A>|)>><U><rsub|a><bGamma><U><rsub|a><rsup|T>
    </equation*>

    with <math|<bGamma>=<frac|\<partial\><ex|\<kappa\><rsub|<math-up|B>><around|(|<A>|)>>|\<partial\><bLambda><rsub|a>>>
    and <math|\<gamma\><rsub|1>\<geq\>\<gamma\><rsub|2>\<geq\>\<cdots\>\<geq\>\<gamma\><rsub|N>>
    where <math|\<gamma\><rsub|n>=<bGamma><around|(|n,n|)>>.
  </prop>

  <\IEEEproof>
    \ For notational convenience, let us work with the projection matrix
    <math|<bP>=<U><U><rsup|T>> whose distribution on the Grassmann manifold
    is <cite|Chikuse03>

    <\equation>
      <label|p(P)>p<around|(|<bP>|)>=<ex|-\<kappa\><rsub|<math-up|B>><around|(|<A>|)>><etr|<bP><A>>.
    </equation>

    We have then that

    <align*|<tformat|<table|<row|<cell|<M>>|<cell|=<ex|-\<kappa\><rsub|<math-up|B>><around|(|<A>|)>><big|int><bP><etr|<bP><U><rsub|a><bLambda><rsub|a><U><rsub|a><rsup|T>>d<bP>>>|<row|<cell|>|<cell|=<ex|-\<kappa\><rsub|<math-up|B>><around|(|<A>|)>><U><rsub|a><around*|[|<big|int><U><rsub|a><rsup|T><bP><U><rsub|a><etr|<U><rsub|a><rsup|T><bP><U><rsub|a><bLambda><rsub|a>>d<bP>|]><U><rsub|a><rsup|T>>>|<row|<cell|>|<cell|=<ex|-\<kappa\><rsub|<math-up|B>><around|(|<A>|)>><U><rsub|a><around*|[|<big|int><bP><etr|<bP><bLambda><rsub|a>>d<bP>|]><U><rsub|a><rsup|T>>>|<row|<cell|>|<cell|=<ex|-\<kappa\><rsub|<math-up|B>><around|(|<A>|)>><U><rsub|a><bGamma><U><rsub|a><rsup|T>.>>>>>

    Moreover <math|<bGamma>> is diagonal since, for any orthogonal diagonal
    matrix <math|<D>>,

    <align*|<tformat|<table|<row|<cell|<bGamma><D>>|<cell|=<big|int><bP><D><etr|<bP><bLambda><rsub|a>>d<bP>>>|<row|<cell|>|<cell|=<D><around*|[|<big|int><D><rsup|T><bP><D><etr|<D><rsup|T><bP><D><D><rsup|T><bLambda><rsub|a><D>>d<bP>|]>>>|<row|<cell|>|<cell|=<D><big|int><bP><etr|<bP><bLambda><rsub|a>>d<bP>>>|<row|<cell|>|<cell|=<D><bGamma>>>>>>

    where, to obtain the third line, we made use of the fact that
    <math|<D><rsup|T><bLambda><rsub|a><D>=<bLambda><rsub|a>>. It follows that
    the eigenvectors of <math|<M>> and <math|<A>> coincide, and that the
    eigenvalues of <math|<M>> are <math|<ex|-\<kappa\><rsub|<math-up|B>><around|(|<A>|)>>\<gamma\><rsub|n>>,
    for <math|n=1,\<cdots\>,N>. Moreover, it is known that
    <math|<ex|-\<kappa\><rsub|<math-up|B>><around|(|<A>|)>>=<ex|-\<kappa\><rsub|<math-up|B>><around|(|<bLambda><rsub|a>|)>>>
    and, from <eqref|p(P)>, one has

    <\equation*>
      <ex|\<kappa\><rsub|<math-up|B>><around|(|<bLambda><rsub|a>|)>>=<big|int><etr|<bP><bLambda><rsub|a>>d<bP>.
    </equation*>

    Differentiating the latter equation with respect to
    <math|\<lambda\><rsub|a><around|(|k|)>> and denoting
    <math|p<rsub|n>=<bP><around|(|n,n|)>>, one obtains

    <align*|<tformat|<table|<row|<cell|<frac|\<partial\><ex|\<kappa\><rsub|<math-up|B>><around|(|<bLambda><rsub|a>|)>>|\<partial\>*\<lambda\><rsub|a><around|(|k|)>>>|<cell|=<frac|\<partial\>|\<partial\>*\<lambda\><rsub|a><around|(|k|)>>*<big|int><ex|<big|sum><rsub|n=1><rsup|N>\<lambda\><rsub|a><around|(|n|)>*p<rsub|n>>d<bP>>>|<row|<cell|>|<cell|=<big|int>p<rsub|k><etr|<bP><bLambda><rsub|a>>d<bP>>>|<row|<cell|>|<cell|=\<gamma\><rsub|k>.>>>>>

    The previous equation enables one to relate the eigenvalues of <math|<A>>
    and those of <math|<M>>. It remains to prove that
    <math|\<gamma\><rsub|1>\<geq\>\<gamma\><rsub|2>\<geq\>\<cdots\>\<geq\>\<gamma\><rsub|N>>.
    Towards this end, we make use of a very general theorem due to Letac
    <cite|Letac10>, which is briefly outlined below. Let

    <\equation*>
      P<around|(|\<mu\>,<A>|)><around|(|d<X>|)>=<ex|\<kappa\><rsub|\<mu\>><around|(|<A>|)>><etr|<X><rsup|T><A>>\<mu\><around|(|d<X>|)>
    </equation*>

    be a probability associated with a unitarily invariant measure
    <math|\<mu\>> on the set of <math|N\<times\>N> symmetric matrices.
    Consider the case of a diagonal matrix
    <math|<A>=<diag|a<rsub|1>,a<rsub|2>,\<cdots\>,a<rsub|N>>> with
    <math|a<rsub|1>\<geq\>a<rsub|2>\<geq\>\<cdots\>\<geq\>a<rsub|N>>. Then
    <cite|Letac10> proves that <math|<M>=<big|int><X>P<around|(|\<mu\>,<A>|)><around|(|d<X>|)>>
    is also diagonal, and moreover if <math|<M>=<diag|m<rsub|1>,m<rsub|2>,\<cdots\>,m<rsub|N>>>
    then <math|m<rsub|1>\<geq\>m<rsub|2>\<geq\>\<cdots\>\<geq\>m<rsub|N>>.
    Use of this theorem completes the proof of the proposition.
  </IEEEproof>

  \;

  <\rem>
    Most of the proposition could be proved, however in a rather indirect
    way, using the results in <cite|Jupp79>. In this reference, Jupp and
    Mardia consider maximum likelihood estimation of the parameter matrix
    <math|<A>> from the observation of <math|K> independent matrices
    <math|<U><rsub|k>> drawn from <eqref|p>. Let
    <math|<Pbar>=K<rsup|-1>*<big|sum><rsub|k=1><rsup|K><U><rsub|k><U><rsub|k><rsup|T>>
    and let its eigenvalue decomposition be
    <math|<Pbar>=<Vbar><Dbar><Vbar><rsup|T>>. Then the maximum likelihood
    estimate <math|<wide|<A>|^>> of <math|<A>> has eigenvalue decomposition
    <math|<wide|<A>|^>=<Vbar><D><Vbar><rsup|T>> with
    <math|<wide|d|\<bar\>><rsub|n>=\<partial\><ex|\<kappa\><rsub|<math-up|B>><around|(|<D>|)>>/\<partial\>*d<rsub|n>>.
    Moreover, due to the Barndorff-Nielsen theorem for exponential families, one
    has

    <\equation*>
      \<tau\><rsub|B><around|(|<wide|<A>|^>|)>=<big|int><bP><ex|-\<kappa\><rsub|B><around|(|<wide|<A>|^>|)>><etr|<bP><wide|<A>|^>>d<bP>=<Vbar><Dbar><Vbar><rsup|T>
    </equation*>

    which proves, since <math|<wide|<A>|^>=<Vbar><D><Vbar><rsup|T>>, that

    <\equation*>
      <big|int><bP><ex|-\<kappa\><rsub|B><around|(|<D>|)>><etr|<bP><Vbar><D><Vbar><rsup|T>>d<bP>=<Vbar><Dbar><Vbar><rsup|T>.
    </equation*>

    In <cite|Jupp79> however, no result about the ordering of the
    eigenvalues was given.
  </rem>

  <section|Sampling from the Bingham-von Mises Fisher
  distribution><label|app:BMF>

  In this appendix, we show how to sample a unitary random matrix
  <math|<X>\<in\>\<bbb-R\><rsup|N\<times\>p>> from a (matrix) Bingham von
  Mises Fisher (BMF) distribution, <math|<X>\<sim\><BMF|<A>|<B>|<C>>>. As
  will be explained shortly, this amounts to sampling successively each
  column of <math|<X>>, and entails generating a random unit norm vector
  drawn from a (vector) BMF distribution. We briefly review how to sample the
  columns of <math|<X>> and then explain how to sample from a vector BMF
  distribution.

  <subsection|The matrix BMF distribution>

  The density of <math|<X>\<sim\><BMF|<A>|<B>|<C>>> is given by

  <align|<tformat|<table|<row|<cell|<label|matrix>p<around*|(|<X>\|<A>,<B>,<C>|)>>|<cell|\<propto\><etr|<C><rsup|T><X>+<B><X><rsup|T><A><X>><no-number>>>|<row|<cell|>|<cell|\<propto\><big|prod><rsub|k=1><rsup|p><ex|<bc><rsub|k><rsup|T><x><rsub|k>+<B><around|(|k,k|)><x><rsub|k><rsup|T><A><x><rsub|k>>>>>>>

  where <math|<X>=<around*|[|<tabular*|<tformat|<table|<row|<cell|<x><rsub|1>>|<cell|<x><rsub|2>>|<cell|\<cdots\>>|<cell|<x><rsub|p>>>>>>|]>>
  and <math|<C>=<around*|[|<tabular*|<tformat|<table|<row|<cell|<bc><rsub|1>>|<cell|<bc><rsub|2>>|<cell|\<cdots\>>|<cell|<bc><rsub|p>>>>>>|]>>.
  In <cite|Hoff09> a Gibbs-sampling strategy was presented in order to sample
  from this distribution, in the case where <math|<A>> is full-rank. We
  consider here a situation where <math|<A>> is rank-deficient and therefore
  we need to bring appropriate modifications to the scheme of <cite|Hoff09>
  in order to handle the rank deficiency of <math|<A>>. As evidenced from
  <eqref|matrix>, the distribution of <math|<X>> is a product of
  vector BMF distributions, except that the columns of <math|<X>> are not
  statistically independent since they are orthogonal with probability one.
  Let us rewrite <math|<X>> as <math|<X>=<around*|[|<tabular*|<tformat|<table|<row|<cell|<x><rsub|1>>|<cell|\<cdots\>>|<cell|<x><rsub|k-1>>|<cell|<Null><z>>|<cell|<x><rsub|k+1>>|<cell|\<cdots\>>|<cell|<x><rsub|p>>>>>>|]>>
  where <math|<z>\<in\>\<cal-S\><rsub|N-p+1>=<around*|{|<x>\<in\>\<bbb-R\><rsup|N-p+1\<times\>1>;<x><rsup|T><x>=1|}>>
  and <math|<Null>> is an <math|N\<times\><around|(|N-p+1|)>> orthonormal basis for
  <math|<range|<X><rsub|-k>><rsup|\<perp\>>> where <math|<X><rsub|-k>> stands for the matrix
  <math|<X>> with its <math|k>-th column removed. As shown in <cite|Hoff09>
  the conditional density of <math|<z>> given <math|<X><rsub|-k>> is

  <align|<tformat|<table|<row|<cell|p<around*|(|<z>\|<X><rsub|-k>|)>>|<cell|\<propto\><ex|<bc><rsub|k><rsup|T><Null><z>+<B><around|(|k,k|)><z><rsup|T><Null><rsup|T><A><Null><z>><no-number>>>|<row|<cell|>|<cell|\<propto\><ex|<wide|<bc>|~><rsub|k><rsup|T><z>+<z><rsup|T><wide|<A>|~><z>>>>>>>

  where <math|<wide|<bc>|~><rsub|k>=<Null><rsup|T><bc><rsub|k>> and
  <math|<wide|<A>|~>=<B><around|(|k,k|)><Null><rsup|T><A><Null>>. Therefore,
  <math|<z>\|<X><rsub|-k>> follows a vector BMF distribution
  <math|<z>\|<X><rsub|-k>\<sim\><vBMF|<wide|<A>|~>|<wide|<bc>|~><rsub|k>>>. A
  Markov chain that converges to <math|<BMF|<A>|<B>|<C>>> can thus be
  constructed as follows: \ <algo-require|initial value
  <math|<X><rsup|<around|(|0|)>>>>

  <\algo-for|<math|k=1,\<cdots\>,p> (random order)>
    <algo-state|compute a basis <math|<Null>> for the null space of
    <math|<X><rsub|-k>> and set <math|<z>=<Null><rsup|T><x><rsub|k>>.>

    <algo-state|compute <math|<wide|<bc>|~><rsub|k>=<Null><rsup|T><bc><rsub|k>>
    and <math|<wide|<A>|~>=<B><around|(|k,k|)><Null><rsup|T><A><Null>>.>

    <algo-state|sample <math|<z>> from a <math|<vBMF|<wide|<A>|~>|<wide|<bc>|~><rsub|k>>>
    distribution (<em|see next section>).<label|sample>>

    <algo-state|set <math|<x><rsub|k>=<Null><z>>.>
  </algo-for>

  <subsection|The vector BMF distribution>

  The core part of the above algorithm, see line <reference|sample>, is to
  draw a unit-norm random vector <math|<x>> distributed according to a vector
  Bingham-von Mises Fisher distribution. The latter distribution on the
  <math|M>-dimensional sphere has a density with respect to the uniform
  distribution given by

  <\equation>
    p<around*|(|<x>\|<bc>,<A>|)>\<propto\><ex|<bc><rsup|T><x>+<x><rsup|T><A><x>>,<x>\<in\>\<cal-S\><rsub|M>.
  </equation>

  In <cite|Hoff09> a Gibbs-sampling strategy was presented in order to sample
  from this distribution. While <math|<A>> was assumed to be full-rank in
  <cite|Hoff09>, we consider here a situation where <math|<A>> is
  rank-deficient, i.e. its eigenvalue decomposition can be written as
  <math|<A>=<bE><bLambda><bE><rsup|T>> where <math|<bE>> stands for the
  orthonormal matrix of the eigenvectors and
  <math|<bLambda>=<diag|\<lambda\><rsub|1>,\<lambda\><rsub|2>,\<cdots\>,\<lambda\><rsub|r>,0,\<cdots\>,0>>
  is the diagonal matrix of its eigenvalues. Our derivation follows along the
  same lines as in <cite|Hoff09> with the appropriate modifications due to
  the rank deficiency of <math|<A>>. Let <math|<y>=<bE><rsup|T><x>\<in\>\<cal-S\><rsub|M>>
  and <math|<bd>=<bE><rsup|T><bc>>. Since
  <math|y<rsub|M><rsup|2>=1-<big|sum><rsub|k=1><rsup|M-1>y<rsub|k><rsup|2>>,
  the uniform density in terms of the unconstrained coordinates
  <math|<around*|{|y<rsub|1>,y<rsub|2>,\<cdots\>,y<rsub|M-1>|}>> is
  proportional to <math|<around|\||y<rsub|M>|\|><rsup|-1>> and the density of
  <math|<around*|{|y<rsub|1>,y<rsub|2>,\<cdots\>,y<rsub|M-1>|}>> is given by
  <cite|Hoff09>

  <align|<tformat|<table|<row|<cell|p<around*|(|<y>\|<bd>,<bE>|)>>|<cell|\<propto\><ex|<bd><rsup|T><y>+<y><rsup|T><bLambda><y>><around|\||y<rsub|M>|\|><rsup|-1>,<space|1em>y<rsub|M><rsup|2>=1-<big|sum><rsub|k=1><rsup|M-1>y<rsub|k><rsup|2>*<no-number>>>|<row|<cell|>|<cell|\<propto\><ex|<big|sum><rsub|k=1><rsup|M>d<rsub|k>*y<rsub|k>+<big|sum><rsub|k=1><rsup|r>\<lambda\><rsub|k>*y<rsub|k><rsup|2>><around|\||y<rsub|M>|\|><rsup|-1>.>>>>>

  In order to sample from this distribution, a Gibbs sampling strategy is
  advocated. Towards this end, we need to derive the conditional
  distributions of <math|y<rsub|k>>, given <math|<y><rsub|-k>> where
  <math|<y><rsub|-k>> stands for the vector <math|<y>> with its <math|k>-th
  component removed. Similarly to <cite|Hoff09>, let us make the change of
  variables <math|\<theta\><rsub|k>=y<rsub|k><rsup|2>> and let
  <math|<q>=<around*|[|<tabular*|<tformat|<table|<row|<cell|<frac|y<rsub|1><rsup|2>|1-y<rsub|k><rsup|2>>>|<cell|<frac|y<rsub|2><rsup|2>|1-y<rsub|k><rsup|2>>>|<cell|\<cdots\>>|<cell|<frac|y<rsub|M><rsup|2>|1-y<rsub|k><rsup|2>>>>>>>|]><rsup|T>>,
  so that <math|<around*|{|y<rsub|1><rsup|2>,y<rsub|2><rsup|2>,\<cdots\>,y<rsub|M><rsup|2>|}>=<around*|{|\<theta\><rsub|k>,<around*|(|1-\<theta\><rsub|k>|)><q><rsub|-k>|}>>.
  Since this change of variables is not bijective, i.e.
  <math|y<rsub|k>=\<pm\>\<theta\><rsub|k><rsup|1/2>>, we need to introduce the
  sign <math|s<rsub|k>> of <math|y<rsub|k>>, and we let
  <math|<bs>=<around*|[|<tabular*|<tformat|<table|<row|<cell|s<rsub|1>>|<cell|s<rsub|2>>|<cell|\<cdots\>>|<cell|s<rsub|M>>>>>>|]><rsup|T>>.
  Note that <math|y<rsub|M><rsup|2>=1-<big|sum><rsub|k=1><rsup|M-1>y<rsub|k><rsup|2>>,
  <math|<around|\||y<rsub|M>|\|>=<around|(|1-\<theta\><rsub|k>|)><rsup|1/2>*q<rsub|M><rsup|1/2>>
  and <math|q<rsub|M>=1-<big|sum><rsub|\<ell\>=1,\<ell\>\<neq\>k><rsup|M-1>q<rsub|\<ell\>>>.
  As shown in <cite|Hoff09>, the Jacobian of the transformation from
  <math|<around*|{|y<rsub|1>,y<rsub|2>,\<cdots\>,y<rsub|M-1>|}>> to
  <math|<around*|{|\<theta\><rsub|k>,q<rsub|1>,\<cdots\>,q<rsub|k-1>,q<rsub|k+1>,\<cdots\>,q<rsub|M-1>|}>>
  is proportional to <math|\<theta\><rsub|k><rsup|-1/2>*<around*|(|1-\<theta\><rsub|k>|)><rsup|<around|(|M-2|)>/2>*<big|prod><rsub|\<ell\>=1,\<ell\>\<neq\>k><rsup|M-1>q<rsub|\<ell\>><rsup|-1/2>>,
  and therefore the joint distribution of
  <math|\<theta\><rsub|k>,s<rsub|k>,<q><rsub|-k>,<bs><rsub|-k>> can be
  written as

  <align|<tformat|<table|<row|<cell|<label|joint>p<around*|(|\<theta\><rsub|k>,s<rsub|k>,<q><rsub|-k>,<bs><rsub|-k>|)>>|<cell|\<propto\>\<theta\><rsub|k><rsup|-1/2>*<around*|(|1-\<theta\><rsub|k>|)><rsup|<around|(|M-3|)>/2><around*|(|<big|prod><rsub|\<ell\>\<neq\>k>q<rsub|\<ell\>><rsup|-1/2>|)><no-number>>>|<row|<cell|>|<cell|\<times\><ex|s<rsub|k>*\<theta\><rsub|k><rsup|1/2>*d<rsub|k>+<around*|(|1-\<theta\><rsub|k>|)><rsup|1/2>*<big|sum><rsub|\<ell\>\<neq\>k>d<rsub|\<ell\>>*s<rsub|\<ell\>>*q<rsub|\<ell\>><rsup|1/2>><no-number>>>|<row|<cell|>|<cell|\<times\><choice|<tformat|<table|<row|<cell|<ex|\<theta\><rsub|k>*\<lambda\><rsub|k>+<around*|(|1-\<theta\><rsub|k>|)>*<big|sum><rsub|\<ell\>=1,\<ell\>\<neq\>k><rsup|r>q<rsub|\<ell\>>*\<lambda\><rsub|\<ell\>>>>|<cell|1\<leq\>k\<leq\>r>>|<row|<cell|<ex|<around*|(|1-\<theta\><rsub|k>|)>*<big|sum><rsub|\<ell\>=1><rsup|r>q<rsub|\<ell\>>*\<lambda\><rsub|\<ell\>>>>|<cell|r+1\<leq\>k\<leq\>M>>>>>.>>>>>

  It follows that

  <\itemize>
    <item>for <math|k\<in\><around*|[|1,r|]>>

    <align|<tformat|<table|<row|<cell|p<around*|(|\<theta\><rsub|k>,s<rsub|k>\|<q><rsub|-k>,<bs><rsub|-k>|)>>|<cell|\<propto\>\<theta\><rsub|k><rsup|-1/2>*<around*|(|1-\<theta\><rsub|k>|)><rsup|<around|(|M-3|)>/2><ex|\<theta\><rsub|k>*\<lambda\><rsub|k>+<around*|(|1-\<theta\><rsub|k>|)><q><rsub|-k><rsup|T><blambda><rsub|-k>><no-number>>>|<row|<cell|>|<cell|\<times\><ex|s<rsub|k>*\<theta\><rsub|k><rsup|1/2>*d<rsub|k>+<around*|(|1-\<theta\><rsub|k>|)><rsup|1/2>*<around*|[|<bs><rsub|-k>\<odot\><q><rsub|-k><rsup|1/2>|]><rsup|T><bd><rsub|-k>>.>>>>>

    <item>for <math|k\<in\><around*|[|r+1,M|]>>

    <align|<tformat|<table|<row|<cell|p<around*|(|\<theta\><rsub|k>,s<rsub|k>\|<q><rsub|-k>,<bs><rsub|-k>|)>>|<cell|\<propto\>\<theta\><rsub|k><rsup|-1/2>*<around*|(|1-\<theta\><rsub|k>|)><rsup|<around|(|M-3|)>/2><ex|<around*|(|1-\<theta\><rsub|k>|)><q><rsup|T><blambda>><no-number>>>|<row|<cell|>|<cell|\<times\><ex|s<rsub|k>*\<theta\><rsub|k><rsup|1/2>*d<rsub|k>+<around*|(|1-\<theta\><rsub|k>|)><rsup|1/2>*<around*|[|<bs><rsub|-k>\<odot\><q><rsub|-k><rsup|1/2>|]><rsup|T><bd><rsub|-k>>.>>>>>
  </itemize>

  In the previous equations, <math|\<odot\>> stands for the element-wise
  vector or matrix product and <math|<q><rsub|-k><rsup|1/2>> is a short-hand
  notation to designate the vector <math|<around*|[|<tabular*|<tformat|<table|<row|<cell|q<rsub|1><rsup|1/2>>|<cell|\<cdots\>>|<cell|q<rsub|k-1><rsup|1/2>>|<cell|q<rsub|k+1><rsup|1/2>>|<cell|\<cdots\>>|<cell|q<rsub|M><rsup|1/2>>>>>>|]><rsup|T>>.
  In order to sample from <math|p<around*|(|\<theta\><rsub|k>,s<rsub|k>\|<q><rsub|-k>,<bs><rsub|-k>|)>>,
  we first sample <math|\<theta\><rsub|k>> from

  <align|<tformat|<table|<row|<cell|<label|pdf>p<around*|(|\<theta\><rsub|k>\|<q><rsub|-k>,<bs><rsub|-k>|)>>|<cell|=p*<around*|(|\<theta\><rsub|k>,s<rsub|k>=-1\|<q><rsub|-k>,<bs><rsub|-k>|)>+p*<around*|(|\<theta\><rsub|k>,s<rsub|k>=1\|<q><rsub|-k>,<bs><rsub|-k>|)><no-number>>>|<row|<cell|>|<cell|\<propto\>\<theta\><rsub|k><rsup|-1/2>*<around*|(|1-\<theta\><rsub|k>|)><rsup|<around|(|M-3|)>/2><ex|a<rsub|k>*\<theta\><rsub|k>+b<rsub|k>*<around*|(|1-\<theta\><rsub|k>|)><rsup|1/2>><no-number>>>|<row|<cell|>|<cell|\<times\><around*|[|<ex|-d<rsub|k>*\<theta\><rsub|k><rsup|1/2>>+<ex|d<rsub|k>*\<theta\><rsub|k><rsup|1/2>>|]>>>>>>

  where <math|b<rsub|k>=<around*|[|<bs><rsub|-k>\<odot\><q><rsub|-k><rsup|1/2>|]><rsup|T><bd><rsub|-k>>
  and

  <\equation>
    a<rsub|k>=<choice|<tformat|<table|<row|<cell|\<lambda\><rsub|k>-<q><rsub|-k><rsup|T><blambda><rsub|-k>>|<cell|k\<in\><around*|[|1,r|]>>>|<row|<cell|-<q><rsup|T><blambda>>|<cell|k\<in\><around*|[|r+1,M|]>>>>>>.
  </equation>

  Next, we sample <math|s<rsub|k>\<in\><around*|{|-1,+1|}>> with
  probabilities proportional to <math|<around*|(|e<rsup|-d<rsub|k>*\<theta\><rsub|k><rsup|1/2>>,e<rsup|+d<rsub|k>*\<theta\><rsub|k><rsup|1/2>>|)>>.
  In order to sample from the distribution in
  <eqref|pdf>, an efficient rejection sampling scheme
  was proposed in <cite|Hoff09>, where the proposal distribution is a beta
  distribution with suitably chosen parameters.

  <section*|Acknowledgment>

  The authors would like to thank Prof. Kit Bingham from the University of
  Minnesota for insightful comments on the Bingham distribution and for
  pointing out reference <cite|Jupp79>. They are also indebted to Prof. Gérard
  Letac, University of Toulouse, for fruitful discussions leading to the
  proof of Proposition <reference|prop:eigBingham> given in Appendix
  <reference|app:eigBingham>.

  <\bibliography|bib|IEEEtran|mmsd>
    <bib-list|[99]|>
  </bibliography>

  <new-page>

  <big-figure|<with|par-mode|center|<image|mmsd_lm_bingham_vs_K_N=20_p=5_kappa=20_SNR=5.eps|8cm|||><label|fig:mmsd>>|Fraction
  of energy of <math|<wide|<U>|^>> in <math|<range|<U>>> versus <math|K>. <math|N=20>,
  <math|p=5>, <math|\<kappa\>=20> and <math|S*N*R=5>dB. Linear model, Bingham
  prior.>

  <big-figure|<with|par-mode|center|<image|mmsd_lm_bingham_vs_SNR_N=20_p=5_kappa=20_K=5.eps|8cm|||><label|fig:mmsd>>|Fraction
  of energy of <math|<wide|<U>|^>> in <math|<range|<U>>> versus <math|S*N*R>.
  <math|N=20>, <math|p=5>, <math|\<kappa\>=20> and <math|K=5>. Linear model,
  Bingham prior.>

  <big-figure|<with|par-mode|center|<image|mmsd_lm_vmf_vs_K_N=20_p=5_kappa=20_SNR=5.eps|8cm|||><label|fig:mmsd>>|Fraction
  of energy of <math|<wide|<U>|^>> in <math|<range|<U>>> versus <math|K>. <math|N=20>,
  <math|p=5>, <math|\<kappa\>=20> and <math|S*N*R=5>dB. Linear model, vMF
  prior.>

  <big-figure|<with|par-mode|center|<image|mmsd_lm_vmf_vs_SNR_N=20_p=5_kappa=20_K=5.eps|8cm|||><label|fig:mmsd>>|Fraction
  of energy of <math|<wide|<U>|^>> in <math|<range|<U>>> versus <math|S*N*R>.
  <math|N=20>, <math|p=5>, <math|\<kappa\>=20> and <math|K=5>. Linear model,
  vMF prior.>

  <big-figure|<with|par-mode|center|<image|mmsd_ulu_bingham_vs_K_N=20_p=5_kappa=20_SNR=5-10.eps|8cm|||><label|fig:mmsd>>|Fraction
  of energy of <math|<wide|<U>|^>> in <math|<range|<U>>> versus <math|K>. <math|N=20>,
  <math|p=5>, <math|\<kappa\>=20>, <math|S*N*R<rsub|->=5>dB and
  <math|S*N*R<rsub|+>=10>dB. Covariance matrix model, Bingham prior.>

  <big-figure|<with|par-mode|center|<image|mmsd_ulu_vmf_vs_K_N=20_p=5_kappa=20_SNR=5-10.eps|8cm|||><label|fig:mmsd>>|Fraction
  of energy of <math|<wide|<U>|^>> in <math|<range|<U>>> versus <math|K>. <math|N=20>,
  <math|p=5>, <math|\<kappa\>=20>, <math|S*N*R<rsub|->=5>dB and
  <math|S*N*R<rsub|+>=10>dB. Covariance matrix model, vMF prior.>

  <big-figure|<with|par-mode|center|<image|fig_dist_map_L=2_knn=3_spect_nb_synth_GBM_nb_new.eps|8cm|||><label|fig:dist>>|Top,
  left: non-linearity coefficients <math|\<gamma\><rsub|1,2>>. Top, right:
  distance between <math|<Ubar>> and <math|<wide|<U>|^><rsub|n>> estimated
  with <math|\<eta\>=0>. Bottom: distance between <math|<wide|<U>|\<bar\>>>
  and <math|<wide|<U>|^><rsub|\<ell\>>> estimated with <math|\<eta\>=0.5>
  (left) and <math|\<eta\>=50> (right).>

  <big-figure|<with|par-mode|center|<image|fig_dist_map_L=2_knn=3_spect_nb.eps|8cm|||><label|fig:dist>>|Top,
  left: The Moffett Field scene as composite true colors. Top, right:
  distance between <math|<Ubar>> and <math|<wide|<U>|^><rsub|n>> estimated
  with <math|\<eta\>=0>. Bottom: distance between <math|<wide|<U>|\<bar\>>>
  and <math|<wide|<U>|^><rsub|\<ell\>>> estimated with <math|\<eta\>=0.5>
  (left) and <math|\<eta\>=50> (right).>
</body>