<TeXmacs|1.99.7>

<style|<tuple|ieeetran|std-latex>>

<\body>
  <\hide-preamble>
    <assign|figsize|<macro|4.45>>

    <assign|defn|<macro|\<triangleq\>>>

    <assign|limit|<macro|1|2|<math|lim<rsub|<arg|1>\<rightarrow\><arg|2>>>>>

    <assign|uvec|<macro|1|<math|<wide*|<with|math-font-series|bold|<arg|1>>|\<bar\>>>>>

    <assign|tvec|<macro|1|<math|<wide|<with|math-font-series|bold|<arg|1>>|~>>>>

    <assign|ovec|<macro|1|<math|<wide|<with|math-font-series|bold|<arg|1>>|\<bar\>>>>>

    <assign|hvec|<macro|1|<math|<wide|<with|math-font-series|bold|<arg|1>>|^>>>>

    <assign|bvec|<macro|1|<math|<wide|<with|math-font-series|bold|<arg|1>>|\<breve\>>>>>

    <assign|vec|<macro|1|<math|<with|math-font-series|bold|<arg|1>>>>>

    <assign|mat|<macro|1|<math|<around*|[|<tabular*|<tformat|<table|<row|<cell|<arg|1>>>>>>|]>>>>

    <assign|smallmat|<macro|1|<math|<around*|[|<matrix*|<tformat|<table|<row|<cell|<arg|1>>>>>>|]>>>>

    <assign|ip|<macro|1|<math|<around*|\<langle\>|<arg|1>|\<rangle\>>>>>

    <assign|norm|<macro|1|<math|<around|\<\|\|\>|<arg|1>|\<\|\|\>>>>>

    <assign|mc|<macro|1|<math|<with|math-font|cal|<arg|1>>>>>

    <assign|st|<macro|<nbsp>s.t.<nbsp>>>

    <assign|barst|<macro|1|<move|<math|<around*|\||<rsub|<arg|1>>|\<nobracket\>>>|0pt|-0.5mm>>>

    <assign|Barst|<macro|1|<move|<math|<around*|\||<rsub|<arg|1>>|\<nobracket\>>>|0pt|-0.5mm>>>

    <assign|zci|<macro|<with|math-display|false|<frac|1|z<rsup|*>>>>>

    <assign|Real|<macro|\<bbb-R\>>>

    <assign|Complex|<macro|\<bbb-C\>>>

    <assign|Int|<macro|\<bbb-Z\>>>

    <assign|Nat|<macro|\<bbb-N\>>>

    <assign|Rat|<macro|\<bbb-Q\>>>

    <assign|Field|<macro|\<bbb-F\>>>

    <assign|Ell|<macro|\<cal-L\>>>

    <assign|kron|<macro|\<otimes\>>>

    <assign|conv|<macro|\<star\>>>

    <assign|cconv|<macro|\<circledast\>>>

    <assign|modulo|<macro|1|<left|langle><arg|1> <right|rangle>>>

    <assign|floor|<macro|1|<left|lfloor><arg|1> <right|rfloor>>>

    <assign|ceil|<macro|1|<left|lceil><arg|1> <right|rceil>>>

    <assign|tran|<macro|<rsup|<with|font-family|ss|T>>>>

    <assign|herm|<macro|<rsup|<with|font-family|ss|H>>>>

    <assign|of|<macro|1|<rsup|(<arg|1>)>>>

    <assign|oft|<macro|1|<rsup|(<arg|1>)<with|font-family|ss|T>>>>

    <assign|ofc|<macro|1|<rsup|(<arg|1>)*>>>

    <assign|ofH|<macro|1|<rsup|(<arg|1>)<with|font-family|ss|H>>>>

    <assign|ind|<macro|1<space|-2.5pt>I>>

    <assign|real|<macro|Re>>

    <assign|imag|<macro|Im>>

    <assign|sgn|<macro|sgn>>

    <assign|csgn|<macro|csgn>>

    <assign|E|<macro|E>>

    <assign|var|<macro|var>>

    <assign|cov|<macro|Cov>>

    <assign|kurt|<macro|kurt>>

    <assign|Kurt|<macro|<mc|K>>>

    <assign|row|<macro|row>>

    <assign|col|<macro|col>>

    <assign|spn|<macro|span>>

    <assign|nul|<macro|null>>

    <assign|rank|<macro|rank>>

    <assign|tr|<macro|tr>>

    <assign|diag|<macro|diag>>

    <assign|ddiag|<macro|ddiag>>

    <assign|sinc|<macro|sinc>>

    <assign|vect|<macro|vec>>

    <assign|bdiag|<macro|blkdiag>>

    <assign|Diag|<macro|<mc|D>>>

    <assign|cir|<macro|<mc|C>>>

    <assign|argmax|<macro|arg<space|0.17em>max>>

    <assign|argmin|<macro|arg<space|0.17em>min>>

    <assign|erfc|<macro|erfc>>

    <assign|Cov|<macro|Cov>>

    <assign|polylog|<macro|polylog>>

    <new-theorem|theorem|Theorem>

    <new-theorem|lemma|Lemma>

    <new-theorem|conjecture|Conjecture>

    <new-theorem|corollary|Corollary>

    <new-theorem|example|Example>

    <assign|eqref|<macro|1|(<reference|<merge|eq:|<arg|1>>>)>>

    <assign|Eqref|<macro|1|Equation<nbsp>(<reference|<merge|eq:|<arg|1>>>)>>

    <assign|Figref|<macro|1|Figure<nbsp><reference|<merge|fig:|<arg|1>>>>>

    <assign|figref|<macro|1|Fig.<nbsp><reference|<merge|fig:|<arg|1>>>>>

    <assign|tabref|<macro|1|Table<nbsp><reference|<merge|tab:|<arg|1>>>>>

    <assign|secref|<macro|1|Section<nbsp><reference|<merge|sec:|<arg|1>>>>>

    <assign|Secref|<macro|1|Section<nbsp><reference|<merge|sec:|<arg|1>>>>>

    <assign|appref|<macro|1|Appendix<nbsp><reference|<merge|app:|<arg|1>>>>>

    <assign|lemref|<macro|1|Lemma<nbsp><reference|<merge|lem:|<arg|1>>>>>

    <assign|thmref|<macro|1|Theorem<nbsp><reference|<merge|thm:|<arg|1>>>>>

    <assign|corref|<macro|1|Corollary<nbsp><reference|<merge|cor:|<arg|1>>>>>

    <assign|conref|<macro|1|Conjecture<nbsp><reference|<merge|con:|<arg|1>>>>>

    <assign|exaref|<macro|1|Example<nbsp><reference|<merge|exa:|<arg|1>>>>>

    <assign|etal|<macro|et al. >>

    <assign|ie|<macro|i.e., >>

    <assign|eg|<macro|e.g., >>

    <assign|textr|<macro|1|<with|color|Red|<arg|1>>>>

    <assign|textg|<macro|1|<with|color|Green|<arg|1>>>>

    <assign|textb|<macro|1|<with|color|Blue|<arg|1>>>>

    <assign|texto|<macro|1|<with|color|Orange|<arg|1>>>>

    <assign|textc|<macro|1|<with|color|Cyan|<arg|1>>>>

    <assign|comment|<macro|1|<vspace|2mm><no-indent><next-counter|comment><with|font-shape|slanted|<wide*|Comment
    <the-section>.<thecomment>|\<bar\>> <arg|1>:> >>

    <assign|texthead|<macro|1|<vspace|1fn><no-indent><next-counter|texthead><with|font-shape|slanted|<wide*|<the-section>.<thetexthead>)
    <arg|1>|\<bar\>>:> >>

    <assign|itemize|<macro|body|<\list|<labelitemi>|<topsep>0.05in
    <itemsep>0in>
      <arg|body>
    </list>>>

    <assign|giv|<macro|<space|0.17em>\|<space|0.17em>>>

    <assign|biggiv|<macro|<space|0.17em><mid|\|><space|0.17em>>>

    <assign|Biggiv|<macro|<space|0.17em><mid|\|><space|0.17em>>>

    <assign|const|<macro|\<bbb-S\>>>

    <assign|bin|<macro|\<bbb-B\>>>

    <assign|Lhpd|<macro|L<rsub|<with|font-family|ss|hpd>>>>

    <assign|Np|<macro|N<rsub|<with|font-family|ss|p>>>>

    <assign|Nd|<macro|N<rsub|<with|font-family|ss|d>>>>

    <assign|Md|<macro|M<rsub|<with|font-family|ss|d>>>>

    <assign|Mc|<macro|M<rsub|<with|font-family|ss|c>>>>

    <assign|Mi|<macro|M<rsub|<with|font-family|ss|i>>>>

    <assign|Mt|<macro|M<rsub|<with|font-family|ss|t>>>>

    <assign|pt|<macro|<rsub|<with|font-family|ss|pt>>>>

    <assign|SNR|<macro|<with|font-family|ss|SNR>>>

    <assign|BER|<macro|<with|font-family|ss|BER>>>

    <assign|NMSE|<macro|<with|font-family|ss|NMSE>>>

    <assign|Ls|<macro|K>>

    <assign|gt|<macro|g<rsub|<with|font-family|ss|t>>>>

    <assign|gr|<macro|g<rsub|<with|font-family|ss|r>>>>

    <assign|Lpre|<macro|L<rsub|<with|font-family|ss|pre>>>>

    <assign|inp|<macro|<rsub|<with|font-family|ss|in>,j>>>

    <assign|out|<macro|<rsub|<with|font-family|ss|out>,i>>>

    <assign|pri|<macro|<rsup|<with|font-family|ss|apri>>>>

    <assign|post|<macro|<rsup|<with|font-family|ss|post>>>>

    <assign|ext|<macro|<rsup|<with|font-family|ss|ext>>>>

    <assign|lmmse|<macro|<rsub|<with|font-family|ss|lmmse>>>>

    <assign|lasso|<macro|<rsub|<with|font-family|ss|lasso>>>>
  </hide-preamble>

  <doc-data|<doc-title|A Message-Passing Receiver for BICM-OFDM over Unknown
  Clustered-Sparse Channels>|<doc-author|<author-data|<author-misc|Please
  direct all correspondence to Prof. Philip Schniter, Dept. ECE, The Ohio
  State University, 2015 Neil Ave., Columbus OH 43210, e-mail:
  schniter@ece.osu.edu, phone 614.247.6488, fax
  614.292.7596.>|<author-misc|This work has been supported in part by NSF
  grant CCF-1018368 and DARPA/ONR grant N66001-10-1-4090, and an allocation
  of computing time from the Ohio Supercomputer
  Center.>|<author-name|Philip<nbsp>Schniter<rsup|<math|\<ast\>>>>>>|<doc-date|<date|>>>

  <abstract-data|<\abstract>
    We propose a factor-graph-based approach to joint
    channel-estimation-and-decoding (JCED) of bit-interleaved coded
    orthogonal frequency division multiplexing (BICM-OFDM). In contrast to
    existing designs, ours is capable of exploiting not only sparsity in
    sampled channel taps but also clustering among the large taps, behaviors
    which are known to manifest at larger communication bandwidths. In order
    to exploit these channel-tap structures, we adopt a two-state Gaussian
    mixture prior in conjunction with a Markov model on the hidden state. For
    loopy belief propagation, we exploit a ``generalized approximate message
    passing'' (GAMP) algorithm recently developed in the context of
    compressed sensing, and show that it can be successfully coupled with
    soft-input soft-output decoding, as well as hidden Markov inference,
    through the standard sum-product framework. For <math|N> subcarriers and
    <math|M> bits per subcarrier (and any channel length <math|L\<less\>N>),
    the resulting JCED-GAMP scheme has a computational complexity of only
    <math|<mc|O><around|(|N*log<rsub|2>N+N*2<rsup|M>|)>>. Numerical
    experiments show that our scheme yields BER performance within 1 dB of
    the known-channel bound and 4 dB better than decoupled
    channel-estimation-and-decoding via LASSO.
  </abstract>>

  <section|Introduction><label|sec:intro>

  When designing a digital communications receiver, it is common to model the
  effects of multipath propagation in discrete time using a convolutive
  linear channel that, in the slow-fading scenario, can be characterized by a
  fixed impulse response <math|<around|{|x<rsub|j>|}><rsub|j=0><rsup|L-1>>
  over the duration of one codeword. When the communication bandwidth is
  sufficiently low, the \Ptaps\Q <math|<around|{|x<rsub|j>|}><rsub|j=0><rsup|L-1>>
  are well modeled as independent complex Gaussian random variables,
  resulting in the \Puncorrelated Rayleigh-fading\Q and \Puncorrelated
  Rician-fading\Q models that have dominated the wireless communications
  literature for many decades <cite|Molisch:Book:05>. For receiver design,
  the Gaussian tap assumption is very convenient because the optimal
  estimation scheme is well known to be linear <cite|Poor:Book:94>. As the
  communication bandwidth increases, however, the channel taps are no longer
  well-modeled as Gaussian nor independent. Rather, they tend to be
  heavy-tailed or \Psparse\Q in that only a few values in
  <math|<around|{|x<rsub|j>|}><rsub|j=0><rsup|L-1>> have significant
  amplitude <cite|Cramer:TAP:02|Preisig:JAcSA:04|Molisch:TVT:05|Czink:TWC:07>.
  Moreover, groups of large taps are often clustered together in lag
  <math|j>. These behaviors are both a blessing and a curse: a blessing
  because, of all tap distributions, the independent Gaussian one is most
  detrimental to capacity <cite|Medard:TIT:00>, but a curse because optimal
  channel estimation becomes non-linear and thus receiver design becomes more
  complicated.

  Recently, there have been many attempts to apply breakthrough non-linear
  estimation techniques from the field of \Pcompressive sensing\Q
  <cite|Mar:SPM:08> (e.g., LASSO <cite|Tibshirani:JRSSb:96|Chen:JSC:98>) to
  the wireless channel estimation problem. We refer to this approach as
  \Pcompressed channel sensing\Q (CCS), after the recent comprehensive
  overview <cite|Bajwa:PROC:10>. The CCS literature generally takes a
  <em|decoupled> approach to the problem of channel estimation and data
  decoding, in that pilot-symbol knowledge is first exploited for
  sparse-channel estimation, after which the channel estimate is used for
  data decoding. However, this decoupled approach has been shown to be
  suboptimal when the taps are non-Gaussian <cite|Kannu:arXiv:10>.

  The considerations above motivate a <em|joint> approach to
  structured-sparse-channel-estimation and decoding that offers both
  near-optimal decoding performance and low implementation complexity. In
  this paper, we propose exactly such a scheme. In particular, we focus on
  orthogonal frequency-division multiplexing (OFDM) with bit-interleaved
  coded modulation (BICM), and propose a novel factor-graph-based receiver
  that leverages recent results in \Pgeneralized approximate message
  passing\Q (GAMP) <cite|Rangan:10b>, soft-input/soft-output (SISO) decoding
  <cite|MacKay:Book:03>, and structured-sparse estimation
  <cite|Schniter:CISS:10>. Our receiver assumes a clustered-sparse
  channel-tap prior constructed using a two-state Gaussian mixture with a
  Markov model on the hidden tap state. The scheme that we propose has only
  <math|<mc|O><around|(|N*log<rsub|2>N+N*2<rsup|M>|)>> complexity, where
  <math|N> denotes the number of subcarriers and <math|M> denotes the number
  of bits per subcarrier, facilitating large values of <math|N> and channel
  length <math|L\<less\>N> (e.g., we use <math|N<space|-0.17em>=<space|-0.17em>1024>
  and <math|L<space|-0.17em>=<space|-0.17em>256> for our numerical results).
  For rich non-line-of-sight (NLOS) channels generated according to the
  IEEE<nbsp>802.15.4a standard <cite|Molisch:802.15.4a>, our numerical
  experiments show bit error rate (<math|<BER>>) performance within <math|1>
  dB of the known-channel bound and 4 dB better than decoupled
  channel-estimation-and-decoding via LASSO.

  We now place our work in the context of existing factor-graph designs.
  Factor-graph based joint channel-estimation and decoding (JCED) has existed
  for more than a decade (see, e.g., the early overview
  <cite|Worthen:TIT:01>). To calculate the messages passed among the nodes of
  the factor graph, first instincts suggest applying the standard
  \Psum-product algorithm\Q (SPA) <cite|Pearl:Book:88>. Exact SPA on the JCED
  factor graph is computationally infeasible, however, and so it must be
  approximated. For this, there are many options, since many well-known
  iterative inference algorithms can themselves be recognized as SPA
  approximations: the expectation-maximization (EM) algorithm
  <cite|Dauwels:ISIT:05>, particle filtering <cite|Dauwels:ISIT:06>,
  variational Bayes (or \Pmean-field\Q) techniques <cite|Dauwels:ISIT:07>,
  and even steepest descent <cite|Dauwels:ITW:05>. Moreover, because the JCED
  factor graph is loopy, even exact SPA is not guaranteed to yield the
  correct output distributions. It is perhaps not surprising that, amidst
  this uncertainty about exact SPA and its \Pbest\Q approximation, a number
  of factor-graph designs for JCED over frequency-selective channels have
  been proposed (e.g., <cite|Novak:ICASSP:09|Liu:PIMRC:09|Knievel:ICC:10|Kirkelund:GLOBE:10>).

  Our approach differs from existing factor-graph JCED designs in that it
  uses 1) an apriori sparse (i.e., non-Gaussian) channel-tap prior, 2) a
  clustered (i.e., non-independent) channel-tap prior, and 3) a
  state-of-the-art SPA approximation known as \Pgeneralized approximate
  message passing\Q (GAMP), which has been rigorously analyzed and shown to
  yield asymptotically exact posteriors as
  <math|N,L<space|-0.17em>\<rightarrow\><space|-0.17em>\<infty\>>
  <cite|Rangan:10b>. In fact, we conjecture that the success of our method is
  due in large part to the principled approximations used within GAMP. We
  also note that, although we focus on the case of clustered-sparse channels,
  our approach could be applied to non-sparse (i.e., Gaussian) or
  non-clustered (i.e., independent) channel-taps or, e.g., non-sparse
  channels with unknown length <math|L> <cite|Novak:ICASSP:09>, with minor
  modifications of our assumed channel prior.

  Finally, we mention that this work is an evolution of our earlier work
  <cite|Schniter:ASIL:10|Schniter:PHYCOM:11> that was limited to an exactly
  sparse channel, that did not exploit clustering, and that was based on the
  \Prelaxed belief propagation\Q (RBP) algorithm <cite|Rangan:10v2>, which
  has higher implementation complexity than GAMP. For example, the JCED
  scheme proposed in <cite|Schniter:ASIL:10|Schniter:PHYCOM:11> has
  complexity <math|<mc|O><around|(|N*L<space|-0.17em>+<space|-0.17em>N*2<rsup|M>|)>>,
  which grows in the channel length <math|L>.

  Our paper is organized as follows. In <secref|model> we detail our
  assumptions on the OFDM system and the channel prior, and provide an
  illustrative example of clustered-sparse behavior with the
  IEEE<nbsp>802.15.4a channel model. In <secref|jced> we detail our
  GAMP-based JCED approach, in <secref|sims> we report the results of our
  simulation study, and in <secref|conc> we conclude.

  Throughout the paper, we use the following notation. <math|<Real>> denotes
  the field of reals and <math|<Complex>> the complex field.
  <math|<around|(|\<cdummy\>|)><rsup|\<ast\>>> denotes conjugate and
  <math|<real><around|(|\<cdummy\>|)>> extracts the real part. Furthermore,
  <math|\<delta\><around|(|\<tau\>|)>> denotes the Dirac delta waveform while
  <math|<around|{|\<delta\><rsub|n>|}><rsub|n=-\<infty\>><rsup|\<infty\>>>
  denotes the Kronecker delta sequence. Also,
  <math|<around|\<langle\>|j|\<rangle\>><rsub|N>> denotes
  <math|j>-modulo-<math|N>, <math|<conv>> convolution, and <math|\<propto\>>
  denotes equality up to a scaling. We use boldface capital letters like
  <math|<wide|B|\<vect\>>> to denote matrices and boldface small letters like
  <math|<wide|b|\<vect\>>> to denote vectors.
  <math|<Diag><around|(|<wide|b|\<vect\>>|)>> constructs a diagonal matrix
  from the vector <math|<wide|b|\<vect\>>> and <math|<wide|I|\<vect\>>>
  denotes the identity matrix. For matrices and vectors,
  <math|<around|(|\<cdummy\>|)><tran>> denotes transpose and
  <math|<around|(|\<cdummy\>|)><herm>> denotes conjugate transpose. When
  <math|x<rsub|j>> is a realization of random variable <math|X<rsub|j>>, we
  write <math|x<rsub|j><space|-0.17em>\<sim\><space|-0.17em>X<rsub|j>> and
  use <math|<E><rsub|X<rsub|j>><around|{|x<rsub|j>|}>> to denote the mean,
  <math|<value|var><rsub|X<rsub|j>><around|{|x<rsub|j>|}>> the variance,
  <math|p<rsub|X<rsub|j>><around|(|x<rsub|j>|)>> the pdf, and
  <math|p<rsub|X<rsub|j>\|D<rsub|j>><around|(|x<rsub|j><giv>d<rsub|j>|)>> the
  pdf conditioned on the event <math|D<rsub|j><space|-0.17em>=<space|-0.17em>d<rsub|j>>.
  Sometimes we omit the subscript when there is no danger of confusion,
  yielding, e.g., <math|<E><around|{|x<rsub|j>|}>>,
  <math|<value|var><around|{|x<rsub|j>|}>>, <math|p<around|(|x<rsub|j>|)>>
  and <math|p<around|(|x<rsub|j><giv>d<rsub|j>|)>>.
  <math|<mc|C*N><around|(|x;<wide|x|^>,\<mu\>|)><space|-0.17em><defn><space|-0.17em><around|(|\<pi\>*\<mu\>|)><rsup|-1>*exp
  (-\<mu\><rsup|-1>*<around|\||x-<wide|x|^>|\|><rsup|2>)> denotes the
  circular Gaussian pdf with mean <math|<wide|x|^>> and variance
  <math|\<mu\>>. In fact, we often use
  <math|<around|(|<wide|v|^><rsub|j>,\<mu\><rsub|j><rsup|v>|)>> when
  referring to the mean and variance of <math|V<rsub|j>>.

  <section|System Model><label|sec:model>

  <subsection|The BICM-OFDM model><label|sec:OFDM>

  We consider an OFDM system with <math|N> subcarriers, each modulated by a
  QAM symbol from a <math|2<rsup|M>>-ary unit-energy constellation
  <math|<const>>. Of the <math|N> subcarriers, <math|<Np>> are dedicated as
  pilots,<footnote|For our GAMP decoder, we recommend
  <math|<Np><space|-0.17em>=<space|-0.17em>0>; see <secref|sims>.> and the
  remaining <math|<Nd><space|-0.17em><defn><space|-0.17em>N<space|-0.17em>-<space|-0.17em><Np>>
  are used to transmit a total of <math|<Mt>> training bits and
  <math|<Md><space|-0.17em><defn><space|-0.17em><Nd>M<space|-0.17em>-<space|-0.17em><Mt>>
  coded/interleaved data bits. The data bits are generated by encoding
  <math|<Mi>> information bits using a rate-<math|R> coder, interleaving
  them, and partitioning the resulting <math|<Mc><space|-0.17em><defn><space|-0.17em><Mi>/R>
  bits among an integer number <math|Q<space|-0.17em><defn><space|-0.17em><Mc>/<Md>>
  of OFDM symbols. We note that the resulting scheme has a spectral
  efficiency of <math|\<eta\><space|-0.17em><defn><space|-0.17em><Md>R/N>
  information bits per channel use (bpcu).

  In the sequel, we use <math|s<of|k><space|-0.17em>\<in\><space|-0.17em><const>>
  for <math|k<space|-0.17em>\<in\><space|-0.17em><around|{|1,\<ldots\>,2<rsup|M>|}>>
  to denote the <math|k<rsup|t*h>> element of the QAM constellation, and
  <math|<wide|c|\<vect\>><of|k><space|-0.17em><defn><space|-0.17em><around|[|c<rsub|1><of|k>,\<ldots\>,c<rsub|M><of|k>|]><tran>>
  to denote the corresponding bits as defined by the symbol mapping.
  Likewise, we use <math|s<rsub|i><around|[|q|]><space|-0.17em>\<in\><space|-0.17em><const>>
  for the QAM symbol transmitted on the <math|i<rsup|t*h>> subcarrier of the
  <math|q<rsup|t*h>> OFDM symbol and <math|<wide|c|\<vect\>><rsub|i><around|[|q|]><space|-0.17em><defn><space|-0.17em><around|[|c<rsub|i,1><around|[|q|]>,\<ldots\>,c<rsub|i,M><around|[|q|]>|]><tran>>
  for the coded/interleaved bits corresponding to that symbol. We use
  <math|<wide|c|\<vect\>><around|[|q|]><space|-0.17em><defn><space|-0.17em><around|[|<wide|c|\<vect\>><rsub|0><around|[|q|]>,\<ldots\>,<wide|c|\<vect\>><rsub|N-1><around|[|q|]>|]><tran>>
  to denote the coded/interleaved bits in the <math|q<rsup|t*h>> OFDM symbol
  and <math|<wide|c|\<vect\>><space|-0.17em><defn><space|-0.17em><around|[|<wide|c|\<vect\>><around|[|1|]>,\<ldots\>,<wide|c|\<vect\>><around|[|Q|]>|]><tran>>
  to denote the entire (interleaved) codeword. The elements of
  <math|<wide|c|\<vect\>>> that are apriori known as pilot or training bits
  will be referred to as <math|<wide|c|\<vect\>><pt>>. The remainder of
  <math|<wide|c|\<vect\>>> is determined from the information bits
  <math|<wide|b|\<vect\>><space|-0.17em><defn><space|-0.17em><around|[|b<rsub|1>,\<ldots\>,b<rsub|<Mi>>|]><tran>>
  by coding/interleaving.

  To modulate the <math|q<rsup|t*h>> OFDM symbol, an <math|N>-point inverse
  discrete Fourier transform (DFT) <math|<wide|\<Phi\>|\<vect\>><herm>> is
  applied to the QAM sequence <math|<wide|s|\<vect\>><around|[|q|]><space|-0.17em>=<space|-0.17em><around|[|s<rsub|0><around|[|q|]>,\<ldots\>,s<rsub|N-1><around|[|q|]>|]><tran>>,
  yielding the time-domain sequence <math|<wide|\<Phi\>|\<vect\>><herm><wide|s|\<vect\>><around|[|q|]><space|-0.17em>=<space|-0.17em><wide|a|\<vect\>><around|[|q|]><space|-0.17em>=<space|-0.17em><around|[|a<rsub|0><around|[|q|]>,\<ldots\>,a<rsub|N-1><around|[|q|]>|]><tran>>.
  The OFDM waveform <math|a<around|(|t|)>> is then constructed using
  <math|L>-cyclic-prefixed versions of <math|<around|{|a<rsub|j><around|[|q|]>|}>>
  and the transmission pulse <math|<gt><around|(|\<tau\>|)>>:

  <eqnarray|<tformat|<table|<row|<cell|a<around|(|t|)>>|<cell|=>|<cell|<big|sum><rsub|q=1><rsup|Q><big|sum><rsub|j=-L><rsup|N-1>a<rsub|<around|\<langle\>|j|\<rangle\>><rsub|N>><space|-0.17em><around|[|q|]><space|0.17em><gt><around*|(|t-j*T-q*<around|(|N+L|)>*T|)>,<eq-number>>>>>>

  with <math|T> denoting the baud interval (in seconds) and
  <math|L\<less\>N>.

  The waveform <math|a<around|(|t|)>> propagates through a noisy channel with
  an impulse response <math|h<around|(|\<tau\>|)>> that is supported on the
  interval <math|<around|[|\<tau\><rsub|min>,\<tau\><rsub|max>|]>>, resulting
  in the receiver input waveform

  <eqnarray|<tformat|<table|<row|<cell|r<around|(|t|)>>|<cell|=>|<cell|v<around|(|t|)>+<big|int><rsub|\<tau\><rsub|min>><rsup|\<tau\><rsub|max>>h<around|(|\<tau\>|)>*a*<around|(|t-\<tau\>|)>*d*\<tau\>,<eq-number>>>>>>

  where <math|v<around|(|t|)>> is a Gaussian noise process with flat power
  spectral density <math|N<rsub|o>>. We note that a time-invariant channel is
  assumed for simplicity. The receiver samples <math|r<around|(|t|)>> through
  the reception pulse <math|<gr><around|(|\<tau\>|)>>, obtaining

  <eqnarray|<tformat|<table|<row|<cell|r<rsub|j><around|[|q|]>>|<cell|=>|<cell|<big|int>r<around|(|t|)><space|0.17em><gr><around*|(|j*T+q*<around|(|N+L|)>*T-t|)>*d*t,<eq-number>>>>>>

  and applies an <math|N>-DFT <math|<wide|\<Phi\>|\<vect\>>> to each
  time-domain sequence <math|<wide|r|\<vect\>><around|[|q|]><space|-0.17em>=<space|-0.17em><around|[|r<rsub|0><around|[|q|]>,\<ldots\>,r<rsub|N-1><around|[|q|]>|]><tran>>,
  yielding the frequency-domain samples <math|<wide|\<Phi\>|\<vect\>>*<wide|r|\<vect\>><around|[|q|]><space|-0.17em>=<space|-0.17em><wide|y|\<vect\>><around|[|q|]><space|-0.17em>=<space|-0.17em><around|[|y<rsub|0><around|[|q|]>,\<ldots\>,y<rsub|N-1><around|[|q|]>|]><tran>>
  for <math|q=1,\<ldots\>,Q>.

  Defining the pulse-shaped channel response
  <math|x<around|(|\<tau\>|)><space|-0.17em><defn><space|-0.17em><around|(|<gr><conv>h<conv><gt>|)><around|(|\<tau\>|)>>,
  it is well known (e.g., <cite|Cimini:TCOM:85>) that, when the support of
  <math|x<around|(|\<tau\>|)>> is contained within the interval
  <math|<around|[|0,L*T|)>>, the frequency domain observation on the
  <math|i<rsup|t*h>> subcarrier can be written as

  <eqnarray|<tformat|<table|<row|<cell|y<rsub|i><around|[|q|]>>|<cell|=>|<cell|s<rsub|i><around|[|q|]>*z<rsub|i><around|[|q|]>+v<rsub|i><around|[|q|]>,<eq-number><label|eq:yi>>>>>>

  where <math|z<rsub|i><around|[|q|]>\<in\><Complex>> is the
  <math|i<rsup|t*h>> subcarrier's gain and
  <math|<around|{|v<rsub|i><around|[|q|]>|}>> are Gaussian noise samples.
  Furthermore, defining the uniformly sampled channel \Ptaps\Q
  <math|x<rsub|j><around|[|q|]><space|-0.17em><defn><space|-0.17em>x*<around|(|j*T<space|-0.17em>+<space|-0.17em>q*<around|(|N<space|-0.17em>+<space|-0.17em>L|)>*T|)>>,
  the subcarrier gains are related to these taps through the DFT:

  <eqnarray|<tformat|<table|<row|<cell|z<rsub|i><around|[|q|]>>|<cell|=>|<cell|<big|sum><rsub|j=0><rsup|L-1>\<Phi\><rsub|i*j>*x<rsub|j><around|[|q|]>.<eq-number><label|eq:zi>>>>>>

  In addition, when <math|<around|(|<gr><conv><gt>|)><around|(|\<tau\>|)>> is
  a Nyquist pulse, <math|<around|{|v<rsub|i><around|[|q|]>|}><rsub|\<forall\>i,q>>
  are statistically independent with variance
  <math|\<mu\><rsup|v><space|-0.17em>=<space|-0.17em>N<rsub|o>>.

  To simplify the development, we assume that <math|Q=1> in the sequel (but
  not in the simulations), and drop the index <math|<around|[|q|]>> for
  brevity.

  <subsection|A clustered-sparse tap prior><label|sec:GM2>

  Empirical studies <cite|Cramer:TAP:02|Preisig:JAcSA:04|Molisch:TVT:05|Czink:TWC:07>
  have suggested that, when the baud rate <math|T<rsup|-1>> is sufficiently
  large, the channel taps <math|<around|{|x<rsub|j>|}>> are \Psparse\Q in
  that the tap distributions tend to be heavy tailed. The same empirical
  studies suggest that large taps tend to be clustered in the lag domain.
  Furthermore, both the sparsity and clustering behaviors can be
  lag-dependent, such as when the receiver's timing-synchronization mechanism
  aligns the first strong multipath arrivals with a particular reference lag
  <math|j>. A concrete example of these behaviors will be given in
  <secref|IEEE>.

  Since our message-passing-based receiver design is inherently Bayesian, we
  seek a prior on the taps <math|<around|{|x<rsub|j>|}>> that is capable of
  representing this lag-dependent clustered sparsity. For this purpose, we
  assume a two-state Gaussian mixture (GM2) prior,<footnote|The message
  passing algorithm described in <secref|gamp> can also handle non-Gaussian
  mixtures and/or mixtures with more than two terms.>

  <eqnarray|<tformat|<table|<row|<cell|p<around|(|x<rsub|j>|)>>|<cell|=>|<cell|<around|(|1-\<lambda\><rsub|j>|)><mc|C*N><around|(|x<rsub|j>;0,\<mu\><rsup|0><rsub|j>|)>+\<lambda\><rsub|j><mc|C*N><around|(|x<rsub|j>;0,\<mu\><rsup|1><rsub|j>|)>,<eq-number><label|eq:pxj>>>>>>

  where <math|\<mu\><rsub|j><rsup|0><space|-0.17em>\<geq\><space|-0.17em>0>
  denotes the variance while in the \Psmall\Q state,
  <math|\<mu\><rsub|j><rsup|1><space|-0.17em>\<gtr\><space|-0.17em>\<mu\><rsub|j><rsup|0>>
  denotes the variance while in the \Pbig\Q state, and
  <math|\<lambda\><rsub|j><space|-0.17em><defn><space|-0.17em>Pr
  <around|{|d<rsub|j><space|-0.17em>=<space|-0.17em>1|}>> denotes the prior
  probability of <math|x<rsub|j>> being in the \Pbig\Q state. Here, we use
  <math|d<rsub|j><space|-0.17em>\<in\><space|-0.17em><around|{|0,1|}>> to
  denote the hidden state, implying the state-conditional pdf
  <math|p<around|(|x<rsub|j><giv>d<rsub|j>|)>=<mc|C*N><around|(|x<rsub|j>;0,\<mu\><rsup|d<rsub|j>><rsub|j>|)>>.

  For example, if <math|x<rsub|j>> was presumed to be a \Psparse\Q tap, then
  we would choose <math|\<lambda\><rsub|j><space|-0.17em>\<ll\><space|-0.17em>1>
  and <math|\<mu\><rsub|j><rsup|1><space|-0.17em>\<gg\><space|-0.17em>\<mu\><rsub|j><rsup|0>>
  in <eqref|pxj>. If, on the other hand, <math|x<rsub|j>> is presumed to be
  (non-sparse) Rayleigh-fading, we would choose
  <math|\<lambda\><rsub|j><space|-0.17em>=<space|-0.17em>1> and set
  <math|\<mu\><rsub|j><rsup|1>> equal to the tap variance, noting that
  <math|\<mu\><rsub|j><rsup|0>> becomes inconsequential. If <math|x<rsub|j>>
  is presumed to be Nakagami-fading or similar, we could fit the GM2
  parameters <math|<around|[|\<lambda\><rsub|j>,\<mu\><rsub|j><rsup|0>,\<mu\><rsub|j><rsup|1>|]>>
  appropriately using the EM algorithm, as described in
  <cite-detail|Bishop:Book:07|p.<nbsp>435>. The GM2 prior has been used
  successfully in many other non-Gaussian inference problems (see, e.g.,
  <cite|Ishwaran:AS:05>), and our premise here is that the GM2 model achieves
  a good balance between fidelity and tractability when modeling channel taps
  as well.

  To capture the big-tap clustering behavior, we employ a hidden Markov model
  (HMM). For this, we model the tap states
  <math|<around|{|d<rsub|j>|}><rsub|j=0><rsup|L-1>> as a Markov chain (MC)
  with switching probabilities <math|p<rsub|j><rsup|01><space|-0.17em><defn><space|-0.17em>Pr
  <around|{|d<rsub|j+1><space|-0.17em>=<space|-0.17em>0<giv>d<rsub|j><space|-0.17em>=<space|-0.17em>1|}>>
  and <math|p<rsub|j><rsup|10><space|-0.17em><defn><space|-0.17em>Pr
  <around|{|d<rsub|j+1><space|-0.17em>=<space|-0.17em>1<giv>d<rsub|j><space|-0.17em>=<space|-0.17em>0|}>>.
  Here, <math|p<rsub|j><rsup|01>\<less\>0.5> implies that the neighbors of a
  big <math|x<rsub|j>> tend to be big, and
  <math|p<rsub|j><rsup|10>\<less\>0.5> implies that the neighbors of a small
  <math|x<rsub|j>> tend to be small. We note that
  <math|<around|{|p<rsub|j><rsup|01>,p<rsub|j><rsup|10>|}><rsub|j=0><rsup|L-1>>
  must be consistent with <math|<around|{|\<lambda\><rsub|j>|}><rsub|j=0><rsup|L-1>>
  in that the following must hold for all <math|j>:

  <eqnarray|<tformat|<table|<row|<cell|<mat|<tformat|<table|<row|<cell|\<lambda\><rsub|j+1><nbsp>>|<cell|<nbsp>1<space|-0.17em>-<space|-0.17em>\<lambda\><rsub|j+1>>>>>>>|<cell|=>|<cell|<mat|<tformat|<table|<row|<cell|\<lambda\><rsub|j><nbsp>>|<cell|<nbsp>1<space|-0.17em>-<space|-0.17em>\<lambda\><rsub|j>>>>>><mat|<tformat|<table|<row|<cell|1<space|-0.17em>-<space|-0.17em>p<rsub|j><rsup|01>>|<cell|p<rsub|j><rsup|01>>>|<row|<cell|p<rsub|j><rsup|10>>|<cell|1<space|-0.17em>-<space|-0.17em>p<rsub|j><rsup|10>>>>>>.<eq-number>>>>>>

  Although we allow correlation among the tap states, we assume that the tap
  <em|amplitudes> are conditionally independent, i.e.,
  <math|p<around|(|x<rsub|j+1>,x<rsub|j><giv>d<rsub|j+1>,d<rsub|j>|)><space|-0.17em>=<space|-0.17em>p<around|(|x<rsub|j><giv>d<rsub|j>|)>*p<around|(|x<rsub|j+1><giv>d<rsub|j+1>|)>>.
  Our experiences with IEEE 802.15.4a channels (see below) suggest that this
  is a valid assumption.

  We emphasize that the model parameters <math|<around|{|\<lambda\><rsub|j>,p<rsub|j><rsup|01>,p<rsub|j><rsup|10>,\<mu\><rsub|j><rsup|1>,\<mu\><rsub|j><rsup|0>|}>>
  are allowed to vary with lag <math|j>, facilitating the exploitation of
  apriori known lag-dependencies in sparsity and/or clustering.

  <subsection|An illustrative example: IEEE 802.15.4a
  channels><label|sec:IEEE>

  As an illustrative example of the clustered-sparse tap behavior described
  above, we generated realizations of the tap vector
  <math|<wide|x|\<vect\>><defn><around|[|x<rsub|0>,\<ldots\>,x<rsub|L-1>|]><tran>>
  from channel impulse responses <math|h<around|(|\<tau\>|)>> generated
  according to the method specified in the IEEE<nbsp>802.15.4a
  \Pultra-wideband\Q standard <cite|Molisch:802.15.4a>, which uses the
  Saleh-Valenzuela model <cite|Saleh:JSAC:87>

  <eqnarray|<tformat|<table|<row|<cell|h<around|(|\<tau\>|)>>|<cell|=>|<cell|<big|sum><rsub|c=0><rsup|C><big|sum><rsub|k=0><rsup|K>h<rsub|k,c>*e<rsup|j*\<phi\><rsub|k,c>>*\<delta\>*<around|(|\<tau\>-T<rsub|c>-\<tau\><rsub|k,c>|)>,<eq-number>>>>>>

  where <math|C> denotes the number of clusters, <math|T<rsub|c>> the delay
  of the <math|c<rsup|t*h>> cluster, <math|K> the number of components per
  cluster, <math|<around|{|\<tau\><rsub|k,c>|}>> the relative component
  delays, <math|<around|{|h<rsub|k,c>|}>> the component amplitudes, and
  <math|<around|{|\<phi\><rsub|k,c>|}>> the component phases. In particular,
  the 802.15.4a standard specifies the following.

  <\itemize>
    <item>The cluster arrival times are a Poisson process with rate
    <math|\<Lambda\>>, i.e., <math|p<around|(|T<rsub|c><giv>T<rsub|c-1>|)><space|-0.17em>=<space|-0.17em>\<Lambda\>*exp
    (-\<Lambda\>*<around|(|T<rsub|c>-T<rsub|c-1>|)>)>. The initial cluster
    delay <math|T<rsub|0><space|-0.17em>\<geq\><space|-0.17em>\<tau\><rsub|min>>,
    as seen by the receiver, is a function of the timing synchronization
    algorithm.

    <item>The component arrivals are a mixture of two Poisson processes:
    <math|p<around|(|\<tau\><rsub|k,c>\|\<tau\><rsub|k-1,c>|)><space|-0.17em>=<space|-0.17em>\<beta\>*\<lambda\><rsub|1>*exp
    (-\<lambda\><rsub|1>*<around|(|\<tau\><rsub|k,c>-\<tau\><rsub|k-1,c>|)>)<space|-0.17em>+<space|-0.17em><around|(|1-\<beta\>|)>*\<lambda\><rsub|2>*exp
    (-\<lambda\><rsub|2>*<around|(|\<tau\><rsub|k,c>-\<tau\><rsub|k-1,c>|)>)>
    with <math|\<tau\><rsub|0,c>=0>.

    <item>The component energies obey

    <eqnarray|<tformat|<table|<row|<cell|<E><around|{|<around|\||h<rsub|k,c>|\|><rsup|2>|}>>|<cell|=>|<cell|<frac|exp
    (-T<rsub|c>/\<Gamma\>-\<tau\><rsub|k,c>/\<gamma\>)|\<gamma\>*<around|[|<around|(|1-\<beta\>|)>*\<lambda\><rsub|1>+\<beta\>*\<lambda\><rsub|2>+1|]>>,<eq-number>>>>>>

    where <math|\<Gamma\>> is the cluster decay time constant and
    <math|\<gamma\>> is the intra-cluster decay time constant.

    <item>The amplitudes <math|<around|{|h<rsub|k,c>|}>> are i.i.d. Nakagami
    with <math|m>-factors randomly generated via i.i.d.
    <math|m\<sim\><mc|N><around|(|m<rsub|0>,<wide|m|^><rsup|2><rsub|0>|)>>.

    <item>The phases <math|<around|{|\<phi\><rsub|k,c>|}>> are i.i.d. uniform
    on <math|<around|[|0,2*\<pi\>|)>>.

    <item>The number of clusters, <math|C>, is Poisson distributed with mean
    <math|<wide|C|\<bar\>>>, i.e., <math|p<around|(|C|)><space|-0.17em>=<space|-0.17em><around|(|<wide|C|\<bar\>>|)><rsup|C>*exp
    (-<wide|C|\<bar\>>)/<around|(|C!|)>>.

    <item>The number of components per cluster, <math|K>, is set large enough
    to yield a desired modeling accuracy.
  </itemize>

  Beyond the above specifications, we assume the following.

  <\itemize>
    <item>The parameters <math|<around|{|\<Lambda\>,\<lambda\><rsub|1>,\<lambda\><rsub|2>,\<beta\>,\<Gamma\>,\<gamma\>,m<rsub|0>,<wide|m|^><rsub|0>,<wide|C|\<bar\>>|}>>
    are set according to the 802.15.4a \Poutdoor NLOS\Q scenario
    <cite|Molisch:802.15.4a>.

    <item><math|K<space|-0.17em>=<space|-0.17em>100> components per cluster
    are used.

    <item>The pulses <math|<gt><around|(|\<tau\>|)>> and
    <math|<gr><around|(|\<tau\>|)>> are square-root raised cosine (SRRC)
    designs with parameter <math|0.5>.

    <item>The system bandwidth equals <math|T<rsup|-1>=256> MHz.

    <item>The number of taps (and CP length) was set at <math|L=256>
    (implying a maximal delay spread of <math|1*<space|0.17em>\<mu\>>sec) in
    order to capture all significant energy in <math|h<around|(|\<tau\>|)>>.

    <item>The initial delay was generated via
    <math|T<rsub|0><space|-0.17em>=<space|-0.17em><Lpre>T+<wide|T|~><rsub|0>>,
    where <math|<Lpre><space|-0.17em>=<space|-0.17em>20> and where
    <math|<wide|T|~><rsub|0>> is exponentially distributed with mean
    <math|T>, i.e., <math|p<around|(|<wide|T|~><rsub|0>|)><space|-0.17em>=<space|-0.17em>\<Lambda\><rsub|0>*exp
    (-\<Lambda\><rsub|0>*<wide|T|~><rsub|0>)> for
    <math|\<Lambda\><rsub|0><space|-0.17em>=<space|-0.17em>1/T>. Here,
    <math|<Lpre>> was chosen so that <math|<around|{|x<rsub|j>|}><rsub|j=0><rsup|<Lpre>>>
    captures the \Ppre-cursor\Q energy contributed by the pulse shape, while
    <math|\<Lambda\><rsub|0>> models a positive synchronization uncertainty.
  </itemize>

  We now show results from an experiment conducted using <math|10000>
  realizations of the tap vector <math|<wide|x|\<vect\>>>. In <figref|hist>,
  we show histograms of <math|<real><around|(|x<rsub|j>|)>> for lags
  <math|j<space|-0.17em>\<in\><space|-0.17em><around|{|5,23,128,230|}>>.
  There it can be seen that the empirical distribution of
  <math|<real><around|(|x<rsub|j>|)>> changes significantly with lag
  <math|j>: for pre-cursor lags <math|j<space|-0.17em>\<less\><space|-0.17em><Lpre>>,
  it is approximately Gaussian; for near-cursor lags
  <math|j<space|-0.17em>\<approx\><space|-0.17em><Lpre>>, it is approximately
  Laplacian; and, for post-cursor lags <math|j<space|-0.17em>\<gg\><space|-0.17em><Lpre>>,
  it is extremely heavy-tailed. In <figref|realization>, we show a typical
  realization of <math|<wide|x|\<vect\>>> and notice clustering among the big
  taps. For comparison, we also plot an empirical estimate of the power-delay
  profile (PDP) <math|<wide|\<rho\>|\<vect\>><defn><around|[|\<rho\><rsub|0>,\<ldots\>,\<rho\><rsub|L-1>|]><tran>>
  in <figref|realization>, where <math|\<rho\><rsub|j><space|-0.17em><defn><space|-0.17em><E><around|{|<around|\||x<rsub|j>|\|><rsup|2>|}>>.

  Next, we fit the GM2 parameters <math|<around|{|\<lambda\><rsub|j>,\<mu\><rsup|0><rsub|j>,\<mu\><rsup|1><rsub|j>|}><rsub|j=0><rsup|L-1>>
  using the EM algorithm, as described in
  <cite-detail|Bishop:Book:07|p.<nbsp>435>. The resulting big-variance
  profile <math|<wide|\<mu\>|\<vect\>><rsup|1><defn><around|[|\<mu\><rsup|1><rsub|0>,\<ldots\>,\<mu\><rsup|1><rsub|L-1>|]><tran>>
  and small-variance profile <math|<wide|\<mu\>|\<vect\>><rsup|0>> are shown
  in <figref|realization>, while the sparsity profile
  <math|<wide|\<lambda\>|\<vect\>><defn><around|[|\<lambda\><rsub|0>,\<ldots\>,\<lambda\><rsub|L-1>|]><tran>>
  is shown in <figref|gm2>. Not surprisingly, the best-fit GM2 parameters
  also change significantly with lag <math|j>. In particular, as <math|j>
  becomes larger, the variance ratio <math|\<mu\><rsup|1><rsub|j>/\<mu\><rsup|0><rsub|j>>
  increases while the big-tap-probability <math|\<lambda\><rsub|j>>
  decreases, corresponding to an increase in sparsity. Meanwhile, there
  exists a peak in <math|\<lambda\><rsub|j>> near
  <math|j<space|-0.17em>=<space|-0.17em><Lpre>> that results from
  synchronization.

  Next, we empirically estimated the switching probabilities
  <math|<wide|p|\<vect\>><rsup|01><defn><around|[|p<rsup|01><rsub|0>,\<ldots\>,p<rsup|01><rsub|L-1>|]><tran>>
  and <math|<wide|p|\<vect\>><rsup|10>>. To do this, we first detected the
  hidden state vector <math|<wide|d|\<vect\>><defn><around|[|d<rsub|0>,\<ldots\>,d<rsub|L-1>|]><tran>>
  underlying each realization of <math|<wide|x|\<vect\>>> by comparing each
  element <math|x<rsub|j>> to the maximum a posteriori (MAP) threshold
  <cite|Poor:Book:94> (shown in <figref|realization>). From these detected
  states, estimates of <math|<wide|p|\<vect\>><rsup|01>> and
  <math|<wide|p|\<vect\>><rsup|10>> were then computed by counting the number
  of switches (at each fixed lag <math|j>) over the <math|10000>
  realizations. From the plots in <figref|gm2>, we see that the switching
  probabilities are lag-dependent as well.

  Finally, using our estimates of <math|<wide|d|\<vect\>>>, we computed the
  normalized conditional correlation

  <\equation*>
    <with|math-display|false|<frac|<E><around|{|x<rsub|j+1>*x<rsub|j><rsup|\<ast\>><giv>d<rsub|j+1>=1,d<rsub|j>=1|}>|<sqrt|<E><around|{|<around|\||x<rsub|j+1>|\|><rsup|2><giv>d<rsub|j+1>=1,d<rsub|j>=1|}><E><around|{|<around|\||x<rsub|j>|\|><rsup|2><giv>d<rsub|j+1>=1,d<rsub|j>=1|}>>>>
  </equation*>

  and found that the magnitude was <math|\<less\><space|-0.17em>0.1>,
  validating our assumption of conditionally independent tap amplitudes.

  In summary, we have seen that IEEE<nbsp>802.15.4a channels do indeed yield
  taps with the lag-dependent clustered sparsity described in <secref|GM2>.
  Moreover, we have shown how the GM2-HMM parameters can be estimated from
  realizations of <math|<wide|x|\<vect\>>>. Next, we propose an efficient
  factor-graph based approach to joint channel-estimation and decoding (JCED)
  for BICM-OFDM using the GM2-HMM prior proposed in <secref|GM2>.

  <putFrag|hist|Histograms of <math|<real><around|(|x<rsub|j>|)>> for lags
  <math|j\<in\><around|{|5,23,128,230|}>>, with \Ptight\Q axes. With
  synchronization delay <math|<Lpre><space|-0.17em>=<space|-0.17em>20>, note
  that the histogram appears Gaussian for
  <math|j<space|-0.17em>\<less\><space|-0.17em><Lpre>>, Laplacian for
  <math|j<space|-0.17em>\<approx\><space|-0.17em><Lpre>>, and very sparse for
  <math|j<space|-0.17em>\<gg\><space|-0.17em><Lpre>>.|<figsize>|>

  <putFrag|realization|A sample realization of channel taps
  <math|<around|{|x<rsub|j>|}>> generated from the IEEE<nbsp>802.15.4a model
  with SRRC pulse shaping. Also shown is the empirically estimated PDP, best
  fits of the GM2 parameters <math|<around|{|\<mu\><rsup|0><rsub|j>,\<mu\><rsup|1><rsub|j>|}>>,
  and the MAP threshold for detecting the hidden state <math|d<rsub|j>> given
  the tap value <math|x<rsub|j>>.|<figsize>|<assign|sz|<macro|0.7>>
  <psfrag*|channel realization|b>[b][1.0] <psfrag*|lag
  [baud]|>[][<sz>]<with|font-family|ss|lag <math|j> [baud]> >

  <putFrag|gm2|Empirically estimated statistics on the tap-states
  <math|<around|{|d<rsub|j>|}>>. Top: <math|\<lambda\><rsub|j><defn>Pr
  <around|{|d<rsub|j><space|-0.17em>=<space|-0.17em>1|}>>, middle:
  <math|p<rsup|01><rsub|j><defn>Pr <around|{|d<rsub|j+1><space|-0.17em>=<space|-0.17em>0<giv>d<rsub|j><space|-0.17em>=<space|-0.17em>1|}>>,
  bottom: <math|p<rsup|10><rsub|j><defn>Pr
  <around|{|d<rsub|j+1><space|-0.17em>=<space|-0.17em>1<giv>d<rsub|j><space|-0.17em>=<space|-0.17em>0|}>>.
  The red dashed line shows the synchronization reference,
  <math|j=<Lpre>=20>.|<figsize>|<assign|sz|<macro|0.7>>
  <psfrag*|lambda|b>[b][<sz>]<math|\<lambda\><rsub|j>>
  <psfrag*|p01|>[][<sz>]<math|p<rsup|01><rsub|j>>
  <psfrag*|p10|>[][<sz>]<math|p<rsup|10><rsub|j>> <psfrag*|lag
  [baud]|>[][<sz>]<with|font-family|ss|lag <math|j> [baud]> >

  <section|Joint Channel Estimation and Decoding><label|sec:jced>

  <putFrag|factor<rsub|g>raph<rsub|n>oncoh<rsub|c>lust|Factor graph of the
  JCED problem for a toy example with <math|<Mi>=3> information bits,
  <math|<Np>=1> pilot subcarrier (at subcarrier index <math|i=3>),
  <math|<Mt>=2> training bits, <math|M=2> bits per QAM symbol, <math|N=4>
  OFDM subcarriers, and channel impulse response length
  <math|L=3>.|3.5|<assign|vs|<macro|0.8>> <assign|cs|<macro|0.7>>
  <assign|ts|<macro|0.50>> <psfrag*|SISO|B>[Bl][<ts>]<with|font-family|ss|SISO
  decoding> <psfrag*|relaxed|B>[Bl][<ts>]<with|font-family|ss|GAMP>
  <psfrag*|MC|B>[Bl][<ts>]<with|font-family|ss|MC>
  <psfrag*|b1|Bl>[Bl][<vs>]<math|b<rsub|1>>
  <psfrag*|b2|Bl>[Bl][<vs>]<math|b<rsub|2>>
  <psfrag*|b3|Bl>[Bl][<vs>]<math|b<rsub|3>>
  <psfrag*|b4|Bl>[Bl][<vs>]<math|b<rsub|4>>
  <psfrag*|c11|B>[Bl][<cs>]<math|c<rsub|0,1>>
  <psfrag*|c21|B>[Bl][<cs>]<math|c<rsub|1,1>>
  <psfrag*|c31|B>[Bl][<cs>]<math|c<rsub|2,1>>
  <psfrag*|c41|B>[Bl][<cs>]<math|c<rsub|3,1>>
  <psfrag*|c12|B>[Bl][<cs>]<math|c<rsub|0,2>>
  <psfrag*|c22|B>[Bl][<cs>]<math|c<rsub|1,2>>
  <psfrag*|c32|B>[Bl][<cs>]<math|c<rsub|2,2>>
  <psfrag*|c42|B>[Bl][<cs>]<math|c<rsub|3,2>>
  <psfrag*|m1|b>[Bl][<vs>]<math|<mc|M><rsub|0>>
  <psfrag*|m2|b>[Bl][<vs>]<math|<mc|M><rsub|1>>
  <psfrag*|m3|b>[Bl][<vs>]<math|<mc|M><rsub|2>>
  <psfrag*|m4|b>[Bl][<vs>]<math|<mc|M><rsub|3>>
  <psfrag*|s1|B>[Bl][<vs>]<math|s<rsub|0>>
  <psfrag*|s2|B>[Bl][<vs>]<math|s<rsub|1>>
  <psfrag*|s3|B>[Bl][<vs>]<math|s<rsub|2>>
  <psfrag*|s4|B>[Bl][<vs>]<math|s<rsub|3>>
  <psfrag*|y1|B>[Bl][<vs>]<math|y<rsub|0>>
  <psfrag*|y2|B>[Bl][<vs>]<math|y<rsub|1>>
  <psfrag*|y3|B>[Bl][<vs>]<math|y<rsub|2>>
  <psfrag*|y4|B>[Bl][<vs>]<math|y<rsub|3>>
  <psfrag*|x1|Bl>[Bl][<vs>]<math|x<rsub|1>>
  <psfrag*|x2|Bl>[Bl][<vs>]<math|x<rsub|2>>
  <psfrag*|x3|Bl>[Bl][<vs>]<math|x<rsub|3>>
  <psfrag*|d1|B>[Bl][<vs>]<math|d<rsub|1>>
  <psfrag*|d2|B>[Bl][<vs>]<math|d<rsub|2>>
  <psfrag*|d3|B>[Bl][<vs>]<math|d<rsub|3>>
  <psfrag*|uni|>[Bl][<ts>]<with|font-family|ss|<tabular*|<tformat|<cwith|1|-1|1|-1|cell-valign|c>|<cwith|1|1|1|-1|cell-valign|top>|<cwith|1|1|1|-1|cell-vmode|exact>|<cwith|1|1|1|-1|cell-height|<plus|1fn|-1mm>>|<table|<row|<cell|uniform>>|<row|<cell|prior>>>>>>
  <psfrag*|info|>[Bl][<ts>]<with|font-family|ss|<tabular*|<tformat|<cwith|1|-1|1|-1|cell-valign|c>|<cwith|1|1|1|-1|cell-valign|top>|<cwith|1|1|1|-1|cell-vmode|exact>|<cwith|1|1|1|-1|cell-height|<plus|1fn|-1mm>>|<table|<row|<cell|info>>|<row|<cell|bits>>>>>>
  <psfrag*|code|>[Bl][<ts>]<with|font-family|ss|<tabular*|<tformat|<cwith|1|-1|1|-1|cell-valign|c>|<cwith|1|1|1|-1|cell-valign|top>|<cwith|1|1|1|-1|cell-vmode|exact>|<cwith|1|1|1|-1|cell-height|<plus|1fn|-1mm>>|<table|<row|<cell|code
  &>>|<row|<cell|interlv>>>>>> <psfrag*|pt|>[Bl][<ts>]<with|font-family|ss|<tabular*|<tformat|<cwith|1|-1|1|-1|cell-valign|c>|<cwith|1|1|1|-1|cell-valign|top>|<cwith|1|1|1|-1|cell-vmode|exact>|<cwith|1|1|1|-1|cell-height|<plus|1fn|-1mm>>|<table|<row|<cell|pilots
  &>>|<row|<cell|training>>>>>> <psfrag*|bits|>[Bl][<ts>]<with|font-family|ss|<tabular*|<tformat|<cwith|1|-1|1|-1|cell-valign|c>|<cwith|1|1|1|-1|cell-valign|top>|<cwith|1|1|1|-1|cell-vmode|exact>|<cwith|1|1|1|-1|cell-height|<plus|1fn|-1mm>>|<table|<row|<cell|coded>>|<row|<cell|bits>>>>>>
  <psfrag*|map|>[Bl][<ts>]<with|font-family|ss|<tabular*|<tformat|<cwith|1|-1|1|-1|cell-valign|c>|<cwith|1|1|1|-1|cell-valign|top>|<cwith|1|1|1|-1|cell-vmode|exact>|<cwith|1|1|1|-1|cell-height|<plus|1fn|-1mm>>|<table|<row|<cell|symbol>>|<row|<cell|mapping>>>>>>
  <psfrag*|QAM|>[Bl][<ts>]<with|font-family|ss|<tabular*|<tformat|<cwith|1|-1|1|-1|cell-valign|c>|<cwith|1|1|1|-1|cell-valign|top>|<cwith|1|1|1|-1|cell-vmode|exact>|<cwith|1|1|1|-1|cell-height|<plus|1fn|-1mm>>|<table|<row|<cell|QAM>>|<row|<cell|symbs>>>>>>
  <psfrag*|y|>[Bl][<ts>]<with|font-family|ss|<tabular*|<tformat|<cwith|1|-1|1|-1|cell-valign|c>|<cwith|1|1|1|-1|cell-valign|top>|<cwith|1|1|1|-1|cell-vmode|exact>|<cwith|1|1|1|-1|cell-height|<plus|1fn|-1mm>>|<table|<row|<cell|OFDM>>|<row|<cell|observ>>>>>>
  <psfrag*|chan|>[Bl][<ts>]<with|font-family|ss|<tabular*|<tformat|<cwith|1|-1|1|-1|cell-valign|c>|<cwith|1|1|1|-1|cell-valign|top>|<cwith|1|1|1|-1|cell-vmode|exact>|<cwith|1|1|1|-1|cell-height|<plus|1fn|-1mm>>|<table|<row|<cell|channel>>|<row|<cell|taps>>>>>>
  <psfrag*|gm|>[Bl][<ts>]<with|font-family|ss|<tabular*|<tformat|<cwith|1|-1|1|-1|cell-valign|c>|<cwith|1|1|1|-1|cell-valign|top>|<cwith|1|1|1|-1|cell-vmode|exact>|<cwith|1|1|1|-1|cell-height|<plus|1fn|-1mm>>|<table|<row|<cell|sparse>>|<row|<cell|prior>>>>>>
  <psfrag*|st|>[Bl][<ts>]<with|font-family|ss|<tabular*|<tformat|<cwith|1|-1|1|-1|cell-valign|c>|<cwith|1|1|1|-1|cell-valign|top>|<cwith|1|1|1|-1|cell-vmode|exact>|<cwith|1|1|1|-1|cell-height|<plus|1fn|-1mm>>|<table|<row|<cell|tap>>|<row|<cell|states>>>>>>
  <psfrag*|m|>[Bl][<ts>]<with|font-family|ss|<tabular*|<tformat|<cwith|1|-1|1|-1|cell-valign|c>|<cwith|1|1|1|-1|cell-valign|top>|<cwith|1|1|1|-1|cell-vmode|exact>|<cwith|1|1|1|-1|cell-height|<plus|1fn|-1mm>>|<table|<row|<cell|cluster>>|<row|<cell|prior>>>>>>
  >

  Our goal is to infer the information bits <math|<wide|b|\<vect\>>> from the
  OFDM observations <math|<wide|y|\<vect\>>> and the pilot/training bits
  <math|<wide|c|\<vect\>><pt>>, without knowing the channel state
  <math|<wide|x|\<vect\>>>. In particular, we aim to maximize the posterior
  pmf <math|p<around|(|b<rsub|m><giv><wide|y|\<vect\>>,<wide|c|\<vect\>><pt>|)>>
  of each info bit. To exploit prior knowledge that <math|<wide|x|\<vect\>>>
  is clustered-sparse, we employ the GM2-HMM prior described in <secref|GM2>.
  As a result, the info-bit posterior can be decomposed into the following
  product of factors:

  <eqnarray|<tformat|<table|<row|<cell|<lefteqn|p<around|(|b<rsub|m><giv><wide|y|\<vect\>>,<wide|c|\<vect\>><pt>|)><space|0.17em>=<space|0.17em><big|sum><rsub|<wide|b|\<vect\>><rsub|-m>>p<around|(|<wide|b|\<vect\>><giv><wide|y|\<vect\>>,<wide|c|\<vect\>><pt>|)><space|0.17em>\<propto\><space|0.17em><big|sum><rsub|<wide|b|\<vect\>><rsub|-m>>p<around|(|<wide|y|\<vect\>><giv><wide|b|\<vect\>>,<wide|c|\<vect\>><pt>|)>*p<around|(|<wide|b|\<vect\>>|)>><space|1em>>|<cell|>|<cell|<eq-number><label|eq:propto>>>|<row|<cell|>|<cell|=>|<cell|<big|int><rsub|<wide|x|\<vect\>>><big|sum><rsub|<wide|s|\<vect\>>,<wide|d|\<vect\>>,<wide|c|\<vect\>>,<wide|b|\<vect\>><rsub|-m>><space|-4mm>p<around|(|<wide|y|\<vect\>><giv><wide|s|\<vect\>>,<wide|x|\<vect\>>|)>*p<around|(|<wide|x|\<vect\>><giv><wide|d|\<vect\>>|)>*p<around|(|<wide|d|\<vect\>>|)>*p<around|(|<wide|s|\<vect\>><giv><wide|c|\<vect\>>|)>*p<around|(|<wide|c|\<vect\>><giv><wide|b|\<vect\>>,<wide|c|\<vect\>><pt>|)>*p<around|(|<wide|b|\<vect\>>|)>>>|<row|<cell|>|<cell|=>|<cell|<big|int><rsub|<wide|x|\<vect\>>><big|sum><rsub|<wide|d|\<vect\>>><big|prod><rsub|j=0><rsup|L-1>p<around|(|x<rsub|j><giv>d<rsub|j>|)>*p<around|(|d<rsub|j><giv>d<rsub|j-1>|)>*<big|sum><rsub|<wide|s|\<vect\>>><big|prod><rsub|i=0><rsup|N-1>p<around|(|y<rsub|i><giv>s<rsub|i>,<wide|x|\<vect\>>|)>>>|<row|<cell|>|<cell|>|<cell|\<times\><big|sum><rsub|<wide|c|\<vect\>>>p<around|(|s<rsub|i><giv><wide|c|\<vect\>><rsub|i>|)>*<big|sum><rsub|<wide|b|\<vect\>><rsub|-m>>p<around|(|<wide|c|\<vect\>><giv><wide|b|\<vect\>>,<wide|c|\<vect\>><pt>|)>*<big|prod><rsub|m=1><rsup|<Mi>>p<around|(|b<rsub|m>|)>,<space|1em><eq-number><label|eq:factored>>>>>>

  where <math|<wide|b|\<vect\>><rsub|-m><space|-0.17em><defn><space|-0.17em><around|[|b<rsub|1>,\<ldots\>,b<rsub|m-1>,b<rsub|m+1>,\<ldots\>,b<rsub|<Mi>>|]><tran>>.
  This factorization is illustrated by the <em|factor graph> in
  <figref|factor<rsub|g>raph<rsub|n>oncoh<rsub|c>lust>, where the round nodes
  represent random variables and the square nodes represent the factors of
  the posterior exposed in <eqref|factored>.

  <subsection|Background on belief propagation><label|sec:bp>

  Although exact evaluation of the posteriors
  <math|<around|{|p<around|(|b<rsub|m><giv><wide|y|\<vect\>>,<wide|c|\<vect\>><pt>|)>|}>>
  is computationally impractical for the problem sizes of interest, these
  posteriors can be approximately evaluated using <em|belief propagation>
  (BP) <cite|Pearl:Book:88> on the loopy factor graph in
  <figref|factor<rsub|g>raph<rsub|n>oncoh<rsub|c>lust>. In textbook BP,
  beliefs take the form of pdfs/pmfs that are propagated among nodes of the
  factor graph via the <em|sum/product algorithm> (SPA):

  <\enumerate>
    <item>Say the factor node <math|f> is connected to the variable nodes
    <math|<around|{|v<rsub|a>|}><rsub|a=1><rsup|A>>. The belief passed from
    <math|f> to <math|v<rsub|b>> is <math|p<rsub|f\<rightarrow\>v<rsub|b>><around|(|v<rsub|b>|)>\<propto\><big|int><rsub|<around|{|v<rsub|a>|}><rsub|a\<neq\>b>>f<around|(|v<rsub|1>,\<ldots\>,v<rsub|A>|)>*<big|prod><rsub|a\<neq\>b>p<rsub|v<rsub|a>\<rightarrow\>f><around|(|v<rsub|a>|)>>,
    given the beliefs <math|<around|{|p<rsub|v<rsub|a>\<rightarrow\>f><around|(|\<cdot\>|)>|}><rsub|a\<neq\>b>>
    recently passed to <math|f>.

    <item>Say the variable node <math|v> is connected to the factor nodes
    <math|<around|{|f<rsub|1>,\<ldots\>,f<rsub|B>|}>>. The belief passed from
    <math|v> to <math|f<rsub|a>> is <math|p<rsub|v\<rightarrow\>f<rsub|a>><around|(|v|)>\<propto\><big|prod><rsub|b\<neq\>a>p<rsub|f<rsub|b>\<rightarrow\>v><around|(|v|)>>,
    given the beliefs <math|<around|{|p<rsub|f<rsub|b>\<rightarrow\>v><around|(|\<cdot\>|)>|}><rsub|b\<neq\>a>>
    recently passed to <math|v>.

    <item>Say the variable node <math|v> is connected to the factor nodes
    <math|<around|{|f<rsub|1>,\<ldots\>,f<rsub|B>|}>>. The posterior on
    <math|v> is the product of all recently arriving beliefs, i.e.,
    <math|p<around|(|v|)>\<propto\><big|prod><rsub|b=1><rsup|B>p<rsub|f<rsub|b>\<rightarrow\>v><around|(|v|)>>.
  </enumerate>

  When the factor graph contains no loops, SPA-BP yields exact posteriors
  after two rounds of message passing (i.e., forward and backward). But, in
  the presence of loops, convergence to the exact posteriors is not
  guaranteed. That said, there exist many problems to which loopy BP has been
  successfully applied, including LDPC decoding <cite|MacKay:Book:03>,
  inference on Markov random fields <cite|Freeman:IJCV:00>, and compressed
  sensing <cite|Baron:TSP:10|Donoho:PNAS:09|Bayati:10|Rangan:10b|Schniter:CISS:10|Rangan:10v2>.
  Our work not only leverages these three past successes, but unites them.

  <subsection|Background on GAMP><label|sec:gamp>

  An important sub-problem within our larger bit-inference problem is the
  estimation of a vector of independent possibly-non-Gaussian variables
  <math|<wide|x|\<vect\>>> that are linearly mixed via
  <math|<wide|\<Phi\>|\<vect\>>\<in\><Complex><rsup|N\<times\>L>> to form
  <math|<wide|z|\<vect\>><space|-0.17em>=<space|-0.17em><wide|\<Phi\>*x|\<vect\>><space|-0.17em>=<space|-0.17em><around|[|z<rsub|0>,\<ldots\>,z<rsub|N-1>|]><tran>>,
  and subsequently observed as noisy measurements <math|<wide|y|\<vect\>>>
  through the possibly non-Gaussian pdfs <math|<around|{|p<rsub|Y<rsub|i>\|Z<rsub|i>><around|(|.<giv>.|)>|}><rsub|i=0><rsup|N-1>>.
  In our case, <eqref|pxj> specifies a GM2 prior on <math|x<rsub|j>> and
  <eqref|yi>\Vgiven the finite-alphabet uncertainty in
  <math|s<rsub|i>>\Vyields the non-Gaussian measurement pdf
  <math|p<rsub|Y<rsub|i>\|Z<rsub|i>>>. This \Plinear mixing\Q sub-problem is
  described by the factor graph shown within the middle dashed box in
  <figref|factor<rsub|g>raph<rsub|n>oncoh<rsub|c>lust>, where each node
  \P<math|y<rsub|i>>\Q represents the measurement pdf
  <math|p<rsub|Y<rsub|i>\|Z<rsub|i>>> and the node to the right of each node
  \P<math|x<rsub|j>>\Q represents the GM2 prior on <math|x<rsub|j>>.

  Building on recent work on multiuser detection by Guo and Wang
  <cite|Guo:ISIT:07>, as well as recent work on message passing algorithms
  for compressed sensing by Donoho, Maleki, and Montanari
  <cite|Donoho:PNAS:09|Bayati:10>, Rangan proposed a so-called
  <em|generalized approximate message passing> (GAMP) scheme that, for the
  sub-problem described above, yields posteriors that become asymptotically
  exact as <math|N,L<space|-0.17em>\<rightarrow\><space|-0.17em>\<infty\>>
  <cite|Rangan:10b>. The main ideas behind GAMP are the following. First,
  although the beliefs flowing leftward from the nodes
  <math|<around|{|x<rsub|j>|}>> are clearly non-Gaussian, the corresponding
  belief about <math|z<rsub|i>=<big|sum><rsub|j=0><rsup|L-1>\<Phi\><rsub|i*j>*x<rsub|j>>
  can be accurately approximated as Gaussian, when <math|L> is large, using
  the central limit theorem. Moreover, to calculate the parameters of this
  distribution (i.e., its mean and variance), only the mean and variance of
  each <math|x<rsub|j>> are needed. Thus, it suffices to pass only means and
  variances leftward from each <math|x<rsub|j>> node. It is similarly
  desirable to pass only means and variances rightward from each measurement
  node. Although the exact rightward flowing beliefs would be non-Gaussian
  (due to the non-Gaussian assumption on the measurement channels
  <math|p<rsub|Y<rsub|i>\|Z<rsub|i>>>), GAMP approximates them as Gaussian
  using a 2nd-order Taylor series, and passes only the resulting means and
  variances. A further simplification employed by GAMP is to approximate the
  <em|differences> among the outgoing means/variances of each left node, and
  the incoming means/variances of each right node, using Taylor series. The
  GAMP algorithm<footnote|To be precise, the GAMP algorithm in <tabref|gamp>
  is an extension of that proposed in <cite|Rangan:10b>. <tabref|gamp>
  handles circular <em|complex-valued> distributions and <em|non-identically>
  distributed signals and measurements.> is summarized in <tabref|gamp>.

  <\putTable|gamp|The GAMP Algorithm>
    <\equation*>
      <tabular*|<tformat|<cwith|1|-1|1|1|cell-lborder|1ln>|<cwith|1|-1|1|1|cell-halign|l>|<cwith|1|-1|2|2|cell-halign|r>|<cwith|1|-1|3|3|cell-halign|c>|<cwith|1|-1|4|4|cell-halign|l>|<cwith|1|1|1|-1|cell-tborder|1ln>|<cwith|1|1|1|1|cell-col-span|2>|<cwith|1|1|1|1|cell-lborder|1ln>|<cwith|1|1|1|1|cell-halign|l>|<cwith|1|1|1|1|cell-rborder|0ln>|<cwith|1|1|1|-1|cell-valign|top>|<cwith|1|1|1|-1|cell-vmode|exact>|<cwith|1|1|1|-1|cell-height|<plus|1fn|-1mm>>|<cwith|8|8|1|1|cell-col-span|2>|<cwith|8|8|1|1|cell-lborder|1ln>|<cwith|8|8|1|1|cell-halign|l>|<cwith|8|8|1|1|cell-rborder|0ln>|<cwith|12|12|1|1|cell-col-span|2>|<cwith|12|12|1|1|cell-lborder|1ln>|<cwith|12|12|1|1|cell-halign|l>|<cwith|12|12|1|1|cell-rborder|0ln>|<cwith|22|22|1|1|cell-col-span|2>|<cwith|22|22|1|1|cell-lborder|1ln>|<cwith|22|22|1|1|cell-halign|l>|<cwith|22|22|1|1|cell-rborder|0ln>|<cwith|22|22|1|-1|cell-bborder|1ln>|<table|<row|<cell|<with|font-family|ss|d*e*f*i*n*i*t*i*o*n*s:>>|<cell|>|<cell|>|<cell|>|<cell|>>|<row|<cell|>|<cell|p<rsub|Z<rsub|i>\|Y<rsub|i>><around|(|z\|y;<wide|z|^>,\<mu\><rsup|z>|)>>|<cell|=>|<cell|<frac|p<rsub|Y<rsub|i>\|Z<rsub|i>><around|(|y\|z|)><space|0.17em><mc|C*N><around|(|z;<wide|z|^>,\<mu\><rsup|z>|)>|<big|int><rsub|z<rprime|'>>p<rsub|Y<rsub|i>\|Z<rsub|i>><around|(|y\|z<rprime|'>|)><space|0.17em><mc|C*N><around|(|z<rprime|'>;<wide|z|^>,\<mu\><rsup|z>|)>>>|<cell|<text|(D1)>>>|<row|<cell|>|<cell|g<out><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>>|<cell|=>|<cell|<frac|1|\<mu\><rsup|z>>*<around*|(|<E><rsub|Z<rsub|i>\|Y<rsub|i>><around|{|z\|y;<wide|z|^>,\<mu\><rsup|z>|}>-<wide|z|^>|)>>|<cell|<text|(D2)>>>|<row|<cell|>|<cell|g<out><rprime|'><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>>|<cell|=>|<cell|<frac|1|\<mu\><rsup|z>>*<around*|(|<frac|<value|var><rsub|Z<rsub|i>\|Y<rsub|i>><around|{|z\|y;<wide|z|^>,\<mu\><rsup|z>|}>|\<mu\><rsup|z>>-1|)>>|<cell|<text|(D3)>>>|<row|<cell|>|<cell|p<rsub|X<rsub|j>><space|-0.17em><around|(|x;<wide|r|^>,\<mu\><rsup|r>|)>>|<cell|=>|<cell|<frac|p<rsub|X<rsub|j
>><space|-0.17em><around|(|x|)><space|0.17em><mc|C*N><around|(|x;<wide|r|^>,\<mu\><rsup|r>|)>|<big|int><rsub|x<rprime|'>>p<rsub|X<rsub|j>><space|-0.17em><around|(|x<rprime|'>|)><space|0.17em><mc|C*N><around|(|x<rprime|'>;<wide|r|^>,\<mu\><rsup|r>|)>>>|<cell|<text|(D4)>>>|<row|<cell|>|<cell|g<inp><around|(|<wide|r|^>,\<mu\><rsup|r>|)>>|<cell|=>|<cell|<big|int><rsub|x>x*<space|0.17em>p<rsub|X<rsub|j>><space|-0.17em><around|(|x;<wide|r|^>,\<mu\><rsup|r>|)>>|<cell|<text|(D5)>>>|<row|<cell|>|<cell|g<inp><rprime|'><around|(|<wide|r|^>,\<mu\><rsup|r>|)>>|<cell|=>|<cell|<frac|1|\<mu\><rsup|r>>*<big|int><rsub|x><around|\||x-g<inp><around|(|<wide|r|^>,\<mu\><rsup|r>|)>|\|><rsup|2>*<space|0.17em>p<rsub|X<rsub|j>><space|-0.17em><around|(|x;<wide|r|^>,\<mu\><rsup|r>|)><space|1em>>|<cell|<text|(D6)>>>|<row|<cell|<with|font-family|ss|i*n*i*t*i*a*l*i*z*e:>>|<cell|>|<cell|>|<cell|>|<cell|>>|<row|<cell|>|<cell|\<forall\>j:<wide|x|^><rsub|j><around|(|1|)>>|<cell|=>|<cell|<big|int><rsub|x>x*<space|0.17em>p<rsub|X<rsub|j>><around|(|x|)>>|<cell|<text|(I1)>>>|<row|<cell|>|<cell|\<forall\>j:\<mu\><rsup|x><rsub|j><around|(|1|)>>|<cell|=>|<cell|<big|int><rsub|x><around|\||x-<wide|x|^><rsub|j><around|(|1|)>|\|><rsup|2>*p<rsub|X<rsub|j>><around|(|x|)>>|<cell|<text|(I2)>>>|<row|<cell|>|<cell|\<forall\>i:<wide|u|^><rsub|i><around|(|0|)>>|<cell|=>|<cell|0>|<cell|<text|(I3)>>>|<row|<cell|<with|font-family|ss|f*o*r><space|0.17em>n=1,2,3,\<ldots\>>|<cell|>|<cell|>|<cell|>|<cell|>>|<row|<cell|>|<cell|\<forall\>i:<wide|z|^><rsub|i><around|(|n|)>>|<cell|=>|<cell|<with|math-display|false|<big|sum><rsub|j=0><rsup|L-1>\<Phi\><rsub|i*j>*<wide|x|^><rsub|j><around|(|n|)>>>|<cell|<text|(R1)>>>|<row|<cell|>|<cell|\<forall\>i:\<mu\><rsup|z><rsub|i><around|(|n|)>>|<cell|=>|<cell|<with|math-display|false|<big|sum><rsub|j=0><rsup|L-1><around|\||\<Phi\><rsub|i*j>|\|><rsup|2>*\<mu\><rsup|x><rsub|j><around|(|n|)>>>|<cell|<text|(R2)>>>|<row|<cell|>|<cell|\<forall\>i:<wide|p|^><rsub|i><around|(|n|)>>|<cell|=>|<cell|<wide|z|^><rsub|i
><around|(|n|)>-\<mu\><rsup|z><rsub|i><around|(|n|)>*<space|0.17em><wide|u|^><rsub|i>*<around|(|n-1|)>>|<cell|<text|(R3)>>>|<row|<cell|>|<cell|\<forall\>i:<wide|u|^><rsub|i><around|(|n|)>>|<cell|=>|<cell|g<out><around|(|y<rsub|i>,<wide|p|^><rsub|i><around|(|n|)>,\<mu\><rsup|z><rsub|i><around|(|n|)>|)>>|<cell|<text|(R4)>>>|<row|<cell|>|<cell|\<forall\>i:\<mu\><rsup|u><rsub|i><around|(|n|)>>|<cell|=>|<cell|-g<out><rprime|'><around|(|y<rsub|i>,<wide|p|^><rsub|i><around|(|n|)>,\<mu\><rsup|z><rsub|i><around|(|n|)>|)>>|<cell|<text|(R5)>>>|<row|<cell|>|<cell|\<forall\>j:\<mu\><rsup|r><rsub|j><around|(|n|)>>|<cell|=>|<cell|<with|math-display|false|<around*|(|<big|sum><rsub|i=0><rsup|N-1><around|\||\<Phi\><rsub|i*j>|\|><rsup|2>*\<mu\><rsup|u><rsub|i><around|(|n|)>|)><rsup|-1>>>|<cell|<text|(R6)>>>|<row|<cell|>|<cell|\<forall\>j:<wide|r|^><rsub|j><around|(|n|)>>|<cell|=>|<cell|<with|math-display|false|<wide|x|^><rsub|j><around|(|n|)>+\<mu\><rsup|r><rsub|j><around|(|n|)>*<big|sum><rsub|i=0><rsup|N-1>\<Phi\><rsub|i*j><rsup|\<ast\>>*<wide|u|^><rsub|i><around|(|n|)>>>|<cell|<text|(R7)>>>|<row|<cell|>|<cell|\<forall\>j:\<mu\><rsup|x><rsub|j>*<around|(|n<space|-0.17em>+<space|-0.17em>1|)>>|<cell|=>|<cell|\<mu\><rsup|r><rsub|j><around|(|n|)>*g<inp><rprime|'><around|(|<wide|r|^><rsub|j><around|(|n|)>,\<mu\><rsup|r><rsub|j><around|(|n|)>|)>>|<cell|<text|(R8)>>>|<row|<cell|>|<cell|\<forall\>j:<wide|x|^><rsub|j>*<around|(|n<space|-0.17em>+<space|-0.17em>1|)>>|<cell|=>|<cell|g<inp><around|(|<wide|r|^><rsub|j><around|(|n|)>,\<mu\><rsup|r><rsub|j><around|(|n|)>|)>>|<cell|<text|(R9)>>>|<row|<cell|<with|font-family|ss|e*n*d>>|<cell|>|<cell|>|<cell|>|<cell|>>>>>
    </equation*>
  </putTable>

  <subsection|Joint estimation and decoding using GAMP><label|sec:jced>

  We now detail our application of GAMP to joint channel-estimation and
  decoding (JCED) under the GM2-HMM tap prior, frequently referring to the
  factor graph in <figref|factor<rsub|g>raph<rsub|n>oncoh<rsub|c>lust>.

  Because our factor graph is loopy, there exists considerable freedom in the
  message passing schedule. Roughly speaking, we choose to pass messages from
  the left to the right of <figref|factor<rsub|g>raph<rsub|n>oncoh<rsub|c>lust>
  and back again, several times, stopping as soon as the messages converge.
  Each of these full cycles of message passing will be referred to as a
  \Pturbo iteration.\Q However, during a single turbo iteration, there may be
  multiple iterations of message passing <em|between> the GAMP and MC
  sub-graphs, which will be referred to as \Pequalizer\Q iterations.
  Furthermore, during a single equalizer iteration, there may be multiple
  iterations of message passing <em|within> the GAMP sub-graph, while there
  is at most one forward-backward iteration <em|within> the MC sub-graph.
  Finally, the SISO decoding block may itself be implemented using message
  passing, in which case it may also use several internal iterations. The
  message passing details are discussed below.

  At the start of the first turbo iteration, there is total uncertainty about
  the information bits, so that <math|Pr <around|{|b<rsub|m><space|-0.17em>=<space|-0.17em>1|}><space|-0.17em>=<space|-0.17em><frac|1|2><nbsp>\<forall\>m>.
  Thus, the initial bit beliefs flowing rightward out of the
  coding/interleaving block are uniformly distributed. Meanwhile, the
  pilot/training bits are known with certainty.

  Coded-bit beliefs are then propagated rightward into the symbol mapping
  nodes. Since the symbol mapping is deterministic, the corresponding pdf
  factors take the form <math|p<around|(|s<of|k><giv><wide|c|\<vect\>><of|l>|)>=\<delta\><rsub|k-l>>.
  The SPA dictates that the message passed rightward from symbol mapping node
  \P<math|<mc|M><rsub|i>>\Q takes the form

  <eqnarray|<tformat|<table|<row|<cell|p<rsub|<mc|M><rsub|i>\<rightarrow\>s<rsub|i>><around|(|s<of|k>|)>>|<cell|\<propto\>>|<cell|<big|sum><rsub|<wide|c|\<vect\>>\<in\><around|{|0,1|}><rsup|M>>p<around|(|s<of|k>\|<wide|c|\<vect\>>|)>*<big|prod><rsub|m=1><rsup|M>p<rsub|c<rsub|i,m>\<rightarrow\><mc|M><rsub|i>><around|(|c<rsub|m>|)><space|1em><eq-number>>>|<row|<cell|>|<cell|=>|<cell|<big|prod><rsub|m=1><rsup|M>p<rsub|c<rsub|i,m>\<rightarrow\><mc|M><rsub|i>><around|(|c<rsub|m><of|k>|)>,<eq-number>>>>>>

  which is then copied forward as the message passed rightward from node
  <math|s<rsub|i>> (i.e., <math|p<rsub|<mc|M><rsub|i>\<rightarrow\>s<rsub|i>><around|(|s<of|k>|)>=p<rsub|s<rsub|i>\<rightarrow\>y<rsub|i>><around|(|s<of|k>|)>>).

  Recall, from <secref|gamp>, that the symbol-belief passed rightward into
  the measurement node \P<math|y<rsub|i>>\Q determines the pdf
  <math|p<rsub|Y<rsub|i>\|Z<rsub|i>>> used in GAMP. Writing this symbol
  belief as <math|<wide|\<beta\>|\<vect\>><rsub|i><defn><around|[|\<beta\><rsub|i><of|1>,\<ldots\>,\<beta\><rsub|i><of|2<rsup|M>>|]><tran>>
  for <math|\<beta\><rsub|i><of|k><defn>p<rsub|s<rsub|i>\<rightarrow\>y<rsub|i>><around|(|s<of|k>|)>>,
  equation <eqref|yi> implies the measurement pdf

  <eqnarray|<tformat|<table|<row|<cell|p<rsub|Y<rsub|i>\|Z<rsub|i>><around|(|y\|z|)>>|<cell|=>|<cell|<big|sum><rsub|k=1><rsup|2<rsup|M>>\<beta\><rsub|i><of|k><space|0.17em><mc|C*N><around|(|y;s<of|k>z,\<mu\><rsup|v>|)>.<eq-number><label|eq:pY\|Z>>>>>>

  From <eqref|pY\|Z>, it is shown in <appref|out> that the quantities in
  (D2)-(D3) of <tabref|gamp> become

  <eqnarray|<tformat|<table|<row|<cell|g<out><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>>|<cell|=>|<cell|<frac|1|\<mu\><rsup|z>>*<wide|e|^><rsub|i><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)><eq-number><label|eq:gout>>>|<row|<cell|g<rprime|'><out><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>>|<cell|=>|<cell|<frac|1|\<mu\><rsup|z>>*<around*|(|<frac|\<mu\><rsup|e><rsub|i><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>|\<mu\><rsup|z>>-1|)><eq-number><label|eq:g'out>>>>>>

  for

  <eqnarray|<tformat|<table|<row|<cell|\<xi\><rsub|i><of|k><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>>|<cell|<defn>>|<cell|<frac|\<beta\><rsub|i><of|k><mc|C*N><around|(|y;s<of|k><wide|z|^>,<around|\||s<of|k>|\|><rsup|2>*\<mu\><rsup|z><space|-0.17em>+<space|-0.17em>\<mu\><rsup|v>|)>|<big|sum><rsub|k<rprime|'>>\<beta\><rsub|i><of|k<rprime|'>><mc|C*N><around|(|y;s<of|k<rprime|'>><wide|z|^>,<around|\||s<of|k<rprime|'>>|\|><rsup|2>*\<mu\><rsup|z><space|-0.17em>+<space|-0.17em>\<mu\><rsup|v>|)>><space|1em><eq-number><label|eq:xi>>>|<row|<cell|\<zeta\><of|k><around|(|\<mu\><rsup|z>|)>>|<cell|<defn>>|<cell|<frac|<around|\||s<of|k>|\|><rsup|2>*\<mu\><rsup|z>|<around|\||s<of|k>|\|><rsup|2>*\<mu\><rsup|z>+\<mu\><rsup|v>><eq-number><label|eq:zeta>>>|<row|<cell|<wide|e|^><of|k><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>>|<cell|<defn>>|<cell|<around*|(|<frac|y|s<of|k>>-<wide|z|^>|)>*\<zeta\><of|k><around|(|\<mu\><rsup|z>|)><eq-number>>>|<row|<cell|<wide|e|^><rsub|i><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>>|<cell|<defn>>|<cell|<big|sum><rsub|k=1><rsup|2<rsup|M>>\<xi\><rsub|i><of|k><space|-0.17em><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>*<space|0.17em><wide|e|^><of|k><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)><eq-number><label|eq:ei>>>|<row|<cell|\<mu\><rsup|e><rsub|i><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>>|<cell|<defn>>|<cell|<big|sum><rsub|k=1><rsup|2<rsup|M>>\<xi\><rsub|i><of|k><space|-0.17em><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>*<space|0.17em><around*|(|<around|\||<wide|e|^><of|k><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>-<wide|e|^><rsub|i><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>|\|><rsup|2>+<frac|\<mu\><rsup|v>*\<zeta\><of|k><around|(|\<mu\><rsup|z>|)>|<around|\||s<of|k>|\|><rsup|2>>|)><eq-number><label|eq:muei>>>>>>

  where <math|<wide|\<xi\>|\<vect\>><rsub|i><space|-0.17em><defn><space|-0.17em><around|[|\<xi\><rsub|i><of|1>,\<ldots\>,\<xi\><rsub|i><of|2<rsup|M>>|]><tran>>
  characterizes the posterior pmf on <math|s<rsub|i>> under the channel model
  <math|z<rsub|i>\<sim\><mc|C*N><around|(|<wide|z|^>,\<mu\><rsup|z>|)>>.
  Likewise, from <eqref|pxj>, it is shown in <appref|in> that the quantities
  (D5)-(D6) take the form

  <eqnarray|<tformat|<table|<row|<cell|g<inp><around|(|<wide|r|^>,\<mu\><rsup|r>|)>>|<cell|=>|<cell|<around*|(|\<alpha\><rsub|j>*<space|0.17em>\<gamma\><rsup|1><rsub|j>+<around*|(|1-\<alpha\><rsub|j>|)>*<space|0.17em>\<gamma\><rsup|0><rsub|j>|)>*<wide|r|^><eq-number><label|eq:gin>>>|<row|<cell|g<rprime|'><inp><around|(|<wide|r|^>,\<mu\><rsup|r>|)>>|<cell|=>|<cell|\<alpha\><rsub|j>*<around|(|1-\<alpha\><rsub|j>|)>*<around|(|\<gamma\><rsup|1><rsub|j>-\<gamma\><rsup|0><rsub|j>|)><rsup|2><space|0.17em><around|\||<wide|r|^>|\|><rsup|2>/\<mu\><rsup|r>+\<alpha\><rsub|j>*\<gamma\><rsup|1><rsub|j>+<around|(|1-\<alpha\><rsub|j>|)>*\<gamma\><rsup|0><rsub|j>,<eq-number><label|eq:g'in>>>>>>

  for <math|\<gamma\><rsup|0><rsub|j><around|(|\<mu\><rsup|r>|)><defn><around|(|1+\<mu\><rsup|r>/\<mu\><rsub|j><rsup|0>|)><rsup|-1>>,
  <math|\<gamma\><rsup|1><rsub|j><around|(|\<mu\><rsup|r>|)><defn><around|(|1+\<mu\><rsup|r>/\<mu\><rsub|j><rsup|1>|)><rsup|-1>>,
  and

  <eqnarray|<tformat|<table|<row|<cell|\<alpha\><rsub|j><around|(|<wide|r|^>,\<mu\><rsup|r>|)>>|<cell|<defn>>|<cell|<frac|1|1+<around*|(|<wide*|<frac|\<lambda\><rsub|j>|1-\<lambda\><rsub|j>>|\<wide-underbrace\>><rsub|<with|math-display|true|<mc|L><pri><rsub|j>>><wide*|<frac|<mc|C*N><around|(|<wide|r|^>;0,\<mu\><rsub|j><rsup|1>+\<mu\><rsup|r>|)>|<mc|C*N><around|(|<wide|r|^>;0,\<mu\><rsub|j><rsup|0>+\<mu\><rsup|r>|)>>|\<wide-underbrace\>><rsub|<with|math-display|true|<mc|L><ext><rsub|j><around|(|<wide|r|^>,\<mu\><rsup|r>|)>>>|)><rsup|-1>>.<eq-number><label|eq:alfj>>>>>>

  Above, <math|<mc|L><rsub|j><pri>> is the apriori likelihood ratio
  <math|<frac|Pr <around|{|d<rsub|j>=1|}>|Pr <around|{|d<rsub|j>=0|}>>> on
  the hidden state, <math|<mc|L><rsub|j><ext><around|(|<wide|r|^>,\<mu\><rsup|r>|)>>
  is GAMP's extrinsic likelihood ratio, and
  <math|\<alpha\><rsub|j><around|(|<wide|r|^>,\<mu\><rsup|r>|)>> is the
  corresponding posterior probability that <math|d<rsub|j>=1>.

  Using <eqref|gout>-<eqref|alfj>, the GAMP algorithm in <tabref|gamp> is
  iterated until it converges. In doing so, GAMP generates (a close
  approximation to) both the conditional means <math|<hvec|x>> and variances
  <math|<wide|\<mu\>|\<vect\>><rsup|x><space|-0.17em><defn><space|-0.17em><around|[|\<mu\><rsub|0><rsup|x>,\<ldots\>,\<mu\><rsub|L-1><rsup|x>|]><tran>>
  given the observations <math|<wide|y|\<vect\>>>, the soft symbol priors
  <math|<wide|\<beta\>|\<vect\>><space|-0.17em><defn><space|-0.17em><around|[|<wide|\<beta\>|\<vect\>><rsub|0>,\<ldots\>,<wide|\<beta\>|\<vect\>><rsub|N-1>|]><tran>>
  and the sparsity prior <math|<wide|\<lambda\>|\<vect\>>>. Conveniently,
  GAMP also returns (close approximations to) both the conditional means
  <math|<hvec|z>> and variances <math|<wide|\<mu\>|\<vect\>><rsup|z>> of the
  subchannel gains <math|<wide|z|\<vect\>>>, as well as posteriors
  <math|<wide|\<xi\>|\<vect\>><space|-0.17em><defn><space|-0.17em><around|[|<wide|\<xi\>|\<vect\>><rsub|0>,\<ldots\>,<wide|\<xi\>|\<vect\>><rsub|N-1>|]><tran>>
  on the symbols <math|<wide|s|\<vect\>>>.

  Before continuing, we discuss some GAMP details that are specific to our
  OFDM-JCED application. First, we notice that, to guarantee that the
  variance <math|\<mu\><rsub|i><rsup|u><around|(|n|)>> in (R5) is positive,
  we must have <math|\<mu\><rsup|e><rsub|i><space|-0.17em>\<less\><space|-0.17em>\<mu\><rsup|z>>
  in <eqref|g'out>. Since this is not necessarily the case during the first
  few GAMP iterations, we clip <math|\<mu\><rsup|e><rsub|i>> at the value
  <math|0.99*\<mu\><rsup|z>>, where <math|0.99> was chosen heuristically.
  Second, due to the unit-modulus property of the DFT elements
  <math|\<Phi\><rsub|i*j>>, step (R2) in <tabref|gamp> simplifies to
  <math|\<mu\><rsup|z><rsub|i><around|(|n|)><space|-0.17em>=<space|-0.17em><big|sum><rsub|j>\<mu\><rsub|j><rsup|x><around|(|n|)>>
  and (R6) simplifies to <math|\<mu\><rsub|j><rsup|r><around|(|n|)><space|-0.17em>=<space|-0.17em><around*|(|<big|sum><rsub|i>\<mu\><rsub|i><rsup|u><around|(|n|)>|)><rsup|-1>>.
  With these simplifications, the complexity of GAMP is dominated by either
  the matrix-vector products <math|<big|sum><rsub|j>\<Phi\><rsub|i*j>*<wide|x|^><rsub|j><around|(|n|)>>
  in (R1) and <math|<big|sum><rsub|i>\<Phi\><rsub|i*j><rsup|\<ast\>>*<wide|u|^><rsub|i><around|(|n|)>>
  in (R7), which can be implemented using an <math|N*log<rsub|2>N>-multiply
  FFT when <math|N> is a power-of-two, or by the calculation of
  <math|<around|{|<wide|e|^><rsub|i>,\<mu\><rsub|i><rsup|e>|}><rsub|i=0><rsup|N-1>>
  in <eqref|ei>-<eqref|muei>, which requires
  <math|<mc|O><around|(|N*2<rsup|M>|)>> multiplies. Thus, GAMP requires only
  <math|<mc|O><around|(|N*log<rsub|2>N+N*2<rsup|M>|)>> multiplies per
  iteration.

  After the messages within the GAMP sub-graph have converged, tap-state
  beliefs are passed rightward to the MC sub-graph. In particular, the SPA
  dictates that GAMP passes tap-state likelihoods or, equivalently, the
  extrinsic likelihood ratios <math|<mc|L><rsub|j><ext>>. Since the MC
  sub-graph is non-loopy, only one iteration of forward-backward message
  passing is performed,<footnote|Message passing on the MC factor graph is a
  standard procedure. For details, we refer the reader to
  <cite|MacKay:Book:03|Bishop:Book:07>.> after which the resulting tap-state
  likelihoods are passed leftward back to GAMP, where they are treated as
  tap-state priors <math|<wide|\<lambda\>|\<vect\>>> in the next equalizer
  iteration. This interaction between the GAMP and MC sub-blocks can be
  recognized as an incarnation of the structured-sparse reconstruction scheme
  recently proposed by the authors in <cite|Schniter:CISS:10>.

  When the tap-state likelihoods passed between GAMP and MC have converged,
  the equalizer iterations are terminated and messages are passed leftward
  from the GAMP block. For this, SPA dictates that a symbol-belief propagates
  leftward from the <math|y<rsub|i>> node with the form

  <eqnarray|<tformat|<table|<row|<cell|p<rsub|s<rsub|i>\<leftarrow\>y<rsub|i>><around|(|s|)>>|<cell|\<propto\>>|<cell|<big|int><rsub|z><mc|C*N><around|(|y<rsub|i>;s*z,\<mu\><rsup|v>|)><space|0.17em><mc|C*N><around|(|z;<wide|z|^><rsub|i>,\<mu\><rsub|i><rsup|z>|)><eq-number>>>|<row|<cell|>|<cell|=>|<cell|<mc|C*N><around|(|y<rsub|i>;s*<wide|z|^><rsub|i>,<around|\||s|\|><rsup|2>*\<mu\><rsub|i><rsup|z>+\<mu\><rsup|v>|)>,<eq-number>>>>>>

  where <math|<around|(|<wide|z|^><rsub|i>,\<mu\><rsub|i><rsup|z>|)>> play
  the role of soft channel estimates. The SPA then implies that
  <math|p<rsub|<mc|M><rsub|i>\<leftarrow\>s<rsub|i>><around|(|s|)>=p<rsub|s<rsub|i>\<leftarrow\>y<rsub|i>><around|(|s|)>>.

  Next, beliefs are passed leftward from each symbol-mapping node
  <math|<mc|M><rsub|i>> to the corresponding bit nodes <math|c<rsub|i,m>>.
  From the SPA, they take the form

  <eqnarray|<tformat|<table|<row|<cell|<lefteqn|p<rsub|c<rsub|i,m>\<leftarrow\><mc|M><rsub|i>><around|(|c|)>>>|<cell|>|<cell|>>|<row|<cell|>|<cell|\<propto\>>|<cell|<big|sum><rsub|k=1><rsup|2<rsup|M>><big|sum><rsub|<wide|c|\<vect\>>:c<rsub|m>=c>p<around|(|s<of|k><giv><wide|c|\<vect\>>|)><nbsp>p<rsub|<mc|M><rsub|i>\<leftarrow\>s<rsub|i>><around|(|s<of|k>|)>*<big|prod><rsub|m<rprime|'>\<neq\>m>p<rsub|c<rsub|i,m<rprime|'>>\<rightarrow\><mc|M><rsub|i>><around|(|c<rsub|m<rprime|'>>|)>>>|<row|<cell|>|<cell|=>|<cell|<big|sum><rsub|k:c<rsub|m><of|k>=c>p<rsub|<mc|M><rsub|i>\<leftarrow\>s<rsub|i>><around|(|s<of|k>|)><frac|<big|prod><rsub|m<rprime|'>=1><rsup|M>p<rsub|c<rsub|i,m<rprime|'>>\<rightarrow\><mc|M><rsub|i>><around|(|c<rsub|m<rprime|'>><of|k>|)>|p<rsub|c<rsub|i,m>\<rightarrow\><mc|M><rsub|i>><around|(|c|)>><eq-number>>>|<row|<cell|>|<cell|=>|<cell|<frac|1|p<rsub|c<rsub|i,m>\<rightarrow\><mc|M><rsub|i>><around|(|c|)>>*<big|sum><rsub|k:c<rsub|m><of|k>=c>p<rsub|<mc|M><rsub|i>\<leftarrow\>s<rsub|i>><around|(|s<of|k>|)>*p<rsub|<mc|M><rsub|i>\<rightarrow\>s<rsub|i>><around|(|s<of|k>|)><eq-number>>>>>>

  for pairs <math|<around|(|i,m|)>> that do not correspond to pilot/training
  bits. (Since the pilot/training bits are known with certainty, there is no
  need to update their pmfs.)

  Finally, messages are passed leftward into the coding/interleaving block.
  Doing so is equivalent to feeding extrinsic soft bit estimates to a
  soft-input/soft-output (SISO) decoder/deinterleaver, which treats them as
  priors. Since SISO decoding is a well-studied topic
  <cite|MacKay:Book:03|Richardson:Book:09> and high-performance
  implementations are readily available (e.g., <cite|Kozintsev:SW>), we will
  not elaborate on the details here. It suffices to say that, once the
  extrinsic outputs of the SISO decoder have been computed, they are
  re-interleaved and passed rightward from the coding/interleaving block to
  begin another turbo iteration. These turbo iterations continue until either
  the decoder detects no bit errors, the soft bit estimates have converged,
  or a maximum number of iterations has elapsed.

  <section|Numerical Results><label|sec:sims>

  In this section, we present numerical results that compare our GAMP-based
  JCED scheme to decoupled channel-estimation and decoding (DCED) based on
  pilot-aided linear MMSE (LMMSE) channel estimates, LASSO channel estimates
  <cite|Tibshirani:JRSSb:96>, and perfect channel state information (CSI).

  <subsection|Setup>

  For all results, we used irregular LDPC codes with codeword length
  <math|\<approx\><space|-0.17em>10000> and average column weight <math|3>,
  generated (and decoded) using the publicly available software
  <cite|Kozintsev:SW>, with random interleaving. We focus on the case of
  <math|N<space|-0.17em>=<space|-0.17em>1024> subcarrier OFDM with
  <math|16>-QAM (i.e., <math|M<space|-0.17em>=<space|-0.17em>4>) operating at
  a spectral efficiency of <math|\<eta\><space|-0.17em>=<space|-0.17em>2>
  bpcu. For bit-to-symbol mapping, we used multilevel Gray-mapping
  <cite|deJong:TCOM:05>, noting recent work <cite|Samuel:ASIL:09> that
  conjectures the optimality of Gray-mapping when BICM is used with a strong
  code. In some simulations, we used <math|<Np><space|-0.17em>\<gtr\><space|-0.17em>0>
  pilot-only subcarriers and <math|<Mt><space|-0.17em>=<space|-0.17em>0>
  interspersed training bits, whereas in others we used
  <math|<Np><space|-0.17em>=<space|-0.17em>0> and
  <math|<Mt><space|-0.17em>\<gtr\><space|-0.17em>0>. When
  <math|<Np><space|-0.17em>\<gtr\><space|-0.17em>0>, the pilot subcarriers
  were placed randomly and modulated with (known) QAM symbols chosen
  uniformly at random. When <math|<Mt><space|-0.17em>\<gtr\><space|-0.17em>0>,
  the training bits were placed at the most significant bits (MSBs) of
  uniformly spaced data-subcarriers and modulated with the bit value
  <math|1>.

  Realizations of the tap vector <math|<wide|x|\<vect\>><around|[|q|]>> were
  generated from IEEE<nbsp>802.15.4a outdoor-NLOS impulse responses and SRRC
  pulses, as described in <secref|IEEE>, and <em|not> from the GM2-HMM prior
  used by GAMP. The tap vectors generated for our simulations are thus as
  realistic as one can hope to obtain in software. All reported results are
  averaged over <math|\<geq\>1000> channel realizations (i.e.,
  <math|\<gtrsim\>10<rsup|7>> info bits).

  The GM2-HMM parameters <math|<wide|\<mu\>|\<vect\>><rsup|0>,<wide|\<mu\>|\<vect\>><rsup|1>,<wide|p|\<vect\>><rsup|01>,<wide|p|\<vect\>><rsup|10>>
  used in GAMP were fit from <math|10000> realizations of the tap-vector
  <math|<wide|x|\<vect\>>> using the procedure described in <secref|GM2>. In
  doing so, we implicitly assume<footnote|If, instead, we knew that the
  receiver would be used in a different operating scenario, then we could
  generate representative realizations of <math|<wide|x|\<vect\>>> for that
  scenario and fit the GM2-HMM parameters accordingly. Furthermore, one could
  optimize the receiver for any desired balance between ``typical'' and
  ``worst-case'' operating conditions by simply choosing appropriate training
  realizations <math|<wide|x|\<vect\>>>. > that the receiver has been
  designed for an outdoor scenario, and we leverage the prior information made
  available by the extensive measurement campaign conducted for the IEEE
  802.15.4a standard <cite|Molisch:802.15.4a>. For JCED-GAMP, we used a
  <em|maximum> of <math|20> turbo iterations, <math|5> equalizer iterations,
  <math|15> GAMP iterations, and <math|25> LDPC decoder iterations, although
  we stress that these maxima were seldom (if ever) reached.<footnote|For
  example, after the second turbo iteration, it is typical to see only a
  single equalizer iteration and a single GAMP iteration.>

  <subsection|Comparison with DCED schemes>

  Given a set of tap estimates <math|<around|{|<hvec|x><around|[|q|]>|}><rsub|q=1><rsup|Q>>,
  the following procedure was used to implement DCED. First, the subcarrier
  estimates <math|<hvec|z><around|[|q|]><space|-0.17em>=<space|-0.17em><wide|\<Phi\>|\<vect\>><hvec|x><around|[|q|]>>
  were computed, from which the (genie-aided empirical) variance
  <math|<wide|\<mu\>|^><rsup|z><around|[|q|]><space|-0.17em><defn><space|-0.17em><norm|<hvec|z><around|[|q|]>-<wide|z|\<vect\>><around|[|q|]>><rsub|2><rsup|2>/N>
  was calculated. Then, using the soft channel estimates
  <math|<around|{|<hvec|z><around|[|q|]>,<wide|\<mu\>|^><rsup|z><around|[|q|]>|}><rsub|q=1><rsup|Q>>,
  leftward SPA-BP on the factor graph in <figref|factor<rsub|g>raph<rsub|n>oncoh<rsub|c>lust>
  was performed exactly as described in <secref|jced<rsub|g>amp>, ensuring
  that the soft channel estimates were fairly/optimally used for SISO
  decoding.

  To compute the LMMSE tap-vector estimate
  <math|<hvec|x><lmmse><around|[|q|]>>, the pilot symbols
  <math|<wide|s|\<vect\>><pt><around|[|q|]>\<in\><const><rsup|<Np>>> and
  pilot subcarrier observations <math|<wide|y|\<vect\>><pt><around|[|q|]>\<in\><Complex><rsup|<Np>>>
  were used as follows:

  <eqnarray|<tformat|<table|<row|<cell|<hvec|x><lmmse><around|[|q|]>>|<cell|=>|<cell|<wide|A|\<vect\>><herm><around|[|q|]>*<around*|(|<wide|A|\<vect\>><around|[|q|]><Diag><around|(|<wide|\<rho\>|\<vect\>>|)><wide|A|\<vect\>><herm><around|[|q|]>+\<mu\><rsup|v>*<wide|I|\<vect\>>|)><rsup|-1>*<wide|y|\<vect\>><pt><around|[|q|]>,<space|1em><eq-number>>>>>>

  where <math|<wide|\<rho\>|\<vect\>>> denotes the PDP,
  <math|<wide|A|\<vect\>><around|[|q|]><defn><Diag><around|(|<wide|s|\<vect\>><pt><around|[|q|]>|)>*<wide|\<Phi\>|\<vect\>><pt>>,
  and <math|<wide|\<Phi\>|\<vect\>><pt>\<in\><Complex><rsup|<Np>\<times\>L>>
  denotes the matrix constructed from the pilot rows and the first <math|L>
  columns of the <math|N>-DFT matrix.

  To compute a pilot-based LASSO<footnote|The criterion employed by LASSO
  <cite|Tibshirani:JRSSb:96> is equivalent to the one employed in ``basis
  pursuit denoising'' <cite|Chen:JSC:98>.> tap estimate
  <math|<hvec|x><lasso><around|[|q|]>>, we invoked the celebrated SPGL1
  algorithm <cite|vandenBerg:JSC:08> with the measurement matrix
  <math|<wide|A|\<vect\>><around|[|q|]>> and a genie-optimized tuning
  parameter.<footnote|The performance of LASSO/SPGL1 is highly dependent on
  the value of a tuning parameter that determines the tradeoff between the
  estimate's sparsity and the residual's variance. To optimize this tradeoff,
  for each realization, SPGL1 was invoked over a dense grid of tuning
  parameters, and the one that minimized <math|<NMSE>> (with respect to the
  true channel) was chosen.> We note that, due to the two genie-aided steps,
  the performance attained by LASSO is somewhat optimistic.

  <subsection|<math|<BER>> versus the number of pilot subcarriers
  <math|<Np>>>

  <Figref|ber<rsub|v>s<rsub|N>p> shows bit error rate (<math|<BER>>) versus
  the number of pilot subcarriers <math|<Np>> at
  <math|E<rsub|b>/N<rsub|o><space|-0.17em>=<space|-0.17em>11> dB and a fixed
  spectral efficiency of <math|\<eta\><space|-0.17em>=<space|-0.17em>2> bpcu.
  In this and other figures, \P<with|font-family|ss|font-size|0.84|GAMP-#
  MC-5>,\Q refers to JCED-GAMP with # turbo
  iterations<footnote|``<with|font-family|ss|GAMP-fin>'' refers to JCED-GAMP
  after convergence. The number of turbo iterations it takes to converge is
  realization- and <math|E<rsub|b>/N<rsub|o>>-dependent.> and 5 equalizer
  iterations, whereas \P<with|font-family|ss|font-size|0.84|GAMP-#>\Q alone
  indicates that the MC block was disconnected (i.e., there was no attempt to
  exploit tap clustering).

  The curves in <figref|ber<rsub|v>s<rsub|N>p> exhibit a \Pnotched\Q shape
  because, as <math|<Np>> increases, the code rate <math|R> must decrease to
  maintain the fixed spectral efficiency <math|\<eta\>=2> bpcu; while an
  increase in <math|<Np>> generally makes channel estimation easier, the
  reduction in <math|R> makes data decoding more difficult. For all schemes
  under comparison, <figref|ber<rsub|v>s<rsub|N>p> suggests that the choice
  <math|<Np><space|-0.17em>\<approx\><space|-0.17em>224> is optimal under the
  chosen operating conditions. Overall, we see JCED-GAMP significantly
  outperforming both DCED-LMMSE and DCED-LASSO even after one turbo
  iteration. Moreover, we see a noticeable gain from the use of the MC block
  during the first two turbo iterations, and after turbo convergence if too
  few pilots are used (i.e., <math|<Np>\<less\>224>).
  <putFrag|ber<rsub|v>s<rsub|N>p|<math|<BER>> versus number of pilot
  subcarriers <math|<Np>>, for <math|E<rsub|b>/N<rsub|o><space|-0.17em>=<space|-0.17em>11>
  dB, <math|<Mt><space|-0.17em>=<space|-0.17em>0> training bits,
  <math|\<eta\><space|-0.17em>=<space|-0.17em>2> bpcu, and
  <math|16>-QAM.|<figsize>|<psfrag*|Np|t>[t][0.9]<math|<Np>> <psfrag*|average
  BER|>[][0.9]<math|<BER>> <psfrag*|M=4, SNR=14dB, bpcu=2|>[][0.9]>

  <subsection|<math|<BER>> versus <math|E<rsub|b>/N<rsub|o>>>

  <Figref|ber<rsub|v>s<rsub|s>nr<rsub|N>p> shows <math|<BER>> versus
  <math|E<rsub|b>/N<rsub|o>> using <math|<Np><space|-0.17em>=<space|-0.17em>224>
  pilot subcarriers (as suggested by <figref|ber<rsub|v>s<rsub|N>p>) and
  <math|<Mt><space|-0.17em>=<space|-0.17em>0>. There, we see DCED-LASSO
  performing about <math|5> dB from perfect-CSI, and DCED-LMMSE performing
  significantly worse. Remarkably, we see JCED-GAMP performing within
  <math|1> dB of soft decoding under perfect-CSI (and within <math|1.5> dB
  after only <math|2> turbo iterations). The proximity of the perfect-CSI and
  JCED-GAMP <math|<BER>> traces confirms that the proposed GM2-HMM prior does
  an excellent job of capturing the lag-dependent clustered-sparse
  characteristics of the true channel taps. Consistent with
  <figref|ber<rsub|v>s<rsub|N>p>, we see that JCED-GAMP benefits
  significantly from the use of the MC block during the initial turbo
  iterations, and less significantly after turbo convergence.
  <putFrag|ber<rsub|v>s<rsub|s>nr<rsub|N>p|<math|<BER>> versus
  <math|E<rsub|b>/N<rsub|o>>, for <math|<Np><space|-0.17em>=<space|-0.17em>224>
  pilot subcarriers, <math|<Mt><space|-0.17em>=<space|-0.17em>0> training
  bits, <math|\<eta\><space|-0.17em>=<space|-0.17em>2> bpcu, and
  <math|16>-QAM.|<figsize>|<psfrag*|Np|t>[t][0.9]<math|<Np>> <psfrag*|average
  BER|>[][0.9]<math|<BER>> <psfrag*|Eb / No
  [dB]|>[][0.9]<math|E<rsub|b>/N<rsub|o>> [dB] <psfrag*|M=4, Np=224, Mt=0,
  bpcu=2|>[][0.9]>

  <subsection|<math|<BER>> versus the number of interspersed training bits
  <math|<Mt>>>

  Although <math|<Np>\<gtr\>0> pilot subcarriers are required for DCED
  channel estimation, JCED can function with
  <math|<Np><space|-0.17em>=<space|-0.17em>0> as long as <math|<Mt>\<gtr\>0>
  interspersed training bits are used. To illustrate this fact,
  <figref|ber<rsub|v>s<rsub|M>t> shows <math|<BER>> versus <math|<Mt>> at
  <math|E<rsub|b>/N<rsub|o><space|-0.17em>=<space|-0.17em>10> dB, a fixed
  spectral efficiency of <math|\<eta\><space|-0.17em>=<space|-0.17em>2> bpcu,
  and <math|<Np><space|-0.17em>=<space|-0.17em>0>. There we see that there is
  a relatively wide tolerance on <math|<Mt>>, although the value
  <math|<Mt>\<approx\>450> appears best when convergence speed is taken into
  account. Moreover, we can see a small but noticeable BER improvement when
  the MC block is used. More importantly, by comparing the <math|<BER>>
  performance in <figref|ber<rsub|v>s<rsub|M>t> to that in
  <figref|ber<rsub|v>s<rsub|s>nr<rsub|N>p> at
  <math|E<rsub|b>/N<rsub|o><space|-0.17em>=<space|-0.17em>10> dB, we see that
  the <math|<BER>> is about <math|6\<times\>> lower in
  <figref|ber<rsub|v>s<rsub|M>t>. Thus, we conclude that the use of
  interspersed training bits is more efficient than the use of dedicated
  pilot subcarriers. A similar observation was made in
  <cite|Schniter:ASIL:10|Schniter:PHYCOM:11> under somewhat different
  channels and message passing algorithms.
  <putFrag|ber<rsub|v>s<rsub|M>t|<math|<BER>> versus number of interspersed
  training bits <math|<Mt>>, for <math|E<rsub|b>/N<rsub|o>=10> dB,
  <math|<Np><space|-0.17em>=<space|-0.17em>0> pilot subcarriers,
  <math|\<eta\><space|-0.17em>=<space|-0.17em>2> bpcu, and
  <math|16>-QAM.|<figsize>|<psfrag*|Mt|t>[t][0.9]<math|<Mt>> <psfrag*|average
  BER|>[][0.9]<math|<BER>> <psfrag*|M=4, SNR=13dB, bpcu=2|>[][0.9]>

  <section|Conclusion><label|sec:conc>

  In this paper, we presented a factor-graph approach to joint
  channel-estimation and decoding (JCED) for BICM-OFDM that combines recent
  advances in message-passing algorithms for structured-sparse signal
  reconstruction <cite|Rangan:10b|Schniter:CISS:10> and SISO decoding
  <cite|MacKay:Book:03>. Different from existing factor-graph approaches to
  JCED, ours is able to exploit the lag-dependent clustered sparsity that
  typifies channel taps under large communication bandwidths, such as those
  that result from pulse-shaped communication over IEEE<nbsp>802.15.4a
  channels. For this purpose, we proposed the use of a two-state Gaussian
  mixture prior with a Markov model on the hidden tap states. The
  implementation complexity of our JCED scheme is dominated by
  <math|<mc|O><around|(|N*log<rsub|2>N<space|-0.17em>+<space|-0.17em>N*2<rsup|M>|)>>
  multiplies per GAMP iteration, facilitating the application to systems with
  many subcarriers <math|N> and channel taps <math|L\<less\>N>. Experiments
  with IEEE<nbsp>802.15.4a channels showed <math|<BER>> performance within
  <math|1> dB of the known-channel bound, and about <math|4> dB better than
  the LASSO-based \Pcompressed channel sensing\Q approach. These experiments
  also suggested that, for JCED, the use of interspersed training bits is
  more efficient than the placement of known bits in dedicated pilot
  subcarriers.

  <appendices>

  <section|Derivation of GAMP Functions <math|g<out>> and
  <math|g<rprime|'><out>>><label|app:out>

  In this appendix, we derive the GAMP quantities
  <math|g<out><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>> and
  <math|g<rprime|'><out><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>> given in
  <eqref|gout>-<eqref|ei>.

  From (D1), we have that

  <eqnarray|<tformat|<table|<row|<cell|<E><rsub|Z<rsub|i>\|Y<rsub|i>><around|{|z<giv>y;<wide|z|^>,\<mu\><rsup|z>|}>>|<cell|=>|<cell|<frac|1|p<rsub|Y<rsub|i>><around|(|y|)>>*<big|int><rsub|z>z*<space|0.17em>p<rsub|Y<rsub|i>\|Z<rsub|i>><around|(|y\|z|)><space|0.17em><mc|C*N><around|(|z;<wide|z|^>,\<mu\><rsup|z>|)>,<space|1em><eq-number><label|eq:gout2>>>>>>

  where <math|p<rsub|Y<rsub|i>><around|(|y|)><defn><big|int><rsub|z>p<rsub|Y<rsub|i>\|Z<rsub|i>><around|(|y\|z|)><mc|C*N><around|(|z;<wide|z|^>,\<mu\><rsup|z>|)>>.
  From <eqref|pY\|Z>, we rewrite <math|p<rsub|Y<rsub|i>\|Z<rsub|i>><around|(|y\|z|)>>
  as

  <eqnarray|<tformat|<table|<row|<cell|p<rsub|Y<rsub|i>\|Z<rsub|i>><around|(|y\|z|)>>|<cell|=>|<cell|<big|sum><rsub|k=1><rsup|2<rsup|M>><frac|\<beta\><rsub|i><of|k>|s<of|k>><space|0.17em><mc|C*N><around*|(|z;<frac|y|s<of|k>>,<frac|\<mu\><rsup|v>|<around|\||s<of|k>|\|><rsup|2>>|)>,<eq-number>>>>>>

  so that

  <eqnarray|<tformat|<table|<row|<cell|<big|int><rsub|z>z*<space|0.17em>p<rsub|Y<rsub|i>\|Z<rsub|i>><around|(|y\|z|)><mc|C*N><around|(|z;<wide|z|^>,\<mu\><rsup|z>|)>>|<cell|=>|<cell|<big|sum><rsub|k=1><rsup|2<rsup|M>><frac|\<beta\><rsub|i><of|k>|s<of|k>>*<big|int><rsub|z>z<space|0.17em><mc|C*N><around*|(|z;<frac|y|s<of|k>>,<frac|\<mu\><rsup|v>|<around|\||s<of|k>|\|><rsup|2>>|)><mc|C*N><around|(|z;<wide|z|^>,\<mu\><rsup|z>|)><space|1em><eq-number>>>|<row|<cell|p<rsub|Y<rsub|i>><around|(|y|)>>|<cell|=>|<cell|<big|sum><rsub|k=1><rsup|2<rsup|M>><frac|\<beta\><rsub|i><of|k>|s<of|k>>*<big|int><rsub|z><mc|C*N><around*|(|z;<frac|y|s<of|k>>,<frac|\<mu\><rsup|v>|<around|\||s<of|k>|\|><rsup|2>>|)><mc|C*N><around|(|z;<wide|z|^>,\<mu\><rsup|z>|)>.<eq-number>>>>>>

  Using the property that

  <eqnarray|<tformat|<table|<row|<cell|<mc|C*N><around|(|x;<wide|\<theta\>|^>,\<mu\><rsup|\<theta\>>|)><mc|C*N><around|(|x;<wide|\<phi\>|^>,\<mu\><rsup|\<phi\>>|)>>|<cell|=>|<cell|<mc|C*N><around*|(|x;<frac|<wide|\<theta\>|^>/\<mu\><rsup|\<theta\>>+<wide|\<phi\>|^>/\<mu\><rsup|\<phi\>>|1/\<mu\><rsup|\<theta\>>+1/\<mu\><rsup|\<phi\>>>,<frac|1|1/\<mu\><rsup|\<theta\>>+1/\<mu\><rsup|\<phi\>>>|)><mc|C*N><around|(|0;<wide|\<theta\>|^>-<wide|\<phi\>|^>,\<mu\><rsup|\<theta\>>+\<mu\><rsup|\<phi\>>|)>,<eq-number><label|eq:pogr>>>>>>

  we can rewrite

  <eqnarray|<tformat|<table|<row|<cell|<lefteqn|<big|int><rsub|z>z*<space|0.17em>p<rsub|Y<rsub|i>\|Z<rsub|i>><around|(|y\|z|)><space|0.17em><mc|C*N><around|(|z;<wide|z|^>,\<mu\><rsup|z>|)>>>|<cell|>|<cell|>>|<row|<cell|>|<cell|=>|<cell|<big|sum><rsub|k=1><rsup|2<rsup|M>><frac|\<beta\><rsub|i><of|k>|s<of|k>><mc|C*N><around*|(|0;<frac|y|s<of|k>>-<wide|z|^>,<frac|\<mu\><rsup|v>|<around|\||s<of|k>|\|><rsup|2>>+\<mu\><rsup|z>|)>*<big|int><rsub|z>z<space|0.17em><mc|C*N><around*|(|z;<frac|<frac|y|s<of|k>>*<frac|<around|\||s<of|k>|\|><rsup|2>|\<mu\><rsup|v>>+<frac|<wide|z|^>|\<mu\><rsup|z>>|<frac|<around|\||s<of|k>|\|><rsup|2>|\<mu\><rsup|v>>+<frac|1|\<mu\><rsup|z>>>,<frac|1|<frac|<around|\||s<of|k>|\|><rsup|2>|\<mu\><rsup|v>>+<frac|1|\<mu\><rsup|z>>>|)><eq-number><label|eq:prod>>>|<row|<cell|>|<cell|=>|<cell|<big|sum><rsub|k=1><rsup|2<rsup|M>><frac|\<beta\><rsub|i><of|k>|s<of|k>><mc|C*N><around*|(|<frac|y|s<of|k>>;<wide|z|^>,<frac|\<mu\><rsup|v>|<around|\||s<of|k>|\|><rsup|2>>+\<mu\><rsup|z>|)>*<frac|<frac|y|s<of|k>>*<frac|<around|\||s<of|k>|\|><rsup|2>|\<mu\><rsup|v>>+<frac|<wide|z|^>|\<mu\><rsup|z>>|<frac|<around|\||s<of|k>|\|><rsup|2>|\<mu\><rsup|v>>+<frac|1|\<mu\><rsup|z>>><space|1em><eq-number>>>|<row|<cell|>|<cell|=>|<cell|<big|sum><rsub|k=1><rsup|2<rsup|M>>\<beta\><rsub|i><of|k><mc|C*N><around*|(|y;s<of|k><wide|z|^>,<around|\||s<of|k>|\|><rsup|2>*\<mu\><rsup|z>+\<mu\><rsup|v>|)>*<around*|(|<wide*|<around*|(|<frac|y|s<of|k>>-<wide|z|^>|)>*<frac|<around|\||s<of|k>|\|><rsup|2>*\<mu\><rsup|z>|<around|\||s<of|k>|\|><rsup|2>*\<mu\><rsup|z>+\<mu\><rsup|v>>|\<wide-underbrace\>><rsub|<with|math-display|true|<defn><wide|e|^><of|k><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>>>+<wide|z|^>|)><eq-number><label|eq:intz>>>>>>

  and, using the same procedure, we get

  <eqnarray|<tformat|<table|<row|<cell|p<rsub|Y<rsub|i>><around|(|y|)>>|<cell|=>|<cell|<big|sum><rsub|k=1><rsup|2<rsup|M>>\<beta\><rsub|i><of|k><mc|C*N><around*|(|y;s<of|k><wide|z|^>,<around|\||s<of|k>|\|><rsup|2>*\<mu\><rsup|z>+\<mu\><rsup|v>|)>.<eq-number><label|eq:pY>>>>>>

  With <math|\<xi\><rsub|i><of|k><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>>
  defined in <eqref|xi>, equations <eqref|gout2>, <eqref|intz>, and
  <eqref|pY> combine to give

  <eqnarray|<tformat|<table|<row|<cell|<E><rsub|Z<rsub|i>\|Y<rsub|i>><around|{|z<giv>y;<wide|z|^>,\<mu\><rsup|z>|}>>|<cell|=>|<cell|<big|sum><rsub|k=1><rsup|2<rsup|M>>\<xi\><rsub|i><of|k><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>*<around*|(|<wide|e|^><of|k><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>+<wide|z|^>|)>.<space|1em><eq-number><label|eq:EZY>>>>>>

  Finally, from <eqref|EZY> and the definition of
  <math|g<out><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>> in (D2), equation
  <eqref|gout> follows immediately.

  From (D1), we have that

  <eqnarray|<tformat|<table|<row|<cell|<value|var><rsub|Z<rsub|i>\|Y<rsub|i>><around|{|z<giv>y;<wide|z|^>,\<mu\><rsup|z>|}>>|<cell|=>|<cell|<frac|1|p<rsub|Y<rsub|i>><around|(|y|)>>*<big|int><rsub|z><around|\||z-<E><rsub|Z<rsub|i>\|Y<rsub|i>><around|{|z<giv>y;<wide|z|^>,\<mu\><rsup|z>|}>|\|><rsup|2>*<space|0.17em>p<rsub|Y<rsub|i>\|Z<rsub|i>><around|(|y\|z|)><space|0.17em><mc|C*N><around|(|z;<wide|z|^>,\<mu\><rsup|z>|)>.<eq-number><label|eq:varZY2>>>>>>

  Similar to <eqref|prod>, we can write

  <eqnarray|<tformat|<table|<row|<cell|<lefteqn|<big|int><rsub|z><around|\||z-<E><rsub|Z<rsub|i>\|Y<rsub|i>><around|{|z<giv>y;<wide|z|^>,\<mu\><rsup|z>|}>|\|><rsup|2>*<space|0.17em>p<rsub|Y<rsub|i>\|Z<rsub|i>><around|(|y\|z|)><space|0.17em><mc|C*N><around|(|z;<wide|z|^>,\<mu\><rsup|z>|)>>>|<cell|>|<cell|>>|<row|<cell|>|<cell|=>|<cell|<big|sum><rsub|k=1><rsup|2<rsup|M>><frac|\<beta\><rsub|i><of|k>|s<of|k>><mc|C*N><around*|(|0;<frac|y<rsub|i>|s<of|k>>-<wide|z|^>,<frac|\<mu\><rsup|v>|<around|\||s<of|k>|\|><rsup|2>>+\<mu\><rsup|z>|)>>>|<row|<cell|>|<cell|>|<cell|\<times\><big|int><rsub|z><around|\||z-<E><rsub|Z<rsub|i>\|Y<rsub|i>><around|{|z<giv>y;<wide|z|^>,\<mu\><rsup|z>|}>|\|><rsup|2><space|0.17em><mc|C*N><around*|(|z;<frac|<frac|y|s<of|k>>*<frac|<around|\||s<of|k>|\|><rsup|2>|\<mu\><rsup|v>>+<frac|<wide|z|^>|\<mu\><rsup|z>>|<frac|<around|\||s<of|k>|\|><rsup|2>|\<mu\><rsup|v>>+<frac|1|\<mu\><rsup|z>>>,<frac|1|<frac|<around|\||s<of|k>|\|><rsup|2>|\<mu\><rsup|v>>+<frac|1|\<mu\><rsup|z>>>|)>.<eq-number>>>>>>

  Then, using the change of variable <math|<wide|z|~><defn>z-<E><rsub|Z<rsub|i>\|Y<rsub|i>><around|{|z<giv>y;<wide|z|^>,\<mu\><rsup|z>|}>>
  and absorbing the <math|s<of|k>> terms as done in <eqref|intz>, we get

  <eqnarray|<tformat|<table|<row|<cell|<lefteqn|<big|int><rsub|z><around|\||z-<E><rsub|Z<rsub|i>\|Y<rsub|i>><around|{|z<giv>y;<wide|z|^>,\<mu\><rsup|z>|}>|\|><rsup|2>*<space|0.17em>p<rsub|Y<rsub|i>\|Z<rsub|i>><around|(|y\|z|)><space|0.17em><mc|C*N><around|(|z;<wide|z|^>,\<mu\><rsup|z>|)>>>|<cell|>|<cell|>>|<row|<cell|>|<cell|=>|<cell|<big|sum><rsub|k=1><rsup|2<rsup|M>>\<beta\><rsub|i><of|k><mc|C*N><around*|(|y<rsub|i>;s<of|k><wide|z|^>,<around|\||s<of|k>|\|><rsup|2>*\<mu\><rsup|z>+\<mu\><rsup|v>|)>>>|<row|<cell|>|<cell|>|<cell|\<times\><big|int><rsub|<wide|z|~>><around|\||<wide|z|~>|\|><rsup|2><space|0.17em><mc|C*N><around*|(|<wide|z|~>;<wide|e|^><of|k>+<wide*|<wide|z|^>-<E><rsub|Z<rsub|i>\|Y<rsub|i>><around|{|z<giv>y;<wide|z|^>,\<mu\><rsup|z>|}>|\<wide-underbrace\>><rsub|<with|math-display|true|=-<wide|e|^><rsub|i>>>,<frac|\<mu\><rsup|v>*\<mu\><rsup|z>|<around|\||s<of|k>|\|><rsup|2>*\<mu\><rsup|z>+\<mu\><rsup|v>>|)><eq-number>>>|<row|<cell|>|<cell|=>|<cell|<big|sum><rsub|k=1><rsup|2<rsup|M>>\<beta\><rsub|i><of|k><mc|C*N><around*|(|y<rsub|i>;s<of|k><wide|z|^>,<around|\||s<of|k>|\|><rsup|2>*\<mu\><rsup|z>+\<mu\><rsup|v>|)>*<around*|(|<around|\||<wide|e|^><of|k>-<wide|e|^><rsub|i>|\|><rsup|2>+<frac|\<mu\><rsup|v>*\<mu\><rsup|z>|<around|\||s<of|k>|\|><rsup|2>*\<mu\><rsup|z>+\<mu\><rsup|v>>|)>.<eq-number><label|eq:intz2>>>>>>

  Using <math|\<xi\><rsub|i><of|k><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>>
  defined in <eqref|xi> and <math|\<zeta\><of|k><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>>
  defined in <eqref|zeta>, equations <eqref|pY>, <eqref|varZY2>, and
  <eqref|intz2> combine to give

  <eqnarray|<tformat|<table|<row|<cell|<value|var><rsub|Z<rsub|i>\|Y<rsub|i>><around|{|z<giv>y;<wide|z|^>,\<mu\><rsup|z>|}>>|<cell|=>|<cell|<big|sum><rsub|k=1><rsup|2<rsup|M>>\<xi\><rsub|i><of|k><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>*<around*|(|<frac|\<mu\><rsup|v>*\<zeta\><of|k><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>|<around|\||s<of|k>|\|><rsup|2>>+<mid|\|><wide|e|^><rsub|i><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>-<wide|e|^><of|k><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)><mid|\|><rsup|2>|)>.<eq-number><label|eq:varZY>>>>>>

  This is rewritten as <math|\<mu\><rsup|e><rsub|i><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)><defn><value|var><rsub|Z<rsub|i>\|Y<rsub|i>><around|{|z<giv>y;<wide|z|^>,\<mu\><rsup|z>|}>>
  in <eqref|muei>. Finally, plugging <math|\<mu\><rsup|e><rsub|i><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>>
  into the definition of <math|g<rprime|'><out><around|(|y,<wide|z|^>,\<mu\><rsup|z>|)>>
  in (D3), we immediately obtain <eqref|g'out>.

  <section|Derivation of GAMP Functions <math|g<inp>> and
  <math|g<rprime|'><inp>>><label|app:in>

  In this appendix, we derive the GAMP quantities
  <math|g<inp><around|(|<wide|r|^>,\<mu\><rsup|r>|)>> and
  <math|g<rprime|'><inp><around|(|<wide|r|^>,\<mu\><rsup|r>|)>> given in
  <eqref|gin>-<eqref|alfj>.

  From (D4)-(D6), we note that <math|g<inp><around|(|<wide|r|^>,\<mu\><rsup|r>|)>>
  and <math|\<mu\><rsup|r>*g<rprime|'><inp><around|(|<wide|r|^>,\<mu\><rsup|r>|)>>
  are the mean and variance, respectively, of the pdf

  <eqnarray|<tformat|<table|<row|<cell|<frac|1|Z<rsub|j>>*p<rsub|X<rsub|j>><space|-0.17em><around|(|r|)><space|0.17em><mc|C*N><around|(|r;<wide|r|^>,\<mu\><rsup|r>|)>,<eq-number><label|eq:pdfin>>>>>>

  where <math|Z<rsub|j><defn><big|int><rsub|r>p<rsub|X<rsub|j>><space|-0.17em><around|(|r|)><space|0.17em><mc|C*N><around|(|r;<wide|r|^>,\<mu\><rsup|r>|)>>.
  Using <eqref|pogr> together with the definition of
  <math|p<rsub|X<rsub|j>><space|-0.17em><around|(|.|)>> from <eqref|pxj>, we
  find that

  <eqnarray|<tformat|<table|<row|<cell|<lefteqn|p<rsub|X<rsub|j>><space|-0.17em><around|(|r|)><space|0.17em><mc|C*N><around|(|r;<wide|r|^>,\<mu\><rsup|r>|)>>>|<cell|>|<cell|>>|<row|<cell|>|<cell|=>|<cell|\<lambda\><rsub|j><mc|C*N><around|(|r;0,\<mu\><rsub|j><rsup|1>|)><space|0.17em><mc|C*N><around|(|r;<wide|r|^>,\<mu\><rsup|r>|)>+<around|(|1-\<lambda\><rsub|j>|)><mc|C*N><around|(|r;0,\<mu\><rsub|j><rsup|0>|)><space|0.17em><mc|C*N><around|(|r;<wide|r|^>,\<mu\><rsup|r>|)><eq-number>>>|<row|<cell|>|<cell|=>|<cell|<with|math-display|false|<tformat|<table|<row|<cell|\<lambda\><rsub|j><mc|C*N><around|(|<wide|r|^>;0,\<mu\><rsup|1><rsub|j>+\<mu\><rsup|r>|)><space|0.17em><mc|C*N><around*|(|r;<wide|r|^>*\<gamma\><rsup|1><rsub|j><around|(|\<mu\><rsup|r>|)>,\<mu\><rsup|r>*\<gamma\><rsup|1><rsub|j><around|(|\<mu\><rsup|r>|)>|)>>|<cell|>|<cell|>>|<row|<cell|>|<cell|>|<cell|+<around|(|1-\<lambda\><rsub|j>|)><mc|C*N><around|(|<wide|r|^>;0,\<mu\><rsup|0><rsub|j>+\<mu\><rsup|r>|)><space|0.17em><mc|C*N><around*|(|r;<wide|r|^>*\<gamma\><rsup|0><rsub|j><around|(|\<mu\><rsup|r>|)>,\<mu\><rsup|r>*\<gamma\><rsup|0><rsub|j><around|(|\<mu\><rsup|r>|)>|)>>>>>>>>>>>

  for <math|\<gamma\><rsup|0><rsub|j><around|(|\<mu\><rsup|r>|)><defn><around|(|1+\<mu\><rsup|r>/\<mu\><rsub|j><rsup|0>|)><rsup|-1>>
  and <math|\<gamma\><rsup|1><rsub|j><around|(|\<mu\><rsup|r>|)><defn><around|(|1+\<mu\><rsup|r>/\<mu\><rsub|j><rsup|1>|)><rsup|-1>>.
  This implies that

  <eqnarray|<tformat|<table|<row|<cell|Z<rsub|j>>|<cell|=>|<cell|\<lambda\><rsub|j><mc|C*N><around|(|<wide|r|^>;0,\<mu\><rsup|1><rsub|j>+\<mu\><rsup|r>|)>+<around|(|1-\<lambda\><rsub|j>|)><mc|C*N><around|(|<wide|r|^>;0,\<mu\><rsup|0><rsub|j>+\<mu\><rsup|r>|)>.<eq-number>>>>>>

  Thus, the mean obeys

  <eqnarray|<tformat|<table|<row|<cell|g<inp><around|(|<wide|r|^>,\<mu\><rsup|r>|)>>|<cell|=>|<cell|<frac|1|Z<rsub|j>>*<big|int><rsub|r>r*<space|0.17em>p<rsub|X<rsub|j>><space|-0.17em><around|(|r|)><space|0.17em><mc|C*N><around|(|r;<wide|r|^>,\<mu\><rsup|r>|)><eq-number>>>|<row|<cell|>|<cell|=>|<cell|<wide*|<frac|\<lambda\><rsub|j><mc|C*N><around|(|<wide|r|^>;0,\<mu\><rsub|j><rsup|1>+\<mu\><rsup|r>|)>|Z<rsub|j>>|\<wide-underbrace\>><rsub|<with|math-display|true|=\<alpha\><rsub|j><around|(|<wide|r|^>,\<mu\><rsup|r>|)>>>\<gamma\><rsub|j><rsup|1><around|(|\<mu\><rsup|r>|)>*<space|0.17em><wide|r|^>+<wide*|<frac|<around|(|1-\<lambda\><rsub|j>|)><mc|C*N><around|(|<wide|r|^>;0,\<mu\><rsub|j><rsup|0>+\<mu\><rsup|r>|)>|Z<rsub|j>>|\<wide-underbrace\>><rsub|<with|math-display|true|=1-\<alpha\><rsub|j><around|(|<wide|r|^>,\<mu\><rsup|r>|)>>>\<gamma\><rsub|j><rsup|0><around|(|\<mu\><rsup|r>|)>*<space|0.17em><wide|r|^>,<eq-number><label|eq:gin2>>>>>>

  yielding <eqref|gin>, where a straightforward manipulation relates the
  expression for <math|\<alpha\><rsub|j><around|(|<wide|r|^>,\<mu\><rsup|r>|)>>
  above to its definition in <eqref|alfj>.

  Since, for the pdf in <eqref|pdfin>, <math|g<inp>> is the mean and
  <math|\<mu\><rsup|r>*g<rprime|'><inp>> is the variance, we can write

  <eqnarray|<tformat|<table|<row|<cell|\<mu\><rsup|r>*g<rprime|'><inp><around|(|<wide|r|^>,\<mu\><rsup|r>|)>>|<cell|=>|<cell|<frac|1|Z<rsub|j>>*<big|int><rsub|r><around|\||r|\|><rsup|2>*<space|0.17em>p<rsub|X<rsub|j>><space|-0.17em><around|(|r|)><space|0.17em><mc|C*N><around|(|r;<wide|r|^>,\<mu\><rsup|r>|)>-<around|\||g<inp>|\|><rsup|2><eq-number>>>|<row|<cell|>|<cell|=>|<cell|\<alpha\><rsub|j>*<around*|(|<around|\||<wide|r|^>*\<gamma\><rsup|1><rsub|j>|\|><rsup|2>+\<mu\><rsup|r>*\<gamma\><rsup|1><rsub|j>|)>+<around|(|1-\<alpha\><rsub|j>|)>*<around*|(|<around|\||<wide|r|^>*\<gamma\><rsup|0><rsub|j>|\|><rsup|2>+\<mu\><rsup|r>*\<gamma\><rsup|0><rsub|j>|)>-<mid|\|>\<alpha\><rsub|j>*\<gamma\><rsup|1><rsub|j>*<wide|r|^>+<around|(|1-\<alpha\><rsub|j>|)>*\<gamma\><rsup|0><rsub|j>*<wide|r|^><mid|\|><rsup|2>,<eq-number><label|eq:g'in2>>>>>>

  which can be simplified to yield <eqref|g'in>.

  <\bibliography|bib|ieeetr|macros_abbrev,books,misc,comm,multicarrier,sparse,stc,underwater>
    <bib-list|[99]|>
  </bibliography>

  <assign|baselinestretch|<macro|1.0>>
</body>