\documentclass[12pt]{article}
\usepackage[french]{babel}
\voffset -2.8 cm
\textwidth 17 cm
\textheight 24.7 cm
\oddsidemargin -.54 cm
\evensidemargin -1.54 cm
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{enumerate}
\usepackage[latin1]{inputenc}
%\usepackage[charter,cal=cmcal,sfscaled=false]{mathdesign}
\pagestyle{plain}

\begin{document}

\centerline {\Large \sc Information and coding theory}
\bigskip \bigskip
\hrule
\bigskip \bigskip
\centerline {\underline{\Large Solution of exercise sheet n$^\circ$ 5}:}
\bigskip

\noindent {\bf 5-1.}
\par
\medskip
Channel capacity: $ C= \max_{p(x)} I(X:Y) $. There are three ways to calculate $I(X:Y)$:
\begin{enumerate}
\item $I(X:Y) =\sum_{x,y} p(x,y) \log_2 \frac{p(x,y)}{p(x)p(y)}$.
\item $I(X:Y)=H(X)-H(X|Y)$.
\item $I(X:Y)=H(Y)-H(Y|X)$.
\end{enumerate}
Note that for the (memoryless) additive noise channel where the input $X$ and the noise $Z$ are independent we can use the relation $H(Y|X)=H(X+Z|X)=H(Z)$. Therefore, for the calculation of the capacity we can use the equation
\begin{align}\label{eq:capacity}
C = \max_{p(x)}\{H(Y)\} - H(Z),
\end{align}
where the second term $H(Z)$ does not depend on $X$ (and thus not on $p(x)$). To compute the capacity as a function of $a$ we need to consider three cases:
\begin{enumerate}[(I)]
\item {\bf $a = 0$:} No noise is added, thus $Y=X$ and $H(Z)=0$. The capacity is therefore $C= \max_{p} H(p,1-p)=1$ bit.
\item {\bf $a > 1$:} The output alphabet reads ${\mathcal Y}= \{ 0,1,a,1+a \}$. For the input variable $X$ we define the general probability distribution $P(X=0)=p$, $P(X=1)=1-p$. Then we can compute the probability distribution of the output $Y$:
\begin{align*}
P(Y=0)&=P(X=0)\cdot P(Z=0)=\frac{p}{2},\\
P(Y=1)&=P(X=1)\cdot P(Z=0)=\frac{1-p}{2},\\
P(Y=a)&=P(X=0)\cdot P(Z=a)=\frac{p}{2},\\
P(Y=a+1)&=P(X=1)\cdot P(Z=a)=\frac{1-p}{2}.
\end{align*}
We conclude that each output can be associated with a unique combination of input $X$ and noise $Z$, so there is no error. We recover the result of (I) by substituting the probability distribution $p(y)$ in equation \eqref{eq:capacity}:
\[
C = \max_p\{-p \log_2(\frac{p}{2}) -(1-p)\log_2(\frac{1-p}{2})\} - 1 \textrm{ bit}.
\]
This is maximized by $p=\frac{1}{2}$, for which $C=1$ bit.
\item {\bf $a=1$:} In this case, ${\mathcal Y} = \{ 0,1,2 \}$. Similarly to the reasoning above, if one obtains $Y=0$ or $Y=2$ there is no error when guessing the $X$ that was sent. However, $Y=1$ corresponds to either $X=0, Z=1$ or $X=1, Z=0$. We find the output probability distribution $p(y) = \{\frac{p}{2}, \frac{1}{2}, \frac{1-p}{2} \}$. Substituting this in the capacity formula \eqref{eq:capacity} we find that $C=\frac{1}{2}$ bit for $p=\frac{1}{2}$ (see the explicit evaluation below).
\end{enumerate}
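For completeness, one way to obtain the value quoted in case (III) explicitly, using only Eq.~\eqref{eq:capacity} and the output distribution found above, is:
\begin{align*}
H(Y) &= H\Big(\frac{p}{2},\frac{1}{2},\frac{1-p}{2}\Big)
      = -\frac{p}{2}\log_2\frac{p}{2} - \frac{1}{2}\log_2\frac{1}{2} - \frac{1-p}{2}\log_2\frac{1-p}{2}
      = 1 + \frac{1}{2}H(p,1-p),\\
C &= \max_p\{H(Y)\} - H(Z) = \Big(1+\frac{1}{2}\Big) - 1 = \frac{1}{2} \textrm{ bit},
\end{align*}
where the maximum is attained at $p=\frac{1}{2}$ and $H(Z)=H(\frac{1}{2},\frac{1}{2})=1$ bit.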
\bigskip

\noindent {\bf 5-2.}
This exercise can be solved in the same way as ex. {\bf 5-1}, \emph{i.e.}, because the input $X$ and the noise $Z$ are independent we can use Eq.~\eqref{eq:capacity} (warning: in general equation~\eqref{eq:capacity} is \textbf{not valid}!).
\par
\medskip
\noindent (a) We again parametrize the input probability distribution, but now, as the input takes 4 values, we set it to $p(x)=\{a,b,c,d\}$ where $a+b+c+d=1$ (alternatively one can include the constraint in the parametrization and write $p(x)=\{a,b,c,1-a-b-c\}$). Note that $-1 \bmod 4 = 3$, so the output alphabet reads ${\mathcal Y} = \{ 0,1,2,3 \}$. Now we have to find the parameters $a,b,c,d$ that maximize the output entropy $H(Y)$. We can express (as in ex. {\bf 5-1}) $p(y)$ as a function of $a,b,c,d$:
\begin{align*}
p(y)=\Big\{ \frac{a}{4}+\frac{c}{4}+\frac{d}{2},\; \frac{b}{4}+\frac{c}{2}+\frac{d}{4},\; \frac{a}{4}+\frac{b}{2}+\frac{c}{4},\; \frac{a}{2}+\frac{b}{4}+\frac{d}{4} \Big\}.
\end{align*}
Now we try to find $a,b,c,d$ such that the (optimal) uniform distribution $p(y)=\{\frac{1}{4},\frac{1}{4},\frac{1}{4},\frac{1}{4}\}$ is reached (warning: in general it may \textbf{not be possible} to achieve this! Then one needs to use Lagrange multipliers). We have four equations plus one equation for the constraint $a+b+c+d=1$ to solve (one equation is linearly dependent on the others). We find that $a=c$ and $b=d$. That is, there are infinitely many solutions, and among them $a=b=c=d=1/4$, \emph{i.e.}, the uniform distribution for $X$ is a solution.
\par
\medskip
\noindent (b) When we substitute the solution of (a) in Eq.~\eqref{eq:capacity} we find $C=\frac{1}{2}$ bit.
\par
\medskip
\noindent (c) We need to sum both noises, $Z_{\mathrm{total}}=Z_{1}+Z_{2}$, and again follow a calculation as in (a). One obtains the new probability distribution by noting that $(-2 \bmod{4})=(2 \bmod{4})=2$: $Z_{\mathrm{total}}$ takes the values $\{-1, 0, 1, 2\}$ with probabilities $\{\frac{1}{4}, \frac{3}{8}, \frac{1}{4}, \frac{1}{8}\}$. We thus have $H(Z)=H(\frac{1}{4}, \frac{3}{8}, \frac{1}{4}, \frac{1}{8})$ and we find $C=\log_2 4-1.91=0.09$ bits.
\par
\medskip
\noindent (d) $C_{\mathrm{total}}=C_{1}+C_{2}=1$ bit. (The capacity is additive!)

\bigskip
\noindent {\bf 5-3.}
\par
\medskip
(a) With $p(X=1)=p$ and $p(X=0)=1-p$ one obtains
\begin{equation}
I(X:Y)=H\Big(1-\frac{p}{2},\frac{p}{2}\Big)-p. \nonumber
\end{equation}
Taking into account that
\begin{equation}
\frac{\partial H(x,1-x)}{\partial x}=\log_2\frac{1-x}{x},
\end{equation}
setting $\frac{\partial I(X:Y)}{\partial p}=\frac{1}{2}\log_2\frac{1-p/2}{p/2}-1=0$ gives $\frac{2-p}{p}=4$. The maximum is thus found for $p^*=\frac{2}{5}$ and $C=I_{p=\frac{2}{5}}(X:Y)=\log_2 5 -2$ bits.
\par
\medskip
(b) The binary symmetric channel has capacity $C=1-H(\alpha)$ bits, where $\alpha$ is the error rate of the channel, because $I(X:Y)=H(Y)-\sum_x p(x)H(Y|X=x)=H(Y)-H(\alpha)\leq 1-H(\alpha)$ bits, with equality for the uniform input distribution (which makes $Y$ uniform).
\par
\medskip
(c) We have $C= \max_{p(x)} I(X:Y)$, where $I(X:Y)=H(Y)-H(Y|X)$. With $P(X=1)=p$ as in (a), the entropy at the output is given by $H(Y)=H(1-p\cdot q,p\cdot q)$ and the conditional entropy reads $H(Y|X)=pH(q,1-q)$. $I(X:Y)$ is maximal if $\frac{\partial I(X:Y)}{\partial p}=0$, which implies
\begin{equation}
q\cdot\log_2\frac{1-p\cdot q}{p\cdot q}=H(q,1-q).
\end{equation}
To simplify notation we write $H(q,1-q)=H$. The value of $p$ that maximizes $I(X:Y)$ is
\begin{equation}
p=\frac{1}{q(1+2^{H/q})}.
\end{equation}
We finally obtain the capacity of the channel:
\begin{equation}\label{eq:qCap}
C=\log_2(1+2^{H/q})-\frac{H}{q}.
\end{equation}
To check consistency we can test Eq.~\eqref{eq:qCap} for $q=0.5$. Since $H=1$ for $q=0.5$, we confirm the result of (a), \emph{i.e.} $C(q=0.5) = \log_2 5 - 2$ bits.

\bigskip
\noindent {\bf 5-4.}
\par
\medskip
(a) It does not attain the capacity because the uniform distribution does not maximize the mutual information of the channel of exercise 5-3 (the maximum is attained at $p^*=\frac{2}{5}$, not at $p=\frac{1}{2}$).
\par
\medskip
(b) For a probability distribution $p(x)$, the maximal transmission rate $R$ is bounded from above by $I(X:Y)$. For the channel of exercise 5-3: $R_{p=1/2} \leq I_{p=1/2}(X:Y) \approx 0.3113$ bits (evaluated explicitly below).
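For completeness, the value quoted in (b) can be checked by evaluating the expression of exercise 5-3(a) at $p=\frac{1}{2}$:
\begin{align*}
I_{p=1/2}(X:Y) = H\Big(\frac{1}{4},\frac{3}{4}\Big)-\frac{1}{2}
 = \Big(2-\frac{3}{4}\log_2 3\Big)-\frac{1}{2} \approx 0.8113 - 0.5 = 0.3113 \textrm{ bits}.
\end{align*}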
\bigskip
\noindent {\bf 5-5.}
\par
\medskip
(a) $C=\max_{p(x)} I(X:Y)$ and $\tilde{C}=\max_{p(x)}I(X:\tilde{Y})$.
\newline
By the chain rule we have $I(X:Y,\tilde{Y})=I(X:Y) + I(X:\tilde{Y}|Y)$,
\newline
and $I(X:Y,\tilde{Y})=I(X:\tilde{Y})+I(X:Y|\tilde{Y})$.
\newline
Since $I(X:Y|\tilde{Y}) \geq 0$ and $I(X:\tilde{Y}|Y)=0$ (see exercise 2-3: $\tilde{Y}=f(Y)$, so $X\rightarrow Y\rightarrow\tilde{Y}$ is a Markov chain), we deduce that
\newline
$I(X:\tilde{Y}) \leq I(X:Y)$ for every input distribution. Thus, $\tilde{C} > C$ is impossible.
\par
\medskip
(b) One requires $I(X:Y|\tilde{Y})=0$, \emph{i.e.} the variables must also form the Markov chain $X\rightarrow \tilde{Y} \rightarrow Y$. This is only possible if $Y \leftrightarrow \tilde{Y}$, \emph{i.e.} iff $\tilde{Y}=f(Y)$ with $f$ a bijective function (so that $Y$ can be recovered from $\tilde{Y}$).
%\bigskip
%\noindent {\bf 5-6.}
%\par
%\medskip
%
%(a) $
%I(X:Y)= H(Y) - H(Y|X)=
%H(Y)-\sum_x p(x)H(Y|X=x)
%=H(Y)- \log_2 2$. Puisque le maximum pour
%$H(Y)$ est $\log_2 5$, on obtient
%$C= \log_2(5/2)= 1,32$ bits.
%(b)
%Avec 5 mots code $00, 13, 21, 34, 42$
%le canal à la sortie transmet des mots code
%distinguables des autres:
%$$
%\begin{array}{c|cccc}
%00& 44& 11& 14 & 41 \\
%13 & 02 & 04 & 22 & 24 \\
%21 &10 & 12 & 30 & 32 \\
%34 & 20 & 23 & 40 & 43 \\
%42 & 01 & 03 & 31 & 33
%\end{array}
%$$
%Le taux de transmission à erreur nulle est supérieur ou
%égal à $\log_2 5/2 = 1,16$ bits. Comme $R1.16$ bits et donc $C>1$ bit.

\end{document}