\documentclass[12pt]{article}
\usepackage[english]{babel}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage[latin1]{inputenc}
\voffset -2.8cm \textwidth 17cm \textheight 24.7cm \oddsidemargin -.54cm \evensidemargin -1.54cm
\pagestyle{plain}
\begin{document}
\centerline{\Large \sc Information and coding theory}
\bigskip \bigskip
\hrule
\bigskip \bigskip
\centerline{\underline{\Large Solution of exercise sheet n$^\circ$ 2}:}
\bigskip
\noindent {\bf 2-1.}
\par \medskip \noindent (a) First, one needs to find the marginal probability distributions $p(x)$ and $p(y)$. For this we use the relation $p(x)=\sum_y p(x,y)$. We obtain $p(x)=p(y)=\{\frac{1}{3},\frac{1}{3},\frac{1}{3}\}$. Thus, we find $H(X)=-\sum_{x}p(x)\log p(x)=H(Y)=\log 3$ bits.
\par \medskip \noindent (b) In order to compute $H(X|Y)$, we need $p(x|y)$, which we obtain via $p(x|y)=p(x,y)/p(y)$. Using the definition of $H(X|Y)$, we obtain $H(X|Y)=-\sum_{x,y}p(x,y)\log p(x|y)=\log 3 - 4/9$ bits. With the same method, we find $H(Y|X)=\log 3 - 4/9$ bits.
\par \medskip \noindent (c) Using the results of (a) and (b), we directly compute $H(X,Y)=H(X)+H(Y|X)=2\log 3 - 4/9$ bits.
\par \medskip \noindent (d) Using (a) and (b), we find $H(X{\rm:}Y)=H(X)-H(X|Y)=4/9$ bits.
\par \medskip \noindent (e) See lecture.
\bigskip \noindent {\bf 2-2.}
\par \medskip \noindent (a) With the chain rule one can write
\newline $H(X_{1},X_{2},\ldots,X_{k})=\sum_{i=1}^{k}H(X_{i}|X_{i-1},\ldots,X_{1})$.
\newline Since the draws are made with replacement, $X_{i}$ is independent of all previous draws $X_{i-1},\ldots,X_{1}$. Thus, $H(X_{1},X_{2},\ldots,X_{k})=\sum_{i=1}^{k}H(X_{i})$.
\newline As all draws have the same probability distribution, we obtain
\newline $H(X_{1},X_{2},\ldots,X_{k})=kH(X)$.
\par \medskip \noindent (b) The random variable $X_i$ corresponds to the color obtained at the $i$-th draw. Here, the entropy $H(X_i)$ does not depend on the previous draws, since we \emph{did not obtain any information} about them. All balls up to the $i$-th draw were drawn \emph{without looking at them}, that is, \emph{without gain of information}. Therefore, the probabilities do not change. (The experiment can be described as taking $i-1$ balls from one urn and putting them into another urn without looking at them. Clearly, the probability distribution of the $i$-th draw is not affected by that.)
\par \medskip \noindent (c) We find that $p(X_{1}=c_{1},X_{2}=c_{2})=p(X_{1}=c_{2},X_{2}=c_{1})$, where $c_{1}$ and $c_{2}$ are any two colors. Thus, a draw $c_{1}c_{2}$ has the same probability as a draw $c_{2}c_{1}$. A constructive way to understand this: assume the total number of balls is $M = R + W + B$, where $R = p_R \, M$ is the number of red balls, $W = p_W \, M$ is the number of white balls, and $B = p_B \, M$ is the number of black balls. With this information one can construct a tree, where each branch carries the probability of drawing a certain color. Example: the first ball drawn is red, with probability $p_R = R/M$; the second is white, with probability $W/(M-1)$. Compare this with first drawing a white ball (probability $p_W = W/M$) and then a red one (probability $R/(M-1)$). We find
\[ \frac{R}{M} \, \frac{W}{M-1} = \frac{W}{M} \, \frac{R}{M-1}. \]
The same argument applies to every color combination, which proves the relation.
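\par \medskip \noindent As a concrete illustration of (c), with arbitrarily chosen numbers (not those of the exercise): take $M=10$ balls with $R=5$, $W=3$ and $B=2$. Drawing red then white has probability $\frac{5}{10}\cdot\frac{3}{9}=\frac{1}{6}$, and drawing white then red has probability $\frac{3}{10}\cdot\frac{5}{9}=\frac{1}{6}$ as well, as claimed.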
\par \medskip \noindent (d) The probability of drawing a red ball on the second draw is given by
\[ p(X_{2}=r)=p(X_{1}=r,X_{2}=r)+p(X_{1}=w,X_{2}=r)+p(X_{1}=b,X_{2}=r), \]
since a red ball on the second draw may be preceded by a red, white or black ball on the first draw (colors denoted $r$, $w$, $b$). With the result of (c) one finds
\[ p(X_{2}=r)=p(X_{1}=r,X_{2}=r)+p(X_{1}=r,X_{2}=w)+p(X_{1}=r,X_{2}=b)=p(X_{1}=r). \]
\par \medskip \noindent (e) The calculation of (d) shows that $p(X_{2}=r)=p(X_{1}=r)$. Similarly, $p(X_{2}=w)=p(X_{1}=w)$ and $p(X_{2}=b)=p(X_{1}=b)$.
\par \medskip \noindent (f) The marginal probabilities are the same for the first and the second draw, i.e.\ $p(X_{2}=c_{i})=p(X_{1}=c_{i})$ for every color $c_{i}$; thus $H(X_{2})=H(X_{1})$.
\par \medskip \noindent (g) Combining the chain rule with the fact that conditioning cannot increase entropy, $H(X_{i}|X_{i-1},\ldots,X_{1})\leq H(X_{i})$, one can write for dependent random variables $H(X_{1},X_{2},\ldots,X_{k})\leq \sum_{i=1}^{k}H(X_{i})$.
\newline Using $H(X_{i})=H(X)$, one obtains $H(X_{1},X_{2},\ldots,X_{k})\leq kH(X)$.
\bigskip \noindent {\bf 2-3.}
\par \medskip \noindent (a) Using the definition of the conditional probability, one can write $p(x,z|y)=p(x|y)p(z|x,y)$. However, for the Markov chain $X\rightarrow Y\rightarrow Z$ we have $p(z|x,y)=p(z|y)$, thus one obtains $p(x,z|y)=p(x|y)p(z|y)$.
\par \medskip \noindent (b) The chain rule for mutual information is given by
$$H(X_{1},X_{2},\ldots,X_{n}{\rm:}Y)=\sum_{i=1}^{n}H(X_{i}{\rm:}Y|X_{1},X_{2},\ldots,X_{i-1}).$$
Thus, $H(X{\rm:}Y,Z)=H(Y,Z{\rm:}X)=H(Y{\rm:}X)+H(Z{\rm:}X|Y)$ and $H(Y,Z{\rm:}X)=H(Z{\rm:}X)+H(Y{\rm:}X|Z)$. Furthermore, we have the definition (see lecture)
\[ H(Z{\rm:}X|Y)=-\sum_{xyz} p(x,y,z)\log \frac{p(x|y)p(z|y)}{p(z,x|y)}. \]
Using the result of (a), the argument of the logarithm equals $1$, so $H(Z{\rm:}X|Y)=0$. Equating the two expressions above and taking into account that $H(Y{\rm:}X|Z)\geq 0$, one obtains $H(X{\rm:}Y)\geq H(X{\rm:}Z)$.
\par \medskip \noindent (c) Using the result of (b), $H(X{\rm:}Z)\leq H(X{\rm:}Y)=H(Y)-H(Y|X)$. Now $\max\{H(X{\rm:}Y)\}=\log k$, since $H(Y|X)\geq 0$ and $\max\{H(Y)\}=\log k$ (the alphabet $\mathcal{Y}$ has $k$ elements). The bound is reached if $Y=f(X)$ and $Y$ is uniformly distributed. One finally obtains the inequality $H(X{\rm:}Z)\leq \log k$.
\par \medskip \noindent (d) If $k=1$, then $H(X{\rm:}Z)=0$. The set ${\mathcal{Y}}$ contains only one element, thus all information contained in $X$ is lost in the mapping $X\rightarrow Y$.
\bigskip \noindent {\bf 2-4.}
\par \medskip \noindent (a) The probability of a particular sequence in a Bernoulli experiment with $k$ successes in $n$ trials reads $p(x_{1},x_{2},\ldots,x_{n}) = p^k (1-p)^{n-k}$. Since for a typical sequence $k \approx np$, the probability of emitting a particular typical sequence is $p(x_{1},x_{2},\ldots,x_{n})=p^k(1-p)^{n-k}\approx p^{np}(1-p)^{n(1-p)}$.
\newline \noindent We can express this in terms of the entropy:
$$\log p(x_{1},x_{2},\ldots,x_{n})\approx np\log p + n(1-p)\log (1-p)=-nH(p).$$
Thus, $p(x_{1},x_{2},\ldots,x_{n})\approx 2^{-nH(p)}$.
\par \medskip \noindent (b) The number of typical sequences $N_{ST}$ is given by the number of ways to place $np$ ones in a sequence of length $n$ (or to get $np$ successes in $n$ trials of a Bernoulli experiment). Thus, $N_{ST}=\binom{n}{np}=\frac{n!}{(np)!(n(1-p))!}$.
\newline By using the Stirling approximation one obtains $\log N_{ST}\approx nH(p)$.
\newline Comparing with the total number of sequences that can be emitted by the source: $N_{ST}\approx 2^{nH(p)}\leq 2^{n}$.
\newline The probability that the source emits a typical sequence is $P_{ST}=p_{ST}\,N_{ST}\approx 2^{-nH(p)}\,2^{nH(p)}=1$ for $n\gg 1$.
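\par \medskip \noindent As a numerical illustration of (b), with arbitrarily chosen values (not those of the exercise): for $p=0.1$ and $n=100$ one has $H(0.1)\approx 0.47$ bits, hence $N_{ST}\approx 2^{47}$ typical sequences out of $2^{100}$ possible ones. The typical set thus contains only a fraction of about $2^{-53}$ of all sequences, yet it carries almost all of the probability.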
\par \medskip \noindent (c) The most probable sequence is $11\cdots 1$ if $p>1/2$, or $00\cdots 0$ if $p<1/2$. This sequence is not typical, since its fraction of ones is $1$ (respectively $0$) rather than $p$.
\bigskip \noindent {\bf 2-5.}
\par \medskip \noindent (a) By replacing $H(X|Y)=H(X,Y)-H(Y)$ and $H(Y|X)=H(X,Y)-H(X)$ in the definition of the distance, we obtain $\rho(X,Y)=2H(X,Y)-H(X)-H(Y)$. Furthermore, the definition $H(X{\rm:}Y)=H(X)+H(Y)-H(X,Y)$ gives us another expression for the distance, $\rho(X,Y)=H(X,Y)-H(X{\rm:}Y)$.
\par \medskip \noindent (b) Proof of the properties in order of appearance:
\begin{enumerate}
\item $\rho(X,Y)\ge 0$ since $H(X|Y)\geq 0$ and $H(Y|X)\geq 0$.
\item $\rho(X,Y)=\rho(Y,X)$ follows trivially from the definition.
\item $\rho(X,Y)=0$ iff $H(Y|X)=H(X|Y)=0$, which holds iff there exists a bijection between $X$ and $Y$.
\item Let $A=\rho(X,Y)+\rho(Y,Z)-\rho(X,Z)$. Using (a), $A=2[H(X,Y)+H(Y,Z)-H(Y)-H(X,Z)]$.
\newline Using the strong subadditivity $H(X,Y)+H(Y,Z)-H(Y)\geq H(X,Y,Z)$, we have $A\geq 2[H(X,Y,Z)-H(X,Z)]=2H(Y|X,Z)\geq 0$.
\end{enumerate}
\bigskip \noindent {\bf 2-6.}
\par \medskip \noindent (a) For instance, take $\mathcal{X}=\mathcal{Y}=\mathcal{Z}=\{0,1\}$ and $X=Y=Z$ with uniform distributions.
\newline We have $H(X{\rm:}Y)=1$ bit, since $H(X{\rm:}Y)=H(Y)-H(Y|X)$ and $H(Y|X)=0$ (because $X$ and $Y$ are perfectly correlated). We find $H(X{\rm:}Y|Z)=0$ bit, since $(X,Y)$ is a function of $Z$. One verifies that $H(X{\rm:}Y{\rm:}Z)=H(X{\rm:}Y)-H(X{\rm:}Y|Z)=1$ bit $>0$ and $H(X{\rm:}Y|Z) < H(X{\rm:}Y)$.
\par \medskip \noindent (b) For instance, take $\mathcal{X}=\mathcal{Y}=\mathcal{Z}=\{0,1\}$ and $Z=X\oplus Y$ (sum mod $2$), with:
\vspace*{4mm} \hspace*{0mm}
\begin{tabular}{|cc|cc|c|} \hline
&&\multicolumn{2}{|c|}{$Y=$}& \\
\multicolumn{2}{|c|}{$P(X,Y)$} &\multicolumn{1}{c}{$0$} &\multicolumn{1}{c}{$1$} &\multicolumn{1}{|c|}{} \\ \hline
& $0$ & $1/4$ & $1/4$ & $1/2$ \\
$X=$ & $1$ & $1/4$ & $1/4$ & $1/2$ \\ \hline
&& $1/2$ & $1/2$ & $1$ \\ \hline
\end{tabular}
\vspace*{4mm}
We obtain $H(X{\rm:}Y)=0$ bit, since $X$ and $Y$ are independent and thus $H(Y|X)=H(Y)$.
\newline Furthermore, $H(X{\rm:}Y|Z)=H(X|Z)-H(X|Y,Z)$. In our example $X$ is fixed once $Y$ and $Z$ are known, so $H(X|Y,Z)=0$. This implies $H(X{\rm:}Y|Z)=H(X|Z)$, and since $X$ and $Z$ are also independent, $H(X|Z)=H(X)=1$ bit. One obtains $H(X{\rm:}Y|Z)=1$ bit. It follows that $H(X{\rm:}Y{\rm:}Z)=H(X{\rm:}Y)-H(X{\rm:}Y|Z)=-1 \textrm{ bit}<0 \textrm{ bit}$ and $H(X{\rm:}Y|Z) > H(X{\rm:}Y)$. We furthermore confirm that $H(X{\rm:}Z)=H(Y{\rm:}Z)=0$. Therefore, the corresponding Venn diagram is as in Fig.~\ref{fig:venn}, which shows that there is a \emph{negative} overlap between the three random variables $X$, $Y$ and $Z$.
\begin{figure}
\centering
\includegraphics[width=0.7\textwidth]{Inf_Th_TP2_venn.pdf}
\caption{Venn diagram depicting example 2-6 (b).}
\label{fig:venn}
\end{figure}
\bigskip \noindent \underline{\textrm{Optional:}} An interesting exercise is to determine under which conditions (independence, perfect correlation) on the three variables $X$, $Y$ and $Z$ one obtains a maximal or minimal $H(X{\rm:}Y{\rm:}Z)$.
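\bigskip \noindent \underline{\textrm{Optional numerical check:}} The values quoted in 2-6 (b) can also be verified numerically. The short Python sketch below (a minimal illustration, not part of the exercise) computes the relevant entropies directly from the joint distribution of $X$, $Y$ and $Z=X\oplus Y$, and prints $H(X{\rm:}Y)=0$, $H(X{\rm:}Y|Z)=1$ and $H(X{\rm:}Y{\rm:}Z)=-1$ bits.
\begin{verbatim}
# Numerical check of 2-6 (b): X, Y independent uniform bits, Z = X XOR Y.
from itertools import product
from math import log2

# Joint distribution p(x, y, z): four equally likely triples (x, y, x XOR y).
p = {(x, y, x ^ y): 0.25 for x, y in product((0, 1), repeat=2)}

def H(*coords):
    """Entropy (in bits) of the marginal distribution on the given coordinates."""
    marg = {}
    for triple, prob in p.items():
        key = tuple(triple[i] for i in coords)
        marg[key] = marg.get(key, 0.0) + prob
    return -sum(q * log2(q) for q in marg.values() if q > 0)

X, Y, Z = 0, 1, 2
I_XY   = H(X) + H(Y) - H(X, Y)                  # H(X:Y)   = 0 bit
I_XY_Z = H(X, Z) + H(Y, Z) - H(Z) - H(X, Y, Z)  # H(X:Y|Z) = 1 bit
I_XYZ  = I_XY - I_XY_Z                          # H(X:Y:Z) = -1 bit
print(I_XY, I_XY_Z, I_XYZ)
\end{verbatim}
\end{document}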