\documentclass[a4paper,11pt]{article}
\usepackage{amsmath}
\usepackage{amstext}
\usepackage{amsthm}
\usepackage{mathtools}
\usepackage{braket}
\usepackage{microtype}
\usepackage{pifont}
\usepackage{pdfpages,diagbox}
\usepackage[utf8]{inputenc}
\usepackage{color}
\usepackage{enumerate}
\usepackage{hyperref}
\definecolor{ulbblue}{cmyk}{1,0.75,0.2,0.18}
\hypersetup{colorlinks, citecolor=ulbblue, linkcolor=ulbblue, urlcolor=ulbblue}
\makeatletter
\renewcommand\@pnumwidth{1.8em}
\makeatother
\usepackage[T1]{fontenc}
\usepackage[charter,cal=cmcal,sfscaled=false]{mathdesign}
\usepackage[paper=a4paper,innermargin=4cm,outermargin=3cm,top=3.5cm,bottom=3.5cm]{geometry}
\setlength{\headheight}{13.6pt}
\usepackage{setspace}
\usepackage[PetersLenny]{fncychap}
\usepackage{fancyhdr}
\theoremstyle{definition}
\newtheorem{exercise}{Exercise}
\newtheorem*{definition}{Definition}
\setlength{\textheight}{26 cm}
\setlength{\textwidth}{16.5 cm}
\setlength{\topmargin}{-2 cm}
\setlength{\oddsidemargin}{-0.4 cm}

\begin{document}

\noindent
\fbox{\parbox{\textwidth}{
INFO-H-422\hfill 2016-2017\\
\centering{ \textsc{\Large Information and Coding Theory}\\ \phantom{}}
}}

\begin{center}
\section*{\textcolor{red}{Solutions to Exercise Sheet 2}}
\end{center}

\begin{exercise}~
\begin{enumerate}[(a)]
\item\label{1.a} First, we need to find the marginal probability distributions $p(x)$ and $p(y)$.\\
For this we use the relation $p(x)=\sum_y p(x,y)$, which gives $p(x)=p(y)=\{\frac{1}{3},\frac{1}{3},\frac{1}{3}\}$.\\
Therefore $H(X)=-\sum_{x}p(x) \log p(x)=H(Y)= \log 3$ bits.
\item\label{1.b} $H(X,Y)=-\sum_{x,y}p(x,y)\log_2 p(x,y)=2\log 3 - 4/9$ bits.
\item\label{1.c} In order to find $H(X|Y)$, we need to find $p(x|y)$, which is given by $p(x|y)=p(x,y)/p(y)$.\\
Using the definition of $H(X|Y)$, we obtain $H(X|Y)=-\sum_{x,y}p(x,y)\log_2 p(x|y)=\log 3 - 4/9$ bits.\\
With the same method, we find $H(Y|X)=\log 3 - 4/9$ bits. Alternatively, using the results of (\ref{1.a}) and (\ref{1.b}), we directly compute $H(Y|X)=H(X,Y)-H(X)=\log 3 - 4/9=H(X|Y)$.
\item Using (\ref{1.a}) and (\ref{1.c}), we find $I(X;Y)=H(Y)-H(Y|X)=4/9$ bits.
\item Cf. lecture notes or the Wikipedia page on mutual information\footnote{\url{http://en.wikipedia.org/wiki/Mutual_information}}.
\end{enumerate}
\end{exercise}

\begin{exercise}~
\begin{enumerate}[(a)]
\item\label{2.a} By the chain rule, $H(X_{1},X_{2},\ldots,X_{k})=\sum_{i=1}^{k}H(X_{i}|X_{i-1},\ldots,X_{1})$.\\
Since the draws are made with replacement, $X_{i}$ is independent of $X_{j}$ for all $j\neq i$.\\
Thus, $H(X_{i}|X_{i-1},\ldots,X_{1})=H(X_{i})$ and $H(X_{1},X_{2},\ldots,X_{k})=\sum_{i=1}^{k}H(X_{i})$.\\
As all draws have the same probability distribution, $H(X_{1},X_{2},\ldots,X_{k})=kH(X)$.
\item\label{2.b} The $i$-th draw is described by the random variable $X_i$. Although the $i$-th draw is not independent of the previous ones, the color of the balls drawn during the first $i-1$ draws is not known (e.g., it is forgotten; the experiment can also be described as taking $i-1$ balls from one urn and putting them into another urn without looking at them), so \textbf{no information is gained} prior to the $i$-th draw. Therefore, the entropy does not change with $i$, yielding $H(X_i)=H(X)$, where $X$ stands for the color of the ball at an arbitrary draw.
\item\label{2.c} We find that $p(X_{1}=c_{1},X_{2}=c_{2})=p(X_{1}=c_{2},X_{2}=c_{1})$, where $c_{1}$ and $c_{2}$ denote colors. To prove this, let the total number of balls in the urn be $t=r+g+b$. Then model the experiment by a tree in which each level represents a draw and each branch is labeled by a particular color.
For example, the probability that the first ball drawn is red is $p_r=\frac{r}{t}$, and the probability that the second ball drawn is then green is $p_g=\frac{g}{t-1}$. Now if the order of the two colors is reversed, the probabilities become $p_g=\frac{g}{t}$ and $p_r=\frac{r}{t-1}$, respectively. However, the product of the two probabilities remains the same:
\begin{align*}
\frac{r}{t}\cdot\frac{g}{t-1}=\frac{r}{t-1}\cdot\frac{g}{t}.
\end{align*}
This reasoning applies to any path in the tree, which proves the relation.
\item\label{2.d} The probability to draw a red ball at the second draw is given by
\[
p(X_{2}=r)=p(X_{1}=r,X_{2}=r)+p(X_{1}=g,X_{2}=r)+p(X_{1}=b,X_{2}=r),
\]
since getting a red ball at the second draw may be preceded by drawing a red, green or blue ball first. By using the result of (\ref{2.c}), we have
\[
p(X_{2}=r)=p(X_{1}=r,X_{2}=r)+p(X_{1}=r,X_{2}=g)+p(X_{1}=r,X_{2}=b)=p(X_{1}=r).
\]
\item\label{2.e} The previous result shows that $p(X_{2}=r)=p(X_{1}=r)$. Similarly, $p(X_{2}=g)=p(X_{1}=g)$ and $p(X_{2}=b)=p(X_{1}=b)$.
\item\label{2.f} The marginal probabilities are the same for the first and second draw, i.e.\ $p(X_{2}=c_{i})=p(X_{1}=c_{i})$, thus $H(X_{2})=H(X_{1})$. The results of (e) and (f) generalize directly to the subsequent draws: $p(X_1=c_i)=p(X_2=c_i)=\cdots=p(X_k=c_i)$, yielding $H(X_{1})=H(X_{2})=\cdots = H(X_k)$, which constitutes a constructive proof of (b).
\item\label{2.g} By the chain rule and the fact that conditioning cannot increase entropy, $H(X_{i}|X_{i-1},\ldots,X_{1})\leq H(X_{i})$, we have (even for dependent random variables) $H(X_{1},X_{2},\ldots,X_{k})\leq \sum_{i=1}^{k}H(X_{i})$.\\
Using $H(X_{i})=H(X)$, we get $H(X_{1},X_{2},\ldots,X_{k})\leq kH(X)$.
\end{enumerate}
\end{exercise}

\begin{exercise}~
\begin{enumerate}[(a)]
\item\label{3.a} Using the definition of conditional probability, one can write $p(x,z|y)=p(x|y)p(z|x,y)$. However, for the Markov chain $X\rightarrow Y\rightarrow Z$ we have $p(z|x,y)=p(z|y)$, thus one obtains $p(x,z|y)=p(x|y)p(z|y)$.
\item\label{3.b} The chain rule for mutual information is given by
$$I(X_{1},X_{2},\ldots,X_{n}{\rm;}Y)=\sum_{i=1}^{n}I(X_{i}{\rm;}Y|X_{1},X_{2},\ldots,X_{i-1}).$$
Thus, $I(X{\rm;}Y,Z)=I(Y,Z{\rm;}X)=I(Y{\rm;}X)+I(Z{\rm;}X|Y)$ and $I(Y,Z{\rm;}X)=I(Z{\rm;}X)+I(Y{\rm;}X|Z)$.\\
Furthermore, we have the definition (see lecture)
\[
I(Z{\rm;}X|Y)=-\sum_{xyz} p(x,y,z)\log \frac{p(x|y)p(z|y)}{p(z,x|y)}.
\]
Using the result of (a), we conclude that $I(Z{\rm;}X|Y)=0$. Equating the two decompositions then gives $I(Y{\rm;}X)=I(Z{\rm;}X)+I(Y{\rm;}X|Z)$; taking into account that $I(Y{\rm;}X|Z)\geq 0$, one obtains $I(X{\rm;}Y)\geq I(X{\rm;}Z)$.
\item\label{3.c} Using the result of (\ref{3.b}), $I(X{\rm;}Z)\leq I(X{\rm;}Y)=H(Y)-H(Y|X)$. Now $\max\{I(X{\rm;}Y)\}=\log k$, as $H(Y|X)\geq 0$ and $\max\{H(Y)\}=\log k$. This maximum is reached if $Y=f(X)$ and $Y$ is uniformly distributed. One finally obtains the inequality $I(X{\rm;}Z)\leq \log k$.
\item\label{3.d} If $k=1$, then $I(X{\rm;}Z)=0$. The set ${\mathcal{Y}}$ contains only one element, thus all information contained in $X$ is lost by the operation $X\rightarrow Y$.
\end{enumerate}
\end{exercise}

\begin{exercise}~
\begin{enumerate}[(a)]
\item\label{4.a} The probability of a particular outcome of a Bernoulli experiment in general reads $p(x_{1},x_{2},\ldots,x_{n}) = p^k (1-p)^{n-k}$, where $k$ is the number of ones in the sequence. Since for a typical sequence $k \approx np$, we find the probability of emitting a particular typical sequence: $p(x_{1},x_{2},\ldots,x_{n})=p^k(1-p)^{n-k}\approx p^{np}(1-p)^{n(1-p)}$.\newline
\noindent The latter can be expressed as a function of the entropy:
$$\log p(x_{1},x_{2},\ldots,x_{n})\approx np\log p + n(1-p)\log (1-p)=-nH(p).$$
Thus, $p(x_{1},x_{2},\ldots,x_{n})\approx 2^{-nH(p)}$.
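For concreteness, here is a small numerical illustration (the bias $p=1/4$ and the length $n=100$ are chosen arbitrarily for this sketch; they are not fixed by the exercise):
\begin{align*}
H(1/4)&=-\tfrac{1}{4}\log_2\tfrac{1}{4}-\tfrac{3}{4}\log_2\tfrac{3}{4}\approx 0.811\ \textrm{bits},\\
p(x_{1},\ldots,x_{100})&\approx 2^{-100\cdot 0.811}\approx 2^{-81.1}.
\end{align*}
Each typical sequence is thus far less probable than the most probable (all-zero) sequence, which occurs with probability $(3/4)^{100}\approx 2^{-41.5}$; nevertheless, the typical sequences jointly carry almost all of the probability, as shown in (b) below.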
\item\label{4.b} The number of typical sequences $N_{ST}$ is given by the number of ways to place $np$ ones in a sequence of length $n$ (or to get $np$ successes in $n$ trials of a Bernoulli experiment). Thus
\begin{align*}
N_{ST}=\binom{n}{np}=\frac{n!}{(np)!(n(1-p))!}.
\end{align*}
By using the Stirling approximation one obtains $\log N_{ST}\approx nH(p)$.\newline
Comparing with the total number of sequences that can be emitted by the source: $N_{ST}\approx 2^{nH(p)}\leq 2^{n}$.\newline
The probability that the source emits a typical sequence is therefore $P_{ST}=p(x_{1},\ldots,x_{n})\,N_{ST}\approx 2^{-nH(p)}\,2^{nH(p)}\approx 1$ for $n\gg 1$.
\item\label{4.c} The most probable sequence is $11\cdots1$ if $p>1/2$, or $00\cdots0$ if $p<1/2$. This sequence is not typical, since it contains $n$ ones (respectively $n$ zeros) instead of approximately $np$ (respectively $n(1-p)$).
\end{enumerate}
\end{exercise}

\newpage

\begin{exercise}~
\begin{enumerate}[(a)]
\item\label{5.a} By replacing $H(Y|X)=H(X,Y)-H(X)$ and $H(X|Y)=H(X,Y)-H(Y)$ in the definition of the distance, we obtain the desired expression\\
$\rho(X,Y)= 2H(X,Y)-H(X)-H(Y)$. Furthermore, the definition $I(X{\rm;}Y)=H(X)+H(Y)-H(X,Y)$ gives us the second expression.
\item Proof of the properties in order of appearance:
\begin{enumerate}[(1)]
\item $\rho(X,Y)\ge 0$ since $H(X|Y)\geq 0$ and $H(Y|X)\geq 0$.
\item $\rho(X,Y)=\rho(Y,X)$ follows trivially from the definition.
\item $\rho(X,Y)=0$ iff $H(Y|X)=H(X|Y)=0$, which holds iff there exists a bijection between $X$ and $Y$.
\item Let $A=\rho(X,Y)+\rho(Y,Z)-\rho(X,Z)$. Using (a) we get $A=2[H(X,Y)+H(Y,Z)-H(Y)-H(X,Z)]$. Using the strong subadditivity $H(X,Y)+H(Y,Z)-H(Y)\geq H(X,Y,Z)$, we have $A\geq 2[H(X,Y,Z)-H(X,Z)]\equiv 2H(Y|X,Z)\geq 0$.
\end{enumerate}
\end{enumerate}
\end{exercise}

\begin{exercise}~
\begin{enumerate}[(a)]
\item\label{6.a} For instance, take $\mathcal{X}=\mathcal{Y}=\mathcal{Z}=\{0,1\}$ and $X=Y=Z$ with uniform distributions.\\
We have $I(X{\rm;}Y)=1$ bit since $I(X{\rm;}Y)=H(Y)-H(Y|X)$ and $H(Y|X)=0$ (because $X$ and $Y$ are perfectly correlated). We find $I(X{\rm;}Y|Z)=0$ bit since $(X,Y)=f(Z)$. One verifies that $I(X{\rm;}Y{\rm;}Z)=1\textrm{ bit}>0$ and $I(X{\rm;}Y|Z) < I(X{\rm;}Y)$.
\item For instance, take $\mathcal{X}=\mathcal{Y}=\mathcal{Z}=\{0,1\}$ and set $Z=X\oplus Y$ (sum mod $2$), with:
\begin{center}
\begin{tabular}{|cc|cc|c|}
\hline
&&\multicolumn{2}{|c|}{$Y=$}& \\
\multicolumn{2}{|c|}{P($X,Y$)} &\multicolumn{1}{c}{$0$} &\multicolumn{1}{c}{$1$} &\multicolumn{1}{|c|}{} \\
\hline
& $0$ & $1/4$ & $1/4$ & $1/2$ \\
$X=$ & $1$ & $1/4$ & $1/4$ & $1/2$ \\
\hline
&& $1/2$ & $1/2$ & $1$ \\
\hline
\end{tabular}
\end{center}
We obtain $I(X{\rm;}Y)=0$ bit since $X$ and $Y$ are independent and thus $H(Y|X)=H(Y)$.\\
Furthermore, $I(X{\rm;}Y|Z)=H(X|Z)-H(X|Y,Z)$. In our example $X$ is determined once $Y$ and $Z$ are known (since $X=Y\oplus Z$), thus $H(X|Y,Z)=0$. This implies $I(X{\rm;}Y|Z)=H(X|Z)$, and one obtains $I(X{\rm;}Y|Z)=1$ bit. One verifies that $I(X{\rm;}Y{\rm;}Z)=-1 \textrm{ bit}<0 \textrm{ bit}$ and $I(X{\rm;}Y|Z) > I(X{\rm;}Y)$. We furthermore confirm that $I(X{\rm;}Z)=I(Y{\rm;}Z)=0$. Therefore, the corresponding Venn diagram is as shown in Fig.~\ref{fig:venn}: there is a \emph{negative} overlap between the three random variables $X$, $Y$ and $Z$.
\begin{figure}[htbp!]
\centering
\includegraphics[width=0.7\textwidth]{td2-venn.eps}
\caption{Venn diagram depicting the example of Exercise 6(b).}
\label{fig:venn}
\end{figure}
\textbf{Optional:} An interesting exercise is to determine under which conditions (independence, perfect correlation) on the three variables $X$, $Y$ and $Z$ one obtains a maximal or minimal $I(X{\rm;}Y{\rm;}Z)$; a first step in this direction is sketched below.
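As a starting point for this optional question (a sketch, assuming the sign convention $I(X{\rm;}Y{\rm;}Z)=I(X{\rm;}Y)-I(X{\rm;}Y|Z)$, which is consistent with the values quoted above), one can evaluate the two examples of this exercise:
\begin{align*}
\text{(a)}\quad I(X{\rm;}Y{\rm;}Z)&=I(X{\rm;}Y)-I(X{\rm;}Y|Z)=1-0=+1\ \textrm{bit},\\
\text{(b)}\quad I(X{\rm;}Y{\rm;}Z)&=I(X{\rm;}Y)-I(X{\rm;}Y|Z)=0-1=-1\ \textrm{bit}.
\end{align*}
For binary alphabets one has $I(X{\rm;}Y)\leq 1$ bit and $I(X{\rm;}Y|Z)\leq 1$ bit, so the two examples above already attain the extreme values: perfect correlation ($X=Y=Z$) maximizes $I(X{\rm;}Y{\rm;}Z)$, while the XOR construction minimizes it.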
\end{enumerate} \end{exercise} \end{document}