\documentclass[a4paper,11pt]{article}
\usepackage{amsmath}
\usepackage{amstext}
\usepackage{amsthm}
\usepackage{mathtools}
\usepackage{braket}
\usepackage{microtype}
\usepackage{pifont}
\usepackage{pdfpages,diagbox}
\usepackage[utf8]{inputenc}
\usepackage{color}
\usepackage{enumerate}
\usepackage{hyperref}
\definecolor{ulbblue}{cmyk}{1,0.75,0.2,0.18}
\hypersetup{colorlinks, citecolor=ulbblue, linkcolor=ulbblue, urlcolor=ulbblue}
\makeatletter
\renewcommand\@pnumwidth{1.8em}
\makeatother
\usepackage[T1]{fontenc}
\usepackage[charter,cal=cmcal,sfscaled=false]{mathdesign}
\usepackage[paper=a4paper,innermargin=4cm,outermargin=3cm,top=3.5cm,bottom=3.5cm]{geometry}
\setlength{\headheight}{13.6pt}
\usepackage{setspace}
\usepackage[PetersLenny]{fncychap}
\usepackage{fancyhdr}
\theoremstyle{definition}
\newtheorem{exercise}{Exercise}
\newtheorem*{definition}{Definition}
\setlength{\textheight}{26 cm}
\setlength{\textwidth}{16.5 cm}
\setlength{\topmargin}{-2 cm}
\setlength{\oddsidemargin}{-0.4 cm}

\begin{document}

\noindent
\fbox{\parbox{\textwidth}{
INFO-H-422\hfill 2016-2017\\
\centering{ \textsc{\Large Information and Coding Theory}\\ \phantom{}}
}}

\begin{center}
\section*{\textcolor{red}{Solutions to Exercise Sheet 2}}
\end{center}

\begin{exercise}~
\begin{enumerate}[(a)]
\item\label{1.a} First, we need to find the marginal probability distributions $p(x)$ and $p(y)$.\\
For this we use the relation $p(x)=\sum_y p(x,y)$, which gives $p(x)=p(y)=\{\frac{1}{3},\frac{1}{3},\frac{1}{3}\}$.\\
Therefore $H(X)=-\sum_{x}p(x) \log p(x)=H(Y)= \log 3$ bits.
\item\label{1.b} $H(X,Y)=-\sum_{x,y}p(x,y)\log_2 p(x,y)=2\log 3 - 4/9$ bits.
\item\label{1.c} In order to find $H(X|Y)$, we need to find $p(x|y)$, which is given by $p(x|y)=p(x,y)/p(y)$.\\
Using the definition of $H(X|Y)$, we obtain $H(X|Y)=-\sum_{x,y}p(x,y)\log_2 p(x|y)=\log 3 - 4/9$ bits.\\
With the same method, we find $H(Y|X)=\log 3 - 4/9$ bits. Alternatively, using the results of (\ref{1.a}) and (\ref{1.b}), we directly compute $H(Y|X)=H(X,Y)-H(X)=\log 3 - 4/9=H(X|Y)$.
\item Using (\ref{1.a}) and (\ref{1.c}), we find $I(X;Y)=H(Y)-H(Y|X)=4/9$ bits.
\item Cf. lecture notes or the Wikipedia page on mutual information\footnote{\url{http://en.wikipedia.org/wiki/Mutual_information}}.
\end{enumerate}
\end{exercise}

\begin{exercise}~
\begin{enumerate}[(a)]
\item\label{2.a} By the chain rule, $H(X_{1},X_{2},\ldots,X_{k})=\sum_{i=1}^{k}H(X_{i}|X_{i-1},\ldots,X_{1})$.\\
Since the draws are made with replacement, $X_{i}$ is independent of $X_{j}$ for all $j\neq i$.\\
Thus, $H(X_{i}|X_{i-1},\ldots,X_{1})=H(X_{i})$ and $H(X_{1},X_{2},\ldots,X_{k})=\sum_{i=1}^{k}H(X_{i})$.\\
As all draws have the same probability distribution, $H(X_{1},X_{2},\ldots,X_{k})=kH(X)$.
\item\label{2.b} The $i$-th draw is described by the random variable $X_i$. Although the $i$-th draw is not independent of the previous ones, the color of the balls drawn during the first $i-1$ draws is not known (e.g., it is forgotten; the experiment can also be described as taking $i-1$ balls from one urn and putting them into another urn without looking at them), so \textbf{no information is gained} prior to the $i$-th draw. Therefore, the entropy does not change with $i$, yielding $H(X_i)=H(X)$, where $X$ stands for the color of the ball at an arbitrary draw.
\item\label{2.c} We find that $p(X_{1}=c_{1},X_{2}=c_{2})=p(X_{1}=c_{2},X_{2}=c_{1})$, where $c_{1}$ and $c_{2}$ denote colors. To prove this, let the total number of balls in the urn be $t=r+g+b$. Then model the experiment by a tree in which each level represents a draw and each branch is labeled by a particular color.
For example, the probability that the first ball drawn is red is $p_r=\frac{r}{t}$, and the probability that the second ball drawn is then green is $p_g=\frac{g}{t-1}$. Now if the order of the two colors is reversed, the probabilities become $p_g=\frac{g}{t}$ and $p_r=\frac{r}{t-1}$, respectively. However, the product of the two probabilities remains the same:
\begin{align*}
\frac{r}{t}\cdot\frac{g}{t-1}=\frac{r}{t-1}\cdot\frac{g}{t}.
\end{align*}
This reasoning applies to any path in the tree, which proves the relation.
\item\label{2.d} The probability to draw a red ball at the second draw is given by
\[
p(X_{2}=r)=p(X_{1}=r,X_{2}=r)+p(X_{1}=g,X_{2}=r)+p(X_{1}=b,X_{2}=r),
\]
since getting a red ball at the second draw may be preceded by drawing a red, green or blue ball first. By using the result of (\ref{2.c}), we have
\[
p(X_{2}=r)=p(X_{1}=r,X_{2}=r)+p(X_{1}=r,X_{2}=g)+p(X_{1}=r,X_{2}=b)=p(X_{1}=r).
\]
\item\label{2.e} The previous result shows that $p(X_{2}=r)=p(X_{1}=r)$. Similarly, $p(X_{2}=g)=p(X_{1}=g)$ and $p(X_{2}=b)=p(X_{1}=b)$.
\item\label{2.f} The marginal probabilities are the same for the first and second draw, i.e.\ $p(X_{2}=c_{i})=p(X_{1}=c_{i})$, thus $H(X_{2})=H(X_{1})$. The results of (e) and (f) generalize directly to the subsequent draws: $p(X_1=c_i)=p(X_2=c_i)=\cdots=p(X_k=c_i)$, yielding $H(X_{1})=H(X_{2})=\cdots = H(X_k)$, which constitutes a constructive proof of (b).
\item\label{2.g} By the chain rule and the fact that conditioning cannot increase entropy, $H(X_{i}|X_{i-1},\ldots,X_{1})\leq H(X_{i})$, we have (even for dependent random variables) $H(X_{1},X_{2},\ldots,X_{k})\leq \sum_{i=1}^{k}H(X_{i})$.\\
Using $H(X_{i})=H(X)$, we get $H(X_{1},X_{2},\ldots,X_{k})\leq kH(X)$.
\end{enumerate}
\end{exercise}

\begin{exercise}~
\begin{enumerate}[(a)]
\item\label{3.a} Using the definition of conditional probability, one can write $p(x,z|y)=p(x|y)p(z|x,y)$. However, for the Markov chain $X\rightarrow Y\rightarrow Z$ we have $p(z|x,y)=p(z|y)$, thus one obtains $p(x,z|y)=p(x|y)p(z|y)$.
\item\label{3.b} The chain rule for mutual information is given by
$$I(X_{1},X_{2},\ldots,X_{n}{\rm;}Y)=\sum_{i=1}^{n}I(X_{i}{\rm;}Y|X_{1},X_{2},\ldots,X_{i-1}).$$
Thus, $I(X{\rm;}Y,Z)=I(Y,Z{\rm;}X)=I(Y{\rm;}X)+I(Z{\rm;}X|Y)$ and $I(Y,Z{\rm;}X)=I(Z{\rm;}X)+I(Y{\rm;}X|Z)$.\\
Furthermore, we have the definition (see lecture)
\[
I(Z{\rm;}X|Y)=-\sum_{xyz} p(x,y,z)\log \frac{p(x|y)p(z|y)}{p(z,x|y)}.
\]
Using the result of (a), we conclude that $I(Z{\rm;}X|Y)=0$. Equating the two decompositions then gives $I(Y{\rm;}X)=I(Z{\rm;}X)+I(Y{\rm;}X|Z)$; taking into account that $I(Y{\rm;}X|Z)\geq 0$, one obtains $I(X{\rm;}Y)\geq I(X{\rm;}Z)$.
\item\label{3.c} Using the result of (\ref{3.b}), $I(X{\rm;}Z)\leq I(X{\rm;}Y)=H(Y)-H(Y|X)$. Now $\max\{I(X{\rm;}Y)\}=\log k$, as $H(Y|X)\geq 0$ and $\max\{H(Y)\}=\log k$. This maximum is reached if $Y=f(X)$ and $Y$ is uniformly distributed. One finally obtains the inequality $I(X{\rm;}Z)\leq \log k$.
\item\label{3.d} If $k=1$, then $I(X{\rm;}Z)=0$. The set ${\mathcal{Y}}$ contains only one element, thus all information contained in $X$ is lost by the operation $X\rightarrow Y$.
\end{enumerate}
\end{exercise}

\begin{exercise}~
\begin{enumerate}[(a)]
\item\label{4.a} The probability of a particular outcome of a Bernoulli experiment in general reads $p(x_{1},x_{2},\ldots,x_{n}) = p^k (1-p)^{n-k}$, where $k$ is the number of ones in the sequence. Since for a typical sequence $k \approx np$, we find the probability of emitting a particular typical sequence: $p(x_{1},x_{2},\ldots,x_{n})=p^k(1-p)^{n-k}\approx p^{np}(1-p)^{n(1-p)}$.\newline
\noindent The latter can be expressed as a function of the entropy:
$$\log p(x_{1},x_{2},\ldots,x_{n})\approx np\log p + n(1-p)\log (1-p)=-nH(p).$$
Thus, $p(x_{1},x_{2},\ldots,x_{n})\approx 2^{-nH(p)}$.
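For concreteness, here is a small numerical illustration (the bias $p=1/4$ and the length $n=100$ are chosen arbitrarily for this sketch; they are not fixed by the exercise):
\begin{align*}
H(1/4)&=-\tfrac{1}{4}\log_2\tfrac{1}{4}-\tfrac{3}{4}\log_2\tfrac{3}{4}\approx 0.811\ \textrm{bits},\\
p(x_{1},\ldots,x_{100})&\approx 2^{-100\cdot 0.811}\approx 2^{-81.1}.
\end{align*}
Each typical sequence is thus far less probable than the most probable (all-zero) sequence, which occurs with probability $(3/4)^{100}\approx 2^{-41.5}$; nevertheless, the typical sequences jointly carry almost all of the probability, as shown in (b) below.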
\item\label{4.b} The number of typical sequences $N_{ST}$ is given by the number of ways to place $np$ ones in a sequence of length $n$ (or to get $np$ successes in $n$ trials of a Bernoulli experiment). Thus
\begin{align*}
N_{ST}=\binom{n}{np}=\frac{n!}{(np)!(n(1-p))!}.
\end{align*}
By using the Stirling approximation one obtains $\log N_{ST}\approx nH(p)$.\newline
Comparing with the total number of sequences that can be emitted by the source: $N_{ST}\approx 2^{nH(p)}\leq 2^{n}$.\newline
The probability that the source emits a typical sequence is therefore $P_{ST}=p(x_{1},\ldots,x_{n})\,N_{ST}\approx 2^{-nH(p)}\,2^{nH(p)}\approx 1$ for $n\gg 1$.
\item\label{4.c} The most probable sequence is $11\cdots1$ if $p>1/2$, or $00\cdots0$ if $p<1/2$. This sequence is not typical, since it contains $n$ ones (respectively $n$ zeros) instead of approximately $np$ (respectively $n(1-p)$).
\end{enumerate}
\end{exercise}

\newpage

\begin{exercise}~
\begin{enumerate}[(a)]
\item\label{5.a} By replacing $H(Y|X)=H(X,Y)-H(X)$ and $H(X|Y)=H(X,Y)-H(Y)$ in the definition of the distance, we obtain the desired expression\\
$\rho(X,Y)= 2H(X,Y)-H(X)-H(Y)$. Furthermore, the definition $I(X{\rm;}Y)=H(X)+H(Y)-H(X,Y)$ gives us the second expression.
\item Proof of the properties in order of appearance:
\begin{enumerate}[(1)]
\item $\rho(X,Y)\ge 0$ since $H(X|Y)\geq 0$ and $H(Y|X)\geq 0$.
\item $\rho(X,Y)=\rho(Y,X)$ follows trivially from the definition.
\item $\rho(X,Y)=0$ iff $H(Y|X)=H(X|Y)=0$, which holds iff there exists a bijection between $X$ and $Y$.
\item Let $A=\rho(X,Y)+\rho(Y,Z)-\rho(X,Z)$. Using (a) we get $A=2[H(X,Y)+H(Y,Z)-H(Y)-H(X,Z)]$. Using the strong subadditivity $H(X,Y)+H(Y,Z)-H(Y)\geq H(X,Y,Z)$, we have $A\geq 2[H(X,Y,Z)-H(X,Z)]\equiv 2H(Y|X,Z)\geq 0$.
\end{enumerate}
\end{enumerate}
\end{exercise}

\begin{exercise}~
\begin{enumerate}[(a)]
\item\label{6.a} For instance, take $\mathcal{X}=\mathcal{Y}=\mathcal{Z}=\{0,1\}$ and $X=Y=Z$ with uniform distributions.\\
We have $I(X{\rm;}Y)=1$ bit since $I(X{\rm;}Y)=H(Y)-H(Y|X)$ and $H(Y|X)=0$ (because $X$ and $Y$ are perfectly correlated). We find $I(X{\rm;}Y|Z)=0$ bit since $(X,Y)=f(Z)$. One verifies that $I(X{\rm;}Y{\rm;}Z)=1\textrm{ bit}>0$ and $I(X{\rm;}Y|Z) < I(X{\rm;}Y)$.
\item For instance, take $\mathcal{X}=\mathcal{Y}=\mathcal{Z}=\{0,1\}$ and set $Z=X\oplus Y$ (sum mod $2$), with:
\begin{center}
\begin{tabular}{|cc|cc|c|}
\hline
&&\multicolumn{2}{|c|}{$Y=$}& \\
\multicolumn{2}{|c|}{P($X,Y$)} &\multicolumn{1}{c}{$0$} &\multicolumn{1}{c}{$1$} &\multicolumn{1}{|c|}{} \\
\hline
& $0$ & $1/4$ & $1/4$ & $1/2$ \\
$X=$ & $1$ & $1/4$ & $1/4$ & $1/2$ \\
\hline
&& $1/2$ & $1/2$ & $1$ \\
\hline
\end{tabular}
\end{center}
We obtain $I(X{\rm;}Y)=0$ bit since $X$ and $Y$ are independent and thus $H(Y|X)=H(Y)$.\\
Furthermore, $I(X{\rm;}Y|Z)=H(X|Z)-H(X|Y,Z)$. In our example $X$ is determined once $Y$ and $Z$ are known (since $X=Y\oplus Z$), thus $H(X|Y,Z)=0$. This implies $I(X{\rm;}Y|Z)=H(X|Z)$, and one obtains $I(X{\rm;}Y|Z)=1$ bit. One verifies that $I(X{\rm;}Y{\rm;}Z)=-1 \textrm{ bit}<0 \textrm{ bit}$ and $I(X{\rm;}Y|Z) > I(X{\rm;}Y)$. We furthermore confirm that $I(X{\rm;}Z)=I(Y{\rm;}Z)=0$. Therefore, the corresponding Venn diagram is as shown in Fig.~\ref{fig:venn}: there is a \emph{negative} overlap between the three random variables $X$, $Y$ and $Z$.
\begin{figure}[htbp!]
\centering
\includegraphics[width=0.7\textwidth]{td2-venn.eps}
\caption{Venn diagram depicting the example of Exercise 6(b).}
\label{fig:venn}
\end{figure}
\textbf{Optional:} An interesting exercise is to determine under which conditions (independence, perfect correlation) on the three variables $X$, $Y$ and $Z$ one obtains a maximal or minimal $I(X{\rm;}Y{\rm;}Z)$; a first step in this direction is sketched below.
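As a starting point for this optional question (a sketch, assuming the sign convention $I(X{\rm;}Y{\rm;}Z)=I(X{\rm;}Y)-I(X{\rm;}Y|Z)$, which is consistent with the values quoted above), one can evaluate the two examples of this exercise:
\begin{align*}
\text{(a)}\quad I(X{\rm;}Y{\rm;}Z)&=I(X{\rm;}Y)-I(X{\rm;}Y|Z)=1-0=+1\ \textrm{bit},\\
\text{(b)}\quad I(X{\rm;}Y{\rm;}Z)&=I(X{\rm;}Y)-I(X{\rm;}Y|Z)=0-1=-1\ \textrm{bit}.
\end{align*}
For binary alphabets one has $I(X{\rm;}Y)\leq 1$ bit and $I(X{\rm;}Y|Z)\leq 1$ bit, so the two examples above already attain the extreme values: perfect correlation ($X=Y=Z$) maximizes $I(X{\rm;}Y{\rm;}Z)$, while the XOR construction minimizes it.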
\end{enumerate} \end{exercise} \end{document}