\documentclass[12pt]{article}
\usepackage[margin=1.06in]{geometry}                
\geometry{letterpaper}                  
\usepackage{graphicx}
\usepackage{amsmath, amssymb, amsthm}
\usepackage{hyperref, multicol}
\usepackage{pifont}
\usepackage{marvosym}
\hypersetup{colorlinks=false, allcolors=blue}
\newcommand{\noin}{\noindent}        
\newcommand{\SD}{\textnormal{SD}}
\newcommand{\var}{\textnormal{Var}}    
\newcommand{\cov}{\textnormal{Cov}}                                 
\newcommand{\corr}{\textnormal{Corr}}                  
\newcommand{\Bern}{\textnormal{Bern}}
\newcommand{\Bin}{\textnormal{Bin}}
\newcommand{\Geom}{\textnormal{Geom}}
\newcommand{\FS}{\textnormal{FS}}
\newcommand{\HGeom}{\textnormal{HGeom}}
\newcommand{\NBin}{\textnormal{NBin}}
\newcommand{\Pois}{\textnormal{Pois}}
\newcommand{\Expo}{\textnormal{Expo}}
\newcommand{\Unif}{\textnormal{Unif}}
\newcommand{\Beta}{\textnormal{Beta}}
\newcommand{\Gam}{\textnormal{Gamma}}
\newcommand{\N}{\mathcal{N}}

                     
\begin{document}
 
\noindent {\large  \textbf{Stat 110 Ultimate Homework, Fall 2017}} 

\bigskip

\noindent \textbf{Due}: Friday 12/1 at 5:00 pm, submitted as a PDF via the  \href{https://canvas.harvard.edu/courses/27764}{{course webpage}}. Please check carefully to make sure you upload the correct file. Your submission must be a single PDF file, no more than $20$ MB in size. It can be typeset or scanned, but must be clear and easily legible (not blurry or faint) and correctly rotated. No submissions on paper or by email will be accepted. Please show your work and give clear, careful, convincing justifications. See the syllabus for the collaboration policy. 

\bigskip

\noindent 1. (BH 10.16) Let $X,Y,Z,W$ be i.i.d.~positive r.v.s with CDF $F$ and $E(X)=1$. Write the most appropriate of $\leq$, $ \geq$, $=$, or ?~in each blank (where ``?" means that no relation holds in general). Explain your reasoning. 

\bigskip

\noindent (a) $F(3)$  \underline{\phantom{blah}}  $2/3$ 

\bigskip

\noindent (b) $(F(3))^3$  \underline{\phantom{blah}} $P(X+Y+Z \leq 9)$

\bigskip

\noindent (c)  $E\left(\frac{X^2}{X^2+Y^2+Z^2+W^2}\right)$  \underline{\phantom{blah}} $1/4$

\bigskip

\noindent (d) $E(XYZW)$  \underline{\phantom{blah}} $E(X^4)$

\bigskip

\noindent (e)  $\var(E(Y|X))$  \underline{\phantom{blah}} $\var(Y)$

\bigskip

\noindent (f) $\cov(X+Y,X-Y)$ \underline{\phantom{blah}} $0$

\bigskip

\noindent 2. (BH 10.27) As in Exercise 36 from Chapter 3, there are $n$ voters in an upcoming election in a certain country, where $n$ is a large, even number. There are two candidates, A and B. Each voter chooses randomly whom to vote for, independently and with equal probabilities. 

\medskip

\noin (a) Use a Normal approximation (with continuity correction) to get an approximation for the probability of a tie, in terms of $\Phi$.

\medskip

\noin (b) Use a first-order Taylor expansion (linear approximation) to the approximation from Part (a) to show that the probability of a tie is approximately $1/\sqrt{cn}$, where $c$ is a constant (which you should specify).


\bigskip

\noindent 3. (BH 10.28) A handy rule of thumb in statistics and life is as follows:

\medskip

\emph{Conditioning often makes things better.}

\medskip

\noin This problem explores how the above rule of thumb applies to estimating unknown parameters. Let $\theta$ be an unknown parameter that we wish to estimate based on data $X_1,X_2,\dots,X_n$ (these are r.v.s before being observed, and then after the experiment they ``crystallize" into data). In this problem, $\theta$ is viewed as an unknown constant, and is not treated as an r.v.~as in the Bayesian approach. Let $T_1$ be an estimator for $\theta$ (this means that $T_1$ is a function of $X_1,\dots,X_n$ which is being used to estimate $\theta$). 

\medskip

\noin A strategy for improving $T_1$ (in some problems) is as follows.  Suppose that we have an r.v.~$R$ such that $T_2=E(T_1|R)$ is  a function of $X_1,\dots,X_n$ (in general, $E(T_1|R)$ might involve unknowns such as $\theta$ but then it  couldn't be used as an estimator). Also suppose that $P(T_1=T_2) < 1$, and that $E(T_1^2)$ is finite.

\medskip

\noin (a) Use Jensen's inequality to show that $T_2$ is better than $T_1$ in the sense that the mean squared error is less, i.e., $$E (T_2 - \theta)^2 < E(T_1 - \theta)^2.$$ 

\noin Hint: Use Adam's law on the right-hand side.

\medskip

\noin (b) The \emph{bias} of an estimator $T$ for $\theta$ is defined to be $b(T)=E(T)-\theta$. An important identity in statistics, a form of the \emph{bias-variance tradeoff}, is that mean squared error is variance plus squared bias: $$E(T-\theta)^2  = \var(T) + (b(T))^2.$$ Use this identity and Eve's law to give an alternative proof of the result from (a).

\medskip

\noin (c) Now suppose that $X_1,X_2,\dots$ are i.i.d.~with mean $\theta$, and consider the special case $T_1=X_1$, $R=\sum_{j=1}^n X_j$. Find $T_2$ in simplified form, and check that it has lower mean squared error than $T_1$ for $n \geq 2$. Also, say what happens to $T_1$ and $T_2$ as $n \to \infty$.

\bigskip

\noindent 4. (BH 10.29) Each page of an $n$-page book has a $\Pois(\lambda)$ number of typos, where $\lambda$ is unknown (but is not treated as an r.v.). Typos on different pages are independent. Thus we have i.i.d.~$X_1, \dots, X_n \sim \Pois(\lambda)$, where $X_j$ is the number of typos on page $j$.  Suppose we are interested in estimating the probability $\theta$ that a page has no typos: $$\theta = P(X_j = 0) = e^{-\lambda}.$$

\medskip

\noin (a) Let $\bar{X}_n = \frac{1}{n}(X_1 + \dots + X_n)$. Show that $T_n = e^{-\bar{X}_n}$ is biased for estimating $\theta$. (In other words, $E(T_n)  \neq \theta$.)

\medskip

\noin (b) Show that as $n \to \infty$, $T_n \to \theta$ with probability 1.

\medskip

\noin (c) Show that $W = \frac{1}{n} (I(X_1 = 0) + \dots + I(X_n = 0))$ is unbiased for $\theta$. Using the fact that $X_1 | (X_1 + \dots + X_n = s) \sim \Bin(s,1/n)$, find $E(W|X_1 + \dots + X_n)$. Is $\tilde{W} = E(W|X_1 + \dots + X_n)$ also unbiased for $\theta$?

\medskip

\noin (d) Using Eve's law or otherwise, show that $\tilde{W}$ has lower variance than $W$, and relate this to the previous question.

\bigskip

\noindent 5. (BH 11.12) A chess piece is wandering around on an otherwise vacant $8 \times 8$ chessboard. At each move, the piece (a king, queen, rook, bishop, or knight) chooses uniformly at random where to go, among the legal choices (according to the rules of chess, which you should look up if you are unfamiliar with them).

\noin (a) For each of these cases, determine whether the Markov chain is irreducible, and whether it is aperiodic. 

\smallskip

\noin Hint for the knight: Note that a knight's move always goes from a light square to a dark square or vice versa. A \emph{knight's tour} is a sequence of knight moves on a chessboard such that the knight visits each square exactly once. Many knight's tours exist. 

\medskip

\noin (b) Suppose for this part that the piece is a rook, with initial position chosen uniformly at random. Find the distribution of where the rook is after $n$ moves.

\medskip

\noin (c) Now suppose that the piece is a king, with initial position chosen deterministically to be the upper left corner square. Determine the expected number of moves it takes him to return to that square, fully simplified, preferably in at most 140 characters.

\medskip

\noin (d) The stationary distribution for the random walk of the king from the previous part is not uniform over the 64 squares of the chessboard. A recipe for modifying the chain to obtain a uniform stationary distribution is as follows. Label the squares as $1,2,\dots,64$, and let $d_i$ be the number of legal moves from square $i$. Suppose the king is currently at square $i$. The next move of the chain is determined as follows:

\medskip

\noin {Step 1}: Generate a \emph{proposal square} $j$ by picking uniformly at random among the legal moves from $i$.

\medskip

\noin {Step 2}: Flip a coin with probability $\min(d_i/d_j,1)$ of Heads. If the coin lands Heads, go to $j$. Otherwise, stay at $i$.  

\medskip 

\noin Show that this modified chain has a stationary distribution that is uniform over the 64 squares.

\bigskip

\noindent 6. (BH 11.15) Nausicaa Distribution sells distribution plushies on Etsy. They have two different photos of the Evil Cauchy plushie but do not know which is more effective in getting a customer to purchase an Evil Cauchy plushie. Each visitor to their website is shown one of the two photos (call them Photo A and Photo B, as in Figure 1), and then the visitor either does buy an Evil Cauchy (``success") or does not buy one (``failure"). 

%\begin{figure}[h!] \label{EvilCauchy}
%   \centering
%   \includegraphics[width=2.75in]{figures/EvilCauchyA.jpg} 
%      \includegraphics[width=2.75in]{figures/EvilCauchyB.jpg} 
%   \caption{Two photos of the Evil Cauchy distribution plushie from Nausicaa Distribution. Shown on the left is Photo A, and shown on the right is Photo B.}
%   \label{fig:example}
%\end{figure}
 
\smallskip

\noin Let $a$ and $b$ be the probabilities of success when Photo A is shown and when Photo B is shown, respectively. Even though the Evil Cauchy is irresistible, suppose that $0<a<1$ and $0<b<1$. Suppose that the following strategy is followed (note that the strategy can be followed  without knowing $a$ and $b$). Show the first visitor Photo A. If that visitor buys an Evil Cauchy, continue with Photo A for the next visitor; otherwise, switch to Photo B. Similarly, if the $n$th visitor is a ``success" then show the $(n+1)$st visitor the same photo, and otherwise switch to the other photo.

\medskip
 
\noin (a) Show how to represent the resulting process as a Markov chain, drawing a diagram and giving the transition matrix. The states are $\textrm{A1, B1, A0, B0}$ (use this order for the transition matrix and stationary distribution), where, for example, being at state A1 means that the current visitor was shown Photo A and was a success.

\noin (b) Determine whether this chain is reversible.

\smallskip

\noin Hint: First think about which transition probabilities are zero and which are nonzero.

\medskip

\noin (c) Show that the stationary distribution is proportional to $\left(\frac{a}{1-a},\frac{b}{1-b},1,1\right)$, and find the stationary distribution.

\medskip

\noin (d) Show that for $a \neq b$, the stationary probability of success for each visitor is strictly better than the success probability that would be obtained by independently, randomly choosing (with equal probabilities) which photo to show to each visitor.

\bigskip

\noin 7. (BH 11.16) This exercise considers random walk on a \emph{weighted} undirected network.  Suppose that an undirected network is given, where each edge $(i,j)$ has a nonnegative weight $w_{ij}$ assigned to it (we allow $i=j$ as a possibility). We assume that $w_{ij} = w_{ji}$ since the edge from $i$ to $j$ is considered the same as the edge from $j$ to $i$. To simplify notation, define $w_{ij}=0$ whenever $(i,j)$ is not an edge.

When at node $i$, the next step is determined by choosing an edge attached to $i$ with probabilities proportional to the weights. For example, if the walk is at node 1 and there are 3 possible edges coming out from node 1, with weights $7,1,4$, then the first of these 3 edges is traversed with probability $7/12$, the second is traversed with probability $1/12$, and the third is traversed with probability $4/12$. If all the weights equal $1$, then the process reduces to the kind of random walk on a network that we studied earlier. 

\smallskip

\noin (a) Let $v_i = \sum_j w_{ij}$ for all nodes $i$. Show that the stationary distribution of node $i$ is proportional to $v_i$.

\medskip

\noin (b) Show that \emph{every} reversible Markov chain can be represented as a random walk on a weighted undirected network. 

\smallskip

\noin Hint: Let $w_{ij} = s_i q_{ij}$, where $\mathbf{s}$ is the stationary distribution and $q_{ij}$ is the transition probability from $i$ to $j$. 

\end{document}