% Technical report entry. Accented letters in the author name are written as
% brace groups ({\"u}, {\"a}) so that classic BibTeX treats each one as a single
% letter; names use the unambiguous "Last, First" form.
@techreport{gruen06,
  Author      = {Gr{\"u}new{\"a}lder, S. and Obermayer, K.},
  Title       = {Optimality of {LSTD} and its Relation to {TD} and {MC}},
  Institution = {Berlin University of Technology},
  Year        = {2006},
  Abstract    = {In this analytical study we compare the risk of three well known reinforcement estimators: temporal difference learning (TD), Monte Carlo estimation (MC) and least-squares TD (LSTD). We find that neither TD nor Monte Carlo estimation are in general superior to each other. However, we can prove that for the case of acyclic Markov Reward Processes (MRPs) LSTD, which is related to TD, has minimal risk for any convex loss function in the class of unbiased estimators. We analyze the relation of TD and LSTD by means of a new estimator which is both similar to LSTD and to TD. We proof that the new estimator converges almost sure and in the average. When comparing the Monte Carlo estimator, which does not assume a Markov structure, and LSTD, we find that the Monte Carlo estimator is equivalent to LSTD if both estimators have the same amount of information. Theoretical results are supported by an empirical evaluation of the estimators.},
  Url         = {http://www.ni.tu-berlin.de/fileadmin/fg215/articles/gruen06_MDP_TechReport.ps.gz}
}