## Y ~ B(\theta)
## n=200  y=43
##

## L(\theta) = \binom{200}{43} \theta^43 (1-\theta)^(200-43)

L <- function(theta, n, y){
    choose(200, 43) * theta^y * (1-theta)^(n-y)
}
curve(L(x, n=200, y=43), from=0, to=1, n=1001,
      xlab=expression(theta), ylab=expression(L(theta)))
curve(L(x, n=200, y=43), from=0.1, to=0.4, n=1001,
      xlab=expression(theta), ylab=expression(L(theta)))

## ou ...
L <- function(theta, n, y){
    dbinom(y, size=n, prob=theta)
}
curve(L(x, n=200, y=43), from=0, to=1, n=1001,
      xlab=expression(theta), ylab=expression(L(theta)))
curve(L(x, n=200, y=43), from=0.1, to=0.4, n=1001,
      xlab=expression(theta), ylab=expression(L(theta)))

## l(\theta) = log(L(\theta)) = \log{\binom{n}{y}} +  y \log(\theta) +  (n-y) \log(1-\theta)  
l <- function(theta, n, y){
    ## seria:
    #log(choose(n,y)) + y * log(theta) + (n-y)*log(1-theta) 
    ## mas, melhorando a forma de calculo do 1o termo:
   (lgamma(n+1)-lgamma(y+1)-lgamma(n-y+1)) + y * log(theta) + (n-y)*log(1-theta) 
}
curve(l(x, n=200, y=43), from=0, to=1, n=1001,
      xlab=expression(theta), ylab=expression(l(theta)))
curve(l(x, n=200, y=43), from=0.1, to=0.4, n=1001,
      xlab=expression(theta), ylab=expression(l(theta)))

## ou
l <- function(theta, n, y){
   dbinom(y, size=n, prob=theta, log=TRUE)
}
curve(l(x, n=200, y=43), from=0, to=1, n=1001,
      xlab=expression(theta), ylab=expression(l(theta)))
curve(l(x, n=200, y=43), from=0, to=0.5, n=1001,
      xlab=expression(theta), ylab=expression(l(theta)))

## ou ainda juntando as duas L e l em uma unica funcao:
vero <- function(theta, n, y, log=FALSE){
   dbinom(y, size=n, prob=theta, log=log)
}

curve(vero(x, n=200, y=43), from=0, to=1, n=1001,
      xlab=expression(theta), ylab=expression(L(theta)))
curve(vero(x, n=200, y=43, log=TRUE), from=0, to=1, n=1001,
      xlab=expression(theta), ylab=expression(l(theta)))
## 

## Estimativa de máxima verossimilhança \hat{\theta}, e valor de L(\hat{\theta}) e l(\hat{\theta})
## i. obtida analiticamente
(theta.EMV <- 43/200)
(L.EMV <- vero(theta.EMV, n=200, y=43))
(l.EMV <- vero(theta.EMV, n=200, y=43, log=TRUE))
## ii. obtida numericamente
args(optimize)
(EMVnL <- optimize(vero, c(0,1), n=200, y=43, maximum=TRUE))
(EMVnl <- optimize(vero, c(0,1), n=200, y=43, log=TRUE, maximum=TRUE))
## obs: optimiza é apenas usada para parametro escalar (1 paramatro)
## no caso de mais parametros outras funções podem ser usadas como por exemplo optim()

(theta.EMVn <- EMVnl$maximum)
(L.EMVn <- EMVnL$objective)
## ou
exp(EMVnl$objective)
##
(l.EMVn <- EMVnl$objective)

##
c(L.EMV, L.EMVn)
c(l.EMV, l.EMVn)

## Verossimilhança relativa LR(\theta)
LR <- function(theta, n, y, EMV=theta.EMV){
    L(theta, n, y) / L(EMV, n, y)  
}
curve(LR(x, n=200, y=43, EMV=theta.EMV), from=0, to=1, n=1001,
      xlab=expression(theta), ylab=expression(LR(theta)))
curve(LR(x, n=200, y=43, EMV=theta.EMV), from=0.1, to=0.4, n=1001,
      xlab=expression(theta), ylab=expression(LR(theta)))

## Deviance = -2 log{Verossimilhança relativa}
## D(\theta) = -2 log[LR(\theta)] = -2[l(\theta) - l(\hat{\theta})]

Dev <- function(theta, n, y, EMV=theta.EMV){
    -2 * (l(theta, n, y) - l(EMV, n, y))  
}

curve(Dev(x, n=200, y=43, EMV = theta.EMV), from=0, to=1, n=1001,
      xlab=expression(theta), ylab=expression(D(theta)))
curve(Dev(x, n=200, y=43, EMV = theta.EMV), from=0.1, to=0.4, n=1001,
      xlab=expression(theta), ylab=expression(D(theta)))

## Função escore U(\theta) = \frac{y}{\theta} - \frac{n-y}{1-\theta} 
U <- function(theta, n, y){
    (y/theta) - ((n-y)/(1-theta))
}

curve(U(x, n=200, y=43), from=0, to=1, n=1001,
      xlab=expression(theta), ylab=expression(U(theta)))
abline(h=0, v=theta.EMV)
curve(U(x, n=200, y=43), from=0.1, to=0.4, n=1001,
      xlab=expression(theta), ylab=expression(U(theta)))
abline(h=0, v=theta.EMV)

## EMV como raiz da função escore
uniroot(U, interval=c(0,1), n=200, y=43)

## Hessiano  H(\theta) = - \frac{y}{\theta^2} - \frac{n-y}{(1-\theta)^2}
##                     = -n^2 * ((1/y) + (1/(n-y)))
H <- function(theta, n, y){
    -(y/theta^2) - ((n-y)/(1-theta)^2)
}

curve(H(x, n=200, y=43), from=0, to=1, n=1001,
      xlab=expression(theta), ylab=expression(H(theta)))
curve(H(x, n=200, y=43), from=0.1, to=0.4, n=1001,
      xlab=expression(theta), ylab=expression(H(theta)))


## Note que em geral utilizamos o negativo do inverso do hessiano H^{-1}
## -H^{-1} = (\frac{y}{\theta^2} + \frac{n-y}{1-\theta^2})^{-1}  
neginvH <- function(theta, n, y){
    1/((y/theta^2) + ((n-y)/(1-theta)^2)) 
}

curve(neginvH(x, n=200, y=43), from=0, to=1, n=1001,
      xlab=expression(theta), ylab=expression(-{H^{-1}}(theta)))
curve(neginvH(x, n=200, y=43), from=0.1, to=0.4, n=1001,
      xlab=expression(theta), ylab=expression(-{H^{-1}}(theta)))


## EMV via Newton-Raphson
## \theta^{(r+1)} = \theta^r - \frac{U(\theta)}{H(\theta)}

(theta0 <- 0.30)
(theta1 <- theta0 - U(theta0, n=200, y=43)/H(theta0, n=200, y=43))
(theta2 <- theta1 - U(theta1, n=200, y=43)/H(theta1, n=200, y=43))
(theta3 <- theta2 - U(theta2, n=200, y=43)/H(theta2, n=200, y=43))
(theta4 <- theta3 - U(theta3, n=200, y=43)/H(theta3, n=200, y=43))
(theta5 <- theta4 - U(theta4, n=200, y=43)/H(theta4, n=200, y=43))
(theta6 <- theta5 - U(theta5, n=200, y=43)/H(theta5, n=200, y=43))

## outra forma
maxit <- 100; minDiff <- 1e-10; diff.theta <- 1; theta0 <- 0.35; it <- 0
while(it < maxit & diff.theta > minDiff){
    theta <- theta0 - U(theta0, n=200, y=43)/H(theta0, n=200, y=43)
    it <- it+1
    diff.theta <- abs(theta - theta0)
    theta0 <- theta
    print(c(it, theta))
}
c(it, theta)

## ou usando -H^{-1}
maxit <- 100; minDiff <- 1e-10; diff.theta <- 1; theta0 <- 0.35; it <- 0
while(it < maxit & diff.theta > minDiff){
    theta <- theta0 + U(theta0, n=200, y=43)*neginvH(theta0, n=200, y=43)
    it <- it+1
    diff.theta <- abs(theta - theta0)
    theta0 <- theta
    print(c(it, theta))
}
c(it, theta)

## Intervalos de confiança

## baseado na deviance e considerando que D(\theta) \approx \chi^2_{(1)}

## i) via Newton Raphson resolvendo D(\theta) - corte = 0 (2 vezes, a esquerda e direita do EMV) 
## usa-se a aproximação (Taylor de 1a ordem) de D(\theta) - corte = 0
## theta = theta_0 + (D(theta_0) - C)/(2 U(\theta_0))
(corte <- qchisq(0.95, df=1))

## O limite inferior:
maxit <- 100; minDiff <- 1e-10; diff.theta <- 1; it <- 0
theta0 <- 0.10   ## para limite superior vamos tomar um valor abaixo da EMV
while(it < maxit & diff.theta > minDiff){
    thetaI <- theta0 + (Dev(theta0, n=200, y=43)-corte)/(2*U(theta0, n=200, y=43))
    it <- it+1
    diff.theta <- abs(thetaI - theta0)
    theta0 <- thetaI
    print(c(it, thetaI))
}
c(it, thetaI)

## O limite superior:
maxit <- 100; minDiff <- 1e-10; diff.theta <- 1; it <- 0
theta0 <- 0.35   ## para limite superior vamos tomar um valor acima da EMV
while(it < maxit & diff.theta > minDiff){
    thetaS <- theta0 + (Dev(theta0, n=200, y=43)-corte)/(2*U(theta0, n=200, y=43))
    it <- it+1
    diff.theta <- abs(thetaS - theta0)
    theta0 <- thetaS
    print(c(it, thetaS))
}
c(it, thetaS)

## O intervalo
c(thetaI, thetaS)

## ii) Usando uma função do pacote rootSolve para encontrar múltiplas raízes 
IC.dev <- function(theta, size, y, EMV, conf){
    Dev(theta, n=size, y=y, EMV=EMV) - qchisq(conf, df=1)
}
require(rootSolve)
args(uniroot.all)
(theta.IC <- uniroot.all(IC.dev, c(0,1), size=200, y=43, EMV=theta.EMV, conf=0.95))

## baseado na verossimilhança relativa
IC.LR <- function(theta, size, y, EMV, corte){
    LR(theta, n=size, y=y, EMV=EMV) - corte
}
require(rootSolve)
args(uniroot.all)
uniroot.all(IC.LR, c(0,1), size=200, y=43, EMV=theta.EMV, corte=0.50)


## Aproximação quadrática (Taylor 2a ordem) da log-verossimilhanca
## l(\theta) ~ l(\hat{\theta}) + (1/2) (\theta - \hat{theta})^2 H(\hat{\theta})
l.aq <- function(theta, n, y, EMV){
    (lgamma(n+1)-lgamma(y+1)-lgamma(n-y+1)) + y*log(EMV) + (n-y)*log(1-EMV) -
        0.5*(n^2)*((theta-EMV)^2)*((1/y)+(1/(n-y)))
}

curve(l(x, n=200, y=43), from=0, to=1, n=1001,
      xlab=expression(theta), ylab=expression(l(theta)))
curve(l.aq(x, n=200, y=43, EMV=theta.EMV), from=0, to=1, n=1001,
      xlab=expression(theta), ylab=expression(l(theta)), col=2, add=T)
abline(v=theta.EMV)

curve(l(x, n=200, y=43), from=0.1, to=0.4, n=1001,
      xlab=expression(theta), ylab=expression(l(theta)))
curve(l.aq(x, n=200, y=43, EMV=theta.EMV), from=0.1, to=0.4, n=1001,
      xlab=expression(theta), ylab=expression(l(theta)), col=2, add=T)
abline(v=theta.EMV)


## Aproximação quadrática (Taylor 2a ordem) da função deviance
## D(\theta) ~ D(\hat{\theta}) + (1/2) (\theta - \hat{theta})^2 H(\hat{\theta})
##           = (n*(\theta - \hat{\theta}))^2 (((1/y)+(1/(n-y)))
Dev.aq <- function(theta, n, y, EMV){
    ((n*(theta-EMV))^2) * ((1/y)+1/(n-y))
}

curve(Dev(x, n=200, y=43), from=0, to=1, n=1001,
      xlab=expression(theta), ylab=expression(D(theta)))
curve(Dev.aq(x, n=200, y=43, EMV=theta.EMV), from=0, to=1, n=1001,
      xlab=expression(theta), ylab=expression(D(theta)), col=2, add=T)
abline(v=theta.EMV)

curve(Dev(x, n=200, y=43), from=0.1, to=0.4, n=1001,
      xlab=expression(theta), ylab=expression(D(theta)))
curve(Dev.aq(x, n=200, y=43, EMV=theta.EMV), from=0.1, to=0.4, n=1001,
      xlab=expression(theta), ylab=expression(D(theta)), col=2, add=T)
abline(v=theta.EMV)

curve(Dev(x, n=200, y=43), from=0.15, to=0.3, n=1001,
      xlab=expression(theta), ylab=expression(D(theta)))
curve(Dev.aq(x, n=200, y=43, EMV=theta.EMV), from=0.15, to=0.3, n=1001,
      xlab=expression(theta), ylab=expression(D(theta)), col=2, add=T)
abline(v=theta.EMV)


## IV baseado na aproximação quadratica da deviance
## poderia ser obtido numericamente tal como para D(\theta)
## mas por ser uma função quadrática, pode ser obtido analiticamente
## C = valor de corte na verossimilhança

#(n(\theta - \hat{\theta}))^2 (\frac{1}{y} - \frac{1}{n-y}) = C
#(\theta - \hat{\theta})^2 = C (\frac{y(n-y)}{n^3}
#(\theta - \hat{\theta}) = \sqrt{C} \sqrt{\frac{\hat{\theta} (1-\hat{\theta})}{n}}
(theta.IC.aq <- theta.EMV + c(-1,1) * 1.96 * sqrt(theta.EMV*(1-theta.EMV)/200))
## Comparando a original e aproximada
curve(Dev(x, n=200, y=43), from=0.15, to=0.3, n=1001,
      xlab=expression(theta), ylab=expression(D(theta)))
abline(h=3.84)
arrows(theta.IC, 3.84, theta.IC, 0, length=0.1)

curve(Dev.aq(x, n=200, y=43, EMV=theta.EMV), from=0.15, to=0.3, n=1001,
      xlab=expression(theta), ylab=expression(D(theta)), col=2, add=T)
arrows(theta.IC.aq, 3.84, theta.IC.aq, 0, length=0.1, col=2)


## Tópicos adicionais:

## Passando a função gradiente
## Alguns algoritmos podem se beneficiar de função gradiente como "BFGS", "L_BFGS_B" e "CG"
args(optim)
(op1 <- optim(0.30, l, n=200, y=43, control=list(fnscale=-1), method="BFGS"))
(op2 <- optim(0.30, l, gr=U, n=200, y=43, control=list(fnscale=-1), method="BFGS"))

## o algoritmo de maximização pode retornar o hessiano numérico
optimHess(op1$par, l, y=43, n=200)
optimHess(op2$par, l, gr=U, y=43, n=200)

## ou ainda
(op1 <- optim(0.30, l, n=200, y=43, control=list(fnscale=-1), method="BFGS", hessian=TRUE))
op1$hessian

## comparando com o analitico:
H(theta.EMV, n=200, y=43)

## Função Escore e hessiana obtidas simbólicamente

lUH <- deriv3(~(lgamma(n+1)-lgamma(y+1)-lgamma(n-y+1)) + y * log(theta) + (n-y)*log(1-theta),
              "theta", function.arg=function(theta, n, y){})

args(lUH)
(temp <- lUH(0.2, n=200, y=43))
c(l(theta.EMV, n=200, y=43),U(theta.EMV, n=200, y=43),H(theta.EMV, n=200, y=43))
attr(temp, "gradient")
attr(temp, "hessian")
rm(temp)


## o algoritmo Newton-Raphson fica então:
maxit <- 100; minDiff <- 1e-10; diff.theta <- 1; theta0 <- 0.35; it <- 0
while(it < maxit & diff.theta > minDiff){
    dds <- lUH(theta0, n=200, y=43)
    theta <- theta0 - drop(attr(dds, "gradient"))/drop(attr(dds, "hessian"))
    it <- it+1
    diff.theta <- abs(theta - theta0)
    theta0 <- theta
    print(c(iter=it, theta))
}
c(iter=it, theta)

## funções alternativas para encontrar o EMV:
## alem de optim() tem-se nlminb()
## ou ainda stats4:mle() e bbmle:::mle2()
## dentre outras disponiveis no R (ver CRAN Task Views)
## uma vantegem destas funções é que eles possem métodos ṕara intervaloes de confiança
## e outras medidas úteis para inferência

require(stats4)
#?mle

## mle() egige uma função que retorne o negativo da log-verossimilhança, portanto vamos redefinir:

neg.l <- function(theta, n, y){
    -(lgamma(n+1)-lgamma(y+1)-lgamma(n-y+1)) - y * log(theta) - (n-y)*log(1-theta) 
}

args(mle)
mle(neg.l, start=list(theta=0.35), fixed=list(n=200, y=43))
mle(neg.l, start=list(theta=0.35), fixed=list(n=200, y=43), method="L-BFGS-B", lower=0.00001, upper=0.999999)
## como tem apenas 1 parametro para estimar pode-se usar:
mle(neg.l, start=list(theta=0.35), fixed=list(n=200, y=43), method="Brent", lower=0, upper=1)

## OBS: pode-se ainda restringir o espaço paramétrico de outra forma, retornando NA's
neg.l <- function(theta, n, y){
    if(theta > 0 & theta < 1)
    -(lgamma(n+1)-lgamma(y+1)-lgamma(n-y+1)) - y * log(theta) - (n-y)*log(1-theta) 
    else NA
}
mle(neg.l, start=list(theta=0.35), fixed=list(n=200, y=43))


## Agora colocando o resultado em um objeto e usando os métodos disponíveis
theta.mle  <- mle(neg.l, start=list(theta=0.35), fixed=list(n=200, y=43))

coef(theta.mle)
AIC(theta.mle)
show(theta.mle)
summary(theta.mle)
logLik(theta.mle)
vcov(theta.mle)
-1/vcov(theta.mle)   ## igual ao hessiano calculado com H(theta.EMV, n=200, y=43)
confint(theta.mle)   ## retorna o intervalo baseado na verossimilhança
profile(theta.mle)   ## no caso de 1 parametro é igual a fc de verossimilhança
plot(profile(theta.mle))   ## correponde ao grafico da sqrt da deviance


## Teste de Hipótese

## Exemplo: H_0: \theta = 0,20  vs  H_a: \theta > 0,20

## i. Teste da razão de de verosimilhança
## D(\theta_0) = -2[l(\theta_0) - l((\hat{\theta})]~ \chisq_{(1)}

## notando que l(\hat{\theta}) é retornado pelos algoritmos de maximização
l(theta.EMV, y=43, n=200)
op1$value
logLik(theta.mle)
-theta.mle@min

## a estatística para o TRV pode ser calculada de diferentes usando as funções definidas até aqui:
(trv <- Dev(0.20, n=200, y=43))
-2*(l(0.20, n=200, y=43) - l(theta.EMV, n=200, y=43))
2 * (neg.l(0.20, n=200, y=43) + logLik(theta.mle))

## e o p-valor: (teste unilateral, baseado na distribuição assintótica da deviance)
pchisq(trv, df=1, lower=FALSE)/2

## ii. Wald
## \hat{\theta} ~ N(0, I^{-1}(\hat{\theta}))
## W = (\hat{\theta} - \theta_0)' I(\hat{\theta}) (\hat{\theta} - \theta_0) \chisq_{()}
## Para 1 parâmetro
##  (\hat{\theta} - \theta_0) I(\hat{\theta}) ~ \chi^2_{(1)}
##  \sqrt{(\hat{\theta} - \theta_0) I(\hat{\theta)}} ~ N(0,1)

## I_O(\theta) = -H(\theta) = \frac{y}{\theta^2} + \frac{n-y}{(1-\theta)^2}
## I_E(\theta) = E[I_O] = \frac{n}{\theta(1-theta)}
## neste caso:
## I_O(\hat{\theta}) = I_E(\hat{\theta}) = \frac{n}{\hat{theta}(1-\hat{theta})}
twald0 <- (theta.EMV - 0.20)^2*(-H(theta.EMV, n=200, y=43))
twald <- (theta.EMV - 0.20)*(sqrt(-H(theta.EMV, n=200, y=43)))

pchisq(twald0, df=1, lower=FALSE)/2
pnorm(twald, lower=FALSE)


## iii. Teste escore (ou teste escore de Rao, ou teste do multiplicador de Lagrange)
## frac{U^2(\theta_0)}{I(\theta_0)} ~ \chisq_{(1)}
##
## \sqrt{frac{U^2(\theta_0)}{I(\theta_0)}} ~ N(0,1)
## I(\theta) = -H(\theta)

(tesc <- U(0.20, n=200, y=43)^2/-H(0.20, n=200, y=43))
temp <- lUH(0.2, n=200, y=43)
drop(attr(temp, "gradient"))^2/-drop(attr(temp, "hessian"))

pchisq(tesc, df=1, lower=FALSE)/2


## reparametrização
#\psi = log(\frac{\theta}{1 - \theta})
# \theta = \frac{exp(\psi)}{1 + exp(\psi)}

logit <- function(x) log(x/(1-x))
ilogit <- function(x) exp(x)/(1+exp(x))

## função de verossimilhança com reparametrização
neg.l.psi <- function(psi, n, y){
    theta <- ilogit(psi)
    -(lgamma(n+1)-lgamma(y+1)-lgamma(n-y+1)) - y * log(theta) - (n-y)*log(1-theta) 
}

(psi.mle  <- mle(neg.l.psi, start=list(psi=logit(0.35)), fixed=list(n=200, y=43)))

coef(psi.mle)[1]
ilogit(coef(psi.mle)[1])

## pela invariância da verossimilhança, o intevalo BASEADO NA VEROSSIMILHANÇA pode ser transformado diretamente
confint(psi.mle)
ilogit(confint(psi.mle))