library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
# quick and dirty, a truncated normal distribution to work on the solution set
z <- rnorm(n=3000,mean=0.2)
z <- data.frame(1:3000,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)
## 'data.frame': 1707 obs. of 2 variables:
## $ ID : int 1 2 3 6 7 8 9 11 12 13 ...
## $ myVar: num 1.336 0.964 0.456 0.956 0.365 ...
summary(z$myVar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.002241 0.395249 0.767733 0.902416 1.271127 4.188167
Plot histogram of data
p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Add empirical density curve
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Get maximum likelihood parameters for normal
normPars <- fitdistr(z$myVar,"normal")
print(normPars) # provides mean and sd
## mean sd
## 0.90241559 0.66719302
## (0.01614859) (0.01141878)
str(normPars) # is a list of 5
## List of 5
## $ estimate: Named num [1:2] 0.902 0.667
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.0161 0.0114
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.000261 0 0 0.00013
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 1707
## $ loglik : num -1731
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 0.9024156
Plot normal probability density
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$myVar),len=length(z$myVar))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot exponential probability density
expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot uniform probability density
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot gamma probability density
gammaPars <- fitdistr(z$myVar,"gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot beta probability density
pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
# load in data
z <- read.table("raw_data.csv",header=TRUE,sep=",")
z <- data.frame(1:46,z$WhitePopVax)
head(z)
## X1.46 z.WhitePopVax
## 1 1 0.62
## 2 2 0.49
## 3 3 0.55
## 4 4 0.60
## 5 5 0.74
## 6 6 0.79
str(z)
## 'data.frame': 46 obs. of 2 variables:
## $ X1.46 : int 1 2 3 4 5 6 7 8 9 10 ...
## $ z.WhitePopVax: num 0.62 0.49 0.55 0.6 0.74 0.79 0.8 0.74 0.69 0.62 ...
summary(z)
## X1.46 z.WhitePopVax
## Min. : 1.00 Min. :0.4400
## 1st Qu.:12.25 1st Qu.:0.5500
## Median :23.50 Median :0.6200
## Mean :23.50 Mean :0.6274
## 3rd Qu.:34.75 3rd Qu.:0.6900
## Max. :46.00 Max. :0.8600
Plot histogram of data
p1 <- ggplot(data=z, aes(x=z.WhitePopVax, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Add empirical density curve
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Get maximum likelihood parameters for normal
normPars <- fitdistr(z$z.WhitePopVax,"normal")
print(normPars) # provides mean and sd
## mean sd
## 0.62739130 0.10805463
## (0.01593179) (0.01126547)
str(normPars) # is a list of 5
## List of 5
## $ estimate: Named num [1:2] 0.627 0.108
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.0159 0.0113
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.000254 0 0 0.000127
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 46
## $ loglik : num 37.1
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 0.6273913
Plot normal probability density
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$z.WhitePopVax),len=length(z$z.WhitePopVax))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$z.WhitePopVax), args = list(mean = meanML, sd = sdML))
p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot exponential probability density
expoPars <- fitdistr(z$z.WhitePopVax,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$z.WhitePopVax), args = list(rate=rateML))
p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot uniform probability density
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$z.WhitePopVax), args = list(min=min(z$z.WhitePopVax), max=max(z$z.WhitePopVax)))
p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot gamma probability density
gammaPars <- fitdistr(z$z.WhitePopVax,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$z.WhitePopVax), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot beta probability density
pSpecial <- ggplot(data=z, aes(x=z.WhitePopVax/(max(z.WhitePopVax + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$z.WhitePopVax/max(z$z.WhitePopVax + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$z.WhitePopVax), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
normPars <- fitdistr(z$z.WhitePopVax,"normal")
print(normPars)
## mean sd
## 0.62739130 0.10805463
## (0.01593179) (0.01126547)
SimData <- rnorm(n=z$X1.46, mean = normPars$estimate["mean"], sd = normPars$estimate["sd"])
SimData <- data.frame(1:46, SimData)
p2 <- ggplot(data=SimData, aes(x=SimData, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
gammaPars <- fitdistr(SimData$SimData,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(SimData$SimData), args = list(shape=shapeML, rate=rateML))
p2 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p1 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The probability curves look very similar across the original data set and the simulated data set!
p2 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.