Homework

Running Sample Code

library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation

# quick and dirty, a truncated normal distribution to work on the solution set

z <- rnorm(n=3000,mean=0.2)
z <- data.frame(1:3000,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)

## 'data.frame':    1707 obs. of  2 variables:
##  $ ID   : int  1 2 3 6 7 8 9 11 12 13 ...
##  $ myVar: num  1.336 0.964 0.456 0.956 0.365 ...

summary(z$myVar)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.002241 0.395249 0.767733 0.902416 1.271127 4.188167

Plot histogram of data

p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Add empirical density curve

p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Get maximum likelihood parameters for normal

normPars <- fitdistr(z$myVar,"normal")
print(normPars) # provides mean and sd

##       mean          sd    
##   0.90241559   0.66719302 
##  (0.01614859) (0.01141878)

str(normPars) # is a list of 5

## List of 5
##  $ estimate: Named num [1:2] 0.902 0.667
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.0161 0.0114
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.000261 0 0 0.00013
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 1707
##  $ loglik  : num -1731
##  - attr(*, "class")= chr "fitdistr"

normPars$estimate["mean"] # note structure of getting a named attribute

##      mean 
## 0.9024156

Plot normal probability density

meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$myVar),len=length(z$myVar))

 stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
 p1 + stat

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot exponential probability density

expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
 p1 + stat + stat2

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot uniform probability density

stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
 p1 + stat + stat2 + stat3

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot gamma probability density

gammaPars <- fitdistr(z$myVar,"gamma")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
 p1 + stat + stat2 + stat3 + stat4

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot beta probability density

pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values (geom_bar).

Trying it with inputting data

library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation

# load in data
z <- read.table("raw_data.csv",header=TRUE,sep=",")
z <- data.frame(1:46,z$WhitePopVax)
head(z)

##   X1.46 z.WhitePopVax
## 1     1          0.62
## 2     2          0.49
## 3     3          0.55
## 4     4          0.60
## 5     5          0.74
## 6     6          0.79

str(z)

## 'data.frame':    46 obs. of  2 variables:
##  $ X1.46        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ z.WhitePopVax: num  0.62 0.49 0.55 0.6 0.74 0.79 0.8 0.74 0.69 0.62 ...

summary(z)

##      X1.46       z.WhitePopVax   
##  Min.   : 1.00   Min.   :0.4400  
##  1st Qu.:12.25   1st Qu.:0.5500  
##  Median :23.50   Median :0.6200  
##  Mean   :23.50   Mean   :0.6274  
##  3rd Qu.:34.75   3rd Qu.:0.6900  
##  Max.   :46.00   Max.   :0.8600

Plot histogram of data

p1 <- ggplot(data=z, aes(x=z.WhitePopVax, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Add empirical density curve

p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Get maximum likelihood parameters for normal

normPars <- fitdistr(z$z.WhitePopVax,"normal")
print(normPars) # provides mean and sd

##       mean          sd    
##   0.62739130   0.10805463 
##  (0.01593179) (0.01126547)

str(normPars) # is a list of 5

## List of 5
##  $ estimate: Named num [1:2] 0.627 0.108
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.0159 0.0113
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.000254 0 0 0.000127
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 46
##  $ loglik  : num 37.1
##  - attr(*, "class")= chr "fitdistr"

normPars$estimate["mean"] # note structure of getting a named attribute

##      mean 
## 0.6273913

Plot normal probability density

meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$z.WhitePopVax),len=length(z$z.WhitePopVax))

stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$z.WhitePopVax), args = list(mean = meanML, sd = sdML))
 p1 + stat

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot exponential probability density

expoPars <- fitdistr(z$z.WhitePopVax,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$z.WhitePopVax), args = list(rate=rateML))
 p1 + stat + stat2

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot uniform probability density

stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$z.WhitePopVax), args = list(min=min(z$z.WhitePopVax), max=max(z$z.WhitePopVax)))
p1 + stat + stat2 + stat3

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot gamma probability density

gammaPars <- fitdistr(z$z.WhitePopVax,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$z.WhitePopVax), args = list(shape=shapeML, rate=rateML))
 p1 + stat + stat2 + stat3 + stat4

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot beta probability density

pSpecial <- ggplot(data=z, aes(x=z.WhitePopVax/(max(z.WhitePopVax + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$z.WhitePopVax/max(z$z.WhitePopVax + 0.1),start=list(shape1=1,shape2=2),"beta")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$z.WhitePopVax), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values (geom_bar).

Simulating New Data Set

normPars <- fitdistr(z$z.WhitePopVax,"normal")
print(normPars)

##       mean          sd    
##   0.62739130   0.10805463 
##  (0.01593179) (0.01126547)

SimData <- rnorm(n=z$X1.46, mean = normPars$estimate["mean"], sd = normPars$estimate["sd"])

SimData <- data.frame(1:46, SimData)

p2 <- ggplot(data=SimData, aes(x=SimData, y=..density..)) +
    geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
 print(p2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

gammaPars <- fitdistr(SimData$SimData,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(SimData$SimData), args = list(shape=shapeML, rate=rateML))
 p2 + stat

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Graph of Original Data Set:

p1 + stat4

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Graph of Simulated Data Set:

The probability curves look very similar across the original data set and the simulated data set!

p2 + stat

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Homework_06

Gillian Berglund

2/23/2022

Running Sample Code

Trying it with inputting data

Simulating New Data Set

Graph of Original Data Set:

Graph of Simulated Data Set: