Homework 8 – Simulating and Fitting Data Distributions

Data for this homework was obtained from www.data.gov and is titled “Harmful Algal Bloom Statewide Occurrence Summary: 2012-2018” from the State of New York. For context, HABs/habs = Harmful Algal Blooms

Importing necessary libraries:

library(ggplot2)
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

Importing dataset:

habs <- read.table("HABs_2012-2018.csv", header=TRUE,sep=",")

Cleaning up data and assigning variable of interest:

habs <- na.omit(habs)
habs$myVar <- habs$Number.of.Weeks.on.DEC.Notification.List

Summarizing dataset:

str(habs$myVar)

##  int [1:737] 10 12 25 4 4 14 15 18 5 33 ...

summary(habs$myVar)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    5.00    9.00   10.26   15.00   33.00

Histogram of Number of Weeks HABs were listed on NYSDEC Notification List:

p1 <- ggplot(data=habs, aes(x=myVar, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Adding empirical density curve:

p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Maximum likelihood parameters for normal distribution

normPars <- fitdistr(habs$myVar,"normal")
print(normPars)

##       mean          sd    
##   10.2605156    6.8003094 
##  ( 0.2504926) ( 0.1771250)

str(normPars)

## List of 5
##  $ estimate: Named num [1:2] 10.3 6.8
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.25 0.177
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.0627 0 0 0.0314
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 737
##  $ loglik  : num -2459
##  - attr(*, "class")= chr "fitdistr"

normPars$estimate["mean"]

##     mean 
## 10.26052

Plotting normal probability density on the histogram:

meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(habs$myVar),len=length(habs$myVar))

 stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(habs$myVar), args = list(mean = meanML, sd = sdML))
 p1 + stat

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plotting exponential probability density on the histogram:

expoPars <- fitdistr(habs$myVar,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(habs$myVar), args = list(rate=rateML))
 p1 + stat + stat2

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plotting uniform probability density on the histogram:

stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(habs$myVar), args = list(min=min(habs$myVar), max=max(habs$myVar)))
 p1 + stat + stat2 + stat3

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plotting gamma probability density on the histogram:

gammaPars <- fitdistr(habs$myVar,"gamma")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(habs$myVar), args = list(shape=shapeML, rate=rateML))
 p1 + stat + stat2 + stat3 + stat4

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plotting beta probability density on the histogram:

pSpecial <- ggplot(data=habs, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=habs$myVar/max(habs$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(habs$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Simulated New Dataset

Creating a simulated dataset based on gamma probability density. Here I used “rgamma” instead of “dgamma”, which will generate random values.

simulated_dataset <- rgamma(n=length(habs$myVar),shape=shapeML,rate=rateML)
simulated_z <- 1:length(habs$myVar)
simulated_df <- data.frame(simulated_dataset,simulated_z)

Summary of simulated dataset:

str(simulated_dataset)

##  num [1:737] 6.95 26.69 6.74 8.45 6.49 ...

summary(simulated_dataset)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1169  5.2108  8.4450 10.1992 13.7058 44.9376

Plotting the histogram based on gamma probability density, which appears to best fit the data compared to normal, exponential, uniform, and beta probability.

p2 <- ggplot(data=simulated_df, aes(x=simulated_dataset, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",linewidth=0.2) 
p2 + stat4

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Homework 8 – Simulating and Fitting Data Distributions

Grace (Rei) Jia

2025-02-26