Week02_Lecture

1. Calculating confidence interval

Calculate the 95% confidence interval for a proportion when the sample size is 800 and the sample proportion is 0.34.

library(interpretCI)

## Warning: package 'interpretCI' was built under R version 4.2.3

# Confidence interval for a proportion with n = 800 and p-hat = 0.34;
# alpha = 0.05 corresponds to the 95% confidence level.
x <- propCI(
  n = 800,
  p = 0.34,
  alpha = 0.05
)
x

## $data
## # A tibble: 1 × 1
##   value
##   <lgl>
## 1 NA   
## 
## $result
##   alpha   n  df    p P         se critical         ME     lower     upper
## 1  0.05 800 799 0.34 0 0.01674813 1.959964 0.03282574 0.3071743 0.3728257
##                       CI        z      pvalue alternative
## 1 0.34 [95CI 0.31; 0.37] 20.30077 1.26589e-91   two.sided
## 
## $call
## propCI(n = 800, p = 0.34, alpha = 0.05)
## 
## attr(,"measure")
## [1] "prop"

# Sample size and observed sample proportion
n <- 800
p <- 0.34

# Standard error of a sample proportion: sqrt(p * (1 - p) / n)
se <- sqrt(p * (1 - p) / n)
print(c("stand error:", se))

## [1] "stand error:"       "0.0167481342244442"

# Margin of error: the 97.5% standard normal quantile times the standard error
margin <- qnorm(0.975) * se
print(c("margin of error:", margin))

## [1] "margin of error:"   "0.0328257398881533"

# Interval bounds: point estimate minus / plus the margin of error
print(c("confidence interval: ", p + c(-1, 1) * margin))

## [1] "confidence interval: " "0.307174260111847"     "0.372825739888153"

2. Variability of sample

# Simulated population of 250 million entries: 88% "support", 12% "not".
# The whole population is materialised as one character vector, so this
# step needs a lot of memory.
pop_size <- 250000000
possible_entries <- rep(
  c("support", "not"),
  times = c(0.88, 0.12) * pop_size
)

# Draw a single sample of 1000 entries (sample() draws without replacement
# by default).
sampled_entries <- sample(possible_entries, size = 1000)

# Proportion of the sample that is "support"
sum(sampled_entries == "support") / 1000

## [1] 0.905

2.1 Sampling distribution of proportion

# Simulated population of 250 million entries: 88% "public", 12% "nopublic".
stu_size <- 250000000
public_use <- c(
  rep("public", 0.88 * stu_size),
  rep("nopublic", 0.12 * stu_size)
)

# Repeat the sampling process 1000 times. Each repetition draws 1000 entries
# without replacement and records the sample proportion (p-hat).
# vapply() fills a pre-sized numeric vector instead of growing one with c()
# on every iteration, and the draw is kept in a local name so that no
# variable called `sample` masks the sample() function.
p_hat_values <- vapply(
  seq_len(1000),
  function(i) {
    drawn <- sample(public_use, size = 1000)
    sum(drawn == "public") / 1000
  },
  numeric(1)
)

# Mean of the simulated sample proportions
mean_value <- mean(p_hat_values)

# Plot the sampling distribution of p-hat. hist() also returns the histogram
# object, which is kept so the bar heights can be reused for the label below
# without computing the histogram a second time.
p_hat_hist <- hist(
  p_hat_values,
  main = "Sampling distribution of sample proportion",
  xlab = "sample_proportion",
  col = "lightblue",
  border = "white"
)
abline(v = mean_value, col = "red", lwd = 2, lty = 2)

# Label the mean line at the height of the tallest bar
text(
  mean_value,
  max(p_hat_hist$counts),
  labels = paste("Mean =", round(mean_value, 3)),
  pos = 4, col = "red", cex = 1.2
)

2.2 Sampling distribution from sample

Suppose you want to estimate the proportion of people who support solar energy expansion, but you don’t have access to the entire population. In this case, you could take a sample from the population and use the proportion of people in your sample who support solar energy expansion as your best guess for the unknown proportion in the overall population.

# Simulated population of 250 million entries: 88% "public", 12% "nopublic".
stu_size <- 250000000
public_use <- c(
  rep("public", 0.88 * stu_size),
  rep("nopublic", 0.12 * stu_size)
)

# Draw one sample of 1000 entries from the population (without replacement).
# This single sample stands in for the population in the simulation below.
samples <- sample(public_use, size = 1000)

# Repeat the resampling process 1000 times. Each repetition draws 500 of the
# 1000 sampled entries without replacement and records the proportion of
# "public". vapply() fills a pre-sized numeric vector instead of growing one
# with c() on every iteration.
# NOTE(review): if a bootstrap was intended here, the draw would need
# replace = TRUE -- confirm the intended resampling scheme.
p_hat_values <- vapply(
  seq_len(1000),
  function(i) {
    resampled <- sample(samples, size = 500)
    sum(resampled == "public") / 500
  },
  numeric(1)
)

# Mean of the simulated sample proportions
mean_value <- mean(p_hat_values)

# Plot the simulated distribution of p-hat. hist() also returns the histogram
# object, which is kept so the bar heights can be reused for the label below
# without computing the histogram a second time.
p_hat_hist <- hist(
  p_hat_values,
  main = "Simulated sample proportion",
  xlab = "sample_proportion",
  col = "blue",
  border = "white"
)

abline(v = mean_value, col = "red", lwd = 2, lty = 2)

# Label the mean line at the height of the tallest bar
text(
  mean_value,
  max(p_hat_hist$counts),
  labels = paste("Mean =", round(mean_value, 3)),
  pos = 4, col = "red", cex = 1.2
)

3. Code about graphs in slides

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.2.3

# Build a data frame holding two normal density curves (means 0 and 3,
# sd 1) evaluated on a shared grid of 1000 x values from -5 to 8.
x_grid <- seq(-5, 8, length.out = 1000)
data <- data.frame(
  x = x_grid,
  y1 = dnorm(x_grid, mean = 0, sd = 1),
  y2 = dnorm(x_grid, mean = 3, sd = 1)
)

# Plot both densities with a dashed vertical line at each mean.
# `linewidth` replaces `size` for line geoms, which ggplot2 deprecated in
# version 3.4.0.
ggplot(data, aes(x = x)) +
  geom_line(aes(y = y1, color = "Mean = 0"), linewidth = 1) +
  geom_line(aes(y = y2, color = "Mean = 3"), linewidth = 1) +
  geom_vline(
    xintercept = 0, linetype = "dashed", color = "blue", linewidth = 1
  ) +
  geom_vline(
    xintercept = 3, linetype = "dashed", color = "red", linewidth = 1
  ) +
  labs(
    title = "",
    x = "x",
    y = "Density"
  ) +
  scale_color_manual(values = c("Mean = 0" = "blue", "Mean = 3" = "red")) +
  theme_minimal()