if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")
if (!requireNamespace("corrplot", quietly = TRUE)) install.packages("corrplot")
# Load packages
library(ggplot2)
library(corrplot)
corrplot 0.92 loaded
# Simulate two related variables: y is x plus an independent standard-normal
# draw. The fixed seed makes the draws reproducible.
set.seed(123)
n <- 100
x <- rnorm(n)
noise <- rnorm(n)
y <- x + noise
# Sample correlation between the two simulated variables
correlation_coeff <- cor(x, y)
# Scatterplot of the simulated pairs with a fitted straight line (no
# confidence band) and the rounded correlation coefficient as a text label.
library(ggplot2)
df <- data.frame(x = x, y = y)
coeff_label <- paste("Correlation coefficient:", round(correlation_coeff, 2))
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Scatterplot showing correlation", x = "X", y = "Y") +
  annotate("text", x = 1, y = 4, label = coeff_label)
`geom_smooth()` using formula = 'y ~ x'
# Load the built-in mtcars data set and preview its first six rows
data(mtcars)
head(mtcars, n = 6)
| | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb |
|---|---|---|---|---|---|---|---|---|---|---|---|
| | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> |
| Mazda RX4 | 21.0 | 6 | 160 | 110 | 3.90 | 2.620 | 16.46 | 0 | 1 | 4 | 4 |
| Mazda RX4 Wag | 21.0 | 6 | 160 | 110 | 3.90 | 2.875 | 17.02 | 0 | 1 | 4 | 4 |
| Datsun 710 | 22.8 | 4 | 108 | 93 | 3.85 | 2.320 | 18.61 | 1 | 1 | 4 | 1 |
| Hornet 4 Drive | 21.4 | 6 | 258 | 110 | 3.08 | 3.215 | 19.44 | 1 | 0 | 3 | 1 |
| Hornet Sportabout | 18.7 | 8 | 360 | 175 | 3.15 | 3.440 | 17.02 | 0 | 0 | 3 | 2 |
| Valiant | 18.1 | 6 | 225 | 105 | 2.76 | 3.460 | 20.22 | 1 | 0 | 3 | 1 |
# Correlation matrix of every pair of mtcars columns (cor() computes Pearson
# correlations unless another method is requested). Computed once and reused
# by both plots below.
cor_matrix <- cor(mtcars)

# Basic view: one circle per pair of variables.
corrplot(cor_matrix, method = "circle")

# Annotated view of the same matrix: upper triangle only, variables reordered
# by hierarchical clustering, coefficients printed over the circles.
corrplot(
  cor_matrix,
  method = "circle",
  type = "upper",
  order = "hclust",
  addCoef.col = "black",          # color of the correlation coefficients
  tl.col = "black", tl.srt = 45,  # color and rotation of the labels
  title = "Correlation Matrix of Mtcars Dataset",
  # corrplot's margins default to zero, which leaves no room for the title
  # and clips it; reserve two lines at the top.
  mar = c(0, 0, 2, 0)
)
# Same plot as above, but built from Spearman (rank-based) correlations.
# Note that this overwrites cor_matrix.
cor_matrix <- cor(mtcars, method = "spearman")
corrplot(
  cor_matrix,
  method = "circle",
  type = "upper",
  order = "hclust",
  addCoef.col = "black",          # color of the correlation coefficients
  tl.col = "black", tl.srt = 45,  # color and rotation of the labels
  title = "Correlation Matrix of Mtcars Dataset",
  # corrplot's margins default to zero, which leaves no room for the title
  # and clips it; reserve two lines at the top.
  mar = c(0, 0, 2, 0)
)
# mpg plotted against wt for the mtcars data, with a fitted straight line
# drawn in blue on top of the points.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "MPG vs. Weight") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'
#' Simulate one sample correlation for a given target correlation
#'
#' Draws `n` standard-normal values `x`, builds
#' `y = rho * x + sqrt(1 - rho^2) * e` from a second, independent
#' standard-normal draw `e`, and returns the sample correlation of `x` and `y`.
#'
#' @param n Number of observations to draw; passed to `rnorm()`.
#' @param rho Target correlation used to construct `y`. Must be a single,
#'   non-missing number in `[-1, 1]`. Defaults to 0.5.
#' @return The value of `cor(x, y)` for the simulated sample.
simulate_correlation <- function(n, rho = 0.5) {
  # Outside [-1, 1], sqrt(1 - rho^2) is NaN and the function would return NaN
  # with nothing more than a warning; fail with a clear message instead.
  if (!is.numeric(rho) || length(rho) != 1L || is.na(rho) || abs(rho) > 1) {
    stop("`rho` must be a single number in [-1, 1].", call. = FALSE)
  }
  # Draw x first, then the noise, so results for a given seed match the
  # previous implementation.
  x <- rnorm(n)
  noise <- rnorm(n)
  y <- rho * x + sqrt(1 - rho^2) * noise
  cor(x, y)
}
# Sample sizes to simulate: 10, 20, ..., 1000
sample_sizes <- seq(10, 1000, by = 10)

# One simulated correlation per sample size, with rho left at the default of
# simulate_correlation(). vapply() guarantees a numeric vector with one value
# per sample size, whereas sapply() would quietly return a different type if
# any call produced something unexpected.
set.seed(123)
correlations <- vapply(sample_sizes, simulate_correlation, numeric(1))

# Pair each sample size with its estimate for plotting
data_for_plot <- data.frame(
  sample_size = sample_sizes,
  correlation = correlations
)
# Estimated correlation by sample size, with a dashed reference line at 0.5
# (the value the label below calls the true correlation).
ggplot(data_for_plot, aes(x = sample_size, y = correlation)) +
  geom_line() +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
  labs(
    title = "Effect of Sample Size on Correlation",
    x = "Sample Size",
    y = "Estimated Correlation Coefficient"
  ) +
  theme_minimal() +
  # The label is anchored at y = 0.5, the same height as the dashed line;
  # vjust lifts it above the line so the line does not run through the text.
  annotate(
    "text", x = 500, y = 0.5, label = "True Correlation (0.5)",
    hjust = 0, vjust = -0.5, color = "red"
  )
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ tibble  3.2.1     ✔ dplyr   1.1.1
✔ tidyr   1.3.0     ✔ stringr 1.5.0
✔ readr   2.1.3     ✔ forcats 0.5.2
✔ purrr   1.0.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
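Imagine we have a dataset from a study examining the effect of a new educational program on student performance, where `treatment` indicates whether a student took part in the program (1) or not (0), `pre_test` is the score before the program, and `post_test` is the score after it: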
# Simulate the study: a 50/50 random assignment to treatment (1) or control
# (0) and a pre-test score for each of the n students. post_test starts as NA
# and is filled in per group below.
set.seed(123)
n <- 100
assignment <- sample(c(0, 1), size = n, replace = TRUE, prob = c(0.5, 0.5))
baseline <- rnorm(n, mean = 75, sd = 10)
edu_data <- data.frame(
  treatment = assignment,
  pre_test = baseline,
  post_test = NA
)

# Post-test = pre-test + a normally distributed gain. Treated students get a
# mean gain of 5 (the assumed positive effect); controls get a mean gain of 0.
# The treated gains are drawn before the control gains.
treated <- edu_data$treatment == 1
control <- edu_data$treatment == 0
gain_treated <- rnorm(sum(treated), mean = 5, sd = 5)
gain_control <- rnorm(sum(control), mean = 0, sd = 5)
edu_data$post_test[treated] <- edu_data$pre_test[treated] + gain_treated
edu_data$post_test[control] <- edu_data$pre_test[control] + gain_control

head(edu_data)
| | treatment | pre_test | post_test |
|---|---|---|---|
| | <dbl> | <dbl> | <dbl> |
| 1 | 1 | 77.53319 | 86.47188 |
| 2 | 0 | 74.71453 | 77.43050 |
| 3 | 1 | 74.57130 | 83.41651 |
| 4 | 0 | 88.68602 | 86.61432 |
| 5 | 0 | 72.74229 | 70.36106 |
| 6 | 1 | 90.16471 | 96.82572 |
# Mean change from pre-test to post-test within each treatment group
avg_improvement <- edu_data %>%
  group_by(treatment) %>%
  summarise(mean_improvement = mean(post_test - pre_test))
print(avg_improvement)
# A tibble: 2 × 2
  treatment mean_improvement
      <dbl>            <dbl>
1         0           -0.391
2         1            5.49
# Boxplots of the pre-to-post change in score, one box per treatment group
ggplot(
  data = edu_data,
  mapping = aes(
    x = factor(treatment),
    y = post_test - pre_test,
    fill = factor(treatment)
  )
) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Effect of Educational Program on Test Score Improvement",
    x = "Treatment Group",
    y = "Improvement in Test Scores",
    fill = "Group"
  )
# Two-sample t-test comparing mean post_test between the treatment == 0 and
# treatment == 1 groups. var.equal is left at its default, so this is the
# Welch (unequal-variance) version. Note that it compares raw post-test
# scores, not the post - pre improvement summarised above.
t.test(post_test ~ treatment, data = edu_data)
Welch Two Sample t-test
data: post_test by treatment
t = -4.0852, df = 97.985, p-value = 9.011e-05
alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
95 percent confidence interval:
-13.50354 -4.67358
sample estimates:
mean in group 0 mean in group 1
72.37132 81.45988
Note: The slides used in this class can be found below.