Our whole goal here is to make some beautiful plots, so we may as well dive into the ggplot universe.
library(ggplot2)
library(ggridges)
Let's go ahead and find the files that we are working with.
# Gather the path of every file in the current directory whose name ends in ".csv"
file_list <- list.files("./", pattern = "\\.csv$", full.names = TRUE)
# Empty container; the per-file data frames are collected in here
all_data <- vector("list", 0)
Let's write a loop that will work its way through each file.
# Loop through each file
# For every path in `file_list`: read the CSV, pull out its `haploid` column,
# label each value with the file's base name (extension removed) and append the
# resulting two-column data frame to `all_data`. Files that lack a `haploid`
# column are reported on the console and skipped.
for (file in file_list) {
  df <- read.csv(file)
  if (!"haploid" %in% names(df)) {
    cat("Missing 'haploid' column in:", basename(file), "\n")
    next
  }
  file_name <- tools::file_path_sans_ext(basename(file))
  # Repeat the label once per row explicitly. data.frame() will not recycle a
  # length-1 column against a zero-length one, so a CSV that has the header but
  # no data rows would otherwise stop the whole loop with
  # "arguments imply differing number of rows: 1, 0".
  temp_df <- data.frame(
    file = rep(file_name, nrow(df)),
    haploid = df[["haploid"]]
  )
  all_data[[length(all_data) + 1]] <- temp_df
}
Now we need to get our data in order: stack everything into one table and keep only the rows whose haploid value is numeric.
# Stack the per-file data frames into one long table with columns `file` and
# `haploid`, then keep only the rows whose `haploid` value is a valid number.
if (length(all_data) == 0) {
  # do.call(rbind, list()) returns NULL, which would fail further down with an
  # unhelpful "incorrect number of dimensions" error.
  stop("No CSV file with a 'haploid' column was loaded.", call. = FALSE)
}
final_df <- do.call(rbind, all_data)
haploid_raw <- final_df$haploid
if (is.factor(haploid_raw)) {
  # as.numeric() on a factor returns the integer level codes, not the labels
  haploid_raw <- as.character(haploid_raw)
}
# Convert once. Entries that cannot be parsed become NA, which is exactly how
# non-numeric rows are detected, so the "NAs introduced by coercion" warning is
# expected and silenced for this single call only.
haploid_num <- suppressWarnings(as.numeric(haploid_raw))
# TRUE for rows to keep (numeric and non-missing haploid)
keep <- !is.na(haploid_num)
if (any(!keep)) {
  message(
    "Dropped ", sum(!keep),
    " row(s) with a missing or non-numeric 'haploid' value."
  )
}
final_df <- final_df[keep, ]
final_df$haploid <- haploid_num[keep]
These are all bad examples, but you could use them as starting points.
# 1) Boxplot + jitter (good for comparing distributions across files)
# Files are ordered by their median haploid value; coord_flip() places the file
# names on the vertical axis so long names stay readable.
ggplot(final_df, aes(x = reorder(file, haploid, median, na.rm = TRUE), y = haploid)) +
  # Boxplot outliers are hidden because the jitter layer already draws every point
  geom_boxplot(outlier.shape = NA) +
  # height = 0 spreads the points along the file axis only. When height is
  # omitted, geom_jitter() also adds random noise to the haploid values, so the
  # dots would no longer sit at their measured positions.
  geom_jitter(width = 0.15, height = 0, alpha = 0.5, size = 1) +
  coord_flip() +
  labs(x = "file", y = "haploid", title = "Boxplot + jitter by file") +
  theme_minimal()
# 2) Violin + jitter (shows shape with individual points)
# Files are ordered by their median haploid value; coord_flip() places the file
# names on the vertical axis.
ggplot(final_df, aes(x = reorder(file, haploid, median, na.rm = TRUE), y = haploid)) +
  # trim = FALSE lets the violin tails extend past the observed data range
  geom_violin(trim = FALSE) +
  # height = 0 spreads the points along the file axis only. When height is
  # omitted, geom_jitter() also adds random noise to the haploid values, so the
  # dots would no longer sit at their measured positions.
  geom_jitter(width = 0.15, height = 0, alpha = 0.5, size = 1) +
  coord_flip() +
  labs(x = "file", y = "haploid", title = "Violin + jitter by file") +
  theme_minimal()
# 3) Ridgeline densities (compact comparison of distributions)
# One density ridge per file, stacked along the y axis in order of each file's
# median haploid value.
ggplot(data = final_df) +
  aes(
    x = haploid,
    y = reorder(file, haploid, median, na.rm = TRUE),
    height = after_stat(density)
  ) +
  geom_density_ridges(alpha = 0.8, scale = 1, rel_min_height = 0.01) +
  theme_minimal() +
  xlab("haploid") +
  ylab("file") +
  ggtitle("Ridgeline density by file")
## Picking joint bandwidth of 0.803
# 4) Faceted histograms (each file gets its own panel; handles uneven sample sizes)
# 30-bin histogram of haploid for every file; the count axis is scaled
# independently in each panel.
ggplot(data = final_df, mapping = aes(x = haploid)) +
  geom_histogram(bins = 30) +
  facet_wrap(vars(file), scales = "free_y") +
  xlab("haploid") +
  ylab("count") +
  ggtitle("Histogram per file (faceted)") +
  theme_minimal()
Phylogeny? Ordering of data? Transformations? Colors? Cartoons? What else?