Reading files into R

Let's say you have a bunch of data files you want to read into one data frame. The first step is to find the files and set up a place to store their contents.

# Collect the path of every file in the current directory whose name
# ends in ".csv"
file_list <- list.files(
  path = "./",
  pattern = "\\.csv$",
  full.names = TRUE
)

# Start with an empty list; each file's data is appended to it later
all_data <- vector(mode = "list", length = 0)

Reading the data

Let's write a loop that works its way through each file.

# Loop through each file, keeping only its 'haploid' column and tagging
# every row with the name of the file it came from
for (file in file_list) {
  df <- read.csv(file)
  # Skip (and report) any file that lacks the column we need
  if (!"haploid" %in% names(df)) {
    cat("Missing 'haploid' column in:", basename(file), "\n")
    next
  }
  # Label rows with the file name minus its directory and extension
  file_name <- tools::file_path_sans_ext(basename(file))
  # Repeat the label once per row: a file with a header but no data rows
  # then yields an empty data frame instead of a data.frame() error
  # ("arguments imply differing number of rows")
  temp_df <- data.frame(
    file = rep(file_name, nrow(df)),
    haploid = df$haploid
  )
  all_data[[length(all_data) + 1]] <- temp_df
}

Data Munging

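Now we need to get our data in order: combine everything into one data frame and keep only the rows with a usable numeric haploid value.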

# Without this guard, an empty list makes do.call(rbind, ...) return NULL
# and the lines below would quietly build a list instead of a data frame
if (length(all_data) == 0) {
  stop(
    "No data to combine: no CSV file with a 'haploid' column was read",
    call. = FALSE
  )
}

# Stack the per-file data frames into one long data frame
final_df <- do.call(rbind, all_data)

# Convert once; entries that are not valid numbers become NA, and R warns
# about that
haploid_num <- as.numeric(final_df$haploid)
## Warning: NAs introduced by coercion

# TRUE for the rows to keep, i.e. those whose value converted cleanly
keep <- !is.na(haploid_num)
final_df <- final_df[keep, ]
final_df$haploid <- haploid_num[keep]

Plotting

These are all rough examples, but you could use them as starting points.

# 1) Boxplot + jitter (good for comparing distributions across files)
# Files are ordered by their median haploid value. The boxplot's own outlier
# points are hidden because the jitter layer already draws every observation.
ggplot(
  data = final_df,
  mapping = aes(
    x = reorder(file, haploid, median, na.rm = TRUE),
    y = haploid
  )
) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(size = 1, alpha = 0.5, width = 0.15) +
  labs(title = "Boxplot + jitter by file", x = "file", y = "haploid") +
  coord_flip() +
  theme_minimal()
## `geom_boxplot()` removes missing values
Boxplot with jitter of haploid numbers by file
# 2) Density plot (good for looking at distribution shape)
# One semi-transparent density curve per file, drawn on shared axes.
ggplot(data = final_df, mapping = aes(x = haploid, fill = file)) +
  geom_density(alpha = 0.6) +
  theme_minimal() +
  labs(
    title = "Density plot by file",
    x = "haploid",
    y = "Density",
    fill = "File"
  )
## Warning: Groups with fewer than two observations have been dropped
Density plot of haploid numbers by file
# 3) Violin plot (like a box plot but shows distribution)
# Files are ordered by their median haploid value; the individual
# observations are overlaid as jittered points.
ggplot(
  data = final_df,
  mapping = aes(
    x = reorder(file, haploid, median, na.rm = TRUE),
    y = haploid
  )
) +
  geom_violin() +
  geom_jitter(size = 1, alpha = 0.5, width = 0.1) +
  labs(title = "Violin plot by file", x = "file", y = "haploid") +
  coord_flip() +
  theme_minimal()
## `geom_violin()` removes missing values
Violin plot of haploid numbers by file
# 4) Histogram faceted by file
# The bar colour is a fixed value, so it is set on the layer itself rather
# than inside aes(). Inside aes() the string would be treated as data
# (a one-level category) and the bars would not be drawn in "#FF6B6B".
ggplot(final_df, aes(x = haploid)) +
  geom_histogram(bins = 20, fill = "#FF6B6B") +
  facet_wrap(~ file) +
  labs(x = "haploid", y = "Count", title = "Histogram by file") +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Histogram of haploid numbers faceted by file
# 5) Cumulative distribution
# One empirical cumulative distribution curve per file, told apart by
# line colour.
ggplot(data = final_df, mapping = aes(x = haploid, color = file)) +
  stat_ecdf() +
  theme_minimal() +
  labs(
    title = "Cumulative distribution by file",
    x = "haploid",
    y = "Cumulative probability",
    color = "File"
  )
Cumulative distribution of haploid numbers by file
# 6) Ridge plot using ggridges (good for looking at distributions across groups)
# One density ridge per file; the fill gradient follows the x (haploid) value.
ggplot(
  data = final_df,
  mapping = aes(x = haploid, y = file, fill = after_stat(x))
) +
  ggridges::geom_density_ridges_gradient() +
  # theme_minimal() must come before theme() so the tweak is not overwritten
  theme_minimal() +
  theme(axis.title.y = element_blank()) +
  labs(title = "Ridge plot by file", x = "haploid", fill = "haploid")
Ridge plot of haploid numbers by file

Visualizations

Here is a summary table of the data

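| file   | min  | max  | mean | median | sd   |
|--------|------|------|------|--------|------|
| chr10L | 0.02 | 0.99 | 0.52 | 0.52   | 0.28 |
| chr10R | 0.02 | 0.99 | 0.52 | 0.52   | 0.28 |
| chr2L  | 0.01 | 1.00 | 0.48 | 0.47   | 0.30 |
| chr2R  | 0.01 | 1.00 | 0.50 | 0.51   | 0.29 |
| chr3L  | 0.01 | 1.00 | 0.48 | 0.48   | 0.30 |
| chr3R  | 0.01 | 1.00 | 0.49 | 0.50   | 0.29 |
| chrX   | 0.02 | 0.99 | 0.48 | 0.47   | 0.30 |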

Final thoughts

The key here is that you can write a loop to do things to multiple files. The structure I showed above is useful for reading in a bunch of files, filtering data, merging data, and preprocessing. These are foundational skills for data wrangling in R.