Phenotypic Data Part I • geneticMapR

Intro

This script analyzes and prepares phenotype data for genetic mapping. Like any other phenotypic dataset there are some errors that we will go through and fix.

Notes:

IDs 5053–5060 are missing values in the Geno field; assign “Program-Round-2”.

The Geno value “P1-OG-Mcgregor” should be “OG-Mcgregor”—actual parent identity is irrelevant at this stage.

The ID column is numeric; append “-F2” to each entry (e.g., “1001-F2”) to match genotype file formatting.

Phenotype IDs match genotype entries, though the latter include extra well/plate info (e.g., “1001-F2-Alpha-A1”).

After visualization, remove Parents, F1s, and population 1633 to format the phenotype file for R/qtl.

Libraries

library(geneticMapR)
library(stringr)
library(dplyr)
library(ggdist)
library(ggplot2)
library(rlang)
library(ggpubr)
library(readxl)

Data

# Phenotype
url1 <- "https://raw.githubusercontent.com/vegaalfaro/geneticMapRFiles/refs/heads/main/phenotype/phenotypes-binary.csv"

# Download file
if (!file.exists("local_copy_phenotypes-binary.csv")) {
  download.file(url1, destfile = "local_copy_phenotypes-binary.csv")}

# Read
pheno <- read.csv("local_copy_phenotypes-binary.csv")

Feel free to explore the phenotype file. This file is more than we need because it contains data for 3 populations and we will be only working with one.

Curation

#--- Data curation ----

# 1. Populate correctly the Geno columns for IDs 5053:5060
pheno$Geno[pheno$ID %in% 5053:5060] <- "Program-Round2"

# 2. Rename parents OGMacgregor and Program Round1
pheno$Geno <- str_remove(pheno$Geno, "P1-|P2-")

# 3. Change the ID column
pheno$IDs <- paste(pheno$ID, pheno$Gen, sep = "-")


# Most important variables
vars <- names(pheno)[c(9, 10, 16, 22, 23, 24, 26)]
response_vars <- vars #make a copy

# Print variables we'll be keeping or working with
vars
#> [1] "biomass"            "max_width"          "width_50"          
#> [4] "length"             "length_width_ratio" "shoulder_top"      
#> [7] "tip_angle_top"

Data Exploration

We begin by calculating basic statistics. The Geno column indicates the group to which each individual belongs. There are three groups with 107 or more individuals, these correspond to the F₂ populations. In addition, the dataset includes entries for the parents and F_1s of these segregating F₂ populations.


# Means by the column genotype for one variable
means_LW <- pheno %>% 
  group_by(Geno) %>% 
  summarize(mean = mean(length_width_ratio),
            sd = sd(length_width_ratio),
            min = min(length_width_ratio),
            max = max(length_width_ratio),
            n = n()
            )
head(means_LW, n=10)
#> # A tibble: 10 × 6
#>    Geno                       mean     sd   min   max     n
#>    <chr>                     <dbl>  <dbl> <dbl> <dbl> <int>
#>  1 1592-ConicalxRound-F2     2.08  0.462   1.09  3.55   166
#>  2 1629-CylindricalxFlat-F2  1.52  0.407   0.79  2.56   107
#>  3 1633-RoundxCylindrical-F2 1.48  0.279   0.97  2.52   358
#>  4 Cylindra                  2.63  0.456   1.91  3.28    11
#>  5 F1-1592                   1.04  0.174   0.91  1.34     5
#>  6 F1-1633                   1.44  0.212   1.19  1.73     8
#>  7 Mono                      0.692 0.0897  0.59  0.88     8
#>  8 OG-Mcgregor               3.50  0.896   2.17  4.59     6
#>  9 Program-Round1            0.975 0.124   0.83  1.14     6
#> 10 Program-Round2            1.44  0.802   0.95  3.35     8

Multiple traits

If you have multiple traits, below is a loop that would help you to estimate the statistics for each trait and save it in a list.


# List of columns/traits to compute statistics
response_vars <- vars

# Create empty vector to store results 
response_var_stats <- list()

# Iterate for each column 
for (col in response_vars){
  # compute statistics 
  stats <- pheno %>%
    group_by(Geno) %>%
    summarize(across({{ col }}, list(
      mean = mean,
      sd = sd,
      min = min,
      max = max, 
      n = ~sum(!is.na(.))
    )),
    .groups = "drop")
  
  # Add to the results list
  response_var_stats[[col]] <- stats
  
}

print(response_var_stats)
#> $biomass
#> # A tibble: 10 × 6
#>    Geno                biomass_mean biomass_sd biomass_min biomass_max biomass_n
#>    <chr>                      <dbl>      <dbl>       <dbl>       <dbl>     <int>
#>  1 1592-ConicalxRound…        3648.      2573.        321.      14046.       166
#>  2 1629-CylindricalxF…        7140.      3633.       1119.      16989.       107
#>  3 1633-RoundxCylindr…        4440.      2830.        715.      15780.       358
#>  4 Cylindra                   6107.      2326.       2644.       9417.        11
#>  5 F1-1592                    4787.      2458.       2121.       8134.         5
#>  6 F1-1633                    7574.      4118.       2751.      14791.         8
#>  7 Mono                       4633.      1241.       2930.       6327.         8
#>  8 OG-Mcgregor                5112.      2290.       3041.       8550.         6
#>  9 Program-Round1             5498.      1004.       4489.       7215.         6
#> 10 Program-Round2             3763.      3337.       1356.      11839.         8
#> 
#> $max_width
#> # A tibble: 10 × 6
#>    Geno      max_width_mean max_width_sd max_width_min max_width_max max_width_n
#>    <chr>              <dbl>        <dbl>         <dbl>         <dbl>       <int>
#>  1 1592-Con…           54.4        19.6           17.4         119.          166
#>  2 1629-Cyl…           78.0        18.9           37.1         117.          107
#>  3 1633-Rou…           61.4        18.0           26.1         112.          358
#>  4 Cylindra            57.8        11.3           38.5          71.3          11
#>  5 F1-1592             78.8        23.2           49.8         107.            5
#>  6 F1-1633             81.1        17.5           53.2         107.            8
#>  7 Mono                92.4        14.5           70.0         112.            8
#>  8 OG-Mcgre…           47.7         7.69          37.4          56.7           6
#>  9 Program-…           86.3         9.55          73.2         103.            6
#> 10 Program-…           58.1        11.2           38.4          71.8           8
#> 
#> $width_50
#> # A tibble: 10 × 6
#>    Geno           width_50_mean width_50_sd width_50_min width_50_max width_50_n
#>    <chr>                  <dbl>       <dbl>        <dbl>        <dbl>      <int>
#>  1 1592-Conicalx…          29.6        14.9         7.31         73.5        166
#>  2 1629-Cylindri…          73.2        18.7        36.3         113.         107
#>  3 1633-RoundxCy…          54.1        16.3        20.5          98.0        358
#>  4 Cylindra                47.6        10.9        33.0          63.2         11
#>  5 F1-1592                 75.5        25.1        42.7         105.           5
#>  6 F1-1633                 74.9        18.2        44.1         103.           8
#>  7 Mono                    90.9        13.4        69.9         107.           8
#>  8 OG-Mcgregor             31.1        11.9        11.5          45.5          6
#>  9 Program-Round1          84.2        10.7        68.8         102.           6
#> 10 Program-Round2          51.7        13.4        24.5          65.8          8
#> 
#> $length
#> # A tibble: 10 × 6
#>    Geno                     length_mean length_sd length_min length_max length_n
#>    <chr>                          <dbl>     <dbl>      <dbl>      <dbl>    <int>
#>  1 1592-ConicalxRound-F2          112.      43.8        33.2      245.       166
#>  2 1629-CylindricalxFlat-F2       119.      42.2        38.7      228.       107
#>  3 1633-RoundxCylindrical-…        91.3     33.6        36.2      232.       358
#>  4 Cylindra                       150.      29.9        96.4      202.        11
#>  5 F1-1592                         79.4     15.7        63.5      100.         5
#>  6 F1-1633                        118.      37.4        81.9      175.         8
#>  7 Mono                            63.5      9.68       50.6       76.9        8
#>  8 OG-Mcgregor                    168.      53.8        98.4      252.         6
#>  9 Program-Round1                  83.2      8.02       72.6       94.9        6
#> 10 Program-Round2                  85.5     62.8        56.2      241.         8
#> 
#> $length_width_ratio
#> # A tibble: 10 × 6
#>    Geno      length_width_ratio_m…¹ length_width_ratio_sd length_width_ratio_min
#>    <chr>                      <dbl>                 <dbl>                  <dbl>
#>  1 1592-Con…                  2.08                 0.462                    1.09
#>  2 1629-Cyl…                  1.52                 0.407                    0.79
#>  3 1633-Rou…                  1.48                 0.279                    0.97
#>  4 Cylindra                   2.63                 0.456                    1.91
#>  5 F1-1592                    1.04                 0.174                    0.91
#>  6 F1-1633                    1.44                 0.212                    1.19
#>  7 Mono                       0.692                0.0897                   0.59
#>  8 OG-Mcgre…                  3.50                 0.896                    2.17
#>  9 Program-…                  0.975                0.124                    0.83
#> 10 Program-…                  1.44                 0.802                    0.95
#> # ℹ abbreviated name: ¹length_width_ratio_mean
#> # ℹ 2 more variables: length_width_ratio_max <dbl>, length_width_ratio_n <int>
#> 
#> $shoulder_top
#> # A tibble: 10 × 6
#>    Geno      shoulder_top_mean shoulder_top_sd shoulder_top_min shoulder_top_max
#>    <chr>                 <dbl>           <dbl>            <dbl>            <dbl>
#>  1 1592-Con…              86.7            86.6            0.665            651. 
#>  2 1629-Cyl…             185.            104.            22.6              530. 
#>  3 1633-Rou…             116.             80.5           10.9              614. 
#>  4 Cylindra              141.             68.9           58.2              275. 
#>  5 F1-1592               171.            102.            63.0              303. 
#>  6 F1-1633               218.            104.            93.5              380. 
#>  7 Mono                  116.             48.2           37.5              203. 
#>  8 OG-Mcgre…              35.6            33.1            0.886             92.5
#>  9 Program-…             162.             50.9           80.6              219. 
#> 10 Program-…              87.9            62.8           29.1              224. 
#> # ℹ 1 more variable: shoulder_top_n <int>
#> 
#> $tip_angle_top
#> # A tibble: 10 × 6
#>    Geno  tip_angle_top_mean tip_angle_top_sd tip_angle_top_min tip_angle_top_max
#>    <chr>              <dbl>            <dbl>             <dbl>             <dbl>
#>  1 1592…               19.1            12.9               0                 60.0
#>  2 1629…               44.1            15.6               2.84              74.4
#>  3 1633…               41.4            12.8               1.89              68.4
#>  4 Cyli…               28.2            16.6               8.52              60.7
#>  5 F1-1…               46.4            25.4               3.40              66.8
#>  6 F1-1…               32.5            16.1               3.66              52.4
#>  7 Mono                69.4             6.94             59.1               77.9
#>  8 OG-M…               19.0             9.91              1.05              29.9
#>  9 Prog…               58.1             6.96             47.0               65.9
#> 10 Prog…               42.8            21.0              11.3               63.6
#> # ℹ 1 more variable: tip_angle_top_n <int>

Subset

We are interested in population 2. There is more data here than we need so will subset it.

# Population 2
Pop2 <- pheno[pheno$Geno %in% c("1629-CylindricalxFlat-F2", 
                                "Cylindra", 
                                "F1-1629", 
                                "Mono"), ]

Parents data

We can also subset the data for the F₂parents. Including the parents in phenotypic evaluations is useful for comparison and helps interpret how they rank relative to the progeny of the segregating populations. It’s generally a good practice to include parental data when screening F₂ individuals or, at the very least, to collect some phenotypic data on the parents for reference.

#--- Subset parents only----

# Subset parents
all_parents <- pheno[pheno$Gen %in% c("P"),]

# Subset parents of population 1
parents <- pheno[pheno$Geno %in% c("OG-Mcgregor", 
                                   "Program-Round1" 
),]

Multiple populations

If you have multiple populations and traits combined in a single dataset, you can use code like the example below to compare them. In this case, I show data for all three populations and illustrate the approach using two traits as an example.


plots2 <- list()

for (col in response_vars) {
  plot <- ggplot(pheno %>% filter(Geno %in% c("1592-ConicalxRound-F2", 
                                              "1633-RoundxCylindrical-F2", 
                                              "1629-CylindricalxFlat-F2")), 
                 aes(x = Geno, 
                     y = !!sym(col), 
                     fill = Geno)) +
   
     geom_violin(alpha = 0.5, color = "#2C3E50") +  # Rain (density)
    geom_boxplot(width = 0.2, alpha = 0.7, color = "#2C3E50", outlier.shape = NA) +  # Cloud (boxplot)
    geom_jitter(width = 0.15, alpha = 0.6, size = 1.5, color = "#A680B8") +  # Drops (data points)
    labs(x = "Group", y = col, title = paste0("Raincloud Plot of ", col)) +
    theme_minimal(base_size = 14) +
    theme(
      plot.title = element_text(size = 13, face = "bold"),
      axis.text.x = element_text(size = 12, color = "#34495E"), 
      axis.text.y = element_text(size = 12, color = "#34495E"), 
      axis.title.x = element_text(size = 12.5, face = "bold"), 
      axis.title.y = element_text(size = 12.5, face = "bold"),
      panel.background = element_rect(fill = "#FAF3E0", color = NA), # Light beige background
      panel.grid.major = element_line(color = "#D5DBDB", size = 0.5),
      panel.grid.minor = element_blank(),
      legend.position = "none"
    ) + coord_flip()
  
  plots2[[col]] <- plot
}

print(plots2[vars][1:2])   # Run this
#> $biomass

#> 
#> $max_width

Publication ready plots for parent differences

In my previous linkage mapping projects, I’ve found it useful to highlight differences among the parents (see, for example, this publication: https://doi.org/10.1093/g3journal/jkae041). In this case the parents are Cylindra and Mono, two table beet varieties Below, I demonstrate how to create a high-quality, publication-ready plot that shows the differences among the parental types.

This code is designed to work for multiple traits but it would work for only one variable with minor refactoring. I show only 2 traits for example purposes.

# Population 2
response_vars <- vars

# Y-Axis labels

ylab <- c(expression(Biomass ~ (mm ^ 2)),
          "Maximum width (mm)",
          "Width 50 (mm)",
          "Length (mm)",
          "Length to width ratio (mm)",
          expression("Shoulder area" ~ (mm ^ 2)),
          "Tip angle (degrees)")

# Subset parents of population 2
parents2 <- pheno[pheno$Geno %in% c("Mono", 
                                   "Cylindra" 
),]


plots_pa2 <- list()  # Initialize the list

for (i in seq_along(vars)) {
  col <- vars[i]  # Extract column name
  
  plot <- ggplot(parents2,
                 aes(x = Geno, 
                     y = !!sym(col), 
                     fill = Geno)) +
    # Add half-eye distribution
    ggdist::stat_halfeye(
      adjust = 0.85,
      alpha = 0.85,
      width = 0.6,
      .width = 0,
      justification = -0.2,
      point_colour = NA
    ) +
    # Add boxplot
    geom_boxplot(
      width = 0.15,
      alpha = 0.4, 
      outlier.shape = NA,
      show.legend = FALSE
    ) +
    # Add points
    geom_point(
      aes(color = factor(Geno)),
      size = 1.5,
      alpha = 0.5, 
      position = position_jitter(seed = 1, width = 0.1),
      show.legend = FALSE # Remove legend for points
    ) +
    # Labels and aesthetics
    labs(x = "", 
         y = ylab[i], # Correctly indexing ylab
         title = "Population 2"
    ) +
    theme_minimal(base_size = 14) +
    theme(
      plot.title = element_text(size = 13),
      axis.text.x = element_text(size = 12, color = "#34495E"), 
      axis.text.y = element_text(size = 12, color = "#34495E"), 
      panel.background = element_rect(fill = "#FAF3E0", color = NA), #  background
      panel.grid.major = element_line(color = "#D5DBDB", size = 0.1),
      panel.grid.minor = element_blank(),
      legend.position = "none"
    ) + 
    coord_flip()
  
  plots_pa2[[col]] <- plot  # Store plot in the list
}

# Check stored plots
plots_pa2[1:2] # Run this
#> $biomass

#> 
#> $max_width