This report is automatically generated with the R package knitr (version 1.40).

library(mgcv)       # gam()
library(tidyverse)  # dplyr, tidyr, purrr and tibble verbs plus the tidy-eval helpers (enquo, as_label, !!, :=) used below

predictionModels <- function(dataframe, .knownYcol=NULL, .knownXcol=NULL, knownX=NULL, preferExtrapModel=NULL){
  # dataframe         - an AqFishSamplePairing() object
  # .knownYcol        - bare (unquoted) name of the response column to model (y), e.g. Result
  # .knownXcol        - bare (unquoted) name of the predictor column (x), e.g. TLAvgLength
  # knownX            - x value at which to predict y (required)
  # preferExtrapModel - optional ModelName (e.g. "lm") to rank first when the prediction is extrapolated

  if(is.null(knownX)) stop("Please include a knownX value.")

  # .knownYcol <- quo(Result) - to test function line-wise
  KnownYcol <- enquo(.knownYcol)

  # .knownXcol <- quo(TLAvgLength) - to test function line-wise
  KnownXcol <- enquo(.knownXcol)

  # Number of unique samples (distinct x values), used in the dof correction below
  n <- nrow(dataframe %>% distinct(!!KnownXcol))


  # Standard Error of Regression (SER) is used to determine the best model.
  # SER is the standard deviation (a measure of dispersion) of the residuals around the regression line.
  # SER applies a degrees-of-freedom (dof) correction, which helps a simple model compete with a more complicated one:
  #   dof = number of unique samples (n) minus the number of coefficients estimated.
  # SER is also called the Residual Standard Error: summary(model)$sigma returns it and summary(model)$df[2] returns the dof.
  # RMSE is the standard deviation of the residuals without the dof correction.
  # Reference on how SER is calculated:
  # https://www.youtube.com/watch?v=SM96OglU7Gg
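  # Quick sanity check of the SER formula on hypothetical toy data (illustration only; run outside this function):
  #   fit <- lm(y ~ x, data = data.frame(x = 1:5, y = c(2.1, 3.9, 6.2, 8.1, 9.8)))
  #   sqrt(sum(residuals(fit)^2) / df.residual(fit))  # SER computed by hand
  #   summary(fit)$sigma                              # same value, reported as the Residual Standard Error
  #   sqrt(mean(residuals(fit)^2))                    # RMSE: no dof correction, so slightly smaller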

  # GAM: Generalized additive models with integrated smoothness estimation
  iterations <- 1:(min(4, n))  # cap the gam basis dimension k at 4 or at n (the number of unique samples), whichever is smaller
  df_gam <- NULL
  for(i in iterations){
    capture.output(    # wrap in capture.output() to suppress messages printed with cat()
      i_gam <- try(
        dataframe %>%
          nest(data=colnames(dataframe)) %>%  # data=colnames(dataframe) needed so nest() doesn't throw an error if dataframe is not grouped
          mutate(knownXcol = as_label(KnownXcol),
                 knownYcol = as_label(KnownYcol),
                 knownX    = knownX,
                 ModelName = paste0("gam_k=", i)) %>%
          mutate(Model = purrr::map(.x=data, .f = ~eval(parse(text=paste0("gam(", as_label(KnownYcol), " ~ s(", as_label(KnownXcol), ", k=", i, "), data=.)"))) )) %>%
          mutate(predictedY = purrr::map_dbl(.x=Model, .f = ~predict(.x, tibble(!!KnownXcol := knownX)))) %>%
          mutate(RMSE  = purrr::map_dbl(.x=Model, .f = ~sqrt(mean((predict(.x) - dataframe %>% pull(!!KnownYcol))^2)) )) %>%
          mutate(SER   = purrr::map_dbl(.x=Model, .f = ~sqrt( sum((predict(.x) - dataframe %>% pull(!!KnownYcol))^2) / (n - length(.x$coefficients)) ) ))
      )
    )
    if(inherits(i_gam, "try-error")) next

    df_gam <- rbind(df_gam, i_gam)
  }
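  # For example, with .knownYcol = Result, .knownXcol = TLAvgLength (the columns used in the test
  # comments above) and i = 3, the string built by paste0() parses to:
  #   gam(Result ~ s(TLAvgLength, k=3), data = .)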


  # NLS: Nonlinear (weighted) least-squares estimates of a nonlinear model
  capture.output(    # wrap in capture.output() to suppress messages printed with cat()
    df_nls <- try(
      dataframe %>%  # https://stackoverflow.com/a/18305916/7903538
        nest(data=colnames(dataframe)) %>%  # data=colnames(dataframe) needed so nest() doesn't throw an error if dataframe is not grouped
        mutate(knownXcol = as_label(KnownXcol),
               knownYcol = as_label(KnownYcol),
               knownX    = knownX,
               ModelName = "nls") %>%
        mutate(Model = purrr::map(.x=data, .f = ~eval(parse(text=paste0("nls(", as_label(KnownYcol), " ~ ", as_label(KnownXcol),"^b, start=c(b=1), algorithm='plinear', data=.)"))) )) %>%
        mutate(predictedY = purrr::map_dbl(.x=Model, .f = ~predict(.x, tibble(!!KnownXcol := knownX)))) %>%
        mutate(RMSE  = purrr::map_dbl(.x=Model, .f = ~sqrt(mean(summary(.x)$residuals^2)) )) %>%
        mutate(SER   = purrr::map_dbl(.x=Model, .f = ~sqrt( sum(summary(.x)$residuals^2) / (n - 2) ) )) # an nls object has no $coefficients element; the plinear fit y = m*x^b estimates 2 parameters (m & b), so use n - 2
    )
  )
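  # With the same example columns, the constructed call parses to:
  #   nls(Result ~ TLAvgLength^b, start = c(b=1), algorithm = 'plinear', data = .)
  # The 'plinear' algorithm estimates an extra linear coefficient m multiplying TLAvgLength^b,
  # so the fitted model is y = m * x^b, i.e. the 2 parameters counted in the SER above.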


  # LM: Fit a standard linear model y = ax + b
  df_lm <- dataframe %>%
    nest(data=colnames(dataframe)) %>%  # data=colnames(dataframe) needed so nest() doesn't throw an error if dataframe is not grouped
    mutate(knownXcol = as_label(KnownXcol),
           knownYcol = as_label(KnownYcol),
           knownX    = knownX,
           ModelName = "lm") %>%
    mutate(Model = purrr::map(.x=data, .f = ~eval(parse(text=paste0("lm(", as_label(KnownYcol), " ~ ", as_label(KnownXcol),", data=.)"))) )) %>%
    mutate(predictedY = purrr::map_dbl(.x=Model, .f = ~predict(.x, tibble(!!KnownXcol := knownX)))) %>%
    mutate(RMSE  = purrr::map_dbl(.x=Model, .f = ~sqrt(mean(summary(.x)$residuals^2)) )) %>%
    mutate(SER   = purrr::map_dbl(.x=Model, .f = ~sqrt( sum(summary(.x)$residuals^2) / (n - length(.x$coefficients)) ) ))


  # LM: Fit a Power function y = a * x^b as done by Excel
  # reference: https://stackoverflow.com/a/18308359
  df_pwr <- dataframe %>%   # https://stackoverflow.com/questions/18305852/power-regression-in-r-similar-to-excel
    nest(data=colnames(dataframe)) %>%  # data=colnames(dataframe) needed so nest() doesn't throw an error if dataframe is not grouped
    mutate(knownXcol = as_label(KnownXcol),
           knownYcol = as_label(KnownYcol),
           knownX    = knownX,
           ModelName = "pwr") %>%
    mutate(Model = purrr::map(.x=data, .f = ~eval(parse(text=paste0("lm(log(", as_label(KnownYcol), ") ~ log(", as_label(KnownXcol),"), data=.)"))) )) %>%
    mutate(predictedY = purrr::map_dbl(.x=Model, .f = ~exp(predict(.x, tibble(!!KnownXcol := knownX)))) ) %>%
    mutate(RMSE  = purrr::map_dbl(.x=Model, .f = ~sqrt(mean((exp(predict(.x)) - dataframe %>% pull(!!KnownYcol))^2)) )) %>%
    mutate(SER   = purrr::map_dbl(.x=Model, .f = ~sqrt( sum((exp(predict(.x)) - dataframe %>% pull(!!KnownYcol))^2) / (n - length(.x$coefficients)) ) ))
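  # The power fit works on the log-log scale, log(y) = log(a) + b*log(x), so a = exp(intercept) and
  # b = slope of the fitted lm; predictedY is back-transformed with exp(), and RMSE/SER are computed
  # against the original (untransformed) y values.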


  # LM: Fit a Logarithmic function y = a + b*ln(x) as done by Excel
  # reference: https://www.statology.org/logarithmic-regression-in-r/
  df_log <- dataframe %>%
    nest(data=colnames(dataframe)) %>%  # data=colnames(dataframe) needed so nest() doesn't throw an error if dataframe is not grouped
    mutate(knownXcol = as_label(KnownXcol),
           knownYcol = as_label(KnownYcol),
           knownX    = knownX,
           ModelName = "log") %>%
    mutate(Model = purrr::map(.x=data, .f = ~eval(parse(text=paste0("lm(", as_label(KnownYcol), " ~ log(", as_label(KnownXcol),"), data=.)"))) )) %>%
    mutate(predictedY = purrr::map_dbl(.x=Model, .f = ~predict(.x, tibble(!!KnownXcol := knownX)))) %>%
    mutate(RMSE  = purrr::map_dbl(.x=Model, .f = ~sqrt(mean(summary(.x)$residuals^2)) )) %>%
    mutate(SER   = purrr::map_dbl(.x=Model, .f = ~sqrt( sum(summary(.x)$residuals^2) / (n - length(.x$coefficients)) ) ))


  # LM: Fit an Exponential function y = a * b^x as done by Excel
  # reference: https://www.statology.org/exponential-regression-in-r/
  df_exp <- dataframe %>%
    nest(data=colnames(dataframe)) %>%  # data=colnames(dataframe) needed so nest() doesn't throw an error if dataframe is not grouped
    mutate(knownXcol = as_label(KnownXcol),
           knownYcol = as_label(KnownYcol),
           knownX    = knownX,
           ModelName = "exp") %>%
    mutate(Model = purrr::map(.x=data, .f = ~eval(parse(text=paste0("lm(log(", as_label(KnownYcol), ") ~", as_label(KnownXcol),", data=.)"))) )) %>%
    mutate(predictedY = purrr::map_dbl(.x=Model, .f = ~exp(predict(.x, tibble(!!KnownXcol := knownX)))) ) %>%
    mutate(RMSE  = purrr::map_dbl(.x=Model, .f = ~sqrt(mean((exp(predict(.x)) - dataframe %>% pull(!!KnownYcol))^2)) )) %>%
    mutate(SER   = purrr::map_dbl(.x=Model, .f = ~sqrt( sum((exp(predict(.x)) - dataframe %>% pull(!!KnownYcol))^2) / (n - length(.x$coefficients)) ) ))
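  # The exponential fit works on the log scale, log(y) = c + d*x, which back-transforms to
  # y = exp(c) * exp(d)^x, i.e. a = exp(c) and b = exp(d) in the comment above; as with the power
  # model, predictedY uses exp() and RMSE/SER are computed against the original y values.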


  dataXcol <- dataframe %>% select(!!KnownXcol)  # known x values, used below to flag the prediction as interpolated or extrapolated


  # Combine the model tables, dropping df_gam and/or df_nls if those fits failed
  if(is.null(df_gam) & inherits(df_nls, "try-error")) {
    predictionModels <- rbind(                df_lm, df_pwr, df_log, df_exp)  # exclude df_gam & df_nls
  }
  else if(is.null(df_gam)){
    predictionModels <- rbind(        df_nls, df_lm, df_pwr, df_log, df_exp)  # exclude df_gam
  }
  else if(inherits(df_nls, "try-error")){
    predictionModels <- rbind(df_gam,         df_lm, df_pwr, df_log, df_exp)  # exclude df_nls
  }
  else {
    predictionModels <- rbind(df_gam, df_nls, df_lm, df_pwr, df_log, df_exp)
  }

  predictionModels <- predictionModels %>%
    mutate(n = nrow(dataframe),
           PredictionType = case_when(knownX >= min(dataXcol) & knownX <= max(dataXcol) ~ "Interpolated",
                                      T ~ "Extrapolated"),
           ExtrapLength   = case_when(PredictionType == "Extrapolated" ~ min(abs(min(dataXcol)-knownX), abs(max(dataXcol)-knownX)),
                                      T ~ 0)
    ) %>%
    arrange(
      if(all(PredictionType == "Extrapolated") & !is.null(preferExtrapModel)){
        # if there is a preferred model when prediction is extrapolated, put that model first
        match(ModelName, c(preferExtrapModel, ModelName[ModelName != preferExtrapModel]))
      }else {
        # Otherwise, sort by SER (ascending, so the lowest SER comes first).
        # Make sure SER is a real number: when n = 2, SER can be NA or Inf; if so, sort by RMSE instead.
        if(all(is.finite(SER))) SER  else RMSE
      }
    )

  return(predictionModels)
}
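
# EXAMPLE: minimal sketch of calling predictionModels() on hypothetical toy data
# (the column names Result and TLAvgLength echo the test comments inside the function;
#  the values and the names toyPairing/toyModels are made up for illustration)
# toyPairing <- tibble(TLAvgLength = c(150, 200, 250, 300, 350),
#                      Result      = c(0.12, 0.18, 0.31, 0.45, 0.60))
# toyModels  <- predictionModels(toyPairing, .knownYcol = Result, .knownXcol = TLAvgLength, knownX = 275)
# toyModels %>% select(ModelName, predictedY, RMSE, SER, PredictionType)  # lowest-SER model is ranked first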

# EXAMPLE CODE TO USE THE BEST MODEL (LOWEST SER) FOR PREDICTION
# predictionModel_final <- predictionModels %>% 
#   filter(SER == min(SER))
# 
# model_predictOrig <- tibble(fish_Stdz350=seq(0.05,max(prediction2000_2019$fish_Stdz350), by=0.005))
# model_predictOrig <- model_predictOrig %>% 
#   mutate(aq_Samp=predict(predictionModel_final$Model[[1]], model_predictOrig %>% select(fish_Stdz350)))  

The R session information (including the OS info, R version and all packages used):

    sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22621)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8  LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] mgcv_1.8-41        nlme_3.1-160       lubridate_1.8.0    plotly_4.10.0     
##  [5] readxl_1.4.1       actuar_3.3-0       NADA_1.6-1.1       forcats_0.5.2     
##  [9] stringr_1.4.1      dplyr_1.0.9        purrr_0.3.4        readr_2.1.2       
## [13] tidyr_1.2.0        tibble_3.1.8       ggplot2_3.3.6      tidyverse_1.3.2   
## [17] fitdistrplus_1.1-8 survival_3.4-0     MASS_7.3-58.1     
## 
## loaded via a namespace (and not attached):
##  [1] lattice_0.20-45     assertthat_0.2.1    digest_0.6.29       utf8_1.2.2         
##  [5] R6_2.5.1            cellranger_1.1.0    backports_1.4.1     reprex_2.0.2       
##  [9] evaluate_0.16       highr_0.9           httr_1.4.4          pillar_1.8.1       
## [13] rlang_1.0.5         lazyeval_0.2.2      googlesheets4_1.0.1 rstudioapi_0.14    
## [17] data.table_1.14.2   Matrix_1.5-1        splines_4.2.2       googledrive_2.0.0  
## [21] htmlwidgets_1.5.4   munsell_0.5.0       broom_1.0.1         compiler_4.2.2     
## [25] modelr_0.1.9        xfun_0.32           pkgconfig_2.0.3     htmltools_0.5.3    
## [29] tidyselect_1.1.2    fansi_1.0.3         viridisLite_0.4.1   crayon_1.5.1       
## [33] tzdb_0.3.0          dbplyr_2.2.1        withr_2.5.0         grid_4.2.2         
## [37] jsonlite_1.8.0      gtable_0.3.1        lifecycle_1.0.1     DBI_1.1.3          
## [41] magrittr_2.0.3      scales_1.2.1        cli_3.3.0           stringi_1.7.8      
## [45] fs_1.5.2            xml2_1.3.3          ellipsis_0.3.2      generics_0.1.3     
## [49] vctrs_0.4.1         expint_0.1-7        tools_4.2.2         glue_1.6.2         
## [53] hms_1.1.2           fastmap_1.1.0       colorspace_2.0-3    gargle_1.2.0       
## [57] rvest_1.0.3         knitr_1.40          haven_2.5.1
    Sys.time()
## [1] "2023-12-26 09:47:26 PST"