This report is automatically generated with the R package knitr (version 1.40) .

## Random Sampling Function Script File ##

# library(tidyverse)   # functions to work with tibbles
# library(lubridate)   # makes working with dates easier

source(paste0(rstudioapi::getActiveProject(),"/R Functions/function_predictionModels.R"))

# Secondary functions used in primary function
# Allows removing NAs from sample & sampling from a single value to get to the same value (as expected); base sample() has unpredictable behavior
sampleFrom <- function(x, size=length(x), na.rm=F, ...) {
  if (length(x) == 1L) {
    res <- rep(x, size)
  } else if(na.rm==T){
    res <- sample(x[!is.na(x)], size=size, ...)
  } else {
    res <- sample(x, size=size, ...)
  }
  res
}

geomean = function(x, na.rm=TRUE){
  exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
}


# Primary function
randomSampleLinkage <- function(aqData, fishData, fishgoal=fishgoal, iterations=1000, bylocationColumn=NULL, stdzFishLength=350, extrapAllowance=0, minFishSampleSize=8, aqPeriod=5, aqSampPerYr=12){

  # FUNCTION VARIABLES
  # aqData            - dataframe of Aq   data containing Result, SampleDate, waterYear
  # fishData          - dataframe of Fish data containing Result, TLAvgLength, SampleDate
  # iterations        - number of iterations to perform the bootstrap
  # bylocationColumn  - column of location names. LOcation must be shared between Fish & Aq to be paired
  # stdzFishLength    - length in mm to determine Standardized Fish [MeHg]
  # extrapAllowance   - additional length in mm allowed to standardize when random sampling bass
  # minFishSampleSize - this value minus 1 is randomly sampled for use in regression to find Std Length Hg Conc. (if a fish sample event has less than supplied value, that event will not be included in the linkage analysis)
  # aqPeriod       - number years of aq data to pair with fish, prior to fish sampling event
  # aqSampPerYr     - number of aq samples per year that would be used for compliance determination (e.g., 12 correlates to monthly sampling)



  # bylocationColumn <- quo(Subarea) # - to test function line-wise
  locationColumn <- enquo(bylocationColumn)

  # FISH DATA PREP
  fishPool <- fishData %>%
    mutate(Year = year(SampleDate)) %>%
    group_by(!!locationColumn, Year) %>%
    mutate(n = n()) %>%
    filter(n >= minFishSampleSize) %>% # Remove sample years that have less data than min fish required
    ungroup()


  # AQ DATA PREP
  # Water Year starts Nov to Oct 
  aqPool <- aqData %>%
    group_by(!!locationColumn, waterYear) %>%
    mutate(n = n()) %>%
    ungroup


  # START PAIRING
  Locations <- fishPool %>% distinct(!!locationColumn) %>% pull

  allLinkData <- NULL
  aqMeHgSafeConcs <- NULL
  progressBar <- txtProgressBar(1, iterations, style = 3)

  for (iter in 1:iterations){
    # Shows iteration progress
    setTxtProgressBar(progressBar, iter)

    linkData <- NULL
    for(location in Locations){
      # location <- Locations[1]   # - to test function line-wise

      # Subset Fish Data by Location
      fishLocation <- fishPool%>%
        filter(!!locationColumn == location)

      # Data Check that fish data exists for subarea - if not go to next subarea
      Years <- fishLocation %>% distinct(Year) %>% pull
      if(length(Years) == 0) {next}

      # Randomly sample data for each available fish sampling year & corresponding aq data years
      subareaDataYrs <- NULL
      for(yr in Years){
        # yr <- Years[1]   # - to test function line-wise

        ## Gather Aq Samples ##

        # filter for aq data location & "aqPeriod" years before fish sampling event
        sub_aqPool <- aqPool %>%
          filter(!!locationColumn == location,
                 waterYear<=yr & waterYear>(yr-aqPeriod)) # Includes year both fish & Aq are sampled plus aqPeriod-1 previous years of Aq data

        # Data Check that Aq data exists for subarea - if not go to next year
        if(nrow(sub_aqPool) == 0) next


        # Randomly Sample Aq data by wet and dry period
        aqRandDry <- sub_aqPool %>%
          filter(grepl('dry', Season, ignore.case=T))
        if(nrow(aqRandDry) > 0){
          aqRandDry <- aqRandDry %>%
            slice_sample(n=aqSampPerYr * aqPeriod * 5/12, replace=T)    # randomly sample 5/12 of Aq data from dry (5 months in year May 1 - Sept 30)
        }

        aqRandWet <- sub_aqPool %>%
          filter(grepl('wet', Season, ignore.case=T))
        if(nrow(aqRandWet) > 0){
          aqRandWet <- aqRandWet %>%
            slice_sample(n=aqSampPerYr * aqPeriod * 7/12, replace=T)    # randomly sample 7/12 of Aq data from wet (7 months in year Oct 1 - Apr 30)
        }

        aqRandSamp <- rbind(aqRandDry, aqRandWet)


        # Calculate Pooled Median
        aqResult <- aqRandSamp %>%
          summarize(Value = median(Result),
                    n = n())


        # Get Aq Water Years
        waterYrs <- paste(aqRandSamp %>% distinct(waterYear) %>% pull %>% sort, collapse=", ") # Collapse Aq Sample Years for storage in dataframe



        # Find fish sample size for specified location & year 
        fishPool_n <- fishLocation %>%
          filter(Year == yr) %>%
          head(1) %>%
          pull(n)

        # Random Sample fish (do not sample with replacement if standardizing to length; repeating values may force extrapolation and/or lead to unlikely regression models (when random sampling pop., not likely to reproduce same fish length & [Hg])
        extrapTest <- T
        tries <- 10 # number of tries to find random samples that meet extrapAllowance criteria
        n <- 0
        while(extrapTest == T & n < tries){
          fishRandSamp <- fishLocation %>%
            filter(Year == yr) %>%
            # sample_n(size=sampleFrom((minFishSampleSize-1):fishPool_n, 1), replace=F)  # sample without replacement; sample 1 minus minFishSampleSize up to fishPool_n to create variation in sampled results 
            slice_sample(n=sampleFrom((minFishSampleSize-1):fishPool_n, 1), replace=F) # First, randomly select sample size from (minFishSampleSize-1) to fishPool_n (to increase variation of sampled results), then randomly sample that number of rows without replacement


          extrapTest <- fishRandSamp %>%
            summarize(Extrapolated = case_when(min(TLAvgLength) <= stdzFishLength+extrapAllowance &  max(TLAvgLength) >=  stdzFishLength-extrapAllowance ~ F, # Allows a certain amount of extrapolation as specified by extrapAllowance  
                                            T ~ T)) %>%
            pull(Extrapolated)

          n <- n + 1
        }
      if(n >= tries) next

        # Calc Standardized Fish [MeHg]
        FishConc  <- predictionModels(dataframe=fishRandSamp, .knownYcol=Result, .knownXcol=TLAvgLength, knownX=stdzFishLength, preferExtrapModel='log') %>%
          rename(stdLengthConc = predictedY,
                 n = n) %>%
          filter(!grepl('gam', ModelName, ignore.case=T)) %>%   # remove 'gam' because regression model can increase then decrease as length goes up, which is unlikely
          head(n=1) %>%
          ungroup


        # Join Fish and Aq Results for each year in subarea - add 'FishModel=FishConc$Model,' to tibble to keep each fish model in function return
        subareaDataYr <- tibble(Iteration=iter, Location=location,
                         FishYr=yr,           FishStdConc=FishConc$stdLengthConc, FishPool_n=FishConc$n, FishConcExtrapolated=FishConc$Extrapolated,
                         AqWaterYrs=waterYrs, AqConc=median(aqResult$Value), AqPool_n=aqResult %>% pull(n) %>% sum)

        subareaDataYrs <- rbind(subareaDataYrs, subareaDataYr)
      }

      # Join results for each subarea
      linkData <- rbind(linkData, subareaDataYrs)
    }

    # Calc Conc. Value for each subarea
    linkData.Subareas <- linkData %>%
      group_by(Location) %>%
      summarize(FishStdConc = median(FishStdConc), # weighted.mean(FishStdConc, FishPool_n),
                AqConc   = median(AqConc))   # weighted.mean(AqConc, AqPool_n))

    # Calc Linkage aqMeHgSafeConc
    model_Linkage <- predictionModels(linkData.Subareas, .knownYcol=AqConc, .knownXcol=FishStdConc, knownX=fishgoal) %>%
      rename(aqMeHgSafeConc = predictedY) %>%
      head(n=1)

    # Stores each iteration's Linkgage Data
    # allLinkData <- rbind(allLinkData, linkData)

    # Stores Hg Water Goal calculated in each iteration - Add 'LinkageModel=model_Linkage$Model, ' to tibble() to keep each iteration's linkage model in function return
    aqMeHgSafeConcs <- rbind(aqMeHgSafeConcs, tibble(Iteration=iter, aqMeHgSafeConc=model_Linkage$aqMeHgSafeConc, ModelName=model_Linkage$ModelName, n=model_Linkage$n, SER=model_Linkage$SER, Extapolated=model_Linkage$Extrapolated))
  }

  close(progressBar)

  return(
    list(aqMeHgSafeConcs=aqMeHgSafeConcs, allLinkData=NULL)  # allLinkData=allLinkData
  )
}

The R session information (including the OS info, R version and all packages used):

    sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22621)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8  LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] mgcv_1.8-41        nlme_3.1-160       lubridate_1.8.0    plotly_4.10.0     
##  [5] readxl_1.4.1       actuar_3.3-0       NADA_1.6-1.1       forcats_0.5.2     
##  [9] stringr_1.4.1      dplyr_1.0.9        purrr_0.3.4        readr_2.1.2       
## [13] tidyr_1.2.0        tibble_3.1.8       ggplot2_3.3.6      tidyverse_1.3.2   
## [17] fitdistrplus_1.1-8 survival_3.4-0     MASS_7.3-58.1     
## 
## loaded via a namespace (and not attached):
##  [1] lattice_0.20-45     assertthat_0.2.1    digest_0.6.29       utf8_1.2.2         
##  [5] R6_2.5.1            cellranger_1.1.0    backports_1.4.1     reprex_2.0.2       
##  [9] evaluate_0.16       highr_0.9           httr_1.4.4          pillar_1.8.1       
## [13] rlang_1.0.5         lazyeval_0.2.2      googlesheets4_1.0.1 rstudioapi_0.14    
## [17] data.table_1.14.2   Matrix_1.5-1        splines_4.2.2       googledrive_2.0.0  
## [21] htmlwidgets_1.5.4   munsell_0.5.0       broom_1.0.1         compiler_4.2.2     
## [25] modelr_0.1.9        xfun_0.32           pkgconfig_2.0.3     htmltools_0.5.3    
## [29] tidyselect_1.1.2    fansi_1.0.3         viridisLite_0.4.1   crayon_1.5.1       
## [33] tzdb_0.3.0          dbplyr_2.2.1        withr_2.5.0         grid_4.2.2         
## [37] jsonlite_1.8.0      gtable_0.3.1        lifecycle_1.0.1     DBI_1.1.3          
## [41] magrittr_2.0.3      scales_1.2.1        cli_3.3.0           stringi_1.7.8      
## [45] fs_1.5.2            xml2_1.3.3          ellipsis_0.3.2      generics_0.1.3     
## [49] vctrs_0.4.1         expint_0.1-7        tools_4.2.2         glue_1.6.2         
## [53] hms_1.1.2           fastmap_1.1.0       colorspace_2.0-3    gargle_1.2.0       
## [57] rvest_1.0.3         knitr_1.40          haven_2.5.1
    Sys.time()
## [1] "2023-12-26 10:19:52 PST"