This report is automatically generated with the R
package knitr
(version 1.40
)
.
## Random Sampling Function Script File ## # library(tidyverse) # functions to work with tibbles # library(lubridate) # makes working with dates easier source(paste0(rstudioapi::getActiveProject(),"/R Functions/function_predictionModels.R")) # Secondary functions used in primary function # Allows removing NAs from sample & sampling from a single value to get to the same value (as expected); base sample() has unpredictable behavior sampleFrom <- function(x, size=length(x), na.rm=F, ...) { if (length(x) == 1L) { res <- rep(x, size) } else if(na.rm==T){ res <- sample(x[!is.na(x)], size=size, ...) } else { res <- sample(x, size=size, ...) } res } geomean = function(x, na.rm=TRUE){ exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x)) } # Primary function randomSampleLinkage <- function(aqData, fishData, fishgoal=fishgoal, iterations=1000, bylocationColumn=NULL, stdzFishLength=350, extrapAllowance=0, minFishSampleSize=8, aqPeriod=5, aqSampPerYr=12){ # FUNCTION VARIABLES # aqData - dataframe of Aq data containing Result, SampleDate, waterYear # fishData - dataframe of Fish data containing Result, TLAvgLength, SampleDate # iterations - number of iterations to perform the bootstrap # bylocationColumn - column of location names. LOcation must be shared between Fish & Aq to be paired # stdzFishLength - length in mm to determine Standardized Fish [MeHg] # extrapAllowance - additional length in mm allowed to standardize when random sampling bass # minFishSampleSize - this value minus 1 is randomly sampled for use in regression to find Std Length Hg Conc. (if a fish sample event has less than supplied value, that event will not be included in the linkage analysis) # aqPeriod - number years of aq data to pair with fish, prior to fish sampling event # aqSampPerYr - number of aq samples per year that would be used for compliance determination (e.g., 12 correlates to monthly sampling) # bylocationColumn <- quo(Subarea) # - to test function line-wise locationColumn <- enquo(bylocationColumn) # FISH DATA PREP fishPool <- fishData %>% mutate(Year = year(SampleDate)) %>% group_by(!!locationColumn, Year) %>% mutate(n = n()) %>% filter(n >= minFishSampleSize) %>% # Remove sample years that have less data than min fish required ungroup() # AQ DATA PREP # Water Year starts Nov to Oct aqPool <- aqData %>% group_by(!!locationColumn, waterYear) %>% mutate(n = n()) %>% ungroup # START PAIRING Locations <- fishPool %>% distinct(!!locationColumn) %>% pull allLinkData <- NULL aqMeHgSafeConcs <- NULL progressBar <- txtProgressBar(1, iterations, style = 3) for (iter in 1:iterations){ # Shows iteration progress setTxtProgressBar(progressBar, iter) linkData <- NULL for(location in Locations){ # location <- Locations[1] # - to test function line-wise # Subset Fish Data by Location fishLocation <- fishPool%>% filter(!!locationColumn == location) # Data Check that fish data exists for subarea - if not go to next subarea Years <- fishLocation %>% distinct(Year) %>% pull if(length(Years) == 0) {next} # Randomly sample data for each available fish sampling year & corresponding aq data years subareaDataYrs <- NULL for(yr in Years){ # yr <- Years[1] # - to test function line-wise ## Gather Aq Samples ## # filter for aq data location & "aqPeriod" years before fish sampling event sub_aqPool <- aqPool %>% filter(!!locationColumn == location, waterYear<=yr & waterYear>(yr-aqPeriod)) # Includes year both fish & Aq are sampled plus aqPeriod-1 previous years of Aq data # Data Check that Aq data exists for subarea - if not go to next year if(nrow(sub_aqPool) == 0) next # Randomly Sample Aq data by wet and dry period aqRandDry <- sub_aqPool %>% filter(grepl('dry', Season, ignore.case=T)) if(nrow(aqRandDry) > 0){ aqRandDry <- aqRandDry %>% slice_sample(n=aqSampPerYr * aqPeriod * 5/12, replace=T) # randomly sample 5/12 of Aq data from dry (5 months in year May 1 - Sept 30) } aqRandWet <- sub_aqPool %>% filter(grepl('wet', Season, ignore.case=T)) if(nrow(aqRandWet) > 0){ aqRandWet <- aqRandWet %>% slice_sample(n=aqSampPerYr * aqPeriod * 7/12, replace=T) # randomly sample 7/12 of Aq data from wet (7 months in year Oct 1 - Apr 30) } aqRandSamp <- rbind(aqRandDry, aqRandWet) # Calculate Pooled Median aqResult <- aqRandSamp %>% summarize(Value = median(Result), n = n()) # Get Aq Water Years waterYrs <- paste(aqRandSamp %>% distinct(waterYear) %>% pull %>% sort, collapse=", ") # Collapse Aq Sample Years for storage in dataframe # Find fish sample size for specified location & year fishPool_n <- fishLocation %>% filter(Year == yr) %>% head(1) %>% pull(n) # Random Sample fish (do not sample with replacement if standardizing to length; repeating values may force extrapolation and/or lead to unlikely regression models (when random sampling pop., not likely to reproduce same fish length & [Hg]) extrapTest <- T tries <- 10 # number of tries to find random samples that meet extrapAllowance criteria n <- 0 while(extrapTest == T & n < tries){ fishRandSamp <- fishLocation %>% filter(Year == yr) %>% # sample_n(size=sampleFrom((minFishSampleSize-1):fishPool_n, 1), replace=F) # sample without replacement; sample 1 minus minFishSampleSize up to fishPool_n to create variation in sampled results slice_sample(n=sampleFrom((minFishSampleSize-1):fishPool_n, 1), replace=F) # First, randomly select sample size from (minFishSampleSize-1) to fishPool_n (to increase variation of sampled results), then randomly sample that number of rows without replacement extrapTest <- fishRandSamp %>% summarize(Extrapolated = case_when(min(TLAvgLength) <= stdzFishLength+extrapAllowance & max(TLAvgLength) >= stdzFishLength-extrapAllowance ~ F, # Allows a certain amount of extrapolation as specified by extrapAllowance T ~ T)) %>% pull(Extrapolated) n <- n + 1 } if(n >= tries) next # Calc Standardized Fish [MeHg] FishConc <- predictionModels(dataframe=fishRandSamp, .knownYcol=Result, .knownXcol=TLAvgLength, knownX=stdzFishLength, preferExtrapModel='log') %>% rename(stdLengthConc = predictedY, n = n) %>% filter(!grepl('gam', ModelName, ignore.case=T)) %>% # remove 'gam' because regression model can increase then decrease as length goes up, which is unlikely head(n=1) %>% ungroup # Join Fish and Aq Results for each year in subarea - add 'FishModel=FishConc$Model,' to tibble to keep each fish model in function return subareaDataYr <- tibble(Iteration=iter, Location=location, FishYr=yr, FishStdConc=FishConc$stdLengthConc, FishPool_n=FishConc$n, FishConcExtrapolated=FishConc$Extrapolated, AqWaterYrs=waterYrs, AqConc=median(aqResult$Value), AqPool_n=aqResult %>% pull(n) %>% sum) subareaDataYrs <- rbind(subareaDataYrs, subareaDataYr) } # Join results for each subarea linkData <- rbind(linkData, subareaDataYrs) } # Calc Conc. Value for each subarea linkData.Subareas <- linkData %>% group_by(Location) %>% summarize(FishStdConc = median(FishStdConc), # weighted.mean(FishStdConc, FishPool_n), AqConc = median(AqConc)) # weighted.mean(AqConc, AqPool_n)) # Calc Linkage aqMeHgSafeConc model_Linkage <- predictionModels(linkData.Subareas, .knownYcol=AqConc, .knownXcol=FishStdConc, knownX=fishgoal) %>% rename(aqMeHgSafeConc = predictedY) %>% head(n=1) # Stores each iteration's Linkgage Data # allLinkData <- rbind(allLinkData, linkData) # Stores Hg Water Goal calculated in each iteration - Add 'LinkageModel=model_Linkage$Model, ' to tibble() to keep each iteration's linkage model in function return aqMeHgSafeConcs <- rbind(aqMeHgSafeConcs, tibble(Iteration=iter, aqMeHgSafeConc=model_Linkage$aqMeHgSafeConc, ModelName=model_Linkage$ModelName, n=model_Linkage$n, SER=model_Linkage$SER, Extapolated=model_Linkage$Extrapolated)) } close(progressBar) return( list(aqMeHgSafeConcs=aqMeHgSafeConcs, allLinkData=NULL) # allLinkData=allLinkData ) }
The R session information (including the OS info, R version and all packages used):
sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt) ## Platform: x86_64-w64-mingw32/x64 (64-bit) ## Running under: Windows 10 x64 (build 22621) ## ## Matrix products: default ## ## locale: ## [1] LC_COLLATE=English_United States.utf8 LC_CTYPE=English_United States.utf8 ## [3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C ## [5] LC_TIME=English_United States.utf8 ## ## attached base packages: ## [1] stats graphics grDevices utils datasets methods base ## ## other attached packages: ## [1] mgcv_1.8-41 nlme_3.1-160 lubridate_1.8.0 plotly_4.10.0 ## [5] readxl_1.4.1 actuar_3.3-0 NADA_1.6-1.1 forcats_0.5.2 ## [9] stringr_1.4.1 dplyr_1.0.9 purrr_0.3.4 readr_2.1.2 ## [13] tidyr_1.2.0 tibble_3.1.8 ggplot2_3.3.6 tidyverse_1.3.2 ## [17] fitdistrplus_1.1-8 survival_3.4-0 MASS_7.3-58.1 ## ## loaded via a namespace (and not attached): ## [1] lattice_0.20-45 assertthat_0.2.1 digest_0.6.29 utf8_1.2.2 ## [5] R6_2.5.1 cellranger_1.1.0 backports_1.4.1 reprex_2.0.2 ## [9] evaluate_0.16 highr_0.9 httr_1.4.4 pillar_1.8.1 ## [13] rlang_1.0.5 lazyeval_0.2.2 googlesheets4_1.0.1 rstudioapi_0.14 ## [17] data.table_1.14.2 Matrix_1.5-1 splines_4.2.2 googledrive_2.0.0 ## [21] htmlwidgets_1.5.4 munsell_0.5.0 broom_1.0.1 compiler_4.2.2 ## [25] modelr_0.1.9 xfun_0.32 pkgconfig_2.0.3 htmltools_0.5.3 ## [29] tidyselect_1.1.2 fansi_1.0.3 viridisLite_0.4.1 crayon_1.5.1 ## [33] tzdb_0.3.0 dbplyr_2.2.1 withr_2.5.0 grid_4.2.2 ## [37] jsonlite_1.8.0 gtable_0.3.1 lifecycle_1.0.1 DBI_1.1.3 ## [41] magrittr_2.0.3 scales_1.2.1 cli_3.3.0 stringi_1.7.8 ## [45] fs_1.5.2 xml2_1.3.3 ellipsis_0.3.2 generics_0.1.3 ## [49] vctrs_0.4.1 expint_0.1-7 tools_4.2.2 glue_1.6.2 ## [53] hms_1.1.2 fastmap_1.1.0 colorspace_2.0-3 gargle_1.2.0 ## [57] rvest_1.0.3 knitr_1.40 haven_2.5.1
Sys.time()
## [1] "2023-12-26 10:19:52 PST"