This report is automatically generated with the R package knitr (version 1.40) .

source("R Functions/functions_QA data.R")


###Sample type code column is where it is recorded that a sample is a field duplicate.
  #we will want to average these samples with the environmental sample that they match to (with the same date/location)

###there are some samples that are listed for both dry and wet weight - we only want the dry weight - filter out accompanying wet weights



### LOAD DATA ###
R5SED <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Aqueous/R5SED.xlsx', sheet='120418 Sed Hg USE THIS', guess_max = 30000)
nrow(R5SED) #number of rows should match the Excel file (minus the header row)
## [1] 607
### LIST COLUMNS TO BE USED, ADD USER DEFINED COLUMNS, & RENAME COLUMNS TO CEDEN STANDARDS ###
#Use 1.READ ME.xlsx, 'ColumnsForR' to list & identify columns that match corresponding CEDEN Standard columns
keep_cols <- c('SourceID','SourceRow','Source','Project','StationName','StationCode','SampleDate','LabBatch','LabSampleID','MatrixName',
               'WBT','MethodName','Analyte','Unit','Result','MDL','RL','ResultQualCode','SampleID','SampleComments','TargetLatitude',
               'TargetLongitude','QACode','BatchVerification','ComplianceCode','CollectionComments','ResultsComments','BatchComments','SampleTypeCode'
               )

temp_cols <- c('DataType', 'Percent Moisture', 'CollectionTime') #Include columns that do not match CEDEN standards but may be useful (e.g., Unit columns for MDL & RL)
#temp_cols are removed before the data is merged with other datasets


R5SED_new <- R5SED %>%
  select( c(keep_cols,temp_cols) ) %>% #DO NOT CHANGE - selects columns specified above
  mutate(              #Add user defined columns
    CoordSystem = NA,
    SampleTime = format(lubridate::ymd_hms(CollectionTime), "%H:%M:%S") #readxl adds "1899-12-31" to the time - this converts it back to time as.character
    ) %>%
  rename(
    'CitationCode' = 'Source'
  ) %>%
  filter( DataType == 'Reported' | DataType == 'Field dup' )  #only keep rows where DataType is 'Reported' or 'Field dup' - these values are not part of the lab QA/QC
nrow(R5SED_new)
## [1] 367
#str(R5SED_new)
#View(R5SED_new)


### FORMAT COLUMN PARAMETERS ###
  # Standardize MatrixName Groups - "Water", "Sediment", "Soil" #
unique(R5SED_new$MatrixName) #Identifies OLDNAMES
## [1] "sediment"
R5SED_new <- R5SED_new %>%
  mutate( MatrixName = recode(MatrixName, "sediment" = "Sediment") )
unique(R5SED_new$MatrixName) #New naming structure should now be listed
## [1] "Sediment"
  # Standardize WBT (WaterBodyType) Groups - "River/Stream", "Drain/Canal", "Wetland", "Spring", "Slough", 
  #                                          "Pond",  "Lake/Reservoir", "Delta", "Forebay/Afterbay", "Not Recorded" #
unique(R5SED_new$WBT) #Identifies OLDNAMES
##  [1] NA               "Marsh"          "Not Recorded"   "River"          "Drain/Canal"   
##  [6] "Spring"         "Creek"          "Lake"           "Settling Basin" "Slough"
R5SED_new <- R5SED_new %>%
  mutate(WBT = recode(WBT,
                      "Marsh" = "Wetland",
                      "Lake"  = "Lake/Reservoir",  #2019-04-02 ~ added by Robin so WBT is consistent for "Lake/Reservoir" between scripts
                      "River" = "River/Stream",
                      "Creek" = "River/Stream"),
         #EXAMPLE FOR WHEN "OLDNAME" is 'NA' but we want a NEWNAME - if this example is deleted, also delete the comma after "Not Recorded" above
         WBT = case_when(is.na(WBT) ~ "Not Recorded", #Use "Not Recorded" when WBT value is NA
                         TRUE ~ WBT)                  #Keep original WBT value in all other cases
         ) %>%
  filter( WBT != "Spring" ) #removes "Spring" data
unique(paste(R5SED_new$MatrixName, R5SED_new$WBT, sep='; ')) #New naming structure for Matrix Name & WBT Groupings
## [1] "Sediment; Not Recorded"   "Sediment; Wetland"        "Sediment; River/Stream"  
## [4] "Sediment; Drain/Canal"    "Sediment; Lake/Reservoir" "Sediment; Settling Basin"
## [7] "Sediment; Slough"
# Standardize Analyte Groups - "Mercury, Total", "Mercury, Dissolved", "Mercury, Suspended", & same for Methylmercury #
unique(R5SED_new$Analyte)
## [1] "Mercury, Total" "Methylmercury"
R5SED_new <- R5SED_new %>%
  mutate(Analyte = recode(Analyte,
                          "Methylmercury" = "Methylmercury, Total"
                          )
         )
unique(R5SED_new$Analyte) #New naming structure for Analyte Groupings
## [1] "Mercury, Total"       "Methylmercury, Total"
  # Standardize ResultQualCode Groups - "ND", "DNQ", NA#
unique(R5SED_new$ResultQualCode) #Identifies OLDNAMES
## [1] "="   "DNQ" "ND"
#[1] "="   "ND"  "DNQ" - no changes necessary


  # Format Result Column to Numeric#
  # Check column for text - based on text user needs to decide what to do
if(any(grepl('<|[a-df-zA-DF-Z]',R5SED_new$Result))){
  old <-R5SED_new$Result
  new <-R5SED_new$Result
  new[grepl('<|[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005"
  #Print what text was found and what is being done
  cat(paste0("'Result' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)),
             ".\nACTIONS TAKEN:\n",
             "MDL and RL values exist in designated columns so Result of '<MDL' replaced with 'NA'.\n")
      )
  R5SED_new <- R5SED_new %>%
    mutate(
      Result = if_else(Result == "<MDL", NA_character_, Result),
      Result = as.numeric(new)
      )
} else {
  cat("'Result' column converted to numeric format\n")
  R5SED_new$Result <- as.numeric(R5SED_new$Result)
}
## 'Result' column should be numeric but some cells contain <MDL.
## ACTIONS TAKEN:
## MDL and RL values exist in designated columns so Result of '<MDL' replaced with 'NA'.
  # Check if Result, MDL, & RL Columns all equal <NA> or 0 - these rows have no useful information
nrow(R5SED_new) #Number rows before
## [1] 303
#CODE BELOW REQUIRES USER TROUBLESHOOTING DEPENDING ON AVAILABLE COLUMNS AND SPREADSHEET SPECIFIC CONDITIONS#
R5SED_new <- R5SED_new %>%
  #Set 0 & negative values as blank
  mutate(Result = ifelse(Result <= 0, NA_real_, Result),
         MDL = ifelse(MDL <= 0, NA_real_, MDL),
         RL = ifelse(RL <= 0, NA_real_, RL))
na_results <- R5SED_new %>% #Record rows where Result, MDL, & RL all equal <NA>
  filter( is.na(Result) & is.na(MDL) & is.na(RL) )
nrow(na_results)
## [1] 0
R5SED_new <- anti_join(R5SED_new, na_results, by='SourceRow') #returns rows from R5SED_new not matching values in no_result
nrow(R5SED_new) #Number rows after
## [1] 303
# Format Units Column - "ng/L", "mg/Kg"
unique(R5SED_new$Unit) #Identifies OLDNAMES
## [1] "mg/Kg dw" "mg/Kg ww" "ng/g dw"
#Convert mg/Kg ww to mg/Kg dw using 'Percent Moisture' column and eq: Result(d) = Result(w)/(1-Pm)*100 
R5SED_new <- R5SED_new %>%
  mutate(
    `Percent Moisture` = if_else(`Percent Moisture` == 'n/a', NA_character_, `Percent Moisture`), #replace 'n/a' with NA class before converting to numeric
    `Percent Moisture` = as.numeric(`Percent Moisture`),
    Result = case_when(Unit == 'mg/Kg ww' ~ Result/(100-`Percent Moisture`)*100,  # convert wet weight result to dry weight result using percent moisture
                       Unit == 'ng/g dw' ~ Result * 0.001, #convert ng/g to mg/Kg
                       TRUE ~ Result),
    Unit = case_when(Unit == 'mg/Kg ww' ~ 'mg/Kg dw',
                     Unit == 'ng/g dw' ~ 'mg/Kg dw',
                     TRUE ~ Unit)
  )
unique(R5SED_new$Unit) #New naming structure for Unit Groupings
## [1] "mg/Kg dw"
  # Format Date and Time Column #
# NEED TO TALK ABOUT HOW WE WANT TO DO THIS - To graph in R we need Date and Time in same column
# THE EXAMPLE CODE BELOW ASSUMES DATE AND TIME ARE IN SEPERATE COLUMNS
R5SED_new <- R5SED_new %>%
  #rowise() %>%    # rowise is very slow - so used sapply to make this a rowise operation
  mutate(
    #If SampleDate & CollectioTIme are not in Character format by defualt, turn it into a character class so it exports better
    SampleDate = ifelse(sapply(SampleDate, is.character), SampleDate, as.character(as.Date(SampleDate))),
    SampleTime = ifelse(sapply(SampleTime, is.character), SampleTime, format(lubridate::ymd_hms(SampleTime), "%H:%M:%S")),
    #COMBINE DATE AND TIME INTO SampleDateTime COLUMN
    SampleDateTime = ifelse(!is.na(SampleTime), paste(SampleDate, SampleTime), paste(SampleDate, '00:00:00')),
    #FORMAT SampleDateTime COLUMN TO DATE FORMAT
    SampleDateTime = lubridate::ymd_hms(SampleDateTime)
  )


### REMOVE TEMPORARY COLUMNS ###
R5SED_new <- R5SED_new %>%
  select(-one_of(temp_cols)) #Remove temp columns since they are no longer needed
#View(R5SED_new)

## SAVE FORMATTED DATA AS EXCEL FILE ##
writexl::write_xlsx(R5SED_new, path='Reeval_Impl_Goals_Linkage_Analysis/Data/Aqueous/R5SED_ceden_format.xlsx')
# In excel, to convert SampleDate column to Date format
# 1 - Select the date column.
# 2 - Go to the Data-tab and choose "Text to Columns".
# 3 - On the first screen, leave radio button on "delimited" and click Next.
# 4 - Unselect any delimiter boxes (everything blank) and click Next.
# 5 - Under column data format choose Date, select YMD
# 6 - Click Finish.

The R session information (including the OS info, R version and all packages used):

    sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22621)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8  LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] lubridate_1.8.0    plotly_4.10.0      readxl_1.4.1       actuar_3.3-0      
##  [5] NADA_1.6-1.1       forcats_0.5.2      stringr_1.4.1      dplyr_1.0.9       
##  [9] purrr_0.3.4        readr_2.1.2        tidyr_1.2.0        tibble_3.1.8      
## [13] ggplot2_3.3.6      tidyverse_1.3.2    fitdistrplus_1.1-8 survival_3.4-0    
## [17] MASS_7.3-58.1     
## 
## loaded via a namespace (and not attached):
##  [1] lattice_0.20-45     assertthat_0.2.1    digest_0.6.29       utf8_1.2.2         
##  [5] R6_2.5.1            cellranger_1.1.0    backports_1.4.1     reprex_2.0.2       
##  [9] evaluate_0.16       highr_0.9           httr_1.4.4          pillar_1.8.1       
## [13] rlang_1.0.5         lazyeval_0.2.2      googlesheets4_1.0.1 rstudioapi_0.14    
## [17] data.table_1.14.2   Matrix_1.5-1        splines_4.2.2       googledrive_2.0.0  
## [21] htmlwidgets_1.5.4   munsell_0.5.0       broom_1.0.1         compiler_4.2.2     
## [25] modelr_0.1.9        xfun_0.32           pkgconfig_2.0.3     htmltools_0.5.3    
## [29] tidyselect_1.1.2    viridisLite_0.4.1   fansi_1.0.3         crayon_1.5.1       
## [33] tzdb_0.3.0          dbplyr_2.2.1        withr_2.5.0         grid_4.2.2         
## [37] jsonlite_1.8.0      gtable_0.3.1        lifecycle_1.0.1     DBI_1.1.3          
## [41] magrittr_2.0.3      scales_1.2.1        writexl_1.4.0       cli_3.3.0          
## [45] stringi_1.7.8       fs_1.5.2            xml2_1.3.3          ellipsis_0.3.2     
## [49] generics_0.1.3      vctrs_0.4.1         expint_0.1-7        tools_4.2.2        
## [53] glue_1.6.2          hms_1.1.2           fastmap_1.1.0       colorspace_2.0-3   
## [57] gargle_1.2.0        rvest_1.0.3         knitr_1.40          haven_2.5.1
    Sys.time()
## [1] "2024-01-04 13:25:11 PST"