This report is automatically generated with the R package knitr (version 1.40) .

source("R Functions/functions_QA data.R")


### LOAD DATA ###
USGS <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Aqueous/USGS_Aq  Sed Hg data.xlsx', sheet='Sheet1', guess_max = 30000)
nrow(USGS) #number of rows should match the Excel file (minus the header row)
## [1] 2911
### LIST COLUMNS TO BE USED, ADD USER DEFINED COLUMNS, & RENAME COLUMNS TO CEDEN STANDARDS ###
#Use 1.READ ME.xlsx, 'ColumnsForR' to list & identify columns that match corresponding CEDEN Standard columns
keep_cols <- c('SourceID','SourceRow', 'CitationCode (PUB ID)','CoordSystem','StationName', 'StationCode', 'SampleDate', 'CollectionTime', 'MatrixName', 'WBT', 'MethodName', 'Analyte', 'Unit', 'Result', 'MDL', 'RL', 'ResultQualCode', 'TargetLatitude', 'TargetLongitude', 'QACode')

#temp_cols <- c('COLUMN NAME', 'COLUMN NAME') #Include columns that do not match CEDEN standards but may be useful (e.g., Unit columns for MDL & RL)
#temp_cols are removed before the data is merged with other datasets

USGS_new <- USGS %>%
  select( c(keep_cols) ) %>% #DO NOT CHANGE - selects columns specified above
  filter(Result %are not% c('Not Analyzed','Not Calculated', 'NA') | Unit != 'g/day') %>%
  rename(
    CitationCode = `CitationCode (PUB ID)`,
    SampleTime = CollectionTime
  ) %>%
  mutate(
    #Create Missing column or modify existing column here: CEDEN COLUMNNAME = 'SPECIFIED VALUE' or FUNCTION
    #DELTE COLUMN NAMES THAT DO NOT NEED TO BE CHANGED
    Project = NA_character_,
    #CollectionTime = NA_character_,  #Robin commented this out since CollectionTime was in keep_cols
    LabBatch = NA_character_,
    LabSampleID = NA_character_,
    SampleID = NA_character_,
    SampleComments = NA_character_,
    BatchVerification = NA_character_,
    ComplianceCode = NA_character_,
    CollectionComments = NA_character_,
    ResultsComments = NA_character_,
    BatchComments = NA_character_,
    SampleTypeCode = NA_character_
  )

nrow(USGS_new)
## [1] 2535
#str(USGS_new) #just to check data class of different columns - e.g., is Date column in POSIX format?
#View(USGS_new)


### FORMAT COLUMN PARAMETERS ###
  # Standardize MatrixName Groups - "Water", "Sediment", "Soil" #
unique(USGS_new$MatrixName) #Identifies OLDNAMES
## [1] "Aqueous"  "Sediment"
#STANDARD CODE TO CHANGE GROUPING NAMES
#USGS_new <- USGS_new %>%
 # mutate(MatrixName = recode(MatrixName, #COLUMN NAME WHERE CHANGES WILL BE MADE
                      #"OLDNAME" = "NEWNAME" - create a new line for each OLDNAME that needs to be changed

#unique(USGS_new$MatrixName) #New naming structure should now be listed


  # Standardize WBT (WaterBodyType) Groups - "River/Stream", "Drain/Canal", "Wetland", "Spring", "Slough", 
  #                                          "Pond",  "Lake/Reservoir", "Delta", "Forebay/Afterbay", "Not Recorded" #
unique(USGS_new$WBT) #Identifies OLDNAMES
## [1] "River"       "Drain/Canal" "Creek"       "Lake"        "Slough"      NA           
## [7] "Mine"
#Look for inconsistent group pairings between MatrixName and WBT (e.g., 'Soil; Stream')
unique(paste(USGS_new$MatrixName, USGS_new$WBT, sep='; '))
##  [1] "Aqueous; River"        "Aqueous; Drain/Canal"  "Aqueous; Creek"       
##  [4] "Aqueous; Lake"         "Sediment; River"       "Sediment; Creek"      
##  [7] "Sediment; Drain/Canal" "Sediment; Slough"      "Aqueous; NA"          
## [10] "Sediment; Lake"        "Sediment; Mine"        "Aqueous; Mine"
#If an incosistent grouping exists, add comment to 'CollectionComments' column using code:  mutate(CollectionComments = case_when(MatrixName=='CONDITION' ~ 'COMMENT', TRUE ~ MatrixName)), 
#STANDARD CODE TO CHANGE GROUPING NAMES
USGS_new <- USGS_new %>%
  filter(WBT %are not% c("Lake", "Mine")) %>%
  mutate(WBT = recode(WBT,
                      #"OLDNAME" = "NEWNAME" - create a new line for each OLDNAME that needs to be changed
                      "River" = "River/Stream",
                      "Creek" = "River/Stream"),
         #EXAMPLE FOR WHEN "OLDNAME" is 'NA' but we want a NEWNAME - if this example is deleted, also delete the comma after "Not Recorded" above
         WBT = case_when(is.na(WBT) ~ "Not Recorded", #Use "Not Recorded" when WBT value is NA
                         TRUE ~ WBT)                  #Keep original WBT value in all other cases
         )
unique(paste(USGS_new$MatrixName, USGS_new$WBT, sep='; ')) #New naming structure for Matrix Name & WBT Groupings
## [1] "Aqueous; River/Stream"  "Aqueous; Drain/Canal"   "Sediment; River/Stream"
## [4] "Sediment; Drain/Canal"  "Sediment; Slough"       "Aqueous; Not Recorded"
  # Standardize Analyte Groups - "Mercury, Total", "Mercury, Dissolved", "Mercury, Suspended", & same for Methylmercury # Lauren come back to this
unique(USGS_new$Analyte) #IF ANALYTE IS IN 2 COLUMNS e.g., (Hg & MeHg); (Total, Dissolved, Suspended) SEE WQP SCRIPT FOR EXAMPLE
##  [1] "Mercury, Total (unfiltered)"             "Mercury, Total (filtered)"              
##  [3] "Methylmercury, Total (unfiltered)"       "Methylmercury, Total (filtered)"        
##  [5] "Mercury, Total"                          "Methylmercury, Total"                   
##  [7] "Merccury, Dissolved"                     "Mercury, Dissolved"                     
##  [9] "Mercury, Colloid"                        "Mercury, Dissolved + Colloid"           
## [11] "Mercury, Total (particulate)"            "Methylmercury, Total (particulate)"     
## [13] "Monomethylmercury, Total"                "% MMeHg (MMeHg/Hg)"                     
## [15] "Mercury, Total (unfiltered whole water)" "Mercury, Total (dissolved + colloid)"
unique(paste(USGS_new$MatrixName, USGS_new$Analyte, sep='; '))
##  [1] "Aqueous; Mercury, Total (unfiltered)"            
##  [2] "Aqueous; Mercury, Total (filtered)"              
##  [3] "Aqueous; Methylmercury, Total (unfiltered)"      
##  [4] "Aqueous; Methylmercury, Total (filtered)"        
##  [5] "Aqueous; Mercury, Total"                         
##  [6] "Aqueous; Methylmercury, Total"                   
##  [7] "Aqueous; Merccury, Dissolved"                    
##  [8] "Sediment; Mercury, Total"                        
##  [9] "Aqueous; Mercury, Dissolved"                     
## [10] "Aqueous; Mercury, Colloid"                       
## [11] "Aqueous; Mercury, Dissolved + Colloid"           
## [12] "Sediment; Methylmercury, Total"                  
## [13] "Aqueous; Mercury, Total (particulate)"           
## [14] "Aqueous; Methylmercury, Total (particulate)"     
## [15] "Aqueous; Monomethylmercury, Total"               
## [16] "Sediment; Monomethylmercury, Total"              
## [17] "Sediment; % MMeHg (MMeHg/Hg)"                    
## [18] "Aqueous; Mercury, Total (unfiltered whole water)"
## [19] "Aqueous; Mercury, Total (dissolved + colloid)"
USGS_new <- USGS_new %>%
  mutate(Analyte = recode(Analyte,
                          "Mercury, Total (unfiltered)" = "Mercury, Total",
                          "Mercury, Total (unfiltered whole water)" = "Mercury, Total",
                          "Mercury, Total (particulate)" = "Mercury, Suspended",
                          "Merccury, Dissolved" = "Mercury, Dissolved",
                          "Mercury, Total (filtered)" = "Mercury, Dissolved",
                          "Monomethylmercury, Total" = "Methylmercury, Total",
                          "Methylmercury, Total (unfiltered)" = "Methylmercury, Total",
                          "Methylmercury, Total (filtered)" = "Methylmercury, Dissolved",
                          "Methylmercury, Total (particulate)" = "Methylmercury, Suspended",
                          "Mercury, Colloid" = "Mercury, Suspended",
                          "Mercury, Total (dissolved + colloid)" = "Mercury, Total",
                          "Mercury, Dissolved + Colloid" = "Mercury, Total"
                              )
         )
#Create 'Analyte' column from Analyte & Analyte_part2 columns - then delete Analyte_part2 column##
unique(USGS_new$Analyte) #New naming structure for Analyte Groupings
## [1] "Mercury, Total"           "Mercury, Dissolved"       "Methylmercury, Total"    
## [4] "Methylmercury, Dissolved" "Mercury, Suspended"       "Methylmercury, Suspended"
## [7] "% MMeHg (MMeHg/Hg)"
  # Standardize ResultQualCode Groups - "ND", "DNQ", NA#
unique(USGS_new$ResultQualCode) #Identifies OLDNAMES
## [1] "="  "ND" NA
#USGS_new <- USGS_new %>%
 # mutate(ResultQualCode = recode(ResultQualCode,
 #                                "" = NA_character_, #changes empty cells to NA - delete if not needed
  #                               "OLDNAME" = "ND",
   #                              "OLDNAME" = "DNQ"
    #                             )
     #    )
#unique(USGS_new$ResultQualCode) #New naming structure for ResultQualCode Groupings


  # Format Result Column to Numeric#
  # Check column for text - based on text user needs to decide what to do
if(!is.numeric(USGS_new$Result)){
  old <-USGS_new$Result
  new <-USGS_new$Result
  new[grepl('<|[a-df-zA-DF-Z]|^E', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005"
  #Print what text was found and what is being done
  cat(paste0("'Result' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)),
             ".\nACTIONS TAKEN:\n",
             "For 'ND' and '<MDL': 'ND' put in ResultQualCode column & made Result blank.\n",
             "For numeric values with '<': 'ND' put in ResultQualCode column, numeric value moved to MDL column, & made Result blank.\n",
             "For 'E0.02' & 'E0.020': removed 'E' and put 'estimate result' in ResultsComments, & kept numeric value in Result column.\n",
             "For 'NA' & 'N/C': made Result blank.\n"))

  USGS_new <- USGS_new %>%
    mutate(
      # Fixing 'ND' & '<MDL'
      ResultQualCode = if_else(grepl('ND|<MDL',Result), 'ND', ResultQualCode),
      Result = if_else(grepl('ND|<MDL',Result), NA_character_, Result),

      # Fixing numeric Results with '<'
      MDL = as.character(MDL), # convert to character to prevent class error when adding Result values
      MDL = if_else(grepl('<',Result), gsub('<[[:space:]]?|$[[:space:]]|,','', Result), MDL),
      MDL = as.numeric(MDL),   # convert back to numeric
      ResultQualCode = if_else(grepl('<',Result), 'ND', ResultQualCode),
      Result = if_else(grepl('<',Result), NA_character_, Result),

      # Fixing 'E0.02' & 'E0.020'
      ResultsComments = if_else(grepl('^E',Result), ifelse(is.na(ResultsComments),'Estimated result',paste('Estimated result',ResultsComments, sep='; ')), ResultsComments),
      Result = if_else(grepl('^E',Result), sub('E','', Result), Result),

      # Fixing 'NA' & 'N/C'
      Result = if_else(grepl('NA',Result), NA_character_, Result),
      Result = if_else(Result %in% c('NA','N/C'), NA_character_, Result),

      # Make Result Column Numeric
      Result = as.numeric(Result)
    )
} else {
  cat("'Result' column is in numeric format\n")}
## 'Result' column should be numeric but some cells contain <0.02, <0.0004, NA, ND, <3.1, <4.6, <49, <9.3, <24, <120, <6,200, <9.0, <13, <8.1, <32, <840, <9.7, <6.8, <67, <150, <MDL, <0.04, <0.046, <0.030, <0.029, <0.045, <0.025, and N/C.
## ACTIONS TAKEN:
## For 'ND' and '<MDL': 'ND' put in ResultQualCode column & made Result blank.
## For numeric values with '<': 'ND' put in ResultQualCode column, numeric value moved to MDL column, & made Result blank.
## For 'E0.02' & 'E0.020': removed 'E' and put 'estimate result' in ResultsComments, & kept numeric value in Result column.
## For 'NA' & 'N/C': made Result blank.
  # Format MDL Column to Numeric#
  # Check column for text - based on text user needs to decide what to do 
if(!is.numeric(USGS_new$MDL)){
  old <-USGS_new$MDL
  new <-USGS_new$MDL
  new[grepl('[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005"
  #Print what text was found and what is being done
  cat(paste0("'Result' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)),
             ".\nACTIONS TAKEN:\n",
             "~explain here~.\n"))
  #USGS_new <- USGS_new %>%
  #  mutate( #Due stuff to prep column to be converted to Numeric
  #    MDL = as.numeric(new)
  #  )
} else {
  cat("'MDL' column is in numeric format\n")}
## 'MDL' column is in numeric format
#View(USGS_new)


  # Format RL Column to Numeric#
  # Check column for text - based on text user needs to decide what to do
if(!is.numeric(USGS_new$RL)){   #alternate logic: if(any(grepl('[a-df-zA-DF-Z]',USGS_new$RL)))
  old <-USGS_new$RL
  new <-USGS_new$RL
  new[grepl('[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005"
  #Print what text was found and what is being done
  cat("'RL' column converted to numeric format\n")
  USGS_new$RL <- as.numeric(USGS_new$RL)
} else {
  cat("'RL' column is in numeric format\n")}
## 'RL' column converted to numeric format
  # Check if Result, MDL, & RL Columns all equal <NA> or 0 - these rows have no useful information
nrow(USGS_new) #Number rows before
## [1] 2205
#CODE BELOW REQUIRES USER TROUBLESHOOTING DEPENDING ON AVAILABLE COLUMNS AND SPREADSHEET SPECIFIC CONDITIONS#
USGS_new <- USGS_new %>%
  #Set 0 & negative values as blank
  mutate(Result = ifelse(Result <= 0, NA_real_, Result),
         MDL = ifelse(MDL <= 0, NA_real_, MDL),
         RL = ifelse(RL <= 0, NA_real_, RL))
na_results <- USGS_new %>% #Record rows where Result, MDL, & RL all equal <NA>
  filter( is.na(Result) & is.na(MDL) & is.na(RL) )
#View(na_results)
USGS_new <- anti_join(USGS_new, na_results, by='SourceRow') #returns rows from USGS_new not matching values in no_result
nrow(USGS_new) #Number rows after
## [1] 2036
  # Format Units Column - "ng/L", "mg/Kg"
unique(USGS_new$Unit) #Identifies OLDNAMES
## [1] "ng/L"     "µg/L"     "ug/L"     "mg/L"     "ug/g"     "g/day"    "ng/g"    
## [8] "MMeHg/Hg" "ppm"
unique(paste(USGS_new$MatrixName, USGS_new$Unit, sep='; '))
##  [1] "Aqueous; ng/L"      "Aqueous; µg/L"      "Aqueous; ug/L"      "Aqueous; mg/L"     
##  [5] "Sediment; ug/g"     "Aqueous; g/day"     "Sediment; ng/g"     "Sediment; MMeHg/Hg"
##  [9] "Sediment; ppm"      "Aqueous; ug/g"
# If more than 1 unit colmn exists (e.g., for RL and MDL columns) see WQP script for example on merging into 1 column
USGS_new <- USGS_new %>%
  filter(Unit != 'g/day') %>%  #filter out 477 results with unit 'g/day'; these cannot be compared to other data and will likely be recalculated during analysis based on flow data 
  filter(Unit != 'MMeHg/Hg') %>%
  standardizeUnits
unique(USGS_new$Unit) #New naming structure for ResultQualCode Groupings
## [1] "ng/L"  "mg/Kg"
  # Format Date and Time Column #
# NEED TO TALK ABOUT HOW WE WANT TO DO THIS - To graph in R we need Date and Time in same column
# THE EXAMPLE CODE BELOW ASSUMES DATE AND TIME ARE IN SEPERATE COLUMNS
USGS_new <- USGS_new %>%
  #rowise() %>%    # rowise is very slow - so used sapply to make this a rowise operation
  mutate(
    #If SampleDate & CollectioTIme are not in Character format by defualt, turn it into a character class so it exports better
    SampleDate = ifelse(sapply(SampleDate, is.character), SampleDate, as.character(as.Date(SampleDate))),
    SampleTime = ifelse(sapply(SampleTime, is.character), SampleTime, format(lubridate::ymd_hms(SampleTime), "%H:%M:%S")),
    #COMBINE DATE AND TIME INTO SampleDateTime COLUMN
    SampleDateTime = ifelse(!is.na(SampleTime), paste(SampleDate, SampleTime), paste(SampleDate, '00:00:00')),
    #FORMAT SampleDateTime COLUMN TO DATE FORMAT
    SampleDateTime = lubridate::ymd_hms(SampleDateTime)
  )


### REMOVE TEMPORARY COLUMNS ###
#USGS_new <- USGS_new %>%
 # select(-one_of(temp_cols)) #Remove temp columns since they are no longer needed
#View(USGS_new)

## SAVE FORMATTED DATA AS EXCEL FILE ##
writexl::write_xlsx(USGS_new, path='Reeval_Impl_Goals_Linkage_Analysis/Data/Aqueous/USGS_ceden_format.xlsx')

# In excel, to convert SampleDate column to Date format
# 1 - Select the date column.
# 2 - Go to the Data-tab and choose "Text to Columns".
# 3 - On the first screen, leave radio button on "delimited" and click Next.
# 4 - Unselect any delimiter boxes (everything blank) and click Next.
# 5 - Under column data format choose Date, select YMD
# 6 - Click Finish.

The R session information (including the OS info, R version and all packages used):

    sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22621)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8  LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] mgcv_1.8-41        nlme_3.1-160       lubridate_1.8.0    plotly_4.10.0     
##  [5] readxl_1.4.1       actuar_3.3-0       NADA_1.6-1.1       forcats_0.5.2     
##  [9] stringr_1.4.1      dplyr_1.0.9        purrr_0.3.4        readr_2.1.2       
## [13] tidyr_1.2.0        tibble_3.1.8       ggplot2_3.3.6      tidyverse_1.3.2   
## [17] fitdistrplus_1.1-8 survival_3.4-0     MASS_7.3-58.1     
## 
## loaded via a namespace (and not attached):
##  [1] lattice_0.20-45     assertthat_0.2.1    digest_0.6.29       utf8_1.2.2         
##  [5] R6_2.5.1            cellranger_1.1.0    backports_1.4.1     reprex_2.0.2       
##  [9] evaluate_0.16       highr_0.9           httr_1.4.4          pillar_1.8.1       
## [13] rlang_1.0.5         lazyeval_0.2.2      googlesheets4_1.0.1 rstudioapi_0.14    
## [17] data.table_1.14.2   Matrix_1.5-1        splines_4.2.2       webshot_0.5.3      
## [21] googledrive_2.0.0   htmlwidgets_1.5.4   munsell_0.5.0       broom_1.0.1        
## [25] compiler_4.2.2      modelr_0.1.9        xfun_0.32           pkgconfig_2.0.3    
## [29] htmltools_0.5.3     tidyselect_1.1.2    viridisLite_0.4.1   fansi_1.0.3        
## [33] crayon_1.5.1        tzdb_0.3.0          dbplyr_2.2.1        withr_2.5.0        
## [37] grid_4.2.2          jsonlite_1.8.0      gtable_0.3.1        lifecycle_1.0.1    
## [41] DBI_1.1.3           magrittr_2.0.3      scales_1.2.1        writexl_1.4.0      
## [45] cli_3.3.0           stringi_1.7.8       fs_1.5.2            xml2_1.3.3         
## [49] ellipsis_0.3.2      generics_0.1.3      vctrs_0.4.1         expint_0.1-7       
## [53] tools_4.2.2         glue_1.6.2          crosstalk_1.2.0     hms_1.1.2          
## [57] yaml_2.3.5          fastmap_1.1.0       colorspace_2.0-3    gargle_1.2.0       
## [61] rvest_1.0.3         knitr_1.40          haven_2.5.1
    Sys.time()
## [1] "2024-01-04 15:25:04 PST"