This report is automatically generated with the R
package knitr
(version 1.40
)
.
source("R Functions/functions_QA data.R") ### LOAD DATA ### USGS <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Aqueous/USGS_Aq Sed Hg data.xlsx', sheet='Sheet1', guess_max = 30000) nrow(USGS) #number of rows should match the Excel file (minus the header row)
## [1] 2911
### LIST COLUMNS TO BE USED, ADD USER DEFINED COLUMNS, & RENAME COLUMNS TO CEDEN STANDARDS ### #Use 1.READ ME.xlsx, 'ColumnsForR' to list & identify columns that match corresponding CEDEN Standard columns keep_cols <- c('SourceID','SourceRow', 'CitationCode (PUB ID)','CoordSystem','StationName', 'StationCode', 'SampleDate', 'CollectionTime', 'MatrixName', 'WBT', 'MethodName', 'Analyte', 'Unit', 'Result', 'MDL', 'RL', 'ResultQualCode', 'TargetLatitude', 'TargetLongitude', 'QACode') #temp_cols <- c('COLUMN NAME', 'COLUMN NAME') #Include columns that do not match CEDEN standards but may be useful (e.g., Unit columns for MDL & RL) #temp_cols are removed before the data is merged with other datasets USGS_new <- USGS %>% select( c(keep_cols) ) %>% #DO NOT CHANGE - selects columns specified above filter(Result %are not% c('Not Analyzed','Not Calculated', 'NA') | Unit != 'g/day') %>% rename( CitationCode = `CitationCode (PUB ID)`, SampleTime = CollectionTime ) %>% mutate( #Create Missing column or modify existing column here: CEDEN COLUMNNAME = 'SPECIFIED VALUE' or FUNCTION #DELTE COLUMN NAMES THAT DO NOT NEED TO BE CHANGED Project = NA_character_, #CollectionTime = NA_character_, #Robin commented this out since CollectionTime was in keep_cols LabBatch = NA_character_, LabSampleID = NA_character_, SampleID = NA_character_, SampleComments = NA_character_, BatchVerification = NA_character_, ComplianceCode = NA_character_, CollectionComments = NA_character_, ResultsComments = NA_character_, BatchComments = NA_character_, SampleTypeCode = NA_character_ ) nrow(USGS_new)
## [1] 2535
#str(USGS_new) #just to check data class of different columns - e.g., is Date column in POSIX format? #View(USGS_new) ### FORMAT COLUMN PARAMETERS ### # Standardize MatrixName Groups - "Water", "Sediment", "Soil" # unique(USGS_new$MatrixName) #Identifies OLDNAMES
## [1] "Aqueous" "Sediment"
#STANDARD CODE TO CHANGE GROUPING NAMES #USGS_new <- USGS_new %>% # mutate(MatrixName = recode(MatrixName, #COLUMN NAME WHERE CHANGES WILL BE MADE #"OLDNAME" = "NEWNAME" - create a new line for each OLDNAME that needs to be changed #unique(USGS_new$MatrixName) #New naming structure should now be listed # Standardize WBT (WaterBodyType) Groups - "River/Stream", "Drain/Canal", "Wetland", "Spring", "Slough", # "Pond", "Lake/Reservoir", "Delta", "Forebay/Afterbay", "Not Recorded" # unique(USGS_new$WBT) #Identifies OLDNAMES
## [1] "River" "Drain/Canal" "Creek" "Lake" "Slough" NA ## [7] "Mine"
#Look for inconsistent group pairings between MatrixName and WBT (e.g., 'Soil; Stream') unique(paste(USGS_new$MatrixName, USGS_new$WBT, sep='; '))
## [1] "Aqueous; River" "Aqueous; Drain/Canal" "Aqueous; Creek" ## [4] "Aqueous; Lake" "Sediment; River" "Sediment; Creek" ## [7] "Sediment; Drain/Canal" "Sediment; Slough" "Aqueous; NA" ## [10] "Sediment; Lake" "Sediment; Mine" "Aqueous; Mine"
#If an incosistent grouping exists, add comment to 'CollectionComments' column using code: mutate(CollectionComments = case_when(MatrixName=='CONDITION' ~ 'COMMENT', TRUE ~ MatrixName)), #STANDARD CODE TO CHANGE GROUPING NAMES USGS_new <- USGS_new %>% filter(WBT %are not% c("Lake", "Mine")) %>% mutate(WBT = recode(WBT, #"OLDNAME" = "NEWNAME" - create a new line for each OLDNAME that needs to be changed "River" = "River/Stream", "Creek" = "River/Stream"), #EXAMPLE FOR WHEN "OLDNAME" is 'NA' but we want a NEWNAME - if this example is deleted, also delete the comma after "Not Recorded" above WBT = case_when(is.na(WBT) ~ "Not Recorded", #Use "Not Recorded" when WBT value is NA TRUE ~ WBT) #Keep original WBT value in all other cases ) unique(paste(USGS_new$MatrixName, USGS_new$WBT, sep='; ')) #New naming structure for Matrix Name & WBT Groupings
## [1] "Aqueous; River/Stream" "Aqueous; Drain/Canal" "Sediment; River/Stream" ## [4] "Sediment; Drain/Canal" "Sediment; Slough" "Aqueous; Not Recorded"
# Standardize Analyte Groups - "Mercury, Total", "Mercury, Dissolved", "Mercury, Suspended", & same for Methylmercury # Lauren come back to this unique(USGS_new$Analyte) #IF ANALYTE IS IN 2 COLUMNS e.g., (Hg & MeHg); (Total, Dissolved, Suspended) SEE WQP SCRIPT FOR EXAMPLE
## [1] "Mercury, Total (unfiltered)" "Mercury, Total (filtered)" ## [3] "Methylmercury, Total (unfiltered)" "Methylmercury, Total (filtered)" ## [5] "Mercury, Total" "Methylmercury, Total" ## [7] "Merccury, Dissolved" "Mercury, Dissolved" ## [9] "Mercury, Colloid" "Mercury, Dissolved + Colloid" ## [11] "Mercury, Total (particulate)" "Methylmercury, Total (particulate)" ## [13] "Monomethylmercury, Total" "% MMeHg (MMeHg/Hg)" ## [15] "Mercury, Total (unfiltered whole water)" "Mercury, Total (dissolved + colloid)"
unique(paste(USGS_new$MatrixName, USGS_new$Analyte, sep='; '))
## [1] "Aqueous; Mercury, Total (unfiltered)" ## [2] "Aqueous; Mercury, Total (filtered)" ## [3] "Aqueous; Methylmercury, Total (unfiltered)" ## [4] "Aqueous; Methylmercury, Total (filtered)" ## [5] "Aqueous; Mercury, Total" ## [6] "Aqueous; Methylmercury, Total" ## [7] "Aqueous; Merccury, Dissolved" ## [8] "Sediment; Mercury, Total" ## [9] "Aqueous; Mercury, Dissolved" ## [10] "Aqueous; Mercury, Colloid" ## [11] "Aqueous; Mercury, Dissolved + Colloid" ## [12] "Sediment; Methylmercury, Total" ## [13] "Aqueous; Mercury, Total (particulate)" ## [14] "Aqueous; Methylmercury, Total (particulate)" ## [15] "Aqueous; Monomethylmercury, Total" ## [16] "Sediment; Monomethylmercury, Total" ## [17] "Sediment; % MMeHg (MMeHg/Hg)" ## [18] "Aqueous; Mercury, Total (unfiltered whole water)" ## [19] "Aqueous; Mercury, Total (dissolved + colloid)"
USGS_new <- USGS_new %>% mutate(Analyte = recode(Analyte, "Mercury, Total (unfiltered)" = "Mercury, Total", "Mercury, Total (unfiltered whole water)" = "Mercury, Total", "Mercury, Total (particulate)" = "Mercury, Suspended", "Merccury, Dissolved" = "Mercury, Dissolved", "Mercury, Total (filtered)" = "Mercury, Dissolved", "Monomethylmercury, Total" = "Methylmercury, Total", "Methylmercury, Total (unfiltered)" = "Methylmercury, Total", "Methylmercury, Total (filtered)" = "Methylmercury, Dissolved", "Methylmercury, Total (particulate)" = "Methylmercury, Suspended", "Mercury, Colloid" = "Mercury, Suspended", "Mercury, Total (dissolved + colloid)" = "Mercury, Total", "Mercury, Dissolved + Colloid" = "Mercury, Total" ) ) #Create 'Analyte' column from Analyte & Analyte_part2 columns - then delete Analyte_part2 column## unique(USGS_new$Analyte) #New naming structure for Analyte Groupings
## [1] "Mercury, Total" "Mercury, Dissolved" "Methylmercury, Total" ## [4] "Methylmercury, Dissolved" "Mercury, Suspended" "Methylmercury, Suspended" ## [7] "% MMeHg (MMeHg/Hg)"
# Standardize ResultQualCode Groups - "ND", "DNQ", NA# unique(USGS_new$ResultQualCode) #Identifies OLDNAMES
## [1] "=" "ND" NA
#USGS_new <- USGS_new %>% # mutate(ResultQualCode = recode(ResultQualCode, # "" = NA_character_, #changes empty cells to NA - delete if not needed # "OLDNAME" = "ND", # "OLDNAME" = "DNQ" # ) # ) #unique(USGS_new$ResultQualCode) #New naming structure for ResultQualCode Groupings # Format Result Column to Numeric# # Check column for text - based on text user needs to decide what to do if(!is.numeric(USGS_new$Result)){ old <-USGS_new$Result new <-USGS_new$Result new[grepl('<|[a-df-zA-DF-Z]|^E', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005" #Print what text was found and what is being done cat(paste0("'Result' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)), ".\nACTIONS TAKEN:\n", "For 'ND' and '<MDL': 'ND' put in ResultQualCode column & made Result blank.\n", "For numeric values with '<': 'ND' put in ResultQualCode column, numeric value moved to MDL column, & made Result blank.\n", "For 'E0.02' & 'E0.020': removed 'E' and put 'estimate result' in ResultsComments, & kept numeric value in Result column.\n", "For 'NA' & 'N/C': made Result blank.\n")) USGS_new <- USGS_new %>% mutate( # Fixing 'ND' & '<MDL' ResultQualCode = if_else(grepl('ND|<MDL',Result), 'ND', ResultQualCode), Result = if_else(grepl('ND|<MDL',Result), NA_character_, Result), # Fixing numeric Results with '<' MDL = as.character(MDL), # convert to character to prevent class error when adding Result values MDL = if_else(grepl('<',Result), gsub('<[[:space:]]?|$[[:space:]]|,','', Result), MDL), MDL = as.numeric(MDL), # convert back to numeric ResultQualCode = if_else(grepl('<',Result), 'ND', ResultQualCode), Result = if_else(grepl('<',Result), NA_character_, Result), # Fixing 'E0.02' & 'E0.020' ResultsComments = if_else(grepl('^E',Result), ifelse(is.na(ResultsComments),'Estimated result',paste('Estimated result',ResultsComments, sep='; ')), ResultsComments), Result = if_else(grepl('^E',Result), sub('E','', Result), Result), # Fixing 'NA' & 'N/C' Result = if_else(grepl('NA',Result), NA_character_, Result), Result = if_else(Result %in% c('NA','N/C'), NA_character_, Result), # Make Result Column Numeric Result = as.numeric(Result) ) } else { cat("'Result' column is in numeric format\n")}
## 'Result' column should be numeric but some cells contain <0.02, <0.0004, NA, ND, <3.1, <4.6, <49, <9.3, <24, <120, <6,200, <9.0, <13, <8.1, <32, <840, <9.7, <6.8, <67, <150, <MDL, <0.04, <0.046, <0.030, <0.029, <0.045, <0.025, and N/C. ## ACTIONS TAKEN: ## For 'ND' and '<MDL': 'ND' put in ResultQualCode column & made Result blank. ## For numeric values with '<': 'ND' put in ResultQualCode column, numeric value moved to MDL column, & made Result blank. ## For 'E0.02' & 'E0.020': removed 'E' and put 'estimate result' in ResultsComments, & kept numeric value in Result column. ## For 'NA' & 'N/C': made Result blank.
# Format MDL Column to Numeric# # Check column for text - based on text user needs to decide what to do if(!is.numeric(USGS_new$MDL)){ old <-USGS_new$MDL new <-USGS_new$MDL new[grepl('[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005" #Print what text was found and what is being done cat(paste0("'Result' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)), ".\nACTIONS TAKEN:\n", "~explain here~.\n")) #USGS_new <- USGS_new %>% # mutate( #Due stuff to prep column to be converted to Numeric # MDL = as.numeric(new) # ) } else { cat("'MDL' column is in numeric format\n")}
## 'MDL' column is in numeric format
#View(USGS_new) # Format RL Column to Numeric# # Check column for text - based on text user needs to decide what to do if(!is.numeric(USGS_new$RL)){ #alternate logic: if(any(grepl('[a-df-zA-DF-Z]',USGS_new$RL))) old <-USGS_new$RL new <-USGS_new$RL new[grepl('[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005" #Print what text was found and what is being done cat("'RL' column converted to numeric format\n") USGS_new$RL <- as.numeric(USGS_new$RL) } else { cat("'RL' column is in numeric format\n")}
## 'RL' column converted to numeric format
# Check if Result, MDL, & RL Columns all equal <NA> or 0 - these rows have no useful information nrow(USGS_new) #Number rows before
## [1] 2205
#CODE BELOW REQUIRES USER TROUBLESHOOTING DEPENDING ON AVAILABLE COLUMNS AND SPREADSHEET SPECIFIC CONDITIONS# USGS_new <- USGS_new %>% #Set 0 & negative values as blank mutate(Result = ifelse(Result <= 0, NA_real_, Result), MDL = ifelse(MDL <= 0, NA_real_, MDL), RL = ifelse(RL <= 0, NA_real_, RL)) na_results <- USGS_new %>% #Record rows where Result, MDL, & RL all equal <NA> filter( is.na(Result) & is.na(MDL) & is.na(RL) ) #View(na_results) USGS_new <- anti_join(USGS_new, na_results, by='SourceRow') #returns rows from USGS_new not matching values in no_result nrow(USGS_new) #Number rows after
## [1] 2036
# Format Units Column - "ng/L", "mg/Kg" unique(USGS_new$Unit) #Identifies OLDNAMES
## [1] "ng/L" "µg/L" "ug/L" "mg/L" "ug/g" "g/day" "ng/g" ## [8] "MMeHg/Hg" "ppm"
unique(paste(USGS_new$MatrixName, USGS_new$Unit, sep='; '))
## [1] "Aqueous; ng/L" "Aqueous; µg/L" "Aqueous; ug/L" "Aqueous; mg/L" ## [5] "Sediment; ug/g" "Aqueous; g/day" "Sediment; ng/g" "Sediment; MMeHg/Hg" ## [9] "Sediment; ppm" "Aqueous; ug/g"
# If more than 1 unit colmn exists (e.g., for RL and MDL columns) see WQP script for example on merging into 1 column USGS_new <- USGS_new %>% filter(Unit != 'g/day') %>% #filter out 477 results with unit 'g/day'; these cannot be compared to other data and will likely be recalculated during analysis based on flow data filter(Unit != 'MMeHg/Hg') %>% standardizeUnits unique(USGS_new$Unit) #New naming structure for ResultQualCode Groupings
## [1] "ng/L" "mg/Kg"
# Format Date and Time Column # # NEED TO TALK ABOUT HOW WE WANT TO DO THIS - To graph in R we need Date and Time in same column # THE EXAMPLE CODE BELOW ASSUMES DATE AND TIME ARE IN SEPERATE COLUMNS USGS_new <- USGS_new %>% #rowise() %>% # rowise is very slow - so used sapply to make this a rowise operation mutate( #If SampleDate & CollectioTIme are not in Character format by defualt, turn it into a character class so it exports better SampleDate = ifelse(sapply(SampleDate, is.character), SampleDate, as.character(as.Date(SampleDate))), SampleTime = ifelse(sapply(SampleTime, is.character), SampleTime, format(lubridate::ymd_hms(SampleTime), "%H:%M:%S")), #COMBINE DATE AND TIME INTO SampleDateTime COLUMN SampleDateTime = ifelse(!is.na(SampleTime), paste(SampleDate, SampleTime), paste(SampleDate, '00:00:00')), #FORMAT SampleDateTime COLUMN TO DATE FORMAT SampleDateTime = lubridate::ymd_hms(SampleDateTime) ) ### REMOVE TEMPORARY COLUMNS ### #USGS_new <- USGS_new %>% # select(-one_of(temp_cols)) #Remove temp columns since they are no longer needed #View(USGS_new) ## SAVE FORMATTED DATA AS EXCEL FILE ## writexl::write_xlsx(USGS_new, path='Reeval_Impl_Goals_Linkage_Analysis/Data/Aqueous/USGS_ceden_format.xlsx') # In excel, to convert SampleDate column to Date format # 1 - Select the date column. # 2 - Go to the Data-tab and choose "Text to Columns". # 3 - On the first screen, leave radio button on "delimited" and click Next. # 4 - Unselect any delimiter boxes (everything blank) and click Next. # 5 - Under column data format choose Date, select YMD # 6 - Click Finish.
The R session information (including the OS info, R version and all packages used):
sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt) ## Platform: x86_64-w64-mingw32/x64 (64-bit) ## Running under: Windows 10 x64 (build 22621) ## ## Matrix products: default ## ## locale: ## [1] LC_COLLATE=English_United States.utf8 LC_CTYPE=English_United States.utf8 ## [3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C ## [5] LC_TIME=English_United States.utf8 ## ## attached base packages: ## [1] stats graphics grDevices utils datasets methods base ## ## other attached packages: ## [1] mgcv_1.8-41 nlme_3.1-160 lubridate_1.8.0 plotly_4.10.0 ## [5] readxl_1.4.1 actuar_3.3-0 NADA_1.6-1.1 forcats_0.5.2 ## [9] stringr_1.4.1 dplyr_1.0.9 purrr_0.3.4 readr_2.1.2 ## [13] tidyr_1.2.0 tibble_3.1.8 ggplot2_3.3.6 tidyverse_1.3.2 ## [17] fitdistrplus_1.1-8 survival_3.4-0 MASS_7.3-58.1 ## ## loaded via a namespace (and not attached): ## [1] lattice_0.20-45 assertthat_0.2.1 digest_0.6.29 utf8_1.2.2 ## [5] R6_2.5.1 cellranger_1.1.0 backports_1.4.1 reprex_2.0.2 ## [9] evaluate_0.16 highr_0.9 httr_1.4.4 pillar_1.8.1 ## [13] rlang_1.0.5 lazyeval_0.2.2 googlesheets4_1.0.1 rstudioapi_0.14 ## [17] data.table_1.14.2 Matrix_1.5-1 splines_4.2.2 webshot_0.5.3 ## [21] googledrive_2.0.0 htmlwidgets_1.5.4 munsell_0.5.0 broom_1.0.1 ## [25] compiler_4.2.2 modelr_0.1.9 xfun_0.32 pkgconfig_2.0.3 ## [29] htmltools_0.5.3 tidyselect_1.1.2 viridisLite_0.4.1 fansi_1.0.3 ## [33] crayon_1.5.1 tzdb_0.3.0 dbplyr_2.2.1 withr_2.5.0 ## [37] grid_4.2.2 jsonlite_1.8.0 gtable_0.3.1 lifecycle_1.0.1 ## [41] DBI_1.1.3 magrittr_2.0.3 scales_1.2.1 writexl_1.4.0 ## [45] cli_3.3.0 stringi_1.7.8 fs_1.5.2 xml2_1.3.3 ## [49] ellipsis_0.3.2 generics_0.1.3 vctrs_0.4.1 expint_0.1-7 ## [53] tools_4.2.2 glue_1.6.2 crosstalk_1.2.0 hms_1.1.2 ## [57] yaml_2.3.5 fastmap_1.1.0 colorspace_2.0-3 gargle_1.2.0 ## [61] rvest_1.0.3 knitr_1.40 haven_2.5.1
Sys.time()
## [1] "2024-01-04 15:25:04 PST"