This report is automatically generated with the R
package knitr
(version 1.40
)
.
source("R Functions/functions_QA data.R") ### LOAD DATA ### CALFED2003_1a <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Aqueous/CALFED2003-1a.xls', sheet='Load Calcs_workingWide', guess_max = 30000) nrow(CALFED2003_1a) #number of rows should match the Excel file (minus the header row)
## [1] 145
#str(CALFED2003_1a) ### FORMAT DATA FROM WIDE TO LONG ### #View(CALFED2003_1a) #remove flow and calculated load columns CALFED2003_1a <- CALFED2003_1a %>% select(-Flow, -`THg Loading`:-`DMMHg Loading`) #slice off first row that has units units <- CALFED2003_1a %>% #saving units to new variable slice(1) %>% select(THg:SO4) %>% #select columns that have units gather(key="Analyte", value="Unit") #format from wide to long CALFED2003_1a <- CALFED2003_1a %>% slice(-1) #remove 1st row with units from main data #CALFED wide to long CALFED2003_1a <- CALFED2003_1a %>% gather(THg:SO4, key="Analyte", value="Result") #PASTE LETTER OF COLUMN TO RESULT ID SO THE RESULT ID IS NOT DUPLICATED seqPasteLetters <- function(chr_vector){ col_letters <- LETTERS[c(6:15,17)] # letters corresponding to column of excel data - excludes Flow column for (i in 1:length(chr_vector)){ chr_vector[i] <- paste0(chr_vector[i], col_letters[i]) } return(chr_vector) } CALFED2003_1a <- CALFED2003_1a %>% mutate(tempSourceRow = SourceRow) %>% group_by(tempSourceRow) %>% mutate(SourceRow = seqPasteLetters(SourceRow)) %>% ungroup() %>% dplyr::select(-tempSourceRow) #Add Units back in CALFED2003_1a_new <- left_join(CALFED2003_1a, units, by="Analyte") #View(CALFED2003_1a_new) ### FORMAT COLUMN PARAMETERS ### # Format Result Column to Numeric# # Check column for text - based on text user needs to decide what to do if(any(grepl('<|[a-df-zA-DF-Z]', CALFED2003_1a_new$Result))){ old <- CALFED2003_1a_new$Result new <- CALFED2003_1a_new$Result new[grepl('<|[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005" #Print what text was found and what is being done cat(paste0("'Result' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)), ".\nACTIONS TAKEN:\n", "Values with 'U' or '<' were recorded in MDL column, 'ND' in ResultQualCode column, and 'NA' in Result column.\n", "Result with ND (only @ Pospect Slough for DMMHg on 7/19/00), 'ND' recorded in ResultQualCode column 'NA' in Result column, and 0.022 in MDL from data_MMHg_QA.xlsx\n", "Results with 'broken' and 'reservoir dry' were replaced with 'NA' in Result column.\n")) CALFED2003_1a_new <- CALFED2003_1a_new %>% mutate( MDL = case_when( grepl('U|<', Result) ~ gsub('U|<', '', Result)), ResultQualCode = case_when( grepl('U|<|nd', Result) ~ 'ND', TRUE ~ '='), MDL = case_when(grepl('^nd$', Result) ~ "0.022", #this ND only occurs once - Pospect Slough for DMMHg on 7/19/00 and MDL was found to be 0.022 from data_MMHg_QA.xlsx) TRUE ~ MDL), Result = case_when( grepl('<|[a-df-zA-DF-Z]', Result) ~ NA_character_, TRUE ~ Result), Result = as.numeric(Result) ) } else { cat("'Result' column converted to numeric format\n") CALFED2003_1a_new$Result <- as.numeric(CALFED2003_1a_new$Result) }
## 'Result' column should be numeric but some cells contain broken, 0.022U, 0.014U, 0.013U, 0.007U, nd, leak in State Water Project lift Canal, reservoir dry, <1.0, <1, and <2.0. ## ACTIONS TAKEN: ## Values with 'U' or '<' were recorded in MDL column, 'ND' in ResultQualCode column, and 'NA' in Result column. ## Result with ND (only @ Pospect Slough for DMMHg on 7/19/00), 'ND' recorded in ResultQualCode column 'NA' in Result column, and 0.022 in MDL from data_MMHg_QA.xlsx ## Results with 'broken' and 'reservoir dry' were replaced with 'NA' in Result column.
# Format MDL Column to Numeric # CALFED2003_1a_new$MDL <- as.numeric(CALFED2003_1a_new$MDL) # Check if Result, MDL, & RL Columns all equal <NA> or 0 - these rows have no useful information nrow(CALFED2003_1a_new) #Number rows before
## [1] 1584
#CODE BELOW REQUIRES USER TROUBLESHOOTING DEPENDING ON AVAILABLE COLUMNS AND SPREADSHEET SPECIFIC CONDITIONS# CALFED2003_1a_new <- CALFED2003_1a_new %>% #Set 0 & negative values as blank mutate(Result = ifelse(Result <= 0, NA_real_, Result), MDL = ifelse(MDL <= 0, NA_real_, MDL)) na_results <- CALFED2003_1a_new %>% #Record rows where Result, MDL, & RL all equal <NA> or 0 filter( is.na(Result) & is.na(MDL) ) #View(na_results) CALFED2003_1a_new <- anti_join(CALFED2003_1a_new, na_results, by='SourceRow') #returns rows from CALFED2003_1a_new not matching values in no_result nrow(CALFED2003_1a_new) #Number rows after
## [1] 1223
# Format Analyte Column unique(CALFED2003_1a_new$Analyte)
## [1] "THg" "DHg" "TMMHg" "DMMHg" "field EC" "field Temp" ## [7] "TSS" "VSS" "Chlorophyl" "Phaeoph" "SO4"
#Change Mercury but deciding on Ancillary consituent naming later CALFED2003_1a_new <- CALFED2003_1a_new %>% mutate(Analyte = recode(Analyte, "THg" = "Mercury, Total", "DHg" = "Mercury, Dissolved", "TMMHg" = "Methylmercury, Total", "DMMHg" = "Methylmercury, Dissolved" ) ) # Format Date and Time Column # # NEED TO TALK ABOUT HOW WE WANT TO DO THIS - To graph in R we need Date and Time in same column # No TIme column so use 00:00:00 as default CALFED2003_1a_new <- CALFED2003_1a_new %>% #rowise() %>% # rowise is very slow - so used sapply to make this a rowise operation mutate( #If SampleDate & CollectioTIme are not in Character format by defualt, turn it into a character class so it exports better SampleDate = ifelse(sapply(SampleDate, is.character), SampleDate, as.character(as.Date(SampleDate))), #COMBINE DATE AND TIME INTO SampleDateTime COLUMN SampleDateTime = paste(SampleDate, '00:00:00'), #FORMAT SampleDateTime COLUMN TO DATE FORMAT SampleDateTime = lubridate::ymd_hms(SampleDateTime) ) ### ADD USER DEFINED & MISSING COLUMNS NAMED TO CEDEN STANDARDS ### CALFED2003_1a_new <- CALFED2003_1a_new %>% mutate( #Add user defined columns CitationCode = 'CALFED_2003_1a', #Check with Jennie CoordSystem = NA_character_, Project = 'CALFED 2003 Task1a', StationCode = NA_character_, SampleTime = NA_character_, LabBatch = NA_character_, LabSampleID = NA_character_, MatrixName = 'Aqueous', WBT = case_when(grepl('River|Creek', StationName) ~ 'River/Stream', grepl('Slough', StationName) ~ 'Slough', grepl('Canal', StationName) ~ 'Drain/Canal', grepl('State Water Project', StationName) ~ 'Drain/Canal', grepl('X2', StationName) ~ 'Delta' ), MethodName = NA_character_, RL= NA_real_, SampleID = NA_character_, QACode = NA_character_, BatchVerification = NA_character_, ComplianceCode = NA_character_, SampleComments =NA_character_, CollectionComments = NA_character_, ResultsComments = NA_character_, BatchComments = NA_character_, SampleTypeCode = NA_character_ ) #View(CALFED2003_1a_new) ### SPLIT ANCILLARY DATA AND MERCURY DATA INTO SEPERATE DATAFRAMES & STANDARDIZE UNITS FOR EACH ### CALFED2003_1a_mercury <- CALFED2003_1a_new %>% filter(grepl('mercury', Analyte, ignore.case=T)) unique(CALFED2003_1a_mercury$Unit)
## [1] "ng/l"
CALFED2003_1a_mercury <- CALFED2003_1a_mercury %>% # Format Units Column - "ng/L", "mg/Kg" standardizeUnits unique(CALFED2003_1a_mercury$Unit)
## [1] "ng/L"
CALFED2003_1a_ancillary <- CALFED2003_1a_new %>% filter(!grepl('mercury', Analyte, ignore.case=T)) nrow(CALFED2003_1a_mercury) + nrow(CALFED2003_1a_ancillary) == nrow(CALFED2003_1a_new) #needs to be true
## [1] TRUE
## SAVE FORMATTED DATA AS EXCEL FILE ## #Save Mercury data writexl::write_xlsx(CALFED2003_1a_mercury, path='Reeval_Impl_Goals_Linkage_Analysis/Data/Aqueous/CALFED2003-1aMerc_ceden_format.xlsx') #Save Ancillary data writexl::write_xlsx(CALFED2003_1a_ancillary, path='Reeval_Impl_Goals_Linkage_Analysis/Data/Ancillary/CALFED2003-1aAncillary_ceden_format.xlsx') # In excel, to convert SampleDate column to Date format # 1 - Select the date column. # 2 - Go to the Data-tab and choose "Text to Columns". # 3 - On the first screen, leave radio button on "delimited" and click Next. # 4 - Unselect any delimiter boxes (everything blank) and click Next. # 5 - Under column data format choose Date, select YMD # 6 - Click Finish.
The R session information (including the OS info, R version and all packages used):
sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt) ## Platform: x86_64-w64-mingw32/x64 (64-bit) ## Running under: Windows 10 x64 (build 22621) ## ## Matrix products: default ## ## locale: ## [1] LC_COLLATE=English_United States.utf8 LC_CTYPE=C ## [3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C ## [5] LC_TIME=English_United States.utf8 ## system code page: 65001 ## ## attached base packages: ## [1] grid stats graphics grDevices utils datasets methods base ## ## other attached packages: ## [1] mgcv_1.8-41 nlme_3.1-160 lubridate_1.8.0 plotly_4.10.0 ## [5] readxl_1.4.1 actuar_3.3-0 NADA_1.6-1.1 forcats_0.5.2 ## [9] stringr_1.4.1 dplyr_1.0.9 purrr_0.3.4 readr_2.1.2 ## [13] tidyr_1.2.0 tibble_3.1.8 ggplot2_3.3.6 tidyverse_1.3.2 ## [17] fitdistrplus_1.1-8 survival_3.4-0 MASS_7.3-58.1 ## ## loaded via a namespace (and not attached): ## [1] httr_1.4.4 jsonlite_1.8.0 viridisLite_0.4.1 splines_4.2.2 ## [5] modelr_0.1.9 assertthat_0.2.1 highr_0.9 googlesheets4_1.0.1 ## [9] cellranger_1.1.0 yaml_2.3.5 pillar_1.8.1 backports_1.4.1 ## [13] lattice_0.20-45 glue_1.6.2 digest_0.6.29 rvest_1.0.3 ## [17] colorspace_2.0-3 htmltools_0.5.3 Matrix_1.5-1 pkgconfig_2.0.3 ## [21] broom_1.0.1 haven_2.5.1 webshot_0.5.3 scales_1.2.1 ## [25] tzdb_0.3.0 googledrive_2.0.0 generics_0.1.3 ellipsis_0.3.2 ## [29] withr_2.5.0 lazyeval_0.2.2 cli_3.3.0 magrittr_2.0.3 ## [33] crayon_1.5.1 evaluate_0.16 fs_1.5.2 fansi_1.0.3 ## [37] xml2_1.3.3 tools_4.2.2 data.table_1.14.2 hms_1.1.2 ## [41] expint_0.1-7 gargle_1.2.0 lifecycle_1.0.1 munsell_0.5.0 ## [45] reprex_2.0.2 writexl_1.4.0 compiler_4.2.2 rlang_1.0.5 ## [49] rstudioapi_0.14 htmlwidgets_1.5.4 crosstalk_1.2.0 rmarkdown_2.16 ## [53] gtable_0.3.1 DBI_1.1.3 R6_2.5.1 knitr_1.40 ## [57] fastmap_1.1.0 utf8_1.2.2 stringi_1.7.8 vctrs_0.4.1 ## [61] dbplyr_2.2.1 tidyselect_1.1.2 xfun_0.32
Sys.time()
## [1] "2023-12-29 13:49:58 PST"