This report is automatically generated with the R
package knitr
(version 1.40
)
.
source("R Functions/functions_QA data.R") ### LOAD DATA ### AppK <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/AppK_Fish MeHg.xlsx', sheet='Data', guess_max = 30000)
There were 50 or more warnings (use warnings() to see the first 50)
cat('"50 or more warnings" result from Date column having years and not a full day, month, year date. The "Format Date & Time" section addresses this issue\n')
## "50 or more warnings" result from Date column having years and not a full day, month, year date. The "Format Date & Time" section addresses this issue
nrow(AppK) #number of rows should match the Excel file (minus the header row)
## [1] 2479
### LOAD REFERENCE SHEETS ### #Citations AppK_citations <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/AppK_Fish MeHg.xlsx', sheet='Project Citations', guess_max = 30000) %>% select(ProjID, Citation) %>% rename(ProjectCode = ProjID, CitationCode = Citation) %>% tidyr::separate_rows(., ProjectCode, sep = ",") #seperates a row that has more than one ProjectCode listed for each Citation into multiple rows #Lat, Long, Coord system AppK_coord <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/AppK_Fish MeHg.xlsx', sheet='Sites', guess_max = 30000) %>% select(SiteName, Latitude, Longitude, Datum) %>% filter(!is.na(Latitude)) %>% distinct(SiteName, .keep_all=T) %>% rename(StationName = SiteName, TargetLatitude = Latitude, TargetLongitude = Longitude, CoordSystem = Datum) ### LIST COLUMNS TO BE USED, ADD USER DEFINED COLUMNS, & RENAME COLUMNS TO CEDEN STANDARDS ### #Use 1.READ ME.xlsx, 'ColumnsForR' to list & identify columns that match corresponding CEDEN Standard columns keep_cols <- c('SourceID','SourceRow', 'ProjID', 'SiteName', 'SampleDate_FIXED', 'Common', 'Number', 'Tissue', 'Hg_ppmWetWt', 'Weight(g)', 'Length(mm)', 'SampleID') #temp_cols are removed before the data is merged with other datasets #Include columns that do not match CEDEN standards but may be useful (e.g., Unit columns for MDL & RL) temp_cols <- c('GroupedSites', 'DeltaSubRegion-NumTarget', 'DeltaSubRegion-Linkage-Subregion_1998-2001', 'Genus', 'Species', 'LabQualHGppmWetWt', 'SampleID-Alternate', 'Matrix1', 'Individual&Composite', 'LengthType', 'LabQualHGppmWetWt') AppK_new <- AppK %>% select( c(keep_cols,temp_cols) ) %>% #DO NOT CHANGE - selects columns specified above rename( #OldName = NewName ProjectCode = ProjID, StationName = SiteName, SampleDate = SampleDate_FIXED, CommonName = Common, NumberFishPerComp = Number, TissueName = Tissue, Result = Hg_ppmWetWt, `WeightAvg g` = `Weight(g)`, `TLAvgLength mm` = `Length(mm)` ) %>% mutate( #NewName = 'SPECIFIED VALUE' or FUNCTION ProgramName = NA_character_, ParentProjectName = NA_character_, ProjectName = NA_character_, CompositeID = NA_character_, StationCode = paste(paste0('GroupedSites: ',GroupedSites), paste0('DeltaSubRegion-NumTarget: ',`DeltaSubRegion-NumTarget`), paste0('DeltaSubRegion-Linkage: ', `DeltaSubRegion-Linkage-Subregion_1998-2001`), sep=' ~ '), SampleTime = NA_character_, TaxonomicName = paste(Genus, Species), Method = NA_character_, Analyte = 'Mercury, Total', Unit = 'mg/Kg ww', ResultQualCode = case_when(grepl('<DL',LabQualHGppmWetWt) ~ 'ND', grepl('<RL',LabQualHGppmWetWt) ~ 'ND', grepl('BRL',LabQualHGppmWetWt) ~ 'ND', # BRL=Below Reporting Limit; a RL value is given but all we know is that the result is <RL TRUE ~ NA_character_), Result = case_when(grepl('<DL',LabQualHGppmWetWt) ~ NA_real_, #remove Result value because when Result = MDL/RL is causes confusion grepl('<RL',LabQualHGppmWetWt) ~ NA_real_, #remove Result value because when Result = MDL/RL is causes confusion grepl('BRL',LabQualHGppmWetWt) ~ NA_real_, #remove Result value because when Result = MDL/RL is causes confusion TRUE ~ Result), MDL = case_when(grepl('DL',LabQualHGppmWetWt) ~ .0386, TRUE ~ NA_real_), RL = case_when(grepl('RL',LabQualHGppmWetWt) ~ .00314, grepl('BRL',LabQualHGppmWetWt) ~ .0282, TRUE ~ NA_real_), `TLMin mm` = NA_character_, `TLMax mm` = NA_character_, CompositeRowID = NA_character_, SampleID = paste(paste0('SampleID: ', SampleID), paste0('SampleID-Alt: ',`SampleID-Alternate`), sep=' ~ '), WBT = 'Not Recorded', QACode = NA_character_, BatchVerification = NA_character_, ComplianceCode = NA_character_, ResultComments = paste(ifelse(grepl('x',`Individual&Composite`,ignore.case=T), 'Included in both individual & composite samples', ''), paste0('LengthType: ',ifelse(grepl('placeholder',LengthType),'*placeholder',ifelse(grepl('TL',LengthType),'total length',''))), #shorten '*placeholder...' notation & clarify 'TL' notation paste0('LabQualHGppmWetWt: ',ifelse(grepl('average of',LabQualHGppmWetWt), LabQualHGppmWetWt, '')), #just note the average of duplicates sep=' ~ '), LabSubmissionCode = NA_character_ ) %>% filter(Matrix1 %are not% c('IV', 'MO')) %>% #Exclude IV=Invertebrate & MO=Mollusk left_join(., AppK_citations, by='ProjectCode') %>% # Add CitationCode column left_join(., AppK_coord, by='StationName') # Add TargetLat, TargetLong, and CoordSystem columns
nrow(AppK_new)
## [1] 1816
#str(AppK_new) #just to check data class of different columns - e.g., is Date column in POSIX format? #View(AppK_new) ### FORMAT COLUMN PARAMETERS ### # Standardize TissueName Groups - "Fillet" or "Whole Body" # unique(AppK_new$TissueName)
## [1] "F" "W"
AppK_new <- AppK_new %>% mutate(TissueName = recode(TissueName, "F" = "Fillet", "W" = "Whole Body" ) ) %>% filter(TissueName %in% c('Fillet','Whole Body')) unique(AppK_new$TissueName)
## [1] "Fillet" "Whole Body"
# Standardize Analyte Groups - "Mercury, Total" or "Methylmercury, Total" # unique(AppK_new$Analyte)
## [1] "Mercury, Total"
# [1] "Mercury, Total" - no changes needed # Standardize ResultQualCode Groups - "ND", "DNQ", NA# unique(AppK_new$ResultQualCode) #Identifies OLDNAMES
## [1] NA "ND"
# [1] NA "ND" - no changes needed # Format Result Column to Numeric# # Check column for text - based on text user needs to decide what to do if(!is.numeric(AppK_new$Result)){ old <-AppK_new$Result new <-AppK_new$Result new[grepl('<|[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005" #Print what text was found and what is being done cat(paste0("'Result' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)), ".\nACTIONS TAKEN:\n", "~explain here~.\n")) #AppK_new <- AppK_new %>% # mutate( #Do stuff to prep column to be converted to Numeric # Result = as.numeric(new) # ) } else { cat("'Result' column is in numeric format\n")}
## 'Result' column is in numeric format
# Format MDL Column to Numeric# # Check column for text - based on text user needs to decide what to do if(!is.numeric(AppK_new$MDL)){ old <-AppK_new$MDL new <- AppK_new$MDL new[grepl('[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005" #Print what text was found and what is being done cat(paste0("'Result' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)), ".\nACTIONS TAKEN:\n", "~explain here~.\n")) #AppK_new <- AppK_new %>% # mutate( #Do stuff to prep column to be converted to Numeric # MDL = as.numeric(new) # ) } else { cat("'MDL' column is in numeric format\n")}
## 'MDL' column is in numeric format
# Format RL Column to Numeric# # Check column for text - based on text user needs to decide what to do if(!is.numeric(AppK_new$RL)){ old <-AppK_new$RL new <-AppK_new$RL new[grepl('[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005" #Print what text was found and what is being done cat(paste0("'Result' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)), ".\nACTIONS TAKEN:\n", "~explain here~.\n")) #AppK_new <- AppK_new %>% # mutate( #Due stuff to prep column to be converted to Numeric # RL = as.numeric(new) # ) } else { cat("'RL' column is in numeric format\n")}
## 'RL' column is in numeric format
# Check if Result, MDL, & RL Columns all equal <NA> or 0 - these rows have no useful information nrow(AppK_new) #Number rows before
## [1] 1816
#CODE BELOW REQUIRES USER TROUBLESHOOTING DEPENDING ON AVAILABLE COLUMNS AND SPREADSHEET SPECIFIC CONDITIONS# AppK_new <- AppK_new %>% #Set 0 & negative values as blank mutate(Result = ifelse(Result <= 0, NA_real_, Result), MDL = ifelse(MDL <= 0, NA_real_, MDL), RL = ifelse(RL <= 0, NA_real_, RL)) na_results <- AppK_new %>% #Record rows where Result, MDL, & RL all equal <NA> filter( is.na(Result) & is.na(MDL) & is.na(RL) ) nrow(na_results)
## [1] 0
AppK_new <- anti_join(AppK_new, na_results, by='SourceRow') #returns rows from AppK_new not matching values in no_result nrow(AppK_new) #Number rows after
## [1] 1816
# Format Units Column - "mg/Kg ww" or "mg/Kg dw" unique(AppK_new$Unit) #Identifies OLDNAMES
## [1] "mg/Kg ww"
# If more than 1 unit colmn exists (e.g., for RL and MDL columns) see WQP script for example on merging into 1 column AppK_new <- AppK_new %>% standardizeUnits(pp = "mass") unique(AppK_new$Unit) #New naming structure for Unit Groupings
## [1] "mg/Kg ww"
# Format Date & Time AppK_new <- AppK_new %>% #rowise() %>% # rowise is very slow - so used sapply to make this a rowise operation mutate( #If SampleDate & CollectioTIme are not in Character format by defualt, turn it into a character class so it exports better SampleDate = ifelse(sapply(SampleDate, is.character), SampleDate, as.character(as.Date(SampleDate))), SampleTime = ifelse(sapply(SampleTime, is.character), SampleTime, format(lubridate::ymd_hms(SampleTime), "%H:%M:%S")), #COMBINE DATE AND TIME INTO SampleDateTime COLUMN SampleDateTime = ifelse(!is.na(SampleTime), paste(SampleDate, SampleTime), paste(SampleDate, '00:00:00')), #FORMAT SampleDateTime COLUMN TO DATE FORMAT SampleDateTime = lubridate::ymd_hms(SampleDateTime) ) ### REMOVE TEMPORARY COLUMNS ### AppK_new <- AppK_new %>% select(-one_of(temp_cols)) #Remove temp columns since they are no longer needed #View(AppK_new) ## SAVE FORMATTED DATA AS EXCEL FILE ## writexl::write_xlsx(AppK_new, path='Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/AppK_Fish MeHg_ceden_format.xlsx') # In excel, to convert SampleDate column to Date format # 1 - Select the date column. # 2 - Go to the Data-tab and choose "Text to Columns". # 3 - On the first screen, leave radio button on "delimited" and click Next. # 4 - Unselect any delimiter boxes (everything blank) and click Next. # 5 - Under column data format choose Date, select YMD # 6 - Click Finish.
The R session information (including the OS info, R version and all packages used):
sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt) ## Platform: x86_64-w64-mingw32/x64 (64-bit) ## Running under: Windows 10 x64 (build 22621) ## ## Matrix products: default ## ## locale: ## [1] LC_COLLATE=English_United States.utf8 LC_CTYPE=English_United States.utf8 ## [3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C ## [5] LC_TIME=English_United States.utf8 ## ## attached base packages: ## [1] stats graphics grDevices utils datasets methods base ## ## other attached packages: ## [1] lubridate_1.8.0 plotly_4.10.0 readxl_1.4.1 actuar_3.3-0 ## [5] NADA_1.6-1.1 forcats_0.5.2 stringr_1.4.1 dplyr_1.0.9 ## [9] purrr_0.3.4 readr_2.1.2 tidyr_1.2.0 tibble_3.1.8 ## [13] ggplot2_3.3.6 tidyverse_1.3.2 fitdistrplus_1.1-8 survival_3.4-0 ## [17] MASS_7.3-58.1 ## ## loaded via a namespace (and not attached): ## [1] lattice_0.20-45 assertthat_0.2.1 digest_0.6.29 utf8_1.2.2 ## [5] R6_2.5.1 cellranger_1.1.0 backports_1.4.1 reprex_2.0.2 ## [9] evaluate_0.16 highr_0.9 httr_1.4.4 pillar_1.8.1 ## [13] rlang_1.0.5 lazyeval_0.2.2 googlesheets4_1.0.1 rstudioapi_0.14 ## [17] data.table_1.14.2 Matrix_1.5-1 splines_4.2.2 googledrive_2.0.0 ## [21] htmlwidgets_1.5.4 munsell_0.5.0 broom_1.0.1 compiler_4.2.2 ## [25] modelr_0.1.9 xfun_0.32 pkgconfig_2.0.3 htmltools_0.5.3 ## [29] tidyselect_1.1.2 fansi_1.0.3 viridisLite_0.4.1 crayon_1.5.1 ## [33] tzdb_0.3.0 dbplyr_2.2.1 withr_2.5.0 grid_4.2.2 ## [37] jsonlite_1.8.0 gtable_0.3.1 lifecycle_1.0.1 DBI_1.1.3 ## [41] magrittr_2.0.3 scales_1.2.1 writexl_1.4.0 cli_3.3.0 ## [45] stringi_1.7.8 fs_1.5.2 xml2_1.3.3 ellipsis_0.3.2 ## [49] generics_0.1.3 vctrs_0.4.1 expint_0.1-7 tools_4.2.2 ## [53] glue_1.6.2 hms_1.1.2 fastmap_1.1.0 colorspace_2.0-3 ## [57] gargle_1.2.0 rvest_1.0.3 knitr_1.40 haven_2.5.1
Sys.time()
## [1] "2024-01-05 09:11:47 PST"