This report is automatically generated with the R package knitr (version 1.40).
# General Method:
# - Use Common Name, Length, Date, Subarea, Analyte, Result, and Unit to
#   identify data repeated between data sources.
# - Remove false repeats from "repeats_All" by hand.
# - For true repeats, sort repeats based on data source preference — see
#   "DataSourceHierarchy.xlsx".
# - Remove data remaining in "repeats_All" from the Master data list, i.e.
#   remove from "repeats_All" the data that we want to keep in Master.
source("R Functions/functions_QA data.R")

# Load QA'd Data Master & Add SourceID Hierarchy ------------------------------
# SourceID Hierarchy column to be used when removing repeating data.
dataSourceHierarchy <- readxl::read_excel(
  'Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/6_DataSourceHierarchy.xlsx',
  sheet = 'Data Selection Hierarchy',
  guess_max = 30000
)

FishMaster_DELTA <- readxl::read_excel(
  'Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/5a_FishMaster_QA.xlsx',
  sheet = 'Sheet1',
  guess_max = 30000
) %>%
  left_join(dataSourceHierarchy, by = 'SourceID')  # adds Hierarchy column based on SourceID

nrow(FishMaster_DELTA)
## [1] 7719
## Export to Excel to view ----
writexl::write_xlsx(
  FishMaster_DELTA,
  path = paste0(
    'Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/6b.0_FishMaster_withRepeats',
    '.xlsx'
  )
)

# Identify Repeats pt. 1 ------------------------------------------------------
# Arrange by the MDL column so the row with an MDL value is on top. This
# allows it to be captured by distinct() and then removed from repeated data.
fish_repeats <- FishMaster_DELTA %>%
  mutate(Repeats = paste(Subarea, CommonName, TLAvgLength, Analyte,
                         SampleDate, Result, Unit)) %>%
  group_by(Repeats) %>%
  mutate(n = n()) %>%
  filter(n > 1) %>%
  arrange(Repeats, MDL) %>%
  ungroup()

nrow(fish_repeats)
## [1] 4180
## Standardizing weights ----
# To capture known weights across repeated rows: standardize weights across
# repeats / append known 0's and NA's.

# First, get Repeats where WeightAvg > 0.
weight_not_na <- fish_repeats %>%
  filter(WeightAvg > 0) %>%
  mutate(WeightAvg2 = WeightAvg) %>%
  distinct(Repeats, .keep_all = TRUE) %>%
  select(Repeats, WeightAvg2)

# Next, add WeightAvg from weight_not_na to repeats where WeightAvg is 0.
repeats_with_weightavg <- fish_repeats %>%
  left_join(weight_not_na, by = 'Repeats') %>%  # add WeightAvg2 column
  mutate(WeightAvg = case_when(
    # Use the known WeightAvg2 value where WeightAvg is 0 or missing.
    WeightAvg == 0 | is.na(WeightAvg) ~ WeightAvg2,
    TRUE ~ WeightAvg
  )) %>%
  select(-WeightAvg2)  # remove unneeded column

## Export to Excel to View ----
# Export to Excel and examine the repeats (includes "N/A" Result values,
# which will be addressed later).
writexl::write_xlsx(
  repeats_with_weightavg,
  path = paste0('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/6b.1_Repeats_',
                today(), '.xlsx')
)

## Check rounded latitude/longitude ----
# See if that helps with identifying repeats.
# Decided not to do this because we do not know how accurate the lat/long
# values are and it may be excluding true repeats from fish_repeats.
# Below is the test code, ultimately not utilized except as a QA check.
# roundedlatlong <- fish_clean %>%
#   mutate(TargetLatitude = round(TargetLatitude, 0)) %>%
#   mutate(TargetLongitude = round(TargetLongitude, 0))
#
# test_repeats <- roundedlatlong %>%
#   mutate(Repeats = paste(Subarea, CommonName, TLAvgLength, Analyte,
#                          SampleDate, Result, TargetLatitude,
#                          TargetLongitude)) %>%
#   group_by(Repeats) %>%
#   mutate(n = n()) %>%
#   filter(n > 1) %>%
#   arrange(Repeats, MDL) %>%
#   ungroup()
# nrow(test_repeats)
# Remove Repeated Data pt. 1 --------------------------------------------------
# Load manually corrected repeated data, remove the single instance we want to
# keep in Master, then remove the remaining repeated data from Master.
first_repeats <- readxl::read_excel(
  'Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/6c.2_Repeats_2023-04-12_ManuallyRemovedFalseRepeats.xlsx',
  sheet = 'Sheet1',
  guess_max = 30000
) %>%
  arrange(Repeats, Hierarchy)

# Capture the "top" single instance we want to keep in Master.
# Do not keep any instances of n = 4 or n = 6: the instances to keep were
# already manually deleted in Excel, so the remainder here are all removed.
repeats_tokeep <- first_repeats %>%
  filter(n != 4) %>%
  filter(n != 6) %>%
  distinct(Repeats, n, .keep_all = TRUE)

# Remove the instance we want to keep from the list of repeated data.
repeats_toberemoved <- anti_join(first_repeats, repeats_tokeep,
                                 by = c('SourceID', 'SourceRow')) %>%
  select(-n, -Repeats)

# Sanity check: this needs to be TRUE.
nrow(first_repeats) == nrow(repeats_tokeep) + nrow(repeats_toberemoved)
## [1] TRUE
## Add WeightAvg from repeats_tokeep ----
# Because repeats_tokeep kept CEDEN over R5MF, but CEDEN did not have the
# correct WeightAvg value.
repeats_WeightAvg <- repeats_tokeep %>%
  mutate(WeightAvg2 = WeightAvg) %>%
  select(SourceID, SourceRow, WeightAvg2)

first_fish_final <- FishMaster_DELTA %>%
  left_join(repeats_WeightAvg, by = c('SourceID', 'SourceRow')) %>%  # adds WeightAvg2 column
  mutate(WeightAvg = case_when(
    # Use the WeightAvg2 value for repeats that were kept.
    !is.na(WeightAvg2) ~ WeightAvg2,
    TRUE ~ WeightAvg
  )) %>%
  select(-WeightAvg2)

## Remove repeats_toberemoved from master table ----
first_FishMaster_noRepeats <- anti_join(first_fish_final, repeats_toberemoved,
                                        by = c('SourceID', 'SourceRow')) %>%
  select(-Hierarchy)  # remove extraneous column

# Sanity check: this needs to be TRUE.
nrow(first_fish_final) ==
  nrow(first_FishMaster_noRepeats) + nrow(repeats_toberemoved)
## [1] TRUE
## Export file to excel: ----
# writexl::write_xlsx(FishMaster_noRepeats, path=paste0('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/6a_first_FishMaster_noRepeats_',today(),'.xlsx'))

# Catching Missed Repeats ------------------------------------------------------

## New Year's Day (Incorrect Dates) ----
# Standardize dates across repeats / append known incorrect values (just like
# was done with weights). We know there would be no sampling done on this
# holiday, so the date must have been incorrectly written as January 1st.

# First, modify the table to catch rounding-error repeats, to maintain
# consistency and be efficient for this second round of repeat removal.
rounded_repeats <- first_FishMaster_noRepeats %>%
  mutate(RoundedResult = round(Result, digits = 3)) %>%
  mutate(RoundedLength = round(as.numeric(TLAvgLength), digits = 1)) %>%
  mutate(Year = year(SampleDate)) %>%
  mutate(Repeats = paste(Subarea, CommonName, RoundedLength, Analyte, Year,
                         RoundedResult, Unit)) %>%
  left_join(dataSourceHierarchy, by = 'SourceID')

# Get Repeats where the date is not 01/01 (manually checked: all in 1998-2003,
# no other occurrences of January samples after 2003; all January occurrences
# are 01/01). Note: month(1) evaluates to 1, i.e. January.
truedates <- rounded_repeats %>%
  filter(month(SampleDateTime) != month(1)) %>%
  mutate(SampleDateTime2 = SampleDateTime) %>%
  # 9/8/21 ROBIN's NOTE: Instead of distinct() may need "group_by(Repeats)" &
  # "filter(n() > 1)", "filter(n() == 1)", or another filter() depending on
  # intention of truedates.
  distinct(Repeats, .keep_all = TRUE) %>%
  select(Repeats, SampleDateTime2)

# Add SampleDateTime from truedates to the dataset where SampleDateTime is
# 01/01.
corrected_dates <- rounded_repeats %>%
  left_join(truedates, by = 'Repeats') %>%  # adds SampleDateTime2 column
  mutate(SampleDateTime = case_when(
    # Use SampleDateTime2 where SampleDateTime is 01/01 and a corresponding
    # SampleDateTime2 value exists.
    (month(SampleDateTime) == month(1)) & !is.na(SampleDateTime2) ~ SampleDateTime2,
    TRUE ~ SampleDateTime
  )) %>%
  mutate(SampleDate = ymd(SampleDateTime)) %>%
  select(-SampleDateTime2, -Repeats)  # remove unneeded columns

# Results show that only 2 sampling events remain with incorrect 01/01 data,
# at which point nothing can be done for these (no true date data).

## Identifying Repeats pt. 2 ----
# Fixing missed repeats from rounding errors, incorrect dates and common
# names:
# - rounded Result and Length to capture true repeats
# - caught repeats using Year; gets some repeats with incorrect dates to
#   manually remove later
# - ensured SourceIDs are different
fish_repeats2 <- corrected_dates %>%
  mutate(Repeats1 = paste(Subarea, CommonName, RoundedLength, Analyte, Year,
                          RoundedResult, Unit)) %>%
  group_by(Repeats1) %>%
  mutate(number_repeats = n()) %>%
  filter(number_repeats > 1) %>%
  mutate(Repeats2 = paste(SourceID, Repeats1)) %>%
  group_by(Repeats2) %>%
  mutate(number_sources = n()) %>%
  filter(number_sources == 1) %>%
  arrange(Repeats1, MDL) %>%
  ungroup()

nrow(fish_repeats2)
## [1] 1927
## Export to Excel ----
# Export to Excel and examine the repeats (includes "N/A" Result values).
writexl::write_xlsx(
  fish_repeats2,
  path = paste0(
    'Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/6d.1_Repeats_rounding_',
    today(), '.xlsx'
  )
)

# Remove Repeated Data pt. 2 ---------------------------------------------------
# Same process: load manually corrected repeated data, remove the single
# instance we want to keep in Master, then remove the remaining repeated data
# from Master.
removed_repeats_rounding <- readxl::read_excel(
  'Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/6d.3_Repeats_rounding_manuallyremoved_2023-04-12.xlsx',
  sheet = 'Sheet1',
  guess_max = 30000
) %>%
  arrange(Repeats1, Hierarchy)

# Capture the "top" single instance we want to keep in Master.
# Do not keep any instances of n = 3: the instances to keep were already
# manually deleted, so the remainder here are all removed.
repeats_tokeep_rounding <- removed_repeats_rounding %>%
  filter(number_repeats != 3) %>%
  distinct(Repeats1, number_repeats, .keep_all = TRUE)

# Remove the instance we want to keep from the list of repeated data, and
# drop the bookkeeping columns we no longer need.
repeats_toberemoved_rounding <- anti_join(removed_repeats_rounding,
                                          repeats_tokeep_rounding,
                                          by = c('SourceID', 'SourceRow')) %>%
  select(-number_repeats, -number_sources, -Repeats1, -Repeats2)

# Sanity check: this needs to be TRUE.
nrow(removed_repeats_rounding) ==
  nrow(repeats_tokeep_rounding) + nrow(repeats_toberemoved_rounding)
## [1] TRUE
## Remove repeats_toberemoved from master table ----
FishMaster_norepeats <- anti_join(corrected_dates, repeats_toberemoved_rounding,
                                  by = c('SourceID', 'SourceRow')) %>%
  # Remove extraneous columns added for repeat detection.
  select(-Hierarchy, -RoundedResult, -RoundedLength, -Year)

# Sanity check: this needs to be TRUE.
nrow(corrected_dates) ==
  nrow(FishMaster_norepeats) + nrow(repeats_toberemoved_rounding)
## [1] TRUE
# Final data clean --------------------------------------------------------------
# Append true SampleDateTime and SampleDate from repeats_tokeep_rounding
# because there were some inaccurate 01/01 date values missed from
# "Identify Repeats pt. 1".
final_fix_dates <- FishMaster_norepeats %>%
  mutate(Year = year(SampleDate)) %>%
  mutate(Repeats = paste(Subarea, CommonName, TLAvgLength, Analyte, Year,
                         Result, Unit)) %>%
  group_by(Repeats) %>%
  arrange(Repeats, MDL) %>%
  ungroup()

nrow(final_fix_dates)
## [1] 4526
# Build a Repeats -> true SampleDateTime lookup from the original master,
# using only rows whose month is not January (i.e. not suspect 01/01 dates).
original_with_truedates <- FishMaster_DELTA %>%
  mutate(Year = year(SampleDate)) %>%
  mutate(Repeats = paste(Subarea, CommonName, TLAvgLength, Analyte, Year,
                         Result, Unit)) %>%
  filter(month(SampleDateTime) != month(1)) %>%
  mutate(SampleDateTime2 = SampleDateTime) %>%
  distinct(Repeats, .keep_all = TRUE) %>%
  select(Repeats, SampleDateTime2)

fish_final <- final_fix_dates %>%
  left_join(original_with_truedates, by = 'Repeats') %>%  # adds SampleDateTime2 column
  mutate(SampleDateTime = case_when(
    # Use SampleDateTime2 where SampleDateTime is 01/01.
    (month(SampleDateTime) == month(1)) & !is.na(SampleDateTime2) ~ SampleDateTime2,
    TRUE ~ SampleDateTime
  )) %>%
  mutate(SampleDate = ymd(SampleDateTime)) %>%
  select(-SampleDateTime2, -Repeats)

nrow(fish_final)
## [1] 4526
# Comparing the two versions shows the code above is really more of a check,
# as it doesn't change any of the 01/01 dates.
nrow(fish_final %>% filter(month(SampleDateTime) == month(1)))
## [1] 570
# Count of January (01/01) rows in the pre-fix master, for comparison.
FishMaster_norepeats %>%
  filter(month(SampleDateTime) == month(1)) %>%
  nrow()
## [1] 570
# Inspect the remaining January (01/01) rows interactively.
View(FishMaster_norepeats %>% filter(month(SampleDateTime) == month(1)))

# Export final file to excel: ----
# NOTE(review): this exports FishMaster_norepeats, not fish_final — presumably
# intentional since the 01/01-date check above showed no differences, but
# confirm fish_final was meant to be discarded.
writexl::write_xlsx(
  FishMaster_norepeats,
  path = paste0('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/6a_FishMaster_noRepeats.xlsx')
)
The R session information (including the OS info, R version and all packages used):
# Record session info (OS, R version, attached packages) for reproducibility.
sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt) ## Platform: x86_64-w64-mingw32/x64 (64-bit) ## Running under: Windows 10 x64 (build 22621) ## ## Matrix products: default ## ## locale: ## [1] LC_COLLATE=English_United States.utf8 LC_CTYPE=English_United States.utf8 ## [3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C ## [5] LC_TIME=English_United States.utf8 ## ## attached base packages: ## [1] stats graphics grDevices utils datasets methods base ## ## other attached packages: ## [1] lubridate_1.8.0 plotly_4.10.0 readxl_1.4.1 actuar_3.3-0 ## [5] NADA_1.6-1.1 forcats_0.5.2 stringr_1.4.1 dplyr_1.0.9 ## [9] purrr_0.3.4 readr_2.1.2 tidyr_1.2.0 tibble_3.1.8 ## [13] ggplot2_3.3.6 tidyverse_1.3.2 fitdistrplus_1.1-8 survival_3.4-0 ## [17] MASS_7.3-58.1 ## ## loaded via a namespace (and not attached): ## [1] lattice_0.20-45 assertthat_0.2.1 digest_0.6.29 utf8_1.2.2 ## [5] R6_2.5.1 cellranger_1.1.0 backports_1.4.1 reprex_2.0.2 ## [9] evaluate_0.16 highr_0.9 httr_1.4.4 pillar_1.8.1 ## [13] rlang_1.0.5 lazyeval_0.2.2 googlesheets4_1.0.1 rstudioapi_0.14 ## [17] data.table_1.14.2 Matrix_1.5-1 splines_4.2.2 googledrive_2.0.0 ## [21] htmlwidgets_1.5.4 munsell_0.5.0 broom_1.0.1 compiler_4.2.2 ## [25] modelr_0.1.9 xfun_0.32 pkgconfig_2.0.3 htmltools_0.5.3 ## [29] tidyselect_1.1.2 fansi_1.0.3 viridisLite_0.4.1 crayon_1.5.1 ## [33] tzdb_0.3.0 dbplyr_2.2.1 withr_2.5.0 grid_4.2.2 ## [37] jsonlite_1.8.0 gtable_0.3.1 lifecycle_1.0.1 DBI_1.1.3 ## [41] magrittr_2.0.3 scales_1.2.1 writexl_1.4.0 cli_3.3.0 ## [45] stringi_1.7.8 fs_1.5.2 xml2_1.3.3 ellipsis_0.3.2 ## [49] generics_0.1.3 vctrs_0.4.1 expint_0.1-7 tools_4.2.2 ## [53] glue_1.6.2 hms_1.1.2 fastmap_1.1.0 colorspace_2.0-3 ## [57] gargle_1.2.0 rvest_1.0.3 knitr_1.40 haven_2.5.1
# Timestamp marking when this report was generated.
Sys.time()
## [1] "2024-01-05 08:55:50 PST"