This report is automatically generated with the R package knitr (version 1.40) .

options(scipen = 999) # turns off scientific notation

# library(scales)
library(readxl)       # loads excel files as tibbles (part of tidyverse)
library(plotly)       # interactive graphs
library(lubridate)    # makes working with dates easier
library(tidyverse)    # functions to work with tibbles


# CUSTOM FUNCTIONS #
sigfig <- function(vec, n=3){
  ### function to print values to N significant digits without dropping trailing zeros
  # input:   vec       vector of numeric
  #          n         integer is the required sigfig  
  # output:  outvec    vector of numeric rounded to N sigfig

  # returns a character output - used in "hovertext" for plotly graphs
  formatC(signif(vec,digits=n), digits=n, format="fg", flag="#")
}


# Function to round values in a dataframe
signif_df <- function(x, digits) {
  # round numeric columns
  # x = data frame 
  # digits = number of digits to round
  numeric_columns <- sapply(x, mode) == 'numeric'
  x[numeric_columns] <-  signif(x[numeric_columns], digits)
  x
}



`%are not%` <- function (x, table) is.na(match(x, table, nomatch=NA_integer_)) #https://stackoverflow.com/questions/5831794/opposite-of-in

  # Collapses vector to be like a grammatical list (i.e., seperated by commas, with individual quotes, and last element "and")  #
  # Needs to be wrapped in a cat() to print properly; e.g., cat(grammaticList(c('a','b', 'c')))
grammaticList <- function(vec) sub(",\\s+([^,]+)$", ", and \\1", ifelse(length(vec) == 2, paste(vec, collapse=" and "), paste(vec, collapse=", ")))

#https://plotly-r.com/working-with-symbols.html
cust_shape <- c('cross-open', 'diamond-open', 'circle-open', 'x-open', 'triangle-up-open', 'hash-open', 'star-square-open',
                'star-open', 'diamond-tall-open', 'star-triangle-down-open', 'star-diamond-open', 'hourglass-open', 'asterisk-open', 'square-open')


# FUNCTION TO LOAD DATA #
loadData <- function(filename){
  data <- readxl::read_xlsx(filename, sheet=1, guess_max = 100000) %>%
    mutate(tempSourceID = substr(SourceID, 1, 5),
           Shape = as.character(cust_shape[as.factor(tempSourceID)])) %>%   #Sets shape initally so shape doesn't change for SourceID on each graph; SourceID needs to be a factor; Shape needs to be character for Plotly graphing
    select(-tempSourceID)

  cat("Data set has", nrow(data),"rows. Added Shape column based on SourceID for use in result_time_Plotly().")
  return(data)
}


# FUNCTION TO STANDARDIZE UNITS
standardizeUnits <- function(dataframe, pp='volume'){
  # pp = "parts per" million (ppm) or billion (ppb); if not by volume then assumes by mass


  # If Column is All NA; convert form Logical NA to Real NA
  if(all(is.na(dataframe$Result))){
    dataframe$Result <- NA_real_
  }

  if(all(is.na(dataframe$MDL))){
    dataframe$MDL <- NA_real_
  }

  if(all(is.na(dataframe$RL))){
    dataframe$RL <- NA_real_
  }


  newUnits <- dataframe %>%
    mutate(
      #ug/L * 1000 = ng/L   "\U00B5" is unicode symbol for mu (the micro symbol)
      Result = ifelse(grepl('(\U00B5|u)g/l',Unit,ignore.case=T, perl=T), Result*1000, Result),
      MDL = ifelse(grepl('(\U00B5|u)g/l',Unit,ignore.case=T, perl=T), MDL*1000, MDL),
      RL = ifelse(grepl('(\U00B5|u)g/l',Unit,ignore.case=T, perl=T), RL*1000, RL),
      Unit = gsub('(\U00B5|u)g/l', 'ng/L', Unit, ignore.case=T, perl=T),

      #mg/L * 1,000,000 = ng/L & ppm * 1,000,000 = ng/L
      #if ppm only appies to measurements by volume; replace 'mg/l' with 'mg/l|ppm'
      Result = ifelse(grepl('mg/l',Unit,ignore.case=T, perl=F), Result*1000000, Result),
      MDL = ifelse(grepl('mg/l',Unit,ignore.case=T, perl=F), MDL*1000000, MDL),
      RL = ifelse(grepl('mg/l',Unit,ignore.case=T, perl=F), RL*1000000, RL),
      Unit = gsub('mg/l', 'ng/L', Unit, ignore.case=T, perl=F),

      #ug/kg * 0.001 = mg/kg;  ng/g * 0.001= mg/kg
      Result = ifelse(grepl('(\U00B5|u)g/kg|ng/g',Unit,ignore.case=T, perl=T), Result*0.001, Result),
      MDL = ifelse(grepl('(\U00B5|u)g/kg|ng/g',Unit,ignore.case=T, perl=T), MDL*0.001, MDL),
      RL = ifelse(grepl('(\U00B5|u)g/kg|ng/g',Unit,ignore.case=T, perl=T), RL*0.001, RL),
      Unit = gsub('(\U00B5|u)g/kg|ng/g','mg/Kg', Unit, ignore.case=T, perl=T),

      #ug/g = mg/kg; only need to change Unit
      Unit = gsub('(\U00B5|u)g/g','mg/Kg', Unit, ignore.case=T, perl=T),


      # ppm * 1,000,000 = ng/L | ppm = mg/kg
      # ppb * 1,000 = ng/L     | ppb * 0.001 = mg/kg
      Result = case_when(pp == "volume" & grepl('ppm',Unit,ignore.case=T, perl=F) ~ Result*1000000, # ppm * 1,000,000 = ng/L
                         pp == "volume" & grepl('ppb',Unit,ignore.case=T, perl=F) ~ Result*1000,    # ppb * 1,000 = ng/L
                         pp != "volume" & grepl('ppb',Unit,ignore.case=T, perl=F) ~ Result*0.001,   # ppb * 0.001 = mg/kg
                         T ~ Result),                                                               # ppm = mg/kg
      MDL = case_when(pp == "volume" & grepl('ppm',Unit,ignore.case=T, perl=F) ~ MDL*1000000, # ppm * 1,000,000 = ng/L
                      pp == "volume" & grepl('ppb',Unit,ignore.case=T, perl=F) ~ MDL*1000,    # ppb * 1,000 = ng/L
                      pp != "volume" & grepl('ppb',Unit,ignore.case=T, perl=F) ~ MDL*0.001,   # ppb * 0.001 = mg/kg
                      T ~ MDL),                                                               # ppm = mg/kg

      RL = case_when(pp == "volume" & grepl('ppm',Unit,ignore.case=T, perl=F) ~ RL*1000000, # ppm * 1,000,000 = ng/L
                     pp == "volume" & grepl('ppb',Unit,ignore.case=T, perl=F) ~ RL*1000,    # ppb * 1,000 = ng/L
                     pp != "volume" & grepl('ppb',Unit,ignore.case=T, perl=F) ~ RL*0.001,   # ppb * 0.001 = mg/kg
                     T ~ RL),                                                               # ppm = mg/kg

      Unit = case_when(pp == "volume" ~ gsub('ppm|ppb','ng/L', Unit, ignore.case=T, perl=T),
                       pp != "volume" ~ gsub('ppm|ppb','mg/Kg', Unit, ignore.case=T, perl=T),
                       T ~ Unit),


      #standardize ng/L and mg/Kg letter case
      Unit = gsub('ng/l', 'ng/L', Unit, ignore.case=T, perl=F),
      Unit = gsub('mg/kg', 'mg/Kg', Unit, ignore.case=T, perl=F)
    )
  return(newUnits)
}



# FUNCTION TO CONVERT CHARACTER NUMBERS TO NUMERIC FORMAT #
chara_to_NumDate <- function(dataframe){
  #Look for columns that are character format & if no letters (except e or E for scientific notation) convert format to numeric
  charaTOnumdate <- function(df){
    #1) Look for columns that are character format & if no letters but contains multiple -'s: convert format to date
    if(is.character(df) & !any(grepl('[a-zA-Z]', df)) & any(grepl('-.*-', df))){ymd(df)}
    #2) Look for columns that are character format & if no letters but contains a ':' - convert format to time
    else if(is.character(df) & !any(grepl('[a-zA-Z]', df)) & any(grepl(':', df))){hms(df, quiet=T)}
    #3) Look for columns that are character format & if no letters (except e or E for scientific notation): convert format to numeric
    else if(is.character(df) & !any(grepl('[a-df-zA-DF-Z]', df))) {as.numeric(df)} #converts character columns that are numeric to numeric; excludes in case of sci notation
    else{df}
  }
  #Apply charaTOnumdate function
  dataframe <- as_tibble(lapply(dataframe, charaTOnumdate))
  return(dataframe)
}


# CHECK FOR BLANKS IN A COLUMN #
anyBlank <- function(dataframe, column, showBlanks = T){
  #https://dplyr.tidyverse.org/articles/programming.html
  column <- enquo(column)
  columnBlanks <- dataframe %>%
    filter (is.na(!! column) | !! column == ''| !! column == 'Not Reported')
  if(nrow(columnBlanks) == 0){
    cat("The column", as_label(column), "does not contain any blanks.")
  }else{
    sources <- grammaticList(unique(dataframe$SourceID))

    cat('The following', nrow(columnBlanks),"rows have a blank value in the", as_label(column), "column: ", sources )
    if(showBlanks) View(columnBlanks)
    return(columnBlanks)
  }
}


# LOOK AT FILTERABLE FACTORS IN SPECIFIED COLUMNS #
unique_factors <- function(dataframe, ...){
  #https://dplyr.tidyverse.org/articles/programming.html
  selector_var <- enquos(...)
  factors <- dataframe %>%
    select(!!! selector_var)
  lapply(factors, function(x) unique(sort(x, na.last=F))) #TO PUT THIS INTO A TIBBLE: tbl <- tibble(Columns = names(aq_factors), Factors = lapply(aq_factors, function(x) unique(sort(x)))); tbl$Columns[[1]]; tbl$Factors[[1]] 
}


## GRAPH RESULT, MDL, & RL VALUES VS TIME #
graphSubset <- function(dataframe, groupByCol, column){
  #https://dplyr.tidyverse.org/articles/programming.html
  groupBycolumn <- enquo(groupByCol)
  column <- enquo(column)
  graph_subset <- dataframe %>%
    select(!! groupBycolumn, Value = !! column, .data$SampleDate, .data$SourceID, .data$SourceRow, .data$Subarea, .data$StationName,    #Selects and renames 'column' to Value
           .data$Analyte, .data$Unit, .data$ResultQualCode, .data$Shape, Matrix = any_of(c("MatrixName", "TissueName"))) %>%
    filter(!is.na(Value)) %>%
    mutate(ValueType = as_label(column)) #Put variable name of column (e.g., "Result", "MDL", or "RL") in column called "ValueType"

  return(graph_subset)
}

result_time_Plotly <- function(dataframe, groupByCol, showMean=F, interactive=T, logscale=T, showLegend=F){
  ## Required Columns
  # Result - numeric
  # SampleDate - dates
  # SourceID - can be NA
  # Subarea - can be NA
  # Shape - can be NA


  # groupByCol <- quo(Subarea) - to test function line-wise
  groupBycolumn <- enquo(groupByCol)

  df_name <- deparse(substitute(dataframe))

  results <- graphSubset(dataframe, groupByCol=as_label(groupBycolumn), Result) %>%
    mutate(ValueType = ifelse(ResultQualCode == "ND", "ND", ValueType), # Distinguish ND Results from "=" & DNQ Results
           n = 1) # add dummy n column so can rbind with result_monthlyAvg

  monthlyAvg <- results %>%
    mutate(ValueType = "Monthly Avg",
           SampleDate = ymd(paste0(year(SampleDate),'-', month(SampleDate), "-15"))) %>%
    group_by(!!groupBycolumn, SampleDate) %>%
    mutate(Value = mean(Value),  # Geometric mean: exp(mean(log(Value)))
              n=n()) %>%
    distinct(!!groupBycolumn, SampleDate, .keep_all=T) %>% # keep all columns so can rbind with results, mdl, rl dataframes
    ungroup

  mdl <- graphSubset(dataframe, groupBy=as_label(groupBycolumn), MDL) %>%
    mutate(n=1)
  rl <- graphSubset(dataframe, groupBy=as_label(groupBycolumn), RL) %>%
    mutate(n=1)

  graphdata <- rbind(results,mdl, rl)
  graphdata$ValueType <- factor(graphdata$ValueType, levels = c("Result", "MDL", "RL", "ND")) #makes factor order consistent

  cust_color <- c("gray24", "deeppink2", "turquoise3", "gray54")
  used_colors <- setNames(cust_color, levels(graphdata$ValueType)) #<or> "graphdata$Color <- cust_color[graphdata$ValueType]" #ValueType needs to be a factor; "used_colors <- setNames(unique(graphdata$Color), unique(graphdata$ValueType))"

  used_shapes <- setNames(graphdata$Shape, graphdata$SourceID)  #<or if Shape column not used> used_shapes <- setNames(cust_shape[1:length(unique(graphdata$SourceID))], unique(graphdata$SourceID)) 
  used_shapes <- used_shapes[unique(names(used_shapes))]

  p <- ggplot() +
    geom_point(data=graphdata, aes(x=SampleDate, y=Value, color=ValueType, shape=SourceID,
                                   text=purrr::map(paste0('SourceRow: <b>',SourceRow, '</b><br>', 'StationName: <b>',StationName,'</b><br>', '<b>', Subarea, '</b>' ), htmltools::HTML)),  # sprintf("ValueType: %s<br>SouceID: %s<br>SourceRow: %s<br>StationName: %s", ValueType, SourceID, SourceRow, StationName)),
               show.legend=showLegend) +
    scale_color_manual(values = used_colors) +
    scale_shape_manual(values = used_shapes) +
    theme_light() +
    facet_grid(rows=vars(!!groupBycolumn))

  if(showMean == T){p <- p +
    geom_line(data=monthlyAvg, aes(x=SampleDate, y=Value),
              color="darkorange", size=.3) +
    geom_point(data=monthlyAvg, aes(x=SampleDate, y=Value,
                                    text=purrr::map(paste0('<b>', month.abb[month(SampleDate)], ' ', year(SampleDate), '</b><br>Monthly Avg = <b>', signif(Value,4), '</b><br>', 'n: <b>',n,'</b>'), htmltools::HTML)),  # sprintf("ValueType: %s<br>n: %s", ValueType, n)),
               color="darkorange", size=.7)}

  if(logscale == T){p <- p + scale_y_log10()}

  if(interactive == T){
    if(showLegend == T) {ggplotly(p)}else{hide_legend(ggplotly(p))}
  }else{p}

}



# STANDARDIZE RESULT QUAL CODE # use rules outlined in '1_ResQualCode Rules.xlsx'

# Secondary Functions to Support Primary Function
betweenDNQ <- function(x, y, z) y!=z & x > y & x < z     #betweenDNQ(0.2, 0.2, 0.3) => F

#Result should only be blank with DNQ or ND
removeData <- function(dataframe, filter_fun){
  #https://dplyr.tidyverse.org/articles/programming.html
  filter_fun <- enquo(filter_fun)
  filter_print <- rlang::quo_text(filter_fun, width=150) #width is the character length that a next line '\n' will be placed; default is 90

  deletedSamples <- dataframe %>%
    filter(!! filter_fun)
  if(nrow(deletedSamples) > 0){
    dataframe1 <- dataframe %>%
      anti_join(., deletedSamples, by=names(.))

    dfText <- capture.output(anti_join(dataframe, dataframe1, by=names(dataframe)))    #captures the dataframe output as text
    dfText <- dfText[!seq_len(length(dfText)) %in% c(1,3)]                             #removes tibble information (1st & 3rd row) from printing
    dfPrint <- paste(dfText, "\n", sep="")                                             #pastes a line seperator after each row of dataframe so it prints nicely

    cat('\nDataset tested for the following logic:\n  ', filter_print, '\nwhich is TRUE for',nrow(deletedSamples), 'instances.\n', dfPrint)
    View(deletedSamples)
    readline(prompt="The above data is considered to be erroneous and will be deleted. Click in 'Console' and Press [enter] to continue.")

    return(dataframe1)
  }else{
    cat('\nDataset tested for the following logic:\n  ', filter_print, '\nNo instances found to be TRUE. No data removed.\n\n')
    return(dataframe)
  }
}

# Primary Function
standQualCode <- function(dataframe, QualCodeColumn){
  # QualCodeColumn <- quo(ResultQualCode) - to test function line-wise
  qualColumn <- enquo(QualCodeColumn)

  #remove instances where (ResultQualCode is blank, 'ND', or 'Not Reported') & (MDL & RL are blank) -> not trustworthy Result
  dataframe <- removeData(dataframe, !is.na(Result) & is.na(MDL) & is.na(RL)& !! qualColumn == 'ND')

  #remove instances where (Result is blank) & (MDL & RL is not blank) & (ResultQualCode is not ND or DNQ) -> blank Result does not make sense for ResultQualCode =, E, or AVG
  dataframe <- removeData(dataframe, is.na(Result) & (!is.na(MDL) & !is.na(RL)) & (!! qualColumn != 'ND' &  !! qualColumn != 'DNQ'))

  #remove instances where Result = RL & MDL < RL & ResultQualCode = ND - when ResultQualCode is ND, Result should be less than or equal to MDL not RL
  dataframe <- removeData(dataframe, Result == RL & MDL < RL & !! qualColumn == 'ND')

  #standardize ResultQualCode   
  standQualCode <- dataframe %>%
    mutate(origResult = Result,      #back up the original Result column for reference
           origQualCode = !! qualColumn, #back up the original QualCode column for reference
           Result = case_when(Result == 0                                            ~ NA_real_,
                              (Result == MDL | Result == RL) & origQualCode =='ND'   ~ NA_real_,
                              Result < MDL & is.na(RL)                               ~ NA_real_,
                              Result < RL & is.na(MDL)                               ~ NA_real_,
                              TRUE ~ Result ),

           !! qualColumn := case_when(pmap_lgl(list(Result, MDL, RL), betweenDNQ)                  ~ 'DNQ',   #need pmap_lgl to do between() as a rowwise function
                                    !is.na(Result) & is.na(MDL) & is.na(RL) & !is.na(origQualCode) ~ '=',
                                    Result > MDL & Result > RL                                     ~ '=',
                                    Result > MDL & is.na(RL)                                       ~ '=',
                                    Result > RL & is.na(MDL)                                       ~ '=',
                                    Result == MDL & RL > MDL  & origQualCode != 'ND'               ~ 'DNQ',
                                    (Result == RL | Result == MDL) & (origQualCode != 'ND' & origQualCode != 'DNQ') ~ '=',
                                    (Result < MDL & is.na(RL)) | (Result < RL & is.na(MDL))                         ~ 'ND',
                                    is.na(Result) & ((!is.na(MDL) & is.na(RL)) | (is.na(MDL) & !is.na(RL)))         ~ 'ND',
                                    MDL > RL & (Result < MDL | is.na(Result))                                       ~ 'ND',
                                    TRUE ~ origQualCode)
           )

  #See any  new QualCodes are NA, 'Not Reported' or ''
  cat('\n', as_label(qualColumn), 'is now standardized and original QualCodes saved in column named origQualCode. ')
  temp <- anyBlank(standQualCode, !! qualColumn, showBlanks=F) # returns a NULL if no blanks
  if(!is.null(temp)){
    temp <- temp %>%
      select(TMDL, Subarea, SourceID, SourceRow, StationName, SampleDateTime, Analyte, Result, MDL, RL, Unit, origQualCode, !! qualColumn)
    dfText <- capture.output(temp)                               #captures the dataframe output as text
    dfText <- dfText[!seq_len(length(dfText)) %in% c(1,3)]       #removes tibble information (1st & 3rd row) from printing
    dfPrint <- paste(dfText, "\n", sep="")                       #pastes a line seperator after each row of dataframe so it prints nicely

    makingNewQualCodeEqual <- quo((is.na(!! qualColumn) | !! qualColumn == 'Not Reported' | !! qualColumn == '') & !is.na(Result) & is.na(MDL) & is.na(RL)) #logic to make a blank QualCode '='
     standQualCode <- standQualCode %>%
      mutate(!! qualColumn := if_else(!! makingNewQualCodeEqual, '=', !! qualColumn)) %>% #if Result is not blank and no new QualCode -> make new QualCode '='
      anti_join(. ,temp, by=names(temp))                                                #remove any samples where the new QualCode is still blank

    temp2 <- temp %>%
      mutate(!! qualColumn := if_else(!! makingNewQualCodeEqual, '=', !! qualColumn)) %>% #do same to temp, so User can see results
      anti_join(. ,temp, by=names(temp))
    dfText2 <- capture.output(temp2)                               #captures the dataframe output as text
    dfText2 <- dfText2[!seq_len(length(dfText2)) %in% c(1:3)]       #removes tibble information (1st & 3rd row) from printing
    dfPrint2 <- paste(dfText2, "\n", sep="")                       #pastes a line seperator after each row of dataframe so it prints nicely
    cat('\n', dfPrint, '\nOf the samples above, the following samples contain a Result but no MDL, RL, or QualCode and will be given a new QualCode of "=":\n', dfPrint2)
    if(nrow(temp) == nrow(temp2)){
      cat('All samples now contain a standardized QualCode.')
    }else{
      cat('\nThe remaining',nrow(temp)-nrow(temp2),'samples still have a blank QualCode and will be deleted. See "deletedSamples" tab.')
      col_names <- names(temp)[names(temp) %are not% as_label(qualColumn)] #don't include standradized QualCodes when looking for differences between temp and temp2
      deletedSamples <- temp %>%
        anti_join(., temp2, by=col_names)
      View(deletedSamples)
    }
  }else{
    cat('\n\nQualCode standardization completed based on rules established in "1_ResQualCode Rules.xlsx".')
  }

  return(standQualCode)
}

# # TESTING RESULTQUALCODE STANDARDIZATION FUNCTION #
# Result<-c(           0.2,          0.2,       0.2,       0.2,      0.2,      0.2,      0.2, NA_real_,      0.2, NA_real_,  0.2, NA_real_, NA_real_, NA_real_, 0.1,  0.1,      0.1, 0.3,  0.3,      0.3,  0.2, 0.05, NA_real_, NA_real_, NA_real_,  0.4, NA_real_, NA_real_, NA_real_,      0.2,      0.2,      0.2,      0.2,      0.2,      0.2, 0.2,  0.2,      0.2)
# MDL  <- c(           0.3,    NA_real_,  NA_real_,  NA_real_,      0.1, NA_real_,      0.3,      0.3, NA_real_, NA_real_,  0.1,      0.1,      0.1,      0.1, 0.1,  0.1,      0.1, 0.1,  0.1,      0.1,  0.3,  0.3,      0.3,      0.3,      0.3,  0.3,      0.2,      0.2,      0.2,      0.2,      0.2,      0.2, NA_real_, NA_real_, NA_real_, 0.2,  0.2,      0.2)
# RL   <- c(           0.4,     NA_real_,  NA_real_,  NA_real_, NA_real_,      0.1, NA_real_, NA_real_,      0.3,      0.3,  0.3,      0.3,      0.3,      0.3, 0.3,  0.3,      0.3, 0.3,  0.3,      0.3,  0.1,  0.1,      0.1,      0.1,      0.1,  0.1,      0.2,      0.2,      0.2, NA_real_, NA_real_, NA_real_,      0.2,      0.2,      0.2, 0.2,  0.2,      0.2)
# Code <- c('Not Reported',NA_character_,     'ND',    'notND',    'any',    'any',    'any',    'any',    'any',    'any','any',     'ND',    'DNQ','notNDNQ','ND','DNQ','notNDNQ','ND','DNQ','notNDNQ','any','any',     'ND',    'DNQ','notNDNQ','any',     'ND',    'DNQ','notNDNQ',     'ND',    'DNQ','notNDNQ',     'ND',    'DNQ','notNDNQ','ND','DNQ','notNDNQ')
# dfTestStandFun <- tibble(Result, MDL, RL, ResultQualCode=Code, TMDL='', SourceID='', Subarea='', SourceRow='', StationName='', SampleDateTime='', Analyte='', Unit='')

# fixedQualCode <- standQualCode(dfTestStandFun, ResultQualCode) 
# View(dfTestStandFun)
# View(fixedQualCode)
# View(anti_join(dfTestStandFun, fixedQualCode, by=c("Result"="origResult", "MDL", "RL", "ResultQualCode"="origQualCode")))


# Select only complete tidal cycles for analysis
# Function to create unique identifiers for each tide cycle, returns a data frame with an additional column "TideID"
unique_tidal_cycle <- function(clean_flow){
  # clean_flow is a data frame of clean flow data
  id_flow <- clean_flow %>%
    mutate(TideID = paste0(SiteType, '1')) # setup column w/ initial values to be changed by for loop below

  n_rows <- nrow(id_flow)

  j = 1  # numeric value to ID tide

  # for loop
  for(n in 2:n_rows){

    if(sign(id_flow$Flow[n-1]) != sign(id_flow$Flow[n])){
      j = j + 1
    }

    id_flow$TideID[n] <- paste0(id_flow$SiteType[n], j)
  }

  return(id_flow) # return data frame
}


# Function to identify gaps longer than 16 minutes between time interval measurements, returns a data frame with additional column "Gap" (TRUE if a gap larger than set time in minutes, NA if not a gap larger than set time in minutes)
identify_tidal_gaps <- function(flow, time_gaps){
  # flow is a data frame with unique cycle ID column
  # time_gaps is a time (in minutes); any gap between rows that is greater than this time is identified as a gap with TRUE

  flow_gaps <- flow %>%
    mutate(Gap = NA) # create an empty column for identified gaps

  n_rows <- nrow(flow_gaps)

  flow_gaps$Gap[1] = FALSE # assuming the first and last tidal cycles are incomplete
  flow_gaps$Gap[n_rows] = FALSE

  k = NA # set default of k to NA


  # for loop
  for(n in 2:(n_rows-1)){

    if((difftime(flow_gaps$SampleDateTime[n+1],
                 flow_gaps$SampleDateTime[n], units = "mins")) > time_gaps){
      k = TRUE
    } else if ((difftime(flow_gaps$SampleDateTime[n],
                         flow_gaps$SampleDateTime[n-1], units = "mins")) > time_gaps){
      k = TRUE
    }
    else {
      k = FALSE
    }
    flow_gaps$Gap[n] = k
  }

  return(flow_gaps) # return data frame
}


# Randomization Test - tests for significant difference between mean and median. Provides p-value of test
randomizationTest <- function(dataframe, .dataCol, .groupingCol, N=1000){
  # Currently, this code can test for significance between more than 2 grouping variables, but it can not test for pairwise differences.

  set.seed(124)

  # .dataCol <- quo(Result) - to test function line-wise
  dataCol <- enquo(.dataCol)

  # .groupingCol <- quo(SampleTypeCode) - to test function line-wise
  groupingCol <- enquo(.groupingCol)


  observed_stats <- dataframe %>%
    group_by(!!groupingCol) %>%
    summarise(obsMean = mean(!!dataCol),
              obsMedian = median(!!dataCol))

  obs_mean_diff <- diff(observed_stats %>% pull(obsMean))
  obs_median_diff <- diff(observed_stats %>% pull(obsMedian))


  sample_size <- nrow(dataframe)
  mean_diff_dist <- NULL
  median_diff_dist <- NULL
  for(i in 1:N){
    shuffle_data <- dataframe %>%
      slice_sample(n=sample_size, replace=F) %>%
      pull(!!dataCol)

    shuffle_stats <- dataframe %>%
      mutate(dataShuffle = shuffle_data) %>%
      group_by(!!groupingCol) %>%
      summarise(shuffleMean = mean(dataShuffle),
                shuffleMedian = median(dataShuffle))

    mean_diff_dist <- c(mean_diff_dist, diff(shuffle_stats %>% pull(shuffleMean)))
    median_diff_dist <- c(median_diff_dist, diff(shuffle_stats %>% pull(shuffleMedian)))

  }
  # More than or equal to abs value equates to a two tailed test
  mean_count <- ifelse(abs(mean_diff_dist) >= abs(obs_mean_diff), 1, 0)
  median_count <- ifelse(abs(median_diff_dist) >= abs(obs_median_diff), 1, 0)

  # get plot characteristics for ggplot
  mean_plot <- hist(mean_diff_dist, breaks=20, plot=F)
  median_plot <- hist(median_diff_dist, breaks=20, plot=F)

  # The percentage of 1s is the p-value for the test. This percentage will be small if the observed mean difference is unlikely to have happened by chance.
  results <- list(TestedGroups = dataframe %>% distinct(!!groupingCol) %>% pull,

                  obs_mean_diff = obs_mean_diff,
                  p_value_diff_means = sum(mean_count)/N,
                  mean_diff_dist = mean_diff_dist,

                  mean_diff_dist_plot = tibble(Mean_diff = mean_diff_dist) %>%
                    ggplot +
                    geom_histogram(aes(Mean_diff), bins=20) +
                    ylab("Frequency") +
                    xlab(paste("Randomized", paste(dataframe %>% distinct(!!groupingCol) %>% pull, collapse = " & "), "Mean Difference")) +
                    geom_vline(xintercept = obs_mean_diff, color="red", size=1) +
                    annotate(geom='text', x=obs_mean_diff - diff(mean_plot$breaks[1:2]/3),
                             y=max(mean_plot$counts)/2, label='Observed Difference', color='red', angle=90)+
                    theme_light(),

                  obs_median_diff = obs_median_diff,
                  p_value_diff_medians = sum(median_count)/N,
                  median_diff_dist = median_diff_dist,

                  median_diff_dist_plot = tibble(median_diff = median_diff_dist) %>%
                    ggplot +
                    geom_histogram(aes(median_diff), bins=20) +
                    ylab("Frequency") +
                    xlab(paste("Randomized", paste(dataframe %>% distinct(!!groupingCol) %>% pull, collapse = " & "), "Median Difference")) +
                    geom_vline(xintercept = obs_median_diff, color="red", size=1) +
                    annotate(geom='text', x=obs_median_diff  - diff(median_plot$breaks[1:2]/3),
                             y=max(median_plot$counts)/2, label='Observed Difference', color='red', angle=90)+
                    theme_light()
  )
  print("The P-value will be small if the observed difference is unlikely to have happened by chance")
  return(results)
}

The R session information (including the OS info, R version and all packages used):

    sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22621)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8  LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] mgcv_1.8-41        nlme_3.1-160       lubridate_1.8.0    plotly_4.10.0     
##  [5] readxl_1.4.1       actuar_3.3-0       NADA_1.6-1.1       forcats_0.5.2     
##  [9] stringr_1.4.1      dplyr_1.0.9        purrr_0.3.4        readr_2.1.2       
## [13] tidyr_1.2.0        tibble_3.1.8       ggplot2_3.3.6      tidyverse_1.3.2   
## [17] fitdistrplus_1.1-8 survival_3.4-0     MASS_7.3-58.1     
## 
## loaded via a namespace (and not attached):
##  [1] lattice_0.20-45     assertthat_0.2.1    digest_0.6.29       utf8_1.2.2         
##  [5] R6_2.5.1            cellranger_1.1.0    backports_1.4.1     reprex_2.0.2       
##  [9] evaluate_0.16       highr_0.9           httr_1.4.4          pillar_1.8.1       
## [13] rlang_1.0.5         lazyeval_0.2.2      googlesheets4_1.0.1 rstudioapi_0.14    
## [17] data.table_1.14.2   Matrix_1.5-1        splines_4.2.2       webshot_0.5.3      
## [21] googledrive_2.0.0   htmlwidgets_1.5.4   munsell_0.5.0       broom_1.0.1        
## [25] compiler_4.2.2      modelr_0.1.9        xfun_0.32           pkgconfig_2.0.3    
## [29] htmltools_0.5.3     tidyselect_1.1.2    viridisLite_0.4.1   fansi_1.0.3        
## [33] crayon_1.5.1        tzdb_0.3.0          dbplyr_2.2.1        withr_2.5.0        
## [37] grid_4.2.2          jsonlite_1.8.0      gtable_0.3.1        lifecycle_1.0.1    
## [41] DBI_1.1.3           magrittr_2.0.3      scales_1.2.1        writexl_1.4.0      
## [45] cli_3.3.0           stringi_1.7.8       fs_1.5.2            xml2_1.3.3         
## [49] ellipsis_0.3.2      generics_0.1.3      vctrs_0.4.1         expint_0.1-7       
## [53] tools_4.2.2         glue_1.6.2          crosstalk_1.2.0     hms_1.1.2          
## [57] yaml_2.3.5          fastmap_1.1.0       colorspace_2.0-3    gargle_1.2.0       
## [61] rvest_1.0.3         knitr_1.40          haven_2.5.1
    Sys.time()
## [1] "2024-01-04 15:10:04 PST"