Tidy evaluation syntax in R user-defined function

125 views Asked by At

I want to define a generic function

func_boxplot2 <- function(tmp, xvar, yvar, groupvar)
{
  xvar <- enquo(xvar)
  yvar <- enquo(yvar)
  groupvar <- enquo(groupvar)

  # If variable yield exists, put concentrations to NA for all yields < annual_yield_thres
  if( "yield" %in% names(tmp) )
  {
    tmp <- tmp %>%
      mutate_at(vars(!!yvar), ~ifelse(round(yield, 0) < 85, NA, .))
  }
  
  # Compute IQR for each year
  tmp_iqr <- tmp %>%
    group_by(!!groupvar) %>%
    summarise(iqr=IQR(!!yvar, na.rm = TRUE))
  
  p <- ggplot(data = tmp %>% mutate_at(vars(!!yvar), ~ifelse(tmp_iqr[which(tmp_iqr[[!!groupvar]] %in% (!!xvar)),]$iqr == 0, . + runif(1, -0.01, 0.01), . )), aes(x = !!xvar, y = !!yvar))
  p <- p + stat_boxplot(aes(group = !!groupvar), na.rm = TRUE, coef = 10000)   # Trick (large unrealistic coef value) so whiskers end at min(y) & max(y)
  p <- p + geom_boxplot(na.rm = TRUE, outlier.shape = NA)

  return(p)
}

which is able to plot boxplot whiskers extending to min/max even when the IQR is 0. I am trying to achieve this by adding tiny random numbers (below significance level) to the incriminate data to avoid IQR=0.

However, I must have missed something in the syntax about quosure, because running this function

func_boxplot2(data, date, days, date)

with the dataset

structure(list(date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 
4L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 
7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 
10L, 10L, 10L, 10L, 10L), .Label = c("2010", "2011", "2012", 
"2013", "2014", "2015", "2016", "2017", "2018", "2019"), class = c("ordered", 
"factor")), station = c("41B011", "41MEU1", "41N043", "41R001", 
"41R012", "41WOL1", "41B011", "41MEU1", "41N043", "41R001", "41R012", 
"41WOL1", "41B011", "41MEU1", "41N043", "41R001", "41R012", "41WOL1", 
"41B011", "41MEU1", "41N043", "41R001", "41R012", "41WOL1", "41B011", 
"41MEU1", "41N043", "41R001", "41R012", "41WOL1", "41B011", "41MEU1", 
"41N043", "41R001", "41R012", "41WOL1", "41B011", "41MEU1", "41N043", 
"41R001", "41R012", "41WOL1", "41B011", "41MEU1", "41N043", "41R001", 
"41R012", "41WOL1", "41B011", "41MEU1", "41N043", "41R001", "41R012", 
"41WOL1", "41B011", "41MEU1", "41N043", "41R001", "41R012", "41WOL1"
), days = c(16, 15, 45, 26, 14, 14, 32, 7, 87, 42, 24, 23, 25, 
25, 55, 29, 29, 16, 11, 14, 58, 21, 19, 10, 10, 14, 33, 18, 10, 
7, 9, 10, 19, 7, 8, 7, 1, 5, 15, 8, 1, 4, 5, 6, 14, 6, 5, 5, 
3, 5, 19, 8, 4, 5, 3, 4, 16, 3, 1, 3), yield = c(98.4817351598173, 
49.4520547945205, 95.8561643835616, 97.6712328767123, 98.2648401826484, 
95.1598173515982, 97.8767123287671, 27.9109589041096, 98.310502283105, 
98.972602739726, 97.203196347032, 96.2100456621005, 98.7818761384335, 
96.7554644808743, 97.4954462659381, 98.8046448087432, 98.747723132969, 
98.3037340619308, 99.0525114155251, 96.1986301369863, 97.1004566210046, 
96.4954337899543, 96.3698630136986, 98.2077625570776, 96.62100456621, 
98.3675799086758, 95.6963470319635, 96.8835616438356, 93.5844748858447, 
87.8196347031963, 91.2328767123288, 92.5570776255708, 81.5182648401827, 
82.7739726027397, 90.1826484018265, 87.1461187214612, 87.2153916211293, 
92.9986338797814, 94.6948998178506, 85.5760473588342, 92.3611111111111, 
96.2204007285975, 86.3698630136986, 86.4269406392694, 87.796803652968, 
93.2762557077626, 96.6438356164384, 95.6164383561644, 71.3812785388128, 
93.7442922374429, 96.3698630136986, 97.2602739726027, 95.7876712328767, 
94.7146118721461, 87.6141552511416, 43.0821917808219, 88.6872146118722, 
92.6826484018265, 90.365296803653, 86.541095890411), environ = structure(c(5L, 
4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 
3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 
3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 
4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L), .Label = c("Urbain avec très forte influence du trafic", 
"Urbain avec forte influence du trafic", "Urbain avec influence modérée du trafic", 
"Urbain avec faible influence du trafic", "Urbain avec très faible influence du trafic", 
"Industriel avec influence modérée du trafic"), class = "factor")), row.names = c(NA, 
-60L), class = c("tbl_df", "tbl", "data.frame"))

gives me the following errors

 Error: Problem with `mutate()` input `days`.
x Must extract column with a single valid subscript.
x Subscript `date` has size 60 but must be size 1.
ℹ Input `days` is `(structure(function (..., .x = ..1, .y = ..2, . = ..1) ...`.

What is wrong in my syntax, please ?

Many thanks,

A.

======== UPDATE ==========

Using the suggested updated function

func_boxplot2 <- function(tmp, xvar, yvar, groupvar)
{
  # If variable yield exists, put concentrations to NA for all yields < annual_yield_thres
  if("yield" %in% names(tmp)) {
    tmp <-
      tmp %>%
      mutate(across({{yvar}}, ~ifelse(round(yield, 0) < 85, NA, .)))
  }
  
  tmp <-
    tmp %>%
    group_by({{groupvar}}) %>%
    mutate(
      across({{yvar}}, function (x) {
        ifelse(
          IQR({{yvar}}, na.rm = TRUE) == 0,
          x + runif(1, -0.01,0.01),
          x
        )
      })
    )

  ggplot(tmp, aes(x = {{xvar}}, y = {{yvar}})) +
    stat_boxplot(aes(group = {{groupvar}}), na.rm = TRUE, coef = 10000) +
    geom_boxplot(na.rm = TRUE, outlier.shape = NA)
}

results in the following plot

enter image description here

As stated in my comment, it seems that the processing of tmp output for all rows of the same year the first value yvar of this year which explains the plot. Indeed, commenting this block gives to following figure

enter image description here

2

There are 2 answers

3
Paul On BEST ANSWER

The condition in ifelse wasn't the correct length. You can change the data ggplot argument to this.

data =
  tmp %>%
    group_by(!!groupvar) %>%
    mutate_at(
        vars(!!yvar),
        if (IQR(., na.rm = TRUE) == 0) {
          . + runif(1, -0.01,0.01)
        } else {
          .
        }
    )

Your usage of quosure and !! is correct, however you should use the more recent {{ operator instead.

This is the updated function

func_boxplot2 <- function(tmp, xvar, yvar, groupvar)
{
  # If variable yield exists, put concentrations to NA for all yields < annual_yield_thres
  if("yield" %in% names(tmp)) {
    tmp <-
      tmp %>%
      mutate(across({{yvar}}, ~ifelse(round(yield, 0) < 85, NA, .)))
  }
  
  tmp <-
    tmp %>%
    group_by({{groupvar}}) %>%
    mutate(
      across({{yvar}}, function (x) {
        if (IQR(x, na.rm = TRUE) == 0) {
          x + runif(length(x), -0.01, 0.01)
        } else {
          x
        }
      })
    )

  ggplot(tmp, aes(x = {{xvar}}, y = {{yvar}})) +
    stat_boxplot(aes(group = {{groupvar}}), na.rm = TRUE, coef = 10000) +
    geom_boxplot(na.rm = TRUE, outlier.shape = NA)
}
func_boxplot2(data, date, days, date)

plot

2
user2554330 On

I think you've received a good answer to your question. Here's a completely different approach to the problem, answering a question you maybe should have asked: instead of modifying the data, how to modify the stat used in producing the boxplot.

Normally the stat is StatBoxplot, and the important part of that ggproto object is the method compute_group. When used in a plot, it returns a single row dataframe containing columns

[1] "ymin"        "lower"       "middle"      "upper"      
[5] "ymax"        "outliers"    "notchupper"  "notchlower" 
[9] "x"           "relvarwidth" "flipped_aes" 

These mostly have a sort of obvious meaning; the only non-obvious one is outliers, which is a list-mode column, containing a single numeric vector holding the outliers.

So to get rid of outlier plotting completely, you could create an inherited stat which is just like StatBoxplot, but which modifies the result of compute_group:

NoOutlierStatBoxplot <- 
  ggproto("NoOutlierStatBoxplot", ggplot2::StatBoxplot,
          compute_group = function(..., self) {
            res <- ggproto_parent(StatBoxplot, self)$compute_group(...)
            res$ymin <- min(c(res$ymin, res$outliers[[1]]))
            res$ymax <- max(c(res$ymax, res$outliers[[1]]))
            res$outliers <- list(numeric())
            res
          })

(This is not really identical to what you were doing: it still calculates the upper and lower quartiles after removing outliers. If that matters to you, you might want a more extensive modification.)

With this modification, you can drop a lot of the code from func_boxplot2, including dropping stat_boxplot():

func_boxplot3 <- function(tmp, xvar, yvar, groupvar)
{
  # If variable yield exists, put concentrations to NA for all yields < annual_yield_thres
  if("yield" %in% names(tmp)) {
    tmp <-
      tmp %>%
      mutate(across({{yvar}}, ~ifelse(round(yield, 0) < 85, NA, .)))
  }
  
  ggplot(tmp, aes(x = {{xvar}}, y = {{yvar}})) +
    geom_boxplot(na.rm = TRUE, outlier.shape = NA, 
                 aes(group = {{groupvar}}), 
                 stat = NoOutlierStatBoxplot)
}

func_boxplot3(mydf, date, days, date)

enter image description here