Evaluate model performance by comparing train and test set MAE - a lightgbm model within tidymodels

I have built a machine learning model that forecasts a time series using the lightgbm algorithm, and now I want to check it for overfitting by comparing the MAE on the training set with the MAE on the test set. Unfortunately, I am not sure how to do this within tidymodels. I would be grateful for any help. Below is my current code:

    # required packages
    library(tidymodels)  # recipes, rsample, tune, workflows, yardstick, dials
    library(timetk)      # step_timeseries_signature(), time_series_split(), CV plot helpers
    library(bonsai)      # registers the "lightgbm" engine for boost_tree()
    library(vip)         # variable importance plots
    library(data.table)

    # data preprocessing
    model_recipe  <- recipe(error_stat_forecast ~ ., dt_lgbm) %>%
        update_role(product, date, new_role = "id") %>%
        
        # create meta data for modelling from the date column
        step_timeseries_signature(date) %>%
        
        # remove some of the variables created in the last step
        step_rm(matches(
            "(.xts$)|(.iso$)|(hour)|(minute)|(second)|(day)|(week)|(am.pm)"
        )) %>%
        step_rm(date) %>%
        
        # removes near-zero-variance predictors, which provide (almost) no predictive information
        step_nzv(all_predictors()) %>%
        
        # removes variables with constant values - no predictive information
        step_zv(all_predictors()) %>%
        
        # one-hot encode all nominal predictors
        step_dummy(all_nominal_predictors(), one_hot = TRUE)
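
To sanity-check the recipe, I inspect the processed training data (as far as I know, prep() estimates the steps on the data given to recipe(), and bake(new_data = NULL) returns the processed rows):

    # inspect the engineered features produced by the recipe
    model_recipe %>% prep() %>% bake(new_data = NULL) %>% glimpse()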
    

    # create a train & test split
    ts_splits <- dt_lgbm %>% 
        time_series_split(date, initial = 24, assess = 12)
    
    dt_lgbm_train <- training(ts_splits)
    dt_lgbm_test <- testing(ts_splits)
    
    # create a graph of the split
    ts_splits %>% tk_time_series_cv_plan() %>% 
        plot_time_series_cv_plan(date, error_stat_forecast)
    
    
    # apply cross-validation
    # creates 4 folds, identified by an ID column -> rows are assigned at random
    dt_lgbm_cv <- vfold_cv(dt_lgbm_train, v = 4)
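
One thing I am not sure about: vfold_cv() assigns rows to folds at random, so for a time series some future observations end up in the analysis sets. As far as I know, timetk also offers a time-ordered alternative; a sketch (the window sizes below are placeholders and would need to fit the length of my series):

    # time-ordered resamples instead of random folds
    dt_lgbm_cv_ts <- time_series_cv(
        dt_lgbm_train,
        date_var   = date,
        initial    = 12,
        assess     = 6,
        cumulative = TRUE
    )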
    
    
    # set up lgbm model
    lgbm_model <-
        boost_tree(
            mode = "regression",
            # mtry = 1,               
            trees = tune(),           
            min_n = tune(),           
            tree_depth = tune(),      
            learn_rate = tune(),      
            loss_reduction = tune()   
        ) %>%
        set_engine("lightgbm")
    
    # create workflow
    lgbm_wf <-
        workflow() %>%
        add_model(lgbm_model) %>% 
        add_recipe(model_recipe)
    
    
    # shows a collection of tuning parameters 
    extract_parameter_set_dials(lgbm_model)
    
    
    # defining the grid search space
    # note: learn_rate() and loss_reduction() are defined on a log10 scale
    # in dials, so their ranges are given as log10 values here
    lgbm_grid_search <-
        parameters(trees(),
                   min_n(range = c(5, 20)),
                   learn_rate(range = c(-2, -1)),       # 0.01 to 0.1 on the original scale
                   loss_reduction(range = c(-10, 0.7)), # ~0 to ~5 on the original scale
                   tree_depth(range = c(5, 10)))
    
    
    # create hyperparameter combinations via Latin hypercube sampling
    lgbm_grid <-
        grid_latin_hypercube(lgbm_grid_search,
                             size = 10)
    
    
    # evaluate the candidate hyperparameters on the cross-validation folds
    lgbm_tune <-
        lgbm_wf %>%
        tune_grid(
            resamples = dt_lgbm_cv,
            grid = lgbm_grid,
            metrics = metric_set(mae),
            control = control_grid(verbose = TRUE))
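
As far as I can tell, the cross-validated MAE of each candidate can be inspected with the standard tune helpers:

    # cross-validated MAE of every candidate
    lgbm_tune %>% collect_metrics()

    # or only the top candidates
    lgbm_tune %>% show_best(metric = "mae", n = 5)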
    
    
    # select best hyperparameter combination
    lgbm_best_params <-
        lgbm_tune %>%
        select_best(metric = "mae")
    
    
    # plug the best hyperparameters into the workflow
    lgbm_wf <- lgbm_wf %>%
        finalize_workflow(lgbm_best_params)
    
    
    # fit the finalized workflow on the training set
    fitted_workflow <- fit(lgbm_wf, dt_lgbm_train)
    
    
    # feature importance  
    fitted_workflow %>% extract_fit_engine() %>% vip()
    # ggsave("Feature Importance.png", device = "png")
    
    
    # predict on the test set
    lgbm_forecast <- predict(fitted_workflow, dt_lgbm_test)
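
My best guess for the train-vs-test comparison is the following sketch (assuming augment() and yardstick's mae() are the right tools here; error_stat_forecast is my outcome column):

    # compare train vs test MAE; augment() adds a .pred column
    train_results <- augment(fitted_workflow, dt_lgbm_train)
    test_results  <- augment(fitted_workflow, dt_lgbm_test)

    mae(train_results, truth = error_stat_forecast, estimate = .pred)
    mae(test_results,  truth = error_stat_forecast, estimate = .pred)

    # alternatively, last_fit() refits the finalized workflow on the
    # training data and evaluates it on the test split in one step
    last_fit(lgbm_wf, ts_splits, metrics = metric_set(mae)) %>%
        collect_metrics()

Is that a sensible way to get the two MAE values, or is there a more idiomatic approach?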

If I extract the "outcomes" from fitted_workflow, they correspond exactly to the values from the training set.

    data.table(fitted_workflow[["outcomes"]])

But shouldn't these be changed by the model?
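
From what I understand (and I may be wrong here), the workflow keeps the raw training outcomes in its mold, and the model's predictions only come from predict()/augment():

    # raw training outcomes stored in the mold ...
    extract_mold(fitted_workflow)$outcomes %>% head()

    # ... versus the model's actual predictions
    predict(fitted_workflow, dt_lgbm_train) %>% head()

Is that interpretation correct?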
