Skip to contents

Preprocess data for analysis and visualization.

Usage

preprocess(x, config, ...)

preprocess.class_tabular.PreprocessorConfig(
  x,
  config,
  dat_validation = NULL,
  dat_test = NULL,
  verbosity = 1L
)

preprocess.class_tabular.Preprocessor(x, config, verbosity = 1L)

Arguments

x

data.frame, data.table, tbl_df (tabular data): Data to be preprocessed.

config

PreprocessorConfig: Set up using setup_Preprocessor().

...

Not used.

dat_validation

tabular data: Validation set data.

dat_test

tabular data: Test set data.

verbosity

Integer: Verbosity level.

Value

Preprocessor object.

Details

Methods are provided for preprocessing training set data, which accept a PreprocessorConfig object, and for preprocessing validation and test set data, which accept a Preprocessor object.

Author

EDG

Examples

# Set up a `Preprocessor`: this outputs a `PreprocessorConfig` object.
prp <- setup_Preprocessor(remove_duplicates = TRUE, scale = TRUE, center = TRUE)

# Includes a long list of parameters
prp
#> <PreprocessorConfig>
#>                complete_cases: <lgc> FALSE
#>         remove_features_thres: <NUL> NULL
#>            remove_cases_thres: <NUL> NULL
#>                   missingness: <lgc> FALSE
#>                        impute: <lgc> FALSE
#>                   impute_type: <chr> missRanger
#>      impute_missRanger_params: 
#>                                    pmm.k: <nmr> 3.00
#>                                  maxiter: <nmr> 10.00
#>                                num.trees: <nmr> 500.00
#>               impute_discrete: <chr> get_mode
#>             impute_continuous: <chr> mean
#>                integer2factor: <lgc> FALSE
#>               integer2numeric: <lgc> FALSE
#>                logical2factor: <lgc> FALSE
#>               logical2numeric: <lgc> FALSE
#>                numeric2factor: <lgc> FALSE
#>         numeric2factor_levels: <NUL> NULL
#>                 numeric_cut_n: <nmr> 0.00
#>            numeric_cut_labels: <lgc> FALSE
#>               numeric_quant_n: <nmr> 0.00
#>          numeric_quant_NAonly: <lgc> FALSE
#>             unique_len2factor: <nmr> 0.00
#>              character2factor: <lgc> FALSE
#>              factorNA2missing: <lgc> FALSE
#>        factorNA2missing_level: <chr> missing
#>                factor2integer: <lgc> FALSE
#>       factor2integer_startat0: <lgc> TRUE
#>                         scale: <lgc> TRUE
#>                        center: <lgc> TRUE
#>                 scale_centers: <NUL> NULL
#>            scale_coefficients: <NUL> NULL
#>              remove_constants: <lgc> FALSE
#> remove_constants_skip_missing: <lgc> TRUE
#>             remove_duplicates: <lgc> TRUE
#>               remove_features: <NUL> NULL
#>                       one_hot: <lgc> FALSE
#>                one_hot_levels: <NUL> NULL
#>             add_date_features: <lgc> FALSE
#>                 date_features: <chr> weekday, month, year
#>                  add_holidays: <lgc> FALSE
#>                       exclude: <NUL> NULL

# Resample iris to get train and test data
res <- resample(iris, setup_Resampler(seed = 2026))
#> 2026-02-15 11:20:38 
#> Input contains more than one column; stratifying on last.
#>  [resample]
#> 2026-02-15 11:20:38 
#> Using max n bins possible = 3.
#>  [kfold]
iris_train <- iris[res[[1]], ]
iris_test <- iris[-res[[1]], ]

# Preprocess training data
iris_pre <- preprocess(iris_train, prp)
#> 2026-02-15 11:20:38 
#> Removing 1 duplicate case...
#>  [preprocess]
#> 2026-02-15 11:20:38 
#> Scaling and centering 4 numeric features...
#>  [preprocess]
#> 2026-02-15 11:20:38 
#> Preprocessing done.
#>  [preprocess]

# Access preprocessed training data with `preprocessed()`
preprocessed(iris_pre)
#>     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
#> 1    -0.87028640   1.0040625  -1.32659450 -1.29261555     setosa
#> 2    -1.11148263  -0.1115625  -1.32659450 -1.29261555     setosa
#> 3    -1.35267886   0.3346875  -1.38360658 -1.29261555     setosa
#> 4    -1.47327698   0.1115625  -1.26958242 -1.29261555     setosa
#> 5    -0.99088452   1.2271875  -1.32659450 -1.29261555     setosa
#> 6    -0.50849205   1.8965625  -1.15555826 -1.03136293     setosa
#> 7    -1.47327698   0.7809375  -1.32659450 -1.16198924     setosa
#> 9    -1.71447321  -0.3346875  -1.32659450 -1.29261555     setosa
#> 10   -1.11148263   0.1115625  -1.26958242 -1.42324186     setosa
#> 11   -0.50849205   1.4503125  -1.26958242 -1.29261555     setosa
#> 12   -1.23208075   0.7809375  -1.21257034 -1.29261555     setosa
#> 13   -1.23208075  -0.1115625  -1.32659450 -1.42324186     setosa
#> 14   -1.83507133  -0.1115625  -1.49763073 -1.42324186     setosa
#> 15   -0.02609959   2.1196875  -1.44061866 -1.29261555     setosa
#> 16   -0.14669771   3.0121875  -1.26958242 -1.03136293     setosa
#> 17   -0.50849205   1.8965625  -1.38360658 -1.03136293     setosa
#> 18   -0.87028640   1.0040625  -1.32659450 -1.16198924     setosa
#> 19   -0.14669771   1.6734375  -1.15555826 -1.16198924     setosa
#> 20   -0.87028640   1.6734375  -1.26958242 -1.16198924     setosa
#> 21   -0.50849205   0.7809375  -1.15555826 -1.29261555     setosa
#> 22   -0.87028640   1.4503125  -1.26958242 -1.03136293     setosa
#> 23   -1.47327698   1.2271875  -1.55464281 -1.29261555     setosa
#> 25   -1.23208075   0.7809375  -1.04153410 -1.29261555     setosa
#> 26   -0.99088452  -0.1115625  -1.21257034 -1.29261555     setosa
#> 27   -0.99088452   0.7809375  -1.21257034 -1.03136293     setosa
#> 28   -0.74968829   1.0040625  -1.26958242 -1.29261555     setosa
#> 30   -1.35267886   0.3346875  -1.21257034 -1.29261555     setosa
#> 31   -1.23208075   0.1115625  -1.21257034 -1.29261555     setosa
#> 33   -0.74968829   2.3428125  -1.26958242 -1.42324186     setosa
#> 34   -0.38789394   2.5659375  -1.32659450 -1.29261555     setosa
#> 35   -1.11148263   0.1115625  -1.26958242 -1.29261555     setosa
#> 36   -0.99088452   0.3346875  -1.44061866 -1.29261555     setosa
#> 37   -0.38789394   1.0040625  -1.38360658 -1.29261555     setosa
#> 38   -1.11148263   1.2271875  -1.32659450 -1.42324186     setosa
#> 39   -1.71447321  -0.1115625  -1.38360658 -1.29261555     setosa
#> 40   -0.87028640   0.7809375  -1.26958242 -1.29261555     setosa
#> 41   -0.99088452   1.0040625  -1.38360658 -1.16198924     setosa
#> 42   -1.59387510  -1.6734375  -1.38360658 -1.16198924     setosa
#> 43   -1.71447321   0.3346875  -1.38360658 -1.29261555     setosa
#> 44   -0.99088452   1.0040625  -1.21257034 -0.77011032     setosa
#> 45   -0.87028640   1.6734375  -1.04153410 -1.03136293     setosa
#> 46   -1.23208075  -0.1115625  -1.32659450 -1.16198924     setosa
#> 48   -1.47327698   0.3346875  -1.32659450 -1.29261555     setosa
#> 49   -0.62909017   1.4503125  -1.26958242 -1.29261555     setosa
#> 50   -0.99088452   0.5578125  -1.32659450 -1.29261555     setosa
#> 51    1.42107780   0.3346875   0.55480411  0.27490014 versicolor
#> 53    1.30047968   0.1115625   0.66882827  0.40552645 versicolor
#> 54   -0.38789394  -1.6734375   0.15571956  0.14427383 versicolor
#> 55    0.81808722  -0.5578125   0.49779203  0.40552645 versicolor
#> 56   -0.14669771  -0.5578125   0.44077995  0.14427383 versicolor
#> 57    0.57689099   0.5578125   0.55480411  0.53615275 versicolor
#> 58   -1.11148263  -1.4503125  -0.24336499 -0.24760509 versicolor
#> 59    0.93868533  -0.3346875   0.49779203  0.14427383 versicolor
#> 60   -0.74968829  -0.7809375   0.09870748  0.27490014 versicolor
#> 61   -0.99088452  -2.3428125  -0.12934084 -0.24760509 versicolor
#> 63    0.21509664  -1.8965625   0.15571956 -0.24760509 versicolor
#> 64    0.33569475  -0.3346875   0.55480411  0.27490014 versicolor
#> 66    1.05928345   0.1115625   0.38376788  0.27490014 versicolor
#> 67   -0.26729582  -0.1115625   0.44077995  0.40552645 versicolor
#> 68   -0.02609959  -0.7809375   0.21273164 -0.24760509 versicolor
#> 69    0.45629287  -1.8965625   0.44077995  0.40552645 versicolor
#> 70   -0.26729582  -1.2271875   0.09870748 -0.11697878 versicolor
#> 71    0.09449852   0.3346875   0.61181619  0.79740537 versicolor
#> 72    0.33569475  -0.5578125   0.15571956  0.14427383 versicolor
#> 73    0.57689099  -1.2271875   0.66882827  0.40552645 versicolor
#> 74    0.33569475  -0.5578125   0.55480411  0.01364752 versicolor
#> 75    0.69748910  -0.3346875   0.32675580  0.14427383 versicolor
#> 76    0.93868533  -0.1115625   0.38376788  0.27490014 versicolor
#> 77    1.17988156  -0.5578125   0.61181619  0.27490014 versicolor
#> 79    0.21509664  -0.3346875   0.44077995  0.40552645 versicolor
#> 80   -0.14669771  -1.0040625  -0.12934084 -0.24760509 versicolor
#> 81   -0.38789394  -1.4503125   0.04169540 -0.11697878 versicolor
#> 82   -0.38789394  -1.4503125  -0.01531668 -0.24760509 versicolor
#> 83   -0.02609959  -0.7809375   0.09870748  0.01364752 versicolor
#> 84    0.21509664  -0.7809375   0.78285243  0.53615275 versicolor
#> 85   -0.50849205  -0.1115625   0.44077995  0.40552645 versicolor
#> 86    0.21509664   0.7809375   0.44077995  0.53615275 versicolor
#> 87    1.05928345   0.1115625   0.55480411  0.40552645 versicolor
#> 88    0.57689099  -1.6734375   0.38376788  0.14427383 versicolor
#> 89   -0.26729582  -0.1115625   0.21273164  0.14427383 versicolor
#> 90   -0.38789394  -1.2271875   0.15571956  0.14427383 versicolor
#> 91   -0.38789394  -1.0040625   0.38376788  0.01364752 versicolor
#> 92    0.33569475  -0.1115625   0.49779203  0.27490014 versicolor
#> 93   -0.02609959  -1.0040625   0.15571956  0.01364752 versicolor
#> 94   -0.99088452  -1.6734375  -0.24336499 -0.24760509 versicolor
#> 95   -0.26729582  -0.7809375   0.26974372  0.14427383 versicolor
#> 96   -0.14669771  -0.1115625   0.26974372  0.01364752 versicolor
#> 98    0.45629287  -0.3346875   0.32675580  0.14427383 versicolor
#> 99   -0.87028640  -1.2271875  -0.41440123 -0.11697878 versicolor
#> 100  -0.14669771  -0.5578125   0.21273164  0.14427383 versicolor
#> 101   0.57689099   0.5578125   1.29596114  1.71178952  virginica
#> 102  -0.02609959  -0.7809375   0.78285243  0.92803168  virginica
#> 103   1.54167591  -0.1115625   1.23894906  1.18928429  virginica
#> 104   0.57689099  -0.3346875   1.06791282  0.79740537  virginica
#> 105   0.81808722  -0.1115625   1.18193698  1.31991060  virginica
#> 106   2.14466649  -0.1115625   1.63803362  1.18928429  virginica
#> 107  -1.11148263  -1.2271875   0.44077995  0.66677906  virginica
#> 109   1.05928345  -1.2271875   1.18193698  0.79740537  virginica
#> 110   1.66227403   1.2271875   1.35297322  1.71178952  virginica
#> 111   0.81808722   0.3346875   0.78285243  1.05865798  virginica
#> 112   0.69748910  -0.7809375   0.89687659  0.92803168  virginica
#> 113   1.17988156  -0.1115625   1.01090075  1.18928429  virginica
#> 114  -0.14669771  -1.2271875   0.72584035  1.05865798  virginica
#> 115  -0.02609959  -0.5578125   0.78285243  1.58116321  virginica
#> 116   0.69748910   0.3346875   0.89687659  1.45053691  virginica
#> 117   0.81808722  -0.1115625   1.01090075  0.79740537  virginica
#> 118   2.26526461   1.6734375   1.69504569  1.31991060  virginica
#> 119   2.26526461  -1.0040625   1.80906985  1.45053691  virginica
#> 120   0.21509664  -1.8965625   0.72584035  0.40552645  virginica
#> 121   1.30047968   0.3346875   1.12492490  1.45053691  virginica
#> 122  -0.26729582  -0.5578125   0.66882827  1.05865798  virginica
#> 124   0.57689099  -0.7809375   0.66882827  0.79740537  virginica
#> 126   1.66227403   0.3346875   1.29596114  0.79740537  virginica
#> 127   0.45629287  -0.5578125   0.61181619  0.79740537  virginica
#> 128   0.33569475  -0.1115625   0.66882827  0.79740537  virginica
#> 129   0.69748910  -0.5578125   1.06791282  1.18928429  virginica
#> 130   1.66227403  -0.1115625   1.18193698  0.53615275  virginica
#> 131   1.90347026  -0.5578125   1.35297322  0.92803168  virginica
#> 132   2.50646084   1.6734375   1.52400946  1.05865798  virginica
#> 133   0.69748910  -0.5578125   1.06791282  1.31991060  virginica
#> 134   0.57689099  -0.5578125   0.78285243  0.40552645  virginica
#> 136   2.26526461  -0.1115625   1.35297322  1.45053691  virginica
#> 137   0.57689099   0.7809375   1.06791282  1.58116321  virginica
#> 138   0.69748910   0.1115625   1.01090075  0.79740537  virginica
#> 139   0.21509664  -0.1115625   0.61181619  0.79740537  virginica
#> 140   1.30047968   0.1115625   0.95388867  1.18928429  virginica
#> 141   1.05928345   0.1115625   1.06791282  1.58116321  virginica
#> 142   1.30047968   0.1115625   0.78285243  1.45053691  virginica
#> 144   1.17988156   0.3346875   1.23894906  1.45053691  virginica
#> 146   1.05928345  -0.1115625   0.83986451  1.45053691  virginica
#> 147   0.57689099  -1.2271875   0.72584035  0.92803168  virginica
#> 148   0.81808722  -0.1115625   0.83986451  1.05865798  virginica
#> 149   0.45629287   0.7809375   0.95388867  1.45053691  virginica
#> 150   0.09449852  -0.1115625   0.78285243  0.79740537  virginica

# Apply the same preprocessing to test data
# In this case, the scale and center values from training data will be used.
# Note how `preprocess()` accepts either a `PreprocessorConfig` or `Preprocessor` object for
# this reason.
iris_test_pre <- preprocess(iris_test, iris_pre)
#> 2026-02-15 11:20:38 
#> Scaling and centering 4 numeric features...
#>  [preprocess]
#> 2026-02-15 11:20:38 
#> Preprocessing done.
#>  [preprocess]

# Access preprocessed test data
preprocessed(iris_test_pre)
#>     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
#> 8    -0.99088452   0.7809375  -1.26958242  -1.2926155     setosa
#> 24   -0.87028640   0.5578125  -1.15555826  -0.9007366     setosa
#> 29   -0.74968829   0.7809375  -1.32659450  -1.2926155     setosa
#> 32   -0.50849205   0.7809375  -1.26958242  -1.0313629     setosa
#> 47   -0.87028640   1.6734375  -1.21257034  -1.2926155     setosa
#> 52    0.69748910   0.3346875   0.44077995   0.4055264 versicolor
#> 62    0.09449852  -0.1115625   0.26974372   0.4055264 versicolor
#> 65   -0.26729582  -0.3346875  -0.07232876   0.1442738 versicolor
#> 78    1.05928345  -0.1115625   0.72584035   0.6667791 versicolor
#> 97   -0.14669771  -0.3346875   0.26974372   0.1442738 versicolor
#> 108   1.78287214  -0.3346875   1.46699738   0.7974054  virginica
#> 123   2.26526461  -0.5578125   1.69504569   1.0586580  virginica
#> 125   1.05928345   0.5578125   1.12492490   1.1892843  virginica
#> 135   0.33569475  -1.0040625   1.06791282   0.2749001  virginica
#> 145   1.05928345   0.5578125   1.12492490   1.7117895  virginica