dacol provides utilities to add or modify columns in a data frame.

The utilities include:

  • Statistical measures: mode, confident_interval, ceiling
  • Normalize a vector column: cosine, logistic, zscore
  • Compute distance between 2 vector columns: euclidean, pearson, cosine, canberra
  • Manage outliers: trim_outlier, normalize_ptile
  • Calculate percentiles: decile_band, decile_ptile, rank_ptile

More info: https://ldanai.github.io/dacol/

Installation

You can install dacol from GitHub with:

# install.packages("remotes")
remotes::install_github("ldanai/dacol")

Example

This shows how to use dacol:

library(dacol)
library(dplyr)

max = 30
df = tibble(x1 = seq(-1.2*max, 1.2*max, length.out = 200),
            x2 = seq(0, max, length.out = 200),
            x3 = sample(200))

df
#> # A tibble: 200 x 3
#>       x1    x2    x3
#>    <dbl> <dbl> <int>
#>  1 -36   0       111
#>  2 -35.6 0.151    31
#>  3 -35.3 0.302    92
#>  4 -34.9 0.452     6
#>  5 -34.6 0.603    20
#>  6 -34.2 0.754    55
#>  7 -33.8 0.905   190
#>  8 -33.5 1.06    135
#>  9 -33.1 1.21     10
#> 10 -32.7 1.36    173
#> # ... with 190 more rows

df =
  df %>%
  mutate(
    # Transformation
    y_cosine   = dc_cosine(x1, max),
    y_logistic = dc_logistic(x2, max),
    y_zcore    = dc_zscore(x2),

    # Distance between 2 vector columns
    y_dist_canb = dc_dist_canberra(x2, x3),
    y_dist_cos  = dc_dist_cosine(x2, y_zcore),
    y_dist_euc  = dc_dist_euclidean(x2, y_zcore),
    y_dist_pear = dc_dist_pearson(x2, y_zcore),

    # Manage outliers
    y_trim = dc_trim_outlier(x3, 0.01),
    y_norm = dc_normalize_ptile(x3, 0.01),

    # Stats measures
    y_mode = dc_mode(x3),
    y_ceil = dc_ceiling(x1, -1),

    # Band segmentation
    y_dec_band1  = dc_decile_band(x3),
    y_dec_band2  = dc_decile_band(x3, c(seq(0, 0.9, 0.1))),
    y_dec_ptile1 = dc_decile_ptile(x3),
    y_dec_ptile2 = dc_decile_ptile(x3, c(seq(0, 0.9, 0.1))),

    # Rank percentile
    y_ranked1 = dc_rank_ptile(x3),
    y_ranked2 = dc_rank_ptile(x3, c(seq(1, 100, 1)))
  )
#> Warning in if (is.na(n)) n = max(dplyr::n_distinct(x), 10000): the condition has
#> length > 1 and only the first element will be used

df
#> # A tibble: 200 x 20
#>       x1    x2    x3 y_cosine y_logistic y_zcore y_dist_canb y_dist_cos
#>    <dbl> <dbl> <int>    <dbl>      <dbl>   <dbl>       <dbl>      <dbl>
#>  1 -36   0       111        0     0        -1.72     0            0.498
#>  2 -35.6 0.151    31        0     0.0251   -1.70     0.00484      0.498
#>  3 -35.3 0.302    92        0     0.0502   -1.68     0.00327      0.498
#>  4 -34.9 0.452     6        0     0.0752   -1.67     0.0701       0.498
#>  5 -34.6 0.603    20        0     0.100    -1.65     0.0293       0.498
#>  6 -34.2 0.754    55        0     0.125    -1.63     0.0135       0.498
#>  7 -33.8 0.905   190        0     0.150    -1.62     0.00474      0.498
#>  8 -33.5 1.06    135        0     0.174    -1.60     0.00776      0.498
#>  9 -33.1 1.21     10        0     0.198    -1.58     0.108        0.498
#> 10 -32.7 1.36    173        0     0.222    -1.56     0.00778      0.498
#> # ... with 190 more rows, and 12 more variables: y_dist_euc <dbl>,
#> #   y_dist_pear <dbl>, y_trim <dbl>, y_norm <dbl>, y_mode <int>, y_ceil <dbl>,
#> #   y_dec_band1 <int>, y_dec_band2 <int>, y_dec_ptile1 <dbl>,
#> #   y_dec_ptile2 <dbl>, y_ranked1 <dbl>, y_ranked2 <dbl>

Please note that the ‘dacol’ project is released with a Contributor Code of Conduct. By contributing to this project, you agree to abide by its terms.