% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/PLSrounding.R
\encoding{UTF-8}
\name{PLSrounding}
\alias{PLSrounding}
\alias{PLSroundingInner}
\alias{PLSroundingPublish}
\title{PLS inspired rounding}
\usage{
PLSrounding(
  data,
  freqVar = NULL,
  roundBase = 3,
  hierarchies = NULL,
  formula = NULL,
  dimVar = NULL,
  maxRound = roundBase - 1,
  printInc = nrow(data) > 1000,
  output = NULL,
  extend0 = FALSE,
  preAggregate = NA,
  aggregatePackage = "base",
  aggregateNA = TRUE,
  aggregateBaseOrder = FALSE,
  rowGroupsPackage = aggregatePackage,
  ...,
  action_unused_dots = getOption("SmallCountRounding.action_unused_dots", "inform"),
  allowed_unused_dots = getOption("SmallCountRounding.allowed_unused_dots", character(0))
)

PLSroundingInner(..., output = "inner")

PLSroundingPublish(..., output = "publish")
}
\arguments{
\item{data}{Input data (inner cells), typically a data frame, tibble, or data.table.
If \code{data} is not a classic data frame, it will be coerced to one internally
unless \code{preAggregate} is \code{TRUE} and \code{aggregatePackage} is \code{"data.table"}.}

\item{freqVar}{Variable holding counts (inner cell frequencies). When \code{NULL} (default), microdata is assumed.}

\item{roundBase}{Rounding base}

\item{hierarchies}{List of hierarchies}

\item{formula}{Model formula defining publishable cells}

\item{dimVar}{The main dimensional variables and additional aggregating variables. This parameter can be useful when \code{hierarchies} and \code{formula} are unspecified.}

\item{maxRound}{Inner cells contributing to original publishable cells equal to or less than \code{maxRound} will be rounded.}

\item{printInc}{Printing iteration information to console when TRUE}

\item{output}{Possible non-NULL values are \code{"input"}, \code{"inner"} and \code{"publish"}. Then a single data frame is returned.}

\item{extend0}{When \code{extend0} is set to \code{TRUE}, the data is automatically extended.
This is relevant when \code{zeroCandidates = TRUE} (see \code{\link{RoundViaDummy}}).
Additionally, \code{extend0} can be specified as a list, representing the \code{varGroups} parameter
in the \code{\link[SSBtools]{Extend0}} function.
Can also be set to \code{"all"} which means that input codes in hierarchies are considered in addition to those in data.}

\item{preAggregate}{When \code{TRUE}, the data will be aggregated within the function to an appropriate level.
This is defined by the dimensional variables according to \code{dimVar}, \code{hierarchies} or \code{formula}.
When \code{FALSE}, no aggregation is performed.
When \code{NA} (default), the function will automatically decide whether to aggregate:
aggregation is applied unless \code{freqVar} is present and the data contain no duplicated rows with respect to
the dimensional variables.
Exception: if a non-\code{NULL} \code{x} (the model matrix) is supplied via \code{...}, \code{NA} is treated as \code{FALSE}.}

\item{aggregatePackage}{Package used for pre-aggregation (see \code{preAggregate}).
Parameter \code{pkg} to \code{\link[SSBtools]{aggregate_by_pkg}}.}

\item{aggregateNA}{Whether to include NAs in the grouping variables during pre-aggregation (see \code{preAggregate}).
Parameter \code{include_na} to \code{\link[SSBtools]{aggregate_by_pkg}}.}

\item{aggregateBaseOrder}{Parameter \code{base_order} to \code{\link[SSBtools]{aggregate_by_pkg}}, used during pre-aggregation (see \code{preAggregate}).
The default is set to \code{FALSE} to avoid unnecessary sorting operations.
When \code{TRUE}, an attempt is made to return the same result with \code{data.table} as with base R.
This cannot be guaranteed due to potential variations in sorting behavior across different systems.}

\item{rowGroupsPackage}{Parameter \code{pkg} to \code{\link[SSBtools]{RowGroups}}.
The parameter is input to \code{\link[SSBtools]{Formula2ModelMatrix}}
via \code{\link[SSBtools]{ModelMatrix}}.}

\item{...}{Further parameters sent to \code{RoundViaDummy}}

\item{action_unused_dots}{Character string controlling how unused arguments
in \code{...} are handled. Internally uses \code{\link[ellipsis:check_dots_used]{ellipsis::check_dots_used()}} with a
custom action. One of "warn", "abort", "inform", or "none". The value "none"
disables the check entirely. The default is taken from
\code{getOption("SmallCountRounding.action_unused_dots")}, falling back to "inform"
if the option is not set. Users can change the default globally with e.g.
\code{options(SmallCountRounding.action_unused_dots = "abort")}.}

\item{allowed_unused_dots}{Character vector of argument names ignored by the
unused-argument check. May be useful when this function is wrapped by
another function, or in other cases where a correctly spelled argument is
nevertheless not registered as used. The default is taken from
\code{getOption("SmallCountRounding.allowed_unused_dots")}, falling back to
\code{character(0)} if the option is not set. Users can change the default
globally with e.g.
\code{options(SmallCountRounding.allowed_unused_dots = c("plotColor", "lineType"))}.}
}
\value{
By default (\code{output = NULL}), the output is a four-element list with class attribute "PLSrounded",
which ensures informative printing and enables the use of \code{\link[SSBtools]{FormulaSelection}} on this object.
\item{inner}{Data frame corresponding to input data with the main dimensional variables and with cell
frequencies (original, rounded, difference).}
\item{publish}{Data frame of publishable data with the main dimensional variables and with cell frequencies
(original, rounded, difference).}
\item{metrics}{A named character vector of various statistics calculated from the two output data frames
("\code{inner_}" used to distinguish). See examples below and the function \code{\link{HDutility}}.}
\item{freqTable}{Matrix of frequencies of cell frequencies and absolute differences.
For example, row "\code{rounded}" and column "\code{inn.4+}" is the number of rounded
inner cell frequencies greater than or equal to \code{4}.}
}
\description{
Small count rounding of necessary inner cells is performed so that all small frequencies of cross-classifications to be published
(publishable cells) are rounded. The publishable cells can be defined from a model formula, hierarchies or automatically from data.
}
\details{
This function is a user-friendly wrapper for \code{RoundViaDummy} with data frame output and with computed summary of the results.
See \code{\link{RoundViaDummy}} for more details.
}
\examples{
# Small example data set
z <- SmallCountData("e6")
print(z)

# Publishable cells by formula interface
a <- PLSrounding(z, "freq", roundBase = 5,  formula = ~geo + eu + year)
print(a)
print(a$inner)
print(a$publish)
print(a$metrics)
print(a$freqTable)

# Using FormulaSelection()
FormulaSelection(a$publish, ~eu + year)
FormulaSelection(a, ~eu + year) # same as above
FormulaSelection(a)             # just a$publish

# Recalculation of maxdiff, HDutility, meanAbsDiff and rootMeanSquare
max(abs(a$publish[, "difference"]))
HDutility(a$publish[, "original"], a$publish[, "rounded"])
mean(abs(a$publish[, "difference"]))
sqrt(mean((a$publish[, "difference"])^2))

# Five lines below produce equivalent results 
# Ordering of rows can be different
PLSrounding(z, "freq", dimVar = c("geo", "eu", "year"))
PLSrounding(z, "freq", formula = ~eu * year + geo * year)
PLSrounding(z[, -2], "freq", hierarchies = SmallCountData("eHrc"))
PLSrounding(z[, -2], "freq", hierarchies = SmallCountData("eDimList"))
PLSrounding(z[, -2], "freq", hierarchies = SmallCountData("eDimList"), formula = ~geo * year)

# Define publishable cells differently by making use of formula interface
PLSrounding(z, "freq", formula = ~eu * year + geo)

# Define publishable cells differently by making use of hierarchy interface
eHrc2 <- list(geo = c("EU", "@Portugal", "@Spain", "Iceland"), year = c("2018", "2019"))
PLSrounding(z, "freq", hierarchies = eHrc2)

# Also possible to combine hierarchies and formula
PLSrounding(z, "freq", hierarchies = SmallCountData("eDimList"), formula = ~geo + year)

# Single data frame output
PLSroundingInner(z, "freq", roundBase = 5, formula = ~geo + eu + year)
PLSroundingPublish(z, roundBase = 5, formula = ~geo + eu + year)

# Microdata input
PLSroundingInner(rbind(z, z), roundBase = 5, formula = ~geo + eu + year)

# Zero perturbed due to both  extend0 = TRUE and zeroCandidates = TRUE 
set.seed(12345)
PLSroundingInner(z[sample.int(5, 12, replace = TRUE), 1:3], 
                 formula = ~geo + eu + year, roundBase = 5, 
                 extend0 = TRUE, zeroCandidates = TRUE, printInc = TRUE)

# Parameter avoidHierarchical (see RoundViaDummy and ModelMatrix) 
PLSroundingPublish(z, roundBase = 5, formula = ~geo + eu + year, avoidHierarchical = TRUE)


# To illustrate hierarchical_extend0 
#    (parameter to underlying function, SSBtools::Extend0fromModelMatrixInput)
PLSroundingInner(z[-c(2:3), ], roundBase = 5, formula = ~geo + eu + year, 
   avoidHierarchical = TRUE, zeroCandidates = TRUE, extend0 = TRUE)
PLSroundingInner(z[-c(2:3), ], roundBase = 5, formula = ~geo + eu + year, 
   avoidHierarchical = TRUE, zeroCandidates = TRUE, extend0 = TRUE, 
   hierarchical_extend0 = TRUE)

# Package sdcHierarchies can be used to create hierarchies. 
# The small example code below works if this package is available. 
if (require(sdcHierarchies)) {
  z2 <- cbind(geo = c("11", "21", "22"), z[, 3:4], stringsAsFactors = FALSE)
  h2 <- list(
    geo = hier_compute(inp = unique(z2$geo), dim_spec = c(1, 1), root = "Tot", as = "df"),
    year = hier_convert(hier_create(root = "Total", nodes = c("2018", "2019")), as = "df"))
  PLSrounding(z2, "freq", hierarchies = h2)
}

# Use PLS2way to produce tables as in Langsrud and Heldal (2018) and to demonstrate 
# parameters maxRound, zeroCandidates and identifyNew (see RoundViaDummy).   
# Parameter rndSeed used to ensure same output as in reference.
exPSD <- SmallCountData("exPSD")
a <- PLSrounding(exPSD, "freq", 5, formula = ~rows + cols, rndSeed=124)
PLS2way(a, "original")  # Table 1
PLS2way(a)  # Table 2
a <- PLSrounding(exPSD, "freq", 5, formula = ~rows + cols, identifyNew = FALSE, rndSeed=124)
PLS2way(a)  # Table 3
a <- PLSrounding(exPSD, "freq", 5, formula = ~rows + cols, maxRound = 7)
PLS2way(a)  # Values in col1 rounded
a <- PLSrounding(exPSD, "freq", 5, formula = ~rows + cols, zeroCandidates = TRUE)
PLS2way(a)  # (row3, col4): original is 0 and rounded is 5

# Using formula followed by FormulaSelection 
output <- PLSrounding(data = SmallCountData("example1"), 
                      formula = ~age * geo * year + eu * year, 
                      freqVar = "freq", 
                      roundBase = 5)
FormulaSelection(output, ~(age + eu) * year)

# Example similar to the one in the documentation of tables_by_formulas,
# but using PLSroundingPublish with roundBase = 4.
tables_by_formulas(SSBtoolsData("magnitude1"),
                   table_fun = PLSroundingPublish, 
                   table_formulas = list(table_1 = ~region * sector2, 
                                         table_2 = ~region1:sector4 - 1, 
                                         table_3 = ~region + sector4 - 1), 
                   substitute_vars = list(region = c("geo", "eu"), region1 = "eu"), 
                   collapse_vars = list(sector = c("sector2", "sector4")), 
                   roundBase = 4) 

}
\references{
Langsrud, Ø. and Heldal, J. (2018): \dQuote{An Algorithm for Small Count Rounding of Tabular Data}.
Presented at: \emph{Privacy in statistical databases}, Valencia, Spain. September 26-28, 2018.
\url{https://www.researchgate.net/publication/327768398_An_Algorithm_for_Small_Count_Rounding_of_Tabular_Data}
}
\seealso{
\code{\link{RoundViaDummy}}, \code{\link{PLS2way}}, \code{\link[SSBtools]{ModelMatrix}}
}
