% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ConfProb.R
\name{EstConf}
\alias{EstConf}
\title{Confidence Probabilities}
\usage{
EstConf(
  Pedigree = NULL,
  LifeHistData = NULL,
  args.sim = list(nSnp = 400, SnpError = 0.001, ParMis = c(0.4, 0.4)),
  args.seq = list(Module = "ped", Err = 0.001, Tassign = 0.5, CalcLLR = FALSE),
  nSim = 10,
  nCores = 1,
  quiet = TRUE
)
}
\arguments{
\item{Pedigree}{reference pedigree from which to simulate, dataframe with
columns id-dam-sire. Additional columns are ignored.}

\item{LifeHistData}{dataframe with id, sex (1=female, 2=male, 3=unknown),
birth year, and optionally BY.min - BY.max - YearLast.}

\item{args.sim}{list of arguments to pass to \code{\link{SimGeno}}, such as
\code{nSnp} (number of SNPs), \code{SnpError} (genotyping error rate) and
\code{ParMis} (proportion of non-genotyped parents). Set to \code{NULL} to
use all default values.}

\item{args.seq}{list of arguments to pass to \code{\link{sequoia}}, such as
\code{Module} ('par' or 'ped'), \code{Err} (assumed genotyping error rate),
and \code{Complex}. May include (part of) \code{SeqList}, a list of sequoia
output (i.e. as a list-within-a-list). Set to \code{NULL} to use all
default values.}

\item{nSim}{number of rounds of simulate - reconstruct - compare to
perform, i.e. number of simulated datasets.}

\item{nCores}{number of computer cores to use. If \code{>1}, package
\pkg{parallel} is used. Set to \code{NULL} to use all but one of the available
cores, as detected by \code{parallel::detectCores()} (using all cores tends
to freeze up your computer). With large datasets, the amount of computer
memory may be the limiting factor for the number of cores you can use.}

\item{quiet}{suppress messages. \code{TRUE} runs \code{SimGeno} and
\code{sequoia} quietly, \code{'very'} also suppresses other messages and
the simulation counter when \code{nCores=1} (there is no simulation counter
when \code{nCores>1}).}
}
\value{
A list, with elements:
  \item{ConfProb}{See below}
  \item{PedErrors}{See below}
  \item{Pedigree.reference}{the pedigree from which data was simulated}
  \item{LifeHistData}{}
  \item{Pedigree.inferred}{a list with for each simulation the inferred
    pedigree based on the simulated data}
  \item{SimSNPd}{a list with for each simulation the IDs of the individuals
    simulated to have been genotyped}
  \item{PedComp.fwd}{array with \code{Counts} from the 'forward'
    \code{PedCompare}, from which \code{PedErrors} is calculated}
  \item{RunParams}{a list with the call to \code{EstConf} as a semi-nested
  list (args.sim, args.seq, nSim, nCores), as well as the default parameter
  values for \code{SimGeno} and \code{sequoia}.}
  \item{RunTime}{\code{sequoia} runtime per simulation in seconds, as
    measured by \code{\link{system.time}()['elapsed']}.}

Dataframe \code{ConfProb} has 7 columns:
\item{id.cat, dam.cat, sire.cat}{Category of the focal individual, dam, and
  sire, in the pedigree inferred based on the simulated data. Coded as
  G=genotyped, D=dummy, X=none}
\item{dam.conf}{Probability that the dam is correct, given the categories of
  the assigned dam and sire (ignoring whether or not the sire is correct)}
\item{sire.conf}{as \code{dam.conf}, for the sire}
\item{pair.conf}{Probability that both dam and sire are correct, given their
  categories}
\item{N}{Number of individuals per category-combination, across all
  \code{nSim} simulations}

Array \code{PedErrors} has three dimensions:
\item{class}{\itemize{
  \item \code{FalseNeg}(atives): could have been assigned but was not
(individual + parent both genotyped or dummifiable; P1only in
\code{PedCompare}).
  \item \code{FalsePos}(itives): no parent in reference pedigree, but
one was assigned based on the simulated data (P2only)
  \item \code{Mismatch}: different parents between the pedigrees
  }}
\item{cat}{Category of individual + parent, as a two-letter code where the
  first letter indicates the focal individual and the second the parent;
  G=Genotyped, D=Dummy, T=Total}
\item{parent}{dam or sire}
}
\description{
Estimate confidence probabilities ('backward') and assignment
  error rates ('forward') per category (genotyped/dummy) by repeatedly
  simulating genotype data from a reference pedigree using
  \code{\link{SimGeno}}, reconstructing a pedigree from this using
  \code{\link{sequoia}}, and counting the number of mismatches using
  \code{\link{PedCompare}}.
}
\details{
The confidence probability is taken as the number of correct
  (matching) assignments, divided by all assignments made in the
  \emph{observed} (inferred-from-simulated) pedigree. In contrast, the false
  negative & false positive assignment rates are proportions of the number of
  parents in the \emph{true} (reference) pedigree. Each rate is calculated
  separately for dams & sires, and separately for each category
  (\strong{G}enotyped/\strong{D}ummy(fiable)/\strong{X} (none)) of
  individual, parent and co-parent.

 This function does not know which individuals in the actual \code{Pedigree}
 are genotyped, so the confidence probabilities need to be added to the
 \code{Pedigree} as shown in the example at the bottom.

 A confidence of \eqn{1} means all assignments on simulated data were correct for
 that category-combination. It should be interpreted as (and perhaps modified
 to) \eqn{> 1 - 1/N}, where sample size \code{N} is given in the last column
 of the \code{ConfProb} and \code{PedErrors} dataframes in the output. The
 same applies for a false negative/positive rate of \eqn{0} (i.e. to be
 interpreted as \eqn{< 1/N}).
}
\section{Assumptions}{

  Because the actual true pedigree is (typically) unknown, the provided
  reference pedigree is used as a stand-in and assumed to be the true
  pedigree, with unrelated founders. It is also assumed that the probability
  to be genotyped is equal for all parents; in each round, a new random
  set of parents (proportion set by \code{ParMis}) is mimicked to be
  non-genotyped. In addition, SNPs are assumed to segregate independently.

  An experimental version offering more fine-grained control is available at
  \url{https://github.com/JiscaH/sequoiaExtra}.
}

\section{Object size}{

  The size in Kb of the returned list can become pretty big, as each of the
  inferred pedigrees is included. When running \code{EstConf} many times for
  a range of parameter values, it may be prudent to save the required summary
  statistics for each run rather than the full output.
}

\section{Errors}{

  If you have a large pedigree and try to run this function on multiple
  cores, you may run into "Cannot allocate vector of size ..." errors or even
  unexpected crashes: there is not enough computer memory for each separate
  run. Try reducing \code{nCores}.
}

\examples{
# estimate proportion of parents that are genotyped (= 1 - ParMis)
prop_parents_genotyped <- c(
  dam = mean(unique(SeqOUT_griffin$Pedigree$dam) \%in\% rownames(Geno_griffin)),
  sire = mean(unique(SeqOUT_griffin$Pedigree$sire) \%in\% rownames(Geno_griffin))
)

# Example for parentage assignment only
conf_grif <- EstConf(Pedigree = SeqOUT_griffin$Pedigree,
               LifeHistData = SeqOUT_griffin$LifeHist,
               args.sim = list(nSnp = 150,   # no. in actual data, or what-if
                               SnpError = 5e-3,  # best estimate, or what-if
                               CallRate=0.9,     # from SnpStats()
                               ParMis=c(0.28, 0.22)),  # calc'd above
               args.seq = list(Err=5e-3, Module="par"),  # as in real run
               nSim = 1,   # try-out, proper run >=20 (10 if huge pedigree)
               nCores=1)

# parent-pair confidence, per category (Genotyped/Dummy/None)
conf_grif$ConfProb

# Proportion of true parents that was correctly assigned
# (1 minus the summed error rates, per category and per parent)
1 - apply(conf_grif$PedErrors, MARGIN=c('cat','parent'), FUN=sum, na.rm=TRUE)

# add columns with confidence probabilities to pedigree
# first add columns with category (G/D/X)
Ped.withConf <- getAssignCat(Pedigree = SeqOUT_griffin$Pedigree,
                             SNPd = SeqOUT_griffin$PedigreePar$id)
# then merge in the confidence probabilities on the shared columns
Ped.withConf <- merge(Ped.withConf, conf_grif$ConfProb, all.x=TRUE,
                      sort=FALSE)  # (note: merge() messes up column order)
head(Ped.withConf[Ped.withConf$dam.cat=="G", ])

# save output summary
\dontrun{
# the note is added to the same object that is saved below
conf_grif[['Note']] <- 'You could add a note'
saveRDS(conf_grif[c('ConfProb','PedComp.fwd','RunParams','RunTime','Note')],
   file = 'conf_200SNPs_Err005_Callrate80.RDS')
}

## overall assignment rate (AR), error rate (ER) & runtime
AR_max <- sum(!is.na(Ped_griffin$dam)) + sum(!is.na(Ped_griffin$sire))
ER_max <- 2*nrow(Ped_griffin)
PCT <- conf_grif$PedComp.fwd[,'TT',,]   # Total-Total counts
list(AR = mean(apply(PCT[,'Match',],1,sum)/AR_max),  # sum over dam+sire
     ER = mean(apply(PCT[,c('Mismatch','P2only'),],1,sum)/ER_max),
     Time = mean(conf_grif$RunTime)/60)   # runtime in seconds --> minutes


## P(actual FS | inferred as FS) etc.
\dontrun{
PairL <- list()
# one comparison per simulated dataset; seq_along() is safe for empty lists
for (i in seq_along(conf_grif$Pedigree.inferred)) {  # nSim
  cat(i, "\t")
  PairL[[i]] <- ComparePairs(conf_grif$Pedigree.reference,
                             conf_grif$Pedigree.inferred[[i]],
                             GenBack=1, patmat=TRUE, ExcludeDummies = TRUE,
                             Return="Counts")
}
# P(actual relationship (Ped1) | inferred relationship (Ped2))
PairRel.prop.A <- plyr::laply(PairL, function(M)
                     sweep(M, MARGIN='Ped2', STATS=colSums(M), FUN="/"))
PairRel.prop <- apply(PairRel.prop.A, 2:3, mean, na.rm=TRUE) #avg across sims
round(PairRel.prop, 3)
# or: P(inferred relationship | actual relationship)
PairRel.prop2 <- plyr::laply(PairL, function(M)
   sweep(M, MARGIN='Ped1', STATS=rowSums(M), FUN="/"))
}

\dontrun{
# confidence probability vs. sibship size
source('https://raw.githubusercontent.com/JiscaH/sequoiaExtra/main/conf_vs_sibsize.R')
conf_grif_nOff <- Conf_by_nOff(conf_grif)
conf_grif_nOff['conf',,'GD',]
conf_grif_nOff['N',,'GD',]
}

}
\seealso{
\code{\link{SimGeno}, \link{sequoia}, \link{PedCompare}}.
}
