% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/supervised_learning.R
\name{custom_rec_lin_model}
\alias{custom_rec_lin_model}
\title{Create a Custom Record Linkage Model}
\usage{
custom_rec_lin_model(ml_model, vectors)
}
\arguments{
\item{ml_model}{A trained ML model that predicts the probability of a match based on comparison vectors.}

\item{vectors}{An object of class \code{comparison_vectors} (the output of the \code{comparison_vectors} function) that was used to train \code{ml_model}.}
}
\value{
Returns a list containing:\cr
\itemize{
\item{\code{b_vars} -- here \code{NULL},}
\item{\code{cpar_vars} -- here \code{NULL},}
\item{\code{cnonpar_vars} -- here \code{NULL},}
\item{\code{b_params} -- here \code{NULL},}
\item{\code{cpar_params} -- here \code{NULL},}
\item{\code{cnonpar_params} -- here \code{NULL},}
\item{\code{ratio_kliep} -- here \code{NULL},}
\item{\code{ratio_kliep_list} -- here \code{NULL},}
\item{\code{ml_model} -- ML model used for creating the record linkage model,}
\item{\code{pi_est} -- a prior probability of matching,}
\item{\code{match_prop} -- proportion of matches in the smaller dataset,}
\item{\code{variables} -- a character vector of key variables used for comparison,}
\item{\code{comparators} -- a list of functions used to compare pairs of records,}
\item{\code{methods} -- here \code{NULL},}
\item{\code{prob_ratio} -- here \code{"2"}.}
}
}
\description{
Creates a supervised record linkage model using a custom machine learning (ML) classifier.
}
\details{
The \code{custom_rec_lin_model} function creates a custom record linkage model
based on known matches and non-matches. The resulting model can later serve as
a classifier for pairs outside the training data. The procedure for creating
a custom model from training data is as follows.
\enumerate{
\item{Use the \code{comparison_vectors} function to compare pairs of records.}
\item{Train a machine learning classifier using the \code{Omega} element
of the output of the \code{comparison_vectors} function. The classifier should
predict the probability of matching based on a given vector.}
\item{Use the \code{custom_rec_lin_model} function with
appropriate arguments.}
}
}
\examples{
# Run the example only when the optional xgboost package is installed.
if (requireNamespace("xgboost", quietly = TRUE)) {
  # First dataset: ten records described by name and surname.
  df_1 <- data.frame(
    "name" = c("James", "Emma", "William", "Olivia", "Thomas",
    "Sophie", "Harry", "Amelia", "George", "Isabella"),
    "surname" = c("Smith", "Johnson", "Brown", "Taylor", "Wilson",
    "Davis", "Clark", "Harris", "Lewis", "Walker")
  )
  # Second dataset: its first four records correspond to the first four
  # records of df_1 (three of them with misspellings); the remaining six
  # records do not appear in df_1.
  df_2 <- data.frame(
    "name" = c("James", "Ema", "Wimliam", "Olivia", "Charlotte",
    "Henry", "Lucy", "Edward", "Alice", "Jack"),
    "surname" = c("Smith", "Johnson", "Bron", "Tailor", "Moore",
    "Evans", "Hall", "Wright", "Green", "King")
  )
  # One comparison function per key variable.
  comparators <- list("name" = jarowinkler_complement(),
                      "surname" = jarowinkler_complement())
  # Known matching pairs passed to comparison_vectors: (1, 1) through (4, 4).
  # NOTE(review): columns a and b presumably hold row indices of A and B - confirm.
  matches <- data.frame("a" = 1:4, "b" = 1:4)
  # Compare record pairs from df_1 and df_2 on both key variables.
  vectors <- comparison_vectors(A = df_1, B = df_2, variables = c("name", "surname"),
                               comparators = comparators, matches = matches)
  # Train an xgboost binary classifier: the gamma_* columns of Omega are the
  # features and the match column (converted to a factor) is the label.
  model_xgb <- xgboost::xgboost(x = as.matrix(vectors$Omega[, c("gamma_name", "gamma_surname")]),
                       y = factor(vectors$Omega$match),
                       objective = "binary:logistic", eval_metric = "logloss",
                       nrounds = 100, verbosity = 0)
  # Combine the trained classifier and the comparison vectors into a
  # custom record linkage model.
  custom_xgb_model <- custom_rec_lin_model(model_xgb, vectors)
  # Display the resulting model object.
  custom_xgb_model
}
}
\author{
Adam Struzik
}
