% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/01-basic.R
\name{test_WEAT}
\alias{test_WEAT}
\title{Word Embedding Association Test (WEAT) and Single-Category WEAT}
\usage{
test_WEAT(
  data,
  T1,
  T2,
  A1,
  A2,
  use.pattern = FALSE,
  labels = list(),
  p.perm = TRUE,
  p.nsim = 10000,
  p.side = 2,
  seed = NULL,
  pooled.sd = "Caliskan"
)
}
\arguments{
\item{data}{A \code{\link[PsychWordVec:as_wordvec]{wordvec}} (data.table) or
\code{\link[PsychWordVec:as_embed]{embed}} (matrix),
see \code{\link{data_wordvec_load}}.}

\item{T1, T2}{Target words (a vector of words or a regular expression pattern).
If only \code{T1} is specified,
data are tabulated for the single-category WEAT (SC-WEAT).}

\item{A1, A2}{Attribute words (a vector of words or a regular expression pattern).
Both must be specified.}

\item{use.pattern}{Defaults to \code{FALSE} (using a vector of words).
If you use regular expressions in \code{T1}, \code{T2}, \code{A1}, and \code{A2},
please set this argument to \code{TRUE}.}

\item{labels}{Labels for target and attribute concepts (a named \code{list}),
such as (the default)
\code{list(T1="Target1", T2="Target2", A1="Attrib1", A2="Attrib2")}.}

\item{p.perm}{Whether to conduct a permutation test to get the exact or approximate \emph{p} value of the overall effect.
Defaults to \code{TRUE}. See also the \code{\link[sweater:weat_exact]{sweater}} package.}

\item{p.nsim}{Number of samples for resampling in the permutation test. Defaults to \code{10000}.

If \code{p.nsim} is larger than the number of all possible permutations (rearrangements of data),
then it will be ignored and an exact permutation test will be conducted.
Otherwise (in most cases for real data and always for SC-WEAT), a resampling test is performed,
which takes much less computation time and produces the approximate \emph{p} value
(comparable to the exact one).}

\item{p.side}{One-sided (\code{1}) or two-sided (\code{2}) \emph{p} value.
Defaults to \code{2}.

Caliskan et al. (2017) reported one-sided \emph{p} values for WEAT.
Here, I suggest reporting two-sided \emph{p} values as a more conservative estimate.
Users take full responsibility for this choice.
\itemize{
  \item{The one-sided \emph{p} value is calculated as the proportion of sampled permutations
        where the difference in means is greater than the test statistic.}
  \item{The two-sided \emph{p} value is calculated as the proportion of sampled permutations
        where the absolute difference is greater than the test statistic.}
}}

\item{seed}{Random seed for reproducible results of permutation test. Defaults to \code{NULL}.}

\item{pooled.sd}{Method used to calculate the pooled \emph{SD} for effect size estimate in WEAT.
\itemize{
  \item{Defaults to \code{"Caliskan"}: \code{sd(data.diff$cos_sim_diff)}, which is highly recommended
        and identical to Caliskan et al.'s (2017) original approach.}
  \item{If specified otherwise, it will calculate the pooled \emph{SD} as:
        \eqn{\sqrt{[(n_1 - 1) * \sigma_1^2 + (n_2 - 1) * \sigma_2^2] / (n_1 + n_2 - 2)}}.

        This is \strong{NOT recommended} because it may \emph{overestimate} the effect size,
        especially when there are only a few T1 and T2 words that have small variances.}
}}
}
\value{
A \code{list} object of new class \code{weat}:
\describe{
  \item{\code{words.valid}}{
    Valid (actually matched) words}
  \item{\code{words.not.found}}{
    Words not found}
  \item{\code{data.raw}}{
    A \code{data.table} of cosine similarities between all word pairs}
  \item{\code{data.mean}}{
    A \code{data.table} of \emph{mean} cosine similarities
    \emph{across} all attribute words}
  \item{\code{data.diff}}{
    A \code{data.table} of \emph{differential} mean cosine similarities
    \emph{between} the two attribute concepts}
  \item{\code{eff.label}}{
    Description for the difference between the two attribute concepts}
  \item{\code{eff.type}}{
    Effect type: WEAT or SC-WEAT}
  \item{\code{eff}}{
    Raw effect, standardized effect size, and \emph{p} value (if \code{p.perm=TRUE})}
}
}
\description{
Tabulate data (cosine similarity and standardized effect size) and
conduct the permutation test of significance for the
\emph{Word Embedding Association Test} (WEAT) and
\emph{Single-Category Word Embedding Association Test} (SC-WEAT).
\itemize{
  \item{For WEAT, a two-sample permutation test is conducted (i.e., rearrangements of the data).}
  \item{For SC-WEAT, a one-sample permutation test is conducted (i.e., rearrangements of the +/- signs of the data).}
}
}
\section{Download}{

Download pre-trained word vectors data (\code{.RData}):
\url{https://psychbruce.github.io/WordVector_RData.pdf}
}

\examples{
## cc() is more convenient than c()!

\donttest{weat = test_WEAT(
  demodata,
  labels=list(T1="King", T2="Queen", A1="Male", A2="Female"),
  T1=cc("king, King"),
  T2=cc("queen, Queen"),
  A1=cc("male, man, boy, brother, he, him, his, son"),
  A2=cc("female, woman, girl, sister, she, her, hers, daughter"),
  seed=1)
weat

sc_weat = test_WEAT(
  demodata,
  labels=list(T1="Occupation", A1="Male", A2="Female"),
  T1=cc("
    architect, boss, leader, engineer, CEO, officer, manager,
    lawyer, scientist, doctor, psychologist, investigator,
    consultant, programmer, teacher, clerk, counselor,
    salesperson, therapist, psychotherapist, nurse"),
  A1=cc("male, man, boy, brother, he, him, his, son"),
  A2=cc("female, woman, girl, sister, she, her, hers, daughter"),
  seed=1)
sc_weat
}
\dontrun{

## the same as the first example, but using regular expression
weat = test_WEAT(
  demodata,
  labels=list(T1="King", T2="Queen", A1="Male", A2="Female"),
  use.pattern=TRUE,  # use regular expression below
  T1="^[kK]ing$",
  T2="^[qQ]ueen$",
  A1="^male$|^man$|^boy$|^brother$|^he$|^him$|^his$|^son$",
  A2="^female$|^woman$|^girl$|^sister$|^she$|^her$|^hers$|^daughter$",
  seed=1)
weat

## replicating Caliskan et al.'s (2017) results
## WEAT7 (Table 1): d = 1.06, p = .018
## (requiring installation of the `sweater` package)
Caliskan.WEAT7 = test_WEAT(
  as_wordvec(sweater::glove_math),
  labels=list(T1="Math", T2="Arts", A1="Male", A2="Female"),
  T1=cc("math, algebra, geometry, calculus, equations, computation, numbers, addition"),
  T2=cc("poetry, art, dance, literature, novel, symphony, drama, sculpture"),
  A1=cc("male, man, boy, brother, he, him, his, son"),
  A2=cc("female, woman, girl, sister, she, her, hers, daughter"),
  p.side=1, seed=1234)
Caliskan.WEAT7
# d = 1.055, p = .0173 (= 173 counts / 10000 permutation samples)

## replicating Caliskan et al.'s (2017) supplemental results
## WEAT7 (Table S1): d = 0.97, p = .027
Caliskan.WEAT7.supp = test_WEAT(
  demodata,
  labels=list(T1="Math", T2="Arts", A1="Male", A2="Female"),
  T1=cc("math, algebra, geometry, calculus, equations, computation, numbers, addition"),
  T2=cc("poetry, art, dance, literature, novel, symphony, drama, sculpture"),
  A1=cc("male, man, boy, brother, he, him, his, son"),
  A2=cc("female, woman, girl, sister, she, her, hers, daughter"),
  p.side=1, seed=1234)
Caliskan.WEAT7.supp
# d = 0.966, p = .0221 (= 221 counts / 10000 permutation samples)
}

}
\references{
Caliskan, A., Bryson, J. J., & Narayanan, A. (2017).
Semantics derived automatically from language corpora contain human-like biases.
\emph{Science, 356}(6334), 183--186.
}
\seealso{
\code{\link{tab_similarity}}

\code{\link{dict_expand}}

\code{\link{dict_reliability}}

\code{\link{test_RND}}
}
