% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ssn_lm.R
\name{ssn_lm}
\alias{ssn_lm}
\title{Fitting Linear Models for Spatial Stream Networks}
\usage{
ssn_lm(
  formula,
  ssn.object,
  tailup_type = "none",
  taildown_type = "none",
  euclid_type = "none",
  nugget_type = "nugget",
  tailup_initial,
  taildown_initial,
  euclid_initial,
  nugget_initial,
  additive,
  estmethod = "reml",
  anisotropy = FALSE,
  random,
  randcov_initial,
  partition_factor,
  local,
  ...
)
}
\arguments{
\item{formula}{A two-sided linear formula describing the fixed effect structure
of the model, with the response to the left of the \code{~} operator and
the terms on the right, separated by \code{+} operators.}

\item{ssn.object}{A spatial stream network object with class \code{SSN}.}

\item{tailup_type}{The tailup covariance function type. Available options
include \code{"linear"}, \code{"spherical"}, \code{"exponential"},
\code{"mariah"}, \code{"epa"}, \code{"gaussian"}, and \code{"none"}. Parameterizations are
described in Details.}

\item{taildown_type}{The taildown covariance function type. Available options
include \code{"linear"}, \code{"spherical"}, \code{"exponential"},
\code{"mariah"}, \code{"epa"}, \code{"gaussian"}, and \code{"none"}. Parameterizations are
described in Details.}

\item{euclid_type}{The euclidean covariance function type. Available options
include \code{"spherical"}, \code{"exponential"}, \code{"gaussian"},
\code{"cosine"}, \code{"cubic"}, \code{"pentaspherical"}, \code{"wave"},
\code{"jbessel"}, \code{"gravity"}, \code{"rquad"}, \code{"magnetic"}, and
\code{"none"}. Parameterizations are
described in Details.}

\item{nugget_type}{The nugget covariance function type. Available options
include \code{"nugget"} or \code{"none"}. Parameterizations are
described in Details.}

\item{tailup_initial}{An object from \code{\link[=tailup_initial]{tailup_initial()}} specifying initial and/or
known values for the tailup covariance parameters.}

\item{taildown_initial}{An object from \code{\link[=taildown_initial]{taildown_initial()}} specifying initial and/or
known values for the taildown covariance parameters.}

\item{euclid_initial}{An object from \code{\link[=euclid_initial]{euclid_initial()}} specifying initial and/or
known values for the euclidean covariance parameters.}

\item{nugget_initial}{An object from \code{\link[=nugget_initial]{nugget_initial()}} specifying initial and/or
known values for the nugget covariance parameters.}

\item{additive}{The name of the variable in \code{ssn.object} that is used
to define spatial weights. Can be quoted or unquoted. For the tailup covariance functions, these additive
weights are used for branching. Technical details that describe the role
of the additive variable in the tailup covariance function are available
in Ver Hoef and Peterson (2010).}

\item{estmethod}{The estimation method. Available options include
\code{"reml"} for restricted maximum likelihood and \code{"ml"} for maximum
likelihood. The default is \code{"reml"}.}

\item{anisotropy}{A logical indicating whether (geometric) anisotropy should
be modeled. Not required if \code{euclid_initial} is provided with 1) \code{rotate}
assumed unknown or assumed known and non-zero or 2) \code{scale} assumed unknown
or assumed known and less than one. When \code{anisotropy} is \code{TRUE},
computational times can significantly increase. The default is \code{FALSE}.}

\item{random}{A one-sided linear formula describing the random effect structure
of the model. Terms are specified to the right of the \code{~} operator.
Each term has the structure \code{x1 + ... + xn | g1/.../gm}, where \code{x1 + ... + xn}
specifies the model for the random effects and \code{g1/.../gm} is the grouping
structure. Separate terms are separated by \code{+} and must generally
be wrapped in parentheses. Random intercepts are added to each model
implicitly when at least one other variable is defined.
If a random intercept is not desired, this must be explicitly
defined (e.g., \code{x1 + ... + xn - 1 | g1/.../gm}). If only a random intercept
is desired for a grouping structure, the random intercept must be specified
as \code{1 | g1/.../gm}. Note that \code{g1/.../gm} is shorthand for \code{(1 | g1/.../gm)}.
If only random intercepts are desired and the shorthand notation is used,
parentheses can be omitted.}

\item{randcov_initial}{An optional object specifying initial and/or
known values for the random effect variances. See \code{\link[spmodel:randcov_initial]{spmodel::randcov_initial()}}.}

\item{partition_factor}{A one-sided linear formula with a single term
specifying the partition factor.  The partition factor assumes observations
from different levels of the partition factor are uncorrelated.}

\item{local}{An optional logical or list controlling the big data approximation.
\code{local} can only be used when big data distance matrices have been created
using \code{\link[=ssn_create_bigdist]{ssn_create_bigdist()}} and is most beneficial when the sample size is
at least 5,000 (for \code{ssn_lm()}) or 3,000 (for \code{ssn_glm()}).
If a list is provided, the following arguments detail the big
data approximation:
\itemize{
\item \code{index}: The group indexes. Observations in different
levels of \code{index} are assumed to be uncorrelated for the
purposes of estimation. If \code{index} is not provided, it is
determined by specifying \code{method} and either \code{size} or \code{groups}.
\item \code{method}: The big data approximation method used to determine \code{index}. Ignored
if \code{index} is provided. If \code{method = "random"},
observations are randomly assigned to \code{index} based on \code{size}.
If \code{method = "kmeans"}, observations are assigned to \code{index}
based on k-means clustering on the coordinates with \code{groups} clusters. The default
is \code{"kmeans"}. Note that both methods have a random component, which
means that you may get different results from separate model fitting calls.
To ensure consistent results, specify \code{index} or set a seed via
\code{base::set.seed()}.
\item \code{size}: The number of observations in each \code{index} group
when \code{method} is \code{"random"}. If the number of observations
is not divisible by \code{size}, some levels get \code{size - 1} observations.
The default is 100.
\item \code{groups}: The number of \code{index} groups. If \code{method}
is \code{"random"}, \code{size} is \eqn{ceiling(n / groups)}, where
\eqn{n} is the sample size. Automatically determined if \code{size}
is specified. If \code{method} is \code{"kmeans"}, \code{groups}
is the number of clusters.
\item \code{var_adjust}: The approach for adjusting the variance-covariance
matrix of the fixed effects. \code{"none"} for no adjustment, \code{"theoretical"}
for the theoretically-correct adjustment,
\code{"pooled"} for the pooled adjustment, and \code{"empirical"} for the
empirical adjustment. The default is \code{"theoretical"} for sample sizes
up to 100,000 and \code{"none"} for sample sizes exceeding 100,000.
\item \code{parallel}: If \code{TRUE}, parallel processing via the
parallel package is automatically used. The default is \code{FALSE}.
\item \code{ncores}: If \code{parallel = TRUE}, the number of cores to
parallelize over. The default is the number of available cores on your machine.
}
When \code{local} is a list, at least one list element must be provided to
initialize default arguments for the other list elements.
If \code{local} is \code{TRUE}, defaults for \code{local} are chosen such
that \code{local} is transformed into
\code{list(size = 100, method = "kmeans", var_adjust = "theoretical", parallel = FALSE)}.}

\item{...}{Other arguments to \code{stats::optim()}.}
}
\value{
A list with many elements that store information about
the fitted model object and has class \code{ssn_lm}. Many generic functions that
summarize model fit are available for \code{ssn_lm} objects, including
\code{AIC}, \code{AICc}, \code{anova}, \code{augment}, \code{coef},
\code{cooks.distance}, \code{covmatrix}, \code{deviance}, \code{fitted}, \code{formula},
\code{glance}, \code{glances}, \code{hatvalues}, \code{influence},
\code{labels}, \code{logLik}, \code{loocv}, \code{model.frame}, \code{model.matrix},
\code{plot}, \code{predict}, \code{print}, \code{pseudoR2}, \code{summary},
\code{terms}, \code{tidy}, \code{update}, \code{varcomp}, and \code{vcov}.

This fitted model list contains the following elements:
\itemize{
\item \code{additive}: The name of the additive function value column.
\item \code{anisotropy}: Whether euclidean anisotropy was modeled.
\item \code{call}: The function call.
\item \code{coefficients}: Model coefficients.
\item \code{contrasts}: Any user-supplied contrasts.
\item \code{cooks_distance}: Cook's distance values.
\item \code{crs}: The geographic coordinate reference system.
\item \code{deviance}: The model deviance.
\item \code{diagtol}: A tolerance value that may be added to the diagonal
of covariance matrices to encourage decomposition stability.
\item \code{estmethod}: The estimation method.
\item \code{euclid_max}: The maximum euclidean distance.
\item \code{fitted}: Fitted values.
\item \code{formula}: The model formula.
\item \code{hatvalues}: The hat (leverage) values.
\item \code{is_known}: An object that identifies which parameters are known.
\item \code{local_index}: An index identifier used internally for sorting.
\item \code{missing_index}: Which rows in the "obs" object had missing responses.
\item \code{n}: The sample size.
\item \code{npar}: The number of estimated covariance parameters.
\item \code{observed_index}: Which rows in the "obs" object had observed responses.
\item \code{optim}: The optimization output.
\item \code{p}: The number of fixed effects.
\item \code{partition_factor}: The partition factor formula.
\item \code{pseudoR2}: The pseudo R-squared.
\item \code{random}: The random effect formula.
\item \code{residuals}: The residuals.
\item \code{sf_column_name}: The name of the geometry column in \code{ssn.object}.
\item \code{ssn.object}: An updated \code{ssn.object}.
\item \code{tail_max}: The maximum stream distance.
\item \code{terms}: The model terms.
\item \code{vcov}: Variance-covariance matrices.
\item \code{xlevels}: The levels of factors in the model matrix.
}

These list elements are meant to be used with various generic functions
(e.g., \code{residuals()}) that operate on the model object.
While possible to access elements of the fitted model list directly, we strongly
advise against doing so when there is a generic available to return the element
of interest. For example, we strongly recommend using \code{residuals()} to
obtain model residuals instead of accessing the fitted model list directly via
\code{object$residuals}.
}
\description{
This function works on spatial stream network objects to fit
linear models with spatially autocorrelated errors using likelihood methods, allowing for
non-spatial random effects, anisotropy, partition factors, big data methods, and more.
The spatial formulation is described in Ver Hoef and Peterson (2010)
and Peterson and Ver Hoef (2010).
}
\details{
The linear model for spatial stream networks can be written as
\eqn{y = X \beta + zu + zd + ze + n}, where \eqn{X} is the fixed effects design
matrix, \eqn{\beta} are the fixed effects, \eqn{zu} is tailup random error,
\eqn{zd} is taildown random error, \eqn{ze} is Euclidean random error,
and \eqn{n} is nugget random error. The tailup random errors capture spatial
covariance moving downstream (and depend on downstream distance), the taildown
random errors capture spatial covariance moving upstream (and depend on upstream
distance), the Euclidean random errors capture spatial covariance that depends on
Euclidean distance, and the nugget random errors capture variability
independent of spatial locations. The response \eqn{y} is modeled using a
spatial covariance function expressed as
\eqn{de(zu) * R(zu) + de(zd) * R(zd) + de(ze) * R(ze) + nugget * I}.
\eqn{de(zu)}, \eqn{de(zd)}, and \eqn{de(ze)} represent the tailup, taildown, and Euclidean
variances, respectively. \eqn{R(zu)}, \eqn{R(zd)}, and \eqn{R(ze)} represent the tailup,
taildown, and Euclidean correlation matrices, respectively. Each correlation
matrix depends on a range parameter that controls the distance-decay behavior
of the correlation. \eqn{nugget} represents the nugget variance and
\eqn{I} represents an identity matrix.

\code{tailup_type} Details: Let \eqn{D} be a matrix of hydrologic distances,
\eqn{W} be a diagonal matrix of weights from \code{additive}, \eqn{r = D / range},
and \eqn{I} be
an identity matrix. Then parametric forms for flow-connected
elements of \eqn{R(zu)} are given below:
\itemize{
\item linear: \eqn{(1 - r) * (r <= 1) * W}
\item spherical: \eqn{(1 - 1.5r + 0.5r^3) * (r <= 1) * W}
\item exponential: \eqn{exp(-r) * W}
\item mariah: \eqn{log(90r + 1) / 90r * (D > 0) + 1 * (D = 0) * W}
\item epa: \eqn{(D - range)^2 * F * (r <= 1) * W / 16range^5}
\item gaussian: \eqn{2 exp(-r^2) * (1 - pnorm(r * 2^{1/2})) * W}
\item none: \eqn{I * W}
}

Details describing the \code{F} matrix in the \code{epa} covariance are given in Garreta et al. (2010).
Flow-unconnected elements of \eqn{R(zu)} are assumed uncorrelated.
Observations on different networks are also assumed uncorrelated.

\code{taildown_type} Details: Let \eqn{D} be a matrix of hydrologic distances,
\eqn{r = D / range},
and \eqn{I} be an identity matrix. Then parametric forms for flow-connected
elements of \eqn{R(zd)} are given below:
\itemize{
\item linear: \eqn{(1 - r) * (r <= 1)}
\item spherical: \eqn{(1 - 1.5r + 0.5r^3) * (r <= 1)}
\item exponential: \eqn{exp(-r)}
\item mariah: \eqn{log(90r + 1) / 90r * (D > 0) + 1 * (D = 0)}
\item epa: \eqn{(D - range)^2 * F1 * (r <= 1) / 16range^5}
\item gaussian: \eqn{2 exp(-r^2) * (1 - pnorm(r * 2^{1/2}))}
\item none: \eqn{I}
}

Now let \eqn{A} be a matrix that contains the shorter of the two distances
between two sites and the common downstream junction, \eqn{r1 = A / range},
\eqn{B} be a matrix that contains the longer of the two distances between two sites and the
common downstream junction, \eqn{r2 = B / range},  and \eqn{I} be an identity matrix.
Then parametric forms for flow-unconnected elements of \eqn{R(zd)} are given below:
\itemize{
\item linear: \eqn{(1 - r2) * (r2 <= 1)}
\item spherical: \eqn{(1 - 1.5r1 + 0.5r2) * (1 - r2)^2 * (r2 <= 1)}
\item exponential: \eqn{exp(-(r1 + r2))}
\item mariah: \eqn{(log(90r1 + 1) - log(90r2 + 1)) / (90r1 - 90r2) * (A =/ B) + (1 / (90r1 + 1)) * (A = B)}
\item epa: \eqn{(B - range)^2 * F2 * (r2 <= 1) / 16range^5}
\item gaussian: \eqn{2 exp(-(B - A) / range) * (1 - pnorm(r * 2^{1/2})) * W}
\item none: \eqn{I}
}

Details describing the \code{F1} and \code{F2} matrices in the \code{epa}
covariance are given in Garreta et al. (2010).
Observations on different networks are assumed uncorrelated.

\code{euclid_type} Details: Let \eqn{D} be a matrix of Euclidean distances,
\eqn{r = D / range}, and \eqn{I} be an identity matrix. Then parametric
forms for elements of \eqn{R(ze)} are given below:
\itemize{
\item exponential: \eqn{exp(- r )}
\item spherical: \eqn{(1 - 1.5r + 0.5r^3) * (r <= 1)}
\item gaussian: \eqn{exp(- r^2 )}
\item cubic: \eqn{(1 - 7r^2 + 8.75r^3 - 3.5r^5 + 0.75r^7) * (r <= 1)}
\item pentaspherical: \eqn{(1 - 1.875r + 1.25r^3 - 0.375r^5) * (r <= 1)}
\item cosine: \eqn{cos(r)}
\item wave: \eqn{sin(r) * (D > 0) / r + (D = 0)}
\item jbessel: \eqn{Bj(D * range)}, where Bj is the Bessel-J function
\item gravity: \eqn{(1 + r^2)^{-0.5}}
\item rquad: \eqn{(1 + r^2)^{-1}}
\item magnetic: \eqn{(1 + r^2)^{-1.5}}
\item none: \eqn{I}
}

\code{nugget_type} Details: Let \eqn{I} be an identity matrix and \eqn{0}
be the zero matrix. Then parametric
forms for elements of the nugget variance are given below:
\itemize{
\item nugget: \eqn{I}
\item none: \eqn{0}
}
In short, the nugget effect is modeled when \code{nugget_type} is \code{"nugget"}
and omitted when \code{nugget_type} is \code{"none"}.

\code{estmethod} Details: The various estimation methods are
\itemize{
\item \code{reml}: Maximize the restricted log-likelihood.
\item \code{ml}: Maximize the log-likelihood.
}

\code{anisotropy} Details: By default, all Euclidean covariance parameters except \code{rotate}
and \code{scale} are assumed unknown, requiring estimation. If either \code{rotate} or \code{scale}
are given initial values other than 0 and 1 (respectively) or are assumed unknown
in \code{\link[=euclid_initial]{euclid_initial()}}, \code{anisotropy} is implicitly set to \code{TRUE}.
(Geometric) Anisotropy is modeled by transforming a Euclidean covariance function that
decays differently in different directions to one that decays equally in all
directions via rotation and scaling of the original Euclidean coordinates. The rotation is
controlled by the \code{rotate} parameter in \eqn{[0, \pi]} radians. The scaling
is controlled by the \code{scale} parameter in \eqn{[0, 1]}. The anisotropy
correction involves first a rotation of the coordinates clockwise by \code{rotate} and then a
scaling of the coordinates' minor axis by the reciprocal of \code{scale}. The Euclidean
covariance is then computed using these transformed coordinates.

\code{random} Details: If random effects are used, the model
can be written as \eqn{y = X \beta + W1\gamma 1 + ... + Wj\gamma j + zu + zd + ze + n},
where each \eqn{W} is a random effects design matrix and each \eqn{\gamma} is a random effect.

\code{partition_factor} Details: The partition factor can be represented in matrix form as \eqn{P}, where
elements of \eqn{P} equal one for observations in the same level of the partition
factor and zero otherwise. The covariance matrix involving only the
spatial and random effects components is then multiplied element-wise
(Hadamard product) by \eqn{P}, yielding the final covariance matrix.

\code{local} Details: The big data approximation works by sorting observations into different levels
of an index variable. Observations in different levels of the index variable
are assumed to be uncorrelated for the purposes of model fitting. Sparse matrix methods are then implemented
for significant computational gains. Parallelization generally further speeds up
computations when data sizes are larger than a few thousand. Both the \code{"random"} and \code{"kmeans"} values of \code{method}
in \code{local} have random components. That means you may get slightly different
results when using the big data approximation and rerunning \code{ssn_lm()} with the same code. For consistent results,
either set a seed via \code{base::set.seed()} or specify \code{index} to \code{local}.

Other Details: Observations with \code{NA} response values are removed for model
fitting, but their values can be predicted afterwards by running
\code{predict(object)}.
}
\note{
This function does not perform any internal scaling. If optimization is not
stable due to extremely large variances, scale relevant variables
so they have variance 1 before optimization.
}
\examples{
# Copy the mf04p .ssn data to a local directory and read it into R
# When modeling with your .ssn object, you will load it using the relevant
# path to the .ssn data on your machine
copy_lsn_to_temp()
temp_path <- paste0(tempdir(), "/MiddleFork04.ssn")
mf04p <- ssn_import(temp_path, overwrite = TRUE)

ssn_mod <- ssn_lm(
  formula = Summer_mn ~ ELEV_DEM,
  ssn.object = mf04p,
  tailup_type = "exponential",
  additive = "afvArea"
)
summary(ssn_mod)

}
\references{
Garreta, V., Monestiez, P. and Ver Hoef, J.M. (2010) Spatial modelling and
prediction on river networks: up model, down model, or hybrid?
\emph{Environmetrics} \bold{21(5)}, 439--456.

Peterson, E.E. and Ver Hoef, J.M. (2010) A mixed-model moving-average approach
to geostatistical modeling in stream networks. \emph{Ecology} \bold{91(3)},
644--651.

Ver Hoef, J.M. and Peterson, E.E. (2010) A moving average approach for spatial
statistical models of stream networks (with discussion).
\emph{Journal of the American Statistical Association} \bold{105}, 6--18.
DOI: 10.1198/jasa.2009.ap08248.  Rejoinder pgs. 22--24.
}
