17#include <RcppArmadillo.h>
18#include <RcppParallel.h>
39static void checkResponse(
const vec &y,
const string &family){
41 shared_ptr<GLMFamily> fam = getGLMFamily( family );
44 vec res = unique(omit_nan(y));
46 string famStr = fam->family();
48 if( famStr ==
"BinomialLogit" || famStr ==
"BinomialProbit" ){
50 bool valid_binary = (res.n_elem == 2 && (res[0] == 0 && res[1] == 1));
53 bool valid_beta = (res[0] >= 0 && res[res.n_elem-1] <= 1);
55 if( ! (valid_binary || valid_beta) ){
56 throw logic_error(
"Invalid response for binomial, must be only 0/1 or in interval [0,1]" );
59 else if( famStr ==
"PoissonLog" || famStr ==
"QuasipoissonLog" || famStr ==
"NB"){
62 throw logic_error(
"Invalid response for poisson/nb: must be non-negative" );
66 if( ! all_integer_valued(res) ){
67 throw logic_error(
"Invalid response for poisson/nb: must be integers" );
70 }
else if( famStr ==
"QuasibinomialLogit" ){
71 if( res[0] < 0 || res[res.n_elem-1] >=1 ){
72 throw logic_error(
"Invalid response for quasi-binomial, must be between 0 and 1" );
102static ModelFitGLM GLM(
105 const string &family,
107 const vec &weights = {},
108 const vec &offset = {},
110 const vec &betaInit = {},
111 const double &epsilon = 1e-8,
112 const double &maxit = 25,
113 const double &lambda = 0){
115 shared_ptr<GLMFamily> fam = getGLMFamily( family );
117 checkResponse(y, family);
120 bool alloc_local =
false;
121 if( work ==
nullptr ){
131 if( offset.is_empty() ){
132 offset_ = vec(y.n_elem, fill::zeros);
136 vec weights_(weights);
137 if( weights.is_empty() ){
138 weights_ = vec(y.n_elem, fill::ones);
142 for(i=0; i<maxit; i++){
143 if( i == 0 && betaInit.is_empty() ){
146 work->mu = fam->initialize(y, weights_);
147 work->eta = fam->link(work->mu) ;
150 work->eta = X * fit.coef + offset_;
151 work->mu = fam->linkinv( work->eta );
153 work->gprime= fam->mu_eta( work->eta );
154 work->z = (work->eta - offset_) + (y - work->mu) / work->gprime;
155 work->wsqrt = work->gprime % sqrt(weights_ / fam->variance( work->mu ));
157 vec beta_prev(fit.coef);
160 fit = lm(
scaleEachCol(X, work->wsqrt), work->z % work->wsqrt,
LEAST, lambda, 0, work );
163 if( ! fit.success )
break;
166 if( i > 0 && norm(fit.coef - beta_prev) < epsilon )
break;
171 work->eta = X * fit.coef + offset_;
172 work->mu = fam->linkinv( work->eta );
179 double rdf_offset = sum(work->wsqrt == 0);
180 fit = lm(
scaleEachCol(X, work->wsqrt), work->z % work->wsqrt, md, lambda, rdf_offset, work, fam->estimateDispersion(),
true);
184 vec dr = fam->dev_resids(y, work->mu, weights_);
187 fit.setDevResids( dr, y, work->mu, weights_);
191 fit.setFittedValues( work->mu, weights_ );
196 fit.residuals.elem(find(work->wsqrt == 0)).fill(datum::nan);
202 fit.varFitted = wvar( work->eta, weights_);
206 if( alloc_local)
delete work;
238 const vec &weights = {},
239 const vec &offset = {},
240 const bool &doCoxReid =
true,
242 const vec &betaInit = {},
243 const double &epsilon = 1e-8,
244 const double &maxit = 25,
245 const double &epsilon_nb = 1e-4,
246 const double &maxit_nb = 5,
247 const double &lambda = 0){
250 bool alloc_local =
false;
251 if( work ==
nullptr ){
261 fit = GLM(X, y,
"poisson/log",
LEAST, weights, offset, work, betaInit, epsilon, maxit, lambda);
262 vec beta_prev(fit.coef);
267 for(
int i=0; i<maxit_nb; i++){
269 theta = nb_theta_ml(y, work->mu, y.n_elem, weights, X, doCoxReid, ct);
272 beta_prev = fit.coef;
273 family =
"nb:" + to_string(theta);
274 fit = GLM(X, y, family,
LEAST, weights, offset, work, fit.coef, epsilon, maxit, lambda);
277 if( !fit.success )
break;
280 if( norm(fit.coef - beta_prev) < epsilon_nb )
break;
284 fit = GLM(X, y, family, md, weights, offset, work, fit.coef, epsilon, maxit, lambda);
291 fit.vcov = fit.vcov / fit.dispersion;
292 fit.se = sqrt(diagvec(fit.vcov));
294 fit.dispersion = 1.0;
297 if( alloc_local)
delete work;
330 const arma::mat &X_design,
331 const arma::mat &X_features,
332 const vector<string> &ids,
334 arma::vec weights = {},
335 const vec &offset = {},
337 const bool &doCoxReid =
true,
338 const bool &shareTheta =
false,
339 const bool &fastApprox =
false,
340 const int &nthreads = 1,
341 const double &epsilon = 1e-8,
342 const double &maxit = 25,
343 const double &epsilon_nb = 1e-4,
344 const double &maxit_nb = 5,
345 const double &lambda = 0){
348 if( ! weights.is_empty() ){
349 weights = weights / mean(weights);
352 int n_covs = X_design.n_cols;
358 if( family ==
"nb" ){
359 fitInit = GLM_NB(X_design, y,
LEAST, weights, offset, doCoxReid, work, {}, epsilon, maxit, epsilon_nb, maxit_nb, lambda);
363 family =
"nb:" + to_string(fitInit.theta);
366 fitInit = GLM(X_design, y, family,
LEAST, weights, offset, work, {}, epsilon, maxit, lambda);
370 vec workingResponse(work->z);
371 vec workingWeights(square(work->wsqrt));
372 workingWeights = workingWeights / mean(workingWeights);
382 ModelFitList mfl = lmFitFeatures_preproj(workingResponse, X_design, X_features, ids, workingWeights, md, nthreads);
384 for(
int i=0; i<mfl.size(); i++){
392 vec betaInit(fitInit.coef);
393 betaInit.resize(betaInit.n_elem + 1);
394 betaInit[betaInit.n_elem] = 0;
397 tbb::task_arena limited_arena(nthreads);
398 limited_arena.execute([&] {
400 tbb::blocked_range<int>(0, X_features.n_cols, 100),
401 [&](
const tbb::blocked_range<int>& r){
403 disable_parallel_blas();
407 arma::mat X(X_design);
408 X.insert_cols(n_covs, X_features.col(0));
410 GLMWork *work = new GLMWork();
413 for (int j = r.begin(); j != r.end(); ++j) {
415 X.col(n_covs) = X_features.col(j);
419 if( family ==
"nb" ){
420 fit = GLM_NB(X, y, md, weights, offset, doCoxReid, work, betaInit, epsilon, maxit, epsilon_nb, maxit_nb, lambda);
422 fit = GLM(X, y, family, md, weights, offset, work, betaInit, epsilon, maxit, lambda);
466 const vector<string> &ids,
467 const vector<string> &family,
468 const arma::vec weights = {},
469 const vec &offset = {},
471 const bool &doCoxReid =
true,
472 const int &nthreads = 1,
473 const double &epsilon = 1e-8,
474 const double &maxit = 25,
475 const double &epsilon_nb = 1e-4,
476 const double & maxit_nb = 5,
477 const double &lambda = 0){
480 vec w_norm = weights;
481 if( ! w_norm.is_empty() ){
482 w_norm = w_norm / mean(w_norm);
488 uvec idx_drop = rows_with_nan(X);
490 X_clean.rows(idx_drop).zeros();
493 tbb::task_arena limited_arena(nthreads);
494 limited_arena.execute([&] {
496 tbb::blocked_range<int>(0, Y.n_cols, 10),
497 [&](
const tbb::blocked_range<int>& r){
499 disable_parallel_blas();
502 GLMWork *work = new GLMWork();
507 for (int j = r.begin(); j != r.end(); ++j) {
513 idx = unique(join_cols(find_nan(y), idx_drop));
519 if( family[j] ==
"nb" ){
520 fit = GLM_NB(X_clean, y, md, w, offset, doCoxReid, work, {}, epsilon, maxit, epsilon_nb, maxit_nb, lambda);
522 fit = GLM(X_clean, y, family[j], md, w, offset, work, {}, epsilon, maxit, lambda);
529 fit.mu_mean = mean(work->mu);
532 fit.y_mean = mean(y);
566 const vector<string> &ids,
567 const string &family,
568 const arma::vec &weights = {},
569 const vec &offset = {},
571 const bool &doCoxReid =
true,
572 const int &nthreads = 1,
573 const double &epsilon = 1e-8,
574 const double &maxit = 25,
575 const double &epsilon_nb = 1e-4,
576 const double & maxit_nb = 5,
577 const double &lambda = 0){
580 vector<string> famVec(Y.n_cols, family);
582 return glmFitResponses( Y, X, ids, famVec, weights, offset, md, doCoxReid, nthreads, epsilon, maxit, epsilon_nb, maxit_nb, lambda);
Definition ModelFit.h:165
vec coef
Definition ModelFit.h:40
mat scaleEachCol(const mat &X, const vec &w)
Definition misc.h:17
Definition CleanData.h:17
ModelDetail
Definition ModelFit.h:26
@ MOST
Definition ModelFit.h:31
@ MEDIUM
Definition ModelFit.h:29
@ LEAST
Definition ModelFit.h:27
@ HIGH
Definition ModelFit.h:30
@ MAX
Definition ModelFit.h:32
@ LOW
Definition ModelFit.h:28
vector< ModelFitGLM > ModelFitGLMList
Definition ModelFit.h:394
vector< ModelFit > ModelFitList
Definition ModelFit.h:393
unordered_map< long, double > CountTable
Definition nb_theta.h:18
GLMWork()
Definition glm.h:82
vec wsqrt
Definition glm.h:81
vec gprime
Definition glm.h:81
vec eta
Definition glm.h:81
vec mu
Definition glm.h:81
LMWork()
Definition linearRegression.h:40