fastglmm
Massively scalable generalized linear mixed models
Loading...
Searching...
No Matches
glm.h
Go to the documentation of this file.
1/***************************************************************
2 * @file glm.h
3 * @author Gabriel Hoffman
4 * @email gabriel.hoffman@mssm.edu
5 * @brief Fit generalizd linear models
6 * Copyright (C) 2024 Gabriel Hoffman
7 **************************************************************/
8
9
10#ifndef _GLM_H_
11#define _GLM_H_
12
13// if -D USE_R, use RcppArmadillo library
14#ifdef USE_R
15// [[Rcpp::depends(RcppParallel)]]
16// [[Rcpp::depends(RcppParallel)]]
17#include <RcppArmadillo.h>
18#include <RcppParallel.h>
19#else
20#include <armadillo>
21#include <tbb/tbb.h>
22#endif
23
24
25#include <iostream>
26
27// #include "ModelFit.h"
28#include "linearRegression.h"
29#include "nb_theta.h"
30#include "glm_family.h"
31
32using namespace std;
33using namespace arma;
34
35namespace fastglmmLib {
36
39static void checkResponse(const vec &y, const string &family){
40
41 shared_ptr<GLMFamily> fam = getGLMFamily( family );
42
43 // Keep unique sorted values that are not NAN
44 vec res = unique(omit_nan(y));
45
46 string famStr = fam->family();
47
48 if( famStr == "BinomialLogit" || famStr == "BinomialProbit" ){
49 // valid for binary logistic regression
50 bool valid_binary = (res.n_elem == 2 && (res[0] == 0 && res[1] == 1));
51
52 // valid for binomial with logit/probit link
53 bool valid_beta = (res[0] >= 0 && res[res.n_elem-1] <= 1);
54
55 if( ! (valid_binary || valid_beta) ){
56 throw logic_error( "Invalid response for binomial, must be only 0/1 or in interval [0,1]" );
57 }
58 }
59 else if( famStr == "PoissonLog" || famStr == "QuasipoissonLog" || famStr == "NB"){
60 // min value must be non-negative
61 if( res[0] < 0){
62 throw logic_error( "Invalid response for poisson/nb: must be non-negative" );
63 }
64
65 // check that values are integers
66 if( ! all_integer_valued(res) ){
67 throw logic_error( "Invalid response for poisson/nb: must be integers" );
68 }
69
70 }else if( famStr == "QuasibinomialLogit" ){
71 if( res[0] < 0 || res[res.n_elem-1] >=1 ){
72 throw logic_error( "Invalid response for quasi-binomial, must be between 0 and 1" );
73 }
74 }
75}
76
77
80struct GLMWork : LMWork {
81 vec eta, mu, gprime, z, wsqrt, w;
83};
84
85
102static ModelFitGLM GLM(
103 const mat& X,
104 const colvec& y,
105 const string &family,
106 const ModelDetail md = LOW,
107 const vec &weights = {},
108 const vec &offset = {},
109 GLMWork *work = nullptr,
110 const vec &betaInit = {},
111 const double &epsilon = 1e-8,
112 const double &maxit = 25,
113 const double &lambda = 0){
114
115 shared_ptr<GLMFamily> fam = getGLMFamily( family );
116
117 checkResponse(y, family);
118
119 // allocate work, if not already alloc'd
120 bool alloc_local = false;
121 if( work == nullptr ){
122 alloc_local = true;
123 work = new GLMWork();
124 }
125
126 ModelFit fit;
127 fit.coef = betaInit;
128
129 // if offset is empty, set to zeros
130 vec offset_(offset);
131 if( offset.is_empty() ){
132 offset_ = vec(y.n_elem, fill::zeros);
133 }
134
135 // if weights is empty, set to ones1
136 vec weights_(weights);
137 if( weights.is_empty() ){
138 weights_ = vec(y.n_elem, fill::ones);
139 }
140
141 int i;
142 for(i=0; i<maxit; i++){
143 if( i == 0 && betaInit.is_empty() ){
144 // initialize mu, essential for Poisson
145 // faster convergence than initializing beta to zero
146 work->mu = fam->initialize(y, weights_);
147 work->eta = fam->link(work->mu) ;
148 }else{
149 // linear predictor
150 work->eta = X * fit.coef + offset_;
151 work->mu = fam->linkinv( work->eta );
152 }
153 work->gprime= fam->mu_eta( work->eta );
154 work->z = (work->eta - offset_) + (y - work->mu) / work->gprime;
155 work->wsqrt = work->gprime % sqrt(weights_ / fam->variance( work->mu ));
156
157 vec beta_prev(fit.coef);
158
159 // Solve least squares system to get beta
160 fit = lm(scaleEachCol(X, work->wsqrt), work->z % work->wsqrt, LEAST, lambda, 0, work );
161
162 // if model is singular
163 if( ! fit.success ) break;
164
165 // stopping criterion
166 if( i > 0 && norm(fit.coef - beta_prev) < epsilon ) break;
167 }
168
169 // for last beta value
170 // linear predictor
171 work->eta = X * fit.coef + offset_;
172 work->mu = fam->linkinv( work->eta );
173
174 // Solve least squares system,
175 // estimate other parameters based on ModelDetail
176 // Estimate dispersion if needed
177 // Reduce residual degrees of freedom by the number of
178 // entries with zero weights
179 double rdf_offset = sum(work->wsqrt == 0);
180 fit = lm(scaleEachCol(X, work->wsqrt), work->z % work->wsqrt, md, lambda, rdf_offset, work, fam->estimateDispersion(), true);
181
182 if( md == MAX){
183 // compute raw deviance residuals
184 vec dr = fam->dev_resids(y, work->mu, weights_);
185
186 // transform and store residuals
187 fit.setDevResids( dr, y, work->mu, weights_);
188 }
189
190 if( md >= MOST ){
191 fit.setFittedValues( work->mu, weights_ );
192 }
193
194 if( md >= HIGH ){
195 // if weight is zero, set residuals to NAN
196 fit.residuals.elem(find(work->wsqrt == 0)).fill(datum::nan);
197 }
198
199 if( md >= LOW ){
200 // save variance of fitted eta:
201 // prediction on latent scale
202 fit.varFitted = wvar( work->eta, weights_);
203 }
204
205 // free work if allocated in this function
206 if( alloc_local) delete work;
207
208 return ModelFitGLM(fit, family, i);
209}
210
211
212
213
214
215
234static ModelFitGLM GLM_NB(
235 const mat& X,
236 const colvec& y,
237 const ModelDetail md = LOW,
238 const vec &weights = {},
239 const vec &offset = {},
240 const bool &doCoxReid = true,
241 GLMWork *work = nullptr,
242 const vec &betaInit = {},
243 const double &epsilon = 1e-8,
244 const double &maxit = 25,
245 const double &epsilon_nb = 1e-4,
246 const double &maxit_nb = 5,
247 const double &lambda = 0){
248
249 // allocate work, if not already alloc'd
250 bool alloc_local = false;
251 if( work == nullptr ){
252 alloc_local = true;
253 work = new GLMWork();
254 }
255
256 ModelFitGLM fit;
257 string family;
258 double theta;
259
260 // Fit Poisson regression to estimate coefficients
261 fit = GLM(X, y, "poisson/log", LEAST, weights, offset, work, betaInit, epsilon, maxit, lambda);
262 vec beta_prev(fit.coef);
263
264 // Lookup table to speed up estimation of theta
265 CountTable ct = CreateLUT(y, weights);
266
267 for(int i=0; i<maxit_nb; i++){
268
269 theta = nb_theta_ml(y, work->mu, y.n_elem, weights, X, doCoxReid, ct);
270
271 // Fit NB regression to estimate coefficients
272 beta_prev = fit.coef;
273 family = "nb:" + to_string(theta);
274 fit = GLM(X, y, family, LEAST, weights, offset, work, fit.coef, epsilon, maxit, lambda);
275
276 // if model is singular
277 if( !fit.success ) break;
278
279 // stopping criterion
280 if( norm(fit.coef - beta_prev) < epsilon_nb ) break;
281 }
282
283 // Estimate parameters based on ModelDetail
284 fit = GLM(X, y, family, md, weights, offset, work, fit.coef, epsilon, maxit, lambda);
285 fit.theta = theta;
286
287 // Since theta is estimated from the data
288 // set the dispersion to be 1
289 // So undo scaling by dispersion
290 if( md >= MEDIUM ){
291 fit.vcov = fit.vcov / fit.dispersion;
292 fit.se = sqrt(diagvec(fit.vcov));
293 }
294 fit.dispersion = 1.0;
295
296 // free work if allocated in this function
297 if( alloc_local) delete work;
298
299 return fit;
300}
301
302
303
304
305
306
307
328static ModelFitGLMList glmFitFeatures(
329 const arma::vec &y,
330 const arma::mat &X_design,
331 const arma::mat &X_features,
332 const vector<string> &ids,
333 string family,
334 arma::vec weights = {},
335 const vec &offset = {},
336 const ModelDetail md = LOW,
337 const bool &doCoxReid = true,
338 const bool &shareTheta = false,
339 const bool &fastApprox = false,
340 const int &nthreads = 1,
341 const double &epsilon = 1e-8,
342 const double &maxit = 25,
343 const double &epsilon_nb = 1e-4,
344 const double &maxit_nb = 5,
345 const double &lambda = 0){
346
347 // standardize weights
348 if( ! weights.is_empty() ){
349 weights = weights / mean(weights);
350 }
351
352 int n_covs = X_design.n_cols;
353
354 // Estimate coef using only design matrix
355 // use to initialize coefficients for each feature
356 ModelFitGLM fitInit;
357 GLMWork *work = new GLMWork();
358 if( family == "nb" ){
359 fitInit = GLM_NB(X_design, y, LEAST, weights, offset, doCoxReid, work, {}, epsilon, maxit, epsilon_nb, maxit_nb, lambda);
360
361 // if shareTheta, use same theta value across all features
362 if( shareTheta ){
363 family = "nb:" + to_string(fitInit.theta);
364 }
365 }else{
366 fitInit = GLM(X_design, y, family, LEAST, weights, offset, work, {}, epsilon, maxit, lambda);
367 }
368
369 // get working response
370 vec workingResponse(work->z);
371 vec workingWeights(square(work->wsqrt));
372 workingWeights = workingWeights / mean(workingWeights);
373 delete work;
374
375 ModelFitGLMList fitList(X_features.n_cols, ModelFitGLM());
376
377 if( fastApprox ){
378
379 // Pre-projection on working response
380 // when test of X_features is truely under the null,
381 // approximation is very good
382 ModelFitList mfl = lmFitFeatures_preproj(workingResponse, X_design, X_features, ids, workingWeights, md, nthreads);
383
384 for(int i=0; i<mfl.size(); i++){
385 fitList.at(i) = ModelFitGLM(mfl[i], family, 1);
386 }
387 }else{
388
389 // Full fit of each model
390
391 // set betaInit to [fitInit.coef,0]
392 vec betaInit(fitInit.coef);
393 betaInit.resize(betaInit.n_elem + 1);
394 betaInit[betaInit.n_elem] = 0;
395
396 // Parallel part using Thread Building Blocks
397 tbb::task_arena limited_arena(nthreads);
398 limited_arena.execute([&] {
399 tbb::parallel_for(
400 tbb::blocked_range<int>(0, X_features.n_cols, 100),
401 [&](const tbb::blocked_range<int>& r){
402
403 disable_parallel_blas();
404
405 // create design matrix with jth feature in the last column
406 // X = cbind(X_design, X_features[,0])
407 arma::mat X(X_design);
408 X.insert_cols(n_covs, X_features.col(0));
409
410 GLMWork *work = new GLMWork();
411
412 // iterate through features
413 for (int j = r.begin(); j != r.end(); ++j) {
414 // Create design matrix with intercept as first column
415 X.col(n_covs) = X_features.col(j);
416
417 // GLM regression
418 ModelFitGLM fit;
419 if( family == "nb" ){
420 fit = GLM_NB(X, y, md, weights, offset, doCoxReid, work, betaInit, epsilon, maxit, epsilon_nb, maxit_nb, lambda);
421 }else{
422 fit = GLM(X, y, family, md, weights, offset, work, betaInit, epsilon, maxit, lambda);
423 }
424
425 // Save feature ID
426 fit.ID = ids[j];
427
428 // save result to list
429 fitList.at(j) = fit;
430 }
431
432 delete work;
433 }); });
434 }
435
436 return fitList;
437}
438
439
440
441
442
463static ModelFitGLMList glmFitResponses(
464 const arma::mat &Y,
465 const arma::mat &X,
466 const vector<string> &ids,
467 const vector<string> &family,
468 const arma::vec weights = {},
469 const vec &offset = {},
470 const ModelDetail md = LOW,
471 const bool &doCoxReid = true,
472 const int &nthreads = 1,
473 const double &epsilon = 1e-8,
474 const double &maxit = 25,
475 const double &epsilon_nb = 1e-4,
476 const double & maxit_nb = 5,
477 const double &lambda = 0){
478
479 // standardize weights
480 vec w_norm = weights;
481 if( ! w_norm.is_empty() ){
482 w_norm = w_norm / mean(w_norm);
483 }
484
485 ModelFitGLMList fitList(Y.n_cols, ModelFitGLM());
486
487 // find rows in X with NAN values
488 uvec idx_drop = rows_with_nan(X);
489 mat X_clean(X);
490 X_clean.rows(idx_drop).zeros();
491
492 // Parallel part using Thread Building Blocks
493 tbb::task_arena limited_arena(nthreads);
494 limited_arena.execute([&] {
495 tbb::parallel_for(
496 tbb::blocked_range<int>(0, Y.n_cols, 10),
497 [&](const tbb::blocked_range<int>& r){
498
499 disable_parallel_blas();
500
501 // local workspace
502 GLMWork *work = new GLMWork();
503 vec y, w;
504 uvec idx;
505
506 // iterate through responses
507 for (int j = r.begin(); j != r.end(); ++j) {
508
509 // identify samples with NAN entries
510 // set values and weights to zero
511 y = Y.col(j);
512 w = w_norm;
513 idx = unique(join_cols(find_nan(y), idx_drop));
514 y.elem(idx).zeros();
515 w.elem(idx).zeros();
516
517 // GLM regression
518 ModelFitGLM fit;
519 if( family[j] == "nb" ){
520 fit = GLM_NB(X_clean, y, md, w, offset, doCoxReid, work, {}, epsilon, maxit, epsilon_nb, maxit_nb, lambda);
521 }else{
522 fit = GLM(X_clean, y, family[j], md, w, offset, work, {}, epsilon, maxit, lambda);
523 }
524
525 // Save feature ID
526 fit.ID = ids[j];
527
528 // return mean of mu for jth response
529 fit.mu_mean = mean(work->mu);
530
531 // return mean of response
532 fit.y_mean = mean(y);
533
534 // save result to list
535 fitList.at(j) = fit;
536 }
537 delete work;
538 }); });
539
540 return fitList;
541 }
542
563static ModelFitGLMList glmFitResponses(
564 const arma::mat &Y,
565 const arma::mat &X,
566 const vector<string> &ids,
567 const string &family,
568 const arma::vec &weights = {},
569 const vec &offset = {},
570 const ModelDetail md = LOW,
571 const bool &doCoxReid = true,
572 const int &nthreads = 1,
573 const double &epsilon = 1e-8,
574 const double &maxit = 25,
575 const double &epsilon_nb = 1e-4,
576 const double & maxit_nb = 5,
577 const double &lambda = 0){
578
579 // all responses analyzed with same family value
580 vector<string> famVec(Y.n_cols, family);
581
582 return glmFitResponses( Y, X, ids, famVec, weights, offset, md, doCoxReid, nthreads, epsilon, maxit, epsilon_nb, maxit_nb, lambda);
583}
584
585
586};
587
588#endif
Definition ModelFit.h:165
Definition ModelFit.h:37
vec coef
Definition ModelFit.h:40
mat scaleEachCol(const mat &X, const vec &w)
Definition misc.h:17
Definition CleanData.h:17
ModelDetail
Definition ModelFit.h:26
@ MOST
Definition ModelFit.h:31
@ MEDIUM
Definition ModelFit.h:29
@ LEAST
Definition ModelFit.h:27
@ HIGH
Definition ModelFit.h:30
@ MAX
Definition ModelFit.h:32
@ LOW
Definition ModelFit.h:28
vector< ModelFitGLM > ModelFitGLMList
Definition ModelFit.h:394
vector< ModelFit > ModelFitList
Definition ModelFit.h:393
unordered_map< long, double > CountTable
Definition nb_theta.h:18
Definition glm.h:80
vec w
Definition glm.h:81
GLMWork()
Definition glm.h:82
vec wsqrt
Definition glm.h:81
vec gprime
Definition glm.h:81
vec z
Definition glm.h:81
vec eta
Definition glm.h:81
vec mu
Definition glm.h:81
LMWork()
Definition linearRegression.h:40