fastglmm
Massively scalable generalized linear mixed models
Loading...
Searching...
No Matches
glm.h
Go to the documentation of this file.
1/***************************************************************
2 * @file glm.h
3 * @author Gabriel Hoffman
4 * @email gabriel.hoffman@mssm.edu
5 * @brief Fit generalizd linear models
6 * Copyright (C) 2024 Gabriel Hoffman
7 **************************************************************/
8
9
10#ifndef _GLM_H_
11#define _GLM_H_
12
13// if -D USE_R, use RcppArmadillo library
14#ifdef USE_R
15// [[Rcpp::depends(RcppParallel)]]
16// [[Rcpp::depends(RcppParallel)]]
17#include <RcppArmadillo.h>
18#include <RcppParallel.h>
19#else
20#include <armadillo>
21#include <tbb/tbb.h>
22#endif
23
24
25#include <iostream>
26
27// #include "ModelFit.h"
28#include "linearRegression.h"
29#include "nb_theta.h"
30#include "glm_family.h"
31
32using namespace std;
33using namespace arma;
34
35namespace fastglmmLib {
36
39static void checkResponse(const vec &y, const string &family){
40
41 shared_ptr<GLMFamily> fam = getGLMFamily( family );
42
43 // Keep unique sorted values that are not NAN
44 vec res = unique(omit_nan(y));
45
46 string famStr = fam->family();
47
48 if( famStr == "BinomialLogit" || famStr == "BinomialProbit" ){
49 // valid for binary logistic regression
50 bool valid_binary = (res.n_elem == 2 && (res[0] == 0 && res[1] == 1));
51
52 // valid for binomial with logit/probit link
53 bool valid_beta = (res[0] >= 0 && res[res.n_elem-1] <= 1);
54
55 if( ! (valid_binary || valid_beta) ){
56 throw logic_error( "Invalid response for binomial, must be only 0/1 or in interval [0,1]" );
57 }
58 }
59 else if( famStr == "PoissonLog" || famStr == "QuasipoissonLog" || famStr == "NB"){
60 // min value must be non-negative
61 if( res[0] < 0){
62 throw logic_error( "Invalid response for poisson/nb, must be non-negative" );
63 }
64 }else if( famStr == "QuasibinomialLogit" ){
65 if( res[0] < 0 || res[res.n_elem-1] >=1 ){
66 throw logic_error( "Invalid response for quasi-binomial, must be between 0 and 1" );
67 }
68 }
69}
70
71
74struct GLMWork : LMWork {
75 vec eta, mu, gprime, z, wsqrt, w;
77};
78
79
96static ModelFitGLM GLM(
97 const mat& X,
98 const colvec& y,
99 const string &family,
100 const ModelDetail md = LOW,
101 const vec &weights = {},
102 const vec &offset = {},
103 GLMWork *work = nullptr,
104 const vec &betaInit = {},
105 const double &epsilon = 1e-8,
106 const double &maxit = 25,
107 const double &lambda = 0){
108
109 shared_ptr<GLMFamily> fam = getGLMFamily( family );
110
111 checkResponse(y, family);
112
113 // allocate work, if not already alloc'd
114 bool alloc_local = false;
115 if( work == nullptr ){
116 alloc_local = true;
117 work = new GLMWork();
118 }
119
120 ModelFit fit;
121 fit.coef = betaInit;
122
123 // if offset is empty, set to zeros
124 vec offset_(offset);
125 if( offset.is_empty() ){
126 offset_ = vec(y.n_elem, fill::zeros);
127 }
128
129 // if weights is empty, set to ones1
130 vec weights_(weights);
131 if( weights.is_empty() ){
132 weights_ = vec(y.n_elem, fill::ones);
133 }
134
135 int i;
136 for(i=0; i<maxit; i++){
137 if( i == 0 && betaInit.is_empty() ){
138 // initialize mu, essential for Poisson
139 // faster convergence than initializing beta to zero
140 work->mu = fam->initialize(y, weights_);
141 work->eta = fam->link(work->mu) ;
142 }else{
143 // linear predictor
144 work->eta = X * fit.coef + offset_;
145 work->mu = fam->linkinv( work->eta );
146 }
147 work->gprime= fam->mu_eta( work->eta );
148 work->z = (work->eta - offset_) + (y - work->mu) / work->gprime;
149 work->wsqrt = work->gprime % sqrt(weights_ / fam->variance( work->mu ));
150
151 vec beta_prev(fit.coef);
152
153 // Solve least squares system to get beta
154 fit = lm(scaleEachCol(X, work->wsqrt), work->z % work->wsqrt, LEAST, lambda, 0, work );
155
156 // if model is singular
157 if( ! fit.success ) break;
158
159 // stopping criterion
160 if( i > 0 && norm(fit.coef - beta_prev) < epsilon ) break;
161 }
162
163 // for last beta value
164 // linear predictor
165 work->eta = X * fit.coef + offset_;
166 work->mu = fam->linkinv( work->eta );
167
168 // Solve least squares system,
169 // estimate other parameters based on ModelDetail
170 // Estimate dispersion if needed
171 // Reduce residual degrees of freedom by the number of
172 // entries with zero weights
173 double rdf_offset = sum(work->wsqrt == 0);
174 fit = lm(scaleEachCol(X, work->wsqrt), work->z % work->wsqrt, md, lambda, rdf_offset, work, fam->estimateDispersion(), true);
175
176 if( md == MAX){
177 // compute raw deviance residuals
178 vec dr = fam->dev_resids(y, work->mu, weights_);
179
180 // transform and store residuals
181 fit.setDevResids( dr, y, work->mu, weights_);
182 }
183
184 if( md >= MOST ){
185 fit.setFittedValues( work->mu, weights_ );
186 }
187
188 if( md >= HIGH ){
189 // if weight is zero, set residuals to NAN
190 fit.residuals.elem(find(work->wsqrt == 0)).fill(datum::nan);
191 }
192
193 if( md >= LOW ){
194 // save variance of fitted eta:
195 // prediction on latent scale
196 fit.varFitted = wvar( work->eta, weights_);
197 }
198
199 // free work if allocated in this function
200 if( alloc_local) delete work;
201
202 return ModelFitGLM(fit, family, i);
203}
204
205
206
207
208
209
228static ModelFitGLM GLM_NB(
229 const mat& X,
230 const colvec& y,
231 const ModelDetail md = LOW,
232 const vec &weights = {},
233 const vec &offset = {},
234 const bool &doCoxReid = true,
235 GLMWork *work = nullptr,
236 const vec &betaInit = {},
237 const double &epsilon = 1e-8,
238 const double &maxit = 25,
239 const double &epsilon_nb = 1e-4,
240 const double &maxit_nb = 5,
241 const double &lambda = 0){
242
243 // allocate work, if not already alloc'd
244 bool alloc_local = false;
245 if( work == nullptr ){
246 alloc_local = true;
247 work = new GLMWork();
248 }
249
250 ModelFitGLM fit;
251 string family;
252 double theta;
253
254 // Fit Poisson regression to estimate coefficients
255 fit = GLM(X, y, "poisson/log", LEAST, weights, offset, work, betaInit, epsilon, maxit, lambda);
256 vec beta_prev(fit.coef);
257
258 // Lookup table to speed up estimation of theta
259 CountTable ct = CreateLUT(y, weights);
260
261 for(int i=0; i<maxit_nb; i++){
262
263 theta = nb_theta_ml(y, work->mu, y.n_elem, weights, X, doCoxReid, ct);
264
265 // Fit NB regression to estimate coefficients
266 beta_prev = fit.coef;
267 family = "nb:" + to_string(theta);
268 fit = GLM(X, y, family, LEAST, weights, offset, work, fit.coef, epsilon, maxit, lambda);
269
270 // if model is singular
271 if( !fit.success ) break;
272
273 // stopping criterion
274 if( norm(fit.coef - beta_prev) < epsilon_nb ) break;
275 }
276
277 // Estimate parameters based on ModelDetail
278 fit = GLM(X, y, family, md, weights, offset, work, fit.coef, epsilon, maxit, lambda);
279 fit.theta = theta;
280
281 // Since theta is estimated from the data
282 // set the dispersion to be 1
283 // So undo scaling by dispersion
284 if( md >= MEDIUM ){
285 fit.vcov = fit.vcov / fit.dispersion;
286 fit.se = sqrt(diagvec(fit.vcov));
287 }
288 fit.dispersion = 1.0;
289
290 // free work if allocated in this function
291 if( alloc_local) delete work;
292
293 return fit;
294}
295
296
297
298
299
300
301
322static ModelFitGLMList glmFitFeatures(
323 const arma::vec &y,
324 const arma::mat &X_design,
325 const arma::mat &X_features,
326 const vector<string> &ids,
327 string family,
328 arma::vec weights = {},
329 const vec &offset = {},
330 const ModelDetail md = LOW,
331 const bool &doCoxReid = true,
332 const bool &shareTheta = false,
333 const bool &fastApprox = false,
334 const int &nthreads = 1,
335 const double &epsilon = 1e-8,
336 const double &maxit = 25,
337 const double &epsilon_nb = 1e-4,
338 const double &maxit_nb = 5,
339 const double &lambda = 0){
340
341 // standardize weights
342 if( ! weights.is_empty() ){
343 weights = weights / mean(weights);
344 }
345
346 int n_covs = X_design.n_cols;
347
348 // Estimate coef using only design matrix
349 // use to initialize coefficients for each feature
350 ModelFitGLM fitInit;
351 GLMWork *work = new GLMWork();
352 if( family == "nb" ){
353 fitInit = GLM_NB(X_design, y, LEAST, weights, offset, doCoxReid, work, {}, epsilon, maxit, epsilon_nb, maxit_nb, lambda);
354
355 // if shareTheta, use same theta value across all features
356 if( shareTheta ){
357 family = "nb:" + to_string(fitInit.theta);
358 }
359 }else{
360 fitInit = GLM(X_design, y, family, LEAST, weights, offset, work, {}, epsilon, maxit, lambda);
361 }
362
363 // get working response
364 vec workingResponse(work->z);
365 vec workingWeights(square(work->wsqrt));
366 workingWeights = workingWeights / mean(workingWeights);
367 delete work;
368
369 ModelFitGLMList fitList(X_features.n_cols, ModelFitGLM());
370
371 if( fastApprox ){
372
373 // Pre-projection on working response
374 // when test of X_features is truely under the null,
375 // approximation is very good
376 ModelFitList mfl = lmFitFeatures_preproj(workingResponse, X_design, X_features, ids, workingWeights, md, nthreads);
377
378 for(int i=0; i<mfl.size(); i++){
379 fitList.at(i) = ModelFitGLM(mfl[i], family, 1);
380 }
381 }else{
382
383 // Full fit of each model
384
385 // set betaInit to [fitInit.coef,0]
386 vec betaInit(fitInit.coef);
387 betaInit.resize(betaInit.n_elem + 1);
388 betaInit[betaInit.n_elem] = 0;
389
390 // Parallel part using Thread Building Blocks
391 tbb::task_arena limited_arena(nthreads);
392 limited_arena.execute([&] {
393 tbb::parallel_for(
394 tbb::blocked_range<int>(0, X_features.n_cols, 100),
395 [&](const tbb::blocked_range<int>& r){
396
397 disable_parallel_blas();
398
399 // create design matrix with jth feature in the last column
400 // X = cbind(X_design, X_features[,0])
401 arma::mat X(X_design);
402 X.insert_cols(n_covs, X_features.col(0));
403
404 GLMWork *work = new GLMWork();
405
406 // iterate through features
407 for (int j = r.begin(); j != r.end(); ++j) {
408 // Create design matrix with intercept as first column
409 X.col(n_covs) = X_features.col(j);
410
411 // GLM regression
412 ModelFitGLM fit;
413 if( family == "nb" ){
414 fit = GLM_NB(X, y, md, weights, offset, doCoxReid, work, betaInit, epsilon, maxit, epsilon_nb, maxit_nb, lambda);
415 }else{
416 fit = GLM(X, y, family, md, weights, offset, work, betaInit, epsilon, maxit, lambda);
417 }
418
419 // Save feature ID
420 fit.ID = ids[j];
421
422 // save result to list
423 fitList.at(j) = fit;
424 }
425
426 delete work;
427 }); });
428 }
429
430 return fitList;
431}
432
433
434
435
436
457static ModelFitGLMList glmFitResponses(
458 const arma::mat &Y,
459 const arma::mat &X,
460 const vector<string> &ids,
461 const vector<string> &family,
462 const arma::vec weights = {},
463 const vec &offset = {},
464 const ModelDetail md = LOW,
465 const bool &doCoxReid = true,
466 const int &nthreads = 1,
467 const double &epsilon = 1e-8,
468 const double &maxit = 25,
469 const double &epsilon_nb = 1e-4,
470 const double & maxit_nb = 5,
471 const double &lambda = 0){
472
473 // standardize weights
474 vec w_norm = weights;
475 if( ! w_norm.is_empty() ){
476 w_norm = w_norm / mean(w_norm);
477 }
478
479 ModelFitGLMList fitList(Y.n_cols, ModelFitGLM());
480
481 // find rows in X with NAN values
482 uvec idx_drop = rows_with_nan(X);
483 mat X_clean(X);
484 X_clean.rows(idx_drop).zeros();
485
486 // Parallel part using Thread Building Blocks
487 tbb::task_arena limited_arena(nthreads);
488 limited_arena.execute([&] {
489 tbb::parallel_for(
490 tbb::blocked_range<int>(0, Y.n_cols, 10),
491 [&](const tbb::blocked_range<int>& r){
492
493 disable_parallel_blas();
494
495 // local workspace
496 GLMWork *work = new GLMWork();
497 vec y, w;
498 uvec idx;
499
500 // iterate through responses
501 for (int j = r.begin(); j != r.end(); ++j) {
502
503 // identify samples with NAN entries
504 // set values and weights to zero
505 y = Y.col(j);
506 w = w_norm;
507 idx = unique(join_cols(find_nan(y), idx_drop));
508 y.elem(idx).zeros();
509 w.elem(idx).zeros();
510
511 // GLM regression
512 ModelFitGLM fit;
513 if( family[j] == "nb" ){
514 fit = GLM_NB(X_clean, y, md, w, offset, doCoxReid, work, {}, epsilon, maxit, epsilon_nb, maxit_nb, lambda);
515 }else{
516 fit = GLM(X_clean, y, family[j], md, w, offset, work, {}, epsilon, maxit, lambda);
517 }
518
519 // Save feature ID
520 fit.ID = ids[j];
521
522 // return mean of mu for jth response
523 fit.mu_mean = mean(work->mu);
524
525 // return mean of response
526 fit.y_mean = mean(y);
527
528 // save result to list
529 fitList.at(j) = fit;
530 }
531 delete work;
532 }); });
533
534 return fitList;
535 }
536
557static ModelFitGLMList glmFitResponses(
558 const arma::mat &Y,
559 const arma::mat &X,
560 const vector<string> &ids,
561 const string &family,
562 const arma::vec &weights = {},
563 const vec &offset = {},
564 const ModelDetail md = LOW,
565 const bool &doCoxReid = true,
566 const int &nthreads = 1,
567 const double &epsilon = 1e-8,
568 const double &maxit = 25,
569 const double &epsilon_nb = 1e-4,
570 const double & maxit_nb = 5,
571 const double &lambda = 0){
572
573 // all responses analyzed with same family value
574 vector<string> famVec(Y.n_cols, family);
575
576 return glmFitResponses( Y, X, ids, famVec, weights, offset, md, doCoxReid, nthreads, epsilon, maxit, epsilon_nb, maxit_nb, lambda);
577}
578
579
580};
581
582#endif
Definition ModelFit.h:165
Definition ModelFit.h:37
vec coef
Definition ModelFit.h:40
mat scaleEachCol(const mat &X, const vec &w)
Definition misc.h:16
Definition CleanData.h:17
ModelDetail
Definition ModelFit.h:26
@ MOST
Definition ModelFit.h:31
@ MEDIUM
Definition ModelFit.h:29
@ LEAST
Definition ModelFit.h:27
@ HIGH
Definition ModelFit.h:30
@ MAX
Definition ModelFit.h:32
@ LOW
Definition ModelFit.h:28
vector< ModelFitGLM > ModelFitGLMList
Definition ModelFit.h:394
vector< ModelFit > ModelFitList
Definition ModelFit.h:393
unordered_map< long, double > CountTable
Definition nb_theta.h:18
Definition glm.h:74
vec w
Definition glm.h:75
GLMWork()
Definition glm.h:76
vec wsqrt
Definition glm.h:75
vec gprime
Definition glm.h:75
vec z
Definition glm.h:75
vec eta
Definition glm.h:75
vec mu
Definition glm.h:75
LMWork()
Definition linearRegression.h:40