17#include <RcppArmadillo.h>
18#include <RcppParallel.h>
39static void checkResponse(
const vec &y,
const string &family){
41 shared_ptr<GLMFamily> fam = getGLMFamily( family );
44 vec res = unique(omit_nan(y));
46 string famStr = fam->family();
48 if( famStr ==
"BinomialLogit" || famStr ==
"BinomialProbit" ){
50 bool valid_binary = (res.n_elem == 2 && (res[0] == 0 && res[1] == 1));
53 bool valid_beta = (res[0] >= 0 && res[res.n_elem-1] <= 1);
55 if( ! (valid_binary || valid_beta) ){
56 throw logic_error(
"Invalid response for binomial, must be only 0/1 or in interval [0,1]" );
59 else if( famStr ==
"PoissonLog" || famStr ==
"QuasipoissonLog" || famStr ==
"NB"){
62 throw logic_error(
"Invalid response for poisson/nb, must be non-negative" );
64 }
else if( famStr ==
"QuasibinomialLogit" ){
65 if( res[0] < 0 || res[res.n_elem-1] >=1 ){
66 throw logic_error(
"Invalid response for quasi-binomial, must be between 0 and 1" );
96static ModelFitGLM GLM(
101 const vec &weights = {},
102 const vec &offset = {},
104 const vec &betaInit = {},
105 const double &epsilon = 1e-8,
106 const double &maxit = 25,
107 const double &lambda = 0){
109 shared_ptr<GLMFamily> fam = getGLMFamily( family );
111 checkResponse(y, family);
114 bool alloc_local =
false;
115 if( work ==
nullptr ){
125 if( offset.is_empty() ){
126 offset_ = vec(y.n_elem, fill::zeros);
130 vec weights_(weights);
131 if( weights.is_empty() ){
132 weights_ = vec(y.n_elem, fill::ones);
136 for(i=0; i<maxit; i++){
137 if( i == 0 && betaInit.is_empty() ){
140 work->mu = fam->initialize(y, weights_);
141 work->eta = fam->link(work->mu) ;
144 work->eta = X * fit.coef + offset_;
145 work->mu = fam->linkinv( work->eta );
147 work->gprime= fam->mu_eta( work->eta );
148 work->z = (work->eta - offset_) + (y - work->mu) / work->gprime;
149 work->wsqrt = work->gprime % sqrt(weights_ / fam->variance( work->mu ));
151 vec beta_prev(fit.coef);
154 fit = lm(
scaleEachCol(X, work->wsqrt), work->z % work->wsqrt,
LEAST, lambda, 0, work );
157 if( ! fit.success )
break;
160 if( i > 0 && norm(fit.coef - beta_prev) < epsilon )
break;
165 work->eta = X * fit.coef + offset_;
166 work->mu = fam->linkinv( work->eta );
173 double rdf_offset = sum(work->wsqrt == 0);
174 fit = lm(
scaleEachCol(X, work->wsqrt), work->z % work->wsqrt, md, lambda, rdf_offset, work, fam->estimateDispersion(),
true);
178 vec dr = fam->dev_resids(y, work->mu, weights_);
181 fit.setDevResids( dr, y, work->mu, weights_);
185 fit.setFittedValues( work->mu, weights_ );
190 fit.residuals.elem(find(work->wsqrt == 0)).fill(datum::nan);
196 fit.varFitted = wvar( work->eta, weights_);
200 if( alloc_local)
delete work;
232 const vec &weights = {},
233 const vec &offset = {},
234 const bool &doCoxReid =
true,
236 const vec &betaInit = {},
237 const double &epsilon = 1e-8,
238 const double &maxit = 25,
239 const double &epsilon_nb = 1e-4,
240 const double &maxit_nb = 5,
241 const double &lambda = 0){
244 bool alloc_local =
false;
245 if( work ==
nullptr ){
255 fit = GLM(X, y,
"poisson/log",
LEAST, weights, offset, work, betaInit, epsilon, maxit, lambda);
256 vec beta_prev(fit.coef);
261 for(
int i=0; i<maxit_nb; i++){
263 theta = nb_theta_ml(y, work->mu, y.n_elem, weights, X, doCoxReid, ct);
266 beta_prev = fit.coef;
267 family =
"nb:" + to_string(theta);
268 fit = GLM(X, y, family,
LEAST, weights, offset, work, fit.coef, epsilon, maxit, lambda);
271 if( !fit.success )
break;
274 if( norm(fit.coef - beta_prev) < epsilon_nb )
break;
278 fit = GLM(X, y, family, md, weights, offset, work, fit.coef, epsilon, maxit, lambda);
285 fit.vcov = fit.vcov / fit.dispersion;
286 fit.se = sqrt(diagvec(fit.vcov));
288 fit.dispersion = 1.0;
291 if( alloc_local)
delete work;
324 const arma::mat &X_design,
325 const arma::mat &X_features,
326 const vector<string> &ids,
328 arma::vec weights = {},
329 const vec &offset = {},
331 const bool &doCoxReid =
true,
332 const bool &shareTheta =
false,
333 const bool &fastApprox =
false,
334 const int &nthreads = 1,
335 const double &epsilon = 1e-8,
336 const double &maxit = 25,
337 const double &epsilon_nb = 1e-4,
338 const double &maxit_nb = 5,
339 const double &lambda = 0){
342 if( ! weights.is_empty() ){
343 weights = weights / mean(weights);
346 int n_covs = X_design.n_cols;
352 if( family ==
"nb" ){
353 fitInit = GLM_NB(X_design, y,
LEAST, weights, offset, doCoxReid, work, {}, epsilon, maxit, epsilon_nb, maxit_nb, lambda);
357 family =
"nb:" + to_string(fitInit.theta);
360 fitInit = GLM(X_design, y, family,
LEAST, weights, offset, work, {}, epsilon, maxit, lambda);
364 vec workingResponse(work->z);
365 vec workingWeights(square(work->wsqrt));
366 workingWeights = workingWeights / mean(workingWeights);
376 ModelFitList mfl = lmFitFeatures_preproj(workingResponse, X_design, X_features, ids, workingWeights, md, nthreads);
378 for(
int i=0; i<mfl.size(); i++){
386 vec betaInit(fitInit.coef);
387 betaInit.resize(betaInit.n_elem + 1);
388 betaInit[betaInit.n_elem] = 0;
391 tbb::task_arena limited_arena(nthreads);
392 limited_arena.execute([&] {
394 tbb::blocked_range<int>(0, X_features.n_cols, 100),
395 [&](
const tbb::blocked_range<int>& r){
397 disable_parallel_blas();
401 arma::mat X(X_design);
402 X.insert_cols(n_covs, X_features.col(0));
404 GLMWork *work = new GLMWork();
407 for (int j = r.begin(); j != r.end(); ++j) {
409 X.col(n_covs) = X_features.col(j);
413 if( family ==
"nb" ){
414 fit = GLM_NB(X, y, md, weights, offset, doCoxReid, work, betaInit, epsilon, maxit, epsilon_nb, maxit_nb, lambda);
416 fit = GLM(X, y, family, md, weights, offset, work, betaInit, epsilon, maxit, lambda);
460 const vector<string> &ids,
461 const vector<string> &family,
462 const arma::vec weights = {},
463 const vec &offset = {},
465 const bool &doCoxReid =
true,
466 const int &nthreads = 1,
467 const double &epsilon = 1e-8,
468 const double &maxit = 25,
469 const double &epsilon_nb = 1e-4,
470 const double & maxit_nb = 5,
471 const double &lambda = 0){
474 vec w_norm = weights;
475 if( ! w_norm.is_empty() ){
476 w_norm = w_norm / mean(w_norm);
482 uvec idx_drop = rows_with_nan(X);
484 X_clean.rows(idx_drop).zeros();
487 tbb::task_arena limited_arena(nthreads);
488 limited_arena.execute([&] {
490 tbb::blocked_range<int>(0, Y.n_cols, 10),
491 [&](
const tbb::blocked_range<int>& r){
493 disable_parallel_blas();
496 GLMWork *work = new GLMWork();
501 for (int j = r.begin(); j != r.end(); ++j) {
507 idx = unique(join_cols(find_nan(y), idx_drop));
513 if( family[j] ==
"nb" ){
514 fit = GLM_NB(X_clean, y, md, w, offset, doCoxReid, work, {}, epsilon, maxit, epsilon_nb, maxit_nb, lambda);
516 fit = GLM(X_clean, y, family[j], md, w, offset, work, {}, epsilon, maxit, lambda);
523 fit.mu_mean = mean(work->mu);
526 fit.y_mean = mean(y);
560 const vector<string> &ids,
561 const string &family,
562 const arma::vec &weights = {},
563 const vec &offset = {},
565 const bool &doCoxReid =
true,
566 const int &nthreads = 1,
567 const double &epsilon = 1e-8,
568 const double &maxit = 25,
569 const double &epsilon_nb = 1e-4,
570 const double & maxit_nb = 5,
571 const double &lambda = 0){
574 vector<string> famVec(Y.n_cols, family);
576 return glmFitResponses( Y, X, ids, famVec, weights, offset, md, doCoxReid, nthreads, epsilon, maxit, epsilon_nb, maxit_nb, lambda);
Definition ModelFit.h:165
vec coef
Definition ModelFit.h:40
mat scaleEachCol(const mat &X, const vec &w)
Definition misc.h:16
Definition CleanData.h:17
ModelDetail
Definition ModelFit.h:26
@ MOST
Definition ModelFit.h:31
@ MEDIUM
Definition ModelFit.h:29
@ LEAST
Definition ModelFit.h:27
@ HIGH
Definition ModelFit.h:30
@ MAX
Definition ModelFit.h:32
@ LOW
Definition ModelFit.h:28
vector< ModelFitGLM > ModelFitGLMList
Definition ModelFit.h:394
vector< ModelFit > ModelFitList
Definition ModelFit.h:393
unordered_map< long, double > CountTable
Definition nb_theta.h:18
GLMWork()
Definition glm.h:76
vec wsqrt
Definition glm.h:75
vec gprime
Definition glm.h:75
vec eta
Definition glm.h:75
vec mu
Definition glm.h:75
LMWork()
Definition linearRegression.h:40