11#include <unordered_set>
14#include <boost/algorithm/string.hpp>
27static vector<double> GP_to_dosage(
const vector<T> &v,
const bool &missingToMean) {
28 vector<double> res( v.size() / 3.0);
32 int runningSum = 0, nValid = 0;
37 for(
int i=0; i<res.size(); i++){
39 value = v[3*i]*0 + v[3*i+1]*1 + v[3*i+2]*2;
44 value = std::numeric_limits<double>::quiet_NaN();
57 double mu = runningSum / (double) nValid;
62 for(
const int& i : missing) res[i] = mu;
72static vector<double> intToDosage(
const vector<int> &v,
const bool &missingToMean) {
75 vector<double> res( v.size() / 2.0);
79 int runningSum = 0, nValid = 0;
84 for(
int i=0; i<res.size(); i++){
85 value = v[2*i] + v[2*i+1];
90 value = std::numeric_limits<double>::quiet_NaN();
103 double mu = runningSum / (double) nValid;
108 for(
const int& i : missing) res[i] = mu;
118static size_t removeDuplicates(vector<T>& vec){
119 unordered_set<T> seen;
121 auto newEnd = remove_if(vec.begin(), vec.end(), [&seen](
const T& value)
123 if (seen.find(value) != end(seen))
130 vec.erase(newEnd, vec.end());
139static arma::vec colSums(
const arma::mat &X){
142 arma::rowvec ONE(X.n_rows, arma::fill::ones);
145 arma::mat tmp = ONE * X;
147 return arma::conv_to<arma::vec>::from( tmp );
158static void standardize( arma::mat &X,
const bool ¢er =
true,
const bool &scale =
true,
const double tol = 1e-10 ){
160 double sqrt_rdf = sqrt(X.n_rows - 1.0);
167 for(
size_t j=0; j<X.n_cols; j++){
168 if( center ) X.col(j) -= mean(X.col(j));
169 double sd = norm(X.col(j)) / sqrt_rdf;
170 if( scale && sd > tol ) X.col(j) /= sd;
175static void standardize( Eigen::MatrixXd &X,
const bool ¢er =
true,
const bool &scale =
true,
const double tol = 1e-10 ){
177 double sqrt_rdf = sqrt(X.rows() - 1.0);
178 if(center) X.rowwise() -= X.colwise().mean();
181 for(
size_t j=0; j<X.cols(); j++) {
182 double sd = X.col(j).norm() / sqrt_rdf ;
183 if(sd > tol) X.col(j) /= sd;
189static void standardize_rows( Eigen::MatrixXd &X,
const bool ¢er =
true,
const bool &scale =
true,
const double tol = 1e-10 ){
191 double sqrt_rdf = sqrt(X.cols() - 1.0);
192 if(center) X.colwise() -= X.rowwise().mean();
195 for(
size_t j=0; j<X.rows(); j++) {
196 double sd = X.row(j).norm() / sqrt_rdf ;
197 if(sd > tol) X.row(j) /= sd;
205static bool isOnlyDigits(
const std::string& s){
206 int n = count_if(s.begin(), s.end(),
207 [](
unsigned char c){ return isdigit(c); }
209 return( n == s.size());
215static void nanToMean( arma::vec & v){
217 arma::uvec idx = arma::find_finite(v);
220 if( idx.n_elem < v.n_elem ){
222 double mu = arma::mean( v.elem(idx));
225 v.replace(arma::datum::nan, mu);
247 case VCF:
return "vcf";
248 case VCFGZ:
return "vcf.gz";
249 case BCF:
return "bcf";
250 case BGEN:
return "bgen";
251 case PGEN:
return "pgen";
252 case PBED:
return "bed";
253 case OTHER:
return "other";
254 default:
return "other";
262static FileType getFileType(
const string &file ){
266 if( regex_search( file, regex(
"\\.vcf$")) ){
268 }
else if( regex_search( file, regex(
"\\.vcf\\.gz$")) ){
270 }
else if( regex_search( file, regex(
"\\.bcf$")) ) {
272 }
else if( regex_search( file, regex(
"\\.bgen$")) ){
274 }
else if( regex_search( file, regex(
"\\.pgen$")) ){
276 }
if( regex_search( file, regex(
"\\.bed$")) ){
288static vector<T> cast_elements(
const vector<string> &v ){
289 vector<T> output(0, v.size());
292 stringstream parser(s);
303static vector<int> stoi_vec(
const vector<string> &v ){
306 output.reserve(v.size());
308 for (
const string& s : v) {
309 output.push_back(std::stoi(s));
321static vector<string> splitRegionString(
string regionString){
323 vector<string> regions;
327 boost::erase_all(regionString,
" ");
328 boost::split(regions, regionString, boost::is_any_of(
"\t,\n"));
331 removeDuplicates( regions );
339template<
typename T,
typename T2>
340static vector<T> subset_vector(
const vector<T> &x,
const vector<T2> &idx){
344 x_subset.reserve(idx.size());
348 throw std::out_of_range(
"Index is out of bounds");
350 x_subset.push_back( x[i] );
358static void print_vec(
const string &title,
const vector<T> &x){
359 Rcpp::Rcout << title <<
"\n";
360 for(
auto a: x) Rcpp::Rcout << a <<
" ";
368std::vector<std::vector<T>>
chunk_vector(
const std::vector<T>& vec,
int k) {
369 std::vector<std::vector<T> > result;
370 int size = vec.size();
371 int chunk_size = size / k;
372 int remainder = size % k;
375 for (
int i = 0; i < k; ++i) {
376 int current_chunk_size = chunk_size;
378 current_chunk_size++;
380 std::vector<T> chunk(vec.begin() + start, vec.begin() + start + current_chunk_size);
381 result.push_back(chunk);
382 start += current_chunk_size;
390static vector<unsigned int> which_in(
const vector<T> &v1,
const vector<T> &v2){
393 unordered_map<T, bool> v2_elements;
395 v2_elements[x] =
true;
399 vector<unsigned int> shared_indices;
400 for (
int i = 0; i < v1.size(); ++i) {
402 if (v2_elements.count(v1[i])) {
404 shared_indices.push_back(i);
408 return shared_indices;
Definition bgenstream.h:33
FileType
Definition utils.h:232
@ OTHER
Definition utils.h:239
@ BGEN
Definition utils.h:236
@ BCF
Definition utils.h:235
@ PBED
Definition utils.h:238
@ PGEN
Definition utils.h:237
@ VCFGZ
Definition utils.h:234
@ VCF
Definition utils.h:233
std::vector< std::vector< T > > chunk_vector(const std::vector< T > &vec, int k)
Definition utils.h:368