2#include <RcppArmadillo.h>
12#include <unordered_set>
15#include <boost/algorithm/string.hpp>
28static vector<double> GP_to_dosage(
const vector<T> &v,
const bool &missingToMean) {
29 vector<double> res( v.size() / 3.0);
33 int runningSum = 0, nValid = 0;
38 for(
int i=0; i<res.size(); i++){
40 value = v[3*i]*0 + v[3*i+1]*1 + v[3*i+2]*2;
45 value = std::numeric_limits<double>::quiet_NaN();
58 double mu = runningSum / (double) nValid;
63 for(
const int& i : missing) res[i] = mu;
73static vector<double> intToDosage(
const vector<int> &v,
const bool &missingToMean) {
76 vector<double> res( v.size() / 2.0);
80 int runningSum = 0, nValid = 0;
85 for(
int i=0; i<res.size(); i++){
86 value = v[2*i] + v[2*i+1];
91 value = std::numeric_limits<double>::quiet_NaN();
104 double mu = runningSum / (double) nValid;
109 for(
const int& i : missing) res[i] = mu;
119static size_t removeDuplicates(vector<T>& vec){
120 unordered_set<T> seen;
122 auto newEnd = remove_if(vec.begin(), vec.end(), [&seen](
const T& value)
124 if (seen.find(value) != end(seen))
131 vec.erase(newEnd, vec.end());
140static arma::vec colSums(
const arma::mat &X){
143 arma::rowvec ONE(X.n_rows, arma::fill::ones);
146 arma::mat tmp = ONE * X;
148 return arma::conv_to<arma::vec>::from( tmp );
159static void standardize( arma::mat &X,
const bool ¢er =
true,
const bool &scale =
true,
const double tol = 1e-10 ){
161 double sqrt_rdf = sqrt(X.n_rows - 1.0);
168 for(
size_t j=0; j<X.n_cols; j++){
169 if( center ) X.col(j) -= mean(X.col(j));
170 double sd = norm(X.col(j)) / sqrt_rdf;
171 if( scale && sd > tol ) X.col(j) /= sd;
176static void standardize( Eigen::MatrixXd &X,
const bool ¢er =
true,
const bool &scale =
true,
const double tol = 1e-10 ){
178 double sqrt_rdf = sqrt(X.rows() - 1.0);
179 if(center) X.rowwise() -= X.colwise().mean();
182 for(
size_t j=0; j<X.cols(); j++) {
183 double sd = X.col(j).norm() / sqrt_rdf ;
184 if(sd > tol) X.col(j) /= sd;
190static void standardize_rows( Eigen::MatrixXd &X,
const bool ¢er =
true,
const bool &scale =
true,
const double tol = 1e-10 ){
192 double sqrt_rdf = sqrt(X.cols() - 1.0);
193 if(center) X.colwise() -= X.rowwise().mean();
196 for(
size_t j=0; j<X.rows(); j++) {
197 double sd = X.row(j).norm() / sqrt_rdf ;
198 if(sd > tol) X.row(j) /= sd;
206static bool isOnlyDigits(
const std::string& s){
207 int n = count_if(s.begin(), s.end(),
208 [](
unsigned char c){ return isdigit(c); }
210 return( n == s.size());
216static void nanToMean( arma::vec & v){
218 arma::uvec idx = arma::find_finite(v);
221 if( idx.n_elem < v.n_elem ){
223 double mu = arma::mean( v.elem(idx));
226 v.replace(arma::datum::nan, mu);
248 case VCF:
return "vcf";
249 case VCFGZ:
return "vcf.gz";
250 case BCF:
return "bcf";
251 case BGEN:
return "bgen";
252 case PGEN:
return "pgen";
253 case PBED:
return "bed";
254 case OTHER:
return "other";
255 default:
return "other";
263static FileType getFileType(
const string &file ){
267 if( regex_search( file, regex(
"\\.vcf$")) ){
269 }
else if( regex_search( file, regex(
"\\.vcf\\.gz$")) ){
271 }
else if( regex_search( file, regex(
"\\.bcf$")) ) {
273 }
else if( regex_search( file, regex(
"\\.bgen$")) ){
275 }
else if( regex_search( file, regex(
"\\.pgen$")) ){
277 }
if( regex_search( file, regex(
"\\.bed$")) ){
289static vector<T> cast_elements(
const vector<string> &v ){
290 vector<T> output(0, v.size());
293 stringstream parser(s);
304static vector<int> stoi_vec(
const vector<string> &v ){
307 output.reserve(v.size());
309 for (
const string& s : v) {
310 output.push_back(std::stoi(s));
322static vector<string> splitRegionString(
string regionString){
324 vector<string> regions;
328 boost::erase_all(regionString,
" ");
329 boost::split(regions, regionString, boost::is_any_of(
"\t,\n"));
332 removeDuplicates( regions );
340template<
typename T,
typename T2>
341static vector<T> subset_vector(
const vector<T> &x,
const vector<T2> &idx){
345 x_subset.reserve(idx.size());
349 throw std::out_of_range(
"Index is out of bounds");
351 x_subset.push_back( x[i] );
359static void print_vec(
const string &title,
const vector<T> &x){
360 Rcpp::Rcout << title <<
"\n";
361 for(
auto a: x) Rcpp::Rcout << a <<
" ";
369std::vector<std::vector<T>>
chunk_vector(
const std::vector<T>& vec,
int k) {
370 std::vector<std::vector<T> > result;
371 int size = vec.size();
372 int chunk_size = size / k;
373 int remainder = size % k;
376 for (
int i = 0; i < k; ++i) {
377 int current_chunk_size = chunk_size;
379 current_chunk_size++;
381 std::vector<T> chunk(vec.begin() + start, vec.begin() + start + current_chunk_size);
382 result.push_back(chunk);
383 start += current_chunk_size;
391static vector<unsigned int> which_in(
const vector<T> &v1,
const vector<T> &v2){
394 unordered_map<T, bool> v2_elements;
396 v2_elements[x] =
true;
400 vector<unsigned int> shared_indices;
401 for (
int i = 0; i < v1.size(); ++i) {
403 if (v2_elements.count(v1[i])) {
405 shared_indices.push_back(i);
409 return shared_indices;
Definition bgenstream.h:33
FileType
Definition utils.h:233
@ OTHER
Definition utils.h:240
@ BGEN
Definition utils.h:237
@ BCF
Definition utils.h:236
@ PBED
Definition utils.h:239
@ PGEN
Definition utils.h:238
@ VCFGZ
Definition utils.h:235
@ VCF
Definition utils.h:234
std::vector< std::vector< T > > chunk_vector(const std::vector< T > &vec, int k)
Definition utils.h:369