GenomicDataStream
A scalable interface between data and analysis
Loading...
Searching...
No Matches
utils.h
Go to the documentation of this file.
1
2#include <RcppArmadillo.h>
3// [[Rcpp::depends(RcppArmadillo)]]
4
5#ifndef DISABLE_EIGEN
6#include <Eigen/Core>
7#endif
8
9#include <vector>
10#include <random>
11#include <algorithm>
12#include <unordered_set>
13#include <regex>
14
15#include <boost/algorithm/string.hpp>
16
17#ifndef UTILS_H_
18#define UTILS_H_
19
20namespace gds {
21
26
27template<typename T>
28static vector<double> GP_to_dosage( const vector<T> &v, const bool &missingToMean) {
29 vector<double> res( v.size() / 3.0);
30 vector<int> missing;
31
32 // initialize
33 int runningSum = 0, nValid = 0;
34 double value;
35
36 // for each entry in result
37 // use two adjacent values
38 for(int i=0; i<res.size(); i++){
39 // compute dosage from genotype probabilties
40 value = v[3*i]*0 + v[3*i+1]*1 + v[3*i+2]*2;
41
42 // -9 is the missing value, so -18 is diploid
43 if( value == -18){
44 // if missing, set to NaN
45 value = std::numeric_limits<double>::quiet_NaN();
46 missing.push_back(i);
47 }else{
48 // for computing mean
49 runningSum += value;
50 nValid++;
51 }
52
53 // set dosage value
54 res[i] = value;
55 }
56
57 // mean excluding NaNs
58 double mu = runningSum / (double) nValid;
59
60 // if missing values should be set to mean
61 if( missingToMean ){
62 // for each entry with a missing value, set to mean
63 for(const int& i : missing) res[i] = mu;
64 }
65
66 return res;
67}
68
73static vector<double> intToDosage( const vector<int> &v, const bool &missingToMean) {
74
75 // store and return result
76 vector<double> res( v.size() / 2.0);
77 vector<int> missing;
78
79 // initialize
80 int runningSum = 0, nValid = 0;
81 double value;
82
83 // for each entry in result
84 // use two adjacent values
85 for(int i=0; i<res.size(); i++){
86 value = v[2*i] + v[2*i+1];
87
88 // -9 is the missing value, so -18 is diploid
89 if( value == -18){
90 // if missing, set to NaN
91 value = std::numeric_limits<double>::quiet_NaN();
92 missing.push_back(i);
93 }else{
94 // for computing mean
95 runningSum += value;
96 nValid++;
97 }
98
99 // set dosage value
100 res[i] = value;
101 }
102
103 // mean excluding NaNs
104 double mu = runningSum / (double) nValid;
105
106 // if missing values should be set to mean
107 if( missingToMean ){
108 // for each entry with a missing value, set to mean
109 for(const int& i : missing) res[i] = mu;
110 }
111
112 return res;
113}
114
118template<typename T>
119static size_t removeDuplicates(vector<T>& vec){
120 unordered_set<T> seen;
121
122 auto newEnd = remove_if(vec.begin(), vec.end(), [&seen](const T& value)
123 {
124 if (seen.find(value) != end(seen))
125 return true;
126
127 seen.insert(value);
128 return false;
129 });
130
131 vec.erase(newEnd, vec.end());
132
133 return vec.size();
134}
135
136
140static arma::vec colSums( const arma::mat &X){
141
142 // row vector of 1's
143 arma::rowvec ONE(X.n_rows, arma::fill::ones);
144
145 // matrix multiplication to get sums
146 arma::mat tmp = ONE * X;
147
148 return arma::conv_to<arma::vec>::from( tmp );
149}
150
151
159static void standardize( arma::mat &X, const bool &center = true, const bool &scale = true, const double tol = 1e-10 ){
160
161 double sqrt_rdf = sqrt(X.n_rows - 1.0);
162
163 // if center, subtract mean of each column
164 // if scale, divide by sd of each column
165 // Note, norm() does not center the column
166 // this give results consistent with base::scale()
167 // when scale is FALSE
168 for(size_t j=0; j<X.n_cols; j++){
169 if( center ) X.col(j) -= mean(X.col(j));
170 double sd = norm(X.col(j)) / sqrt_rdf;
171 if( scale && sd > tol ) X.col(j) /= sd;
172 }
173}
174
175#ifndef DISABLE_EIGEN
176static void standardize( Eigen::MatrixXd &X, const bool &center = true, const bool &scale = true, const double tol = 1e-10 ){
177
178 double sqrt_rdf = sqrt(X.rows() - 1.0);
179 if(center) X.rowwise() -= X.colwise().mean(); // centering
180 if(scale) {
181 // if X is centered, then we can convert norm to sd
182 for(size_t j=0; j<X.cols(); j++) {
183 double sd = X.col(j).norm() / sqrt_rdf ; // sd
184 if(sd > tol) X.col(j) /= sd;
185 }
186 }
187}
188
189
190static void standardize_rows( Eigen::MatrixXd &X, const bool &center = true, const bool &scale = true, const double tol = 1e-10 ){
191
192 double sqrt_rdf = sqrt(X.cols() - 1.0);
193 if(center) X.colwise() -= X.rowwise().mean(); // centering
194 if(scale) {
195 // if X is centered, then we can convert norm to sd
196 for(size_t j=0; j<X.rows(); j++) {
197 double sd = X.row(j).norm() / sqrt_rdf ; // sd
198 if(sd > tol) X.row(j) /= sd;
199 }
200 }
201}
202#endif
203
206static bool isOnlyDigits(const std::string& s){
207 int n = count_if(s.begin(), s.end(),
208 [](unsigned char c){ return isdigit(c); }
209 );
210 return( n == s.size());
211}
212
213
216static void nanToMean( arma::vec & v){
217 // get indeces of finite elements
218 arma::uvec idx = arma::find_finite(v);
219
220 // if number of finite elements is less than the total
221 if( idx.n_elem < v.n_elem ){
222 // compute mean from finite elements
223 double mu = arma::mean( v.elem(idx));
224
225 // replace nan with mu
226 v.replace(arma::datum::nan, mu);
227 }
228}
229
230
242
245static string toString( FileType x){
246
247 switch(x){
248 case VCF: return "vcf";
249 case VCFGZ: return "vcf.gz";
250 case BCF: return "bcf";
251 case BGEN: return "bgen";
252 case PGEN: return "pgen";
253 case PBED: return "bed";
254 case OTHER: return "other";
255 default: return "other";
256 }
257}
258
259
260
263static FileType getFileType( const string &file ){
264
265 FileType ft = OTHER;
266
267 if( regex_search( file, regex("\\.vcf$")) ){
268 ft = VCF;
269 }else if( regex_search( file, regex("\\.vcf\\.gz$")) ){
270 ft = VCFGZ;
271 }else if( regex_search( file, regex("\\.bcf$")) ) {
272 ft = BCF;
273 }else if( regex_search( file, regex("\\.bgen$")) ){
274 ft = BGEN;
275 }else if( regex_search( file, regex("\\.pgen$")) ){
276 ft = PGEN;
277 } if( regex_search( file, regex("\\.bed$")) ){
278 ft = PBED;
279 }
280
281 return ft;
282}
283
284
285
288template<typename T>
289static vector<T> cast_elements( const vector<string> &v ){
290 vector<T> output(0, v.size());
291
292 for (auto &s : v) {
293 stringstream parser(s);
294 T x = 0;
295 parser >> x;
296 output.push_back(x);
297 }
298 return output;
299}
300
301
304static vector<int> stoi_vec( const vector<string> &v ){
305
306 vector<int> output;
307 output.reserve(v.size());
308
309 for (const string& s : v) {
310 output.push_back(std::stoi(s));
311 }
312 return output;
313}
314
315
316
322static vector<string> splitRegionString( string regionString){
323
324 vector<string> regions;
325
326 // regionString is string of chr:start-end delim by "\t,\n"
327 // remove spaces, then split based on delim
328 boost::erase_all(regionString, " ");
329 boost::split(regions, regionString, boost::is_any_of("\t,\n"));
330
331 // remove duplicate regions, but preserve order
332 removeDuplicates( regions );
333
334 return regions;
335}
336
337
340template<typename T, typename T2>
341static vector<T> subset_vector(const vector<T> &x, const vector<T2> &idx){
342
343 // initialize x_subset to have size idx.size()
344 vector<T> x_subset;
345 x_subset.reserve(idx.size());
346
347 for(auto i: idx){
348 if( i > x.size() ){
349 throw std::out_of_range("Index is out of bounds");
350 }
351 x_subset.push_back( x[i] );
352 }
353
354 return x_subset;
355}
356
357
358template<typename T>
359static void print_vec(const string &title, const vector<T> &x){
360 Rcpp::Rcout << title << "\n";
361 for(auto a: x) Rcpp::Rcout << a << " ";
362 Rcpp::Rcout << endl;
363}
364
365
368template<typename T>
369std::vector<std::vector<T>> chunk_vector(const std::vector<T>& vec, int k) {
370 std::vector<std::vector<T> > result;
371 int size = vec.size();
372 int chunk_size = size / k;
373 int remainder = size % k;
374
375 int start = 0;
376 for (int i = 0; i < k; ++i) {
377 int current_chunk_size = chunk_size;
378 if (i < remainder) {
379 current_chunk_size++;
380 }
381 std::vector<T> chunk(vec.begin() + start, vec.begin() + start + current_chunk_size);
382 result.push_back(chunk);
383 start += current_chunk_size;
384 }
385 return result;
386}
387
390template<typename T>
391static vector<unsigned int> which_in( const vector<T> &v1, const vector<T> &v2){
392
393 // Create a hash map (unordered_map) of elements in v2 for efficient lookups
394 unordered_map<T, bool> v2_elements;
395 for (auto &x : v2) {
396 v2_elements[x] = true;
397 }
398
399 // Iterate through v1 and check if the element exists in the hash map
400 vector<unsigned int> shared_indices;
401 for (int i = 0; i < v1.size(); ++i) {
402 // Check if the element exists in the map
403 if (v2_elements.count(v1[i])) {
404 // Add the index from v1
405 shared_indices.push_back(i);
406 }
407 }
408
409 return shared_indices;
410}
411
412
413
414
415
416
417} // end namespace
418#endif
Definition bgenstream.h:33
FileType
Definition utils.h:233
@ OTHER
Definition utils.h:240
@ BGEN
Definition utils.h:237
@ BCF
Definition utils.h:236
@ PBED
Definition utils.h:239
@ PGEN
Definition utils.h:238
@ VCFGZ
Definition utils.h:235
@ VCF
Definition utils.h:234
std::vector< std::vector< T > > chunk_vector(const std::vector< T > &vec, int k)
Definition utils.h:369