GenomicDataStream
A scalable interface between data and analysis
Loading...
Searching...
No Matches
utils.h
Go to the documentation of this file.
1
2#include <armadillo>
3
4#ifndef DISABLE_EIGEN
5#include <Eigen/Core>
6#endif
7
8#include <vector>
9#include <random>
10#include <algorithm>
11#include <unordered_set>
12#include <regex>
13
14#include <boost/algorithm/string.hpp>
15
16#ifndef UTILS_H_
17#define UTILS_H_
18
19namespace gds {
20
25
26template<typename T>
27static vector<double> GP_to_dosage( const vector<T> &v, const bool &missingToMean) {
28 vector<double> res( v.size() / 3.0);
29 vector<int> missing;
30
31 // initialize
32 int runningSum = 0, nValid = 0;
33 double value;
34
35 // for each entry in result
36 // use two adjacent values
37 for(int i=0; i<res.size(); i++){
38 // compute dosage from genotype probabilties
39 value = v[3*i]*0 + v[3*i+1]*1 + v[3*i+2]*2;
40
41 // -9 is the missing value, so -18 is diploid
42 if( value == -18){
43 // if missing, set to NaN
44 value = std::numeric_limits<double>::quiet_NaN();
45 missing.push_back(i);
46 }else{
47 // for computing mean
48 runningSum += value;
49 nValid++;
50 }
51
52 // set dosage value
53 res[i] = value;
54 }
55
56 // mean excluding NaNs
57 double mu = runningSum / (double) nValid;
58
59 // if missing values should be set to mean
60 if( missingToMean ){
61 // for each entry with a missing value, set to mean
62 for(const int& i : missing) res[i] = mu;
63 }
64
65 return res;
66}
67
72static vector<double> intToDosage( const vector<int> &v, const bool &missingToMean) {
73
74 // store and return result
75 vector<double> res( v.size() / 2.0);
76 vector<int> missing;
77
78 // initialize
79 int runningSum = 0, nValid = 0;
80 double value;
81
82 // for each entry in result
83 // use two adjacent values
84 for(int i=0; i<res.size(); i++){
85 value = v[2*i] + v[2*i+1];
86
87 // -9 is the missing value, so -18 is diploid
88 if( value == -18){
89 // if missing, set to NaN
90 value = std::numeric_limits<double>::quiet_NaN();
91 missing.push_back(i);
92 }else{
93 // for computing mean
94 runningSum += value;
95 nValid++;
96 }
97
98 // set dosage value
99 res[i] = value;
100 }
101
102 // mean excluding NaNs
103 double mu = runningSum / (double) nValid;
104
105 // if missing values should be set to mean
106 if( missingToMean ){
107 // for each entry with a missing value, set to mean
108 for(const int& i : missing) res[i] = mu;
109 }
110
111 return res;
112}
113
117template<typename T>
118static size_t removeDuplicates(vector<T>& vec){
119 unordered_set<T> seen;
120
121 auto newEnd = remove_if(vec.begin(), vec.end(), [&seen](const T& value)
122 {
123 if (seen.find(value) != end(seen))
124 return true;
125
126 seen.insert(value);
127 return false;
128 });
129
130 vec.erase(newEnd, vec.end());
131
132 return vec.size();
133}
134
135
139static arma::vec colSums( const arma::mat &X){
140
141 // row vector of 1's
142 arma::rowvec ONE(X.n_rows, arma::fill::ones);
143
144 // matrix multiplication to get sums
145 arma::mat tmp = ONE * X;
146
147 return arma::conv_to<arma::vec>::from( tmp );
148}
149
150
158static void standardize( arma::mat &X, const bool &center = true, const bool &scale = true, const double tol = 1e-10 ){
159
160 double sqrt_rdf = sqrt(X.n_rows - 1.0);
161
162 // if center, subtract mean of each column
163 // if scale, divide by sd of each column
164 // Note, norm() does not center the column
165 // this give results consistent with base::scale()
166 // when scale is FALSE
167 for(size_t j=0; j<X.n_cols; j++){
168 if( center ) X.col(j) -= mean(X.col(j));
169 double sd = norm(X.col(j)) / sqrt_rdf;
170 if( scale && sd > tol ) X.col(j) /= sd;
171 }
172}
173
174#ifndef DISABLE_EIGEN
175static void standardize( Eigen::MatrixXd &X, const bool &center = true, const bool &scale = true, const double tol = 1e-10 ){
176
177 double sqrt_rdf = sqrt(X.rows() - 1.0);
178 if(center) X.rowwise() -= X.colwise().mean(); // centering
179 if(scale) {
180 // if X is centered, then we can convert norm to sd
181 for(size_t j=0; j<X.cols(); j++) {
182 double sd = X.col(j).norm() / sqrt_rdf ; // sd
183 if(sd > tol) X.col(j) /= sd;
184 }
185 }
186}
187
188
189static void standardize_rows( Eigen::MatrixXd &X, const bool &center = true, const bool &scale = true, const double tol = 1e-10 ){
190
191 double sqrt_rdf = sqrt(X.cols() - 1.0);
192 if(center) X.colwise() -= X.rowwise().mean(); // centering
193 if(scale) {
194 // if X is centered, then we can convert norm to sd
195 for(size_t j=0; j<X.rows(); j++) {
196 double sd = X.row(j).norm() / sqrt_rdf ; // sd
197 if(sd > tol) X.row(j) /= sd;
198 }
199 }
200}
201#endif
202
205static bool isOnlyDigits(const std::string& s){
206 int n = count_if(s.begin(), s.end(),
207 [](unsigned char c){ return isdigit(c); }
208 );
209 return( n == s.size());
210}
211
212
215static void nanToMean( arma::vec & v){
216 // get indeces of finite elements
217 arma::uvec idx = arma::find_finite(v);
218
219 // if number of finite elements is less than the total
220 if( idx.n_elem < v.n_elem ){
221 // compute mean from finite elements
222 double mu = arma::mean( v.elem(idx));
223
224 // replace nan with mu
225 v.replace(arma::datum::nan, mu);
226 }
227}
228
229
241
244static string toString( FileType x){
245
246 switch(x){
247 case VCF: return "vcf";
248 case VCFGZ: return "vcf.gz";
249 case BCF: return "bcf";
250 case BGEN: return "bgen";
251 case PGEN: return "pgen";
252 case PBED: return "bed";
253 case OTHER: return "other";
254 default: return "other";
255 }
256}
257
258
259
262static FileType getFileType( const string &file ){
263
264 FileType ft = OTHER;
265
266 if( regex_search( file, regex("\\.vcf$")) ){
267 ft = VCF;
268 }else if( regex_search( file, regex("\\.vcf\\.gz$")) ){
269 ft = VCFGZ;
270 }else if( regex_search( file, regex("\\.bcf$")) ) {
271 ft = BCF;
272 }else if( regex_search( file, regex("\\.bgen$")) ){
273 ft = BGEN;
274 }else if( regex_search( file, regex("\\.pgen$")) ){
275 ft = PGEN;
276 } if( regex_search( file, regex("\\.bed$")) ){
277 ft = PBED;
278 }
279
280 return ft;
281}
282
283
284
287template<typename T>
288static vector<T> cast_elements( const vector<string> &v ){
289 vector<T> output(0, v.size());
290
291 for (auto &s : v) {
292 stringstream parser(s);
293 T x = 0;
294 parser >> x;
295 output.push_back(x);
296 }
297 return output;
298}
299
300
303static vector<int> stoi_vec( const vector<string> &v ){
304
305 vector<int> output;
306 output.reserve(v.size());
307
308 for (const string& s : v) {
309 output.push_back(std::stoi(s));
310 }
311 return output;
312}
313
314
315
321static vector<string> splitRegionString( string regionString){
322
323 vector<string> regions;
324
325 // regionString is string of chr:start-end delim by "\t,\n"
326 // remove spaces, then split based on delim
327 boost::erase_all(regionString, " ");
328 boost::split(regions, regionString, boost::is_any_of("\t,\n"));
329
330 // remove duplicate regions, but preserve order
331 removeDuplicates( regions );
332
333 return regions;
334}
335
336
339template<typename T, typename T2>
340static vector<T> subset_vector(const vector<T> &x, const vector<T2> &idx){
341
342 // initialize x_subset to have size idx.size()
343 vector<T> x_subset;
344 x_subset.reserve(idx.size());
345
346 for(auto i: idx){
347 if( i > x.size() ){
348 throw std::out_of_range("Index is out of bounds");
349 }
350 x_subset.push_back( x[i] );
351 }
352
353 return x_subset;
354}
355
356
357template<typename T>
358static void print_vec(const string &title, const vector<T> &x){
359 Rcpp::Rcout << title << "\n";
360 for(auto a: x) Rcpp::Rcout << a << " ";
361 Rcpp::Rcout << endl;
362}
363
364
367template<typename T>
368std::vector<std::vector<T>> chunk_vector(const std::vector<T>& vec, int k) {
369 std::vector<std::vector<T> > result;
370 int size = vec.size();
371 int chunk_size = size / k;
372 int remainder = size % k;
373
374 int start = 0;
375 for (int i = 0; i < k; ++i) {
376 int current_chunk_size = chunk_size;
377 if (i < remainder) {
378 current_chunk_size++;
379 }
380 std::vector<T> chunk(vec.begin() + start, vec.begin() + start + current_chunk_size);
381 result.push_back(chunk);
382 start += current_chunk_size;
383 }
384 return result;
385}
386
389template<typename T>
390static vector<unsigned int> which_in( const vector<T> &v1, const vector<T> &v2){
391
392 // Create a hash map (unordered_map) of elements in v2 for efficient lookups
393 unordered_map<T, bool> v2_elements;
394 for (auto &x : v2) {
395 v2_elements[x] = true;
396 }
397
398 // Iterate through v1 and check if the element exists in the hash map
399 vector<unsigned int> shared_indices;
400 for (int i = 0; i < v1.size(); ++i) {
401 // Check if the element exists in the map
402 if (v2_elements.count(v1[i])) {
403 // Add the index from v1
404 shared_indices.push_back(i);
405 }
406 }
407
408 return shared_indices;
409}
410
411
412
413
414
415
416} // end namespace
417#endif
Definition bgenstream.h:33
FileType
Definition utils.h:232
@ OTHER
Definition utils.h:239
@ BGEN
Definition utils.h:236
@ BCF
Definition utils.h:235
@ PBED
Definition utils.h:238
@ PGEN
Definition utils.h:237
@ VCFGZ
Definition utils.h:234
@ VCF
Definition utils.h:233
std::vector< std::vector< T > > chunk_vector(const std::vector< T > &vec, int k)
Definition utils.h:368