13#include <Eigen/Sparse>
18#include "genfile/bgen/View.hpp"
19#include "genfile/bgen/IndexQuery.hpp"
21#include <boost/algorithm/string.hpp>
31using namespace genfile::bgen;
39static genfile::bgen::View::UniquePtr construct_view(
40 const string & filename) {
42 if( ! filesystem::exists( filename ) ){
43 throw runtime_error(
"File does not exist: " + filename);
46 View::UniquePtr view = genfile::bgen::View::create( filename ) ;
51static IndexQuery::UniquePtr construct_query(
const string &index_filename){
53 if( ! filesystem::exists( index_filename ) ){
54 throw runtime_error(
"File does not exist: " + index_filename);
57 IndexQuery::UniquePtr query = IndexQuery::create( index_filename ) ;
70static genfile::bgen::View::UniquePtr construct_view(
71 const string & filename,
72 const string & index_filename,
77 View::UniquePtr view = construct_view( filename );
81 IndexQuery::UniquePtr query = construct_query(index_filename);
82 for(
int i = 0; i < gr.
size(); i++ ) {
87 view->set_query( query ) ;
93static std::shared_ptr<VariantSet> getVariantSet(
94 const string & index_filename){
97 SqliteIndexQuery query(index_filename);
99 std::vector<std::string> rsid, chrom;
100 std::vector<int> position;
103 query.get_variant_info(&rsid, &chrom, &position);
109 return std::make_shared<VariantSet>(vs);
130 view = construct_view(
param.file ) ;
142 if(
param.samples.compare(
"-") == 0 ){
143 get_all_samples( *view, &number_of_samples, &sampleNames, &requestedSamplesByIndexInDataIndex ) ;
147 vector<string> requestedSamples;
150 boost::split(requestedSamples,
param.samples, boost::is_any_of(
"\t,\n"));
152 get_requested_samples( *view, requestedSamples, &number_of_samples, &sampleNames, &requestedSamplesByIndexInDataIndex ) ;
158 probs.reserve(
n_samples() *
param.chunkSize * max_entries_per_sample);
167 if( vInfo !=
nullptr)
delete vInfo;
181 varIdx = vs->getIndeces( gr );
184 vector<string> variantIDs = vs->getVariantIDs(varIdx);
187 query->include_rsids( variantIDs );
188 query->initialise() ;
189 view->set_query( query ) ;
193 n_variants_total = view->number_of_variants() ;
196 variant_idx_start = 0;
202 return number_of_samples;
208 return vInfo->sampleNames;
214 return toString(
param.fileType);
218 return vs->getChromRanges();
224 bool ret = getNextChunk_helper();
231 arma::mat M(matDosage.data(), number_of_samples, vInfo->size(),
false,
true);
240 bool ret = getNextChunk_helper();
247 arma::mat M(matDosage.data(), number_of_samples, vInfo->size(),
false,
true);
254 #ifndef DISABLE_EIGEN
258 bool ret = getNextChunk_helper();
265 Eigen::MatrixXd M = Eigen::Map<Eigen::MatrixXd>(matDosage.data(), number_of_samples, vInfo->size());
276 bool ret = getNextChunk_helper();
283 Eigen::MatrixXd M = Eigen::Map<Eigen::MatrixXd>(matDosage.data(), number_of_samples, vInfo->size());
295 bool ret = getNextChunk_helper();
302 Rcpp::NumericMatrix M(number_of_samples, vInfo->size(), matDosage.data());
303 colnames(M) = Rcpp::wrap( vInfo->getFeatureNames() );
304 rownames(M) = Rcpp::wrap( vInfo->sampleNames );
315 bool ret = getNextChunk_helper();
328 View::UniquePtr view =
nullptr;
330 std::shared_ptr<VariantSet> vs;
331 size_t number_of_samples = 0;
332 vector<string> sampleNames;
333 map<size_t, size_t> requestedSamplesByIndexInDataIndex;
335 vector<double> probs;
336 vector<double> matDosage;
337 size_t max_entries_per_sample = 4;
338 int n_variants_total;
339 int variant_idx_start;
342 bool getNextChunk_helper(){
349 int chunkSize = min(
param.
chunkSize, n_variants_total - variant_idx_start);
350 chunkSize = max(chunkSize, 0);
353 if( chunkSize == 0)
return false;
355 vector<int> data_dimension;
356 data_dimension.push_back(chunkSize);
357 data_dimension.push_back(number_of_samples);
358 data_dimension.push_back(max_entries_per_sample);
360 vector<int> ploidy_dimension;
361 ploidy_dimension.push_back( chunkSize ) ;
362 ploidy_dimension.push_back( number_of_samples ) ;
364 vector<int> ploidy(ploidy_dimension[0]*ploidy_dimension[1], numeric_limits<int>::quiet_NaN());
365 vector<bool> phased( chunkSize, numeric_limits<bool>::quiet_NaN() ) ;
367 string SNPID, rsid, chromosome;
368 genfile::bgen::uint32_t position;
369 vector<string> alleles;
373 for(
size_t j = variant_idx_start; j < variant_idx_start + chunkSize; j++ ) {
376 view->read_variant( &SNPID, &rsid, &chromosome, &position, &alleles ) ;
379 vInfo->
addVariant(chromosome, position, rsid, alleles[0], alleles[1] );
383 &ploidy, ploidy_dimension,
384 &probs, data_dimension,
387 requestedSamplesByIndexInDataIndex
390 view->read_genotype_data_block( setter ) ;
393 variant_idx_start += chunkSize;
399 cube C(probs.data(), chunkSize, number_of_samples, max_entries_per_sample,
true,
true);
418 vec w_unph = {0,1,2};
419 vec w_ph = {0,1,0,1};
424 for(
int j=0; j<chunkSize; j++){
426 dsg = C.row_as_mat(j).t() * w_ph;
430 m = C.row_as_mat(j).t();
431 dsg = m.cols(0,2) * w_unph;
435 if(
param.missingToMean ) nanToMean( dsg );
438 memcpy(matDosage.data() + number_of_samples*j, dsg.memptr(), number_of_samples*
sizeof(
double));
Definition GenomicDataStream_virtual.h:34
double getMinVariance()
Definition GenomicDataStream_virtual.h:200
double getMAF() const
Definition GenomicDataStream_virtual.h:206
Param param
Definition GenomicDataStream_virtual.h:254
GenomicDataStream()
Definition GenomicDataStream_virtual.h:168
Definition GenomicRanges.h:25
vector< size_t > get_end() const
Definition GenomicRanges.h:55
vector< string > get_chrom() const
Definition GenomicRanges.h:53
vector< size_t > get_start() const
Definition GenomicRanges.h:54
const int size() const
Definition GenomicRanges.h:61
Definition VariantInfo.h:24
void clear()
Definition VariantInfo.h:137
void addVariant(const string &chr, const int &pos, const string &id, const string &allele1, const string &allele2)
Definition VariantInfo.h:38
Definition VariantSet.h:45
bool getNextChunk(DataChunk< Rcpp::NumericMatrix > &chunk, const bool &useFilter=true) override
Definition bgenstream.h:292
bool getNextChunk(DataChunk< arma::mat > &chunk, const bool &useFilter=true) override
Definition bgenstream.h:221
int n_samples() override
Definition bgenstream.h:201
bool getNextChunk(DataChunk< vector< double > > &chunk, const bool &useFilter=true) override
Definition bgenstream.h:312
bgenstream()
Definition bgenstream.h:123
bool getNextChunk(DataChunk< Eigen::MatrixXd > &chunk, const bool &useFilter=true) override
Definition bgenstream.h:255
bool getNextChunk(DataChunk< Eigen::SparseMatrix< double > > &chunk, const bool &useFilter=true) override
Definition bgenstream.h:273
GenomicRanges getChromRanges() override
Definition bgenstream.h:217
string getStreamType() override
Definition bgenstream.h:213
~bgenstream()
Definition bgenstream.h:166
bool getNextChunk(DataChunk< arma::sp_mat > &chunk, const bool &useFilter=true) override
Definition bgenstream.h:237
void setRegions(const vector< string > ®ions) override
Definition bgenstream.h:172
string filenameIdxGlobal
Definition bgenstream.h:121
bgenstream(const Param ¶m)
Definition bgenstream.h:127
vector< string > getSampleNames() override
Definition bgenstream.h:207
Definition bgenstream.h:33
Definition GenomicDataStream_virtual.h:65
int chunkSize
Definition GenomicDataStream_virtual.h:157