14#include <Eigen/Sparse>
46 if( ! filesystem::exists(
param.file ) ){
47 throw runtime_error(
"File does not exist: " + param.file);
51 if(
param.field.compare(
"") == 0 ){
52 throw runtime_error(
"Field for VCF/BCF not specified");
56 reader =
new BcfReader(
param.file );
61 reader->setSamples(
param.samples );
64 record =
new BcfRecord( reader->header );
67 fieldType = reader->header.getFormatType(
param.field );
69 if( fieldType == 3 &&
param.field.compare(
"GT"))
70 throw std::runtime_error(
"field GT is the only supported string type");
72 throw std::runtime_error(
"field not found: " +
param.field);
86 if( reader !=
nullptr)
delete reader;
87 if( record !=
nullptr)
delete record;
88 if( vInfo !=
nullptr)
delete vInfo;
93 void setRegions(
const vector<string> ®ions)
override {
95 validRegions.reserve(regions.size());
97 copy(regions.begin(), regions.end(), back_inserter(validRegions));
104 continueIterating =
false;
107 if( validRegions.size() > 0 ){
109 itReg = validRegions.begin();
111 reader->setRegion( *itReg );
113 continueIterating =
true;
120 return reader->nsamples;
126 return reader->header.getSamples();
132 return toString(
param.fileType);
142 bool ret = getNextChunk_helper();
150 bool copy_aux_mem =
false;
152 arma::mat M(matDosage.data(), reader->nsamples, vInfo->size(), copy_aux_mem,
true);
162 bool ret = getNextChunk_helper();
170 arma::mat M(matDosage.data(), reader->nsamples, vInfo->size(),
false,
true);
178 #ifndef DISABLE_EIGEN
182 bool ret = getNextChunk_helper();
189 Eigen::MatrixXd M = Eigen::Map<Eigen::MatrixXd>(matDosage.data(), reader->nsamples, vInfo->size());
199 bool ret = getNextChunk_helper();
206 Eigen::MatrixXd M = Eigen::Map<Eigen::MatrixXd>(matDosage.data(), reader->nsamples, vInfo->size());
218 bool ret = getNextChunk_helper();
225 Rcpp::NumericMatrix M(reader->nsamples, vInfo->size(), matDosage.data());
226 colnames(M) = Rcpp::wrap( vInfo->getFeatureNames() );
227 rownames(M) = Rcpp::wrap( vInfo->sampleNames );
238 bool ret = getNextChunk_helper();
255 string s = record.CHROM() +
":" + to_string(record.POS()) +
" " + record.ID() +
" " + record.REF() +
" " + record.ALT();
260 BcfReader *reader =
nullptr;
261 BcfRecord *record =
nullptr;
263 vector<string>::iterator itReg;
264 vector<string> validRegions;
266 bool continueIterating;
275 vector<int> values_int;
276 vector<float> values_fl;
284 vector<double> matDosage;
286 bool getNextChunk_helper(){
289 if( validRegions.size() == 0)
return false;
292 if( ! continueIterating )
return continueIterating;
304 if( ! reader->getNextVariant( *record ) ){
313 if( itReg == validRegions.end()){
314 continueIterating =
false;
320 reader->setRegion( *itReg );
321 reader->getNextVariant( *record );
329 record->getFORMAT(
param.field, values_int);
330 matDosage.insert(matDosage.end(), values_int.begin(), values_int.end());
333 record->getFORMAT(
param.field, values_fl);
335 if(
param.field.compare(
"DS") == 0){
337 matDosage.insert(matDosage.end(), values_fl.begin(), values_fl.end());
338 }
else if(
param.field.compare(
"GP") == 0){
340 if( record->ploidy() > 2 ){
341 throw std::runtime_error(
"GP is not supported for site with ploidy > 2\n " +
variantToString(*record));
345 vector<double> dsg = GP_to_dosage(values_fl,
param.missingToMean);
347 matDosage.insert(matDosage.end(), dsg.begin(), dsg.end());
353 if( record->isMultiAllelics() ){
354 throw std::runtime_error(
"GT is not supported for multi-allelic site\n " +
variantToString(*record));
357 if( record->ploidy() > 2 ){
358 throw std::runtime_error(
"GT is not supported for site with ploidy > 2\n " +
variantToString(*record));
362 record->getGenotypes(values_int);
363 tmp = intToDosage( values_int,
param.missingToMean );
364 matDosage.insert(matDosage.end(), tmp.begin(), tmp.end());
369 vInfo->addVariant(record->CHROM(),
379 if( vInfo->size() == 0) ret =
false;
385 void checkStatus(
const string & region){
386 switch( reader->getStatus( region ) ){
394 throw runtime_error(
"Could not retrieve index file");
398 Rcpp::Rcout <<
"region:" + region +
";\n";
399 throw runtime_error(
"region was not found: " + region );
Definition GenomicDataStream_virtual.h:34
double getMinVariance()
Definition GenomicDataStream_virtual.h:213
double getMAF() const
Definition GenomicDataStream_virtual.h:219
Param param
Definition GenomicDataStream_virtual.h:267
GenomicDataStream()
Definition GenomicDataStream_virtual.h:181
Definition GenomicRanges.h:25
Definition VariantInfo.h:24
void clear()
Definition VariantInfo.h:142
GenomicRanges getChromRanges() override
Definition vcfstream.h:135
string getStreamType() override
Definition vcfstream.h:131
bool getNextChunk(DataChunk< arma::sp_mat > &chunk, const bool &useFilter=true) override
Definition vcfstream.h:159
bool getNextChunk(DataChunk< Rcpp::NumericMatrix > &chunk, const bool &useFilter=true) override
Definition vcfstream.h:215
bool getNextChunk(DataChunk< vector< double > > &chunk, const bool &useFilter=true) override
Definition vcfstream.h:235
vcfstream(const Param ¶m)
Definition vcfstream.h:43
static string variantToString(const BcfRecord &record)
Definition vcfstream.h:254
vector< string > getSampleNames() override
Definition vcfstream.h:125
vcfstream()
Definition vcfstream.h:39
bool getNextChunk(DataChunk< arma::mat > &chunk, const bool &useFilter=true) override
Definition vcfstream.h:139
bool getNextChunk(DataChunk< Eigen::SparseMatrix< double > > &chunk, const bool &useFilter=true) override
Definition vcfstream.h:196
~vcfstream() override
Definition vcfstream.h:85
void setRegions(const vector< string > ®ions) override
Definition vcfstream.h:93
bool getNextChunk(DataChunk< Eigen::MatrixXd > &chunk, const bool &useFilter=true) override
Definition vcfstream.h:179
int n_samples() override
Definition vcfstream.h:119
Definition bgenstream.h:33
Definition GenomicDataStream_virtual.h:78
int chunkSize
Definition GenomicDataStream_virtual.h:170