14#include <Eigen/Sparse>
46 if( ! filesystem::exists(
param.file ) ){
47 throw runtime_error(
"File does not exist: " + param.file);
51 if(
param.field.compare(
"") == 0 ){
52 throw runtime_error(
"Field for VCF/BCF not specified");
56 reader =
new BcfReader(
param.file );
61 reader->setSamples(
param.samples );
64 record =
new BcfRecord( reader->header );
67 fieldType = reader->header.getFormatType(
param.field );
69 if( fieldType == 3 &&
param.field.compare(
"GT"))
70 throw std::runtime_error(
"field GT is the only supported string type");
72 throw std::runtime_error(
"field not found: " +
param.field);
86 if( reader !=
nullptr)
delete reader;
87 if( record !=
nullptr)
delete record;
88 if( vInfo !=
nullptr)
delete vInfo;
93 void setRegions(
const vector<string> ®ions)
override {
95 validRegions.reserve(regions.size());
97 copy(regions.begin(), regions.end(), back_inserter(validRegions));
104 continueIterating =
false;
107 if( validRegions.size() > 0 ){
109 itReg = validRegions.begin();
111 reader->setRegion( *itReg );
113 continueIterating =
true;
120 return reader->nsamples;
126 return reader->header.getSamples();
132 return toString(
param.fileType);
142 bool ret = getNextChunk_helper();
150 bool copy_aux_mem =
false;
151 arma::mat M(matDosage.data(), reader->nsamples, vInfo->size(), copy_aux_mem,
true);
161 bool ret = getNextChunk_helper();
169 arma::mat M(matDosage.data(), reader->nsamples, vInfo->size(),
false,
true);
177 #ifndef DISABLE_EIGEN
181 bool ret = getNextChunk_helper();
188 Eigen::MatrixXd M = Eigen::Map<Eigen::MatrixXd>(matDosage.data(), reader->nsamples, vInfo->size());
198 bool ret = getNextChunk_helper();
205 Eigen::MatrixXd M = Eigen::Map<Eigen::MatrixXd>(matDosage.data(), reader->nsamples, vInfo->size());
217 bool ret = getNextChunk_helper();
224 Rcpp::NumericMatrix M(reader->nsamples, vInfo->size(), matDosage.data());
225 colnames(M) = Rcpp::wrap( vInfo->getFeatureNames() );
226 rownames(M) = Rcpp::wrap( vInfo->sampleNames );
237 bool ret = getNextChunk_helper();
254 string s = record.CHROM() +
":" + to_string(record.POS()) +
" " + record.ID() +
" " + record.REF() +
" " + record.ALT();
259 BcfReader *reader =
nullptr;
260 BcfRecord *record =
nullptr;
262 vector<string>::iterator itReg;
263 vector<string> validRegions;
265 bool continueIterating;
274 vector<int> values_int;
275 vector<float> values_fl;
283 vector<double> matDosage;
285 bool getNextChunk_helper(){
288 if( validRegions.size() == 0)
return false;
291 if( ! continueIterating )
return continueIterating;
303 if( ! reader->getNextVariant( *record ) ){
312 if( itReg == validRegions.end()){
313 continueIterating =
false;
319 reader->setRegion( *itReg );
320 reader->getNextVariant( *record );
328 record->getFORMAT(
param.field, values_int);
329 matDosage.insert(matDosage.end(), values_int.begin(), values_int.end());
332 record->getFORMAT(
param.field, values_fl);
334 if(
param.field.compare(
"DS") == 0){
336 matDosage.insert(matDosage.end(), values_fl.begin(), values_fl.end());
337 }
else if(
param.field.compare(
"GP") == 0){
339 if( record->ploidy() > 2 ){
340 throw std::runtime_error(
"GP is not supported for site with ploidy > 2\n " +
variantToString(*record));
344 vector<double> dsg = GP_to_dosage(values_fl,
param.missingToMean);
346 matDosage.insert(matDosage.end(), dsg.begin(), dsg.end());
352 if( record->isMultiAllelics() ){
353 throw std::runtime_error(
"GT is not supported for multi-allelic site\n " +
variantToString(*record));
356 if( record->ploidy() > 2 ){
357 throw std::runtime_error(
"GT is not supported for site with ploidy > 2\n " +
variantToString(*record));
361 record->getGenotypes(values_int);
362 tmp = intToDosage( values_int,
param.missingToMean );
363 matDosage.insert(matDosage.end(), tmp.begin(), tmp.end());
368 vInfo->addVariant(record->CHROM(),
378 if( vInfo->size() == 0) ret =
false;
384 void checkStatus(
const string & region){
385 switch( reader->getStatus( region ) ){
393 throw runtime_error(
"Could not retrieve index file");
397 Rcpp::Rcout <<
"region:" + region +
";\n";
398 throw runtime_error(
"region was not found: " + region );
Definition GenomicDataStream_virtual.h:34
double getMinVariance()
Definition GenomicDataStream_virtual.h:200
double getMAF() const
Definition GenomicDataStream_virtual.h:206
Param param
Definition GenomicDataStream_virtual.h:254
GenomicDataStream()
Definition GenomicDataStream_virtual.h:168
Definition GenomicRanges.h:25
Definition VariantInfo.h:24
void clear()
Definition VariantInfo.h:137
GenomicRanges getChromRanges() override
Definition vcfstream.h:135
string getStreamType() override
Definition vcfstream.h:131
bool getNextChunk(DataChunk< arma::sp_mat > &chunk, const bool &useFilter=true) override
Definition vcfstream.h:158
bool getNextChunk(DataChunk< Rcpp::NumericMatrix > &chunk, const bool &useFilter=true) override
Definition vcfstream.h:214
bool getNextChunk(DataChunk< vector< double > > &chunk, const bool &useFilter=true) override
Definition vcfstream.h:234
vcfstream(const Param ¶m)
Definition vcfstream.h:43
static string variantToString(const BcfRecord &record)
Definition vcfstream.h:253
vector< string > getSampleNames() override
Definition vcfstream.h:125
vcfstream()
Definition vcfstream.h:39
bool getNextChunk(DataChunk< arma::mat > &chunk, const bool &useFilter=true) override
Definition vcfstream.h:139
bool getNextChunk(DataChunk< Eigen::SparseMatrix< double > > &chunk, const bool &useFilter=true) override
Definition vcfstream.h:195
~vcfstream() override
Definition vcfstream.h:85
void setRegions(const vector< string > ®ions) override
Definition vcfstream.h:93
bool getNextChunk(DataChunk< Eigen::MatrixXd > &chunk, const bool &useFilter=true) override
Definition vcfstream.h:178
int n_samples() override
Definition vcfstream.h:119
Definition bgenstream.h:33
Definition GenomicDataStream_virtual.h:65
int chunkSize
Definition GenomicDataStream_virtual.h:157